Commit fe20c6f5 by etcart

updated lots of stuff

parent 60856c1d
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "stemconfig/stemset.h"
struct treenode{ struct treenode{
void* data; void* data;
...@@ -11,14 +12,14 @@ struct treenode{ ...@@ -11,14 +12,14 @@ struct treenode{
struct treenode* links[26]; struct treenode* links[26];
int downstream; int downstream;
}; }*nextNode;
void stemInsert(struct treenode* node, char* letter, void* data);
int treecut(struct treenode* node, char* letter); int treecut(struct treenode* node, char* letter);
void stemInsert(struct treenode* node, char* letter, char* data);
void RIVinsert(struct treenode* node, char* letter, void* data); void treeInsert(struct treenode* node, char* letter, void* data);
void* treeSearch(struct treenode* node, char* letter); void* treeSearch(struct treenode* node, char* letter);
struct treenode* stemTreeSetup(); struct treenode* stemTreeSetup();
/*isWordClean filters words that contain non-letter characters, and /*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through * upperCase letters, allowing only the '_' symbol through
*/ */
...@@ -64,27 +65,34 @@ int wordtoSeed(char* word){ ...@@ -64,27 +65,34 @@ int wordtoSeed(char* word){
return seed; return seed;
} }
struct treenode* stemTreeSetup(){ struct treenode* stemTreeSetup(){
FILE* netfile = fopen("stemnet2.txt", "r"); FILE* wordFile = fopen("stemconfig/wordset.txt", "r");
if(!netfile){ if(!wordFile){
printf("no stemnet file"); printf("no wordnet file");
return 0; return 0;
} }
struct treenode* rootNode = calloc(1, sizeof(struct treenode)); struct treenode* rootNode = calloc(treesize, sizeof(struct treenode));
nextNode = rootNode+1;
char word[100]; char word[100];
char stem[100]; char* stem = (char*)stemset;
int displacement;
while(fscanf(wordFile, "%s", word)){
while(fscanf(netfile, "%s %s", word, stem)){ sscanf(stem, "%*s%n", &displacement);
stem[displacement] = '\0';
if(feof(netfile)){
break;
}
stemInsert(rootNode, word, stem); stemInsert(rootNode, word, stem);
if(feof(wordFile)){
break;
}
stem += displacement+1;
} }
fclose(wordFile);
return rootNode; return rootNode;
} }
void* treeSearch(struct treenode* node, char* letter){ void* treeSearch(struct treenode* node, char* letter){
...@@ -100,15 +108,15 @@ void* treeSearch(struct treenode* node, char* letter){ ...@@ -100,15 +108,15 @@ void* treeSearch(struct treenode* node, char* letter){
return node->data; return node->data;
} }
} }
void RIVinsert(struct treenode* node, char* letter, void* data){ void stemInsert(struct treenode* node, char* letter, void* data){
node->downstream++; node->downstream++;
if(*(letter)){ if(*(letter)){
if(!node->links[*(letter)-'a']){ if(!node->links[*(letter)-'a']){
node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode)); node->links[*(letter)-'a'] = nextNode++;
} }
RIVinsert(node->links[*(letter)-'a'], letter+1, data); treeInsert(node->links[*(letter)-'a'], letter+1, data);
}else{ }else{
...@@ -119,43 +127,46 @@ void RIVinsert(struct treenode* node, char* letter, void* data){ ...@@ -119,43 +127,46 @@ void RIVinsert(struct treenode* node, char* letter, void* data){
} }
} }
void stemInsert(struct treenode* node, char* letter, char* data){ void treeInsert(struct treenode* node, char* letter, void* data){
node->downstream++; node->downstream++;
if(*(letter)){ if(*(letter)){
if(!node->links[*(letter)-'a']){ if(!node->links[*(letter)-'a']){
node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode)); node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
} }
stemInsert(node->links[*(letter)-'a'], letter+1, data); treeInsert(node->links[*(letter)-'a'], letter+1, data);
}else{ }else{
if(node->data) return; if(node->data) return;
node->data = calloc(strlen(data)+1, sizeof(char)); node->data = data;
strcpy((char*)node->data, data);
} }
} }
int treecut(struct treenode* node, char* letter){ int treecut(struct treenode* node, char* letter){
node->downstream--; node->downstream--;
int flag; int flag;
//continue searching downstream if there is a letter
if(*(letter)){ if(*(letter)){
if(node->links[*(letter)-'a']){ if(node->links[*(letter)-'a']){
//propagate to next section
flag = treecut(node->links[*(letter)-'a'], letter+1); flag = treecut(node->links[*(letter)-'a'], letter+1);
//if next section returned a "cut" flag, 0 it out
if(flag){ if(flag){
node->links[*(letter)-'a'] = NULL; node->links[*(letter)-'a'] = NULL;
} }
} }
if(!node->downstream){ //there are no more letters, we've reached our destination
}else{
free(node); node->data = NULL;
return 1;
} }
}else{ //this is on a branch that leads nowhere, free it and return "cut" flag
if(!node->downstream){
free(node); free(node);
return 1; return 1;
...@@ -164,5 +175,17 @@ int treecut(struct treenode* node, char* letter){ ...@@ -164,5 +175,17 @@ int treecut(struct treenode* node, char* letter){
} }
/* destroyTree releases the tree rooted at node: the node's data pointer,
 * every linked child subtree, and finally the node itself.
 * A NULL node is ignored, so the result of a failed setup can be passed in.
 * NOTE(review): this assumes node->data and every node came from their own
 * heap allocation; confirm against the insert routines before using it on a
 * tree whose nodes or data were obtained any other way.
 */
void destroyTree(struct treenode* node){
	if(!node) return;

	/* free(NULL) is a no-op, so no guard is needed around data */
	free(node->data);
	for(int i=0; i<26; i++){
		destroyTree(node->links[i]);
	}
	free(node);
}
#endif #endif
No preview for this file type
File added
#include <stdio.h> #include <stdio.h>
#define RIVSIZE 50000 #define RIVSIZE 50000
#define CACHESIZE 20000
#include "RIVtools.h" #include "RIVtools.h"
char* clean(char* word); #define k 5
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
typedef char label[200]; typedef char label[200];
struct RIVclass{ struct RIVclass{
label name; label name;
sparseRIV* set; sparseRIV* set;
int setSize; int setSize;
}; };
char* clean(char* word);
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion);
LEXICON* lexicon; LEXICON* lexicon;
int main(){ int main(){
struct treenode* searchRoot = stemTreeSetup(); struct treenode* searchRoot = stemTreeSetup();
lexicon = lexOpen("consolidatedLexicon", "rx"); lexicon = lexOpen("lexiconEnron50-4", "rx");
int classNo = 0; int classNo = 0;
...@@ -25,18 +30,38 @@ int main(){ ...@@ -25,18 +30,38 @@ int main(){
FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r"); FILE* textSet = fopen("../../Downloads/trainingText.tsv", "r");
if(!textSet){ if(!textSet){
puts("no file"); puts("no file");
return 1; return 1;
} }
struct RIVclass* class; struct RIVclass* class = 0;
char text[20000]; char text[20000];
label className; label className;
while(fscanf(textSet, "%s\t%s", text, className)){ //int j=0;
while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
//if(j++>100) break;
if(feof(textSet)) break;
char* labelTemp = strstr(*classNames, className);
if(!labelTemp){ sparseRIV temp = line2L3(text, searchRoot);
temp.magnitude = getMagnitudeSparse(temp);
if(temp.magnitude == 0){
printf("%s, empty\n", text);
continue;
}
//printf("%s, %s", text, className);
int i=0;
for(; i< classCount; i++){
if(!strcmp(className, classNames[i])){
classNo = i;
class = classes+classNo;
break;
}
}
if(i == classCount){
/* reinitialize the classnames with a new member */ /* reinitialize the classnames with a new member */
classNames = realloc(classNames, (classCount+1)*sizeof(label)); classNames = realloc(classNames, (classCount+1)*sizeof(label));
strcpy(classNames[classCount], className); strcpy(classNames[classCount], className);
...@@ -53,14 +78,10 @@ int main(){ ...@@ -53,14 +78,10 @@ int main(){
classNo = classCount; classNo = classCount;
classCount++; classCount++;
}else{
classNo = (labelTemp-*classNames);
class = classes+classNo;
} }
class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV)); class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
sparseRIV thing= line2L3(text, searchRoot); sparseRIV thing= temp;
class->set[class->setSize] = thing; class->set[class->setSize] = thing;
class->setSize++; class->setSize++;
...@@ -69,10 +90,71 @@ int main(){ ...@@ -69,10 +90,71 @@ int main(){
for(int i=0; i<classCount; i++){ for(int i=0; i<classCount; i++){
puts(classNames[i]); puts(classNames[i]);
puts(classes[i].name);
printf("%d\n\n", classes[i].setSize); printf("%d\n\n", classes[i].setSize);
} }
fclose(textSet);
textSet = fopen("../../Downloads/validationText.tsv", "r");
if(!textSet) return 1;
int won = 0;
int docTotal = 0;
//scanf("%d", &won);
//j=0;
while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
if(feof(textSet)) break;
//if(j++>30) break;
int i=0;
for(; i< classCount; i++){
if(!strcmp(className, classNames[i])){
classNo = i;
class = classes+classNo;
break;
}
}if(i == classCount){
printf("unclassifiable\n");
continue;
}
sparseRIV thing= line2L3(text, searchRoot);
if(thing.count ==0){
continue;
}
docTotal++;
denseRIV inQuestion = {0};
addS2D(inQuestion.values, thing);
inQuestion.magnitude = getMagnitudeDense(&inQuestion);
double weights[classCount];
int choice = kNearest(weights, classes, classCount, inQuestion);
if(choice == -1){
printf("classificationFailed");
}else{
//puts(text);
printf("survey says! %s ", className);
printf("your asnwer was...%d, %s\n", choice, classes[choice].name);
}
if(choice == classNo){
won++;
}
free(thing.locations);
}
printf("\n\n we got %d/%d ", won, docTotal);
for(int i=0; i<classCount; i++){
for(int j=0; j<classes[i].setSize; j++){
free(classes[i].set[j].locations);
}
free(classes[i].set);
}
free(classes);
free(classNames);
destroyTree(searchRoot);
lexClose(lexicon);
fclose(textSet);
return 0; return 0;
} }
...@@ -132,24 +214,72 @@ sparseRIV line2L3(char* text, struct treenode* searchRoot){ ...@@ -132,24 +214,72 @@ sparseRIV line2L3(char* text, struct treenode* searchRoot){
continue; continue;
}else{ }else{
//printf("%s, succesfully pulled\n", stem); //printf("%s, succesfully pulled\n", stem);
temp = consolidateD2S(wordRIV->values); temp = normalize(*wordRIV, 10000);
//temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp); addS2D(accumulate.values, temp);
free(temp.locations); free(temp.locations);
free(wordRIV); //free(wordRIV);
lexPush(lexicon, wordRIV);
} }
} }
} }
temp = consolidateD2S(accumulate.values); temp = consolidateD2S(accumulate.values);
return temp; return temp;
}
/* kNearest picks a class for inQuestion by a score-weighted vote among the
 * k training vectors with the highest cosCompare score.
 * weights: caller-supplied array of classCount doubles; on return each entry
 *   holds the summed score of that class's neighbours, divided by the total
 *   of all neighbour scores when that total is positive
 * classes, classCount: the labelled training sets to search
 * inQuestion: the vector being classified
 * returns the index of the winning class, or -1 when no training vector exists
 */
int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion){
	double distances[k];
	int labels[k];
	int fill = 0;

	if(classCount > 0){
		memset(weights, 0, (size_t)classCount*sizeof(double));
	}

	for(int i=0; i<classCount; i++){
		for(int j=0; j<classes[i].setSize; j++){
			double cosine = cosCompare(inQuestion, classes[i].set[j]);

			/* the first k vectors seen seed the neighbour list, label included */
			if(fill < k){
				distances[fill] = cosine;
				labels[fill] = i;
				fill++;
				continue;
			}

			/* afterwards a better match evicts the weakest neighbour held */
			int weakest = 0;
			for(int x=1; x<k; x++){
				if(distances[x] < distances[weakest]){
					weakest = x;
				}
			}
			if(cosine > distances[weakest]){
				distances[weakest] = cosine;
				labels[weakest] = i;
			}
		}
	}

	/* only slots that were actually filled take part in the vote */
	double totalweight = 0;
	for(int x=0; x<fill; x++){
		weights[labels[x]] += distances[x];
		totalweight += distances[x];
	}

	/* a zero or negative total would yield NaN or flip the ordering */
	if(totalweight > 0){
		for(int i=0; i<classCount; i++){
			weights[i] /= totalweight;
		}
	}

	/* choose among classes that own a neighbour; ties go to the lower index */
	int choice = -1;
	for(int x=0; x<fill; x++){
		int candidate = labels[x];
		if(choice == -1
		   || weights[candidate] > weights[choice]
		   || (weights[candidate] == weights[choice] && candidate < choice)){
			choice = candidate;
		}
	}
	return choice;
}
......
File added
This diff is collapsed. Click to expand it.
...@@ -6,10 +6,11 @@ ...@@ -6,10 +6,11 @@
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#include <string.h> #include <string.h>
//#define HASHCACHE
#define RIVSIZE 50000 #define RIVSIZE 50000
#define NONZEROS 4 #define NONZEROS 4
#define CACHESIZE 27000 #define CACHESIZE 25000
#define SORTCACHE
#include "RIVtools.h" #include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context) //this program reads a directory full of files, and adds all context vectors (considering file as context)
...@@ -20,11 +21,11 @@ void addContext(denseRIV* lexRIV, sparseRIV context); ...@@ -20,11 +21,11 @@ void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void lineGrind(char* textLine); void lineGrind(char* textLine);
LEXICON* lp; LEXICON* lp;
//int COUNTY = 0; int COUNTY = 0;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
char pathString[1000]; char pathString[1000];
lp = lexOpen("lexicon", "rw"); lp = lexOpen("lexiconshitty", "r");
//we open the lexicon, if it does not yet exist, it will be created //we open the lexicon, if it does not yet exist, it will be created
...@@ -33,7 +34,6 @@ int main(int argc, char *argv[]){ ...@@ -33,7 +34,6 @@ int main(int argc, char *argv[]){
strcpy(pathString, argv[1]); strcpy(pathString, argv[1]);
strcat(pathString, "/"); strcat(pathString, "/");
//ensure that the targeted root directory exists //ensure that the targeted root directory exists
struct stat st; struct stat st;
if(stat(pathString, &st) == -1) { if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist"); printf("directory doesn't seem to exist");
...@@ -79,8 +79,10 @@ void directoryGrind(char *rootString){ ...@@ -79,8 +79,10 @@ void directoryGrind(char *rootString){
//open a file within root directory //open a file within root directory
FILE *input = fopen(pathString, "r"); FILE *input = fopen(pathString, "r");
if(input){ if(input){
if(COUNTY++>1000) return;
//process this file and add it's data to lexicon //process this file and add it's data to lexicon
//fprintf(stderr, "***%d", COUNTY++); //fprintf(stderr, "***%d", COUNTY++);
fileGrind(input); fileGrind(input);
fclose(input); fclose(input);
...@@ -133,7 +135,10 @@ void lineGrind(char* textLine){ ...@@ -133,7 +135,10 @@ void lineGrind(char* textLine){
//we pull the vector corresponding to each word from the lexicon //we pull the vector corresponding to each word from the lexicon
//if it's a new word, lexPull returns a 0 vector //if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(lp, word); lexiconRIV= lexPull(lp, word);
if(!lexiconRIV){
printf("Fuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuucked");
continue;
}
//we add the context of this file to this wordVector //we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector); addContext(lexiconRIV, contextVector);
...@@ -150,20 +155,13 @@ void lineGrind(char* textLine){ ...@@ -150,20 +155,13 @@ void lineGrind(char* textLine){
} }
//free the heap allocated context vector data //free the heap allocated context vector data
free(contextVector.locations); free(contextVector.locations);
} }
void addContext(denseRIV* lexRIV, sparseRIV context){ void addContext(denseRIV* lexRIV, sparseRIV context){
//add context to the lexRIV, (using sparse-dense vector comparison) //add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context); sparseRIV thing = context;
addS2D(lexRIV->values, thing);
//log the "size" of the vector which was added //log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analises //this is not directly necessary, but is useful metadata for some analises
......
# clean runs ./RIVread once for every path passed as an argument.
# Iterating over "$@" keeps going past an empty argument, where a
# [ "$1" ] test would have ended the loop early.
clean(){
	for book in "$@"; do
		./RIVread "$book"
	done
}
clean ../bookCleaner/cleanbooks/*
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtoolsCPUlinux.h"
void directoryToL2s(char *rootString);
/* Entry point: initialise the RIV tools, then walk the lexicon/ directory */
int main(){
	char lexiconRoot[] = "lexicon/";

	RIVInit();
	directoryToL2s(lexiconRoot);
	return 0;
}
/* directoryToL2s walks rootString recursively. For every regular entry it
 * pulls the vector stored under that path with lexPull, consolidates it to
 * sparse form and prints the path together with the sparse vector's count.
 * rootString: directory path; entry names are appended to it directly, so it
 *   must already end in '/'
 * Entries whose name begins with '.' are skipped. The walk stops at the first
 * entry that cannot be opened.
 */
void directoryToL2s(char *rootString){
	sparseRIV fileRIV;
	char pathString[2000];
	DIR *directory;
	struct dirent *files = 0;

	if(!(directory = opendir(rootString))){
		printf("location not found, %s\n", rootString);
		return;
	}

	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;

		int isDirectory = (files->d_type == DT_DIR);

		/* bounded formatting; sub-directories get the trailing '/' the recursion expects */
		int length = snprintf(pathString, sizeof pathString, "%s%s%s",
		                      rootString, files->d_name, isDirectory ? "/" : "");
		if(length < 0 || (size_t)length >= sizeof pathString){
			printf("path too long, skipping %s%s\n", rootString, files->d_name);
			continue;
		}

		if(isDirectory){
			directoryToL2s(pathString);
			/* a directory is not itself a vector file, so move on */
			continue;
		}

		FILE *input = fopen(pathString, "r");
		if(!input){
			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
			closedir(directory);
			return;
		}

		denseRIV temp = lexPull(pathString);
		fileRIV = consolidateD2S(temp.values);
		strcpy(fileRIV.name, pathString);
		float count = fileRIV.count;
		printf("%s, saturation: %f\n", fileRIV.name, count);
		fclose(input);
		free(temp.values);
		/* NOTE(review): fileRIV.locations is left unreleased, as in the original;
		 * who owns that buffer is not visible from here - confirm before freeing */
	}
	closedir(directory);
}
#include <stdio.h>
#include "RIVaccessories.h"
#include <time.h>
/* Interactive timing harness: reads whitespace-separated words from stdin,
 * looks each one up in the stem tree, and prints the result followed by the
 * CPU time the lookup took. Stops when stdin is exhausted.
 * returns 1 when the stem tree could not be built, 0 otherwise
 */
int main(){
	struct treenode* root = stemTreeSetup();
	if(!root){
		/* stemTreeSetup has already reported the missing word file */
		return 1;
	}
	char word[100];
	char* stem;
	clock_t start, end;
	puts("tree ready");

	/* %99s keeps the token inside word[]; a failed read ends the loop */
	while(scanf("%99s", word) == 1){
		start = clock();
		stem = treeSearch(root, word) ;
		end = clock();
		if(stem){
			puts(stem);
		}else{
			puts("no entry");
		}
		printf("took: %lf\n", (double)(end-start)/CLOCKS_PER_SEC);
	}
	return 0;
}
import pymongo
from pymongo import MongoClient
def dbSetup():
    """Connect to the rivwordnet database and return its ``stems`` collection.

    An index on the ``from`` field is ensured before the collection is
    returned. The connection URI is read from the RIVWORDNET_MONGO_URI
    environment variable when it is set; otherwise the built-in URI is used.
    """
    import os

    # NOTE(review): the fallback URI embeds a username and password in source
    # control. Rotate that credential and drop the fallback once every caller
    # provides RIVWORDNET_MONGO_URI.
    uri = os.environ.get(
        "RIVWORDNET_MONGO_URI",
        "mongodb://etcart:Argelfraster1@ds261969.mlab.com:61969/rivwordnet",
    )
    client = MongoClient(uri)
    database = client.rivwordnet
    collection = database.stems
    collection.create_index("from")
    return collection
def dbPost(wordset, collection):
    """Insert every word -> stem pair of ``wordset`` into ``collection``.

    Each pair becomes one document ``{"from": word, "to": stem}`` and all
    documents are sent in a single ``insert_many`` call. An empty or missing
    ``wordset`` results in no call at all.
    """
    if not wordset:
        return
    # items() works on both Python 2 and 3, unlike iteritems()
    posts = [{"from": key, "to": value} for key, value in wordset.items()]
    collection.insert_many(posts)
def cleanDbSetup():
    """Connect to the rivetcleandocs database and return its ``cleaned`` collection.

    An index on the ``file`` field is ensured before the collection is
    returned. The connection URI is read from the RIVETCLEANDOCS_MONGO_URI
    environment variable when it is set; otherwise the built-in URI is used.
    """
    import os

    # NOTE(review): the fallback URI embeds a username and password in source
    # control. Rotate that credential and drop the fallback once every caller
    # provides RIVETCLEANDOCS_MONGO_URI.
    uri = os.environ.get(
        "RIVETCLEANDOCS_MONGO_URI",
        "mongodb://etcart:Argelfraster1@ds163119.mlab.com:63119/rivetcleandocs",
    )
    client = MongoClient(uri)
    database = client.rivetcleandocs
    collection = database.cleaned
    collection.create_index("file")
    return collection
def dbPostCleaned(text, file, collection):
    """Store one cleaned document in ``collection``.

    The document has the form ``{"text": text, "file": file}`` and is sent
    with ``insert_one``. Nothing is stored when ``text`` is empty or None.
    """
    if not text:
        return
    document = {
        "text": text,
        "file": file,
    }
    collection.insert_one(document)
def dbGet(words, collection):
    """Look up the stored stem for ``words`` in ``collection``.

    Returns the ``to`` field of the first document whose ``from`` field
    equals ``words``, or 0 when no such document exists.
    """
    # NOTE(review): the previous body referenced undefined names and never
    # queried the collection. Matching on "from" is inferred from the
    # {"from": ..., "to": ...} documents this module inserts - confirm.
    mebeword = collection.find_one({"from": words})
    if mebeword:
        return mebeword["to"]
    else:
        return 0
\ No newline at end of file
#include <stdio.h>
#include "../RIVaccessories.h"
int configInsert(struct treenode* node, char* letter, int treeSize);
int stemTreeConfig();
/* Prints the value returned by stemTreeConfig to stdout, with no newline */
int main(){
	printf("%d", stemTreeConfig());
	return 0;
}
/* configInsert follows the letters of a word down the tree, creating any
 * missing nodes, and reports the tree's node count afterwards.
 * node: node to start from; its downstream counter, and that of every node
 *   reached, is incremented
 * letter: NUL-terminated word; each character selects the link 'a'..'z'
 * treeSize: node count before this call
 * returns treeSize plus the number of nodes created by this call
 * The walk stops early at a character outside 'a'..'z' or when an allocation
 * fails; nodes created up to that point remain counted.
 */
int configInsert(struct treenode* node, char* letter, int treeSize){
	for(;; letter++){
		node->downstream++;
		if(!*letter){
			return treeSize;
		}

		/* anything outside a-z would index past the links array */
		int slot = *letter - 'a';
		if(slot < 0 || slot >= 26){
			return treeSize;
		}

		if(!node->links[slot]){
			struct treenode* child = calloc(1, sizeof *child);
			if(!child){
				return treeSize;
			}
			node->links[slot] = child;
			treeSize++;
		}
		node = node->links[slot];
	}
}
/* stemTreeConfig builds a tree from the words in wordset.txt and reports how
 * many nodes it needed, the root included.
 * returns the node count, or 0 when the word file cannot be opened or the
 *   root node cannot be allocated
 * The stem list is not consulted: configInsert receives only the words, so
 * they alone decide the count.
 */
int stemTreeConfig(){
	int treeSize = 1;
	FILE* wordFile = fopen("wordset.txt", "r");
	if(!wordFile){
		printf("no wordnet file");
		return 0;
	}
	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
	if(!rootNode){
		fclose(wordFile);
		return 0;
	}

	char word[100];
	/* fscanf returns EOF (non-zero) at end of input, so test for exactly one
	 * converted token; %99s keeps the token inside word[] */
	while(fscanf(wordFile, "%99s", word) == 1){
		treeSize = configInsert(rootNode, word, treeSize);
	}
	fclose(wordFile);
	return treeSize;
}
#include <stdio.h>
#include "../RIVaccessories.h"
/* Prints the value returned by stemTreeConfig to stdout, with no newline */
int main(){
	int nodeTotal = stemTreeConfig();

	printf("%d", nodeTotal);
	return 0;
}
import dbtools
from subprocess import call
# Collect every word -> stem pair held in the stems collection.
collection = dbtools.dbSetup()
stemMap = {}
for doc in collection.find():
    stemMap[doc["from"]] = doc["to"]

# Parallel lists: the n-th word in wordset.txt lines up with the n-th stem.
words = list(stemMap.keys())
stems = [stemMap[word] for word in words]

with open("wordset.txt", "w") as wordFile:
    wordFile.write(' '.join(words))

# stemset.h is first written with a placeholder treesize of 0, presumably so
# that stemconf.c has a complete header to compile against.
finalOut = 'char stemset[] = "' + ' '.join(stems) + ' ";' + '\nint treesize = '
with open("stemset.h", "w") as stemFile:
    stemFile.write(finalOut + '0;')

# Build and run the counting program; its stdout is the node count.
if call(["gcc", "stemconf.c", "-o", "stemconfig"]) != 0:
    raise SystemExit("compiling stemconf.c failed")
with open("tempfile.txt", "w") as countFile:
    if call(["./stemconfig"], stdout=countFile) != 0:
        raise SystemExit("running stemconfig failed")

with open("tempfile.txt", "r") as countFile:
    treesize = countFile.read().strip()

# Rewrite the header with the real node count; the with-block guarantees the
# file is closed and flushed.
with open("stemset.h", "w") as stemFile:
    stemFile.write(finalOut + treesize + ';')
This source diff could not be displayed because it is too large. You can view the blob instead.
279920
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#include <stdio.h>
#include "RIVtools.h"
/* Interactive harness alternating between two modes:
 *   search mode - each word read is looked up with treeSearch and the result
 *                 printed; a token starting with '1' switches to cut mode
 *   cut mode    - each word read is passed to treecut; a token starting with
 *                 '0' switches back to search mode
 * The switching tokens themselves are never searched for or cut.
 * Stops when stdin is exhausted.
 * returns 1 when the stem tree could not be built, 0 otherwise
 */
int main(){
	struct treenode* root = stemTreeSetup();
	if(!root){
		return 1;
	}
	char word[100];
	char* stem;

	for(;;){
		/* search mode */
		for(;;){
			/* %99s keeps the token inside word[]; a failed read ends the program */
			if(scanf("%99s", word) != 1) return 0;
			if(*word == '1') break;
			stem = treeSearch(root, word);
			if(stem){
				puts(stem);
			}else{
				puts("NULL return");
			}
		}
		/* cut mode */
		for(;;){
			if(scanf("%99s", word) != 1) return 0;
			if(*word == '0') break;
			treecut(root, word);
		}
	}
	return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment