updated lots of stuff

fe20c6f5 · etcart · 60856c1d · fe20c6f5 · fe20c6f5 · fe20c6f5
Commit fe20c6f5 authored May 09, 2018 by etcart
Showing with 426 additions and 135 deletions
.stemnet2.txt.swp
.stemnete.txt.swp
RIVaccessories.h
RIVaccessories.h.gch
RIVclasses
RIVclasses.c
RIVclasses.o
RIVlexicon.h
RIVlexicon.h.gch
RIVread.c
runscriptUb.sh
saturation.c
someshit.c
stemconfig/dbtools.py
stemconfig/dbtools.pyc
stemconfig/stemconf
stemconfig/stemconf.c
stemconfig/stemconf.o
stemconfig/stemconfig
stemconfig/stemconfig.c
--- a/.stemnet2.txt.swp
+++ b/.stemnet2.txt.swp
--- a/.stemnete.txt.swp
+++ b/.stemnete.txt.swp
--- a/RIVaccessories.h
+++ b/RIVaccessories.h
@@ -4,6 +4,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include "stemconfig/stemset.h"
 struct treenode{
 	void* data;
@@ -11,14 +12,14 @@ struct treenode{
 	struct treenode* links[26];
 	int downstream;
-};
+}*nextNode;
+void stemInsert(struct treenode* node, char* letter, void* data);
 int treecut(struct treenode* node, char* letter);
-void stemInsert(struct treenode* node, char* letter, char* data);
-void RIVinsert(struct treenode* node, char* letter, void* data);
+void treeInsert(struct treenode* node, char* letter, void* data);
 void* treeSearch(struct treenode* node, char* letter);
 struct treenode* stemTreeSetup();
 /*isWordClean filters words that contain non-letter characters, and 
 * upperCase letters, allowing only the '_' symbol through
 */
@@ -64,27 +65,34 @@ int wordtoSeed(char* word){
 	return seed;
 }
 struct treenode* stemTreeSetup(){
-	FILE* netfile = fopen("stemnet2.txt", "r");
+	FILE* wordFile = fopen("stemconfig/wordset.txt", "r");
-	if(!netfile){
+	if(!wordFile){
-		printf("no stemnet file");
+		printf("no wordnet file");
 		return 0;
 	}
-	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
+	struct treenode* rootNode = calloc(treesize, sizeof(struct treenode));
+	nextNode = rootNode+1;
 	char word[100];
-	char stem[100];
+	char* stem = (char*)stemset;
+	int displacement;
-	while(fscanf(netfile, "%s %s", word, stem)){
+	while(fscanf(wordFile, "%s", word)){
-		if(feof(netfile)){
+		sscanf(stem, "%*s%n", &displacement);
-			break;
+		stem[displacement] = '\0';
-		}
-		stemInsert(rootNode, word, stem);
+		stemInsert(rootNode, word, stem);
+		if(feof(wordFile)){
+			break;
+		}
+		stem += displacement+1;
 	}
+	fclose(wordFile);
 	return rootNode;
 }
 void* treeSearch(struct treenode* node, char* letter){
@@ -100,15 +108,15 @@ void* treeSearch(struct treenode* node, char* letter){
 		return node->data;
 	}
 }
-void RIVinsert(struct treenode* node, char* letter, void* data){
+void stemInsert(struct treenode* node, char* letter, void* data){
 	node->downstream++;
 	if(*(letter)){
 		if(!node->links[*(letter)-'a']){
-			node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
+			node->links[*(letter)-'a'] = nextNode++;
 		}
-		RIVinsert(node->links[*(letter)-'a'], letter+1, data);
+		treeInsert(node->links[*(letter)-'a'], letter+1, data);
 	}else{
@@ -119,44 +127,47 @@ void RIVinsert(struct treenode* node, char* letter, void* data){
 	}
 }
-void stemInsert(struct treenode* node, char* letter, char* data){
+void treeInsert(struct treenode* node, char* letter, void* data){
 	node->downstream++;
 	if(*(letter)){
 		if(!node->links[*(letter)-'a']){
 			node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
 		}
-		stemInsert(node->links[*(letter)-'a'], letter+1, data);
+		treeInsert(node->links[*(letter)-'a'], letter+1, data);
 	}else{
-		if(node->data) return;
-		node->data = calloc(strlen(data)+1, sizeof(char));
+		if(node->data) return;
+		node->data = data;
-		strcpy((char*)node->data, data);
 	}
 }
 int treecut(struct treenode* node, char* letter){
 	node->downstream--;
 	int flag;
+	//continue searching downstream if there is a letter
 	if(*(letter)){
 		if(node->links[*(letter)-'a']){
+			//propagate to next section
 			flag = treecut(node->links[*(letter)-'a'], letter+1);
+			//if next section returned a "cut" flag, 0 it out
 			if(flag){
 				node->links[*(letter)-'a'] = NULL;
 			}
 		}
-		if(!node->downstream){
+	//there are no more letters, we've reached our destination
-			free(node);
-			return 1;
-		}
 	}else{
+		node->data = NULL;
+	}
+	//this is on a branch that leads nowhere, free it and return "cut" flag
+	if(!node->downstream){
 		free(node);
 		return 1;
 	}
@@ -164,5 +175,17 @@ int treecut(struct treenode* node, char* letter){
 }
+/* destroyTree recursively releases a subtree: the payload of each node,
+ * every linked child, and finally the node itself.
+ * node: root of the subtree to release; NULL is accepted and ignored
+ * NOTE(review): every node and every payload is handed to free(), which is
+ * only valid when each of them came from its own allocation call - confirm
+ * this against how the tree being destroyed was built and what its data
+ * pointers refer to.
+ */
+void destroyTree(struct treenode* node){
+	//tolerate an empty tree, e.g. when setup failed and returned 0
+	if(!node) return;
+	//free(NULL) is a no-op, so the payload needs no separate check
+	free(node->data);
+	for(int i=0; i<26; i++){
+		destroyTree(node->links[i]);
+	}
+	free(node);
+}
 #endif
--- a/RIVaccessories.h.gch
+++ b/RIVaccessories.h.gch
--- a/RIVclasses
+++ b/RIVclasses
--- a/RIVclasses.c
+++ b/RIVclasses.c
 #include <stdio.h>
 #define RIVSIZE 50000
+#define CACHESIZE 20000
 #include "RIVtools.h"
-char* clean(char* word);
+#define k 5
-char* stemmy(struct treenode* searchRoot, char* word);
-sparseRIV line2L3(char* text, struct treenode* searchRoot);
 typedef char label[200];
 struct RIVclass{
 	label name;
 	sparseRIV* set;
 	int setSize;
 };
+char* clean(char* word);
+char* stemmy(struct treenode* searchRoot, char* word);
+sparseRIV line2L3(char* text, struct treenode* searchRoot);
+int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion);
 LEXICON* lexicon;
 int main(){
 	struct treenode* searchRoot = stemTreeSetup();
-	lexicon = lexOpen("consolidatedLexicon", "rx");
+	lexicon = lexOpen("lexiconEnron50-4", "rx");
 	int classNo = 0;
@@ -25,18 +30,38 @@ int main(){
-	FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r");
+	FILE* textSet = fopen("../../Downloads/trainingText.tsv", "r");
 	if(!textSet){
 		puts("no file");
 		return 1;
 	}
-	struct RIVclass* class;
+	struct RIVclass* class = 0;
 	char text[20000];
 	label className;
-	while(fscanf(textSet, "%s\t%s", text, className)){
+	//int j=0;
+	while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
+		//if(j++>100) break;
+		if(feof(textSet)) break;
-		char* labelTemp = strstr(*classNames, className);
-		if(!labelTemp){
+		sparseRIV temp = line2L3(text, searchRoot);
+		temp.magnitude = getMagnitudeSparse(temp);
+		if(temp.magnitude == 0){
+			printf("%s, empty\n", text);
+			continue;
+		}
+		//printf("%s, %s", text, className);
+		int i=0;
+		for(; i< classCount; i++){
+			if(!strcmp(className, classNames[i])){
+				classNo = i;
+				class = classes+classNo;
+				break;
+			}
+		}
+		if(i == classCount){
 			/* reinitialize the classnames with a new member */
 			classNames = realloc(classNames, (classCount+1)*sizeof(label));
 			strcpy(classNames[classCount], className);
@@ -53,14 +78,10 @@ int main(){
 			classNo = classCount;
 			classCount++;
-		}else{
-			classNo = (labelTemp-*classNames);
-			class = classes+classNo;
 		}
 		class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
-		sparseRIV thing= line2L3(text, searchRoot);
+		sparseRIV thing= temp;
 		class->set[class->setSize] = thing;
 		class->setSize++;
@@ -69,10 +90,71 @@ int main(){
 	for(int i=0; i<classCount; i++){
 		puts(classNames[i]);
+		puts(classes[i].name);
 		printf("%d\n\n", classes[i].setSize);
 	}
+	fclose(textSet);
+	textSet = fopen("../../Downloads/validationText.tsv", "r");
+	if(!textSet) return 1;
+	int won = 0;
+	int docTotal = 0;
+	//scanf("%d", &won);
+	//j=0;
+	while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
+		if(feof(textSet)) break;
+		//if(j++>30) break;
+		int i=0;
+		for(; i< classCount; i++){
+			if(!strcmp(className, classNames[i])){
+				classNo = i;
+				class = classes+classNo;
+				break;
+			}
+		}if(i == classCount){
+			printf("unclassifiable\n");
+			continue;
+		}
+		sparseRIV thing= line2L3(text, searchRoot);
+		if(thing.count ==0){
+			continue;
+		}
+		docTotal++;
+		denseRIV inQuestion = {0};
+		addS2D(inQuestion.values, thing);
+		inQuestion.magnitude = getMagnitudeDense(&inQuestion);
+		double weights[classCount];
+		int choice = kNearest(weights, classes, classCount, inQuestion);
+		if(choice == -1){
+			printf("classificationFailed");
+		}else{
+			//puts(text);
+			printf("survey says! %s  ", className);
+			printf("your asnwer was...%d, %s\n", choice, classes[choice].name);
+		}
+		if(choice == classNo){
+			won++;
+		}
+		free(thing.locations);
+	}
+	printf("\n\n we got %d/%d ", won, docTotal);
+	for(int i=0; i<classCount; i++){
+		for(int j=0; j<classes[i].setSize; j++){
+			free(classes[i].set[j].locations);
+		}
+		free(classes[i].set);
+	}
+	free(classes);
+	free(classNames);
+	destroyTree(searchRoot);
+	lexClose(lexicon);
+	fclose(textSet);
 	return 0;
 }
@@ -132,26 +214,74 @@ sparseRIV line2L3(char* text, struct treenode* searchRoot){
 				continue;
 			}else{
 				//printf("%s, succesfully pulled\n", stem);
-				temp = consolidateD2S(wordRIV->values);
+				temp = normalize(*wordRIV, 10000);
+				//temp = consolidateD2S(wordRIV->values);
 				addS2D(accumulate.values, temp);
 				free(temp.locations);
-				free(wordRIV);
+				//free(wordRIV);
+				lexPush(lexicon, wordRIV);
 			}
 		}
 	}
 	temp = consolidateD2S(accumulate.values);
 	return temp;
+}
+/* kNearest classifies inQuestion by a similarity-weighted vote among the k
+ * training vectors that score highest under cosCompare.
+ * weights: caller-supplied array of classCount doubles; on return each entry
+ *          holds that class's share of the summed similarity of the kept
+ *          neighbours (left as raw sums when that total is not positive)
+ * classes: the classCount labelled sets of vectors to compare against
+ * classCount: number of entries in classes and in weights
+ * inQuestion: the vector to classify
+ * returns the index of the winning class, or -1 when there was nothing to
+ * compare against
+ */
+int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion){
+	memset(weights, 0, classCount*sizeof(double));
+	double distances[k];
+	int labels[k];
+	int fill = 0;
+	for(int i=0; i<classCount; i++){
+		for(int j=0; j<classes[i].setSize; j++){
+			double cosine = cosCompare(inQuestion, classes[i].set[j]);
+			//the first k candidates are kept unconditionally, label included
+			if(fill < k){
+				distances[fill] = cosine;
+				labels[fill] = i;
+				fill++;
+				continue;
+			}
+			//afterwards a candidate may only displace the weakest kept neighbour
+			int weakest = 0;
+			for(int x=1; x<k; x++){
+				if(distances[x] < distances[weakest]){
+					weakest = x;
+				}
+			}
+			if(cosine > distances[weakest]){
+				distances[weakest] = cosine;
+				labels[weakest] = i;
+			}
+		}
+	}
+	//no training vectors at all: nothing to vote with
+	if(!fill) return -1;
+	//only the slots that were actually filled take part in the vote
+	double totalweight = 0;
+	for(int j=0; j<fill; j++){
+		weights[labels[j]] += distances[j];
+		totalweight += distances[j];
+	}
+	//pick the winner on the raw sums, so a zero or negative total can neither
+	//produce NaN nor invert the ordering; ties go to the lowest class index
+	int choice = 0;
+	for(int i=1; i<classCount; i++){
+		if(weights[i] > weights[choice]){
+			choice = i;
+		}
+	}
+	//report shares of the total only when dividing by it is meaningful
+	if(totalweight > 0){
+		for(int i=0; i<classCount; i++){
+			weights[i] /= totalweight;
+		}
+	}
+	return choice;
 }

--- a/RIVclasses.o
+++ b/RIVclasses.o
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
--- a/RIVlexicon.h.gch
+++ b/RIVlexicon.h.gch
--- a/RIVread.c
+++ b/RIVread.c
@@ -6,10 +6,11 @@
 #include <dirent.h>
 #include <error.h>
 #include <string.h>
-//#define HASHCACHE
 #define RIVSIZE 50000
 #define NONZEROS 4
-#define CACHESIZE 27000
+#define CACHESIZE 25000
+#define SORTCACHE
 #include "RIVtools.h"
 //this program reads a directory full of files, and adds all context vectors (considering file as context)
@@ -20,11 +21,11 @@ void addContext(denseRIV* lexRIV, sparseRIV context);
 void directoryGrind(char *rootString);
 void lineGrind(char* textLine);
 LEXICON* lp;
-//int COUNTY = 0;
+int COUNTY = 0;
 int main(int argc, char *argv[]){
 	char pathString[1000];
-	lp = lexOpen("lexicon", "rw");
+	lp = lexOpen("lexiconshitty", "r");
 	//we open the lexicon, if it does not yet exist, it will be created
@@ -33,7 +34,6 @@ int main(int argc, char *argv[]){
 	strcpy(pathString, argv[1]);
 	strcat(pathString, "/");
 	//ensure that the targeted root directory exists
 	struct stat st;
 	if(stat(pathString, &st) == -1) {
 		printf("directory doesn't seem to exist");
@@ -79,8 +79,10 @@ void directoryGrind(char *rootString){
 		//open a file within root directory
 		FILE *input = fopen(pathString, "r");
 		if(input){
+			if(COUNTY++>1000) return;
 			//process this file and add it's data to lexicon
 			//fprintf(stderr, "***%d", COUNTY++);
 			fileGrind(input);
 			fclose(input);
@@ -133,7 +135,10 @@ void lineGrind(char* textLine){
 		//we pull the vector corresponding to each word from the lexicon
 		//if it's a new word, lexPull returns a 0 vector
 		lexiconRIV= lexPull(lp, word);
+		if(!lexiconRIV){
+			printf("Fuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuucked");
+			continue;
+		}
 		//we add the context of this file to this wordVector
 		addContext(lexiconRIV, contextVector);
@@ -150,20 +155,13 @@ void lineGrind(char* textLine){
 	}
 	//free the heap allocated context vector data
 	free(contextVector.locations);
 }
 void addContext(denseRIV* lexRIV, sparseRIV context){
 		//add context to the lexRIV, (using sparse-dense vector comparison)
-		addS2D(lexRIV->values, context);
+		sparseRIV thing = context;
+		addS2D(lexRIV->values, thing);
 		//log the "size" of the vector which was added
 		//this is not directly necessary, but is useful metadata for some analises

--- a/runscriptUb.sh
+++ b/runscriptUb.sh
-clean(){
-	while [ "$1" ]; do
-		./RIVread "$1"
-		shift
-	done
-}
-clean ../bookCleaner/cleanbooks/*
--- a/saturation.c
+++ b/saturation.c
-#include <stdio.h>
-#include <stdlib.h>
-#include <dirent.h>
-#include <time.h>
-#include "RIVtoolsCPUlinux.h"
-void directoryToL2s(char *rootString);
-int main(){
-	RIVInit();
-	char rootString[] = "lexicon/";
-	directoryToL2s(rootString);
-}
-void directoryToL2s(char *rootString){
-	sparseRIV fileRIV;
-	char pathString[2000];
-	DIR *directory;
-    struct dirent *files = 0;
-	if(!(directory = opendir(rootString))){
-		printf("location not found, %s\n", rootString);
-		return;
-	}
-	while((files=readdir(directory))){
-		if(*(files->d_name) == '.') continue;
-		if(files->d_type == DT_DIR){
-			strcpy(pathString, rootString);
-			strcat(pathString, files->d_name);
-			strcat(pathString, "/");
-			directoryToL2s(pathString);
-		}
-		strcpy(pathString, rootString);
-		strcat(pathString, files->d_name);
-		FILE *input = fopen(pathString, "r");
-		if(!input){
-			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
-			return;
-		}else{
-			denseRIV temp = lexPull(pathString);
-			fileRIV = consolidateD2S(temp.values);
-			strcpy(fileRIV.name, pathString);
-			float count = fileRIV.count;
-			printf("%s, saturation: %f\n", fileRIV.name, count);
-			fclose(input);
-			free(temp.values);
-			//free(fileRIV.locations);
-		}
-	}
-}
--- a/someshit.c
+++ b/someshit.c
+#include <stdio.h>
+#include "RIVaccessories.h"
+#include <time.h>
+/* Interactive lookup timer: builds the stem tree, then for every word read
+ * from stdin prints the stored stem (or "no entry") followed by the CPU time
+ * the lookup took.
+ * Returns 0 once stdin is exhausted, 1 if the tree could not be built.
+ */
+int main(){
+	struct treenode* root = stemTreeSetup();
+	//stemTreeSetup returns 0 when its word file is missing
+	if(!root){
+		return 1;
+	}
+	char word[100];
+	char* stem;
+	clock_t start, end;
+	puts("tree ready");
+	//stop at end of input instead of looping forever on a stale buffer;
+	//the width keeps an over-long token from overrunning word
+	while(scanf("%99s", word) == 1){
+		start = clock();
+		stem = treeSearch(root, word) ;
+		end = clock();
+		if(stem){
+			puts(stem);
+		}else{
+			puts("no entry");
+		}
+		printf("took: %lf\n", (double)(end-start)/CLOCKS_PER_SEC);
+	}
+	return 0;
+}
--- a/stemconfig/dbtools.py
+++ b/stemconfig/dbtools.py
+import pymongo
+from pymongo import MongoClient
+def dbSetup():
+    client = MongoClient("mongodb://etcart:Argelfraster1@ds261969.mlab.com:61969/rivwordnet")
+    database = client.rivwordnet
+    collection = database.stems
+    collection.create_index("from")
+    return collection
+def dbPost(wordset, collection):
+    if not len(wordset):
+        return
+    posts = []
+    for key, value in wordset.iteritems():
+        post = {"from": key, "to": value}
+        posts.append(post)
+    collection.insert_many(posts)
+def cleanDbSetup():
+    client = MongoClient("mongodb://etcart:Argelfraster1@ds163119.mlab.com:63119/rivetcleandocs")
+    database = client.rivetcleandocs
+    collection = database.cleaned
+    collection.create_index("file")
+    return collection
+def dbPostCleaned(text, file, collection):
+    if not len(text):
+        return
+    document = {
+        "text": text,
+        "file": file,
+    }
+    collection.insert_one(document)
+def dbGet(words, collection):
+    if mebewords:
+        return mebeword["to"]
+    else:
+        return 0
\ No newline at end of file
--- a/stemconfig/dbtools.pyc
+++ b/stemconfig/dbtools.pyc
--- a/stemconfig/stemconf
+++ b/stemconfig/stemconf
--- a/stemconfig/stemconf.c
+++ b/stemconfig/stemconf.c
+#include <stdio.h>
+#include "../RIVaccessories.h"
+int configInsert(struct treenode* node, char* letter, int treeSize);
+int stemTreeConfig();
+/* writes the node count computed by stemTreeConfig to stdout, with no
+ * trailing newline
+ */
+int main(){
+	printf("%d", stemTreeConfig());
+	return 0;
+}
+/* configInsert walks the path spelled by letter starting at node, creating
+ * any missing link nodes, and counts the nodes it creates.
+ * node: node to start from; its downstream counter, and that of every node
+ *       visited along the path, is incremented
+ * letter: remaining letters of the word; only 'a'..'z' can be indexed, so the
+ *         walk stops at the first other character
+ * treeSize: node count so far
+ * returns treeSize plus the number of nodes created by this call
+ */
+int configInsert(struct treenode* node, char* letter, int treeSize){
+	for(;;){
+		node->downstream++;
+		//end of the word: nothing more to create
+		if(!*letter) return treeSize;
+		int slot = *letter - 'a';
+		//links has 26 entries; any other character would index outside it
+		if(slot < 0 || slot >= 26) return treeSize;
+		if(!node->links[slot]){
+			struct treenode* fresh = calloc(1, sizeof(struct treenode));
+			//out of memory: report what has been counted so far
+			if(!fresh) return treeSize;
+			node->links[slot] = fresh;
+			treeSize++;
+		}
+		node = node->links[slot];
+		letter++;
+	}
+}
+/* stemTreeConfig builds a throwaway tree from the whitespace separated words
+ * of wordset.txt and returns how many nodes that tree holds, root included.
+ * Returns 0 when the word file cannot be opened or the root cannot be
+ * allocated.
+ * Side effect: stemset is cut into NUL terminated stems, one per word read.
+ * NOTE(review): the tree built here is not released before returning.
+ */
+int stemTreeConfig(){
+	int treeSize = 1;
+	FILE* wordFile = fopen("wordset.txt", "r");
+	if(!wordFile){
+		printf("no wordnet file");
+		return 0;
+	}
+	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
+	if(!rootNode){
+		fclose(wordFile);
+		return 0;
+	}
+	char word[100];
+	char* stem = (char*)stemset;
+	//fscanf returns EOF, which is non-zero, at end of input, so test for 1;
+	//the width keeps an over-long token from overrunning word
+	while(fscanf(wordFile, "%99s", word) == 1){
+		//%n is only stored when a stem was actually scanned
+		int displacement = 0;
+		sscanf(stem, "%*s%n", &displacement);
+		if(displacement){
+			char* stemEnd = stem + displacement;
+			if(*stemEnd){
+				//terminate this stem and step over the separator
+				*stemEnd = '\0';
+				stem = stemEnd + 1;
+			}else{
+				//already at the end of stemset, do not step past it
+				stem = stemEnd;
+			}
+		}
+		treeSize = configInsert(rootNode, word, treeSize);
+	}
+	fclose(wordFile);
+	return treeSize;
+}
--- a/stemconfig/stemconf.o
+++ b/stemconfig/stemconf.o
--- a/stemconfig/stemconfig
+++ b/stemconfig/stemconfig
--- a/stemconfig/stemconfig.c
+++ b/stemconfig/stemconfig.c
+#include <stdio.h>
+#include "../RIVaccessories.h"
+/* writes the node count computed by stemTreeConfig to stdout, with no
+ * trailing newline
+ */
+int main(){
+	printf("%d", stemTreeConfig());
+	return 0;
+}
--- a/stemconfig/stemconfig.py
+++ b/stemconfig/stemconfig.py
+import dbtools
+from subprocess import call
+collection = dbtools.dbSetup()
+preset = collection.find()
+set = {}
+for doc in preset:
+	set[doc["from"]] = doc["to"]
+words = [];
+stems = [];
+for key, value in set.iteritems():
+	words.append(key);
+	stems.append(value);
+wordFILE = open("wordset.txt", "w")
+wordFILE.write(' '.join(words));
+wordFILE.close()
+stemFILE = open("stemset.h", "w")
+finalOut = 'char stemset[] = "' + ' '.join(stems) + ' ";'+'\nint treesize = '
+stemFILE.write(finalOut + '0;')
+stemFILE.close()
+tempfile = open("tempfile.txt", "w")
+call(["gcc", "stemconf.c","-o", "stemconfig"])
+call(["./stemconfig"], stdout=tempfile)
+tempfile.close()
+tempfile = open("tempfile.txt", "r")
+treesize = tempfile.read();
+finalOut = finalOut + treesize + ';'
+stemFile = open("stemset.h", "w")
+stemFile.write(finalOut)
+stemFile.close;
--- a/stemconfig/stemset.h
+++ b/stemconfig/stemset.h
--- a/stemconfig/tempfile.txt
+++ b/stemconfig/tempfile.txt
+279920
\ No newline at end of file
--- a/stemconfig/wordset.txt
+++ b/stemconfig/wordset.txt
--- a/stemnet2.txt
+++ b/stemnet2.txt
--- a/treetest.c
+++ b/treetest.c
+#include <stdio.h>
+#include "RIVtools.h"
+/* Interactive tree exerciser. Alternates between two modes until stdin ends:
+ *  lookup mode: each word read is searched and its stem (or "NULL return")
+ *               is printed; a word starting with '1' switches to cut mode
+ *  cut mode:    each word read is passed to treecut; a word starting with
+ *               '0' switches back to lookup mode
+ * Returns 0 at end of input, 1 if the tree could not be built.
+ */
+int main(){
+	struct treenode* root = stemTreeSetup();
+	if(!root){
+		return 1;
+	}
+	char word[100];
+	char* stem;
+	while(1){
+		while(1){
+			//read before testing, so word is never examined uninitialised
+			if(scanf("%99s", word) != 1) return 0;
+			//the mode switch is a command, not a word to look up
+			if(*word == '1') break;
+			stem = treeSearch(root, word);
+			if(stem){
+				puts(stem);
+			}else{
+				puts("NULL return");
+			}
+		}
+		while(1){
+			if(scanf("%99s", word) != 1) return 0;
+			//treecut indexes links by letter-'a', which a digit cannot satisfy
+			if(*word == '0') break;
+			treecut(root, word);
+		}
+	}
+}