updated RIVreads

34c65893 · amberhosen · 9d2c0fed · 34c65893 · 9d2c0fed · 34c65893
Commit 34c65893 authored Apr 06, 2018 by amberhosen
Showing with 602 additions and 287 deletions
RIVLower.h
RIVPACK1/RIVread
RIVPACK1/RIVread.c
RIVPACK1/RIVread1
RIVPACK1/RIVread2
RIVPACK1/RIVread3
RIVPACK1/RIVread4
RIVPACK1/RIVread5
RIVPACK1/RIVread6
RIVPACK1/runscriptUb.sh
RIVPACK1/shittyballs.py
RIVaccessories.h
RIVlexicon.h
RIVread.c
RIVtools.h
--- a/RIVLower.h
+++ b/RIVLower.h
--- a/RIVPACK1/RIVread
+++ b/RIVPACK1/RIVread
--- a/RIVPACK1/RIVread.c
+++ b/RIVPACK1/RIVread.c
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#define CACHESIZE 15000
-#define RIVSIZE 50000
-#define NONZEROS 8
-#include <setjmp.h>
-#include <signal.h>
-#include "../RIVet/RIVtools.h"
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <dirent.h>
 #include <error.h>
+#define RIVSIZE 200000
+#define NONZEROS 2
+#define CACHESIZE 1000
+#include "../RIVtools.h"
+
+
+//this program reads a directory full of files, and adds all context vectors (considering file as context)
+//to all words found in these files. this is used to create a lexicon, or add to an existing one

 void fileGrind(FILE* textFile);
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
+void addContext(denseRIV* lexRIV, sparseRIV context);
 void directoryGrind(char *rootString);
-void readdirContingency(int sigNumber);

-jmp_buf readdirRecov;
+
 int main(int argc, char *argv[]){
-	clock_t begintotal = clock();
-	lexOpen("/home/drbob/Documents/lexicon8-50");
 	char pathString[1000];
+
+	//we open the lexicon, if it does not yet exist, it will be created
+	lexOpen("lexicon200-2");
+	
+	//we format the root directory, preparing to scan its contents
+	
 	strcpy(pathString, argv[1]);
 	strcat(pathString, "/");
-	struct stat st = {0};
+	//ensure that the targeted root directory exists
+	
+	struct stat st;
 	if(stat(pathString, &st) == -1) {
+		printf("directory doesn't seem to exist");
 		return 1;
 	}
-	
+	//we will scan the directory, adding all data to our lexicon, as seen inside
 	directoryGrind(pathString);

-	clock_t endtotal = clock();
-	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
-	printf("total time:%lf\n\n", time_spent);
+	//we close the lexicon again, ensuring all data is secured
 	lexClose();
 	return 0;
 }

-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
-	denseRIV *denseSet_slider = denseSet;
-	denseRIV *dense_stop = denseSet+RIVCount;
-
-
-
-	while(denseSet_slider<dense_stop){
-		addS2D((*denseSet_slider).values, additive);
-		*(denseSet_slider->contextSize) += additive.frequency;
-		denseSet_slider++;
-
-	}
-	
-}
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
-	denseRIV* RIVStop = RIVSet+wordCount;
-	while(RIVSet<RIVStop){
-		if(!strcmp(word, RIVSet->name)){
-			return 1;
-		}
-		RIVSet++;
-	}
-	return 0;
-}
+//mostly a standard recursive Dirent-walk
 void directoryGrind(char *rootString){
-
+	/* *** begin Dirent walk *** */
 	char pathString[2000];
 	DIR *directory;
 	struct dirent *files = 0;
@@ -76,79 +57,101 @@ void directoryGrind(char *rootString){
 	}

 	while((files=readdir(directory))){
-		if(setjmp(readdirRecov)){
-			continue;
-		}
-
-		//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
+		
+		if(!files->d_name[0]) break;
 		while(*(files->d_name)=='.'){
 			files = readdir(directory);
 		}
-		//signal(SIGSEGV, signalSecure);
-
+		
+		
+		
 		if(files->d_type == DT_DIR){
 			strcpy(pathString, rootString);

 			strcat(pathString, files->d_name);
 			strcat(pathString, "/");
 			directoryGrind(pathString);
+			continue;
 		}
+		
+		
+		
 		strcpy(pathString, rootString);
 		strcat(pathString, files->d_name);
 		printf("%s\n", pathString);
-		FILE *input = fopen(pathString, "r+");
+/* *** end dirent walk, begin meat of function  *** */
+		
+		//check for non-txt files
+		char *fileEnding = pathString+strlen(pathString)-4;
+		if(strcmp(fileEnding, ".txt")){
+			printf("skipped: %s\n", files->d_name); 
+			continue;
+		}
+		
+		//open a file within root directory
+		FILE *input = fopen(pathString, "r");
 		if(input){
+			//process this file and add its data to the lexicon
 			fileGrind(input);
+			
 			fclose(input);
 		}
 	}
 }

+//form context vector from contents of file, then add that vector to
+//all lexicon entries of the words contained
 void fileGrind(FILE* textFile){
-	sparseRIV aggregateRIV = fileToL2Clean(textFile);
-	fseek(textFile, 0, SEEK_SET);
-
-	int wordCount = 0;
-	denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
-	char word[200];
+	//form a context vector.  "clean" indicates that it will ignore any word which
+	//contains unwanted characters
+	sparseRIV contextVector = fileToL2Clean(textFile);

+	//a pointer to the lexicon vector of the word currently being processed
+	//(vectors are pulled, updated and pushed back one at a time)
+	denseRIV* lexiconRIV;
+	
+	char word[100] = {0};
 	while(fscanf(textFile, "%99s", word)){
-
+		//we ensure that each word exists, and is free of unwanted characters
 		if(feof(textFile)) break;
+		
 		if(!(*word))continue;

 		if(!isWordClean((char*)word)){
 			continue;
 		}
-		if(checkDupe(RIVArray, word, wordCount)){
-			continue;
-		}
-		RIVArray[wordCount] = lexPull(word);
-
-		if(!*((RIVArray[wordCount].name))) break;
-
-		*(RIVArray[wordCount].frequency)+= 1;;
-		//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
-
-		wordCount++;
-
-	}
-	//printf("%d\n", wordCount);
-
-	addS2Ds(RIVArray, aggregateRIV, wordCount);
-	denseRIV* RIVArray_slider = RIVArray;
-	denseRIV* RIVArray_stop = RIVArray+wordCount;
-	while(RIVArray_slider<RIVArray_stop){
-
-		lexPush(*RIVArray_slider);
-		RIVArray_slider++;
+		
+		
+		//we pull the vector corresponding to each word from the lexicon
+		//if it's a new word, lexPull returns a 0 vector
+		lexiconRIV= lexPull(word);
+
+		//we add the context of this file to this wordVector
+		addContext(lexiconRIV, contextVector);
+		
+		//we remove the sub-vector corresponding to the word itself
+		subtractThisWord(lexiconRIV);
+		
+		//we log that this word has been encountered one more time
+		lexiconRIV->frequency += 1;
+		
+		//and finally we push it back to the lexicon for permanent storage
+		lexPush(lexiconRIV);
+		
 	}
-	free(RIVArray);
-	free(aggregateRIV.locations);
-
+	free(contextVector.locations);
 }
-void readdirContingency(int sigNumber){
-	puts("readdir segfaulted, trying to recover");
-	longjmp(readdirRecov, 1);

+void addContext(denseRIV* lexRIV, sparseRIV context){
+		
+		//add context to the lexRIV, (using sparse-dense vector comparison)
+		addS2D(lexRIV->values, context);
+		
+		//log the "size" of the vector which was added
+		//this is not directly necessary, but is useful metadata for some analyses
+		lexRIV->contextSize += context.contextSize;
+		
 }
+
+
+	
--- a/RIVPACK1/RIVread1
+++ b/RIVPACK1/RIVread1
--- a/RIVPACK1/RIVread2
+++ b/RIVPACK1/RIVread2
--- a/RIVPACK1/RIVread3
+++ b/RIVPACK1/RIVread3
--- a/RIVPACK1/RIVread4
+++ b/RIVPACK1/RIVread4
--- a/RIVPACK1/RIVread5
+++ b/RIVPACK1/RIVread5
--- a/RIVPACK1/RIVread6
+++ b/RIVPACK1/RIVread6
--- a/RIVPACK1/runscriptUb.sh
+++ b/RIVPACK1/runscriptUb.sh
@@ -5,15 +5,16 @@ clean(){
 		else
 			python shittyballs.py "$1"
 			
-			./RIVread cleanbooks/
-			# ./RIVread1 cleanbooks/
+			./RIVread1 cleanbooks/
 			./RIVread2 cleanbooks/
-			#./RIVread3 cleanbooks/
-			#./RIVread4 cleanbooks/
+			./RIVread3 cleanbooks/
+			./RIVread4 cleanbooks/
 			./RIVread5 cleanbooks/
 			./RIVread6 cleanbooks/
-			
+			./RIVread7 cleanbooks/
+
 			rm  -r cleanbooks/
+			#rm "$1"
 		fi
 		shift
 	done
@@ -21,4 +22,4 @@ clean(){



-clean ../bookCleaner/books/*
+clean ../../books/*
--- a/RIVPACK1/shittyballs.py
+++ b/RIVPACK1/shittyballs.py
-import requests
+#import requests
 import re
 import string
 import os
@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn
 import pdb
 from nltk.stem import PorterStemmer

-def adverbFix(word):
-    if not nltk.pos_tag(word)[0][1] == 'RB':
-        return word

-    adjective = word[:-2]
-    if not nltk.pos_tag(word)[0][1] == 'JJ':
-        return word;
-    FILE = open("lexicon/" + word, "w")
-    FILE.write("2" + temp)
-    FILE.close()
-    FILE = open("lexicon/" + adjective, "w")
-    FILE.write("1")
-    FILE.close()
-    return adjective
-
-def strip(word):
-    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
-        if word.endswith(suffix):
-            return word[:-len(suffix)]
+def writeWord(cleanString, word, stem, blacklist):
+    if word == stem:
+        FILE = open("lexicon/" + word, "w")
+        FILE.write("1");
+        FILE.close();
+        return (cleanString + " " + word)
+        
+    elif stem not in blacklist:
+        if len(stem) > 2:
+            FILE = open("lexicon/" + word, "w")
+            FILE.write("2"+stem);
+            FILE.close();
+            FILE = open("lexicon/" + stem, "w")
+            FILE.write("1")
+            FILE.close();
+            return (cleanString + " " + stem)
+
+    return cleanString
+	
+
+def liFix(word):
+    if not word[len(word)-2:] == "li":
        return word
-
+    
+    temp = ps.stem(word[:-2])
+    if temp:
+        return temp
+    return word

 def cleanWord(word):
-    #if(len(word) == 0):
-        #print("\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************")
    word = word.lower();
    regex = re.compile('[^a-z]+')
    word = regex.sub('', word)
@@ -44,13 +50,11 @@ def cleanWord(word):
 def fileCheck(word):

    try:
-        #print("trying")
+        
        wordFile = open("lexicon/{}".format(word), "r")
        code = int(wordFile.read(1))
    except:
-        #print("file does not exist")
        return 0
-    #print("fileCode{}".format(code))

    if code == 2:
        word = wordFile.read()
@@ -74,6 +78,8 @@ def morphyTest(word):
    return morphyTemp;


+#begin mainfunction
+
 blacklist = ["a", "an", "the", "so", "as", "how",
             "i", "me", "we", "they", "you", "it", "he", "she",
             "but", "have", "had",
@@ -90,13 +96,13 @@ print(sourceString + "\n")

 if not os.path.exists('cleanbooks'):
    os.makedirs('cleanbooks')
-# if not os.path.exists('lexicon'):
-#     os.makedirs('lexicon')
+if not os.path.exists('lexicon'):
+    os.makedirs('lexicon')

 if not os.path.exists(pathString):
    os.makedirs(pathString)

-#call(["python", "blacklist.py"])
+call(["python", "blacklist.py"])
 i=0
 skip = 1
 with open(sourceString, 'U') as fileIn:
@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn:

                for tempWord in line.split():
                    word=cleanWord(tempWord)
-
                    if not word:
                        continue
-
-                    # temp = fileCheck(word)
-                    #
-                    # if temp == -1:
-                    #     continue
-                    # if temp == 0:
-                    temp = morphyTest(word)
-                    if temp:
-                        stem = ps.stem(temp)
-                        if stem and not stem in blacklist:
-                            cleanString = cleanString + ' ' + stem
+                    if len(word) < 3:
+						continue;
+                    if word in blacklist:
+						continue;


+                    temp = fileCheck(word)
+                    if temp == -1:
+                        continue
+                    if temp:
+                        cleanString = (cleanString + " " + temp);
+                        continue
+						
+                    else:
+                        morphy = morphyTest(word)
+                        if morphy:
+                            stem = ps.stem(morphy)
+                            if stem:
+				stem = liFix(stem)
+                                cleanString = writeWord(cleanString, word, stem, blacklist)

-                    #if temp == 0:
-                    #    catchAll(word)
                cleanString = cleanString + os.linesep
-            if len(cleanString.split(' ')) > 10:
+            if len(cleanString.split(' ')) > 2:
                
                fileOut.write(cleanString)
                fileOut.close()

--- a/RIVaccessories.h
+++ b/RIVaccessories.h
 #ifndef RIVACCESS_H_
 #define RIVACCESS_H_
+
+
+
 /*isWordClean filters words that contain non-letter characters, and 
 * upperCase letters, allowing only the '_' symbol through
 */
 int isWordClean(char* word);
+
 /* used by wordClean */
 int isLetter(char c);

+/* creates a standard seed from the characters in a word, hopefully unique */
+int wordtoSeed(char* word);
+
 int isLetter(char c){
 	
 	if((c>96 && c<123)||(c == 32) || (c == '_')) return 1;
@@ -26,5 +33,19 @@ int isWordClean(char* word){
 	return 1;
 		
 }
+/* wordtoSeed folds the characters of word into an integer seed.
+ * character number i (counting from 0) contributes its code shifted left
+ * by 5*i bits, i.e. the word is read as little-endian base-32 digits.
+ * the arithmetic is done on unsigned values and the shift count wraps at
+ * the width of the accumulator, so long words no longer shift past the
+ * width of an int (undefined behaviour); short words yield the same
+ * value as before.  returns 0 for the empty string.
+ */
+int wordtoSeed(char* word){
+	unsigned int seed = 0;
+	unsigned int shift = 0;
+	const unsigned int width = (unsigned int)(sizeof seed * 8u);
+	while(*word){
+		/* unsigned char: keeps non-ASCII bytes from becoming negative */
+		seed += (unsigned int)(unsigned char)*word << shift;
+		shift = (shift + 5u) % width;
+		word++;
+	}
+	return (int)seed;
+}

 #endif
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
+#ifndef RIV_LEXICON_H
+#define RIV_LEXICON_H
+
+#include "RIVLower.h"
+#include "RIVaccessories.h"
+
+/* lexOpen is called to "open the lexicon", setting up for later calls to
+ * lexPush and lexPull. lexName is the directory that holds the lexicon;
+ * it is created if it does not exist. if the lexicon has not been opened
+ * before calls to lexPush/lexPull, their behavior can be unpredictable,
+ * most likely crashing
+ */
+void lexOpen(char* lexName);
+
+/* lexClose should always be called after the last lex push or lex pull call
+ * if the lexicon is left open, some vector data may be lost due to 
+ * un-flushed RIV cache
+ */
+void lexClose();
+
+
+/* both lexPush and lexPull must be called *after* the lexOpen() function
+ * and after using them the lexClose() function must be called to ensure
+ * data security */
+ 
+/* lexPush writes a denseRIV to the lexicon for permanent storage
+ * returns 0 on success (including when the vector was kept in the cache) */
+int lexPush(denseRIV* RIVout);
+
+/* cacheCheckOnPush tries to place RIVout in the cache; returns 1 if the
+ * vector is (now) cached, 0 if the caller must write it to file */
+int cacheCheckOnPush(denseRIV* RIVout);
+
+/* lexPull reads a denseRIV from the lexicon, under "word"
+ * if the file does not exist, it creates a 0 vector with the name of word
+ * lexPull returns a denseRIV *pointer* because its data must be tracked 
+ * globally for key optimizations
+ */
+denseRIV* lexPull(char* word);
+
+/* cacheCheckOnPull returns the cached vector for word, or NULL if the
+ * word is not in the cache */
+denseRIV* cacheCheckOnPull(char* word);
+
+/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
+ * saving it for long-term aggregation.  function is called by "lexPush",
+ * which is what users should actually use.  lexPush, unlike fLexPush,
+ * has cache logic under the hood for speed and harddrive optimization
+ */
+int fLexPush(denseRIV* RIVout);
+
+/* fLexPull pulls data directly from a file and converts it (if necessary)
+ * to a denseRIV.  function is called by "lexPull" which is what users 
+ * should actually use.  lexPull, unlike fLexPull, has cache logic under
+ * the hood for speed and harddrive optimization 
+ */
+denseRIV* fLexPull(FILE* lexWord);
+
+/* cacheDump writes every cached vector out to its lexicon file
+ * returns the number of vectors that could not be written (0 on success)
+ * declared here because lexClose calls it before its definition */
+int cacheDump();
+
+/* redefines signal behavior to protect cached data against seg-faults etc*/
+void signalSecure(int signum, siginfo_t *si, void* arg);
+
+/* begin definitions */
+/* lexOpen prepares the lexicon stored in directory lexName:
+ * creates the directory when stat() cannot find it, records the name in
+ * RIVKey.lexName, installs signalSecure as the handler for signals 1
+ * through 26, and zeroes the RIV cache.
+ */
+void lexOpen(char* lexName){
+
+	/* make the lexicon directory if it is not there yet */
+	struct stat dirInfo = {0};
+	int missing = (stat(lexName, &dirInfo) == -1);
+	if(missing){
+		mkdir(lexName, 0777);
+	}
+
+	/* lexPull and fLexPush build their file paths from this name */
+	strcpy(RIVKey.lexName, lexName);
+
+	/* on any of these signals the cache is dumped before the signal's
+	 * default action is taken (see signalSecure) */
+	struct sigaction dumpOnSignal = {0};
+	dumpOnSignal.sa_flags = SA_SIGINFO;
+	dumpOnSignal.sa_sigaction = signalSecure;
+	int signum = 1;
+	while(signum < 27){
+		sigaction(signum, &dumpOnSignal, NULL);
+		signum++;
+	}
+
+	/* zero every slot of the dense RIV cache so it starts out empty */
+	memset(RIVKey.RIVCache, 0, sizeof(denseRIV*)*CACHESIZE);
+}
+/* lexClose flushes the RIV cache to the lexicon files via cacheDump and
+ * reports on stdout if any vector could not be written */
+void lexClose(){
+	int failedWrites = cacheDump();
+	if(failedWrites != 0){
+		puts("cache dump failed, some lexicon data was lost");
+	}
+}
+#if CACHESIZE > 0
+/* cacheCheckOnPull looks for word in the RIV cache.
+ * the slot is chosen by seeding rand() with wordtoSeed(word) and taking
+ * the first value modulo CACHESIZE (this reseeds the global generator).
+ * returns the cached vector when the slot holds an entry with this exact
+ * name, NULL otherwise.
+ */
+denseRIV* cacheCheckOnPull(char* word){
+	srand(wordtoSeed(word));
+	int slot = rand()%CACHESIZE;
+
+	denseRIV* resident = RIVKey.RIVCache[slot];
+	if(!resident){
+		/* nothing cached in this slot */
+		return NULL;
+	}
+	if(strcmp(word, resident->name) != 0){
+		/* slot is taken by a different word */
+		return NULL;
+	}
+	/* word is cached, hand back the cached vector */
+	return resident;
+}
+#endif
+/* lexPull returns the dense vector stored under word.
+ * order of lookup: the cache (when CACHESIZE > 0), then the file
+ * "<lexName>/<word>"; if neither exists a zeroed vector is allocated.
+ * the returned vector's name is set to word.
+ * returns NULL only if memory for the vector could not be allocated.
+ */
+denseRIV* lexPull(char* word){
+	denseRIV* output;
+	
+	#if CACHESIZE > 0
+
+	/* if there is a cache, first check if the word is cached */
+	if((output = cacheCheckOnPull(word))){
+		return output;
+	}
+	#endif /* CACHESIZE > 0 */
+
+	/* if not, attempt to pull the word data from lexicon file */
+	char pathString[200];
+	FILE *lexWord = NULL;
+
+	/* bounded formatting: a path that does not fit is treated like a
+	 * word with no file instead of overrunning pathString */
+	int pathLen = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, word);
+	if(pathLen >= 0 && (size_t)pathLen < sizeof pathString){
+		lexWord = fopen(pathString, "rb");
+	}
+
+	/* if this lexicon file already exists */
+	if(lexWord){
+		/* pull data from file */
+		output = fLexPull(lexWord);
+		fclose(lexWord);
+	}else{
+		/*if file does not exist, return a 0 vector (word is new to the lexicon) */ //#TODO enable NO-NEW features to protect mature lexicons? 
+		output = calloc(1, sizeof(denseRIV));
+	}
+
+	if(!output){
+		/* out of memory: nothing to name, let the caller decide */
+		return NULL;
+	}
+
+	strcpy(output->name, word);
+	return output;
+}
+#if CACHESIZE > 0
+/* cacheCheckOnPush decides whether RIVout lives in the cache.
+ * returns 1 when RIVout is in the cache after the call (it already was,
+ * its slot was empty, or it displaced a less frequent entry, which is
+ * written out with fLexPush).  returns 0 when the slot's current entry
+ * is at least as frequent, in which case RIVout is left untouched.
+ */
+int cacheCheckOnPush(denseRIV* RIVout){
+	/* if our RIV was cached already, no need to play with it */
+	if(RIVout->cached){
+		return 1;
+	}
+
+	/* same slot selection as cacheCheckOnPull */
+	srand(wordtoSeed(RIVout->name));
+	int slot = rand()%CACHESIZE;
+
+	denseRIV* occupant = RIVKey.RIVCache[slot];
+	if(occupant){
+		/* slot is taken: only a more frequent word may displace it */
+		if(!(RIVout->frequency > occupant->frequency)){
+			return 0;
+		}
+		/* push the lower frequency cache entry to a file */
+		fLexPush(occupant);
+	}
+
+	/* the slot is free (or has just been vacated): cache this vector */
+	RIVKey.RIVCache[slot] = RIVout;
+	RIVKey.RIVCache[slot]->cached = 1;
+	return 1;
+}
+#endif
+/* lexPush stores RIVout in the lexicon.
+ * with a cache (CACHESIZE > 0) the vector is first offered to the cache;
+ * if the cache keeps it, nothing is written and 0 is returned.
+ * otherwise the vector goes straight to its file and the result of
+ * fLexPush is returned.
+ */
+int lexPush(denseRIV* RIVout){
+	#if CACHESIZE > 0
+	int keptInCache = cacheCheckOnPush(RIVout);
+	if(keptInCache){
+		return 0;
+	}
+	#endif /* CACHESIZE > 0 */
+
+	/* not cached: write directly to the lexicon file */
+	return fLexPush(RIVout);
+}
+/* fLexPush writes *output to the file "<lexName>/<name>" and frees it.
+ * file layout: value count (size_t, 0 marks a dense record), frequency,
+ * contextSize, magnitude, then either count locations followed by count
+ * values (sparse) or RIVSIZE values (dense).
+ * returns 0 on success; output has been freed and must not be used again.
+ * returns 1 if the file could not be opened (or its path does not fit the
+ * path buffer); output is NOT freed in that case.
+ */
+int fLexPush(denseRIV* output){	
+	char pathString[200] = {0};
+	FILE *lexWord = NULL;
+
+	/* word data will be placed in a (new?) file under the lexicon directory
+	 * in a file named after the word itself; formatting is bounded so an
+	 * over-long name fails cleanly instead of overrunning pathString */
+	int pathLen = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, output->name);
+	if(pathLen >= 0 && (size_t)pathLen < sizeof pathString){
+		lexWord = fopen(pathString, "wb");
+	}
+
+	if(!lexWord){
+		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
+		return 1;
+	}
+
+	/* work on *output directly; a by-value copy of the whole vector is
+	 * not needed just to read its fields */
+	sparseRIV temp = consolidateD2S(output->values);
+
+	/* less than half saturated: smaller stored as sparse vector */
+	int storeSparse = temp.count<(RIVSIZE/2);
+	if(!storeSparse){
+		/* a count of 0 is the flag fLexPull uses to recognise dense data */
+		temp.count = 0;
+	}
+
+	/* header is identical for both storage forms */
+	fwrite(&temp.count, 1, sizeof(size_t), lexWord);
+	fwrite(&output->frequency, 1, sizeof(int), lexWord);
+	fwrite(&output->contextSize, 1, sizeof(int), lexWord);
+	fwrite(&output->magnitude, 1, sizeof(float), lexWord);
+
+	if(storeSparse){
+		fwrite(temp.locations, temp.count, sizeof(int), lexWord);
+		fwrite(temp.values, temp.count, sizeof(int), lexWord);
+	}else{
+		/* saturation is too high, better to store dense */
+		fwrite(output->values, RIVSIZE, sizeof(int), lexWord);
+	}
+
+	fclose(lexWord);
+	free(output);
+	free(temp.locations);
+
+	return 0;
+}
+
+/* fLexPull reads one vector from an open lexicon file (layout as written
+ * by fLexPush) into a newly allocated, zeroed denseRIV.
+ * returns the vector with cached == 0, or NULL if it could not be
+ * allocated.  if the sparse payload is implausible (count larger than
+ * RIVSIZE), cannot be buffered, or is cut short, a message is printed and
+ * the values are left at zero rather than adding garbage to the vector.
+ */
+denseRIV* fLexPull(FILE* lexWord){
+	denseRIV *output = calloc(1,sizeof(denseRIV));
+	if(!output){
+		return NULL;
+	}
+	size_t typeCheck = 0;
+	/* get metadata for vector */
+	fread(&typeCheck, 1, sizeof(size_t), lexWord);
+	fread(&output->frequency, 1, sizeof(int), lexWord);
+	fread(&output->contextSize, 1, sizeof(int), lexWord);
+	fread(&output->magnitude, 1, sizeof(float), lexWord);
+
+	/* first value stored is the value count if sparse, and 0 if dense */
+	if (typeCheck){
+		/* pull as sparseVector */
+		sparseRIV temp = {0};
+		/* value was not 0, so it's the value count */
+		temp.count = typeCheck;
+
+		/* a vector cannot hold more non-zero entries than it has slots */
+		if(typeCheck > (size_t)RIVSIZE){
+			puts("lexicon file has an impossible value count, vector data ignored");
+			return output;
+		}
+
+		/* one buffer: locations first, values right behind them */
+		temp.locations = (int*)malloc(typeCheck*2*sizeof(int));
+		if(!temp.locations){
+			puts("out of memory reading lexicon file, vector data ignored");
+			return output;
+		}
+		temp.values = temp.locations+typeCheck;
+
+		size_t gotLocations = fread(temp.locations, sizeof(int), typeCheck, lexWord);
+		size_t gotValues = fread(temp.values, sizeof(int), typeCheck, lexWord);
+
+		/* only a complete payload is merged into the (zeroed) vector */
+		if(gotLocations == typeCheck && gotValues == typeCheck){
+			addS2D(output->values, temp);
+		}else{
+			puts("lexicon file is truncated, vector data ignored");
+		}
+		free(temp.locations);
+	}else{
+		/* typecheck is thrown away, just a flag in this case */
+		fread(output->values, RIVSIZE, sizeof(int), lexWord);
+	}
+
+
+	output->cached = 0;
+
+	return output;
+
+}
+
+
+
+/* cacheDump writes every cached vector to its lexicon file.
+ * fLexPush frees a vector once it has been written, so the slot is
+ * emptied after a successful push; otherwise a later dump (for example
+ * the signal handler followed by lexClose) would push and free the same
+ * pointer again.  entries whose push failed stay in the cache.
+ * returns the number of failed pushes (0 when everything was written).
+ */
+int cacheDump(){
+
+	int flag = 0;
+
+	for(int i = 0; i < CACHESIZE; i++){
+		if(!RIVKey.RIVCache[i]){
+			continue;
+		}
+
+		int failed = fLexPush(RIVKey.RIVCache[i]);
+		if(failed){
+			flag += failed;
+		}else{
+			/* vector was written and freed: forget the stale pointer */
+			RIVKey.RIVCache[i] = NULL;
+		}
+	}
+	return flag;
+}
+
+
+/*TODO add a simplified free function*/
+/* signalSecure is the handler lexOpen installs: it dumps the cache,
+ * reports the outcome on stdout, then restores the default action for
+ * signum and re-sends the signal to this process.  si and arg are unused.
+ * NOTE(review): cacheDump and puts are not async-signal-safe functions;
+ * confirm this is acceptable for the signals handled here.
+ */
+void signalSecure(int signum, siginfo_t *si, void* arg){
+  int failedWrites = cacheDump();
+  if(!failedWrites){
+	puts("cache dumped successfully");
+  }else{
+	  puts("cache dump failed, some lexicon data lost");
+  }
+  /* hand the signal back to its default behavior */
+  signal(signum, SIG_DFL);
+  kill(getpid(), signum);
+}
+
+#endif
--- a/RIVread.c
+++ b/RIVread.c
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#define CACHESIZE 15000
-#include <setjmp.h>
-#include <signal.h>
-#include "RIVtools.h"
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <dirent.h>
-#include <error.h>
-
-void fileGrind(FILE* textFile);
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
-void directoryGrind(char *rootString);
-void readdirContingency(int sigNumber);
-
-jmp_buf readdirRecov;
-int main(int argc, char *argv[]){
-	clock_t begintotal = clock();
-	lexOpen("/home/drbob/Documents/lexicon");
-	char pathString[1000];
-	strcpy(pathString, argv[1]);
-	strcat(pathString, "/");
-	struct stat st = {0};
-	if(stat(pathString, &st) == -1) {
-		return 1;
-	}
-	
-	directoryGrind(pathString);
-
-	clock_t endtotal = clock();
-	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
-	printf("total time:%lf\n\n", time_spent);
-	lexClose();
-	return 0;
-}
-
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
-	denseRIV *denseSet_slider = denseSet;
-	denseRIV *dense_stop = denseSet+RIVCount;
-
-
-
-	while(denseSet_slider<dense_stop){
-		addS2D((*denseSet_slider).values, additive);
-		*(denseSet_slider->contextSize) += additive.frequency;
-		denseSet_slider++;
-
-	}
-	
-}
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
-	denseRIV* RIVStop = RIVSet+wordCount;
-	while(RIVSet<RIVStop){
-		if(!strcmp(word, RIVSet->name)){
-			return 1;
-		}
-		RIVSet++;
-	}
-	return 0;
-}
-void directoryGrind(char *rootString){
-
-	char pathString[2000];
-	DIR *directory;
-	struct dirent *files = 0;
-
-	if(!(directory = opendir(rootString))){
-		printf("location not found, %s\n", rootString);
-		return;
-	}
-
-	while((files=readdir(directory))){
-		if(setjmp(readdirRecov)){
-			continue;
-		}
-
-		//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
-		while(*(files->d_name)=='.'){
-			files = readdir(directory);
-		}
-		//signal(SIGSEGV, signalSecure);
-
-		if(files->d_type == DT_DIR){
-			strcpy(pathString, rootString);
-
-			strcat(pathString, files->d_name);
-			strcat(pathString, "/");
-			directoryGrind(pathString);
-		}
-		strcpy(pathString, rootString);
-		strcat(pathString, files->d_name);
-		printf("%s\n", pathString);
-		FILE *input = fopen(pathString, "r+");
-		if(input){
-			fileGrind(input);
-			fclose(input);
-		}
-	}
-}
-
-void fileGrind(FILE* textFile){
-	sparseRIV aggregateRIV = fileToL2Clean(textFile);
-	fseek(textFile, 0, SEEK_SET);
-
-	int wordCount = 0;
-	denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
-	char word[200];
-
-	while(fscanf(textFile, "%99s", word)){
-
-		if(feof(textFile)) break;
-		if(!(*word))continue;
-
-		if(!isWordClean((char*)word)){
-			continue;
-		}
-		if(checkDupe(RIVArray, word, wordCount)){
-			continue;
-		}
-		RIVArray[wordCount] = lexPull(word);
-
-		if(!*((RIVArray[wordCount].name))) break;
-
-		*(RIVArray[wordCount].frequency)+= 1;;
-		//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
-
-		wordCount++;
-
-	}
-	//printf("%d\n", wordCount);
-
-	addS2Ds(RIVArray, aggregateRIV, wordCount);
-	denseRIV* RIVArray_slider = RIVArray;
-	denseRIV* RIVArray_stop = RIVArray+wordCount;
-	while(RIVArray_slider<RIVArray_stop){
-
-		lexPush(*RIVArray_slider);
-		RIVArray_slider++;
-	}
-	free(RIVArray);
-	free(aggregateRIV.locations);
-
-}
-void readdirContingency(int sigNumber){
-	puts("readdir segfaulted, trying to recover");
-	longjmp(readdirRecov, 1);
-
-}
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <error.h>
+#include "../../RIVtools.h"
+
+//this program reads a directory full of files, and adds all context vectors (considering file as context)
+//to all words found in these files. this is used to create a lexicon, or add to an existing one
+
+void fileGrind(FILE* textFile);
+void addContext(denseRIV* lexRIV, sparseRIV context);
+void directoryGrind(char *rootString);
+
+
+//entry point: argv[1] is the root directory whose .txt files are read into
+//the lexicon stored in the directory "lexicon"
+//returns 0 on success, 1 if the argument is missing, too long, or does not exist
+int main(int argc, char *argv[]){
+	char pathString[1000];
+
+	//without a root directory there is nothing to scan (argv[1] would be NULL)
+	if(argc < 2){
+		printf("usage: RIVread <root directory>\n");
+		return 1;
+	}
+
+	//we open the lexicon, if it does not yet exist, it will be created
+	lexOpen("lexicon");
+	
+	//we format the root directory, preparing to scan its contents
+	//(bounded, so an over-long argument cannot overrun pathString)
+	int pathLen = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
+	if(pathLen < 0 || (size_t)pathLen >= sizeof pathString){
+		printf("directory path is too long\n");
+		return 1;
+	}
+
+	//ensure that the targeted root directory exists
+	struct stat st;
+	if(stat(pathString, &st) == -1) {
+		printf("directory doesn't seem to exist");
+		return 1;
+	}
+	//we will scan the directory, adding all data to our lexicon, as seen inside
+	directoryGrind(pathString);
+
+	//we close the lexicon again, ensuring all data is secured
+	lexClose();
+	return 0;
+}
+
+//mostly a standard recursive Dirent-walk
+//recursive directory walk: every ".txt" file below rootString is passed to fileGrind
+//rootString must end in '/', entry names are appended to it directly
+//entries whose name starts with '.' (".", ".." and hidden files) are skipped
+void directoryGrind(char *rootString){
+	/* *** begin Dirent walk *** */
+	char pathString[2000];
+	DIR *directory;
+	struct dirent *files = 0;
+
+	if(!(directory = opendir(rootString))){
+		printf("location not found, %s\n", rootString);
+		return;
+	}
+
+	while((files=readdir(directory))){
+		
+		if(!files->d_name[0]) break;
+
+		//skip dot entries one at a time; the loop condition handles the
+		//end of the directory, so a trailing dot entry cannot leave us
+		//holding a NULL dirent
+		if(files->d_name[0] == '.') continue;
+
+		//build "<root><name>" (plus '/' for directories) without overrunning pathString
+		int isDirectory = (files->d_type == DT_DIR);
+		int pathLen = snprintf(pathString, sizeof pathString, "%s%s%s",
+		                       rootString, files->d_name, isDirectory ? "/" : "");
+		if(pathLen < 0 || (size_t)pathLen >= sizeof pathString){
+			printf("skipped: %s\n", files->d_name);
+			continue;
+		}
+
+		if(isDirectory){
+			directoryGrind(pathString);
+			continue;
+		}
+
+		printf("%s\n", pathString);
+/* *** end dirent walk, begin meat of function  *** */
+		
+		//check for non-txt files (a path shorter than the suffix cannot match)
+		size_t length = (size_t)pathLen;
+		if(length < 4 || strcmp(pathString+length-4, ".txt")){
+			printf("skipped: %s\n", files->d_name); 
+			continue;
+		}
+		
+		//open a file within root directory
+		FILE *input = fopen(pathString, "r");
+		if(input){
+			//process this file and add its data to lexicon
+			fileGrind(input);
+			
+			fclose(input);
+		}
+	}
+
+	//release the directory stream; deep trees would otherwise run out of descriptors
+	closedir(directory);
+}
+
+//form context vector from contents of file, then add that vector to
+//all lexicon entries of the words contained
+//form a context vector from the contents of textFile, then add that vector
+//to the lexicon entry of every clean word read from the file
+//each word is pulled, updated and pushed back one at a time
+void fileGrind(FILE* textFile){
+	//form a context vector.  "clean" indicates that it will ignore any word which
+	//contains unwanted characters
+	sparseRIV contextVector = fileToL2Clean(textFile);
+
+	//NOTE(review): nothing here rewinds textFile after fileToL2Clean has read it;
+	//the loop below only sees words if fileToL2Clean leaves the stream at the
+	//start of the file -- confirm
+
+	char word[100] = {0};
+
+	//fscanf returns 1 for each word read and EOF (non-zero!) at end of file or on
+	//a read error, so compare against 1: this ends the loop in both cases and
+	//still processes a final word that is not followed by whitespace
+	while(fscanf(textFile, "%99s", word) == 1){
+
+		//we ensure that each word is free of unwanted characters
+		if(!isWordClean(word)){
+			continue;
+		}
+
+		//we pull the vector corresponding to each word from the lexicon
+		//if it's a new word, lexPull returns a 0 vector
+		denseRIV* lexiconRIV = lexPull(word);
+		if(!lexiconRIV){
+			continue;
+		}
+
+		//we add the context of this file to this wordVector
+		addContext(lexiconRIV, contextVector);
+		
+		//we remove the sub-vector corresponding to the word itself
+		subtractThisWord(lexiconRIV);
+		
+		//we log that this word has been encountered one more time
+		lexiconRIV->frequency += 1;
+		
+		//and finally we push it back to the lexicon for permanent storage
+		//(the vector must not be touched after this, lexPush may free it)
+		lexPush(lexiconRIV);
+		
+	}
+	free(contextVector.locations);
+}
+
+//merge one context vector into a lexicon vector:
+//the sparse context is added onto lexRIV's dense values with addS2D, and
+//lexRIV's running contextSize grows by the contextSize of the added vector
+void addContext(denseRIV* lexRIV, sparseRIV context){
+	//sparse-onto-dense addition of the context
+	addS2D(lexRIV->values, context);
+
+	//keep track of how much context has been added in total
+	//(not needed for the addition itself, but useful metadata for some analyses)
+	lexRIV->contextSize = lexRIV->contextSize + context.contextSize;
+}
+
+
+	
--- a/RIVtools.h
+++ b/RIVtools.h