updated RIVreads

34c65893 · amberhosen · 9d2c0fed · 34c65893 · 9d2c0fed · 34c65893
Commit 34c65893 authored Apr 06, 2018 by amberhosen
Showing with 600 additions and 661 deletions
RIVLower.h
RIVPACK1/RIVread
RIVPACK1/RIVread.c
RIVPACK1/RIVread1
RIVPACK1/RIVread2
RIVPACK1/RIVread3
RIVPACK1/RIVread4
RIVPACK1/RIVread5
RIVPACK1/RIVread6
RIVPACK1/runscriptUb.sh
RIVPACK1/shittyballs.py
RIVaccessories.h
RIVlexicon.h
RIVread.c
RIVtools.h
--- a/RIVLower.h
+++ b/RIVLower.h
 #ifndef RIVLOWER_H_
 #define RIVLOWER_H_
 #include <stdio.h>
@@ -6,9 +5,8 @@
 #include <string.h>
 #include <signal.h>
 #include <unistd.h>
-#include <math.h>
 #include <sys/stat.h>
-#include <sys/types.h>
+#include "RIVaccessories.h"
 /* RIVSIZE macro defines the dimensionality off the RIVs we will use
 * 25000 is the standard, but can be redefined specifically
 */
@@ -38,7 +36,7 @@
 * that do not use lexpull/push
 */
 #ifndef CACHESIZE
-#define CACHESIZE 20
+#define CACHESIZE 5000
 #endif
 #if CACHESIZE<0
@@ -60,10 +58,9 @@ typedef struct{
 	int *values;
 	int *locations;
 	size_t count;
-	int frequency;
 	double magnitude;
-	int boolean;
 	int contextSize;
+	int frequency;
 }sparseRIV;
 /* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
 * this is rarely the case, but its primary use is for performing vector
@@ -71,12 +68,12 @@ typedef struct{
 * performed between sparse and dense (hetero-arithmetic)
 */
 typedef struct{
+	int cached;
 	char name[100];
-	int* values;
+	int frequency;
-	int* frequency;
 	double magnitude;
-	int cached;
+	int contextSize;
-	int *contextSize;
+	int values[RIVSIZE];
 }denseRIV;
 /*RIVKey, holds global variables used under the hood, primarily for the lexicon
@@ -87,21 +84,9 @@ struct RIVData{
 	int h_tempBlock[TEMPSIZE];
 	int tempSize;
 	char lexName[255];
-	denseRIV RIVCache[CACHESIZE];
+	denseRIV* RIVCache[CACHESIZE];
 }static RIVKey;
-/* lexOpen is called to "open the lexicon", setting up for later calls to
- * lexPush and lexPull. if the lexicon has not been opened before calls
- * to these functions, their behavior can be unpredictable, most likely crashing
- */
-void lexOpen();
-/* lexClose should always be called after the last lex push or lex pull call
- * if the lexicon is left open, some vector data may be lost due to 
- * un-flushed RIV cache
- */
-void lexClose();
 /*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
 * all 0s removed. it does not automatically carry metadata, which must be assigned
 * to a denseRIV after the fact.  often denseRIVs are only temporary, and don't
@@ -109,32 +94,12 @@ void lexClose();
 */
 sparseRIV consolidateD2S(int *denseInput);  //#TODO fix int*/denseRIV confusion
 /* makeSparseLocations must be called repeatedly in the processing of a 
 * file to produce a series of locations from the words of the file
 * this produces an "implicit" RIV which can be used with the mapI2D function
 * to create a denseRIV.
 */
-void makeSparseLocations(unsigned char* word,  int *seeds, size_t seedCount);
+void makeSparseLocations(char* word,  int *seeds, size_t seedCount);
-/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
- * saving it for long-term aggregation.  function is called by "lexPush",
- * which is what users should actually use.  lexPush, unlike fLexPush,
- * has cache logic under the hood for speed and harddrive optimization
- */
-int fLexPush(denseRIV RIVout);
-/* flexPull pulls data directly from a file and converts it (if necessary)
- * to a denseRIV.  function is called by "lexPull" which is what users 
- * should actually use.  lexPull, unlike FlexPull, has cache logic under
- * the hood for speed and harddrive optimization 
- */
-denseRIV fLexPull(FILE* lexWord);
-/* creates a standard seed from the characters in a word, hopefully unique */
-int wordtoSeed(unsigned char* word);
 /* mapI2D maps an "implicit RIV" that is, an array of index values, 
 * arranged by chronological order of generation (as per makesparseLocations)
@@ -147,11 +112,6 @@ int* mapI2D(int *locations, size_t seedCount);
 * to be more than worth using
 */
 int* addS2D(int* destination, sparseRIV input);
-/*
-sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
-sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
-* consolidate I2S is temporarily deprecated.  may be brought back.
-* in tandem they are much faster, but less careful with RAM */
 /* caheDump flushes the RIV cache out to relevant files, backing up all 
 * data.  this is called by the lexClose and signalSecure functions
@@ -163,11 +123,9 @@ int cacheDump();
 */
 int* addI2D(int* destination, int* locations, size_t seedCount);
-/* allocates a denseRIV filled with 0s
+/*subtracts a words vector from its own context.  regularly used in lex building
 */
-denseRIV denseAllocate();
+void subtractThisWord(denseRIV* vector);
-/* redefines signal behavior to protect cached data against seg-faults etc*/
-void signalSecure(int signum);
 /* begin definitions */
 int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
@@ -186,7 +144,6 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
 	return destination;
 }
 int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
 	int *destination = (int*)calloc(RIVSIZE,sizeof(int));
 	int *locations_slider = locations;
@@ -201,7 +158,6 @@ int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination paramete
 		locations_slider++;
 	}
 	return destination;
 }
 int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
@@ -221,50 +177,8 @@ int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix de
 	return destination;
 }
-/*
-sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
-	int *denseTemp = mapI2D(implicit, valueCount);
-	sparseRIV sparseOut = consolidateD2S(denseTemp);
-	free(denseTemp);
-	return sparseOut;
-}
-sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
-	sparseRIV sparseOut;
-	int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
-	int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
-	sparseOut.count = 0;
-	int add = 1;
-	int found;
-	for(int i=0; i<valueCount; i++){
-		found = 0;
-		for(int j=0; j<sparseOut.count; j++){
-			if(implicit[i] == locationsTemp[j]){
-				valuesTemp[i] += add;
-				add *= -1;
-				found = 1;
-			}
-		}
-		if(!found){
-			locationsTemp[sparseOut.count] = implicit[i];
-			valuesTemp[sparseOut.count] = add;
-			sparseOut.count++;
-			add*= -1;
-		}
-	}
-	sparseOut.locations = (int*)malloc(2*sparseOut.count*sizeof(int));
-	sparseOut.values = sparseOut.locations+sparseOut.count;
-	memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
-	memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
-	return sparseOut;
-}*/
 sparseRIV consolidateD2S(int *denseInput){
 	sparseRIV output;
 	output.count = 0;
@@ -305,46 +219,8 @@ sparseRIV consolidateD2S(int *denseInput){
 }
-void lexOpen(char* lexName){
-	/* RIVKey.I2SThreshold = sqrt(RIVSIZE);*/ //deprecate?
-	struct stat st;
-	if (stat(lexName, &st) == -1) {
-		mkdir(lexName, 0777);
-	}	
-	strcpy(RIVKey.lexName, lexName);
-	/* open a slot at least large enough for worst case handling of
-	 * sparse to dense conversion.  may be enlarged by filetoL2 functions */
-	for(int i=1; i<20; i++){
-		signal(i, signalSecure);
-	}
-	/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
-	memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
-}
-void lexClose(){
-	if(cacheDump()){
-		puts("cache dump failed, some lexicon data was lost");
-	}
-}
-int wordtoSeed(unsigned char* word){
-	int i=0;
-	int seed = 0;
-	while(*word){
-		/* left-shift 5 each time *should* make seeds unique to words
-		 * this means letters are taken as characters counted in base 32, which
-		 * should be large enough to hold all english characters plus a few outliers
-		 * */
-		seed += (*(word))<<(i*5);
-		word++;
-		i++;
-	}
-	return seed;
-}
-void makeSparseLocations(unsigned char* word,  int *locations, size_t count){
+void makeSparseLocations(char* word,  int *locations, size_t count){
 	locations+=count;
 	srand(wordtoSeed(word));
 	int *locations_stop = locations+NONZEROS;
@@ -358,128 +234,29 @@ void makeSparseLocations(unsigned char* word,  int *locations, size_t count){
 	return;
 }
-int fLexPush(denseRIV RIVout){	
+sparseRIV* sparseAllocateFormatted(){
-	char pathString[200] = {0};
+	sparseRIV* output = (sparseRIV*)calloc(1, sizeof(sparseRIV));
-	/* word data will be placed in a (new?) file under the lexicon directory
-	 * in a file named after the word itself */
-	sprintf(pathString, "%s/%s", RIVKey.lexName, RIVout.name);
-	FILE *lexWord = fopen(pathString, "wb");
-	if(!lexWord){
-		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
-		return 1;
-	}
-	sparseRIV temp = consolidateD2S(RIVout.values);
-	if(temp.count<(RIVSIZE/2)){
-		/* smaller stored as sparse vector */
-		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
-		fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
-		fwrite(RIVout.contextSize, 1, sizeof(int), lexWord);
-		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
-		fwrite(temp.locations, temp.count, sizeof(int), lexWord);
-		fwrite(temp.values, temp.count, sizeof(int), lexWord);
-	//	printf("%s, writing as sparse, frequency: %d", RIVout.name, *RIVout.frequency);
-	}else{
-		/* saturation is too high, better to store dense */
-		/* there's gotta be a better way to do this */
-		temp.count = 0;
-		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
-		fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
-		fwrite(RIVout.contextSize, 1, sizeof(int), lexWord);
-		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
-		fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
-	//	printf("%s, writing as dense, frequency: %d", RIVout.name, *RIVout.frequency);
-	}
-	fclose(lexWord);
-	free(RIVout.values);
-	free(temp.locations);
-	return 0;
-}
-denseRIV fLexPull(FILE* lexWord){
-	denseRIV output = denseAllocate();
-	size_t typeCheck;
-	int flag = 0;
-	/* get metadata for vector */
-	flag+= fread(&typeCheck, 1, sizeof(size_t), lexWord);
-	flag+= fread(output.frequency, 1, sizeof(int), lexWord);
-	flag+= fread(output.contextSize, 1, sizeof(int), lexWord);
-	flag+= fread(&(output.magnitude), 1, sizeof(float), lexWord);
-	/* first value stored is the value count if sparse, and 0 if dense */
-	if (typeCheck){
-		/* pull as sparseVector */
-		sparseRIV temp;
-		/* value was not 0, so it's the value count */
-		temp.count = typeCheck;
-		temp.locations = (int*)malloc(temp.count*2*sizeof(int));
-		temp.values = temp.locations+temp.count;
-		flag+= fread(temp.locations, temp.count, sizeof(int), lexWord);
-		flag+=fread(temp.values, temp.count, sizeof(int), lexWord);
-		addS2D(output.values, temp);
-		free(temp.locations);
-	}else{
-		/* typecheck is thrown away, just a flag in this case */
-		flag += fread(output.values, RIVSIZE, sizeof(int), lexWord);
-	}
-	output.cached = 0;
 	return output;
-}
-void signalSecure(int signum){
-  if(cacheDump()){
-	  puts("cache dump failed, some lexicon data lost");
-  }else{
-	puts("cache dumped successfully");
-  }
-  signal(signum, SIG_DFL);
-  exit(1);
 }
+void subtractThisWord(denseRIV* vector){
-int cacheDump(){
+	//set the rand() seed to the word
+	srand(wordtoSeed(vector->name));
-	int flag = 0;
+	/* the base word vector is composed of NONZERO (always an even number)
-	denseRIV* cache_slider = RIVKey.RIVCache;
+	 * +1s and -1s at "random" points (defined by the above seed.
-	denseRIV* cache_stop = RIVKey.RIVCache+CACHESIZE;
+	 * if we invert it to -1s and +1s, we have subtraction */
-	while(cache_slider<cache_stop){
-		if((*cache_slider).cached){
+	for(int i = 0; i < NONZEROS; i+= 2){
+		vector->values[rand()%RIVSIZE] -= 1;
-			flag += fLexPush(*cache_slider);
+		vector->values[rand()%RIVSIZE] += 1;	
-		}
-		else{
 	}
-		cache_slider++;
+	/* record a context size 1 smaller */
-	}
+	vector->contextSize-= 1;
-	return flag;
-}
-denseRIV denseAllocate(){
-	/* allocates a 0 vector */
-	denseRIV output;
-	output.values = (int*)calloc(RIVSIZE+2, sizeof(int));
-	/* for compact memory use, frequency is placed immediately after values */
-	output.frequency = output.values+RIVSIZE;
-	output.contextSize = output.frequency+1;
-	output.magnitude = 0;
-	output.cached = 0;
-	return output;
 }
-/*TODO add a simplified free function*/
 #endif
--- a/RIVPACK1/RIVread
+++ b/RIVPACK1/RIVread
--- a/RIVPACK1/RIVread.c
+++ b/RIVPACK1/RIVread.c
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#define CACHESIZE 15000
-#define RIVSIZE 50000
-#define NONZEROS 8
-#include <setjmp.h>
-#include <signal.h>
-#include "../RIVet/RIVtools.h"
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <dirent.h>
 #include <error.h>
+#define RIVSIZE 200000
+#define NONZEROS 2
+#define CACHESIZE 1000
+#include "../RIVtools.h"
+//this program reads a directory full of files, and adds all context vectors (considering file as context)
+//to all words found in these files. this is used to create a lexicon, or add to an existing one
 void fileGrind(FILE* textFile);
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
+void addContext(denseRIV* lexRIV, sparseRIV context);
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
 void directoryGrind(char *rootString);
-void readdirContingency(int sigNumber);
-jmp_buf readdirRecov;
 int main(int argc, char *argv[]){
-	clock_t begintotal = clock();
-	lexOpen("/home/drbob/Documents/lexicon8-50");
 	char pathString[1000];
+	//we open the lexicon, if it does not yet exist, it will be created
+	lexOpen("lexicon200-2");
+	//we format the root directory, preparing to scan its contents
 	strcpy(pathString, argv[1]);
 	strcat(pathString, "/");
-	struct stat st = {0};
+	//ensure that the targeted root directory exists
+	struct stat st;
 	if(stat(pathString, &st) == -1) {
+		printf("directory doesn't seem to exist");
 		return 1;
 	}
+	//we will scan the directory, adding all data to our lexicon, as seen inside
 	directoryGrind(pathString);
-	clock_t endtotal = clock();
+	//we close the lexicon again, ensuring all data is secured
-	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
-	printf("total time:%lf\n\n", time_spent);
 	lexClose();
 	return 0;
 }
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
+//mostly a standard recursive Dirent-walk
-	denseRIV *denseSet_slider = denseSet;
-	denseRIV *dense_stop = denseSet+RIVCount;
-	while(denseSet_slider<dense_stop){
-		addS2D((*denseSet_slider).values, additive);
-		*(denseSet_slider->contextSize) += additive.frequency;
-		denseSet_slider++;
-	}
-}
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
-	denseRIV* RIVStop = RIVSet+wordCount;
-	while(RIVSet<RIVStop){
-		if(!strcmp(word, RIVSet->name)){
-			return 1;
-		}
-		RIVSet++;
-	}
-	return 0;
-}
 void directoryGrind(char *rootString){
+	/* *** begin Dirent walk *** */
 	char pathString[2000];
 	DIR *directory;
 	struct dirent *files = 0;
@@ -76,15 +57,13 @@ void directoryGrind(char *rootString){
 	}
 	while((files=readdir(directory))){
-		if(setjmp(readdirRecov)){
-			continue;
-		}
-		//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
+		if(!files->d_name[0]) break;
 		while(*(files->d_name)=='.'){
 			files = readdir(directory);
 		}
-		//signal(SIGSEGV, signalSecure);
 		if(files->d_type == DT_DIR){
 			strcpy(pathString, rootString);
@@ -92,63 +71,87 @@ void directoryGrind(char *rootString){
 			strcat(pathString, files->d_name);
 			strcat(pathString, "/");
 			directoryGrind(pathString);
+			continue;
 		}
 		strcpy(pathString, rootString);
 		strcat(pathString, files->d_name);
 		printf("%s\n", pathString);
-		FILE *input = fopen(pathString, "r+");
+/* *** end dirent walk, begin meat of function  *** */
+		//check for non-txt files
+		char *fileEnding = pathString+strlen(pathString)-4;
+		if(strcmp(fileEnding, ".txt")){
+			printf("skipped: %s\n", files->d_name); 
+			continue;
+		}
+		//open a file within root directory
+		FILE *input = fopen(pathString, "r");
 		if(input){
+			//process this file and add it's data to lexicon
 			fileGrind(input);
 			fclose(input);
 		}
 	}
 }
+//form context vector from contents of file, then add that vector to
+//all lexicon entries of the words contained
 void fileGrind(FILE* textFile){
-	sparseRIV aggregateRIV = fileToL2Clean(textFile);
+	//form a context vector.  "clean" indicates that it will ignore any word which
-	fseek(textFile, 0, SEEK_SET);
+	//contains unwanted characters
+	sparseRIV contextVector = fileToL2Clean(textFile);
-	int wordCount = 0;
+	//an array of denseRIVs, large enough to hold all vectors 
-	denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
+	//(we don't yet know how many vectors there will be, so we make it big enough for the  maximum)
-	char word[200];
+	denseRIV* lexiconRIV;
+	char word[100] = {0};
 	while(fscanf(textFile, "%99s", word)){
+		//we ensure that each word exists, and is free of unwanted characters
 		if(feof(textFile)) break;
 		if(!(*word))continue;
 		if(!isWordClean((char*)word)){
 			continue;
 		}
-		if(checkDupe(RIVArray, word, wordCount)){
-			continue;
-		}
-		RIVArray[wordCount] = lexPull(word);
-		if(!*((RIVArray[wordCount].name))) break;
-		*(RIVArray[wordCount].frequency)+= 1;;
+		//we pull the vector corresponding to each word from the lexicon
-		//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
+		//if it's a new word, lexPull returns a 0 vector
+		lexiconRIV= lexPull(word);
-		wordCount++;
+		//we add the context of this file to this wordVector
+		addContext(lexiconRIV, contextVector);
-	}
+		//we remove the sub-vector corresponding to the word itself
-	//printf("%d\n", wordCount);
+		subtractThisWord(lexiconRIV);
-	addS2Ds(RIVArray, aggregateRIV, wordCount);
+		//we log that this word has been encountered one more time
-	denseRIV* RIVArray_slider = RIVArray;
+		lexiconRIV->frequency += 1;
-	denseRIV* RIVArray_stop = RIVArray+wordCount;
-	while(RIVArray_slider<RIVArray_stop){
-		lexPush(*RIVArray_slider);
+		//and finally we push it back to the lexicon for permanent storage
-		RIVArray_slider++;
+		lexPush(lexiconRIV);
-	}
-	free(RIVArray);
-	free(aggregateRIV.locations);
+	}
+	free(contextVector.locations);
 }
-void readdirContingency(int sigNumber){
-	puts("readdir segfaulted, trying to recover");
+void addContext(denseRIV* lexRIV, sparseRIV context){
-	longjmp(readdirRecov, 1);
+		//add context to the lexRIV, (using sparse-dense vector comparison)
+		addS2D(lexRIV->values, context);
+		//log the "size" of the vector which was added
+		//this is not directly necessary, but is useful metadata for some analises
+		lexRIV->contextSize += context.contextSize;
 }
--- a/RIVPACK1/RIVread1
+++ b/RIVPACK1/RIVread1
--- a/RIVPACK1/RIVread2
+++ b/RIVPACK1/RIVread2
--- a/RIVPACK1/RIVread3
+++ b/RIVPACK1/RIVread3
--- a/RIVPACK1/RIVread4
+++ b/RIVPACK1/RIVread4
--- a/RIVPACK1/RIVread5
+++ b/RIVPACK1/RIVread5
--- a/RIVPACK1/RIVread6
+++ b/RIVPACK1/RIVread6
--- a/RIVPACK1/runscriptUb.sh
+++ b/RIVPACK1/runscriptUb.sh
@@ -5,15 +5,16 @@ clean(){
 		else
 			python shittyballs.py "$1"
-			./RIVread cleanbooks/
+			./RIVread1 cleanbooks/
-			# ./RIVread1 cleanbooks/
 			./RIVread2 cleanbooks/
-			#./RIVread3 cleanbooks/
+			./RIVread3 cleanbooks/
-			#./RIVread4 cleanbooks/
+			./RIVread4 cleanbooks/
 			./RIVread5 cleanbooks/
 			./RIVread6 cleanbooks/
+			./RIVread7 cleanbooks/
 			rm  -r cleanbooks/
+			#rm "$1"
 		fi
 		shift
 	done
@@ -21,4 +22,4 @@ clean(){
-clean ../bookCleaner/books/*
+clean ../../books/*
--- a/RIVPACK1/shittyballs.py
+++ b/RIVPACK1/shittyballs.py
-import requests
+#import requests
 import re
 import string
 import os
@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn
 import pdb
 from nltk.stem import PorterStemmer
-def adverbFix(word):
-    if not nltk.pos_tag(word)[0][1] == 'RB':
-        return word
-    adjective = word[:-2]
+def writeWord(cleanString, word, stem, blacklist):
-    if not nltk.pos_tag(word)[0][1] == 'JJ':
+    if word == stem:
-        return word;
+        FILE = open("lexicon/" + word, "w")
+        FILE.write("1");
+        FILE.close();
+        return (cleanString + " " + word)
+    elif stem not in blacklist:
+        if len(stem) > 2:
            FILE = open("lexicon/" + word, "w")
-    FILE.write("2" + temp)
+            FILE.write("2"+stem);
-    FILE.close()
+            FILE.close();
-    FILE = open("lexicon/" + adjective, "w")
+            FILE = open("lexicon/" + stem, "w")
            FILE.write("1")
-    FILE.close()
+            FILE.close();
-    return adjective
+            return (cleanString + " " + stem)
+    return cleanString
-def strip(word):
-    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
+def liFix(word):
-        if word.endswith(suffix):
+    if not word[len(word)-2:] == "li":
-            return word[:-len(suffix)]
        return word
+    temp = ps.stem(word[:-2])
+    if temp:
+        return temp
+    return word
 def cleanWord(word):
-    #if(len(word) == 0):
-        #print("\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************")
    word = word.lower();
    regex = re.compile('[^a-z]+')
    word = regex.sub('', word)
@@ -44,13 +50,11 @@ def cleanWord(word):
 def fileCheck(word):
    try:
-        #print("trying")
        wordFile = open("lexicon/{}".format(word), "r")
        code = int(wordFile.read(1))
    except:
-        #print("file does not exist")
        return 0
-    #print("fileCode{}".format(code))
    if code == 2:
        word = wordFile.read()
@@ -74,6 +78,8 @@ def morphyTest(word):
    return morphyTemp;
+#begin mainfunction
 blacklist = ["a", "an", "the", "so", "as", "how",
             "i", "me", "we", "they", "you", "it", "he", "she",
             "but", "have", "had",
@@ -90,13 +96,13 @@ print(sourceString + "\n")
 if not os.path.exists('cleanbooks'):
    os.makedirs('cleanbooks')
-# if not os.path.exists('lexicon'):
+if not os.path.exists('lexicon'):
-#     os.makedirs('lexicon')
+    os.makedirs('lexicon')
 if not os.path.exists(pathString):
    os.makedirs(pathString)
-#call(["python", "blacklist.py"])
+call(["python", "blacklist.py"])
 i=0
 skip = 1
 with open(sourceString, 'U') as fileIn:
@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn:
                for tempWord in line.split():
                    word=cleanWord(tempWord)
                    if not word:
                        continue
+                    if len(word) < 3:
+						continue;
+                    if word in blacklist:
+						continue;
-                    # temp = fileCheck(word)
-                    #
-                    # if temp == -1:
-                    #     continue
-                    # if temp == 0:
-                    temp = morphyTest(word)
-                    if temp:
-                        stem = ps.stem(temp)
-                        if stem and not stem in blacklist:
-                            cleanString = cleanString + ' ' + stem
+                    temp = fileCheck(word)
+                    if temp == -1:
+                        continue
+                    if temp:
+                        cleanString = (cleanString + " " + temp);
+                        continue
+                    else:
+                        morphy = morphyTest(word)
+                        if morphy:
+                            stem = ps.stem(morphy)
+                            if stem:
+				stem = liFix(stem)
+                                cleanString = writeWord(cleanString, word, stem, blacklist)
-                    #if temp == 0:
-                    #    catchAll(word)
                cleanString = cleanString + os.linesep
-            if len(cleanString.split(' ')) > 10:
+            if len(cleanString.split(' ')) > 2:
                fileOut.write(cleanString)
                fileOut.close()

--- a/RIVaccessories.h
+++ b/RIVaccessories.h
 #ifndef RIVACCESS_H_
 #define RIVACCESS_H_
 /*isWordClean filters words that contain non-letter characters, and 
 * upperCase letters, allowing only the '_' symbol through
 */
 int isWordClean(char* word);
 /* used by wordClean */
 int isLetter(char c);
+/* creates a standard seed from the characters in a word, hopefully unique */
+int wordtoSeed(char* word);
 int isLetter(char c){
 	if((c>96 && c<123)||(c == 32) || (c == '_')) return 1;
@@ -26,5 +33,19 @@ int isWordClean(char* word){
 	return 1;
 }
+/* wordtoSeed folds the characters of a NUL-terminated word into an int
+ * seed: the character at position i is shifted left by 5*i bits and all
+ * shifted characters are summed.  short words therefore behave like
+ * numbers written in base 32.  an empty word yields 0.
+ *
+ * the arithmetic is done on unsigned int, so overflow wraps instead of
+ * being undefined, and the shift count is reduced modulo 32 so it can
+ * never reach the width of the type (the old code shifted by 35 or more
+ * from the 8th character on, which is undefined behavior in C).
+ * NOTE(review): the mask assumes a 32-bit int; masking the count is
+ * presumably what x86 builds already did in hardware, so existing seeds
+ * should be unchanged there -- confirm against an existing lexicon.
+ */
+int wordtoSeed(char* word){
+	unsigned int seed = 0;
+	unsigned int shift = 0;
+	while(*word){
+		/* the char is promoted to int before the conversion, so a
+		 * negative char is sign-extended exactly as before */
+		seed += (unsigned int)(*word) << (shift & 31u);
+		shift += 5;
+		word++;
+	}
+	return (int)seed;
+}
 #endif
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
+#ifndef RIV_LEXICON_H
+#define RIV_LEXICON_H
+#include "RIVLower.h"
+#include "RIVaccessories.h"
+/* lexOpen is called to "open the lexicon", setting up for later calls to
+ * lexPush and lexPull. if the lexicon has not been opened before calls
+ * to these functions, their behavior can be unpredictable, most likely crashing
+ */
+void lexOpen();
+/* lexClose should always be called after the last lex push or lex pull call
+ * if the lexicon is left open, some vector data may be lost due to 
+ * un-flushed RIV cache
+ */
+void lexClose();
+/* both lexPush and lexPull must be called *after* the lexOpen() function
+ * and after using them the lexClose() function must be called to ensure
+ * data security */
+/* lexPush writes a denseRIV to the lexicon for permanent storage */
+int lexPush(denseRIV* RIVout);
+int cacheCheckOnPush(denseRIV* RIVout);
+/* lexPull reads a denseRIV from the lexicon, under "word"
+ * if the file does not exist, it creates a 0 vector with the name of word
+ * lexPull returns a denseRIV *pointer* because its data must be tracked 
+ * globally for key optimizations
+ */
+denseRIV* lexPull(char* word);
+denseRIV* cacheCheckOnPull(char* word);
+/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
+ * saving it for long-term aggregation.  function is called by "lexPush",
+ * which is what users should actually use.  lexPush, unlike fLexPush,
+ * has cache logic under the hood for speed and harddrive optimization
+ */
+int fLexPush(denseRIV* RIVout);
+/* flexPull pulls data directly from a file and converts it (if necessary)
+ * to a denseRIV.  function is called by "lexPull" which is what users 
+ * should actually use.  lexPull, unlike FlexPull, has cache logic under
+ * the hood for speed and harddrive optimization 
+ */
+denseRIV* fLexPull(FILE* lexWord);
+/* redefines signal behavior to protect cached data against seg-faults etc*/
+void signalSecure(int signum, siginfo_t *si, void* arg);
+/* begin definitions */
+/* lexOpen prepares the global RIVKey state for lexPush/lexPull:
+ *  - creates the lexicon directory if stat() cannot find it
+ *  - records the directory name in RIVKey.lexName
+ *  - installs signalSecure as the handler for signals 1 through 26
+ *  - empties the RIV cache
+ * lexName must fit in RIVKey.lexName; a longer name is rejected with a
+ * message and the lexicon state is left untouched (the old strcpy would
+ * have written past the end of the buffer).
+ */
+void lexOpen(char* lexName){
+	if(strlen(lexName) >= sizeof(RIVKey.lexName)){
+		printf("lexicon name is too long: %s\n", lexName);
+		return;
+	}
+	struct stat st = {0};
+	if (stat(lexName, &st) == -1) {
+		mkdir(lexName, 0777);
+	}
+	/* bounded copy; the length was checked above so nothing is truncated */
+	snprintf(RIVKey.lexName, sizeof(RIVKey.lexName), "%s", lexName);
+	/* route signals through signalSecure so the cache is flushed before
+	 * the process dies.  sigaction() simply fails for signals that can
+	 * not be caught, and that result is ignored here.
+	 * NOTE(review): the range also covers signals whose default action
+	 * does not terminate the process -- confirm that is intended */
+	struct sigaction action = {0};
+	action.sa_sigaction = signalSecure;
+	action.sa_flags = SA_SIGINFO;
+	for(int i=1; i<27; i++){
+		sigaction(i,&action,NULL);
+	}
+	/* start with an empty cache: every slot is a NULL pointer */
+	memset(RIVKey.RIVCache, 0, sizeof(denseRIV*)*CACHESIZE);
+}
+/* lexClose flushes the RIV cache through cacheDump() and prints a
+ * warning when cacheDump reports a non-zero (failure) result.
+ */
+void lexClose(){
+	int failedPushes = cacheDump();
+	if(failedPushes != 0){
+		puts("cache dump failed, some lexicon data was lost");
+	}
+}
+#if CACHESIZE > 0
+/* cacheCheckOnPull looks for word in the RIV cache.
+ * the slot index is the first rand() value after seeding with
+ * wordtoSeed(word), taken modulo CACHESIZE (this reseeds the global
+ * rand() state as a side effect).
+ * returns the cached vector when the slot is occupied by a vector with
+ * exactly this name, and NULL otherwise.
+ */
+denseRIV* cacheCheckOnPull(char* word){
+	srand(wordtoSeed(word));
+	denseRIV* candidate = RIVKey.RIVCache[rand()%CACHESIZE];
+	/* empty slot: nothing cached here */
+	if(candidate == NULL){
+		return NULL;
+	}
+	/* slot is held by a different word */
+	if(strcmp(word, candidate->name) != 0){
+		return NULL;
+	}
+	return candidate;
+}
+#endif
+/* lexPull returns the denseRIV stored in the lexicon under word.
+ *  - if the word is cached, the cached vector itself is returned
+ *  - otherwise the file <lexName>/<word> is read through fLexPull
+ *  - if that file cannot be opened, a zeroed vector is allocated
+ *    (the word is new to the lexicon)
+ * in the last two cases the vector's name is set to word.
+ * returns NULL when word does not fit in the name field of a denseRIV
+ * or when no vector could be allocated; the old code overflowed a
+ * buffer or dereferenced NULL in those cases.
+ */
+denseRIV* lexPull(char* word){
+	denseRIV* output;
+	/* the name field must be able to hold the word plus its terminator */
+	if(strlen(word) >= sizeof(output->name)){
+		printf("lexicon pull has failed, word is too long: %s\n", word);
+		return NULL;
+	}
+	#if CACHESIZE > 0
+	/* if there is a cache, first check if the word is cached */
+	if((output = cacheCheckOnPull(word))){
+		return output;
+	}
+	#endif /* CACHESIZE > 0 */
+	/* if not, attempt to pull the word data from lexicon file.
+	 * the buffer holds the longest lexName, a '/', and the longest name */
+	char pathString[sizeof(RIVKey.lexName) + sizeof(output->name) + 1];
+	snprintf(pathString, sizeof(pathString), "%s/%s", RIVKey.lexName, word);
+	FILE *lexWord = fopen(pathString, "rb");
+	if(lexWord){
+		/* this lexicon file already exists, pull data from file */
+		output = fLexPull(lexWord);
+		fclose(lexWord);
+	}else{
+		/* file does not exist, return a 0 vector (word is new to the lexicon) */
+		//#TODO enable NO-NEW features to protect mature lexicons?
+		output = calloc(1, sizeof(denseRIV));
+	}
+	if(!output){
+		return NULL;
+	}
+	snprintf(output->name, sizeof(output->name), "%s", word);
+	return output;
+}
+#if CACHESIZE > 0
+/* cacheCheckOnPush tries to keep RIVout in the RIV cache instead of
+ * writing it to its lexicon file.
+ * returns 1 when the vector is (now) held by the cache, and 0 when the
+ * caller still has to write it out itself.
+ * the slot index is the first rand() value after seeding with
+ * wordtoSeed(RIVout->name), taken modulo CACHESIZE.
+ */
+int cacheCheckOnPush(denseRIV* RIVout){
+	/* already owned by the cache, nothing to do */
+	if(RIVout->cached){
+		return 1;
+	}
+	srand(wordtoSeed(RIVout->name));
+	denseRIV** slot = &RIVKey.RIVCache[rand()%CACHESIZE];
+	if(*slot != NULL){
+		/* the slot is taken: the occupant stays unless the new vector
+		 * is strictly more frequent */
+		if(RIVout->frequency <= (*slot)->frequency){
+			return 0;
+		}
+		/* evict the less frequent occupant to its lexicon file */
+		fLexPush(*slot);
+	}
+	/* the slot was empty or has just been vacated: claim it */
+	*slot = RIVout;
+	RIVout->cached = 1;
+	return 1;
+}
+#endif
+/* lexPush stores RIVout in the lexicon.
+ * when a cache is compiled in (CACHESIZE > 0) the vector is offered to
+ * the cache first; if the cache keeps it, 0 is returned and nothing is
+ * written.  otherwise the vector goes to its file through fLexPush,
+ * whose result is returned.
+ */
+int lexPush(denseRIV* RIVout){
+	#if CACHESIZE > 0
+	int keptInCache = cacheCheckOnPush(RIVout);
+	if(keptInCache){
+		return 0;
+	}
+	#endif /* CACHESIZE > 0 */
+	return fLexPush(RIVout);
+}
+/* fLexPush writes one denseRIV to the file <lexName>/<name> and then
+ * frees the vector.  returns 0 on success; returns 1 (and does NOT free
+ * the vector) when the file cannot be opened.
+ *
+ * file layout, in order:
+ *   size_t  value count (0 marks a dense record)
+ *   int     frequency
+ *   int     contextSize
+ *   float   magnitude
+ *   sparse record: count locations, then count values (ints)
+ *   dense record:  RIVSIZE values (ints)
+ * a vector with no non-zero values is written with count 0 and no
+ * payload at all; fLexPull reads that back as an all-zero vector.
+ *
+ * the vector is used through the pointer (the old code copied the whole
+ * struct, values array included, onto the stack) and the magnitude is
+ * converted to a real float before it is written (the old code wrote
+ * the first sizeof(float) bytes of the double, which is not a usable
+ * number).
+ */
+int fLexPush(denseRIV* output){
+	/* large enough for the longest lexName, a '/', and the longest name */
+	char pathString[sizeof(RIVKey.lexName) + sizeof(output->name) + 1];
+	/* word data will be placed in a (new?) file under the lexicon directory
+	 * in a file named after the word itself */
+	snprintf(pathString, sizeof(pathString), "%s/%s", RIVKey.lexName, output->name);
+	FILE *lexWord = fopen(pathString, "wb");
+	if(!lexWord){
+		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
+		return 1;
+	}
+	sparseRIV temp = consolidateD2S(output->values);
+	/* less than half saturated: smaller when stored as a sparse vector */
+	int storeSparse = temp.count<(RIVSIZE/2);
+	size_t typeCheck = storeSparse ? temp.count : 0;
+	float magnitude = (float)output->magnitude;
+	/* metadata header, identical for both record types */
+	fwrite(&typeCheck, 1, sizeof(size_t), lexWord);
+	fwrite(&output->frequency, 1, sizeof(int), lexWord);
+	fwrite(&output->contextSize, 1, sizeof(int), lexWord);
+	fwrite(&magnitude, 1, sizeof(float), lexWord);
+	if(storeSparse){
+		fwrite(temp.locations, temp.count, sizeof(int), lexWord);
+		fwrite(temp.values, temp.count, sizeof(int), lexWord);
+	}else{
+		/* saturation is too high, better to store dense */
+		fwrite(output->values, RIVSIZE, sizeof(int), lexWord);
+	}
+	fclose(lexWord);
+	free(output);
+	free(temp.locations);
+	return 0;
+}
+/* fLexPull reads one record written by fLexPush from lexWord and
+ * returns it as a newly allocated denseRIV with cached set to 0 (the
+ * name is not stored in the file and is left empty).
+ * returns NULL only when the vector itself cannot be allocated.
+ * a file whose header is incomplete, or whose sparse payload is
+ * missing, oversized or cannot be buffered, yields an all-zero vector
+ * instead of acting on uninitialized data.  a dense record with a short
+ * payload keeps its metadata and gets all-zero values; this is also how
+ * the payload-free "empty vector" record is read back.
+ */
+denseRIV* fLexPull(FILE* lexWord){
+	denseRIV *output = calloc(1,sizeof(denseRIV));
+	if(!output){
+		return NULL;
+	}
+	size_t typeCheck = 0;
+	float magnitude = 0;
+	int *sparseData = NULL;
+	/* get metadata for vector */
+	if(fread(&typeCheck, sizeof(size_t), 1, lexWord) != 1
+	|| fread(&output->frequency, sizeof(int), 1, lexWord) != 1
+	|| fread(&output->contextSize, sizeof(int), 1, lexWord) != 1
+	|| fread(&magnitude, sizeof(float), 1, lexWord) != 1){
+		goto unreadable;
+	}
+	output->magnitude = magnitude;
+	/* first value stored is the value count if sparse, and 0 if dense */
+	if (typeCheck){
+		/* a sparse record can never hold more entries than dimensions */
+		if(typeCheck > RIVSIZE){
+			goto unreadable;
+		}
+		/* locations and values sit back to back in the file, so one
+		 * buffer and one read cover both */
+		sparseData = malloc(typeCheck*2*sizeof(int));
+		if(!sparseData){
+			goto unreadable;
+		}
+		if(fread(sparseData, sizeof(int), typeCheck*2, lexWord) != typeCheck*2){
+			goto unreadable;
+		}
+		sparseRIV temp = {0};
+		temp.count = typeCheck;
+		temp.locations = sparseData;
+		temp.values = sparseData+typeCheck;
+		addS2D(output->values, temp);
+	}else{
+		/* typecheck is thrown away, just a flag in this case */
+		if(fread(output->values, sizeof(int), RIVSIZE, lexWord) != RIVSIZE){
+			memset(output->values, 0, sizeof(output->values));
+		}
+	}
+	free(sparseData);
+	output->cached = 0;
+	return output;
+unreadable:
+	/* nothing trustworthy was read: hand back a clean zero vector */
+	free(sparseData);
+	memset(output, 0, sizeof(*output));
+	return output;
+}
+/* cacheDump writes every cached vector to its lexicon file through
+ * fLexPush and returns the number of pushes that failed (0 when the
+ * whole cache was flushed).
+ * fLexPush frees a vector once it has been written, so the slot is
+ * cleared after a successful push; leaving the stale pointer behind
+ * would turn any later cache lookup or second dump (lexClose followed
+ * by signalSecure, for instance) into a use-after-free or double free.
+ * a vector whose push failed stays in its slot.
+ */
+int cacheDump(){
+	int flag = 0;
+	for(int i = 0; i < CACHESIZE; i++){
+		if(!RIVKey.RIVCache[i]){
+			continue;
+		}
+		if(fLexPush(RIVKey.RIVCache[i])){
+			flag += 1;
+		}else{
+			RIVKey.RIVCache[i] = NULL;
+		}
+	}
+	return flag;
+}
+/*TODO add a simplified free function*/
+/* signalSecure is the handler lexOpen installs with sigaction().
+ * it flushes the RIV cache, reports whether that worked, restores the
+ * default action for the signal and sends the same signal to this
+ * process again so that the default action runs.
+ * si and arg are required by the SA_SIGINFO handler signature and are
+ * not used.
+ * NOTE(review): puts() and the file/heap work inside cacheDump() are not
+ * on the POSIX async-signal-safe list -- confirm this is acceptable.
+ */
+void signalSecure(int signum, siginfo_t *si, void* arg){
+	const char *report = cacheDump()
+		? "cache dump failed, some lexicon data lost"
+		: "cache dumped successfully";
+	puts(report);
+	/* hand the signal back to its default action */
+	signal(signum, SIG_DFL);
+	kill(getpid(), signum);
+}
+#endif
--- a/RIVread.c
+++ b/RIVread.c
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#define CACHESIZE 15000
-#include <setjmp.h>
-#include <signal.h>
-#include "RIVtools.h"
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <dirent.h>
 #include <error.h>
+#include "../../RIVtools.h"
+//this program reads a directory full of files, and adds all context vectors (considering file as context)
+//to all words found in these files. this is used to create a lexicon, or add to an existing one
 void fileGrind(FILE* textFile);
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
+void addContext(denseRIV* lexRIV, sparseRIV context);
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
 void directoryGrind(char *rootString);
-void readdirContingency(int sigNumber);
-jmp_buf readdirRecov;
 int main(int argc, char *argv[]){
-	clock_t begintotal = clock();
-	lexOpen("/home/drbob/Documents/lexicon");
 	char pathString[1000];
+	//we open the lexicon, if it does not yet exist, it will be created
+	lexOpen("lexicon");
+	//we format the root directory, preparing to scan its contents
 	strcpy(pathString, argv[1]);
 	strcat(pathString, "/");
-	struct stat st = {0};
+	//ensure that the targeted root directory exists
+	struct stat st;
 	if(stat(pathString, &st) == -1) {
+		printf("directory doesn't seem to exist");
 		return 1;
 	}
+	//we will scan the directory, adding all data to our lexicon, as seen inside
 	directoryGrind(pathString);
-	clock_t endtotal = clock();
+	//we close the lexicon again, ensuring all data is secured
-	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
-	printf("total time:%lf\n\n", time_spent);
 	lexClose();
 	return 0;
 }
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
+//mostly a standard recursive Dirent-walk
-	denseRIV *denseSet_slider = denseSet;
-	denseRIV *dense_stop = denseSet+RIVCount;
-	while(denseSet_slider<dense_stop){
-		addS2D((*denseSet_slider).values, additive);
-		*(denseSet_slider->contextSize) += additive.frequency;
-		denseSet_slider++;
-	}
-}
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
-	denseRIV* RIVStop = RIVSet+wordCount;
-	while(RIVSet<RIVStop){
-		if(!strcmp(word, RIVSet->name)){
-			return 1;
-		}
-		RIVSet++;
-	}
-	return 0;
-}
 void directoryGrind(char *rootString){
+	/* *** begin Dirent walk: visit every entry under rootString, recursing into subdirectories; entry names are appended directly to rootString, so it must already end in "/" *** */
 	char pathString[2000];
 	DIR *directory;
 	struct dirent *files = 0;
@@ -74,15 +53,13 @@ void directoryGrind(char *rootString){
 	}
 	while((files=readdir(directory))){
-		if(setjmp(readdirRecov)){
-			continue;
-		}
-		//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
+		if(!files->d_name[0]) break; //stop on an entry with an empty name. NOTE(review): the dot-skipping loop right below calls readdir() again without checking for NULL
 		while(*(files->d_name)=='.'){
 			files = readdir(directory);
 		}
-		//signal(SIGSEGV, signalSecure);
 		if(files->d_type == DT_DIR){
 			strcpy(pathString, rootString);
@@ -90,63 +67,87 @@ void directoryGrind(char *rootString){
 			strcat(pathString, files->d_name);
 			strcat(pathString, "/");
 			directoryGrind(pathString);
+			continue; //the subdirectory has been handled by the recursive call; do not also treat it as a file
 		}
 		strcpy(pathString, rootString);
 		strcat(pathString, files->d_name);
 		printf("%s\n", pathString);
-		FILE *input = fopen(pathString, "r+");
+/* *** end dirent walk, begin meat of function  *** NOTE(review): no closedir(directory) appears in the visible part of this function -- confirm the handle is released *** */
+		//check for non-txt files by comparing the last 4 characters of the path with ".txt". NOTE(review): a path shorter than 4 characters makes fileEnding point before the start of pathString
+		char *fileEnding = pathString+strlen(pathString)-4;
+		if(strcmp(fileEnding, ".txt")){
+			printf("skipped: %s\n", files->d_name); 
+			continue;
+		}
+		//open a file within root directory (read-only); files that cannot be opened are silently ignored
+		FILE *input = fopen(pathString, "r");
 		if(input){
+			//process this file and add its data to lexicon
 			fileGrind(input);
 			fclose(input);
 		}
 	}
 }
+//form context vector from contents of file, then add that vector to
+//all lexicon entries of the words contained (once per occurrence of each clean word; there is no duplicate check)
 void fileGrind(FILE* textFile){
-	sparseRIV aggregateRIV = fileToL2Clean(textFile);
+	//form a context vector.  "clean" indicates that it will ignore any word which
-	fseek(textFile, 0, SEEK_SET);
+	//contains unwanted characters; fileToL2Clean rewinds textFile itself, so the word loop below starts again from the beginning
+	sparseRIV contextVector = fileToL2Clean(textFile);
-	int wordCount = 0;
+	//a pointer to the lexicon entry of the word currently being processed
-	denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
+	//(each entry is pulled, updated and pushed back one word at a time; no array of vectors is kept)
-	char word[200];
+	denseRIV* lexiconRIV;
+	char word[100] = {0};
 	while(fscanf(textFile, "%99s", word)){
+		//fscanf returns EOF (non-zero) at end of input, so the feof test is what ends the loop; we then ensure that each word exists, and is free of unwanted characters. NOTE(review): a final word that ends exactly at end-of-file also sets the EOF flag and so appears to be skipped
 		if(feof(textFile)) break;
 		if(!(*word))continue;
 		if(!isWordClean((char*)word)){
 			continue;
 		}
-		if(checkDupe(RIVArray, word, wordCount)){
-			continue;
-		}
-		RIVArray[wordCount] = lexPull(word);
-		if(!*((RIVArray[wordCount].name))) break;
-		*(RIVArray[wordCount].frequency)+= 1;;
+		//we pull the vector corresponding to each word from the lexicon
-		//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
+		//if it is a new word, lexPull returns a 0 vector. NOTE(review): the returned pointer is used without a NULL check
+		lexiconRIV= lexPull(word);
-		wordCount++;
+		//we add the context of this file to this wordVector
+		addContext(lexiconRIV, contextVector);
-	}
+		//we remove the sub-vector corresponding to the word itself
-	//printf("%d\n", wordCount);
+		subtractThisWord(lexiconRIV);
-	addS2Ds(RIVArray, aggregateRIV, wordCount);
+		//we log that this word has been encountered one more time
-	denseRIV* RIVArray_slider = RIVArray;
+		lexiconRIV->frequency += 1;
-	denseRIV* RIVArray_stop = RIVArray+wordCount;
-	while(RIVArray_slider<RIVArray_stop){
-		lexPush(*RIVArray_slider);
+		//and finally we push it back to the lexicon for permanent storage
-		RIVArray_slider++;
+		lexPush(lexiconRIV);
-	}
-	free(RIVArray);
-	free(aggregateRIV.locations);
+	}
+	free(contextVector.locations); //NOTE(review): only locations is freed; presumably values lives in the same allocation, as it does in normalize() -- confirm
 }
-void readdirContingency(int sigNumber){
-	puts("readdir segfaulted, trying to recover");
+void addContext(denseRIV* lexRIV, sparseRIV context){ //accumulate one context vector into one lexicon entry
-	longjmp(readdirRecov, 1);
+		//add context to the lexRIV (sparse-to-dense vector addition via addS2D)
+		addS2D(lexRIV->values, context);
+		//log the "size" of the vector which was added
+		//this is not directly necessary, but is useful metadata for some analyses
+		lexRIV->contextSize += context.contextSize;
 }
--- a/RIVtools.h
+++ b/RIVtools.h
 #ifndef RIVTOOLS_H_
 #define RIVTOOLS_H_
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include "RIVLower.h"
 #include "RIVaccessories.h"
+#include "RIVlexicon.h"
-/* lexPush writes a denseRIV to a file for permanent storage */
-int lexPush(denseRIV RIVout);
-/* lexPull reads an existing lexicon entry (under directory "lexicon")
- * and creates a denseRIV with those attributes.
- * if the file does not exist, it creates a 0 vector with the name of word
- */
-denseRIV lexPull(char* word);
 /* fileToL2 takes an input file, reads words (delimiting on " " and "\n") 
 * and returns a sparse RIV which is the vector sum of the base RIVs of each 
 * word contained
@@ -29,35 +22,29 @@ sparseRIV fileToL2(FILE *input);
 */
 sparseRIV fileToL2Clean(FILE *data);
-/*filetoL2direct is an experiment in simplifying the process.  it's slow */
+/* like fileToL2 but takes a block of text */
-sparseRIV fileToL2direct(FILE *data);
+sparseRIV textToL2(char *text);
 /*cosine determines the "similarity" between two RIVs. */
 double cosCompare(denseRIV baseRIV, sparseRIV comparator);
-/*currently unused */
+/*used for analysis of lexicon vectors (not simply accumulation)
-sparseRIV wordtoL2(char* word);
+ * to avoid overflow of even a 64 bit integer, vectors must be normalized
+ * this is an experimental approximation of true normal, which should yield 
-/* converts an implicit RIV (a set of unvalued locations) into a formal 
+ * some extra data about the nature of this word's context
- * sparse RIV.  this chooses the best method to perform the consolidation
+ */
- * and launches that function   defunct right now for memory usage reasons*/
-sparseRIV consolidateI2S(int *implicit, size_t valueCount);
-sparseRIV normalizeFloored(denseRIV input, int factor);
 sparseRIV normalize(denseRIV input, int factor);
-int roundMultiply(int base, float divisor);
-/* like fileToL2 but takes a block of text */
-sparseRIV text2L2(char *text);
 /* calculates the magnitude of a sparseVector */ //TODO contain integer overflow in square process
 double getMagnitudeSparse(sparseRIV input);
+/* same for denseVector */
+double getMagnitudeDense(denseRIV *input); //TODO consolidate these into one function
-sparseRIV text2L2(char *text){
+sparseRIV textToL2(char *text){
 	int wordCount = 0;
-	unsigned char word[100] = {0};
+	char word[100] = {0};
 	int denseTemp[RIVSIZE] = {0};
 	/* locations (implicit RIV) are temp stored in temp block, and moved 
@@ -71,7 +58,6 @@ sparseRIV text2L2(char *text){
 		if(!displacement){
 			break;
 		}
 		if(!(*word)){
 			break;
 		}
@@ -90,18 +76,16 @@ sparseRIV text2L2(char *text){
 	addI2D(denseTemp, locations, locationCount);
 	sparseRIV output = consolidateD2S(denseTemp);
-	/* frequency records the number of words in this file, untill frequency
+	/* contextSize stores the number of words read */
-	 * is needed to hold some more useful data point */
+	output.contextSize = wordCount;
-	output.frequency = wordCount;
-	output.boolean = 1;
 	return output;
 }
 sparseRIV fileToL2(FILE *data){
-	unsigned char word[100] = {0};
+	char word[100] = {0};
-	/* locations (implicit RIV) are temp stored in temp block, and moved
+	/* locations (implicit RIV) are temporarily stored in temp block, 
-	 * to permanent home in consolidation */
+	 * and moved to permanent home in consolidation */
 	int *locations = RIVKey.h_tempBlock;
 	int locationCount = 0;
 	int denseTemp[RIVSIZE] = {0};
@@ -129,17 +113,16 @@ sparseRIV fileToL2(FILE *data){
 	addI2D(denseTemp, locations, locationCount);
 	sparseRIV output = consolidateD2S(denseTemp);
-	/* frequency records the number of words in this file */
+	/* contextSize records the number of words in this file */
-	output.frequency = wordCount;
+	output.contextSize = wordCount;
-	output.boolean = 1;
+	fseek(data, 0, SEEK_SET);
 	return output;
 }
 sparseRIV fileToL2Clean(FILE *data){
 	int denseTemp[RIVSIZE] = {0};
-	unsigned char word[100] = {0};
+	char word[100] = {0};
 	int *locations = RIVKey.h_tempBlock;
 	unsigned int wordCount = 0;
@@ -172,44 +155,24 @@ sparseRIV fileToL2Clean(FILE *data){
 	sparseRIV output = consolidateD2S(denseTemp);
 	/* frequency records the number of words in this file */
-	output.frequency = locationCount/NONZEROS;
+	output.contextSize = locationCount/NONZEROS;
-	output.boolean = 1;
+	fseek(data, 0, SEEK_SET);
 	return output;
 }
-//defunct temporarily, might make a return
-/*sparseRIV consolidateI2S(int *implicit, size_t valueCount){
-	if(valueCount<RIVKey.I2SThreshold){
-		 //direct method is faster on small datasets, but has geometric scaling on large datasets 
-		return consolidateI2SDirect(implicit, valueCount);
-	}else{
-		// optimized for large datasets 
-		return consolidateI2SIndirect(implicit, valueCount);
-	}
-}*/
-void aggregateWord2D(denseRIV destination, char* word){
-	srand(wordtoSeed((unsigned char*)word));
-	for(int i=0; i<NONZEROS; i++){
-		destination.values[(rand()%RIVSIZE)] +=1;
-		destination.values[(rand()%RIVSIZE)] -= 1;
-	}
-}
 double cosCompare(denseRIV baseRIV, sparseRIV comparator){
-	int dot = 0;
+	long long int dot = 0;
-	int n = comparator.count;
+	int* locations_stop = comparator.locations+comparator.count;
-	while(n){
+	int* locations_slider = comparator.locations;
-		n--;
+	int* values_slider = comparator.values;
+	while(locations_slider<locations_stop){
 		/* we calculate the dot-product to derive the cosine 
 		 * comparing sparse to dense by index*/
-		//dot += values[i]*baseRIV.values[locations[i]];
+		dot += *values_slider * baseRIV.values[*locations_slider];
-		dot += comparator.values[n] * baseRIV.values[comparator.locations[n]];
+		locations_slider++;
+		values_slider++;
-		//printf("%d, %d, %d\n",baseRIV.values[comparator.locations[n]],comparator.values[n] , n);
 	}
 	/*dot divided by product of magnitudes */
@@ -222,181 +185,65 @@ double getMagnitudeSparse(sparseRIV input){
 	int *values = input.values;
 	int *values_stop = values+input.count;
 	while(values<values_stop){
+		/* we sum the squares of all elements */
 		temp += (*values)*(*values);
-		//if(temp> 0x0AFFFFFFFFFFFFFF) printf("%s, fuuuuuuuuuuuuck*****************************************",input.name );
 		values++;
 	}
+	/* we take the root of that sum */
 	return sqrt(temp);
 }
-denseRIV lexPull(char* word){
+double getMagnitudeDense(denseRIV *input){ /* returns the square root of the sum of squares of all RIVSIZE values of *input */
-	#if CACHESIZE > 0
+	size_t temp = 0; /* running sum of squares */
+	int *values = input->values;
-	/* if there is a cache, first check if the word is cached */
+	int *values_stop = values+RIVSIZE;
-	srand(wordtoSeed((unsigned char*)word));
+	while(values<values_stop){
-	int hash = rand()%CACHESIZE;
+		if(*values){ /* zero entries add nothing to the sum, skip them */
-	if(!strcmp(word, RIVKey.RIVCache[hash].name)){
+			temp += (*values)*(*values); /* NOTE(review): the product is computed in int and can overflow before it is widened into temp */
-		/* if word is cached, pull from cache and exit */
-		return RIVKey.RIVCache[hash];
-	}
-	#endif /* CACHESIZE > 0 */
-	/* if not, attempt to pull the word data from lexicon file */
-	denseRIV output;
-	char pathString[200];
-	sprintf(pathString, "%s/%s", RIVKey.lexName, word);
-	FILE *lexWord = fopen(pathString, "rb");
-	/* if this lexicon file already exists */
-	if(lexWord){
-		/* pull data from file */
-		output = fLexPull(lexWord);
-		fclose(lexWord);
-	}else{
-		/*if file does not exist, return a 0 vector (word is new to the lexicon */ //#TODO enable NO-NEW features to protect mature lexicons? 
-		output = denseAllocate();
-	}
-	strcpy(output.name, word);
-	return output;
-}
-int lexPush(denseRIV RIVout){
-	#if CACHESIZE == 0
-	/* if there is no cache, simply push to file */
-	fLexPush(RIVout);
-	return 0;
-	#else /* CACHESIZE != 0 */
-	/* if our RIV was cached, there are two options (hopefully)
-	 * either the RIV is still cached, and the data has been updated 
-	 * to the cache or the RIV was pushed out from under it, 
-	 * in which case it has already been pushed! move on*/
-	if(RIVout.cached){
-		return 0;
 		}
+		values++;
-	srand(wordtoSeed((unsigned char*)RIVout.name));
-	int hash = rand()%CACHESIZE;
-	if(!RIVKey.RIVCache[hash].cached){
-		/* if there is no word in this cache slot, push to cache instead of file */
-		RIVKey.RIVCache[hash] = RIVout;
-		RIVKey.RIVCache[hash].cached = 1;
-		return 0;
-	/*if the current RIV is more frequent than the RIV holding its slot */
-	}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
-		/* push the current cache entry to a file */
-		int diag = fLexPush(RIVKey.RIVCache[hash]);
-		/* push the current RIV to cache */
-		RIVKey.RIVCache[hash] = RIVout;
-		RIVKey.RIVCache[hash].cached = 1;
-		return diag;
-	}else{
-		/* push current RIV to file */
-		fLexPush(RIVout);
 	}
-	return 0;
+	return sqrt(temp);
-	#endif /* CACHESIZE == 0 */
 }
-sparseRIV fileToL2direct(FILE *data){;
-	unsigned char word[100] = {0};
-	denseRIV denseTemp;
-	// a temporary dense RIV is stored in the tempBlock 
-	denseTemp.values = RIVKey.h_tempBlock;
-	memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
-	int count = 0;
-	while(fscanf(data, "%99s", word)){
-		count++;
-		if(feof(data)){
-			break;
-		}
-		if(!(*word)){
-			break;
-		}
-		// add word's L1 RIV to the accumulating implicit RIV 
-		aggregateWord2D(denseTemp, (char*)word);
-	}
-	sparseRIV output = consolidateD2S(denseTemp.values);
-	// frequency records the number of words in this file 
+sparseRIV normalize(denseRIV input, int factor){ /* scales input.values by factor/input.contextSize and returns the surviving entries as a newly allocated sparseRIV */
-	output.frequency = count;
+	/* multiplier is the scaling factor we need to bring our vector to the right size; NOTE(review): input.contextSize == 0 is not guarded against */
-	output.boolean = 1;
+	float multiplier = (float)factor/(input.contextSize);
-	return output;
-}
-sparseRIV normalizeFloored(denseRIV input, int factor){
+	/* write to temp slot (h_tempBlock from offset RIVSIZE onward), data will go to a permanent home lower in function; NOTE(review): locations plus values need TEMPSIZE >= 3*RIVSIZE -- confirm */
-	float divisor = (float)factor/(*input.contextSize);
+	int* locations = RIVKey.h_tempBlock+RIVSIZE;
-//	printf("in norm: %d, %d, %f\n", *input.contextSize, factor, divisor);
-	int* locations = RIVKey.h_tempBlock;
 	int* values = locations+RIVSIZE;
-	int count = 0;
-	for(int i=0; i<RIVSIZE; i++){
-		if(!input.values[i]) continue;
-		locations[count] = i;
-		values[count]= input.values[i]*divisor;
-		if(values[count])count++;
-	}
-	sparseRIV output;
-	output.locations = (int*) malloc(count*2*sizeof(int));
-	output.values = output.locations+count;
-	memcpy(output.locations, locations, count*sizeof(int));
-	memcpy(output.values, values, count*sizeof(int));
-	strcpy(output.name, input.name);
-	output.count = count;
-	output.magnitude = getMagnitudeSparse(output);
-	output.contextSize = *input.contextSize;
-	output.frequency = *input.frequency;
-	return output;
-}
-sparseRIV normalize(denseRIV input, int factor){
-	float divisor = (float)factor/(*input.contextSize);
-//	printf("in norm: %d, %d, %f\n", *input.contextSize, factor, divisor);
-	int* locations = RIVKey.h_tempBlock;
-	int* values = locations+RIVSIZE;
 	int count = 0;
 	for(int i=0; i<RIVSIZE; i++){
+		/* if this point is 0, skip it */
 		if(!input.values[i]) continue;
+		/* record position and scaled, rounded value in the forming sparse vector; the slot is reused unless count advances below */
 		locations[count] = i;
-		values[count]= roundMultiply(input.values[i], divisor);
+		values[count]= round(input.values[i]*multiplier);
-		if(values[count])count++;
+		/* drop any 0 values -- NOTE(review): the test below keeps only values greater than 1, so 1 and every negative value are dropped as well; confirm this is intended */
+		if(values[count] > 1)count++; 
 	}
 	sparseRIV output;
+	output.count = count;
+	/* for memory conservation, both datasets are put inline with each other: one allocation, values stored directly after locations; NOTE(review): the malloc result is not checked */
 	output.locations = (int*) malloc(count*2*sizeof(int));
 	output.values = output.locations+count;
+	/* copy the data from tempBlock into permanent home */
 	memcpy(output.locations, locations, count*sizeof(int));
 	memcpy(output.values, values, count*sizeof(int));
+	/* carry metadata */
 	strcpy(output.name, input.name);
-	output.count = count;
 	output.magnitude = getMagnitudeSparse(output);
-	output.contextSize = *input.contextSize;
+	output.contextSize = input.contextSize;
-	output.frequency = *input.frequency;
+	output.frequency = input.frequency;
-	return output;
-}
-int roundMultiply(int base, float divisor){
-	float temp = base*divisor;
-	int output = temp*2;
-	if (output%2){
-		output/=2;
-		output+=1;
-	}else{
-		output/=2;
-	}
 	return output;
 }
 #endif