updated RIVreads

34c65893 · amberhosen · 9d2c0fed · 34c65893 · 9d2c0fed · 34c65893
Commit 34c65893 authored Apr 06, 2018 by amberhosen
Showing with 930 additions and 991 deletions
RIVLower.h
RIVPACK1/RIVread
RIVPACK1/RIVread.c
RIVPACK1/RIVread1
RIVPACK1/RIVread2
RIVPACK1/RIVread3
RIVPACK1/RIVread4
RIVPACK1/RIVread5
RIVPACK1/RIVread6
RIVPACK1/runscriptUb.sh
RIVPACK1/shittyballs.py
RIVaccessories.h
RIVlexicon.h
RIVread.c
RIVtools.h
--- a/RIVLower.h
+++ b/RIVLower.h
+#ifndef RIVLOWER_H_
-#ifndef RIVLOWER_H_
+#define RIVLOWER_H_
-#define RIVLOWER_H_
+#include <stdio.h>
-#include <stdio.h>
+#include <stdlib.h>
-#include <stdlib.h>
+#include <string.h>
-#include <string.h>
+#include <signal.h>
-#include <signal.h>
+#include <unistd.h>
-#include <unistd.h>
+#include <sys/stat.h>
-#include <math.h>
+#include "RIVaccessories.h"
-#include <sys/stat.h>
+/* RIVSIZE macro defines the dimensionality of the RIVs we will use
-#include <sys/types.h>
+ * 25000 is the standard, but can be redefined specifically
-/* RIVSIZE macro defines the dimensionality off the RIVs we will use
+ */
- * 25000 is the standard, but can be redefined specifically
+#ifndef RIVSIZE
- */
+#define RIVSIZE 25000
-#ifndef RIVSIZE
+#endif
-#define RIVSIZE 25000
-#endif
+#if RIVSIZE<0
+#error "RIVSIZE must be a positive number (preferably a large positive)"
-#if RIVSIZE<0
+#endif
-#error "RIVSIZE must be a positive number (preferably a large positive)"
-#endif
+/* NONZeros macro defines the number of non-zero values that will be generated
+ * for any level one (barcode) RIV.  2 is simple and lightweight to begin
-/* NONZeros macro defines the number of non-zero values that will be generated
+ */
- * for any level one (barcode) RIV.  2 is simple and lightweight to begin
+#ifndef NONZEROS
- */
+#define NONZEROS 2
-#ifndef NONZEROS
+#endif
-#define NONZEROS 2
-#endif
+#if NONZEROS%2 || NONZEROS<1
+#error "NONZEROS must be an even, greater than 0 number"
-#if NONZEROS%2 || NONZEROS<1
+#endif
-#error "NONZEROS must be an even, greater than 0 number"
-#endif
+/* CACHESIZE macro defines the number of RIVs the system will cache.
+ * a larger cache means more memory consumption, but will also be significantly
-/* CACHESIZE macro defines the number of RIVs the system will cache.
+ * faster in aggregation and reading applications. doesn't affect systems
- * a larger cache means more memory consumption, but will also be significantly
+ * that do not use lexpull/push
- * faster in aggregation and reading applications. doesn't affect systems
+ */
- * that do not use lexpull/push
+#ifndef CACHESIZE
- */
+#define CACHESIZE 5000
-#ifndef CACHESIZE
+#endif
-#define CACHESIZE 20
-#endif
+#if CACHESIZE<0
+#error "CACHESIZE cannot be a negative number"
-#if CACHESIZE<0
+#endif
-#error "CACHESIZE cannot be a negative number"
-#endif
+/* the size of the tempBlock used in consolidation and implicit RIVs */
+#define TEMPSIZE 3*RIVSIZE
-/* the size of the tempBlock used in consolidation and implicit RIVs */
-#define TEMPSIZE 3*RIVSIZE
+/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
+ * as this is often an ideal case, it is advisable as the default
-/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
+ * unless we are doing long term RIV aggregation.
- * as this is often an ideal case, it is adviseable as the default 
+ * specifically, a sparseRIV contains a pair of arrays, 
- * unless we are doing long term RIV aggregation.
+ * containing locations and values, where pairs are found in like array 
- * specifically, a sparseRIV contains a pair of arrays, 
+ * indices.
- * containing locations and values, where pairs are found in like array 
+ */
- * indices.
+typedef struct{
- */
+	char name[100];
-typedef struct{
+	int *values;
-	char name[100];
+	int *locations;
-	int *values;
+	size_t count;
-	int *locations;
+	double magnitude;
-	size_t count;
+	int contextSize;
 	int frequency;
-	double magnitude;
+}sparseRIV;
-	int boolean;
+/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
-	int contextSize;
+ * this is rarely the case, but its primary use is for performing vector
-}sparseRIV;
+ * math, as comparisons and arithmetic between vectors are ideally 
-/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
+ * performed between sparse and dense (hetero-arithmetic)
- * this is rarely the case, but its primary use is for performing vector
+ */
- * math, as comparisons and arithmetic between vectors are ideally 
+typedef struct{
- * performed between sparse and dense (hetero-arithmetic)
+	int cached;
- */
+	char name[100];
-typedef struct{
+	int frequency;
-	char name[100];
+	double magnitude;
-	int* values;
+	int contextSize;
-	int* frequency;
+	int values[RIVSIZE];
-	double magnitude;
+}denseRIV;
-	int cached;
-	int *contextSize;
+/*RIVKey, holds global variables used under the hood, primarily for the lexicon
-}denseRIV;
+ * it also holds a "temp block" that will be used by the dense to sparse 
+ * conversion and implicit RIV aggregation 
-/*RIVKey, holds global variables used under the hood, primarily for the lexicon
+*/
- * it also holds a "temp block" that will be used by the dense to sparse 
+struct RIVData{
- * conversion and implicit RIV aggregation 
+	int h_tempBlock[TEMPSIZE];
-*/
+	int tempSize;
-struct RIVData{
+	char lexName[255];
-	int h_tempBlock[TEMPSIZE];
+	denseRIV* RIVCache[CACHESIZE];
-	int tempSize;
+}static RIVKey;
-	char lexName[255];
-	denseRIV RIVCache[CACHESIZE];
+/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
-}static RIVKey;
+ * all 0s removed. it does not automatically carry metadata, which must be assigned
+ * to a denseRIV after the fact.  often denseRIVs are only temporary, and don't
-/* lexOpen is called to "open the lexicon", setting up for later calls to
+ * contain any metadata
- * lexPush and lexPull. if the lexicon has not been opened before calls
+ */
- * to these functions, their behavior can be unpredictable, most likely crashing
+sparseRIV consolidateD2S(int *denseInput);  //#TODO fix int*/denseRIV confusion
- */
-void lexOpen();
+/* makeSparseLocations must be called repeatedly in the processing of a 
+ * file to produce a series of locations from the words of the file
-/* lexClose should always be called after the last lex push or lex pull call
+ * this produces an "implicit" RIV which can be used with the mapI2D function
- * if the lexicon is left open, some vector data may be lost due to 
+ * to create a denseRIV.
- * un-flushed RIV cache
+ */
- */
+void makeSparseLocations(char* word,  int *seeds, size_t seedCount);
-void lexClose();
+/* mapI2D maps an "implicit RIV" that is, an array of index values, 
-/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
+ * arranged by chronological order of generation (as per makeSparseLocations)
- * all 0s removed. it does not automatically carry metadata, which must be assigned
+ * it assigns, in the process of mapping, values according to ordering
- * to a denseRIV after the fact.  often denseRIVs are only temporary, and don't
+ */
- * contain any metadata
+int* mapI2D(int *locations, size_t seedCount);
- */
-sparseRIV consolidateD2S(int *denseInput);  //#TODO fix int*/denseRIV confusion
+/* highly optimized method for adding vectors.  there is no method 
+ * included for adding D2D or S2S, as this system is faster-enough
+ * to be more than worth using
+ */
+int* addS2D(int* destination, sparseRIV input);
-/* makeSparseLocations must be called repeatedly in the processing of a 
- * file to produce a series of locations from the words of the file
+/* cacheDump flushes the RIV cache out to relevant files, backing up all
- * this produces an "implicit" RIV which can be used with the mapI2D function
+ * data.  this is called by the lexClose and signalSecure functions
- * to create a denseRIV.
+ */
- */
+int cacheDump();
-void makeSparseLocations(unsigned char* word,  int *seeds, size_t seedCount);
+/* adds all elements of an implicit RIV (a sparseRIV represented without values)
-/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
+ * to a denseRIV.  used by the file2L2 functions in aggregating a document vector
- * saving it for long-term aggregation.  function is called by "lexPush",
+ */
- * which is what users should actually use.  lexPush, unlike fLexPush,
+int* addI2D(int* destination, int* locations, size_t seedCount);
- * has cache logic under the hood for speed and harddrive optimization
- */
+/*subtracts a word's vector from its own context.  regularly used in lex building
-int fLexPush(denseRIV RIVout);
+ */
+void subtractThisWord(denseRIV* vector);
-/* flexPull pulls data directly from a file and converts it (if necessary)
+/* begin definitions */
- * to a denseRIV.  function is called by "lexPull" which is what users 
- * should actually use.  lexPull, unlike FlexPull, has cache logic under
+int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
- * the hood for speed and harddrive optimization 
- */
+	int *locations_slider = input.locations;
-denseRIV fLexPull(FILE* lexWord);
+	int *values_slider = input.values;
+	int *locations_stop = locations_slider+input.count;
-/* creates a standard seed from the characters in a word, hopefully unique */
-int wordtoSeed(unsigned char* word);
+	/* apply values at an index based on locations */
+	while(locations_slider<locations_stop){
-/* mapI2D maps an "implicit RIV" that is, an array of index values, 
+		destination[*locations_slider] += *values_slider;
- * arranged by chronological order of generation (as per makesparseLocations)
+		locations_slider++;
- * it assigns, in the process of mapping, values according to ordering
+		values_slider++;
- */
+	}
-int* mapI2D(int *locations, size_t seedCount);
+	return destination;
-/* highly optimized method for adding vectors.  there is no method 
+}
- * included for adding D2D or S2S, as this system is faster-enough
- * to be more than worth using
+int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
- */
+	int *destination = (int*)calloc(RIVSIZE,sizeof(int));
-int* addS2D(int* destination, sparseRIV input);
+	int *locations_slider = locations;
-/*
+	int *locations_stop = locations_slider+valueCount;
-sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
-sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
+	/*apply values +1 or -1 at an index based on locations */
-* consolidate I2S is temporarily deprecated.  may be brought back.
+	while(locations_slider<locations_stop){
-* in tandem they are much faster, but less careful with RAM */
+		destination[*locations_slider] +=1;
-/* caheDump flushes the RIV cache out to relevant files, backing up all 
+		locations_slider++;
- * data.  this is called by the lexClose and signalSecure functions
+		destination[*locations_slider] -= 1;
- */
+		locations_slider++;
-int cacheDump();
+	}
-/* adds all elements of an implicit RIV (a sparseRIV represented without values)
+	return destination;
- * to a denseRIV.  used by the file2L2 functions in aggregating a document vector
+}
- */
+int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
-int* addI2D(int* destination, int* locations, size_t seedCount);
+	int *locations_slider = locations;
+	int *locations_stop = locations_slider+valueCount;
-/* allocates a denseRIV filled with 0s
- */
+	/*apply values +1 or -1 at an index based on locations */
-denseRIV denseAllocate();
+	while(locations_slider<locations_stop){
-/* redefines signal behavior to protect cached data against seg-faults etc*/
-void signalSecure(int signum);
+		destination[*locations_slider] +=1;
-/* begin definitions */
+		locations_slider++;
+		destination[*locations_slider] -= 1;
-int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
+		locations_slider++;
+	}
-	int *locations_slider = input.locations;
-	int *values_slider = input.values;
-	int *locations_stop = locations_slider+input.count;
+	return destination;
+}
-	/* apply values at an index based on locations */
-	while(locations_slider<locations_stop){
-		destination[*locations_slider] += *values_slider;
-		locations_slider++;
+sparseRIV consolidateD2S(int *denseInput){
-		values_slider++;
+	sparseRIV output;
-	}
+	output.count = 0;
+	/* key/value pairs will be loaded to a worst-case sized temporary slot */
-	return destination;
+	int* locations = RIVKey.h_tempBlock+RIVSIZE;
-}
+	int* values = locations+RIVSIZE;
+	int* locations_slider = locations;
+	int* values_slider = values;
-int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
+	for(int i=0; i<RIVSIZE; i++){
-	int *destination = (int*)calloc(RIVSIZE,sizeof(int));
-	int *locations_slider = locations;
+		/* act only on non-zeros */
-	int *locations_stop = locations_slider+valueCount;
+		if(denseInput[i]){
-	/*apply values +1 or -1 at an index based on locations */
+			/* assign index to locations */
-	while(locations_slider<locations_stop){
+			*(locations_slider++) = i;
-		destination[*locations_slider] +=1;
+			/* assign value to values */
-		locations_slider++;
+			*(values_slider++) = denseInput[i];
-		destination[*locations_slider] -= 1;
-		locations_slider++;
+			/* track size of forming sparseRIV */
-	}
+			output.count++;
+		}
+	}
-	return destination;
+	/* a slot is opened for the locations/values pair */
-}
+	output.locations = (int*) malloc(output.count*2*sizeof(int));
-int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
+	if(!output.locations){
-	int *locations_slider = locations;
+		printf("memory allocation failed"); //*TODO enable fail point knowledge and security
-	int *locations_stop = locations_slider+valueCount;
+	}
+	/* copy locations values into opened slot */
-	/*apply values +1 or -1 at an index based on locations */
+	memcpy(output.locations, locations, output.count*sizeof(int));
-	while(locations_slider<locations_stop){
+	output.values = output.locations + output.count;
-		destination[*locations_slider] +=1;
-		locations_slider++;
+	/* copy values into opened slot */
-		destination[*locations_slider] -= 1;
+	memcpy(output.values, values, output.count*sizeof(int));
-		locations_slider++;
-	}
+	return output;
+}
-	return destination;
-}
+void makeSparseLocations(char* word,  int *locations, size_t count){
-/*
+	locations+=count;
-sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
+	srand(wordtoSeed(word));
-	int *denseTemp = mapI2D(implicit, valueCount);
+	int *locations_stop = locations+NONZEROS;
+	while(locations<locations_stop){
-	sparseRIV sparseOut = consolidateD2S(denseTemp);
+		/* unrolled for speed, guaranteed to be an even number of steps */
+		*locations = rand()%RIVSIZE;
-	free(denseTemp);
+		locations++;
+		*locations = rand()%RIVSIZE;
+		locations++;
-	return sparseOut;
+	}
+	return;
+}
-}
-sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
+sparseRIV* sparseAllocateFormatted(){
-	sparseRIV sparseOut;
+	sparseRIV* output = (sparseRIV*)calloc(1, sizeof(sparseRIV));
-	int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
-	int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
-	sparseOut.count = 0;
-	int add = 1;
-	int found;
+	return output;
-	for(int i=0; i<valueCount; i++){
+}
-		found = 0;
+void subtractThisWord(denseRIV* vector){
-		for(int j=0; j<sparseOut.count; j++){
+	//set the rand() seed to the word
-			if(implicit[i] == locationsTemp[j]){
+	srand(wordtoSeed(vector->name));
-				valuesTemp[i] += add;
+	/* the base word vector is composed of NONZEROS (always an even number)
-				add *= -1;
+	 * +1s and -1s at "random" points (defined by the above seed).
-				found = 1;
+	 * if we invert it to -1s and +1s, we have subtraction */
-			}
-		}
+	for(int i = 0; i < NONZEROS; i+= 2){
-		if(!found){
+		vector->values[rand()%RIVSIZE] -= 1;
-			locationsTemp[sparseOut.count] = implicit[i];
+		vector->values[rand()%RIVSIZE] += 1;	
+	}
-			valuesTemp[sparseOut.count] = add;
+	/* record a context size 1 smaller */
-			sparseOut.count++;
+	vector->contextSize-= 1;
-			add*= -1;
-		}
+}
-	}
-	sparseOut.locations = (int*)malloc(2*sparseOut.count*sizeof(int));
+#endif
-	sparseOut.values = sparseOut.locations+sparseOut.count;
-	memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
-	memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
-	return sparseOut;
-}*/
-sparseRIV consolidateD2S(int *denseInput){
-	sparseRIV output;
-	output.count = 0;
-	/* key/value pairs will be loaded to a worst-case sized temporary slot */
-	int* locations = RIVKey.h_tempBlock+RIVSIZE;
-	int* values = locations+RIVSIZE;
-	int* locations_slider = locations;
-	int* values_slider = values;
-	for(int i=0; i<RIVSIZE; i++){
-		/* act only on non-zeros */
-		if(denseInput[i]){
-			/* assign index to locations */
-			*(locations_slider++) = i;
-			/* assign value to values */
-			*(values_slider++) = denseInput[i];
-			/* track size of forming sparseRIV */
-			output.count++;
-		}
-	}
-	/* a slot is opened for the locations/values pair */
-	output.locations = (int*) malloc(output.count*2*sizeof(int));
-	if(!output.locations){
-		printf("memory allocation failed"); //*TODO enable fail point knowledge and security
-	}
-	/* copy locations values into opened slot */
-	memcpy(output.locations, locations, output.count*sizeof(int));
-	output.values = output.locations + output.count;
-	/* copy values into opened slot */
-	memcpy(output.values, values, output.count*sizeof(int));
-	return output;
-}
-void lexOpen(char* lexName){
-	/* RIVKey.I2SThreshold = sqrt(RIVSIZE);*/ //deprecate?
-	struct stat st;
-	if (stat(lexName, &st) == -1) {
-		mkdir(lexName, 0777);
-	}	
-	strcpy(RIVKey.lexName, lexName);
-	/* open a slot at least large enough for worst case handling of
-	 * sparse to dense conversion.  may be enlarged by filetoL2 functions */
-	for(int i=1; i<20; i++){
-		signal(i, signalSecure);
-	}
-	/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
-	memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
-}
-void lexClose(){
-	if(cacheDump()){
-		puts("cache dump failed, some lexicon data was lost");
-	}
-}
-int wordtoSeed(unsigned char* word){
-	int i=0;
-	int seed = 0;
-	while(*word){
-		/* left-shift 5 each time *should* make seeds unique to words
-		 * this means letters are taken as characters counted in base 32, which
-		 * should be large enough to hold all english characters plus a few outliers
-		 * */
-		seed += (*(word))<<(i*5);
-		word++;
-		i++;
-	}
-	return seed;
-}
-void makeSparseLocations(unsigned char* word,  int *locations, size_t count){
-	locations+=count;
-	srand(wordtoSeed(word));
-	int *locations_stop = locations+NONZEROS;
-	while(locations<locations_stop){
-		/* unrolled for speed, guaranteed to be an even number of steps */
-		*locations = rand()%RIVSIZE;
-		locations++;
-		*locations = rand()%RIVSIZE;
-		locations++;
-	}
-	return;
-}
-int fLexPush(denseRIV RIVout){	
-	char pathString[200] = {0};
-	/* word data will be placed in a (new?) file under the lexicon directory
-	 * in a file named after the word itself */
-	sprintf(pathString, "%s/%s", RIVKey.lexName, RIVout.name);
-	FILE *lexWord = fopen(pathString, "wb");
-	if(!lexWord){
-		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
-		return 1;
-	}
-	sparseRIV temp = consolidateD2S(RIVout.values);
-	if(temp.count<(RIVSIZE/2)){
-		/* smaller stored as sparse vector */
-		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
-		fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
-		fwrite(RIVout.contextSize, 1, sizeof(int), lexWord);
-		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
-		fwrite(temp.locations, temp.count, sizeof(int), lexWord);
-		fwrite(temp.values, temp.count, sizeof(int), lexWord);
-	//	printf("%s, writing as sparse, frequency: %d", RIVout.name, *RIVout.frequency);
-	}else{
-		/* saturation is too high, better to store dense */
-		/* there's gotta be a better way to do this */
-		temp.count = 0;
-		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
-		fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
-		fwrite(RIVout.contextSize, 1, sizeof(int), lexWord);
-		fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
-		fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
-	//	printf("%s, writing as dense, frequency: %d", RIVout.name, *RIVout.frequency);
-	}
-	fclose(lexWord);
-	free(RIVout.values);
-	free(temp.locations);
-	return 0;
-}
-denseRIV fLexPull(FILE* lexWord){
-	denseRIV output = denseAllocate();
-	size_t typeCheck;
-	int flag = 0;
-	/* get metadata for vector */
-	flag+= fread(&typeCheck, 1, sizeof(size_t), lexWord);
-	flag+= fread(output.frequency, 1, sizeof(int), lexWord);
-	flag+= fread(output.contextSize, 1, sizeof(int), lexWord);
-	flag+= fread(&(output.magnitude), 1, sizeof(float), lexWord);
-	/* first value stored is the value count if sparse, and 0 if dense */
-	if (typeCheck){
-		/* pull as sparseVector */
-		sparseRIV temp;
-		/* value was not 0, so it's the value count */
-		temp.count = typeCheck;
-		temp.locations = (int*)malloc(temp.count*2*sizeof(int));
-		temp.values = temp.locations+temp.count;
-		flag+= fread(temp.locations, temp.count, sizeof(int), lexWord);
-		flag+=fread(temp.values, temp.count, sizeof(int), lexWord);
-		addS2D(output.values, temp);
-		free(temp.locations);
-	}else{
-		/* typecheck is thrown away, just a flag in this case */
-		flag += fread(output.values, RIVSIZE, sizeof(int), lexWord);
-	}
-	output.cached = 0;
-	return output;
-}
-void signalSecure(int signum){
-  if(cacheDump()){
-	  puts("cache dump failed, some lexicon data lost");
-  }else{
-	puts("cache dumped successfully");
-  }
-  signal(signum, SIG_DFL);
-  exit(1);
-}
-int cacheDump(){
-	int flag = 0;
-	denseRIV* cache_slider = RIVKey.RIVCache;
-	denseRIV* cache_stop = RIVKey.RIVCache+CACHESIZE;
-	while(cache_slider<cache_stop){
-		if((*cache_slider).cached){
-			flag += fLexPush(*cache_slider);
-		}
-		else{
-		}
-		cache_slider++;
-	}
-	return flag;
-}
-denseRIV denseAllocate(){
-	/* allocates a 0 vector */
-	denseRIV output;
-	output.values = (int*)calloc(RIVSIZE+2, sizeof(int));
-	/* for compact memory use, frequency is placed immediately after values */
-	output.frequency = output.values+RIVSIZE;
-	output.contextSize = output.frequency+1;
-	output.magnitude = 0;
-	output.cached = 0;
-	return output;
-}
-/*TODO add a simplified free function*/
-#endif
--- a/RIVPACK1/RIVread
+++ b/RIVPACK1/RIVread
--- a/RIVPACK1/RIVread.c
+++ b/RIVPACK1/RIVread.c
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#define CACHESIZE 15000
-#define RIVSIZE 50000
-#define NONZEROS 8
-#include <setjmp.h>
-#include <signal.h>
-#include "../RIVet/RIVtools.h"
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <dirent.h>
 #include <error.h>
+#define RIVSIZE 200000
+#define NONZEROS 2
+#define CACHESIZE 1000
+#include "../RIVtools.h"
+//this program reads a directory full of files, and adds all context vectors (considering file as context)
+//to all words found in these files. this is used to create a lexicon, or add to an existing one
 void fileGrind(FILE* textFile);
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
+void addContext(denseRIV* lexRIV, sparseRIV context);
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
 void directoryGrind(char *rootString);
-void readdirContingency(int sigNumber);
-jmp_buf readdirRecov;
 int main(int argc, char *argv[]){
-	clock_t begintotal = clock();
-	lexOpen("/home/drbob/Documents/lexicon8-50");
 	char pathString[1000];
+	//we open the lexicon, if it does not yet exist, it will be created
+	lexOpen("lexicon200-2");
+	//we format the root directory, preparing to scan its contents
 	strcpy(pathString, argv[1]);
 	strcat(pathString, "/");
-	struct stat st = {0};
+	//ensure that the targeted root directory exists
+	struct stat st;
 	if(stat(pathString, &st) == -1) {
+		printf("directory doesn't seem to exist");
 		return 1;
 	}
+	//we will scan the directory, adding all data to our lexicon, as seen inside
 	directoryGrind(pathString);
-	clock_t endtotal = clock();
+	//we close the lexicon again, ensuring all data is secured
-	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
-	printf("total time:%lf\n\n", time_spent);
 	lexClose();
 	return 0;
 }
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
+//mostly a standard recursive Dirent-walk
-	denseRIV *denseSet_slider = denseSet;
-	denseRIV *dense_stop = denseSet+RIVCount;
-	while(denseSet_slider<dense_stop){
-		addS2D((*denseSet_slider).values, additive);
-		*(denseSet_slider->contextSize) += additive.frequency;
-		denseSet_slider++;
-	}
-}
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
-	denseRIV* RIVStop = RIVSet+wordCount;
-	while(RIVSet<RIVStop){
-		if(!strcmp(word, RIVSet->name)){
-			return 1;
-		}
-		RIVSet++;
-	}
-	return 0;
-}
 void directoryGrind(char *rootString){
+	/* *** begin Dirent walk *** */
 	char pathString[2000];
 	DIR *directory;
 	struct dirent *files = 0;
@@ -76,79 +57,101 @@ void directoryGrind(char *rootString){
 	}
 	while((files=readdir(directory))){
-		if(setjmp(readdirRecov)){
-			continue;
+		if(!files->d_name[0]) break;
-		}
-		//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
 		while(*(files->d_name)=='.'){
 			files = readdir(directory);
 		}
-		//signal(SIGSEGV, signalSecure);
 		if(files->d_type == DT_DIR){
 			strcpy(pathString, rootString);
 			strcat(pathString, files->d_name);
 			strcat(pathString, "/");
 			directoryGrind(pathString);
+			continue;
 		}
 		strcpy(pathString, rootString);
 		strcat(pathString, files->d_name);
 		printf("%s\n", pathString);
-		FILE *input = fopen(pathString, "r+");
+/* *** end dirent walk, begin meat of function  *** */
+		//check for non-txt files
+		char *fileEnding = pathString+strlen(pathString)-4;
+		if(strcmp(fileEnding, ".txt")){
+			printf("skipped: %s\n", files->d_name); 
+			continue;
+		}
+		//open a file within root directory
+		FILE *input = fopen(pathString, "r");
 		if(input){
+			//process this file and add its data to lexicon
 			fileGrind(input);
 			fclose(input);
 		}
 	}
 }
+//form context vector from contents of file, then add that vector to
+//all lexicon entries of the words contained
 void fileGrind(FILE* textFile){
-	sparseRIV aggregateRIV = fileToL2Clean(textFile);
+	//form a context vector.  "clean" indicates that it will ignore any word which
-	fseek(textFile, 0, SEEK_SET);
+	//contains unwanted characters
+	sparseRIV contextVector = fileToL2Clean(textFile);
-	int wordCount = 0;
-	denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
-	char word[200];
+	//a pointer to the denseRIV of the word currently being processed
+	//(it is assigned from lexPull for each word read from the file)
+	denseRIV* lexiconRIV;
+	char word[100] = {0};
 	while(fscanf(textFile, "%99s", word)){
+		//we ensure that each word exists, and is free of unwanted characters
 		if(feof(textFile)) break;
 		if(!(*word))continue;
 		if(!isWordClean((char*)word)){
 			continue;
 		}
-		if(checkDupe(RIVArray, word, wordCount)){
-			continue;
-		}
+		//we pull the vector corresponding to each word from the lexicon
-		RIVArray[wordCount] = lexPull(word);
+		//if it's a new word, lexPull returns a 0 vector
+		lexiconRIV= lexPull(word);
-		if(!*((RIVArray[wordCount].name))) break;
+		//we add the context of this file to this wordVector
-		*(RIVArray[wordCount].frequency)+= 1;;
+		addContext(lexiconRIV, contextVector);
-		//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
+		//we remove the sub-vector corresponding to the word itself
-		wordCount++;
+		subtractThisWord(lexiconRIV);
-	}
+		//we log that this word has been encountered one more time
-	//printf("%d\n", wordCount);
+		lexiconRIV->frequency += 1;
-	addS2Ds(RIVArray, aggregateRIV, wordCount);
+		//and finally we push it back to the lexicon for permanent storage
-	denseRIV* RIVArray_slider = RIVArray;
+		lexPush(lexiconRIV);
-	denseRIV* RIVArray_stop = RIVArray+wordCount;
-	while(RIVArray_slider<RIVArray_stop){
-		lexPush(*RIVArray_slider);
-		RIVArray_slider++;
 	}
-	free(RIVArray);
+	free(contextVector.locations);
-	free(aggregateRIV.locations);
 }
-void readdirContingency(int sigNumber){
-	puts("readdir segfaulted, trying to recover");
-	longjmp(readdirRecov, 1);
+void addContext(denseRIV* lexRIV, sparseRIV context){
+		//add context to the lexRIV, (using sparse-dense vector addition)
+		addS2D(lexRIV->values, context);
+		//log the "size" of the vector which was added
+		//this is not directly necessary, but is useful metadata for some analyses
+		lexRIV->contextSize += context.contextSize;
 }
--- a/RIVPACK1/RIVread1
+++ b/RIVPACK1/RIVread1
--- a/RIVPACK1/RIVread2
+++ b/RIVPACK1/RIVread2
--- a/RIVPACK1/RIVread3
+++ b/RIVPACK1/RIVread3
--- a/RIVPACK1/RIVread4
+++ b/RIVPACK1/RIVread4
--- a/RIVPACK1/RIVread5
+++ b/RIVPACK1/RIVread5
--- a/RIVPACK1/RIVread6
+++ b/RIVPACK1/RIVread6
--- a/RIVPACK1/runscriptUb.sh
+++ b/RIVPACK1/runscriptUb.sh
@@ -5,15 +5,16 @@ clean(){
 		else
 			python shittyballs.py "$1"
-			./RIVread cleanbooks/
+			./RIVread1 cleanbooks/
-			# ./RIVread1 cleanbooks/
 			./RIVread2 cleanbooks/
-			#./RIVread3 cleanbooks/
+			./RIVread3 cleanbooks/
-			#./RIVread4 cleanbooks/
+			./RIVread4 cleanbooks/
 			./RIVread5 cleanbooks/
 			./RIVread6 cleanbooks/
+			./RIVread7 cleanbooks/
 			rm  -r cleanbooks/
+			#rm "$1"
 		fi
 		shift
 	done
@@ -21,4 +22,4 @@ clean(){
-clean ../bookCleaner/books/*
+clean ../../books/*
--- a/RIVPACK1/shittyballs.py
+++ b/RIVPACK1/shittyballs.py
-import requests
+#import requests
 import re
 import string
 import os
@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn
 import pdb
 from nltk.stem import PorterStemmer
-def adverbFix(word):
-    if not nltk.pos_tag(word)[0][1] == 'RB':
-        return word
-    adjective = word[:-2]
+def writeWord(cleanString, word, stem, blacklist):
-    if not nltk.pos_tag(word)[0][1] == 'JJ':
+    if word == stem:
-        return word;
+        FILE = open("lexicon/" + word, "w")
-    FILE = open("lexicon/" + word, "w")
+        FILE.write("1");
-    FILE.write("2" + temp)
+        FILE.close();
-    FILE.close()
+        return (cleanString + " " + word)
-    FILE = open("lexicon/" + adjective, "w")
-    FILE.write("1")
+    elif stem not in blacklist:
-    FILE.close()
+        if len(stem) > 2:
-    return adjective
+            FILE = open("lexicon/" + word, "w")
+            FILE.write("2"+stem);
-def strip(word):
+            FILE.close();
-    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
+            FILE = open("lexicon/" + stem, "w")
-        if word.endswith(suffix):
+            FILE.write("1")
-            return word[:-len(suffix)]
+            FILE.close();
+            return (cleanString + " " + stem)
+    return cleanString
+def liFix(word):
+    if not word[len(word)-2:] == "li":
        return word
+    temp = ps.stem(word[:-2])
+    if temp:
+        return temp
+    return word
 def cleanWord(word):
-    #if(len(word) == 0):
-        #print("\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************")
    word = word.lower();
    regex = re.compile('[^a-z]+')
    word = regex.sub('', word)
@@ -44,13 +50,11 @@ def cleanWord(word):
 def fileCheck(word):
    try:
-        #print("trying")
        wordFile = open("lexicon/{}".format(word), "r")
        code = int(wordFile.read(1))
    except:
-        #print("file does not exist")
        return 0
-    #print("fileCode{}".format(code))
    if code == 2:
        word = wordFile.read()
@@ -74,6 +78,8 @@ def morphyTest(word):
    return morphyTemp;
+#begin mainfunction
 blacklist = ["a", "an", "the", "so", "as", "how",
             "i", "me", "we", "they", "you", "it", "he", "she",
             "but", "have", "had",
@@ -90,13 +96,13 @@ print(sourceString + "\n")
 if not os.path.exists('cleanbooks'):
    os.makedirs('cleanbooks')
-# if not os.path.exists('lexicon'):
+if not os.path.exists('lexicon'):
-#     os.makedirs('lexicon')
+    os.makedirs('lexicon')
 if not os.path.exists(pathString):
    os.makedirs(pathString)
-#call(["python", "blacklist.py"])
+call(["python", "blacklist.py"])
 i=0
 skip = 1
 with open(sourceString, 'U') as fileIn:
@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn:
                for tempWord in line.split():
                    word=cleanWord(tempWord)
                    if not word:
                        continue
+                    if len(word) < 3:
-                    # temp = fileCheck(word)
+						continue;
-                    #
+                    if word in blacklist:
-                    # if temp == -1:
+						continue;
-                    #     continue
-                    # if temp == 0:
-                    temp = morphyTest(word)
-                    if temp:
-                        stem = ps.stem(temp)
-                        if stem and not stem in blacklist:
-                            cleanString = cleanString + ' ' + stem
+                    temp = fileCheck(word)
+                    if temp == -1:
+                        continue
+                    if temp:
+                        cleanString = (cleanString + " " + temp);
+                        continue
+                    else:
+                        morphy = morphyTest(word)
+                        if morphy:
+                            stem = ps.stem(morphy)
+                            if stem:
+				stem = liFix(stem)
+                                cleanString = writeWord(cleanString, word, stem, blacklist)
-                    #if temp == 0:
-                    #    catchAll(word)
                cleanString = cleanString + os.linesep
-            if len(cleanString.split(' ')) > 10:
+            if len(cleanString.split(' ')) > 2:
                fileOut.write(cleanString)
                fileOut.close()

--- a/RIVaccessories.h
+++ b/RIVaccessories.h
 #ifndef RIVACCESS_H_
 #define RIVACCESS_H_
 /*isWordClean filters words that contain non-letter characters, and 
 * upperCase letters, allowing only the '_' symbol through
 */
 int isWordClean(char* word);
 /* used by wordClean */
 int isLetter(char c);
+/* creates a standard seed from the characters in a word, hopefully unique */
+int wordtoSeed(char* word);
 int isLetter(char c){
 	if((c>96 && c<123)||(c == 32) || (c == '_')) return 1;
@@ -26,5 +33,19 @@ int isWordClean(char* word){
 	return 1;
 }
+/* wordtoSeed folds the characters of word into an integer seed: the
+ * character at position i is shifted left by 5*i bits and all shifted
+ * characters are summed, so short words get distinct seeds.
+ * the sum is kept in unsigned arithmetic and the shift count is reduced
+ * modulo 32 (assumes a 32 bit int), because shifting an int by 32 bits or
+ * more and overflowing a signed sum are both undefined behaviour; that was
+ * hit by every word longer than a handful of letters.  seeds of short words
+ * are unchanged, seeds of long words wrap around and may collide.
+ * word: NUL-terminated string
+ * returns the seed, 0 for an empty string
+ */
+int wordtoSeed(char* word){
+	unsigned int seed = 0;
+	unsigned int shift = 0;
+	while(*word){
+		/* converting to unsigned keeps the bit pattern of the promoted char */
+		seed += (unsigned int)(*word) << (shift & 31u);
+		shift += 5;
+		word++;
+	}
+	return (int)seed;
+}
 #endif
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
+#ifndef RIV_LEXICON_H
+#define RIV_LEXICON_H
+#include "RIVLower.h"
+#include "RIVaccessories.h"
+/* lexOpen is called to "open the lexicon", setting up for later calls to
+ * lexPush and lexPull. if the lexicon has not been opened before calls
+ * to these functions, their behavior can be unpredictable, most likely crashing
+ * lexName is the path of the lexicon directory, which is created if missing.
+ * the parameter is spelled out so the prototype matches the definition
+ * (an empty parameter list means "no parameters" from C23 on)
+ */
+void lexOpen(char* lexName);
+/* lexClose should always be called after the last lex push or lex pull call
+ * if the lexicon is left open, some vector data may be lost due to 
+ * un-flushed RIV cache
+ */
+void lexClose();
+/* both lexPush and lexPull must be called *after* the lexOpen() function
+ * and after using them the lexClose() function must be called to ensure
+ * data security */
+/* lexPush writes a denseRIV to the lexicon for permanent storage */
+int lexPush(denseRIV* RIVout);
+int cacheCheckOnPush(denseRIV* RIVout);
+/* lexPull reads a denseRIV from the lexicon, under "word"
+ * if the file does not exist, it creates a 0 vector with the name of word
+ * lexPull returns a denseRIV *pointer* because its data must be tracked 
+ * globally for key optimizations
+ */
+denseRIV* lexPull(char* word);
+denseRIV* cacheCheckOnPull(char* word);
+/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
+ * saving it for long-term aggregation.  function is called by "lexPush",
+ * which is what users should actually use.  lexPush, unlike fLexPush,
+ * has cache logic under the hood for speed and harddrive optimization
+ */
+int fLexPush(denseRIV* RIVout);
+/* fLexPull pulls data directly from a file and converts it (if necessary)
+ * to a denseRIV.  function is called by "lexPull" which is what users
+ * should actually use.  lexPull, unlike fLexPull, has cache logic under
+ * the hood for speed and harddrive optimization
+ */
+denseRIV* fLexPull(FILE* lexWord);
+/* redefines signal behavior to protect cached data against seg-faults etc*/
+void signalSecure(int signum, siginfo_t *si, void* arg);
+/* begin definitions */
+/* lexOpen prepares the lexicon stored in directory lexName:
+ *  - creates the directory (mode 0777) when stat() cannot find it
+ *  - records the directory name in RIVKey.lexName
+ *  - installs signalSecure as the handler for signals 1 through 26
+ *  - clears every slot of the RIV cache
+ */
+void lexOpen(char* lexName){
+	struct stat dirInfo = {0};
+	int missing = (stat(lexName, &dirInfo) == -1);
+	if(missing){
+		mkdir(lexName, 0777);
+	}
+	strcpy(RIVKey.lexName, lexName);
+
+	/* route signals to signalSecure so the cache is dumped before they act */
+	struct sigaction handler = {0};
+	handler.sa_flags = SA_SIGINFO;
+	handler.sa_sigaction = signalSecure;
+	int signalNumber = 1;
+	while(signalNumber < 27){
+		sigaction(signalNumber, &handler, NULL);
+		signalNumber++;
+	}
+
+	/* start with an empty cache: every slot holds a NULL denseRIV pointer */
+	memset(RIVKey.RIVCache, 0, CACHESIZE*sizeof(denseRIV*));
+}
+/* lexClose flushes the RIV cache through cacheDump and reports on stdout
+ * when the dump signals a failure (non-zero return)
+ */
+void lexClose(){
+	int failures = cacheDump();
+	if(failures != 0){
+		puts("cache dump failed, some lexicon data was lost");
+	}
+}
+#if CACHESIZE > 0
+/* cacheCheckOnPull looks word up in the RIV cache.
+ * the slot is chosen by seeding rand() with wordtoSeed(word) and taking the
+ * first rand() value modulo CACHESIZE (note: this re-seeds rand()).
+ * returns the cached denseRIV when the slot holds exactly this word,
+ * NULL when the slot is empty or holds another word
+ */
+denseRIV* cacheCheckOnPull(char* word){
+	srand(wordtoSeed(word));
+	int slot = rand()%CACHESIZE;
+	denseRIV* resident = RIVKey.RIVCache[slot];
+	if(!resident){
+		return NULL;
+	}
+	if(strcmp(word, resident->name) != 0){
+		return NULL;
+	}
+	/* word is cached, hand out the cached vector */
+	return resident;
+}
+#endif
+/* lexPull fetches the denseRIV stored under word.
+ * with a cache (CACHESIZE > 0) a cached vector is returned directly;
+ * otherwise the file "<RIVKey.lexName>/<word>" is read through fLexPull,
+ * and when that file cannot be opened a zeroed denseRIV is allocated
+ * (the word is new to the lexicon).  the name of the vector is set to word.
+ * returns NULL when the path does not fit in the path buffer or when no
+ * vector could be allocated; these cases used to overflow the stack buffer
+ * or dereference a NULL pointer
+ */
+denseRIV* lexPull(char* word){
+	denseRIV* output;
+	#if CACHESIZE > 0
+	/* if there is a cache, first check if the word is cached */
+	if((output = cacheCheckOnPull(word))){
+		return output;
+	}
+	#endif /* CACHESIZE > 0 */
+	/* if not, attempt to pull the word data from lexicon file */
+	char pathString[200];
+	int pathLength = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, word);
+	if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
+		printf("lexicon pull has failed, path too long for word: %s\n", word);
+		return NULL;
+	}
+	FILE *lexWord = fopen(pathString, "rb");
+	/* if this lexicon file already exists */
+	if(lexWord){
+		/* pull data from file */
+		output = fLexPull(lexWord);
+		fclose(lexWord);
+	}else{
+		/*if file does not exist, return a 0 vector (word is new to the lexicon */ //#TODO enable NO-NEW features to protect mature lexicons? 
+		output = calloc(1, sizeof(denseRIV));
+	}
+	/* no vector could be produced: report it rather than write through NULL */
+	if(!output){
+		return NULL;
+	}
+	strcpy(output->name, word);
+	return output;
+}
+#if CACHESIZE > 0
+/* cacheCheckOnPush tries to keep RIVout in the cache instead of writing it.
+ * returns 1 when RIVout is (now) held by the cache, 0 when the caller must
+ * write it to file.  a vector already flagged as cached is left alone; an
+ * empty slot takes the vector; an occupied slot is taken over only when
+ * RIVout has a strictly higher frequency, in which case the previous
+ * occupant is handed to fLexPush first
+ */
+int cacheCheckOnPush(denseRIV* RIVout){
+	/* if our RIV was cached already, no need to play with it */
+	if(RIVout->cached){
+		return 1;
+	}
+	srand(wordtoSeed(RIVout->name));
+	int slot = rand()%CACHESIZE;
+	denseRIV* occupant = RIVKey.RIVCache[slot];
+	if(occupant){
+		/* the slot is taken: the occupant stays unless it is less frequent */
+		if(!(RIVout->frequency > occupant->frequency)){
+			return 0;
+		}
+		/* push the lower frequency cache entry to a file */
+		fLexPush(occupant);
+	}
+	/* the slot is free (or has just been vacated): cache the current vector */
+	RIVKey.RIVCache[slot] = RIVout;
+	RIVout->cached = 1;
+	return 1;
+}
+#endif
+/* lexPush stores RIVout in the lexicon.
+ * with a cache (CACHESIZE > 0) the vector is first offered to
+ * cacheCheckOnPush, and 0 is returned when the cache keeps it;
+ * otherwise the result of fLexPush (direct write to file) is returned
+ */
+int lexPush(denseRIV* RIVout){
+	#if CACHESIZE > 0
+	int keptInCache = cacheCheckOnPush(RIVout);
+	if(keptInCache){
+		return 0;
+	}
+	#endif /* CACHESIZE > 0 */
+	/* not cached: write the vector straight to its lexicon file */
+	int fileStatus = fLexPush(RIVout);
+	return fileStatus;
+}
+/* fLexPush writes the vector *output to the file "<RIVKey.lexName>/<name>".
+ * file layout: a size_t count, then frequency, contextSize and magnitude,
+ * then the vector data.  when fewer than RIVSIZE/2 values are non-zero the
+ * vector is stored sparse (count, then locations, then values); otherwise
+ * count is written as 0 and all RIVSIZE dense values follow.
+ * returns 0 on success, in which case output has been freed and must not be
+ * used again.  returns 1 when the path does not fit, or the file cannot be
+ * opened or completely written; output is then left allocated and unchanged
+ */
+int fLexPush(denseRIV* output){
+	char pathString[200] = {0};
+	/* word data will be placed in a (new?) file under the lexicon directory
+	 * in a file named after the word itself.  snprintf keeps an over-long
+	 * name from overflowing pathString */
+	int pathLength = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, output->name);
+	if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
+		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
+		return 1;
+	}
+	FILE *lexWord = fopen(pathString, "wb");
+	if(!lexWord){
+		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
+		return 1;
+	}
+	/* the vector is read through the pointer: the former by-value copy of
+	 * *output duplicated the whole struct on the stack for every push */
+	sparseRIV temp = consolidateD2S(output->values);
+	if(temp.count<(RIVSIZE/2)){
+		/* smaller stored as sparse vector */
+		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
+		fwrite(&output->frequency, 1, sizeof(int), lexWord);
+		fwrite(&output->contextSize, 1, sizeof(int), lexWord);
+		fwrite(&output->magnitude, 1, sizeof(float), lexWord);
+		fwrite(temp.locations, temp.count, sizeof(int), lexWord);
+		fwrite(temp.values, temp.count, sizeof(int), lexWord);
+	}else{
+		/* saturation is too high, better to store dense;
+		 * a count of 0 marks the dense layout for fLexPull */
+		temp.count = 0;
+		fwrite(&temp.count, 1, sizeof(size_t), lexWord);
+		fwrite(&output->frequency, 1, sizeof(int), lexWord);
+		fwrite(&output->contextSize, 1, sizeof(int), lexWord);
+		fwrite(&output->magnitude, 1, sizeof(float), lexWord);
+		fwrite(output->values, RIVSIZE, sizeof(int), lexWord);
+	}
+	/* a failed write or a failed flush on close both leave the file incomplete */
+	int failed = ferror(lexWord);
+	if(fclose(lexWord) != 0){
+		failed = 1;
+	}
+	free(temp.locations);
+	if(failed){
+		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
+		return 1;
+	}
+	free(output);
+	return 0;
+}
+/* fLexPull reads one lexicon entry from the open file lexWord into a newly
+ * allocated denseRIV, which the caller owns.
+ * the entry starts with a size_t count followed by frequency, contextSize
+ * and magnitude.  a non-zero count means a sparse entry (count locations,
+ * then count values) which is added into the zeroed dense vector; a count
+ * of 0 means RIVSIZE dense values follow.
+ * returns NULL when memory cannot be allocated.  a truncated header yields
+ * a zero vector with zero metadata, and impossible sparse data (more values
+ * than RIVSIZE, a short read, a location outside 0..RIVSIZE-1) yields a zero
+ * vector; both print a warning instead of using uninitialised or
+ * out-of-range data
+ */
+denseRIV* fLexPull(FILE* lexWord){
+	denseRIV *output = calloc(1,sizeof(denseRIV));
+	if(!output){
+		return NULL;
+	}
+	size_t typeCheck = 0;
+	/* get metadata for vector, counting the bytes that really arrived */
+	size_t headerBytes = 0;
+	headerBytes += fread(&typeCheck, 1, sizeof(size_t), lexWord);
+	headerBytes += fread(&output->frequency, 1, sizeof(int), lexWord);
+	headerBytes += fread(&output->contextSize, 1, sizeof(int), lexWord);
+	headerBytes += fread(&output->magnitude, 1, sizeof(float), lexWord);
+	if(headerBytes != sizeof(size_t)+2*sizeof(int)+sizeof(float)){
+		puts("lexicon entry is truncated, treating it as an empty vector");
+		output->frequency = 0;
+		output->contextSize = 0;
+		output->magnitude = 0;
+		return output;
+	}
+	/* a vector cannot hold more non-zero values than it has dimensions */
+	if(typeCheck > (size_t)RIVSIZE){
+		puts("lexicon entry is corrupt, treating it as an empty vector");
+		return output;
+	}
+	/* first value stored is the value count if sparse, and 0 if dense */
+	if (typeCheck){
+		/* pull as sparseVector */
+		sparseRIV temp;
+		/* value was not 0, so it's the value count */
+		temp.count = typeCheck;
+		temp.locations = (int*)malloc(temp.count*2*sizeof(int));
+		if(!temp.locations){
+			free(output);
+			return NULL;
+		}
+		temp.values = temp.locations+temp.count;
+		size_t itemsRead = fread(temp.locations, sizeof(int), temp.count, lexWord);
+		itemsRead += fread(temp.values, sizeof(int), temp.count, lexWord);
+		/* only add the sparse data when all of it arrived and every
+		 * location is a valid index into the dense vector */
+		int valid = (itemsRead == 2*temp.count);
+		for(size_t i = 0; valid && i < temp.count; i++){
+			if(temp.locations[i] < 0 || temp.locations[i] >= RIVSIZE){
+				valid = 0;
+			}
+		}
+		if(valid){
+			addS2D(output->values, temp);
+		}else{
+			puts("lexicon entry is corrupt, treating it as an empty vector");
+		}
+		free(temp.locations);
+	}else{
+		/* typecheck is thrown away, just a flag in this case.
+		 * fLexPush stores an all-zero vector as count 0 with no values, so a
+		 * short read here is expected and is not reported */
+		fread(output->values, RIVSIZE, sizeof(int), lexWord);
+	}
+	output->cached = 0;
+	return output;
+}
+/* cacheDump writes every cached denseRIV to its lexicon file via fLexPush.
+ * a slot is emptied once its vector has been pushed, because a successful
+ * fLexPush frees the vector: leaving the stale pointer in the cache made a
+ * second dump (lexClose followed by a signal, or a signal whose default
+ * action does not end the process) push and free the same memory again.
+ * slots whose push failed keep their vector.
+ * returns the number of vectors that could not be written (0 on success)
+ */
+int cacheDump(){
+	int flag = 0;
+	for(int i = 0; i < CACHESIZE; i++){
+		if(!RIVKey.RIVCache[i]){
+			continue;
+		}
+		if(fLexPush(RIVKey.RIVCache[i])){
+			flag++;
+		}else{
+			RIVKey.RIVCache[i] = NULL;
+		}
+	}
+	return flag;
+}
+/*TODO add a simplified free function*/
+/* signalSecure is the signal handler installed by lexOpen: it dumps the RIV
+ * cache, reports the outcome on stdout, then restores the default action
+ * for signum and sends that signal to this process again.
+ * si and arg belong to the SA_SIGINFO handler signature and are not used
+ */
+void signalSecure(int signum, siginfo_t *si, void* arg){
+	const char* report = "cache dumped successfully";
+	if(cacheDump() != 0){
+		report = "cache dump failed, some lexicon data lost";
+	}
+	puts(report);
+	signal(signum, SIG_DFL);
+	kill(getpid(), signum);
+}
+#endif
--- a/RIVread.c
+++ b/RIVread.c
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
+#include <sys/stat.h>
-#define CACHESIZE 15000
+#include <sys/types.h>
-#include <setjmp.h>
+#include <unistd.h>
-#include <signal.h>
+#include <dirent.h>
-#include "RIVtools.h"
+#include <error.h>
-#include <sys/stat.h>
+#include "../../RIVtools.h"
-#include <sys/types.h>
-#include <unistd.h>
+//this program reads a directory full of files, and adds all context vectors (considering file as context)
-#include <dirent.h>
+//to all words found in these files. this is used to create a lexicon, or add to an existing one
-#include <error.h>
+void fileGrind(FILE* textFile);
-void fileGrind(FILE* textFile);
+void addContext(denseRIV* lexRIV, sparseRIV context);
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
+void directoryGrind(char *rootString);
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
-void directoryGrind(char *rootString);
-void readdirContingency(int sigNumber);
+//scans the directory given as argv[1] and adds the context of its files
+//to the lexicon kept in directory "lexicon"
+//returns 0 after the scan, 1 when the directory argument is missing,
+//too long, or cannot be stat-ed
+int main(int argc, char *argv[]){
+	char pathString[1000];
-jmp_buf readdirRecov;
-int main(int argc, char *argv[]){
+	//the root directory is mandatory: without this check a missing
+	//argument made strcpy read the NULL pointer stored in argv[1]
+	if(argc < 2){
+		printf("usage: %s <directory>\n", argc > 0 ? argv[0] : "RIVread");
+		return 1;
+	}
+	//we open the lexicon, if it does not yet exist, it will be created
-	clock_t begintotal = clock();
+	lexOpen("lexicon");
-	lexOpen("/home/drbob/Documents/lexicon");
-	char pathString[1000];
+	//we format the root directory, preparing to scan its contents
-	strcpy(pathString, argv[1]);
-	strcat(pathString, "/");
+	//(bounded, so an over-long argument cannot overflow pathString)
+	int pathLength = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
-	struct stat st = {0};
+	if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
+		printf("directory name is too long");
+		return 1;
+	}
-	if(stat(pathString, &st) == -1) {
+	//ensure that the targeted root directory exists
-		return 1;
-	}
+	struct stat st;
+	if(stat(pathString, &st) == -1) {
-	directoryGrind(pathString);
+		printf("directory doesn't seem to exist");
+		return 1;
-	clock_t endtotal = clock();
+	}
-	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
+	//we will scan the directory, adding all data to our lexicon, as seen inside
-	printf("total time:%lf\n\n", time_spent);
+	directoryGrind(pathString);
-	lexClose();
-	return 0;
+	//we close the lexicon again, ensuring all data is secured
-}
+	lexClose();
+	return 0;
-void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
+}
-	denseRIV *denseSet_slider = denseSet;
-	denseRIV *dense_stop = denseSet+RIVCount;
+//mostly a standard recursive Dirent-walk: every entry of rootString (which
+//must end in '/') is visited; sub-directories are walked recursively and
+//every file whose path ends in ".txt" is handed to fileGrind.
+//entries whose name starts with '.' are skipped
+void directoryGrind(char *rootString){
+	/* *** begin Dirent walk *** */
+	char pathString[2000];
-	while(denseSet_slider<dense_stop){
+	DIR *directory;
-		addS2D((*denseSet_slider).values, additive);
+	struct dirent *files = 0;
-		*(denseSet_slider->contextSize) += additive.frequency;
-		denseSet_slider++;
+	if(!(directory = opendir(rootString))){
+		printf("location not found, %s\n", rootString);
-	}
+		return;
+	}
-}
-int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
+	while((files=readdir(directory))){
-	denseRIV* RIVStop = RIVSet+wordCount;
-	while(RIVSet<RIVStop){
+		if(!files->d_name[0]) break;
-		if(!strcmp(word, RIVSet->name)){
+		//skip ".", ".." and hidden entries.  continuing the outer loop lets its
+		//condition catch the end of the directory; the former inner loop
+		//dereferenced the NULL that readdir returns there
+		if(*(files->d_name)=='.'){
-			return 1;
+			continue;
 		}
-		RIVSet++;
-	}
-	return 0;
-}
+		//build "<root><name>", leaving room for a trailing '/' and refusing
+		//names that would overflow pathString
+		int pathLength = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
+		if(pathLength < 0 || (size_t)pathLength+1 >= sizeof pathString){
+			printf("skipped: %s\n", files->d_name);
+			continue;
+		}
+		if(files->d_type == DT_DIR){
-void directoryGrind(char *rootString){
+			strcat(pathString, "/");
-	char pathString[2000];
-	DIR *directory;
-	struct dirent *files = 0;
+			directoryGrind(pathString);
+			continue;
-	if(!(directory = opendir(rootString))){
+		}
-		printf("location not found, %s\n", rootString);
-		return;
-	}
+		printf("%s\n", pathString);
-	while((files=readdir(directory))){
+/* *** end dirent walk, begin meat of function  *** */
-		if(setjmp(readdirRecov)){
+		//check for non-txt files (a path shorter than the suffix cannot match)
-			continue;
+		if(pathLength < 4 || strcmp(pathString+pathLength-4, ".txt")){
-		}
+			printf("skipped: %s\n", files->d_name);
-		//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
+			continue;
-		while(*(files->d_name)=='.'){
+		}
-			files = readdir(directory);
+		//open a file within root directory
-		}
+		FILE *input = fopen(pathString, "r");
-		//signal(SIGSEGV, signalSecure);
+		if(input){
-		if(files->d_type == DT_DIR){
+			//process this file and add its data to lexicon
-			strcpy(pathString, rootString);
+			fileGrind(input);
-			strcat(pathString, files->d_name);
+			fclose(input);
-			strcat(pathString, "/");
+		}
-			directoryGrind(pathString);
-		}
+	}
-		strcpy(pathString, rootString);
+	//release the directory stream, which was previously leaked on every call
-		strcat(pathString, files->d_name);
+	closedir(directory);
-		printf("%s\n", pathString);
+}
-		FILE *input = fopen(pathString, "r+");
-		if(input){
+//form context vector from contents of file, then add that vector to
-			fileGrind(input);
+//all lexicon entries of the words contained
-			fclose(input);
+void fileGrind(FILE* textFile){
-		}
+	//form a context vector.  "clean" indicates that it will ignore any word which
-	}
+	//contains unwanted characters
-}
+	sparseRIV contextVector = fileToL2Clean(textFile);
-void fileGrind(FILE* textFile){
+	//an array of denseRIVs, large enough to hold all vectors 
-	sparseRIV aggregateRIV = fileToL2Clean(textFile);
+	//(we don't yet know how many vectors there will be, so we make it big enough for the  maximum)
-	fseek(textFile, 0, SEEK_SET);
+	denseRIV* lexiconRIV;
-	int wordCount = 0;
+	char word[100] = {0};
-	denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
+	while(fscanf(textFile, "%99s", word)){
-	char word[200];
+		//we ensure that each word exists, and is free of unwanted characters
+		if(feof(textFile)) break;
-	while(fscanf(textFile, "%99s", word)){
+		if(!(*word))continue;
-		if(feof(textFile)) break;
-		if(!(*word))continue;
+		if(!isWordClean((char*)word)){
+			continue;
-		if(!isWordClean((char*)word)){
+		}
-			continue;
-		}
-		if(checkDupe(RIVArray, word, wordCount)){
+		//we pull the vector corresponding to each word from the lexicon
-			continue;
+		//if it's a new word, lexPull returns a 0 vector
-		}
+		lexiconRIV= lexPull(word);
-		RIVArray[wordCount] = lexPull(word);
+		//we add the context of this file to this wordVector
-		if(!*((RIVArray[wordCount].name))) break;
+		addContext(lexiconRIV, contextVector);
-		*(RIVArray[wordCount].frequency)+= 1;;
+		//we remove the sub-vector corresponding to the word itself
-		//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
+		subtractThisWord(lexiconRIV);
-		wordCount++;
+		//we log that this word has been encountered one more time
+		lexiconRIV->frequency += 1;
-	}
-	//printf("%d\n", wordCount);
+		//and finally we push it back to the lexicon for permanent storage
+		lexPush(lexiconRIV);
-	addS2Ds(RIVArray, aggregateRIV, wordCount);
-	denseRIV* RIVArray_slider = RIVArray;
+	}
-	denseRIV* RIVArray_stop = RIVArray+wordCount;
+	free(contextVector.locations);
-	while(RIVArray_slider<RIVArray_stop){
+}
-		lexPush(*RIVArray_slider);
+void addContext(denseRIV* lexRIV, sparseRIV context){
-		RIVArray_slider++;
-	}
+		//add context to the lexRIV, (using sparse-dense vector comparison)
-	free(RIVArray);
+		addS2D(lexRIV->values, context);
-	free(aggregateRIV.locations);
+		//log the "size" of the vector which was added
-}
+		//this is not directly necessary, but is useful metadata for some analyses
-void readdirContingency(int sigNumber){
+		lexRIV->contextSize += context.contextSize;
-	puts("readdir segfaulted, trying to recover");
-	longjmp(readdirRecov, 1);
+}
-}
--- a/RIVtools.h
+++ b/RIVtools.h
 #ifndef RIVTOOLS_H_
 #define RIVTOOLS_H_
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include "RIVLower.h"
 #include "RIVaccessories.h"
+#include "RIVlexicon.h"
-/* lexPush writes a denseRIV to a file for permanent storage */
-int lexPush(denseRIV RIVout);
-/* lexPull reads an existing lexicon entry (under directory "lexicon")
- * and creates a denseRIV with those attributes.
- * if the file does not exist, it creates a 0 vector with the name of word
- */
-denseRIV lexPull(char* word);
 /* fileToL2 takes an input file, reads words (delimiting on " " and "\n") 
 * and returns a sparse RIV which is the vector sum of the base RIVs of each 
 * word contained
@@ -29,35 +22,29 @@ sparseRIV fileToL2(FILE *input);
 */
 sparseRIV fileToL2Clean(FILE *data);
-/*filetoL2direct is an experiment in simplifying the process.  it's slow */
+/* like fileToL2 but takes a block of text */
-sparseRIV fileToL2direct(FILE *data);
+sparseRIV textToL2(char *text);
 /*cosine determines the "similarity" between two RIVs. */
 double cosCompare(denseRIV baseRIV, sparseRIV comparator);
-/*currently unused */
+/*used for analysis of lexicon vectors (not simply accumulation)
-sparseRIV wordtoL2(char* word);
+ * to avoid overflow of even a 64 bit integer, vectors must be normalized
+ * this is an experimental approximation of true normal, which should yield 
-/* converts an implicit RIV (a set of unvalued locations) into a formal 
+ * some extra data about the nature of this word's context
- * sparse RIV.  this chooses the best method to perform the consolidation
+ */
- * and launches that function   defunct right now for memory usage reasons*/
-sparseRIV consolidateI2S(int *implicit, size_t valueCount);
-sparseRIV normalizeFloored(denseRIV input, int factor);
 sparseRIV normalize(denseRIV input, int factor);
-int roundMultiply(int base, float divisor);
-/* like fileToL2 but takes a block of text */
-sparseRIV text2L2(char *text);
 /* calculates the magnitude of a sparseVector */ //TODO contain integer overflow in square process
 double getMagnitudeSparse(sparseRIV input);
+/* same for denseVector */
-sparseRIV text2L2(char *text){
+double getMagnitudeDense(denseRIV *input); //TODO consolidate these into one function
+sparseRIV textToL2(char *text){
 	int wordCount = 0;
-	unsigned char word[100] = {0};
+	char word[100] = {0};
 	int denseTemp[RIVSIZE] = {0};
 	/* locations (implicit RIV) are temp stored in temp block, and moved 
@@ -71,7 +58,6 @@ sparseRIV text2L2(char *text){
 		if(!displacement){
 			break;
 		}
 		if(!(*word)){
 			break;
 		}
@@ -90,18 +76,16 @@ sparseRIV text2L2(char *text){
 	addI2D(denseTemp, locations, locationCount);
 	sparseRIV output = consolidateD2S(denseTemp);
-	/* frequency records the number of words in this file, untill frequency
+	/* contextSize stores the number of words read */
-	 * is needed to hold some more useful data point */
+	output.contextSize = wordCount;
-	output.frequency = wordCount;
-	output.boolean = 1;
 	return output;
 }
 sparseRIV fileToL2(FILE *data){
-	unsigned char word[100] = {0};
+	char word[100] = {0};
-	/* locations (implicit RIV) are temp stored in temp block, and moved
+	/* locations (implicit RIV) are temporarily stored in temp block, 
-	 * to permanent home in consolidation */
+	 * and moved to permanent home in consolidation */
 	int *locations = RIVKey.h_tempBlock;
 	int locationCount = 0;
 	int denseTemp[RIVSIZE] = {0};
@@ -129,17 +113,16 @@ sparseRIV fileToL2(FILE *data){
 	addI2D(denseTemp, locations, locationCount);
 	sparseRIV output = consolidateD2S(denseTemp);
-	/* frequency records the number of words in this file */
+	/* contextSize records the number of words in this file */
-	output.frequency = wordCount;
+	output.contextSize = wordCount;
-	output.boolean = 1;
+	fseek(data, 0, SEEK_SET);
 	return output;
 }
 sparseRIV fileToL2Clean(FILE *data){
 	int denseTemp[RIVSIZE] = {0};
-	unsigned char word[100] = {0};
+	char word[100] = {0};
 	int *locations = RIVKey.h_tempBlock;
 	unsigned int wordCount = 0;
@@ -172,44 +155,24 @@ sparseRIV fileToL2Clean(FILE *data){
 	sparseRIV output = consolidateD2S(denseTemp);
 	/* contextSize records the number of words in this file */
-	output.frequency = locationCount/NONZEROS;
+	output.contextSize = locationCount/NONZEROS;
-	output.boolean = 1;
+	fseek(data, 0, SEEK_SET);
 	return output;
 }
-//defunct temporarily, might make a return
-/*sparseRIV consolidateI2S(int *implicit, size_t valueCount){
-	if(valueCount<RIVKey.I2SThreshold){
-		 //direct method is faster on small datasets, but has geometric scaling on large datasets 
-		return consolidateI2SDirect(implicit, valueCount);
-	}else{
-		// optimized for large datasets 
-		return consolidateI2SIndirect(implicit, valueCount);
-	}
-}*/
-void aggregateWord2D(denseRIV destination, char* word){
-	srand(wordtoSeed((unsigned char*)word));
-	for(int i=0; i<NONZEROS; i++){
-		destination.values[(rand()%RIVSIZE)] +=1;
-		destination.values[(rand()%RIVSIZE)] -= 1;
-	}
-}
 double cosCompare(denseRIV baseRIV, sparseRIV comparator){
-	int dot = 0;
+	long long int dot = 0;
-	int n = comparator.count;
+	int* locations_stop = comparator.locations+comparator.count;
-	while(n){
+	int* locations_slider = comparator.locations;
-		n--;
+	int* values_slider = comparator.values;
+	while(locations_slider<locations_stop){
 		/* we calculate the dot-product to derive the cosine 
 		 * comparing sparse to dense by index*/
-		//dot += values[i]*baseRIV.values[locations[i]];
+		dot += *values_slider * baseRIV.values[*locations_slider];
-		dot += comparator.values[n] * baseRIV.values[comparator.locations[n]];
+		locations_slider++;
+		values_slider++;
-		//printf("%d, %d, %d\n",baseRIV.values[comparator.locations[n]],comparator.values[n] , n);
 	}
 	/*dot divided by product of magnitudes */
@@ -222,181 +185,65 @@ double getMagnitudeSparse(sparseRIV input){
 	int *values = input.values;
 	int *values_stop = values+input.count;
 	while(values<values_stop){
+		/* we sum the squares of all elements */
 		temp += (*values)*(*values);
-		//if(temp> 0x0AFFFFFFFFFFFFFF) printf("%s, fuuuuuuuuuuuuck*****************************************",input.name );
 		values++;
 	}
+	/* we take the root of that sum */
 	return sqrt(temp);
 }
-denseRIV lexPull(char* word){
+double getMagnitudeDense(denseRIV *input){
-	#if CACHESIZE > 0
+	size_t temp = 0;
+	int *values = input->values;
-	/* if there is a cache, first check if the word is cached */
+	int *values_stop = values+RIVSIZE;
-	srand(wordtoSeed((unsigned char*)word));
+	while(values<values_stop){
-	int hash = rand()%CACHESIZE;
+		if(*values){
-	if(!strcmp(word, RIVKey.RIVCache[hash].name)){
+			temp += (*values)*(*values);
-		/* if word is cached, pull from cache and exit */
-		return RIVKey.RIVCache[hash];
-	}
-	#endif /* CACHESIZE > 0 */
-	/* if not, attempt to pull the word data from lexicon file */
-	denseRIV output;
-	char pathString[200];
-	sprintf(pathString, "%s/%s", RIVKey.lexName, word);
-	FILE *lexWord = fopen(pathString, "rb");
-	/* if this lexicon file already exists */
-	if(lexWord){
-		/* pull data from file */
-		output = fLexPull(lexWord);
-		fclose(lexWord);
-	}else{
-		/*if file does not exist, return a 0 vector (word is new to the lexicon */ //#TODO enable NO-NEW features to protect mature lexicons? 
-		output = denseAllocate();
-	}
-	strcpy(output.name, word);
-	return output;
-}
-int lexPush(denseRIV RIVout){
-	#if CACHESIZE == 0
-	/* if there is no cache, simply push to file */
-	fLexPush(RIVout);
-	return 0;
-	#else /* CACHESIZE != 0 */
-	/* if our RIV was cached, there are two options (hopefully)
-	 * either the RIV is still cached, and the data has been updated 
-	 * to the cache or the RIV was pushed out from under it, 
-	 * in which case it has already been pushed! move on*/
-	if(RIVout.cached){
-		return 0;
-	}
-	srand(wordtoSeed((unsigned char*)RIVout.name));
-	int hash = rand()%CACHESIZE;
-	if(!RIVKey.RIVCache[hash].cached){
-		/* if there is no word in this cache slot, push to cache instead of file */
-		RIVKey.RIVCache[hash] = RIVout;
-		RIVKey.RIVCache[hash].cached = 1;
-		return 0;
-	/*if the current RIV is more frequent than the RIV holding its slot */
-	}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
-		/* push the current cache entry to a file */
-		int diag = fLexPush(RIVKey.RIVCache[hash]);
-		/* push the current RIV to cache */
-		RIVKey.RIVCache[hash] = RIVout;
-		RIVKey.RIVCache[hash].cached = 1;
-		return diag;
-	}else{
-		/* push current RIV to file */
-		fLexPush(RIVout);
-	}
-	return 0;
-	#endif /* CACHESIZE == 0 */
-}
-sparseRIV fileToL2direct(FILE *data){;
-	unsigned char word[100] = {0};
-	denseRIV denseTemp;
-	// a temporary dense RIV is stored in the tempBlock 
-	denseTemp.values = RIVKey.h_tempBlock;
-	memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
-	int count = 0;
-	while(fscanf(data, "%99s", word)){
-		count++;
-		if(feof(data)){
-			break;
-		}
-		if(!(*word)){
-			break;
 		}
+		values++;
-		// add word's L1 RIV to the accumulating implicit RIV 
-		aggregateWord2D(denseTemp, (char*)word);
 	}
-	sparseRIV output = consolidateD2S(denseTemp.values);
+	return sqrt(temp);
-	// frequency records the number of words in this file 
-	output.frequency = count;
-	output.boolean = 1;
-	return output;
 }
-sparseRIV normalizeFloored(denseRIV input, int factor){
-	float divisor = (float)factor/(*input.contextSize);
-//	printf("in norm: %d, %d, %f\n", *input.contextSize, factor, divisor);
-	int* locations = RIVKey.h_tempBlock;
-	int* values = locations+RIVSIZE;
-	int count = 0;
-	for(int i=0; i<RIVSIZE; i++){
-		if(!input.values[i]) continue;
-		locations[count] = i;
-		values[count]= input.values[i]*divisor;
-		if(values[count])count++;
-	}
-	sparseRIV output;
-	output.locations = (int*) malloc(count*2*sizeof(int));
-	output.values = output.locations+count;
-	memcpy(output.locations, locations, count*sizeof(int));
-	memcpy(output.values, values, count*sizeof(int));
-	strcpy(output.name, input.name);
-	output.count = count;
-	output.magnitude = getMagnitudeSparse(output);
-	output.contextSize = *input.contextSize;
-	output.frequency = *input.frequency;
-	return output;
-}
 sparseRIV normalize(denseRIV input, int factor){
-	float divisor = (float)factor/(*input.contextSize);
+	/* multiplier is the scaling factor we need to bring our vector to the right size */
-//	printf("in norm: %d, %d, %f\n", *input.contextSize, factor, divisor);
+	float multiplier = (float)factor/(input.contextSize);
-	int* locations = RIVKey.h_tempBlock;
+	/* write to temp slot, data will go to a permanent home lower in function */
+	int* locations = RIVKey.h_tempBlock+RIVSIZE;
 	int* values = locations+RIVSIZE;
 	int count = 0;
 	for(int i=0; i<RIVSIZE; i++){
+		/* if this point is 0, skip it */
 		if(!input.values[i]) continue;
+		/* record position and value in the forming sparse vector */
 		locations[count] = i;
-		values[count]= roundMultiply(input.values[i], divisor);
+		values[count]= round(input.values[i]*multiplier);
-		if(values[count])count++;
+		/* drop any 0 values */
+		if(values[count] > 1)count++; 
 	}
 	sparseRIV output;
+	output.count = count;
+	/* for memory conservation, both datasets are put inline with each other */
 	output.locations = (int*) malloc(count*2*sizeof(int));
 	output.values = output.locations+count;
+	/* copy the data from tempBlock into permanent home */
 	memcpy(output.locations, locations, count*sizeof(int));
 	memcpy(output.values, values, count*sizeof(int));
+	/* carry metadata */
 	strcpy(output.name, input.name);
-	output.count = count;
 	output.magnitude = getMagnitudeSparse(output);
-	output.contextSize = *input.contextSize;
+	output.contextSize = input.contextSize;
-	output.frequency = *input.frequency;
+	output.frequency = input.frequency;
 	return output;
 }
-int roundMultiply(int base, float divisor){
-	float temp = base*divisor;
-	int output = temp*2;
-	if (output%2){
-		output/=2;
-		output+=1;
-	}else{
-		output/=2;
-	}
-	return output;
-}
 #endif