added comments and explanations

3179d5fd · etcart · d78631fd · 3179d5fd · 3179d5fd · 3179d5fd
Commit 3179d5fd authored Feb 21, 2018 by etcart
Showing with 179 additions and 17 deletions
RIVLower.h
RIVLowerMorphic.h
RIVLowerMorphic.h.gch
RIVcentroids.c
RIVcull
RIVcullCPUlinux.c
RIVread
RIVread.c
RIVtoolsCPUlinux.h
RIVtoolsCPUwindows.h
RIVtoolsGPU.h
RIVtoolsMorphic.h
saturation
saturation.c
saturation.o
testfolder/numba1.txt
testfolder/numba2.txt
--- a/RIVLower.h
+++ b/RIVLower.h
--- a/RIVLowerMorphic.h
+++ b/RIVLowerMorphic.h
@@ -191,7 +191,7 @@ sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
 	int *denseTemp = mapI2D(implicit, valueCount);
 	sparseRIV sparseOut = consolidateD2S(denseTemp);
+	/* sparseOut is flagged as sparse in consolidate step */
 	free(denseTemp);
@@ -362,6 +362,7 @@ denseRIV fLexPull(FILE* lexWord){
 		output.magnitude = -1;
 	}
 	output.cached = 0;
+	output.flags &= ~SPARSE;
 	return output;
 }
@@ -397,6 +398,7 @@ denseRIV denseAllocate(){
 	output.frequency = (unsigned int*)(output.values+RIVSIZE);
 	output.magnitude = 0;	
 	output.cached = 0;
+	output.flags &= ~SPARSE;
 	return output;
 }

--- a/RIVLowerMorphic.h.gch
+++ b/RIVLowerMorphic.h.gch
--- a/RIVcentroids.c
+++ b/RIVcentroids.c
--- a/RIVcull
+++ b/RIVcull
--- a/RIVcullCPUlinux.c
+++ b/RIVcullCPUlinux.c
--- a/RIVread
+++ b/RIVread
--- a/RIVread.c
+++ b/RIVread.c
--- a/RIVtoolsCPUlinux.h
+++ b/RIVtoolsCPUlinux.h
@@ -5,6 +5,95 @@
 #include "RIVLower.h"
 #include "RIVaccessories.h"
+/* RIV stands for Random Index Vector, referring to the method of generating
+ * the basic vectors that correspond to each word.  each word has an algorithmically
+ * generated vector which represents it in this mathematical model, such that a word
+ * will produce the same vector each time it is encountered*[1]. this base
+ * vector will be referred to as a L1 vector or a barcode vector
+ * 
+ * by summing these vectors, we can get a mathematical representation of
+ * a set of text.  this summed vector will be referred to as an L2 vector
+ * or aggregate vector.  in its simplest implementation, an L2 vector
+ * representation of a document contains a model of the contents of the 
+ * document, enabling us to compare direction and magnitude of document 
+ * vectors to understand their relationships to each other.
+ * 
+ * but the system we are really interested in is the ability to form 
+ * context vectors.
+ * a context vector is the sum of all (L1?) vectors that the word
+ * has been encountered in context with*[2]. from these context vectors
+ * certain patterns and relationships between words should emerge. 
+ * what patterns? that is the key question we will try to answer
+ * 
+ * [1] a word produces the same vector each time it is encountered only 
+ * if the environment is the same, i.e. the RIVs have the same dimensionality
+ * and the same nonzero count.  comparing vectors produced in different 
+ * environments yields meaningless drivel and should be avoided
+ * 
+ * [2] what exactly "context" means remains a major stumbling point.
+ * paragraphs?  sentences?  some potential analyses would expect a static
+ * sized context (the nearest 10 words?) in order to be sensible, but 
+ * it may be that some other definition of context is the most valid for
+ * this model.  we will have to find out.
+ * 
+ * some notes:
+ * 
+ * -sparseRIV vs. denseRIV (sparse vector vs. dense vector)
+ * the two primary data structures we will use to analyze these vectors
+ * each vector type is packed with some metadata 
+ * (name, magnitude, frequency, flags)
+ * 
+ * 	-denseRIV is a standard vector representation.  
+ * each array index corresponds to a dimension
+ * each value corresponds to a measurement in that dimension
+ * 
+ * 	-sparseRIV is a vector representation optimized for largely empty vectors
+ * each data point is a location/value pair where the
+ * location represents array index 
+ * value represents value in that array index
+ * 
+ * if we have a sparsely populated dense vector (mostly 0s) such as:
+ * 
+ * |0|0|5|0|0|0|0|0|4|0|
+ * 
+ * there are only 2 values in a ten element array. this could, instead
+ * be represented as
+ * 
+ * |2|8| array indexes
+ * |5|4| array values
+ * |2|   record of size
+ * 
+ * and so, a 10 element vector has been represented in only 5 integers
+ * 
+ * this is important for memory use, of course, but also for rapid calculations
+ * if we have two vectors
+ * 
+ * |0|0|5|0|0|0|0|0|4|0|
+ * |0|0|0|0|0|0|7|0|3|-2|
+ * and we wish to perform the dot product this will take 10 steps,
+ * 9 of which are either 0*0 = 0, or 0*x = 0
+ * if we instead have these represented as sparse vectors
+ * |2|8| 
+ * |5|4| 
+ * |2|  
+ * 
+ * |6|8|9|
+ * |7|3|-2|
+ * |3|
+ * 
+ * we only need to search for matching location values 
+ * or, better yet, if we use a hybrid analysis:
+ * |0|0|5|0|0|0|0|0|4|0|
+ *   ___________/__/_/ 
+ *  / / /
+ * |6|8|9|
+ * |7|3|-2|
+ * |3|
+ * we can simply access the dense vector by indexes held in the sparse vector
+ * reducing this operation to only 3 steps */
 /* lexPush writes a denseRIV to a file for permanent storage */
 int lexPush(denseRIV RIVout);
 /* lexPull reads an existing lexicon entry (under directory "lexicon")
@@ -18,19 +107,28 @@ denseRIV lexPull(char* word);
 * word contained
 */
 sparseRIV fileToL2(FILE *input);
 /* fileToL2Clean operates the same as fileToL2 but keeps only words 
 * containing lowercase letters and the '_' symbol
 * this is important if you will be lexPush-ing those words later
 */
 sparseRIV fileToL2Clean(FILE *data);
+/* fileToL2direct is an experiment in simplifying the process.  it's slow */
 sparseRIV fileToL2direct(FILE *data);
 /*cosine determines the "similarity" between two RIVs. */
 float cosCompare(denseRIV baseRIV, sparseRIV comparator);
+/*currently unused */
 sparseRIV wordtoL2(char* word);
+/* converts an implicit RIV (a set of unvalued locations) into a formal 
+ * sparse RIV.  this chooses the best method to perform the consolidation
+ * and launches that function */
 sparseRIV consolidateI2S(int *implicit, size_t valueCount);
+/* like fileToL2 but takes a block of text */
 sparseRIV text2L2(char *text);
 sparseRIV text2L2(char *text){
 	unsigned int blockSize;
@@ -67,7 +165,8 @@ sparseRIV text2L2(char *text){
 	}
 	sparseRIV output = consolidateI2S(locations, locationCount);
-	/* frequency records the number of words in this file */
+	/* frequency records the number of words in this file, until frequency
+	 * is needed to hold some more useful data point */
 	output.frequency = locationCount/NONZEROS;
 	output.boolean = 1;
 	return output;
@@ -158,16 +257,18 @@ sparseRIV fileToL2Clean(FILE *data){
 }
 sparseRIV consolidateI2S(int *implicit, size_t valueCount){
-	if(valueCount>RIVKey.I2SThreshold){
+	if(valueCount<RIVKey.I2SThreshold){
-		return consolidateI2SIndirect(implicit, valueCount);
+		/* direct method is faster on small datasets, but has geometric scaling on large datasets */
-	}else{
 		return consolidateI2SDirect(implicit, valueCount);
+	}else{
+		/* optimized for large datasets */		
+		return consolidateI2SIndirect(implicit, valueCount);
 	}	
 }
 void aggregateWord2D(denseRIV destination, char* word){
-	//makeSparseLocations((unsigned char*)word, locationSlot, 0);
 	srand(wordtoSeed((unsigned char*)word));
 	for(int i=0; i<NONZEROS; i++){
@@ -185,11 +286,13 @@ float cosCompare(denseRIV baseRIV, sparseRIV comparator){
 	int *locations_Stop = locations+comparator.count;
 	while(locations<locations_Stop){
-		/* we calculate the dot-product to derive the cosine */
+		/* we calculate the dot-product to derive the cosine 
+		 * comparing sparse to dense by index*/
 		dot += (*values)*(*(baseRIV.values+(*locations)));
 		locations++;
 		values++;
 	}
+	/*dot divided by product of magnitudes */
 	float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
 	return cosine;
@@ -221,9 +324,9 @@ denseRIV lexPull(char* word){
 		return RIVKey.RIVCache[hash];
 	}
 	#endif /* CACHESIZE > 0 */
-	denseRIV output;
+	/* if not, attempt to pull the word data from lexicon file */
+	denseRIV output;
 	char pathString[200];
@@ -236,7 +339,7 @@ denseRIV lexPull(char* word){
 		output = fLexPull(lexWord);
 		fclose(lexWord);
 	}else{
-		/*if file does not exist, return a 0 vector */
+		/* if file does not exist, return a 0 vector (word is new to the lexicon) */ //#TODO enable NO-NEW features to protect mature lexicons?
 		output = denseAllocate();
 	}
@@ -244,15 +347,16 @@ denseRIV lexPull(char* word){
 	return output;
 }
 int lexPush(denseRIV RIVout){
-	//printf("%s\n", (*RIVout).name);
 	#if CACHESIZE == 0
+	/* if there is no cache, simply push to file */
 		fLexPush(RIVout);
 		return 0;
 	#else /* CACHESIZE != 0 */
 		/* if our RIV was cached, there are two options (hopefully)
-		 * either the RIV is still cached, and the data has been updated to the cache
+		 * either the RIV is still cached, and the data has been updated 
-		 * or the RIV was pushed out from under it, in which case it has already been pushed*/
+		 * to the cache or the RIV was pushed out from under it, 
+		 * in which case it has already been pushed! move on*/
 		if(RIVout.cached){
 			return 0;
@@ -262,17 +366,16 @@ int lexPush(denseRIV RIVout){
 		int hash = rand()%CACHESIZE;
 		if(!RIVKey.RIVCache[hash].cached){
+			/* if there is no word in this cache slot, push to cache instead of file */
 			RIVKey.RIVCache[hash] = RIVout;
 			RIVKey.RIVCache[hash].cached = 1;
 			return 0;
-		/*if the current RIV is more frequent than the RIV holding it's slot */
+		/*if the current RIV is more frequent than the RIV holding its slot */
 		}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
-			//scanf("%f", &(*RIVout).magnitude);
-			//printf("%s replacing %s\n", (*RIVout).name, RIVKey.RIVCache[hash].name);
 			/* push the current cache entry to a file */
 			int diag = fLexPush(RIVKey.RIVCache[hash]);
-			/* replace the cache entry with the currrent RIV */
+			/* push the current RIV to cache */
 			RIVKey.RIVCache[hash] = RIVout;
 			RIVKey.RIVCache[hash].cached = 1;

--- a/RIVtoolsCPUwindows.h
+++ b/RIVtoolsCPUwindows.h
--- a/RIVtoolsGPU.h
+++ b/RIVtoolsGPU.h
--- a/RIVtoolsMorphic.h
+++ b/RIVtoolsMorphic.h
--- a/saturation
+++ b/saturation
--- a/saturation.c
+++ b/saturation.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <time.h>
+#include "RIVtoolsCPUlinux.h"
+void directoryToL2s(char *rootString);
+/* entry point: calls RIVInit() before any other RIV routine, then has
+ * directoryToL2s report on every entry found under "lexicon/" */
+int main(void){
+	char lexiconRoot[] = "lexicon/";
+
+	RIVInit();
+	directoryToL2s(lexiconRoot);
+	return 0;
+}
+/* directoryToL2s walks the directory named by rootString and, for every
+ * entry whose name does not start with '.', pulls that entry with lexPull,
+ * consolidates it into a sparseRIV and prints the sparse vector's count
+ * as the entry's "saturation".  subdirectories are visited recursively.
+ *
+ * rootString must end in '/': paths are formed by appending the entry
+ * name directly to it.
+ *
+ * prints a message and returns if rootString cannot be opened; stops
+ * scanning the current directory if an entry cannot be opened as a file */
+void directoryToL2s(char *rootString){
+	char pathString[2000];
+	struct dirent *files = 0;
+	DIR *directory = opendir(rootString);
+	if(!directory){
+		printf("location not found, %s\n", rootString);
+		return;
+	}
+	while((files=readdir(directory))){
+		/* skips ".", ".." and any other dot-prefixed entry */
+		if(*(files->d_name) == '.') continue;
+
+		int isDirectory = (files->d_type == DT_DIR);
+
+		/* bounded path construction; subdirectories get a trailing '/'
+		 * so they can serve as the rootString of the recursive call */
+		int written = snprintf(pathString, sizeof pathString, "%s%s%s",
+				rootString, files->d_name, isDirectory ? "/" : "");
+		if(written < 0 || (size_t)written >= sizeof pathString){
+			printf("path too long, skipping %s%s\n", rootString, files->d_name);
+			continue;
+		}
+
+		if(isDirectory){
+			directoryToL2s(pathString);
+			/* a directory is not itself an entry: do not fall through
+			 * and try to read it as a file */
+			continue;
+		}
+
+		FILE *input = fopen(pathString, "r");
+		if(!input){
+			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
+			break;
+		}
+
+		/* NOTE(review): lexPull is handed the full path, not a bare word;
+		 * confirm that is what lexPull expects */
+		denseRIV temp = lexPull(pathString);
+		sparseRIV fileRIV = consolidateD2S(temp.values);
+		float count = fileRIV.count;
+		/* print the path directly rather than copying it into fileRIV.name,
+		 * whose capacity is not known here */
+		printf("%s, saturation: %f\n", pathString, count);
+		fclose(input);
+
+		/* lexPull may hand back an entry that still lives in the cache;
+		 * only release vectors that are not cached */
+		if(!temp.cached){
+			free(temp.values);
+		}
+		/* NOTE(review): fileRIV.locations is not released here (the free
+		 * was already disabled); confirm who owns the memory returned
+		 * by consolidateD2S */
+	}
+	/* release the directory handle on every path that opened it */
+	closedir(directory);
+}
--- a/saturation.o
+++ b/saturation.o
--- a/testfolder/numba1.txt
+++ b/testfolder/numba1.txt
--- a/testfolder/numba2.txt
+++ b/testfolder/numba2.txt