improved lex pointer system

47297b52 · etcart · ad4b27c9 · 47297b52 · 47297b52 · 47297b52
Commit 47297b52 authored Apr 30, 2018 by etcart
Showing with 319 additions and 142 deletions
RIVLower.h
RIVLower.h.gch
RIVclasses
RIVclasses.c
RIVclasses.o
RIVlexicon.h
RIVlexicon.h.gch
RIVread
RIVread.c
RIVread.o
RIVtools.h.gch
--- a/RIVLower.h
+++ b/RIVLower.h
@@ -47,6 +47,7 @@
 /* the size of the tempBlock used in consolidation and implicit RIVs */
 #define TEMPSIZE 3*RIVSIZE

+
 /* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
 * as this is often an ideal case, it is advisable as the default 
 * unless we are doing long term RIV aggregation.
@@ -54,6 +55,7 @@
 * containing locations and values, where pairs are found in like array 
 * indices.
 */
+ 
 typedef struct{
 	char name[100];
 	int *values;
@@ -70,7 +72,7 @@ typedef struct{
 */
 typedef struct{
 	char name[100];
-	int cached;
+	void* cached;
 	int frequency;
 	int contextSize;
 	float magnitude;
@@ -85,7 +87,8 @@ struct RIVData{
 	int h_tempBlock[TEMPSIZE];
 	int tempSize;
 	char lexName[255];
-	denseRIV* RIVCache[CACHESIZE];
+	denseRIV** RIVCache;
+	char flags;
 }static RIVKey;

 /*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with

--- a/RIVLower.h.gch
+++ b/RIVLower.h.gch
--- a/RIVclasses
+++ b/RIVclasses
--- a/RIVclasses.c
+++ b/RIVclasses.c
 #include <stdio.h>
-#define CACHESIZE 0
-#define CACHEEXCLUSIVE 1
 #define RIVSIZE 50000
 #include "RIVtools.h"
+char* clean(char* word);
 char* stem(char* word);
+
+/* a label is a fixed 200-byte character buffer used to hold a class name */
+typedef char label[200];
+/* RIVclass pairs a class name with a heap array of sparse RIVs;
+ * setSize counts the entries currently stored in set */
+struct RIVclass{
+	label name;
+	sparseRIV* set;
+	int setSize;
+};
 int main(){
-	lexOpen("consolidatedLexicon50-8");
-	FILE* text = fopen("../books/pg56902.txt", "r");
-	if(!text){
-		puts("no file");
-		return 1;
-	}
-	denseRIV accumulate = {0};
-	sparseRIV temp;
-	char word[100];
-	while(fscanf(text, "%99s", word)){
-		if(feof(text)) break;
-		if(!*word) break;
-		
+	
+	lexOpen("lexicon", "rx");
+	
+	int classNo = 0;
+	label className = "tempName";
+	label* classNames = calloc(1, sizeof(label));
+	int classCount = 0;
+	
+	struct RIVclass* classes = malloc(sizeof(struct RIVclass));
+	
+	strcpy(classes[classCount].name, className);
+	strcpy(classNames[classCount], className);
+	classCount++;
+	while(1){
 		
-		if(stem(word)){
+		FILE* text = fopen("../bookCleaner/cleanbooks/pg56902clean.txt", "r");
+		if(!text){
+			puts("no file");
+			return 1;
+		}
+		denseRIV accumulate = {0};
+		sparseRIV temp;
+		char word[100];
+		while(fscanf(text, "%99s", word)){
+			if(feof(text)) break;
+			if(!*word) break;
+			if(!*clean(word)) continue;
+			
+			//if(stem(word)){
 			denseRIV* wordRIV = lexPull(word);
 			if(!wordRIV){
-				printf("%s, not in lexicon\n", word);
+				//printf("%s, not in lexicon\n", word);
 				continue;
 			}else{
+				//printf("%s, succesfully pulled\n", word);
 				temp = consolidateD2S(wordRIV->values);
+				
 				addS2D(accumulate.values, temp);
 				
 				
 				free(temp.locations);
 				free(wordRIV);
-				
-				
-				
-				
 			}
-		}else{
-			printf("%s, not in wordNet\n", word);
+		}
+	
+
 		
+		struct RIVclass* class = classes+classNo;
+		class->set = malloc(sizeof(sparseRIV));
+		class->setSize = 0;
+		class->set[class->setSize] = consolidateD2S(accumulate.values);
+		class->setSize++;
+	
+	
+		
+		
+		char* labelTemp = strstr(*classNames, className);
+		if(!labelTemp){
+			classNames = realloc(classNames, classCount*sizeof(label));
+			strcpy(classNames[classCount], className);
+			classCount++;
+		}else{
+			classNo = (labelTemp-*classNames);
+			
+			
 		}
 		
 	}
 	
 	
-	
 	return 0;
 	
 }
-
+/* clean strips every character that is not an ASCII letter from word and
+ * lowercases the letters that remain.  the filtering happens in place: the
+ * result is never longer than the input, so no scratch buffer is needed and
+ * words of any length are handled.
+ * returns word, which may have become the empty string */
+char* clean(char* word){
+	char* out = word;
+	for(const char* in = word; *in; in++){
+		if(*in >= 'A' && *in <= 'Z'){
+			/* move upper case onto the matching lower case letter */
+			*out++ = *in - 'A' + 'a';
+		}else if(*in >= 'a' && *in <= 'z'){
+			*out++ = *in;
+		}
+	}
+	*out = '\0';
+	return word;	
+}

 char* stem(char* word){
 	
@@ -61,10 +115,15 @@ char* stem(char* word){
 	
 	fscanf(WNfile, "%d", &WNdata);
 	
-	if(!WNdata) return NULL;
-	
-	if(WNdata == 1) return word;
+	if(!WNdata) {
 	
+		fclose(WNfile);
+		return NULL;
+	}
+	if(WNdata == 1) {
+		fclose(WNfile);
+		return word;
+	}
 	if(WNdata == 2){
 		fscanf(WNfile, "%s", word);
 		fclose(WNfile);

--- a/RIVclasses.o
+++ b/RIVclasses.o
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
@@ -5,9 +5,39 @@
 #include "RIVaccessories.h"
 #include "assert.h"

-#ifndef CACHEEXCLUSIVE
-#define CACHEEXCLUSIVE 0
+/* bit flags that lexOpen combines into LEXICON.flags; each value can be
+ * overridden by defining the macro before this header is included */
+/* read access */
+#ifndef READFLAG
+#define READFLAG 0x01
 #endif
+
+/* set when the open flags contain 'w' */
+#ifndef WRITEFLAG
+#define WRITEFLAG 0x02
+#endif
+
+/* set when the open flags contain no 'x': lexPull then returns a zero
+ * vector for a word that has no file in the lexicon */
+#ifndef INCFLAG 
+#define INCFLAG 0x04
+#endif
+
+/* set when lexOpen has attached a cache to the lexicon */
+#ifndef CACHEFLAG
+#define CACHEFLAG 0x08
+#endif
+
+/* the hash cache is the default unless SORTCACHE was defined */
+#ifndef SORTCACHE
+	#ifndef HASHCACHE
+		#define HASHCACHE
+	#endif
+#endif
+/* LEXICON is the handle returned by lexOpen and passed to the other lex
+ * functions:
+ *   lexName   - lexicon directory; word files are opened as lexName/word
+ *   cache     - CACHESIZE vector slots, left NULL when no cache was set up
+ *   listPoint - this lexicon's node in the rootCache list
+ *   flags     - bitwise OR of READFLAG, WRITEFLAG, INCFLAG and CACHEFLAG */
+typedef struct{
+	char lexName[100];
+	denseRIV* *cache;
+	struct cacheList* listPoint;
+	char flags;
+}LEXICON;
+/* cacheList is a doubly linked list with one node per open cache;
+ * rootCache is its head, and signalSecure walks it to dump every cache */
+struct cacheList{
+	denseRIV* *cache;
+	struct cacheList* next;
+	struct cacheList* prev;
+}*rootCache = NULL;
+
+
 #define IODISPLACEMENT   (sizeof(((sparseRIV*)0)->count)\
 						+ sizeof(((sparseRIV*)0)->frequency)\
 						+ sizeof(((sparseRIV*)0)->contextSize)\
@@ -15,17 +45,18 @@
 						/ sizeof(int)
 int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE; //#TODO format this better

+
 /* lexOpen is called to "open the lexicon", setting up for later calls to
 * lexPush and lexPull. if the lexicon has not been opened before calls
 * to these functions, their behavior can be unpredictable, most likely crashing
 */
-void lexOpen();
+LEXICON* lexOpen(const char* lexName, const char* flags);

 /* lexClose should always be called after the last lex push or lex pull call
 * if the lexicon is left open, some vector data may be lost due to 
 * un-flushed RIV cache
 */
-void lexClose();
+void lexClose(LEXICON*);


 /* both lexPush and lexPull must be called *after* the lexOpen() function
@@ -33,34 +64,34 @@ void lexClose();
 * data security */
 
 /* lexPush writes a denseRIV to the lexicon for permanent storage */
-int lexPush(denseRIV* RIVout);
+int lexPush(LEXICON* lexicon, denseRIV* RIVout);

 /* cacheCheckOnPush tests the state of this vector in our lexicon cache
 * and returns 1 on "success" indicating cache storage and no need to push to file
 * or returns 0 on "failure" indicating that the vector need be pushed to file 
 */
-int cacheCheckOnPush(denseRIV* RIVout);
+int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);

 /* lexPull reads a denseRIV from the lexicon, under "word"
 * if the file does not exist, it creates a 0 vector with the name of word
 * lexPull returns a denseRIV *pointer* because its data must be tracked 
 * globally for key optimizations
 */
-denseRIV* lexPull(char* word);
+denseRIV* lexPull(LEXICON* lexicon, char* word);

 /* cacheCheckOnPull checks if the word's vector is stored in cache,
 * and returns a pointer to that vector on success
 * or returns a NULL pointer if the word is not cached, indicating a need 
 * to pull from file
 */
-denseRIV* cacheCheckOnPull(char* word);
+denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word);

 /* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
 * saving it for long-term aggregation.  function is called by "lexPush",
 * which is what users should actually use.  lexPush, unlike fLexPush,
 * has cache logic under the hood for speed and harddrive optimization
 */
-int fLexPush(denseRIV* RIVout);
+int fLexPush(LEXICON* lexicon, denseRIV* RIVout);

 /* fLexPull pulls data directly from a file and converts it (if necessary)
 * to a denseRIV.  function is called by "lexPull" which is what users 
@@ -77,63 +108,187 @@ void signalSecure(int signum, siginfo_t *si, void* arg);
 */
 int saturationForStaging(denseRIV* output);
 /* begin definitions */
-void lexOpen(char* lexName){
-	
+/* lexOpen opens the lexicon directory lexName and returns a heap-allocated
+ * handle for lexPull/lexPush; release it with lexClose.
+ * flags is searched for these letters:
+ *   'w' - write access; the directory is created if it does not exist
+ *   'r' - read access; without 'w' a missing directory is an error
+ *   'x' - exclusive; when absent INCFLAG is set, letting lexPull return
+ *         zero vectors for words that are new to the lexicon
+ * returns NULL if an argument is NULL, if lexName does not fit into
+ * LEXICON.lexName, if memory runs out, or if a read-only lexicon is missing */
+LEXICON* lexOpen(const char* lexName, const char* flags){
+	if(!lexName || !flags){
+		return NULL;
+	}
+	/* lexName is copied into a fixed-size field below; refuse names that
+	 * would overflow it */
+	if(strlen(lexName) >= sizeof(((LEXICON*)0)->lexName)){
+		return NULL;
+	}
+	/* identify the presence of read, write, and exclusive flags */
+	const char* r = strstr(flags, "r");
+	const char* w = strstr(flags, "w");
+	const char* x = strstr(flags, "x");
 	struct stat st = {0};
-	if (stat(lexName, &st) == -1) {
-		mkdir(lexName, 0777);
-	}	
-	strcpy(RIVKey.lexName, lexName);
-	/* open a slot at least large enough for ;worst case handling of
-	 * sparse to dense conversion.  may be enlarged by filetoL2 functions */
-	struct sigaction action = {0};
-	action.sa_sigaction = signalSecure;
-	action.sa_flags = SA_SIGINFO;
-	for(int i=1; i<27; i++){
-		sigaction(i,&action,NULL);
+	
+	LEXICON* output = calloc(1, sizeof(LEXICON));
+	if(!output){
+		return NULL;
+	}
+	
+	if(w){
+		/* if set to write, we check and create if necessary, the lexicon */
+		if (stat(lexName, &st) == -1) {
+			mkdir(lexName, 0777);
+		}	
+		output->flags |= WRITEFLAG;
+	}else if(r){
+		/* if set to read and not write, return null if lexicon does not exist */
+		if (stat(lexName, &st) == -1) {
+			free(output);
+			return NULL;
+		}	
+	}
+	/* read access is recorded on its own so that "rw" carries both flags */
+	if(r){
+		output->flags |= READFLAG;
+	}
+	/* if not set to exclusive, set the inclusive flag */
+	if(!x){
+		output->flags |= INCFLAG;
 	}
-	 
+	/* cannot overflow: the length was checked on entry */
+	strcpy(output->lexName, lexName);
+	
+	
+	#if CACHESIZE > 0
+	
+	if(r && w){
+		//#TODO include hash vs sort cache logic flags
+		/* if we will be reading and writing the same lexicon, setup a
+		 * cache for this lexicon to speed up rewrites */
+		struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
+		if(newCache){
+			newCache->cache = calloc(CACHESIZE, sizeof(denseRIV*));
+		}
+		if(!newCache || !newCache->cache){
+			free(newCache);
+			free(output);
+			return NULL;
+		}
+		output->flags |= CACHEFLAG;
+		output->cache = newCache->cache;
+		
+		/* link the new cache in at the head of the rootCache list */
+		newCache->next = rootCache;
+		if(rootCache){
+			rootCache->prev = newCache;
+		}
+		rootCache = newCache;
+		output->listPoint = newCache;
+		
+		/* route signals 1 through 26 to signalSecure */
+		struct sigaction action = {0};
+		action.sa_sigaction = signalSecure;
+		action.sa_flags = SA_SIGINFO;
+		
+		for(int i=1; i<27; i++){
+			sigaction(i,&action,NULL);
+		}
+	}
+	#endif

-	/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
-	memset(RIVKey.RIVCache, 0, sizeof(denseRIV*)*CACHESIZE);
+	return output;
+}
+/* lexClose releases a handle obtained from lexOpen.  if the lexicon has a
+ * cache, its vectors are first written out through cacheDump and its node
+ * is unlinked from the rootCache list.  a NULL handle is ignored */
+void lexClose(LEXICON* toClose){
+	if(!toClose){
+		return;
+	}
+	
+#if CACHESIZE>0 
+	if(toClose->flags & CACHEFLAG){
+		/* cacheDump also frees the slot array itself */
+		if(cacheDump(toClose->cache)){
+			puts("cache dump failed, some lexicon data was lost");
+		}
+		struct cacheList* listPoint = toClose->listPoint;
+		/* when this node heads the list the head has to move on too,
+		 * otherwise rootCache would keep pointing at freed memory */
+		if(rootCache == listPoint){
+			rootCache = listPoint->next;
+		}
+		if(listPoint->prev){
+			listPoint->prev->next = listPoint->next;
+		}
+		if(listPoint->next){
+			listPoint->next->prev = listPoint->prev;
+		}
+		free(listPoint);
+	}
+#endif
+	free(toClose);
 }
-void lexClose(){
+
+/* cacheDump writes every vector held in the CACHESIZE slots of toDump to
+ * file with fLexPush, then frees the slot array.
+ * returns the sum of the fLexPush results, so 0 means nothing failed */
+int cacheDump(denseRIV* *toDump){
 	
-	 
-	if(cacheDump()){
-		puts("cache dump failed, some lexicon data was lost");
+	int failures = 0;
+	for(int slot = 0; slot < CACHESIZE; slot++){
+		denseRIV* entry = toDump[slot];
+		if(!entry){
+			continue;
+		}
+		/* a cached vector carries its owning lexicon in ->cached */
+		failures += fLexPush((LEXICON*)entry->cached, entry);
 	}
+	free(toDump);
+	
+	return failures;
 }
+
 #if CACHESIZE > 0
-denseRIV* cacheCheckOnPull(char* word){
+/* cacheCheckOnPull looks word up in the lexicon's cache.
+ * returns the cached vector when the slot chosen for word holds a vector
+ * of that name, otherwise NULL, telling the caller to read from file */
+denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
+	#ifdef HASHCACHE
+	/* the slot index is the first rand() value after seeding with the word */
 	srand(wordtoSeed(word));
 	int hash = rand()%CACHESIZE;
-	if(RIVKey.RIVCache[hash]){
-		if(!strcmp(word, RIVKey.RIVCache[hash]->name)){
+	if(lexicon->cache[hash]){
+		if(!strcmp(word, lexicon->cache[hash]->name)){

 			/* if word is cached, pull from cache and exit */
-			return RIVKey.RIVCache[hash];
+			return lexicon->cache[hash];
 		}
 	}
+	#endif
+	/* reached on a cache miss, and always when only SORTCACHE is defined:
+	 * that lookup is not written yet, so report a miss rather than run
+	 * off the end of a non-void function */
 	return NULL;
+}
+#if CACHESIZE > 0
+/* cacheCheckOnPush offers RIVout to the lexicon's cache.
+ * returns 1 when the vector is held in the cache and needs no file write,
+ * or 0 when the caller has to push it to file itself */
+int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
+	
+	/* if our RIV was cached already, no need to play with it */
+	if(RIVout->cached == lexicon){
+		return 1;
+	}
+	/* HASHCACHE is defined with no value, so it has to be tested with
+	 * #ifdef: "#if HASHCACHE" expands to an empty, invalid expression */
+	#ifdef HASHCACHE
+	srand(wordtoSeed(RIVout->name));
+	int hash = rand()%CACHESIZE;
+	
+	/* if there is no word in this cache slot */
+	if(!lexicon->cache[hash]){
+		/* push to cache instead of file */
+		lexicon->cache[hash] = RIVout;
+		RIVout->cached = lexicon;
+		return 1;
+	}
+	/*if the current RIV is more frequent than the RIV holding its slot */
+	if(RIVout->frequency > lexicon->cache[hash]->frequency ){
+		/* push the lower frequency cache entry to a file */
+		fLexPush(lexicon, lexicon->cache[hash]);
+		/* replace this cache-slot with the current vector */
+		lexicon->cache[hash] = RIVout;
+		RIVout->cached = lexicon;
+		
+		return 1;
+	}
+	#endif
+	/* the cache did not take the vector (the SORTCACHE variant has no
+	 * implementation yet), so it must go to file */
+	return 0;
 }
 #endif
-denseRIV* lexPull(char* word){
+#endif
+denseRIV* lexPull(LEXICON* lexicon, char* word){
 	
 	denseRIV* output = NULL;
 	
 	#if CACHESIZE > 0
-
-	/* if there is a cache, first check if the word is cached */
-	if((output = cacheCheckOnPull(word))){
-		return output;
+	if(lexicon->flags & CACHEFLAG){
+		/* if there is a cache, first check if the word is cached */
+		if((output = cacheCheckOnPull(lexicon, word))){
+			return output;
+		}
 	}
 	#endif /* CACHESIZE > 0 */

 	/* if not, attempt to pull the word data from lexicon file */
 	char pathString[200];

-	sprintf(pathString, "%s/%s", RIVKey.lexName, word);
+	sprintf(pathString, "%s/%s", lexicon->lexName, word);

 	FILE *lexWord = fopen(pathString, "rb");

@@ -145,11 +300,12 @@ denseRIV* lexPull(char* word){
 		strcpy(output->name, word);
 		fclose(lexWord);
 	}else{
-		#if CACHEEXCLUSIVE == 0
-		/*if file does not exist, return a 0 vector (word is new to the lexicon */
-		output = calloc(1, sizeof(denseRIV));
-		strcpy(output->name, word);
-		#endif
+		/* if lexicon is set to inclusive (can gain new words) */
+		if(lexicon->flags & INCFLAG){
+			/*if file does not exist, return a 0 vector (word is new to the lexicon) */
+			output = calloc(1, sizeof(denseRIV));
+			strcpy(output->name, word);
+		}
 		/*if lexicon is set to exclusive, will return a NULL pointer instead of a 0 vector */
 	}

@@ -157,53 +313,23 @@ denseRIV* lexPull(char* word){

 	return output;
 }
-#if CACHESIZE > 0
-int cacheCheckOnPush(denseRIV* RIVout){
-	/* if our RIV was cached already, no need to play with it */
-	if(RIVout->cached){
-		return 1;
-	}
-	
-	srand(wordtoSeed(RIVout->name));
-	int hash = rand()%CACHESIZE;
-	
-	/* if there is no word in this cache slot */
-	if(!RIVKey.RIVCache[hash]){
-		/* push to cache instead of file */
-		RIVKey.RIVCache[hash] = RIVout;
-		RIVKey.RIVCache[hash]->cached = 1;
-		return 1;
-	/*if the current RIV is more frequent than the RIV holding its slot */
-	}
-	if(RIVout->frequency > RIVKey.RIVCache[hash]->frequency ){
-		/* push the lower frequency cache entry to a file */
-		fLexPush(RIVKey.RIVCache[hash]);
-		/* replace this cache-slot with the current vector */

-		RIVKey.RIVCache[hash] = RIVout;
-		RIVKey.RIVCache[hash]->cached = 1;
-		
-		return 1;
-	}
-	return 0;
-	
-	
-}
-#endif
-int lexPush(denseRIV* RIVout){
+/* lexPush stores RIVout in the lexicon.  when the lexicon has a cache and
+ * the cache accepts the vector, nothing is written and 0 is returned;
+ * otherwise the result of fLexPush is returned */
+int lexPush(LEXICON* lexicon, denseRIV* RIVout){
 	
 	#if CACHESIZE > 0
+	const int hasCache = lexicon->flags & CACHEFLAG;
 	/* check the cache to see if it belongs in cache */
-	if(cacheCheckOnPush(RIVout)){
-		/* if the cache check returns 1, it has been dealth with in cache */
-		return 0;
+	if(hasCache && cacheCheckOnPush(lexicon, RIVout)){
+		/* a cache check of 1 means the cache has dealt with the vector */
+		return 0;
 	}
 	
-	#endif /* CACHESIZE != 0 */
+	#endif
 	
 	
 	/* push to the lexicon */
-	return fLexPush(RIVout);
+	return fLexPush(lexicon, RIVout);
 	
 }

@@ -242,12 +368,12 @@ int saturationForStaging(denseRIV* output){
 	
 	return *count;
 }
-int fLexPush(denseRIV* output){	
+int fLexPush(LEXICON* lexicon, denseRIV* output){	
 	char pathString[200] = {0};
 	
 	/* word data will be placed in a (new?) file under the lexicon directory
 	 * in a file named after the word itself */
-	sprintf(pathString, "%s/%s", RIVKey.lexName, output->name);
+	sprintf(pathString, "%s/%s", lexicon->lexName, output->name);
 	
 	int saturation = saturationForStaging(output);
 	
@@ -267,7 +393,7 @@ int fLexPush(denseRIV* output){
 			printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
 			return 1;
 		}
-		fwrite(&output->cached, sizeof(int), RIVSIZE+4, lexWord);
+		fwrite(((int*)&output->cached)+1, sizeof(int), RIVSIZE+4, lexWord);
 		
 		fclose(lexWord);
 	}
@@ -364,28 +490,16 @@ denseRIV* fLexPull(FILE* lexWord){

 }

-
-
-int cacheDump(){
-
-	int flag = 0;
-	for(int i = 0; i < CACHESIZE; i++){
-		if(RIVKey.RIVCache[i]){
-
-			flag += fLexPush(RIVKey.RIVCache[i]);
+/* signalSecure is the handler lexOpen installs with sigaction: it dumps
+ * every cache on the rootCache list to file, restores the default action
+ * for signum and raises the signal again.
+ * NOTE(review): puts and free are not on the POSIX list of
+ * async-signal-safe functions - confirm this is acceptable here */
+void signalSecure(int signum, siginfo_t *si, void* arg){
+	(void)si;
+	(void)arg;
+	while(rootCache){
+		struct cacheList* node = rootCache;
+		if(cacheDump(node->cache)){
+			puts("cache dump failed, some lexicon data lost");
 		}
+		/* step to the next node before freeing this one; on the last
+		 * node next is NULL and must not be dereferenced */
+		rootCache = node->next;
+		if(rootCache){
+			rootCache->prev = NULL;
+		}
+		free(node);
 	}
-	return flag;
-}
-
-
-/*TODO add a simplified free function*/
-void signalSecure(int signum, siginfo_t *si, void* arg){
-  if(cacheDump()){
-	  puts("cache dump failed, some lexicon data lost");
-  }
-  signal(signum, SIG_DFL);
-  kill(getpid(), signum);
+	signal(signum, SIG_DFL);
+	kill(getpid(), signum);
 }

 #endif
--- a/RIVlexicon.h.gch
+++ b/RIVlexicon.h.gch
--- a/RIVread
+++ b/RIVread
--- a/RIVread.c
+++ b/RIVread.c
@@ -6,7 +6,7 @@
 #include <dirent.h>
 #include <error.h>
 #include <string.h>
-#define CACHESIZE 100000
+#define CACHESIZE 10000
 #include "RIVtools.h"

 //this program reads a directory full of files, and adds all context vectors (considering file as context)
@@ -16,13 +16,13 @@ void fileGrind(FILE* textFile);
 void addContext(denseRIV* lexRIV, sparseRIV context);
 void directoryGrind(char *rootString);
 void lineGrind(char* textLine);
-
+LEXICON* lp;
 int main(int argc, char *argv[]){

 	char pathString[1000];
-
+	lp = lexOpen("lexicon", "rw");
 	//we open the lexicon, if it does not yet exist, it will be created
-	lexOpen("lexicon");
+	
 	
 	//we format the root directory, preparing to scan its contents
 	
@@ -39,7 +39,7 @@ int main(int argc, char *argv[]){
 	directoryGrind(pathString);

 	//we close the lexicon again, ensuring all data is secured
-	lexClose();
+	lexClose(lp);
 	return 0;
 }

@@ -81,6 +81,7 @@ void directoryGrind(char *rootString){
 			fclose(input);
 		}
 	}
+	closedir(directory);
 }


@@ -126,7 +127,7 @@ void lineGrind(char* textLine){
 		
 		//we pull the vector corresponding to each word from the lexicon
 		//if it's a new word, lexPull returns a 0 vector
-		lexiconRIV= lexPull(word);
+		lexiconRIV= lexPull(lp, word);

 		//we add the context of this file to this wordVector
 		addContext(lexiconRIV, contextVector);
@@ -138,7 +139,7 @@ void lineGrind(char* textLine){
 		lexiconRIV->frequency += 1;
 		
 		//and finally we push it back to the lexicon for permanent storage
-		lexPush(lexiconRIV);
+		lexPush(lp, lexiconRIV);
 		
 		
 	}

--- a/RIVread.o
+++ b/RIVread.o
--- a/RIVtools.h.gch
+++ b/RIVtools.h.gch