Commit 34c65893 by amberhosen

updated RIVreads

parent 9d2c0fed
#ifndef RIVLOWER_H_
#ifndef RIVLOWER_H_ #define RIVLOWER_H_
#define RIVLOWER_H_ #include <stdio.h>
#include <stdio.h> #include <stdlib.h>
#include <stdlib.h> #include <string.h>
#include <string.h> #include <signal.h>
#include <signal.h> #include <unistd.h>
#include <unistd.h> #include <sys/stat.h>
#include <math.h> #include "RIVaccessories.h"
#include <sys/stat.h> /* RIVSIZE macro defines the dimensionality off the RIVs we will use
#include <sys/types.h> * 25000 is the standard, but can be redefined specifically
/* RIVSIZE macro defines the dimensionality off the RIVs we will use */
* 25000 is the standard, but can be redefined specifically #ifndef RIVSIZE
*/ #define RIVSIZE 25000
#ifndef RIVSIZE #endif
#define RIVSIZE 25000
#endif #if RIVSIZE<0
#error "RIVSIZE must be a positive number (preferably a large positive)"
#if RIVSIZE<0 #endif
#error "RIVSIZE must be a positive number (preferably a large positive)"
#endif /* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
/* NONZeros macro defines the number of non-zero values that will be generated */
* for any level one (barcode) RIV. 2 is simple and lightweight to begin #ifndef NONZEROS
*/ #define NONZEROS 2
#ifndef NONZEROS #endif
#define NONZEROS 2
#endif #if NONZEROS%2 || NONZEROS<1
#error "NONZEROS must be an even, greater than 0 number"
#if NONZEROS%2 || NONZEROS<1 #endif
#error "NONZEROS must be an even, greater than 0 number"
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
/* CACHESIZE macro defines the number of RIVs the system will cache. * faster in aggregation and reading applications. doesn't affect systems
* a larger cache means more memory consumption, but will also be significantly * that do not use lexpull/push
* faster in aggregation and reading applications. doesn't affect systems */
* that do not use lexpull/push #ifndef CACHESIZE
*/ #define CACHESIZE 5000
#ifndef CACHESIZE #endif
#define CACHESIZE 20
#endif #if CACHESIZE<0
#error "CACHESIZE cannot be a negative number"
#if CACHESIZE<0 #endif
#error "CACHESIZE cannot be a negative number"
#endif /* the size of the tempBlock used in consolidation and implicit RIVs */
#define TEMPSIZE 3*RIVSIZE
/* the size of the tempBlock used in consolidation and implicit RIVs */
#define TEMPSIZE 3*RIVSIZE /* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is adviseable as the default
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s * unless we are doing long term RIV aggregation.
* as this is often an ideal case, it is adviseable as the default * specifically, a sparseRIV contains a pair of arrays,
* unless we are doing long term RIV aggregation. * containing locations and values, where pairs are found in like array
* specifically, a sparseRIV contains a pair of arrays, * indices.
* containing locations and values, where pairs are found in like array */
* indices. typedef struct{
*/ char name[100];
typedef struct{ int *values;
char name[100]; int *locations;
int *values; size_t count;
int *locations; double magnitude;
size_t count; int contextSize;
int frequency; int frequency;
double magnitude; }sparseRIV;
int boolean; /* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
int contextSize; * this is rarely the case, but its primary use is for performing vector
}sparseRIV; * math, as comparisons and arithmetic between vectors are ideally
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors * performed between sparse and dense (hetero-arithmetic)
* this is rarely the case, but its primary use is for performing vector */
* math, as comparisons and arithmetic between vectors are ideally typedef struct{
* performed between sparse and dense (hetero-arithmetic) int cached;
*/ char name[100];
typedef struct{ int frequency;
char name[100]; double magnitude;
int* values; int contextSize;
int* frequency; int values[RIVSIZE];
double magnitude; }denseRIV;
int cached;
int *contextSize; /*RIVKey, holds global variables used under the hood, primarily for the lexicon
}denseRIV; * it also holds a "temp block" that will be used by the dense to sparse
* conversion and implicit RIV aggregation
/*RIVKey, holds global variables used under the hood, primarily for the lexicon */
* it also holds a "temp block" that will be used by the dense to sparse struct RIVData{
* conversion and implicit RIV aggregation int h_tempBlock[TEMPSIZE];
*/ int tempSize;
struct RIVData{ char lexName[255];
int h_tempBlock[TEMPSIZE]; denseRIV* RIVCache[CACHESIZE];
int tempSize; }static RIVKey;
char lexName[255];
denseRIV RIVCache[CACHESIZE]; /*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
}static RIVKey; * all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't
/* lexOpen is called to "open the lexicon", setting up for later calls to * contain any metadata
* lexPush and lexPull. if the lexicon has not been opened before calls */
* to these functions, their behavior can be unpredictable, most likely crashing sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
*/
void lexOpen(); /* makeSparseLocations must be called repeatedly in the processing of a
* file to produce a series of locations from the words of the file
/* lexClose should always be called after the last lex push or lex pull call * this produces an "implicit" RIV which can be used with the mapI2D function
* if the lexicon is left open, some vector data may be lost due to * to create a denseRIV.
* un-flushed RIV cache */
*/ void makeSparseLocations(char* word, int *seeds, size_t seedCount);
void lexClose();
/* mapI2D maps an "implicit RIV" that is, an array of index values,
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with * arranged by chronological order of generation (as per makesparseLocations)
* all 0s removed. it does not automatically carry metadata, which must be assigned * it assigns, in the process of mapping, values according to ordering
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't */
* contain any metadata int* mapI2D(int *locations, size_t seedCount);
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion /* highly optimized method for adding vectors. there is no method
* included for adding D2D or S2S, as this system is faster-enough
* to be more than worth using
*/
int* addS2D(int* destination, sparseRIV input);
/* makeSparseLocations must be called repeatedly in the processing of a
* file to produce a series of locations from the words of the file /* caheDump flushes the RIV cache out to relevant files, backing up all
* this produces an "implicit" RIV which can be used with the mapI2D function * data. this is called by the lexClose and signalSecure functions
* to create a denseRIV. */
*/ int cacheDump();
void makeSparseLocations(unsigned char* word, int *seeds, size_t seedCount);
/* adds all elements of an implicit RIV (a sparseRIV represented without values)
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file, * to a denseRIV. used by the file2L2 functions in aggregating a document vector
* saving it for long-term aggregation. function is called by "lexPush", */
* which is what users should actually use. lexPush, unlike fLexPush, int* addI2D(int* destination, int* locations, size_t seedCount);
* has cache logic under the hood for speed and harddrive optimization
*/ /*subtracts a words vector from its own context. regularly used in lex building
int fLexPush(denseRIV RIVout); */
void subtractThisWord(denseRIV* vector);
/* flexPull pulls data directly from a file and converts it (if necessary) /* begin definitions */
* to a denseRIV. function is called by "lexPull" which is what users
* should actually use. lexPull, unlike FlexPull, has cache logic under int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
* the hood for speed and harddrive optimization
*/ int *locations_slider = input.locations;
denseRIV fLexPull(FILE* lexWord); int *values_slider = input.values;
int *locations_stop = locations_slider+input.count;
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word); /* apply values at an index based on locations */
while(locations_slider<locations_stop){
/* mapI2D maps an "implicit RIV" that is, an array of index values, destination[*locations_slider] += *values_slider;
* arranged by chronological order of generation (as per makesparseLocations) locations_slider++;
* it assigns, in the process of mapping, values according to ordering values_slider++;
*/ }
int* mapI2D(int *locations, size_t seedCount);
return destination;
/* highly optimized method for adding vectors. there is no method }
* included for adding D2D or S2S, as this system is faster-enough
* to be more than worth using int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
*/ int *destination = (int*)calloc(RIVSIZE,sizeof(int));
int* addS2D(int* destination, sparseRIV input); int *locations_slider = locations;
/* int *locations_stop = locations_slider+valueCount;
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount); /*apply values +1 or -1 at an index based on locations */
* consolidate I2S is temporarily deprecated. may be brought back. while(locations_slider<locations_stop){
* in tandem they are much faster, but less careful with RAM */
destination[*locations_slider] +=1;
/* caheDump flushes the RIV cache out to relevant files, backing up all locations_slider++;
* data. this is called by the lexClose and signalSecure functions destination[*locations_slider] -= 1;
*/ locations_slider++;
int cacheDump(); }
/* adds all elements of an implicit RIV (a sparseRIV represented without values) return destination;
* to a denseRIV. used by the file2L2 functions in aggregating a document vector }
*/ int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int* addI2D(int* destination, int* locations, size_t seedCount); int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
/* allocates a denseRIV filled with 0s
*/ /*apply values +1 or -1 at an index based on locations */
denseRIV denseAllocate(); while(locations_slider<locations_stop){
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum); destination[*locations_slider] +=1;
/* begin definitions */ locations_slider++;
destination[*locations_slider] -= 1;
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination locations_slider++;
}
int *locations_slider = input.locations;
int *values_slider = input.values;
int *locations_stop = locations_slider+input.count; return destination;
}
/* apply values at an index based on locations */
while(locations_slider<locations_stop){
destination[*locations_slider] += *values_slider;
locations_slider++; sparseRIV consolidateD2S(int *denseInput){
values_slider++; sparseRIV output;
} output.count = 0;
/* key/value pairs will be loaded to a worst-case sized temporary slot */
return destination; int* locations = RIVKey.h_tempBlock+RIVSIZE;
} int* values = locations+RIVSIZE;
int* locations_slider = locations;
int* values_slider = values;
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination for(int i=0; i<RIVSIZE; i++){
int *destination = (int*)calloc(RIVSIZE,sizeof(int));
int *locations_slider = locations; /* act only on non-zeros */
int *locations_stop = locations_slider+valueCount; if(denseInput[i]){
/*apply values +1 or -1 at an index based on locations */ /* assign index to locations */
while(locations_slider<locations_stop){ *(locations_slider++) = i;
destination[*locations_slider] +=1; /* assign value to values */
locations_slider++; *(values_slider++) = denseInput[i];
destination[*locations_slider] -= 1;
locations_slider++; /* track size of forming sparseRIV */
} output.count++;
}
}
return destination; /* a slot is opened for the locations/values pair */
} output.locations = (int*) malloc(output.count*2*sizeof(int));
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination if(!output.locations){
int *locations_slider = locations; printf("memory allocation failed"); //*TODO enable fail point knowledge and security
int *locations_stop = locations_slider+valueCount; }
/* copy locations values into opened slot */
/*apply values +1 or -1 at an index based on locations */ memcpy(output.locations, locations, output.count*sizeof(int));
while(locations_slider<locations_stop){
output.values = output.locations + output.count;
destination[*locations_slider] +=1;
locations_slider++; /* copy values into opened slot */
destination[*locations_slider] -= 1; memcpy(output.values, values, output.count*sizeof(int));
locations_slider++;
} return output;
}
return destination;
}
void makeSparseLocations(char* word, int *locations, size_t count){
/* locations+=count;
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){ srand(wordtoSeed(word));
int *denseTemp = mapI2D(implicit, valueCount); int *locations_stop = locations+NONZEROS;
while(locations<locations_stop){
sparseRIV sparseOut = consolidateD2S(denseTemp); /* unrolled for speed, guaranteed to be an even number of steps */
*locations = rand()%RIVSIZE;
free(denseTemp); locations++;
*locations = rand()%RIVSIZE;
locations++;
return sparseOut; }
return;
}
}
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){ sparseRIV* sparseAllocateFormatted(){
sparseRIV sparseOut; sparseRIV* output = (sparseRIV*)calloc(1, sizeof(sparseRIV));
int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
sparseOut.count = 0;
int add = 1;
int found; return output;
for(int i=0; i<valueCount; i++){ }
found = 0; void subtractThisWord(denseRIV* vector){
for(int j=0; j<sparseOut.count; j++){ //set the rand() seed to the word
if(implicit[i] == locationsTemp[j]){ srand(wordtoSeed(vector->name));
valuesTemp[i] += add; /* the base word vector is composed of NONZERO (always an even number)
add *= -1; * +1s and -1s at "random" points (defined by the above seed.
found = 1; * if we invert it to -1s and +1s, we have subtraction */
}
} for(int i = 0; i < NONZEROS; i+= 2){
if(!found){ vector->values[rand()%RIVSIZE] -= 1;
locationsTemp[sparseOut.count] = implicit[i]; vector->values[rand()%RIVSIZE] += 1;
}
valuesTemp[sparseOut.count] = add; /* record a context size 1 smaller */
sparseOut.count++; vector->contextSize-= 1;
add*= -1;
} }
}
sparseOut.locations = (int*)malloc(2*sparseOut.count*sizeof(int)); #endif
sparseOut.values = sparseOut.locations+sparseOut.count;
memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
return sparseOut;
}*/
/* consolidateD2S compacts a dense value array into a sparseRIV that holds
 * only its non-zero entries.
 *
 * denseInput: array of RIVSIZE ints to scan.
 * returns:    a sparseRIV whose locations[] and values[] share a single
 *             allocation (values starts right after the last location), so
 *             free(result.locations) releases both. All metadata fields are
 *             zeroed; the caller assigns them afterwards.
 *             If the allocation fails the result has count 0 and values NULL,
 *             instead of copying into a NULL pointer.
 */
sparseRIV consolidateD2S(int *denseInput){
    /* zero every field so metadata the caller never sets is not indeterminate */
    sparseRIV output = {0};

    /* pairs are staged in the shared temp block: locations in its second
     * RIVSIZE-sized third, values in the last third (worst case: all non-zero) */
    int* locations = RIVKey.h_tempBlock+RIVSIZE;
    int* values = locations+RIVSIZE;
    size_t found = 0;

    for(int i=0; i<RIVSIZE; i++){
        /* act only on non-zeros */
        if(denseInput[i]){
            locations[found] = i;
            values[found] = denseInput[i];
            found++;
        }
    }
    output.count = found;

    /* one slot is opened for both halves of the locations/values pair */
    output.locations = (int*) malloc(output.count*2*sizeof(int));
    if(!output.locations){
        /* malloc(0) may legitimately return NULL; only report real failures */
        if(output.count){
            printf("memory allocation failed"); //*TODO enable fail point knowledge and security
        }
        output.count = 0;
        output.values = NULL;
        return output;
    }

    /* copy the staged locations, then the staged values directly behind them */
    memcpy(output.locations, locations, output.count*sizeof(int));
    output.values = output.locations + output.count;
    memcpy(output.values, values, output.count*sizeof(int));

    return output;
}
/* lexOpen "opens the lexicon": it must be called before any cache-backed
 * lexicon access.
 *
 * lexName: path of the lexicon directory; it is created (mode 0777) when
 *          stat() cannot find it. Names that do not fit in RIVKey.lexName are
 *          truncated and a warning is printed.
 *
 * Side effects: records the name in RIVKey, installs signalSecure as the
 * handler for signal numbers 1 through 19, and zeroes the RIV cache.
 */
void lexOpen(char* lexName){
    struct stat st;
    if (stat(lexName, &st) == -1) {
        mkdir(lexName, 0777);
    }

    /* bounded copy: the destination is a fixed-size array inside RIVKey */
    int needed = snprintf(RIVKey.lexName, sizeof RIVKey.lexName, "%s", lexName);
    if(needed < 0 || (size_t)needed >= sizeof RIVKey.lexName){
        puts("lexicon name is too long and has been truncated");
    }

    /* route signals 1..19 through signalSecure so cached data gets dumped
     * before the process dies */
    for(int i=1; i<20; i++){
        signal(i, signalSecure);
    }

    /* start with an empty cache: every slot reads as "not cached" */
    memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
/* lexClose flushes the RIV cache through cacheDump and reports on stdout
 * when cacheDump signals that at least one push failed.
 */
void lexClose(){
    int failedPushes = cacheDump();
    if(failedPushes != 0){
        puts("cache dump failed, some lexicon data was lost");
    }
}
/* wordtoSeed folds the characters of a word into an integer seed.
 *
 * word:    NUL-terminated string.
 * returns: the sum of each character shifted left by 5 bits per position,
 *          i.e. the word read as a number in base 32. The empty string
 *          yields 0.
 *
 * The shift amount wraps modulo 32 and the sum is accumulated in unsigned
 * arithmetic. Shifting an int by 32 or more bits, which happened for any
 * word longer than 6 characters, is undefined behaviour, as is signed
 * overflow of the accumulator.
 * NOTE(review): wrapping the shift count is presumably what the old code did
 * in practice on x86 builds; confirm seeds still match existing lexicons.
 */
int wordtoSeed(unsigned char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;
    while(*word){
        seed += (unsigned int)(*word) << shift;
        /* advance 5 bits per character, staying inside a valid shift range */
        shift = (shift + 5u) & 31u;
        word++;
    }
    return (int)seed;
}
/* makeSparseLocations appends the pseudo-random locations for one word to an
 * implicit RIV.
 *
 * word:      the word; its wordtoSeed value seeds rand().
 * locations: start of the implicit RIV's location array.
 * count:     number of entries already present; writing starts at
 *            locations[count].
 *
 * Entries are produced two per step, each being rand()%RIVSIZE, until
 * NONZEROS entries have been covered.
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    int *slot = locations + count;

    srand(wordtoSeed(word));

    for(int produced = 0; produced < NONZEROS; produced += 2){
        /* two draws per pass, in the same order as they are stored */
        slot[produced] = rand()%RIVSIZE;
        slot[produced+1] = rand()%RIVSIZE;
    }
}
/* fLexPush writes one denseRIV to its lexicon file "<lexName>/<name>".
 *
 * File layout, in write order:
 *   size_t  count of sparse pairs, or 0 when the vector is stored dense
 *   int     frequency
 *   int     contextSize
 *   4 bytes of magnitude (sizeof(float))
 *   then either count locations followed by count values (sparse),
 *   or RIVSIZE values (dense).
 * The vector is stored sparse when fewer than RIVSIZE/2 entries are non-zero.
 *
 * returns: 0 on success, in which case RIVout.values has been freed;
 *          1 when the path does not fit, the file cannot be opened, or a
 *          write/close fails. On failure RIVout.values is NOT freed.
 *
 * NOTE(review): magnitude is a double in the struct but only sizeof(float)
 * bytes are written. fLexPull reads the same width, so the format is left
 * untouched here; confirm whether a real float or a full double was intended.
 */
int fLexPush(denseRIV RIVout){
    char pathString[200] = {0};

    /* word data will be placed in a (new?) file under the lexicon directory
     * in a file named after the word itself; refuse paths that do not fit */
    int pathLength = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, RIVout.name);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    sparseRIV temp = consolidateD2S(RIVout.values);
    int storeSparse = temp.count<(RIVSIZE/2);
    /* the leading count doubles as the format flag: 0 means "dense follows" */
    size_t storedCount = storeSparse ? temp.count : 0;
    int failed = 0;

    /* header, identical for both formats */
    failed |= fwrite(&storedCount, sizeof(size_t), 1, lexWord) != 1;
    failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
    failed |= fwrite(RIVout.contextSize, sizeof(int), 1, lexWord) != 1;
    failed |= fwrite(&RIVout.magnitude, sizeof(float), 1, lexWord) != 1;

    if(storeSparse){
        /* smaller stored as sparse vector */
        failed |= fwrite(temp.locations, sizeof(int), temp.count, lexWord) != temp.count;
        failed |= fwrite(temp.values, sizeof(int), temp.count, lexWord) != temp.count;
    }else{
        /* saturation is too high, better to store dense */
        failed |= fwrite(RIVout.values, sizeof(int), RIVSIZE, lexWord) != (size_t)RIVSIZE;
    }

    /* fclose flushes buffered data, so its result is part of the write */
    if(fclose(lexWord) != 0){
        failed = 1;
    }
    free(temp.locations);

    if(failed){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    free(RIVout.values);
    return 0;
}
/* fLexPull reads one vector from an open lexicon file and returns it as a
 * denseRIV obtained from denseAllocate.
 *
 * lexWord: stream positioned at the start of a file written by fLexPush.
 * returns: the vector with cached set to 0. When the header is incomplete,
 *          the stored count exceeds RIVSIZE, memory cannot be allocated, or
 *          the sparse payload is short or contains a location outside
 *          [0, RIVSIZE), the values are left as denseAllocate produced them
 *          rather than being filled from unchecked data.
 *
 * NOTE(review): only sizeof(float) bytes are read into the double magnitude,
 * mirroring what fLexPush writes; confirm the intended width.
 */
denseRIV fLexPull(FILE* lexWord){
    denseRIV output = denseAllocate();
    output.cached = 0;
    if(!output.values){
        return output;
    }

    /* first value stored is the value count if sparse, and 0 if dense */
    size_t typeCheck = 0;
    int headerOk = 1;

    /* get metadata for vector */
    headerOk &= fread(&typeCheck, sizeof(size_t), 1, lexWord) == 1;
    headerOk &= fread(output.frequency, sizeof(int), 1, lexWord) == 1;
    headerOk &= fread(output.contextSize, sizeof(int), 1, lexWord) == 1;
    headerOk &= fread(&(output.magnitude), sizeof(float), 1, lexWord) == 1;
    if(!headerOk){
        return output;
    }

    if(typeCheck){
        /* pull as sparseVector: value was not 0, so it's the value count */
        if(typeCheck > (size_t)RIVSIZE){
            /* more pairs than dimensions cannot be a valid file */
            return output;
        }
        sparseRIV temp = {0};
        temp.count = typeCheck;
        temp.locations = (int*)malloc(temp.count*2*sizeof(int));
        if(!temp.locations){
            return output;
        }
        temp.values = temp.locations+temp.count;

        size_t gotLocations = fread(temp.locations, sizeof(int), temp.count, lexWord);
        size_t gotValues = fread(temp.values, sizeof(int), temp.count, lexWord);
        int usable = (gotLocations == temp.count) && (gotValues == temp.count);

        /* addS2D indexes the dense array with these numbers unchecked, so
         * every location has to be verified before it is trusted */
        for(size_t i = 0; usable && i < temp.count; i++){
            if(temp.locations[i] < 0 || temp.locations[i] >= RIVSIZE){
                usable = 0;
            }
        }
        if(usable){
            addS2D(output.values, temp);
        }
        free(temp.locations);
    }else{
        /* typecheck is thrown away, just a flag in this case.
         * a short read is tolerated: fLexPush stores an all-zero vector as
         * count 0 with no payload, and the unread entries keep their zeros */
        size_t gotDense = fread(output.values, sizeof(int), RIVSIZE, lexWord);
        (void)gotDense;
    }

    return output;
}
/* signalSecure is the handler lexOpen installs for signals: it dumps the
 * cache, prints whether the dump succeeded, restores the default action for
 * the received signal and terminates the process with exit status 1.
 *
 * signum: number of the signal being handled.
 */
void signalSecure(int signum){
    const int dumpFailed = cacheDump();

    puts(dumpFailed ? "cache dump failed, some lexicon data lost"
                    : "cache dumped successfully");

    signal(signum, SIG_DFL);
    exit(1);
}
int cacheDump(){
int flag = 0;
denseRIV* cache_slider = RIVKey.RIVCache;
denseRIV* cache_stop = RIVKey.RIVCache+CACHESIZE;
while(cache_slider<cache_stop){
if((*cache_slider).cached){
flag += fLexPush(*cache_slider);
}
else{
}
cache_slider++;
}
return flag;
}
/* denseAllocate returns a denseRIV whose vector is all zeros.
 *
 * A single block of RIVSIZE+2 ints is allocated: values occupies the first
 * RIVSIZE entries, frequency points at the entry right after them and
 * contextSize at the one after that, so free(result.values) releases all
 * three. Every other field, including name, is zeroed.
 *
 * returns: the new denseRIV; when the allocation fails, values, frequency and
 *          contextSize are all NULL instead of being derived from a NULL base.
 */
denseRIV denseAllocate(){
    /* allocates a 0 vector; the struct itself starts fully zeroed as well */
    denseRIV output = {0};

    output.values = (int*)calloc(RIVSIZE+2, sizeof(int));
    if(!output.values){
        return output;
    }

    /* for compact memory use, frequency is placed immediately after values */
    output.frequency = output.values+RIVSIZE;
    output.contextSize = output.frequency+1;

    return output;
}
/*TODO add a simplified free function*/
#endif
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h>
#define CACHESIZE 15000
#define RIVSIZE 50000
#define NONZEROS 8
#include <setjmp.h>
#include <signal.h>
#include "../RIVet/RIVtools.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#define RIVSIZE 200000
#define NONZEROS 2
#define CACHESIZE 1000
#include "../RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one
void fileGrind(FILE* textFile); void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount); void addContext(denseRIV* lexRIV, sparseRIV context);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void readdirContingency(int sigNumber);
jmp_buf readdirRecov;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
clock_t begintotal = clock();
lexOpen("/home/drbob/Documents/lexicon8-50");
char pathString[1000]; char pathString[1000];
//we open the lexicon, if it does not yet exist, it will be created
lexOpen("lexicon200-2");
//we format the root directory, preparing to scan its contents
strcpy(pathString, argv[1]); strcpy(pathString, argv[1]);
strcat(pathString, "/"); strcat(pathString, "/");
struct stat st = {0}; //ensure that the targeted root directory exists
struct stat st;
if(stat(pathString, &st) == -1) { if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist");
return 1; return 1;
} }
//we will scan the directory, adding all data to our lexicon, as seen inside
directoryGrind(pathString); directoryGrind(pathString);
clock_t endtotal = clock(); //we close the lexicon again, ensuring all data is secured
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
lexClose(); lexClose();
return 0; return 0;
} }
/* addS2Ds adds one sparse vector to each of the first RIVCount denseRIVs in
 * denseSet and raises each one's contextSize by the additive's frequency.
 */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){ //mostly a standard recursive Dirent-walk
    for(int idx = 0; idx < RIVCount; idx++){
        denseRIV *target = &denseSet[idx];

        addS2D(target->values, additive);
        *(target->contextSize) += additive.frequency;
    }
}
/* checkDupe reports whether a word already names one of the first wordCount
 * entries of RIVSet.
 *
 * returns: 1 when some entry's name equals word, 0 otherwise.
 */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
    for(int idx = 0; idx < wordCount; idx++){
        if(strcmp(word, RIVSet[idx].name) == 0){
            return 1;
        }
    }
    return 0;
}
void directoryGrind(char *rootString){ void directoryGrind(char *rootString){
/* *** begin Dirent walk *** */
char pathString[2000]; char pathString[2000];
DIR *directory; DIR *directory;
struct dirent *files = 0; struct dirent *files = 0;
...@@ -76,79 +57,101 @@ void directoryGrind(char *rootString){ ...@@ -76,79 +57,101 @@ void directoryGrind(char *rootString){
} }
while((files=readdir(directory))){ while((files=readdir(directory))){
if(setjmp(readdirRecov)){
continue; if(!files->d_name[0]) break;
}
//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
while(*(files->d_name)=='.'){ while(*(files->d_name)=='.'){
files = readdir(directory); files = readdir(directory);
} }
//signal(SIGSEGV, signalSecure);
if(files->d_type == DT_DIR){ if(files->d_type == DT_DIR){
strcpy(pathString, rootString); strcpy(pathString, rootString);
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
strcat(pathString, "/"); strcat(pathString, "/");
directoryGrind(pathString); directoryGrind(pathString);
continue;
} }
strcpy(pathString, rootString); strcpy(pathString, rootString);
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
printf("%s\n", pathString); printf("%s\n", pathString);
FILE *input = fopen(pathString, "r+"); /* *** end dirent walk, begin meat of function *** */
//check for non-txt files
char *fileEnding = pathString+strlen(pathString)-4;
if(strcmp(fileEnding, ".txt")){
printf("skipped: %s\n", files->d_name);
continue;
}
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){ if(input){
//process this file and add its data to lexicon
fileGrind(input); fileGrind(input);
fclose(input); fclose(input);
} }
} }
} }
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
void fileGrind(FILE* textFile){ void fileGrind(FILE* textFile){
sparseRIV aggregateRIV = fileToL2Clean(textFile); //form a context vector. "clean" indicates that it will ignore any word which
fseek(textFile, 0, SEEK_SET); //contains unwanted characters
sparseRIV contextVector = fileToL2Clean(textFile);
int wordCount = 0;
denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
char word[200];
//an array of denseRIVs, large enough to hold all vectors
//(we don't yet know how many vectors there will be, so we make it big enough for the maximum)
denseRIV* lexiconRIV;
char word[100] = {0};
while(fscanf(textFile, "%99s", word)){ while(fscanf(textFile, "%99s", word)){
//we ensure that each word exists, and is free of unwanted characters
if(feof(textFile)) break; if(feof(textFile)) break;
if(!(*word))continue; if(!(*word))continue;
if(!isWordClean((char*)word)){ if(!isWordClean((char*)word)){
continue; continue;
} }
if(checkDupe(RIVArray, word, wordCount)){
continue;
} //we pull the vector corresponding to each word from the lexicon
RIVArray[wordCount] = lexPull(word); //if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(word);
if(!*((RIVArray[wordCount].name))) break;
//we add the context of this file to this wordVector
*(RIVArray[wordCount].frequency)+= 1;; addContext(lexiconRIV, contextVector);
//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
//we remove the sub-vector corresponding to the word itself
wordCount++; subtractThisWord(lexiconRIV);
} //we log that this word has been encountered one more time
//printf("%d\n", wordCount); lexiconRIV->frequency += 1;
addS2Ds(RIVArray, aggregateRIV, wordCount); //and finally we push it back to the lexicon for permanent storage
denseRIV* RIVArray_slider = RIVArray; lexPush(lexiconRIV);
denseRIV* RIVArray_stop = RIVArray+wordCount;
while(RIVArray_slider<RIVArray_stop){
lexPush(*RIVArray_slider);
RIVArray_slider++;
} }
free(RIVArray); free(contextVector.locations);
free(aggregateRIV.locations);
} }
void readdirContingency(int sigNumber){
puts("readdir segfaulted, trying to recover");
longjmp(readdirRecov, 1);
void addContext(denseRIV* lexRIV, sparseRIV context){
//add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context);
//log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analyses
lexRIV->contextSize += context.contextSize;
} }
...@@ -5,15 +5,16 @@ clean(){ ...@@ -5,15 +5,16 @@ clean(){
else else
python shittyballs.py "$1" python shittyballs.py "$1"
./RIVread cleanbooks/ ./RIVread1 cleanbooks/
# ./RIVread1 cleanbooks/
./RIVread2 cleanbooks/ ./RIVread2 cleanbooks/
#./RIVread3 cleanbooks/ ./RIVread3 cleanbooks/
#./RIVread4 cleanbooks/ ./RIVread4 cleanbooks/
./RIVread5 cleanbooks/ ./RIVread5 cleanbooks/
./RIVread6 cleanbooks/ ./RIVread6 cleanbooks/
./RIVread7 cleanbooks/
rm -r cleanbooks/ rm -r cleanbooks/
#rm "$1"
fi fi
shift shift
done done
...@@ -21,4 +22,4 @@ clean(){ ...@@ -21,4 +22,4 @@ clean(){
clean ../bookCleaner/books/* clean ../../books/*
import requests #import requests
import re import re
import string import string
import os import os
...@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn ...@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn
import pdb import pdb
from nltk.stem import PorterStemmer from nltk.stem import PorterStemmer
def adverbFix(word):
if not nltk.pos_tag(word)[0][1] == 'RB':
return word
adjective = word[:-2] def writeWord(cleanString, word, stem, blacklist):
if not nltk.pos_tag(word)[0][1] == 'JJ': if word == stem:
return word; FILE = open("lexicon/" + word, "w")
FILE = open("lexicon/" + word, "w") FILE.write("1");
FILE.write("2" + temp) FILE.close();
FILE.close() return (cleanString + " " + word)
FILE = open("lexicon/" + adjective, "w")
FILE.write("1") elif stem not in blacklist:
FILE.close() if len(stem) > 2:
return adjective FILE = open("lexicon/" + word, "w")
FILE.write("2"+stem);
def strip(word): FILE.close();
for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']: FILE = open("lexicon/" + stem, "w")
if word.endswith(suffix): FILE.write("1")
return word[:-len(suffix)] FILE.close();
return (cleanString + " " + stem)
return cleanString
def liFix(word):
if not word[len(word)-2:] == "li":
return word return word
temp = ps.stem(word[:-2])
if temp:
return temp
return word
def cleanWord(word): def cleanWord(word):
#if(len(word) == 0):
#print("\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************")
word = word.lower(); word = word.lower();
regex = re.compile('[^a-z]+') regex = re.compile('[^a-z]+')
word = regex.sub('', word) word = regex.sub('', word)
...@@ -44,13 +50,11 @@ def cleanWord(word): ...@@ -44,13 +50,11 @@ def cleanWord(word):
def fileCheck(word): def fileCheck(word):
try: try:
#print("trying")
wordFile = open("lexicon/{}".format(word), "r") wordFile = open("lexicon/{}".format(word), "r")
code = int(wordFile.read(1)) code = int(wordFile.read(1))
except: except:
#print("file does not exist")
return 0 return 0
#print("fileCode{}".format(code))
if code == 2: if code == 2:
word = wordFile.read() word = wordFile.read()
...@@ -74,6 +78,8 @@ def morphyTest(word): ...@@ -74,6 +78,8 @@ def morphyTest(word):
return morphyTemp; return morphyTemp;
#begin mainfunction
blacklist = ["a", "an", "the", "so", "as", "how", blacklist = ["a", "an", "the", "so", "as", "how",
"i", "me", "we", "they", "you", "it", "he", "she", "i", "me", "we", "they", "you", "it", "he", "she",
"but", "have", "had", "but", "have", "had",
...@@ -90,13 +96,13 @@ print(sourceString + "\n") ...@@ -90,13 +96,13 @@ print(sourceString + "\n")
if not os.path.exists('cleanbooks'): if not os.path.exists('cleanbooks'):
os.makedirs('cleanbooks') os.makedirs('cleanbooks')
# if not os.path.exists('lexicon'): if not os.path.exists('lexicon'):
# os.makedirs('lexicon') os.makedirs('lexicon')
if not os.path.exists(pathString): if not os.path.exists(pathString):
os.makedirs(pathString) os.makedirs(pathString)
#call(["python", "blacklist.py"]) call(["python", "blacklist.py"])
i=0 i=0
skip = 1 skip = 1
with open(sourceString, 'U') as fileIn: with open(sourceString, 'U') as fileIn:
...@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn: ...@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn:
for tempWord in line.split(): for tempWord in line.split():
word=cleanWord(tempWord) word=cleanWord(tempWord)
if not word: if not word:
continue continue
if len(word) < 3:
# temp = fileCheck(word) continue;
# if word in blacklist:
# if temp == -1: continue;
# continue
# if temp == 0:
temp = morphyTest(word)
if temp:
stem = ps.stem(temp)
if stem and not stem in blacklist:
cleanString = cleanString + ' ' + stem
temp = fileCheck(word)
if temp == -1:
continue
if temp:
cleanString = (cleanString + " " + temp);
continue
else:
morphy = morphyTest(word)
if morphy:
stem = ps.stem(morphy)
if stem:
stem = liFix(stem)
cleanString = writeWord(cleanString, word, stem, blacklist)
#if temp == 0:
# catchAll(word)
cleanString = cleanString + os.linesep cleanString = cleanString + os.linesep
if len(cleanString.split(' ')) > 10: if len(cleanString.split(' ')) > 2:
fileOut.write(cleanString) fileOut.write(cleanString)
fileOut.close() fileOut.close()
......
#ifndef RIVACCESS_H_ #ifndef RIVACCESS_H_
#define RIVACCESS_H_ #define RIVACCESS_H_
/*isWordClean filters words that contain non-letter characters, and /*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through * upperCase letters, allowing only the '_' symbol through
*/ */
int isWordClean(char* word); int isWordClean(char* word);
/* used by wordClean */ /* used by wordClean */
int isLetter(char c); int isLetter(char c);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(char* word);
int isLetter(char c){ int isLetter(char c){
if((c>96 && c<123)||(c == 32) || (c == '_')) return 1; if((c>96 && c<123)||(c == 32) || (c == '_')) return 1;
...@@ -26,5 +33,19 @@ int isWordClean(char* word){ ...@@ -26,5 +33,19 @@ int isWordClean(char* word){
return 1; return 1;
} }
/* wordtoSeed folds the characters of a NUL-terminated word into an integer
 * seed. each character is shifted 5 bits further left than the previous one
 * (i.e. treated as a digit in base 32) and summed.
 *
 * the accumulation is done in unsigned arithmetic and the shift distance
 * wraps around at 32 bits. shifting a signed int past its width, or into its
 * sign bit, is undefined behaviour in C, which the previous implementation
 * hit for any word longer than about 6 characters. wrapping the shift
 * modulo 32 is what x86 hardware already did for those words, so seeds
 * produced on that platform are unchanged.
 *
 * returns 0 for the empty string. seeds are not guaranteed unique.
 */
int wordtoSeed(char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;
    while(*word){
        /* converting the char to unsigned first keeps the shift well defined */
        seed += (unsigned int)*word << shift;
        /* advance 5 bits per character, wrapping inside a 32 bit word */
        shift = (shift + 5) % 32;
        word++;
    }
    return (int)seed;
}
#endif #endif
#ifndef RIV_LEXICON_H
#define RIV_LEXICON_H
#include "RIVLower.h"
#include "RIVaccessories.h"
/* lexOpen is called to "open the lexicon", setting up for later calls to
 * lexPush and lexPull. lexName is the directory in which the lexicon files
 * are kept; it is created if it does not exist yet.
 * if the lexicon has not been opened before calls to these functions,
 * their behavior can be unpredictable, most likely crashing
 */
void lexOpen(char* lexName);
/* lexClose should always be called after the last lexPush or lexPull call.
 * if the lexicon is left open, some vector data may be lost due to
 * an un-flushed RIV cache
 */
void lexClose();
/* both lexPush and lexPull must be called *after* the lexOpen() function
 * and after using them the lexClose() function must be called to ensure
 * data security */
/* lexPush writes a denseRIV to the lexicon for permanent storage */
int lexPush(denseRIV* RIVout);
/* cacheCheckOnPush tries to keep RIVout in the cache instead of writing it
 * to file; returns 1 if the cache has taken it, 0 otherwise */
int cacheCheckOnPush(denseRIV* RIVout);
/* lexPull reads a denseRIV from the lexicon, under "word".
 * if the file does not exist, it creates a 0 vector with the name of word.
 * lexPull returns a denseRIV *pointer* because its data must be tracked
 * globally for key optimizations
 */
denseRIV* lexPull(char* word);
/* cacheCheckOnPull returns the cached denseRIV named "word", or NULL if
 * that word is not currently cached */
denseRIV* cacheCheckOnPull(char* word);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
 * saving it for long-term aggregation, and frees the denseRIV once written.
 * function is called by "lexPush", which is what users should actually use.
 * lexPush, unlike fLexPush, has cache logic under the hood for speed and
 * harddrive optimization
 */
int fLexPush(denseRIV* RIVout);
/* fLexPull pulls data directly from a file and converts it (if necessary)
 * to a denseRIV. function is called by "lexPull" which is what users
 * should actually use. lexPull, unlike fLexPull, has cache logic under
 * the hood for speed and harddrive optimization
 */
denseRIV* fLexPull(FILE* lexWord);
/* cacheDump pushes every cached denseRIV out to its lexicon file.
 * returns 0 if every push succeeded, non-zero otherwise */
int cacheDump();
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
void lexOpen(char* lexName){
struct stat st = {0};
if (stat(lexName, &st) == -1) {
mkdir(lexName, 0777);
}
strcpy(RIVKey.lexName, lexName);
/* open a slot at least large enough for ;worst case handling of
* sparse to dense conversion. may be enlarged by filetoL2 functions */
struct sigaction action = {0};
action.sa_sigaction = signalSecure;
action.sa_flags = SA_SIGINFO;
for(int i=1; i<27; i++){
sigaction(i,&action,NULL);
}
/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
memset(RIVKey.RIVCache, 0, sizeof(denseRIV*)*CACHESIZE);
}
/* lexClose flushes the RIV cache out to the lexicon files and reports
 * on stdout if any part of that flush failed */
void lexClose(){
    int failures = cacheDump();
    if(failures != 0){
        puts("cache dump failed, some lexicon data was lost");
    }
}
#if CACHESIZE > 0
denseRIV* cacheCheckOnPull(char* word){
srand(wordtoSeed(word));
int hash = rand()%CACHESIZE;
if(RIVKey.RIVCache[hash]){
if(!strcmp(word, RIVKey.RIVCache[hash]->name)){
/* if word is cached, pull from cache and exit */
return RIVKey.RIVCache[hash];
}
}
return NULL;
}
#endif
/* lexPull fetches the denseRIV stored under "word".
 * the cache (if compiled in) is consulted first; otherwise the vector is
 * read from the file <lexicon directory>/<word>. if no such file exists a
 * zeroed vector is allocated, as the word is new to the lexicon.
 *
 * returns a pointer to the vector, named after word, or NULL if
 *  - word does not fit in the name field of a denseRIV,
 *  - the file path does not fit in the path buffer, or
 *  - memory could not be allocated.
 * callers hand the vector back through lexPush when they are done with it.
 */
denseRIV* lexPull(char* word){
    denseRIV* output;
#if CACHESIZE > 0
    /* if there is a cache, first check if the word is cached */
    if((output = cacheCheckOnPull(word))){
        return output;
    }
#endif /* CACHESIZE > 0 */

    /* sizeof does not evaluate its operand, so output need not be set yet.
     * a word that cannot be stored in full would overflow the name field */
    if(strlen(word) >= sizeof output->name){
        printf("lexicon pull has failed for word: %s\nconsider cleaning inputs\n", word);
        return NULL;
    }

    /* if not cached, attempt to pull the word data from lexicon file */
    char pathString[200];
    int pathLength = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, word);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        /* a truncated path would address the wrong lexicon file */
        printf("lexicon pull has failed for word: %s\nconsider cleaning inputs\n", word);
        return NULL;
    }

    FILE *lexWord = fopen(pathString, "rb");
    /* if this lexicon file already exists */
    if(lexWord){
        /* pull data from file */
        output = fLexPull(lexWord);
        fclose(lexWord);
    }else{
        /*if file does not exist, return a 0 vector (word is new to the lexicon */ //#TODO enable NO-NEW features to protect mature lexicons?
        output = calloc(1, sizeof(denseRIV));
    }
    if(!output){
        /* allocation failed, there is no vector to name */
        return NULL;
    }

    /* length was verified above, so this copy cannot overflow */
    strcpy(output->name, word);
    return output;
}
#if CACHESIZE > 0
/* cacheCheckOnPush decides whether RIVout can live in the cache rather
 * than being written to its file.
 * returns 1 when the cache holds RIVout (it was already cached, its slot
 * was free, or it displaced a less frequent vector), 0 when the caller
 * must write it to file itself.
 * note: this reseeds the global rand() state to pick the slot */
int cacheCheckOnPush(denseRIV* RIVout){
    /* if our RIV was cached already, no need to play with it */
    if(RIVout->cached){
        return 1;
    }

    srand(wordtoSeed(RIVout->name));
    int slot = rand()%CACHESIZE;
    denseRIV* occupant = RIVKey.RIVCache[slot];

    if(occupant){
        /* the slot is taken: it is only given up to a more frequent RIV */
        if(!(RIVout->frequency > occupant->frequency)){
            return 0;
        }
        /* push the lower frequency cache entry to a file */
        fLexPush(occupant);
    }

    /* the slot is free (or has just been vacated), claim it */
    RIVKey.RIVCache[slot] = RIVout;
    RIVout->cached = 1;
    return 1;
}
#endif
/* lexPush stores a denseRIV in the lexicon.
 * when a cache is compiled in, the vector is first offered to it; only if
 * the cache declines is the vector written to its file.
 * returns 0 if the cache took the vector, otherwise the result of fLexPush */
int lexPush(denseRIV* RIVout){
#if CACHESIZE > 0
    int absorbed = cacheCheckOnPush(RIVout);
    if(absorbed){
        return 0;
    }
#endif /* CACHESIZE != 0 */

    /* not cached, write straight through to the lexicon file */
    return fLexPush(RIVout);
}
/* fLexPush writes a denseRIV to the file <lexicon directory>/<name>.
 *
 * file layout: a size_t value count, then frequency, contextSize and
 * magnitude, then the vector data. vectors with fewer than RIVSIZE/2
 * non-zero values are stored sparse (count, locations, values); anything
 * more saturated is stored dense, flagged by a count of 0.
 *
 * returns 0 on success, in which case output has been freed and must not
 * be used again. returns 1 on any failure (path too long, file could not
 * be opened, or a write/close failed); output is then left untouched and
 * still belongs to the caller.
 */
int fLexPush(denseRIV* output){
    char pathString[200] = {0};

    /* word data will be placed in a (new?) file under the lexicon directory
     * in a file named after the word itself */
    int pathLength = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, output->name);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        /* a truncated path would write to the wrong lexicon file */
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    sparseRIV temp = consolidateD2S(output->values);

    /* smaller vectors are stored sparse; a stored count of 0 marks the
     * record as dense for the reader */
    int storeSparse = temp.count<(RIVSIZE/2);
    size_t storedCount = storeSparse ? (size_t)temp.count : 0;

    /* metadata header, identical for both encodings */
    int ok = fwrite(&storedCount, sizeof(size_t), 1, lexWord) == 1
          && fwrite(&output->frequency, sizeof(int), 1, lexWord) == 1
          && fwrite(&output->contextSize, sizeof(int), 1, lexWord) == 1
          && fwrite(&output->magnitude, sizeof(float), 1, lexWord) == 1;

    if(ok){
        if(storeSparse){
            ok = fwrite(temp.locations, sizeof(int), storedCount, lexWord) == storedCount
              && fwrite(temp.values, sizeof(int), storedCount, lexWord) == storedCount;
        }else{
            /* saturation is too high, better to store dense */
            ok = fwrite(output->values, sizeof(int), RIVSIZE, lexWord) == (size_t)RIVSIZE;
        }
    }

    /* buffered data is only known to be written once fclose succeeds */
    if(fclose(lexWord) != 0){
        ok = 0;
    }
    free(temp.locations);

    if(!ok){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    /* the vector is safely on disk, release the in-memory copy */
    free(output);
    return 0;
}
/* fLexPull reads one lexicon record from an open file into a newly
 * allocated denseRIV.
 *
 * the record starts with a size_t count, then frequency, contextSize and
 * magnitude. a non-zero count means the vector follows in sparse form
 * (count locations, then count values); a count of 0 means the vector
 * follows in dense form (RIVSIZE values).
 *
 * a record whose header is incomplete, whose count exceeds RIVSIZE, whose
 * sparse data is cut short, or which names a location outside the vector
 * is reported on stdout and returned as an all-zero vector.
 *
 * returns the vector (with cached set to 0 and the name left empty), or
 * NULL if memory could not be allocated.
 */
denseRIV* fLexPull(FILE* lexWord){
    denseRIV *output = calloc(1,sizeof(denseRIV));
    if(!output){
        return NULL;
    }

    /* first value stored is the value count if sparse, and 0 if dense */
    size_t typeCheck = 0;

    /* get metadata for vector */
    int headerRead = fread(&typeCheck, sizeof(size_t), 1, lexWord) == 1
                  && fread(&output->frequency, sizeof(int), 1, lexWord) == 1
                  && fread(&output->contextSize, sizeof(int), 1, lexWord) == 1
                  && fread(&output->magnitude, sizeof(float), 1, lexWord) == 1;
    if(!headerRead){
        puts("lexicon entry has an incomplete header, treating it as an empty vector");
        memset(output, 0, sizeof(denseRIV));
        return output;
    }

    if(typeCheck){
        /* pull as sparseVector; value was not 0, so it's the value count */
        int bodyRead = typeCheck <= (size_t)RIVSIZE;
        int *locations = NULL;

        if(bodyRead){
            /* both datasets share one allocation, values after locations */
            locations = malloc(typeCheck*2*sizeof(int));
            if(!locations){
                free(output);
                return NULL;
            }
            int *values = locations+typeCheck;

            bodyRead = fread(locations, sizeof(int), typeCheck, lexWord) == typeCheck
                    && fread(values, sizeof(int), typeCheck, lexWord) == typeCheck;

            /* every location is used as an index into the dense vector,
             * so none may fall outside of it */
            for(size_t i = 0; bodyRead && i < typeCheck; i++){
                if(locations[i] < 0 || locations[i] >= RIVSIZE){
                    bodyRead = 0;
                }
            }

            if(bodyRead){
                sparseRIV temp = {0};
                temp.count = typeCheck;
                temp.locations = locations;
                temp.values = values;
                addS2D(output->values, temp);
            }
        }
        free(locations);

        if(!bodyRead){
            puts("lexicon entry is corrupt or truncated, treating it as an empty vector");
            memset(output, 0, sizeof(denseRIV));
        }
    }else{
        /* typecheck is thrown away, just a flag in this case.
         * a short read is tolerated: an all-zero vector is written with a
         * count of 0 and no data after the header, and the values left
         * unread stay at the zero they were allocated with */
        fread(output->values, sizeof(int), RIVSIZE, lexWord);
    }

    output->cached = 0;
    return output;
}
/* cacheDump writes every cached denseRIV out to its lexicon file.
 * fLexPush frees a vector once it has been written, so each slot that was
 * pushed successfully is emptied here; leaving the stale pointer in place
 * would make a second dump (or any later cache lookup) touch freed memory.
 * slots whose push failed keep their vector.
 * returns the number of failed pushes, 0 meaning the whole cache was saved */
int cacheDump(){
    int flag = 0;
    for(int i = 0; i < CACHESIZE; i++){
        if(!RIVKey.RIVCache[i]){
            continue;
        }
        int result = fLexPush(RIVKey.RIVCache[i]);
        flag += result;
        if(!result){
            /* the vector now lives on disk and its memory is gone */
            RIVKey.RIVCache[i] = NULL;
        }
    }
    return flag;
}
/*TODO add a simplified free function*/
/* signalSecure is the handler installed by lexOpen.
 * it dumps the RIV cache to the lexicon, reports the outcome on stdout,
 * then restores the default action for the signal and sends the signal to
 * this process again so that the default action takes place.
 * si and arg are not used */
void signalSecure(int signum, siginfo_t *si, void* arg){
    const char *report = cacheDump()
        ? "cache dump failed, some lexicon data lost"
        : "cache dumped successfully";
    puts(report);

    /* hand the signal back to its default behavior */
    signal(signum, SIG_DFL);
    kill(getpid(), signum);
}
#endif
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <sys/stat.h>
#define CACHESIZE 15000 #include <sys/types.h>
#include <setjmp.h> #include <unistd.h>
#include <signal.h> #include <dirent.h>
#include "RIVtools.h" #include <error.h>
#include <sys/stat.h> #include "../../RIVtools.h"
#include <sys/types.h>
#include <unistd.h> //this program reads a directory full of files, and adds all context vectors (considering file as context)
#include <dirent.h> //to all words found in these files. this is used to create a lexicon, or add to an existing one
#include <error.h>
void fileGrind(FILE* textFile);
void fileGrind(FILE* textFile); void addContext(denseRIV* lexRIV, sparseRIV context);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount); void directoryGrind(char *rootString);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString);
void readdirContingency(int sigNumber); int main(int argc, char *argv[]){
char pathString[1000];
jmp_buf readdirRecov;
int main(int argc, char *argv[]){ //we open the lexicon, if it does not yet exist, it will be created
clock_t begintotal = clock(); lexOpen("lexicon");
lexOpen("/home/drbob/Documents/lexicon");
char pathString[1000]; //we format the root directory, preparing to scan its contents
strcpy(pathString, argv[1]);
strcat(pathString, "/"); strcpy(pathString, argv[1]);
struct stat st = {0}; strcat(pathString, "/");
if(stat(pathString, &st) == -1) { //ensure that the targeted root directory exists
return 1;
} struct stat st;
if(stat(pathString, &st) == -1) {
directoryGrind(pathString); printf("directory doesn't seem to exist");
return 1;
clock_t endtotal = clock(); }
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC; //we will scan the directory, adding all data to our lexicon, as seen inside
printf("total time:%lf\n\n", time_spent); directoryGrind(pathString);
lexClose();
return 0; //we close the lexicon again, ensuring all data is secured
} lexClose();
return 0;
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){ }
denseRIV *denseSet_slider = denseSet;
denseRIV *dense_stop = denseSet+RIVCount; //mostly a standard recursive Dirent-walk
void directoryGrind(char *rootString){
/* *** begin Dirent walk *** */
char pathString[2000];
while(denseSet_slider<dense_stop){ DIR *directory;
addS2D((*denseSet_slider).values, additive); struct dirent *files = 0;
*(denseSet_slider->contextSize) += additive.frequency;
denseSet_slider++; if(!(directory = opendir(rootString))){
printf("location not found, %s\n", rootString);
} return;
}
}
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){ while((files=readdir(directory))){
denseRIV* RIVStop = RIVSet+wordCount;
while(RIVSet<RIVStop){ if(!files->d_name[0]) break;
if(!strcmp(word, RIVSet->name)){ while(*(files->d_name)=='.'){
return 1; files = readdir(directory);
} }
RIVSet++;
}
return 0;
} if(files->d_type == DT_DIR){
void directoryGrind(char *rootString){ strcpy(pathString, rootString);
char pathString[2000]; strcat(pathString, files->d_name);
DIR *directory; strcat(pathString, "/");
struct dirent *files = 0; directoryGrind(pathString);
continue;
if(!(directory = opendir(rootString))){ }
printf("location not found, %s\n", rootString);
return;
}
strcpy(pathString, rootString);
while((files=readdir(directory))){ strcat(pathString, files->d_name);
if(setjmp(readdirRecov)){ printf("%s\n", pathString);
continue; /* *** end dirent walk, begin meat of function *** */
}
//check for non-txt files
//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name)); char *fileEnding = pathString+strlen(pathString)-4;
while(*(files->d_name)=='.'){ if(strcmp(fileEnding, ".txt")){
files = readdir(directory); printf("skipped: %s\n", files->d_name);
} continue;
//signal(SIGSEGV, signalSecure); }
if(files->d_type == DT_DIR){ //open a file within root directory
strcpy(pathString, rootString); FILE *input = fopen(pathString, "r");
if(input){
strcat(pathString, files->d_name); //process this file and add it's data to lexicon
strcat(pathString, "/"); fileGrind(input);
directoryGrind(pathString);
} fclose(input);
strcpy(pathString, rootString); }
strcat(pathString, files->d_name); }
printf("%s\n", pathString); }
FILE *input = fopen(pathString, "r+");
if(input){ //form context vector from contents of file, then add that vector to
fileGrind(input); //all lexicon entries of the words contained
fclose(input); void fileGrind(FILE* textFile){
} //form a context vector. "clean" indicates that it will ignore any word which
} //contains unwanted characters
} sparseRIV contextVector = fileToL2Clean(textFile);
void fileGrind(FILE* textFile){ //an array of denseRIVs, large enough to hold all vectors
sparseRIV aggregateRIV = fileToL2Clean(textFile); //(we don't yet know how many vectors there will be, so we make it big enough for the maximum)
fseek(textFile, 0, SEEK_SET); denseRIV* lexiconRIV;
int wordCount = 0; char word[100] = {0};
denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV)); while(fscanf(textFile, "%99s", word)){
char word[200]; //we ensure that each word exists, and is free of unwanted characters
if(feof(textFile)) break;
while(fscanf(textFile, "%99s", word)){
if(!(*word))continue;
if(feof(textFile)) break;
if(!(*word))continue; if(!isWordClean((char*)word)){
continue;
if(!isWordClean((char*)word)){ }
continue;
}
if(checkDupe(RIVArray, word, wordCount)){ //we pull the vector corresponding to each word from the lexicon
continue; //if it's a new word, lexPull returns a 0 vector
} lexiconRIV= lexPull(word);
RIVArray[wordCount] = lexPull(word);
//we add the context of this file to this wordVector
if(!*((RIVArray[wordCount].name))) break; addContext(lexiconRIV, contextVector);
*(RIVArray[wordCount].frequency)+= 1;; //we remove the sub-vector corresponding to the word itself
//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing); subtractThisWord(lexiconRIV);
wordCount++; //we log that this word has been encountered one more time
lexiconRIV->frequency += 1;
}
//printf("%d\n", wordCount); //and finally we push it back to the lexicon for permanent storage
lexPush(lexiconRIV);
addS2Ds(RIVArray, aggregateRIV, wordCount);
denseRIV* RIVArray_slider = RIVArray; }
denseRIV* RIVArray_stop = RIVArray+wordCount; free(contextVector.locations);
while(RIVArray_slider<RIVArray_stop){ }
lexPush(*RIVArray_slider); void addContext(denseRIV* lexRIV, sparseRIV context){
RIVArray_slider++;
} //add context to the lexRIV, (using sparse-dense vector comparison)
free(RIVArray); addS2D(lexRIV->values, context);
free(aggregateRIV.locations);
//log the "size" of the vector which was added
} //this is not directly necessary, but is useful metadata for some analises
void readdirContingency(int sigNumber){ lexRIV->contextSize += context.contextSize;
puts("readdir segfaulted, trying to recover");
longjmp(readdirRecov, 1); }
}
#ifndef RIVTOOLS_H_ #ifndef RIVTOOLS_H_
#define RIVTOOLS_H_ #define RIVTOOLS_H_
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
#include "RIVLower.h" #include "RIVLower.h"
#include "RIVaccessories.h" #include "RIVaccessories.h"
#include "RIVlexicon.h"
/* lexPush writes a denseRIV to a file for permanent storage */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
* and creates a denseRIV with those attributes.
* if the file does not exist, it creates a 0 vector with the name of word
*/
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n") /* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each * and returns a sparse RIV which is the vector sum of the base RIVs of each
* word contained * word contained
...@@ -29,35 +22,29 @@ sparseRIV fileToL2(FILE *input); ...@@ -29,35 +22,29 @@ sparseRIV fileToL2(FILE *input);
*/ */
sparseRIV fileToL2Clean(FILE *data); sparseRIV fileToL2Clean(FILE *data);
/*filetoL2direct is an experiment in simplifying the process. it's slow */ /* like fileToL2 but takes a block of text */
sparseRIV fileToL2direct(FILE *data); sparseRIV textToL2(char *text);
/*cosine determines the "similarity" between two RIVs. */ /*cosine determines the "similarity" between two RIVs. */
double cosCompare(denseRIV baseRIV, sparseRIV comparator); double cosCompare(denseRIV baseRIV, sparseRIV comparator);
/*currently unused */ /*used for analysis of lexicon vectors (not simply accumulation)
sparseRIV wordtoL2(char* word); * to avoid overflow of even a 64 bit integer, vectors must be normalized
* this is an experimental approximation of true normal, which should yield
/* converts an implicit RIV (a set of unvalued locations) into a formal * some extra data about the nature of this word's context
* sparse RIV. this chooses the best method to perform the consolidation */
* and launches that function defunct right now for memory usage reasons*/
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
sparseRIV normalizeFloored(denseRIV input, int factor);
sparseRIV normalize(denseRIV input, int factor); sparseRIV normalize(denseRIV input, int factor);
int roundMultiply(int base, float divisor);
/* like fileToL2 but takes a block of text */
sparseRIV text2L2(char *text);
/* calculates the magnitude of a sparseVector */ //TODO contain integer overflow in square process /* calculates the magnitude of a sparseVector */ //TODO contain integer overflow in square process
double getMagnitudeSparse(sparseRIV input); double getMagnitudeSparse(sparseRIV input);
/* same for denseVector */
sparseRIV text2L2(char *text){ double getMagnitudeDense(denseRIV *input); //TODO consolidate these into one function
sparseRIV textToL2(char *text){
int wordCount = 0; int wordCount = 0;
unsigned char word[100] = {0}; char word[100] = {0};
int denseTemp[RIVSIZE] = {0}; int denseTemp[RIVSIZE] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved /* locations (implicit RIV) are temp stored in temp block, and moved
...@@ -71,7 +58,6 @@ sparseRIV text2L2(char *text){ ...@@ -71,7 +58,6 @@ sparseRIV text2L2(char *text){
if(!displacement){ if(!displacement){
break; break;
} }
if(!(*word)){ if(!(*word)){
break; break;
} }
...@@ -90,18 +76,16 @@ sparseRIV text2L2(char *text){ ...@@ -90,18 +76,16 @@ sparseRIV text2L2(char *text){
addI2D(denseTemp, locations, locationCount); addI2D(denseTemp, locations, locationCount);
sparseRIV output = consolidateD2S(denseTemp); sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file, untill frequency /* contextSize stores the number of words read */
* is needed to hold some more useful data point */ output.contextSize = wordCount;
output.frequency = wordCount;
output.boolean = 1;
return output; return output;
} }
sparseRIV fileToL2(FILE *data){ sparseRIV fileToL2(FILE *data){
unsigned char word[100] = {0}; char word[100] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved /* locations (implicit RIV) are temporarily stored in temp block,
* to permanent home in consolidation */ * and moved to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
int locationCount = 0; int locationCount = 0;
int denseTemp[RIVSIZE] = {0}; int denseTemp[RIVSIZE] = {0};
...@@ -129,17 +113,16 @@ sparseRIV fileToL2(FILE *data){ ...@@ -129,17 +113,16 @@ sparseRIV fileToL2(FILE *data){
addI2D(denseTemp, locations, locationCount); addI2D(denseTemp, locations, locationCount);
sparseRIV output = consolidateD2S(denseTemp); sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file */ /* contextSize records the number of words in this file */
output.frequency = wordCount; output.contextSize = wordCount;
output.boolean = 1; fseek(data, 0, SEEK_SET);
return output; return output;
} }
sparseRIV fileToL2Clean(FILE *data){ sparseRIV fileToL2Clean(FILE *data){
int denseTemp[RIVSIZE] = {0}; int denseTemp[RIVSIZE] = {0};
unsigned char word[100] = {0}; char word[100] = {0};
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
unsigned int wordCount = 0; unsigned int wordCount = 0;
...@@ -172,44 +155,24 @@ sparseRIV fileToL2Clean(FILE *data){ ...@@ -172,44 +155,24 @@ sparseRIV fileToL2Clean(FILE *data){
sparseRIV output = consolidateD2S(denseTemp); sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file */ /* frequency records the number of words in this file */
output.frequency = locationCount/NONZEROS; output.contextSize = locationCount/NONZEROS;
output.boolean = 1; fseek(data, 0, SEEK_SET);
return output; return output;
} }
//defunct temporarily, might make a return
/*sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount<RIVKey.I2SThreshold){
//direct method is faster on small datasets, but has geometric scaling on large datasets
return consolidateI2SDirect(implicit, valueCount);
}else{
// optimized for large datasets
return consolidateI2SIndirect(implicit, valueCount);
}
}*/
void aggregateWord2D(denseRIV destination, char* word){
srand(wordtoSeed((unsigned char*)word));
for(int i=0; i<NONZEROS; i++){
destination.values[(rand()%RIVSIZE)] +=1;
destination.values[(rand()%RIVSIZE)] -= 1;
}
}
double cosCompare(denseRIV baseRIV, sparseRIV comparator){ double cosCompare(denseRIV baseRIV, sparseRIV comparator){
int dot = 0; long long int dot = 0;
int n = comparator.count; int* locations_stop = comparator.locations+comparator.count;
while(n){ int* locations_slider = comparator.locations;
n--; int* values_slider = comparator.values;
while(locations_slider<locations_stop){
/* we calculate the dot-product to derive the cosine /* we calculate the dot-product to derive the cosine
* comparing sparse to dense by index*/ * comparing sparse to dense by index*/
//dot += values[i]*baseRIV.values[locations[i]]; dot += *values_slider * baseRIV.values[*locations_slider];
dot += comparator.values[n] * baseRIV.values[comparator.locations[n]]; locations_slider++;
values_slider++;
//printf("%d, %d, %d\n",baseRIV.values[comparator.locations[n]],comparator.values[n] , n);
} }
/*dot divided by product of magnitudes */ /*dot divided by product of magnitudes */
...@@ -222,181 +185,65 @@ double getMagnitudeSparse(sparseRIV input){ ...@@ -222,181 +185,65 @@ double getMagnitudeSparse(sparseRIV input){
int *values = input.values; int *values = input.values;
int *values_stop = values+input.count; int *values_stop = values+input.count;
while(values<values_stop){ while(values<values_stop){
/* we sum the squares of all elements */
temp += (*values)*(*values); temp += (*values)*(*values);
//if(temp> 0x0AFFFFFFFFFFFFFF) printf("%s, fuuuuuuuuuuuuck*****************************************",input.name );
values++; values++;
} }
/* we take the root of that sum */
return sqrt(temp); return sqrt(temp);
} }
denseRIV lexPull(char* word){ double getMagnitudeDense(denseRIV *input){
#if CACHESIZE > 0 size_t temp = 0;
int *values = input->values;
/* if there is a cache, first check if the word is cached */ int *values_stop = values+RIVSIZE;
srand(wordtoSeed((unsigned char*)word)); while(values<values_stop){
int hash = rand()%CACHESIZE; if(*values){
if(!strcmp(word, RIVKey.RIVCache[hash].name)){ temp += (*values)*(*values);
/* if word is cached, pull from cache and exit */
return RIVKey.RIVCache[hash];
}
#endif /* CACHESIZE > 0 */
/* if not, attempt to pull the word data from lexicon file */
denseRIV output;
char pathString[200];
sprintf(pathString, "%s/%s", RIVKey.lexName, word);
FILE *lexWord = fopen(pathString, "rb");
/* if this lexicon file already exists */
if(lexWord){
/* pull data from file */
output = fLexPull(lexWord);
fclose(lexWord);
}else{
/*if file does not exist, return a 0 vector (word is new to the lexicon */ //#TODO enable NO-NEW features to protect mature lexicons?
output = denseAllocate();
}
strcpy(output.name, word);
return output;
}
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
/* if there is no cache, simply push to file */
fLexPush(RIVout);
return 0;
#else /* CACHESIZE != 0 */
/* if our RIV was cached, there are two options (hopefully)
* either the RIV is still cached, and the data has been updated
* to the cache or the RIV was pushed out from under it,
* in which case it has already been pushed! move on*/
if(RIVout.cached){
return 0;
}
srand(wordtoSeed((unsigned char*)RIVout.name));
int hash = rand()%CACHESIZE;
if(!RIVKey.RIVCache[hash].cached){
/* if there is no word in this cache slot, push to cache instead of file */
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1;
return 0;
/*if the current RIV is more frequent than the RIV holding its slot */
}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
/* push the current cache entry to a file */
int diag = fLexPush(RIVKey.RIVCache[hash]);
/* push the current RIV to cache */
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1;
return diag;
}else{
/* push current RIV to file */
fLexPush(RIVout);
}
return 0;
#endif /* CACHESIZE == 0 */
}
sparseRIV fileToL2direct(FILE *data){;
unsigned char word[100] = {0};
denseRIV denseTemp;
// a temporary dense RIV is stored in the tempBlock
denseTemp.values = RIVKey.h_tempBlock;
memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
int count = 0;
while(fscanf(data, "%99s", word)){
count++;
if(feof(data)){
break;
}
if(!(*word)){
break;
} }
values++;
// add word's L1 RIV to the accumulating implicit RIV
aggregateWord2D(denseTemp, (char*)word);
} }
sparseRIV output = consolidateD2S(denseTemp.values); return sqrt(temp);
// frequency records the number of words in this file
output.frequency = count;
output.boolean = 1;
return output;
} }
sparseRIV normalizeFloored(denseRIV input, int factor){
float divisor = (float)factor/(*input.contextSize);
// printf("in norm: %d, %d, %f\n", *input.contextSize, factor, divisor);
int* locations = RIVKey.h_tempBlock;
int* values = locations+RIVSIZE;
int count = 0;
for(int i=0; i<RIVSIZE; i++){
if(!input.values[i]) continue;
locations[count] = i;
values[count]= input.values[i]*divisor;
if(values[count])count++;
}
sparseRIV output;
output.locations = (int*) malloc(count*2*sizeof(int));
output.values = output.locations+count;
memcpy(output.locations, locations, count*sizeof(int));
memcpy(output.values, values, count*sizeof(int));
strcpy(output.name, input.name);
output.count = count;
output.magnitude = getMagnitudeSparse(output);
output.contextSize = *input.contextSize;
output.frequency = *input.frequency;
return output;
}
sparseRIV normalize(denseRIV input, int factor){ sparseRIV normalize(denseRIV input, int factor){
float divisor = (float)factor/(*input.contextSize); /* multiplier is the scaling factor we need to bring our vector to the right size */
// printf("in norm: %d, %d, %f\n", *input.contextSize, factor, divisor); float multiplier = (float)factor/(input.contextSize);
int* locations = RIVKey.h_tempBlock;
/* write to temp slot, data will go to a permanent home lower in function */
int* locations = RIVKey.h_tempBlock+RIVSIZE;
int* values = locations+RIVSIZE; int* values = locations+RIVSIZE;
int count = 0; int count = 0;
for(int i=0; i<RIVSIZE; i++){ for(int i=0; i<RIVSIZE; i++){
/* if this point is 0, skip it */
if(!input.values[i]) continue; if(!input.values[i]) continue;
/* record position and value in the forming sparse vector */
locations[count] = i; locations[count] = i;
values[count]= roundMultiply(input.values[i], divisor); values[count]= round(input.values[i]*multiplier);
if(values[count])count++;
/* drop any 0 values */
if(values[count] > 1)count++;
} }
sparseRIV output; sparseRIV output;
output.count = count;
/* for memory conservation, both datasets are put inline with each other */
output.locations = (int*) malloc(count*2*sizeof(int)); output.locations = (int*) malloc(count*2*sizeof(int));
output.values = output.locations+count; output.values = output.locations+count;
/* copy the data from tempBlock into permanent home */
memcpy(output.locations, locations, count*sizeof(int)); memcpy(output.locations, locations, count*sizeof(int));
memcpy(output.values, values, count*sizeof(int)); memcpy(output.values, values, count*sizeof(int));
/* carry metadata */
strcpy(output.name, input.name); strcpy(output.name, input.name);
output.count = count;
output.magnitude = getMagnitudeSparse(output); output.magnitude = getMagnitudeSparse(output);
output.contextSize = *input.contextSize; output.contextSize = input.contextSize;
output.frequency = *input.frequency; output.frequency = input.frequency;
return output; return output;
} }
int roundMultiply(int base, float divisor){
float temp = base*divisor;
int output = temp*2;
if (output%2){
output/=2;
output+=1;
}else{
output/=2;
}
return output;
}
#endif #endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment