Commit 34c65893 by amberhosen

updated RIVreads

parent 9d2c0fed
#ifndef RIVLOWER_H_ #ifndef RIVLOWER_H_
#define RIVLOWER_H_ #define RIVLOWER_H_
#include <stdio.h> #include <stdio.h>
...@@ -6,9 +5,8 @@ ...@@ -6,9 +5,8 @@
#include <string.h> #include <string.h>
#include <signal.h> #include <signal.h>
#include <unistd.h> #include <unistd.h>
#include <math.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include "RIVaccessories.h"
/* RIVSIZE macro defines the dimensionality of the RIVs we will use /* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically * 25000 is the standard, but can be redefined specifically
*/ */
...@@ -38,7 +36,7 @@ ...@@ -38,7 +36,7 @@
* that do not use lexpull/push * that do not use lexpull/push
*/ */
#ifndef CACHESIZE #ifndef CACHESIZE
#define CACHESIZE 20 #define CACHESIZE 5000
#endif #endif
#if CACHESIZE<0 #if CACHESIZE<0
...@@ -60,10 +58,9 @@ typedef struct{ ...@@ -60,10 +58,9 @@ typedef struct{
int *values; int *values;
int *locations; int *locations;
size_t count; size_t count;
int frequency;
double magnitude; double magnitude;
int boolean;
int contextSize; int contextSize;
int frequency;
}sparseRIV; }sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors /* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector * this is rarely the case, but its primary use is for performing vector
...@@ -71,12 +68,12 @@ typedef struct{ ...@@ -71,12 +68,12 @@ typedef struct{
* performed between sparse and dense (hetero-arithmetic) * performed between sparse and dense (hetero-arithmetic)
*/ */
typedef struct{ typedef struct{
int cached;
char name[100]; char name[100];
int* values; int frequency;
int* frequency;
double magnitude; double magnitude;
int cached; int contextSize;
int *contextSize; int values[RIVSIZE];
}denseRIV; }denseRIV;
/*RIVKey, holds global variables used under the hood, primarily for the lexicon /*RIVKey, holds global variables used under the hood, primarily for the lexicon
...@@ -87,21 +84,9 @@ struct RIVData{ ...@@ -87,21 +84,9 @@ struct RIVData{
int h_tempBlock[TEMPSIZE]; int h_tempBlock[TEMPSIZE];
int tempSize; int tempSize;
char lexName[255]; char lexName[255];
denseRIV RIVCache[CACHESIZE]; denseRIV* RIVCache[CACHESIZE];
}static RIVKey; }static RIVKey;
/* lexOpen is called to "open the lexicon", setting up for later calls to
* lexPush and lexPull. if the lexicon has not been opened before calls
* to these functions, their behavior can be unpredictable, most likely crashing
*/
void lexOpen();
/* lexClose should always be called after the last lex push or lex pull call
* if the lexicon is left open, some vector data may be lost due to
* un-flushed RIV cache
*/
void lexClose();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with /*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned * all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't * to a denseRIV after the fact. often denseRIVs are only temporary, and don't
...@@ -109,32 +94,12 @@ void lexClose(); ...@@ -109,32 +94,12 @@ void lexClose();
*/ */
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* makeSparseLocations must be called repeatedly in the processing of a /* makeSparseLocations must be called repeatedly in the processing of a
* file to produce a series of locations from the words of the file * file to produce a series of locations from the words of the file
* this produces an "implicit" RIV which can be used with the mapI2D function * this produces an "implicit" RIV which can be used with the mapI2D function
* to create a denseRIV. * to create a denseRIV.
*/ */
void makeSparseLocations(unsigned char* word, int *seeds, size_t seedCount); void makeSparseLocations(char* word, int *seeds, size_t seedCount);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexPush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV RIVout);
/* flexPull pulls data directly from a file and converts it (if necessary)
* to a denseRIV. function is called by "lexPull" which is what users
* should actually use. lexPull, unlike FlexPull, has cache logic under
* the hood for speed and harddrive optimization
*/
denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values, /* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations) * arranged by chronological order of generation (as per makesparseLocations)
...@@ -147,11 +112,6 @@ int* mapI2D(int *locations, size_t seedCount); ...@@ -147,11 +112,6 @@ int* mapI2D(int *locations, size_t seedCount);
* to be more than worth using * to be more than worth using
*/ */
int* addS2D(int* destination, sparseRIV input); int* addS2D(int* destination, sparseRIV input);
/*
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
* consolidate I2S is temporarily deprecated. may be brought back.
* in tandem they are much faster, but less careful with RAM */
/* cacheDump flushes the RIV cache out to relevant files, backing up all /* cacheDump flushes the RIV cache out to relevant files, backing up all
* data. this is called by the lexClose and signalSecure functions * data. this is called by the lexClose and signalSecure functions
...@@ -163,11 +123,9 @@ int cacheDump(); ...@@ -163,11 +123,9 @@ int cacheDump();
*/ */
int* addI2D(int* destination, int* locations, size_t seedCount); int* addI2D(int* destination, int* locations, size_t seedCount);
/* allocates a denseRIV filled with 0s /*subtracts a words vector from its own context. regularly used in lex building
*/ */
denseRIV denseAllocate(); void subtractThisWord(denseRIV* vector);
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum);
/* begin definitions */ /* begin definitions */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
...@@ -186,7 +144,6 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete ...@@ -186,7 +144,6 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
return destination; return destination;
} }
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int *destination = (int*)calloc(RIVSIZE,sizeof(int)); int *destination = (int*)calloc(RIVSIZE,sizeof(int));
int *locations_slider = locations; int *locations_slider = locations;
...@@ -201,7 +158,6 @@ int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination paramete ...@@ -201,7 +158,6 @@ int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination paramete
locations_slider++; locations_slider++;
} }
return destination; return destination;
} }
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
...@@ -221,50 +177,8 @@ int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix de ...@@ -221,50 +177,8 @@ int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix de
return destination; return destination;
} }
/*
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
int *denseTemp = mapI2D(implicit, valueCount);
sparseRIV sparseOut = consolidateD2S(denseTemp);
free(denseTemp);
return sparseOut;
}
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
sparseRIV sparseOut;
int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
sparseOut.count = 0;
int add = 1;
int found;
for(int i=0; i<valueCount; i++){
found = 0;
for(int j=0; j<sparseOut.count; j++){
if(implicit[i] == locationsTemp[j]){
valuesTemp[i] += add;
add *= -1;
found = 1;
}
}
if(!found){
locationsTemp[sparseOut.count] = implicit[i];
valuesTemp[sparseOut.count] = add;
sparseOut.count++;
add*= -1;
}
}
sparseOut.locations = (int*)malloc(2*sparseOut.count*sizeof(int));
sparseOut.values = sparseOut.locations+sparseOut.count;
memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
return sparseOut;
}*/
sparseRIV consolidateD2S(int *denseInput){ sparseRIV consolidateD2S(int *denseInput){
sparseRIV output; sparseRIV output;
output.count = 0; output.count = 0;
...@@ -305,46 +219,8 @@ sparseRIV consolidateD2S(int *denseInput){ ...@@ -305,46 +219,8 @@ sparseRIV consolidateD2S(int *denseInput){
} }
void lexOpen(char* lexName){
/* RIVKey.I2SThreshold = sqrt(RIVSIZE);*/ //deprecate?
struct stat st;
if (stat(lexName, &st) == -1) {
mkdir(lexName, 0777);
}
strcpy(RIVKey.lexName, lexName);
/* open a slot at least large enough for worst case handling of
* sparse to dense conversion. may be enlarged by filetoL2 functions */
for(int i=1; i<20; i++){
signal(i, signalSecure);
}
/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
void lexClose(){
if(cacheDump()){
puts("cache dump failed, some lexicon data was lost");
}
}
int wordtoSeed(unsigned char* word){
int i=0;
int seed = 0;
while(*word){
/* left-shift 5 each time *should* make seeds unique to words
* this means letters are taken as characters counted in base 32, which
* should be large enough to hold all english characters plus a few outliers
* */
seed += (*(word))<<(i*5);
word++;
i++;
}
return seed;
}
void makeSparseLocations(unsigned char* word, int *locations, size_t count){ void makeSparseLocations(char* word, int *locations, size_t count){
locations+=count; locations+=count;
srand(wordtoSeed(word)); srand(wordtoSeed(word));
int *locations_stop = locations+NONZEROS; int *locations_stop = locations+NONZEROS;
...@@ -358,128 +234,29 @@ void makeSparseLocations(unsigned char* word, int *locations, size_t count){ ...@@ -358,128 +234,29 @@ void makeSparseLocations(unsigned char* word, int *locations, size_t count){
return; return;
} }
int fLexPush(denseRIV RIVout){ sparseRIV* sparseAllocateFormatted(){
char pathString[200] = {0}; sparseRIV* output = (sparseRIV*)calloc(1, sizeof(sparseRIV));
/* word data will be placed in a (new?) file under the lexicon directory
* in a file named after the word itself */
sprintf(pathString, "%s/%s", RIVKey.lexName, RIVout.name);
FILE *lexWord = fopen(pathString, "wb");
if(!lexWord){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
return 1;
}
sparseRIV temp = consolidateD2S(RIVout.values);
if(temp.count<(RIVSIZE/2)){
/* smaller stored as sparse vector */
fwrite(&temp.count, 1, sizeof(size_t), lexWord);
fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
fwrite(RIVout.contextSize, 1, sizeof(int), lexWord);
fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
fwrite(temp.locations, temp.count, sizeof(int), lexWord);
fwrite(temp.values, temp.count, sizeof(int), lexWord);
// printf("%s, writing as sparse, frequency: %d", RIVout.name, *RIVout.frequency);
}else{
/* saturation is too high, better to store dense */
/* there's gotta be a better way to do this */
temp.count = 0;
fwrite(&temp.count, 1, sizeof(size_t), lexWord);
fwrite(RIVout.frequency, 1, sizeof(int), lexWord);
fwrite(RIVout.contextSize, 1, sizeof(int), lexWord);
fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
// printf("%s, writing as dense, frequency: %d", RIVout.name, *RIVout.frequency);
}
fclose(lexWord);
free(RIVout.values);
free(temp.locations);
return 0;
}
denseRIV fLexPull(FILE* lexWord){
denseRIV output = denseAllocate();
size_t typeCheck;
int flag = 0;
/* get metadata for vector */
flag+= fread(&typeCheck, 1, sizeof(size_t), lexWord);
flag+= fread(output.frequency, 1, sizeof(int), lexWord);
flag+= fread(output.contextSize, 1, sizeof(int), lexWord);
flag+= fread(&(output.magnitude), 1, sizeof(float), lexWord);
/* first value stored is the value count if sparse, and 0 if dense */
if (typeCheck){
/* pull as sparseVector */
sparseRIV temp;
/* value was not 0, so it's the value count */
temp.count = typeCheck;
temp.locations = (int*)malloc(temp.count*2*sizeof(int));
temp.values = temp.locations+temp.count;
flag+= fread(temp.locations, temp.count, sizeof(int), lexWord);
flag+=fread(temp.values, temp.count, sizeof(int), lexWord);
addS2D(output.values, temp);
free(temp.locations);
}else{
/* typecheck is thrown away, just a flag in this case */
flag += fread(output.values, RIVSIZE, sizeof(int), lexWord);
}
output.cached = 0;
return output; return output;
}
void signalSecure(int signum){
if(cacheDump()){
puts("cache dump failed, some lexicon data lost");
}else{
puts("cache dumped successfully");
}
signal(signum, SIG_DFL);
exit(1);
} }
void subtractThisWord(denseRIV* vector){
int cacheDump(){ //set the rand() seed to the word
srand(wordtoSeed(vector->name));
int flag = 0; /* the base word vector is composed of NONZERO (always an even number)
denseRIV* cache_slider = RIVKey.RIVCache; * +1s and -1s at "random" points (defined by the above seed.
denseRIV* cache_stop = RIVKey.RIVCache+CACHESIZE; * if we invert it to -1s and +1s, we have subtraction */
while(cache_slider<cache_stop){
if((*cache_slider).cached){ for(int i = 0; i < NONZEROS; i+= 2){
vector->values[rand()%RIVSIZE] -= 1;
flag += fLexPush(*cache_slider); vector->values[rand()%RIVSIZE] += 1;
}
else{
} }
cache_slider++; /* record a context size 1 smaller */
} vector->contextSize-= 1;
return flag;
}
denseRIV denseAllocate(){
/* allocates a 0 vector */
denseRIV output;
output.values = (int*)calloc(RIVSIZE+2, sizeof(int));
/* for compact memory use, frequency is placed immediately after values */
output.frequency = output.values+RIVSIZE;
output.contextSize = output.frequency+1;
output.magnitude = 0;
output.cached = 0;
return output;
} }
/*TODO add a simplified free function*/
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h>
#define CACHESIZE 15000
#define RIVSIZE 50000
#define NONZEROS 8
#include <setjmp.h>
#include <signal.h>
#include "../RIVet/RIVtools.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#define RIVSIZE 200000
#define NONZEROS 2
#define CACHESIZE 1000
#include "../RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one
void fileGrind(FILE* textFile); void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount); void addContext(denseRIV* lexRIV, sparseRIV context);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void readdirContingency(int sigNumber);
jmp_buf readdirRecov;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
clock_t begintotal = clock();
lexOpen("/home/drbob/Documents/lexicon8-50");
char pathString[1000]; char pathString[1000];
//we open the lexicon, if it does not yet exist, it will be created
lexOpen("lexicon200-2");
//we format the root directory, preparing to scan its contents
strcpy(pathString, argv[1]); strcpy(pathString, argv[1]);
strcat(pathString, "/"); strcat(pathString, "/");
struct stat st = {0}; //ensure that the targeted root directory exists
struct stat st;
if(stat(pathString, &st) == -1) { if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist");
return 1; return 1;
} }
//we will scan the directory, adding all data to our lexicon, as seen inside
directoryGrind(pathString); directoryGrind(pathString);
clock_t endtotal = clock(); //we close the lexicon again, ensuring all data is secured
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
lexClose(); lexClose();
return 0; return 0;
} }
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){ //mostly a standard recursive Dirent-walk
denseRIV *denseSet_slider = denseSet;
denseRIV *dense_stop = denseSet+RIVCount;
while(denseSet_slider<dense_stop){
addS2D((*denseSet_slider).values, additive);
*(denseSet_slider->contextSize) += additive.frequency;
denseSet_slider++;
}
}
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
denseRIV* RIVStop = RIVSet+wordCount;
while(RIVSet<RIVStop){
if(!strcmp(word, RIVSet->name)){
return 1;
}
RIVSet++;
}
return 0;
}
void directoryGrind(char *rootString){ void directoryGrind(char *rootString){
/* *** begin Dirent walk *** */
char pathString[2000]; char pathString[2000];
DIR *directory; DIR *directory;
struct dirent *files = 0; struct dirent *files = 0;
...@@ -76,15 +57,13 @@ void directoryGrind(char *rootString){ ...@@ -76,15 +57,13 @@ void directoryGrind(char *rootString){
} }
while((files=readdir(directory))){ while((files=readdir(directory))){
if(setjmp(readdirRecov)){
continue;
}
//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name)); if(!files->d_name[0]) break;
while(*(files->d_name)=='.'){ while(*(files->d_name)=='.'){
files = readdir(directory); files = readdir(directory);
} }
//signal(SIGSEGV, signalSecure);
if(files->d_type == DT_DIR){ if(files->d_type == DT_DIR){
strcpy(pathString, rootString); strcpy(pathString, rootString);
...@@ -92,63 +71,87 @@ void directoryGrind(char *rootString){ ...@@ -92,63 +71,87 @@ void directoryGrind(char *rootString){
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
strcat(pathString, "/"); strcat(pathString, "/");
directoryGrind(pathString); directoryGrind(pathString);
continue;
} }
strcpy(pathString, rootString); strcpy(pathString, rootString);
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
printf("%s\n", pathString); printf("%s\n", pathString);
FILE *input = fopen(pathString, "r+"); /* *** end dirent walk, begin meat of function *** */
//check for non-txt files
char *fileEnding = pathString+strlen(pathString)-4;
if(strcmp(fileEnding, ".txt")){
printf("skipped: %s\n", files->d_name);
continue;
}
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){ if(input){
//process this file and add its data to lexicon
fileGrind(input); fileGrind(input);
fclose(input); fclose(input);
} }
} }
} }
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
void fileGrind(FILE* textFile){ void fileGrind(FILE* textFile){
sparseRIV aggregateRIV = fileToL2Clean(textFile); //form a context vector. "clean" indicates that it will ignore any word which
fseek(textFile, 0, SEEK_SET); //contains unwanted characters
sparseRIV contextVector = fileToL2Clean(textFile);
int wordCount = 0; //an array of denseRIVs, large enough to hold all vectors
denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV)); //(we don't yet know how many vectors there will be, so we make it big enough for the maximum)
char word[200]; denseRIV* lexiconRIV;
char word[100] = {0};
while(fscanf(textFile, "%99s", word)){ while(fscanf(textFile, "%99s", word)){
//we ensure that each word exists, and is free of unwanted characters
if(feof(textFile)) break; if(feof(textFile)) break;
if(!(*word))continue; if(!(*word))continue;
if(!isWordClean((char*)word)){ if(!isWordClean((char*)word)){
continue; continue;
} }
if(checkDupe(RIVArray, word, wordCount)){
continue;
}
RIVArray[wordCount] = lexPull(word);
if(!*((RIVArray[wordCount].name))) break;
*(RIVArray[wordCount].frequency)+= 1;; //we pull the vector corresponding to each word from the lexicon
//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing); //if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(word);
wordCount++; //we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector);
} //we remove the sub-vector corresponding to the word itself
//printf("%d\n", wordCount); subtractThisWord(lexiconRIV);
addS2Ds(RIVArray, aggregateRIV, wordCount); //we log that this word has been encountered one more time
denseRIV* RIVArray_slider = RIVArray; lexiconRIV->frequency += 1;
denseRIV* RIVArray_stop = RIVArray+wordCount;
while(RIVArray_slider<RIVArray_stop){
lexPush(*RIVArray_slider); //and finally we push it back to the lexicon for permanent storage
RIVArray_slider++; lexPush(lexiconRIV);
}
free(RIVArray);
free(aggregateRIV.locations);
}
free(contextVector.locations);
} }
void readdirContingency(int sigNumber){
puts("readdir segfaulted, trying to recover"); void addContext(denseRIV* lexRIV, sparseRIV context){
longjmp(readdirRecov, 1);
//add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context);
//log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analyses
lexRIV->contextSize += context.contextSize;
} }
...@@ -5,15 +5,16 @@ clean(){ ...@@ -5,15 +5,16 @@ clean(){
else else
python shittyballs.py "$1" python shittyballs.py "$1"
./RIVread cleanbooks/ ./RIVread1 cleanbooks/
# ./RIVread1 cleanbooks/
./RIVread2 cleanbooks/ ./RIVread2 cleanbooks/
#./RIVread3 cleanbooks/ ./RIVread3 cleanbooks/
#./RIVread4 cleanbooks/ ./RIVread4 cleanbooks/
./RIVread5 cleanbooks/ ./RIVread5 cleanbooks/
./RIVread6 cleanbooks/ ./RIVread6 cleanbooks/
./RIVread7 cleanbooks/
rm -r cleanbooks/ rm -r cleanbooks/
#rm "$1"
fi fi
shift shift
done done
...@@ -21,4 +22,4 @@ clean(){ ...@@ -21,4 +22,4 @@ clean(){
clean ../bookCleaner/books/* clean ../../books/*
import requests #import requests
import re import re
import string import string
import os import os
...@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn ...@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn
import pdb import pdb
from nltk.stem import PorterStemmer from nltk.stem import PorterStemmer
def adverbFix(word):
if not nltk.pos_tag(word)[0][1] == 'RB':
return word
adjective = word[:-2] def writeWord(cleanString, word, stem, blacklist):
if not nltk.pos_tag(word)[0][1] == 'JJ': if word == stem:
return word; FILE = open("lexicon/" + word, "w")
FILE.write("1");
FILE.close();
return (cleanString + " " + word)
elif stem not in blacklist:
if len(stem) > 2:
FILE = open("lexicon/" + word, "w") FILE = open("lexicon/" + word, "w")
FILE.write("2" + temp) FILE.write("2"+stem);
FILE.close() FILE.close();
FILE = open("lexicon/" + adjective, "w") FILE = open("lexicon/" + stem, "w")
FILE.write("1") FILE.write("1")
FILE.close() FILE.close();
return adjective return (cleanString + " " + stem)
return cleanString
def strip(word):
for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']: def liFix(word):
if word.endswith(suffix): if not word[len(word)-2:] == "li":
return word[:-len(suffix)]
return word return word
temp = ps.stem(word[:-2])
if temp:
return temp
return word
def cleanWord(word): def cleanWord(word):
#if(len(word) == 0):
#print("\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************")
word = word.lower(); word = word.lower();
regex = re.compile('[^a-z]+') regex = re.compile('[^a-z]+')
word = regex.sub('', word) word = regex.sub('', word)
...@@ -44,13 +50,11 @@ def cleanWord(word): ...@@ -44,13 +50,11 @@ def cleanWord(word):
def fileCheck(word): def fileCheck(word):
try: try:
#print("trying")
wordFile = open("lexicon/{}".format(word), "r") wordFile = open("lexicon/{}".format(word), "r")
code = int(wordFile.read(1)) code = int(wordFile.read(1))
except: except:
#print("file does not exist")
return 0 return 0
#print("fileCode{}".format(code))
if code == 2: if code == 2:
word = wordFile.read() word = wordFile.read()
...@@ -74,6 +78,8 @@ def morphyTest(word): ...@@ -74,6 +78,8 @@ def morphyTest(word):
return morphyTemp; return morphyTemp;
#begin mainfunction
blacklist = ["a", "an", "the", "so", "as", "how", blacklist = ["a", "an", "the", "so", "as", "how",
"i", "me", "we", "they", "you", "it", "he", "she", "i", "me", "we", "they", "you", "it", "he", "she",
"but", "have", "had", "but", "have", "had",
...@@ -90,13 +96,13 @@ print(sourceString + "\n") ...@@ -90,13 +96,13 @@ print(sourceString + "\n")
if not os.path.exists('cleanbooks'): if not os.path.exists('cleanbooks'):
os.makedirs('cleanbooks') os.makedirs('cleanbooks')
# if not os.path.exists('lexicon'): if not os.path.exists('lexicon'):
# os.makedirs('lexicon') os.makedirs('lexicon')
if not os.path.exists(pathString): if not os.path.exists(pathString):
os.makedirs(pathString) os.makedirs(pathString)
#call(["python", "blacklist.py"]) call(["python", "blacklist.py"])
i=0 i=0
skip = 1 skip = 1
with open(sourceString, 'U') as fileIn: with open(sourceString, 'U') as fileIn:
...@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn: ...@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn:
for tempWord in line.split(): for tempWord in line.split():
word=cleanWord(tempWord) word=cleanWord(tempWord)
if not word: if not word:
continue continue
if len(word) < 3:
continue;
if word in blacklist:
continue;
# temp = fileCheck(word)
#
# if temp == -1:
# continue
# if temp == 0:
temp = morphyTest(word)
if temp:
stem = ps.stem(temp)
if stem and not stem in blacklist:
cleanString = cleanString + ' ' + stem
temp = fileCheck(word)
if temp == -1:
continue
if temp:
cleanString = (cleanString + " " + temp);
continue
else:
morphy = morphyTest(word)
if morphy:
stem = ps.stem(morphy)
if stem:
stem = liFix(stem)
cleanString = writeWord(cleanString, word, stem, blacklist)
#if temp == 0:
# catchAll(word)
cleanString = cleanString + os.linesep cleanString = cleanString + os.linesep
if len(cleanString.split(' ')) > 10: if len(cleanString.split(' ')) > 2:
fileOut.write(cleanString) fileOut.write(cleanString)
fileOut.close() fileOut.close()
......
#ifndef RIVACCESS_H_ #ifndef RIVACCESS_H_
#define RIVACCESS_H_ #define RIVACCESS_H_
/*isWordClean filters words that contain non-letter characters, and /*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through * upperCase letters, allowing only the '_' symbol through
*/ */
int isWordClean(char* word); int isWordClean(char* word);
/* used by wordClean */ /* used by wordClean */
int isLetter(char c); int isLetter(char c);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(char* word);
int isLetter(char c){ int isLetter(char c){
if((c>96 && c<123)||(c == 32) || (c == '_')) return 1; if((c>96 && c<123)||(c == 32) || (c == '_')) return 1;
...@@ -26,5 +33,19 @@ int isWordClean(char* word){ ...@@ -26,5 +33,19 @@ int isWordClean(char* word){
return 1; return 1;
} }
/* wordtoSeed folds the characters of a NUL-terminated word into an int that
 * is used as a srand() seed.
 *
 * Each character is shifted left by 5 bits more than the previous one
 * (letters treated as base-32 digits) and the results are summed.
 *
 * The earlier form shifted a signed int by i*5 bits, which is undefined
 * behaviour as soon as the count reaches the width of int (words of 7 or
 * more characters) and also on signed overflow of the running sum. Here the
 * arithmetic is unsigned and the shift count is reduced modulo 32, so the
 * result is fully defined for every input.
 * NOTE(review): the mask assumes a 32-bit int and mirrors the count-masking
 * a 32-bit x86 shift performs; confirm this matches seeds already stored in
 * existing lexicons before relying on cross-platform equality.
 *
 * word:    NUL-terminated string, must not be NULL
 * returns: the seed; 0 for the empty string
 */
int wordtoSeed(char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;

    for(; *word; word++){
        /* go through int first so a negative char keeps its bit pattern */
        seed += (unsigned int)(int)(*word) << (shift & 31u);
        shift += 5;
    }
    return (int)seed;
}
#endif #endif
#ifndef RIV_LEXICON_H
#define RIV_LEXICON_H
#include "RIVLower.h"
#include "RIVaccessories.h"
/* lexOpen is called to "open the lexicon", setting up for later calls to
* lexPush and lexPull. if the lexicon has not been opened before calls
* to these functions, their behavior can be unpredictable, most likely crashing
*/
void lexOpen();
/* lexClose should always be called after the last lex push or lex pull call
* if the lexicon is left open, some vector data may be lost due to
* un-flushed RIV cache
*/
void lexClose();
/* both lexPush and lexPull must be called *after* the lexOpen() function
* and after using them the lexClose() function must be called to ensure
* data security */
/* lexPush writes a denseRIV to the lexicon for permanent storage */
int lexPush(denseRIV* RIVout);
int cacheCheckOnPush(denseRIV* RIVout);
/* lexPull reads a denseRIV from the lexicon, under "word"
* if the file does not exist, it creates a 0 vector with the name of word
* lexPull returns a denseRIV *pointer* because its data must be tracked
* globally for key optimizations
*/
denseRIV* lexPull(char* word);
denseRIV* cacheCheckOnPull(char* word);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexPush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV* RIVout);
/* flexPull pulls data directly from a file and converts it (if necessary)
* to a denseRIV. function is called by "lexPull" which is what users
* should actually use. lexPull, unlike FlexPull, has cache logic under
* the hood for speed and harddrive optimization
*/
denseRIV* fLexPull(FILE* lexWord);
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
/* lexOpen "opens the lexicon" and must run before any lexPush/lexPull call.
 *
 *  - creates the directory lexName when stat() cannot find it
 *  - records lexName in RIVKey.lexName; a name that does not fit is
 *    truncated with a warning instead of being written past the buffer
 *  - installs signalSecure as the handler for signals 1..26
 *  - clears every slot of the RIV cache
 *
 * lexName: path of the lexicon directory, NUL-terminated
 */
void lexOpen(char* lexName){
    struct stat st = {0};
    if(stat(lexName, &st) == -1){
        if(mkdir(lexName, 0777) == -1){
            perror("lexOpen: could not create lexicon directory");
        }
    }

    /* bounded copy: RIVKey.lexName is a fixed-size array */
    int written = snprintf(RIVKey.lexName, sizeof RIVKey.lexName, "%s", lexName);
    if(written < 0 || (size_t)written >= sizeof RIVKey.lexName){
        fprintf(stderr, "lexOpen: lexicon name too long, truncated to \"%s\"\n", RIVKey.lexName);
    }

    /* route signals through signalSecure; sigaction() simply fails for
     * signals whose disposition cannot be changed, and that result is
     * deliberately ignored */
    struct sigaction action = {0};
    action.sa_sigaction = signalSecure;
    action.sa_flags = SA_SIGINFO;
    for(int i=1; i<27; i++){
        sigaction(i, &action, NULL);
    }

    /* start with an empty cache of dense RIV pointers */
    memset(RIVKey.RIVCache, 0, sizeof(denseRIV*)*CACHESIZE);
}
/* lexClose flushes the RIV cache through cacheDump() and reports on stdout
 * when that call returns non-zero. Call it after the last lexPush/lexPull.
 */
void lexClose(){
    int dumpFailed = cacheDump();
    if(!dumpFailed){
        return;
    }
    puts("cache dump failed, some lexicon data was lost");
}
#if CACHESIZE > 0
denseRIV* cacheCheckOnPull(char* word){
srand(wordtoSeed(word));
int hash = rand()%CACHESIZE;
if(RIVKey.RIVCache[hash]){
if(!strcmp(word, RIVKey.RIVCache[hash]->name)){
/* if word is cached, pull from cache and exit */
return RIVKey.RIVCache[hash];
}
}
return NULL;
}
#endif
/* lexPull fetches the denseRIV stored under word.
 *
 * Order of lookup:
 *  1. the RIV cache (only when CACHESIZE > 0)
 *  2. the file "<RIVKey.lexName>/<word>", read through fLexPull
 *  3. otherwise a freshly zeroed denseRIV (the word is new to the lexicon)
 *
 * In cases 2 and 3 the vector's name field is set to word (bounded by the
 * size of the name array).
 *
 * returns: pointer to the vector, or NULL when the path does not fit the
 *          local buffer or no vector could be obtained (allocation failure;
 *          NOTE(review): fLexPull is assumed to be able to return NULL too -
 *          confirm against its definition). Earlier versions wrote past
 *          their buffers or dereferenced NULL in these situations.
 */
denseRIV* lexPull(char* word){
    denseRIV* output;

#if CACHESIZE > 0
    /* if there is a cache, first check if the word is cached */
    if((output = cacheCheckOnPull(word))){
        return output;
    }
#endif /* CACHESIZE > 0 */

    /* if not, attempt to pull the word data from lexicon file;
     * the buffer is sized for a full lexName plus a word, and checked anyway */
    char pathString[512];
    int needed = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, word);
    if(needed < 0 || (size_t)needed >= sizeof pathString){
        fprintf(stderr, "lexPull: path too long for word: %s\n", word);
        return NULL;
    }

    FILE *lexWord = fopen(pathString, "rb");
    if(lexWord){
        /* this lexicon file already exists: pull data from file */
        output = fLexPull(lexWord);
        fclose(lexWord);
    }else{
        /* file does not exist: return a 0 vector (word is new to the lexicon) */
        //#TODO enable NO-NEW features to protect mature lexicons?
        output = calloc(1, sizeof(denseRIV));
    }

    if(!output){
        fprintf(stderr, "lexPull: could not obtain a vector for word: %s\n", word);
        return NULL;
    }

    /* bounded copy into the fixed-size name field */
    snprintf(output->name, sizeof output->name, "%s", word);
    return output;
}
#if CACHESIZE > 0
/* cacheCheckOnPush decides whether RIVout should live in the cache
 * returns 1 when the vector is (now) held by the cache and must not be
 * written to file by the caller, 0 when the caller should push it to file
 * a vector takes a slot when the slot is empty or when its frequency is
 * strictly greater than that of the current occupant; the displaced
 * occupant is handed to fLexPush (whose result is not examined here)
 */
int cacheCheckOnPush(denseRIV* RIVout){
	/* a vector already flagged as cached needs no further handling */
	if(RIVout->cached){
		return 1;
	}

	/* same slot selection as cacheCheckOnPull */
	srand(wordtoSeed(RIVout->name));
	int slot = rand()%CACHESIZE;
	denseRIV* resident = RIVKey.RIVCache[slot];

	/* the occupant is at least as frequent: it keeps the slot */
	if(resident && RIVout->frequency <= resident->frequency){
		return 0;
	}

	/* a less frequent occupant is pushed out to its file first */
	if(resident){
		fLexPush(resident);
	}

	/* the slot now belongs to the incoming vector */
	RIVKey.RIVCache[slot] = RIVout;
	RIVout->cached = 1;
	return 1;
}
#endif
/* lexPush hands a vector back to the lexicon
 * with a cache (CACHESIZE > 0) the vector is first offered to
 * cacheCheckOnPush; if the cache keeps it, nothing is written and 0 is
 * returned. otherwise the vector goes to fLexPush and that function's
 * result is returned
 */
int lexPush(denseRIV* RIVout){
#if CACHESIZE > 0
	int keptInCache = cacheCheckOnPush(RIVout);
	if(keptInCache){
		return 0;
	}
#endif /* CACHESIZE > 0 */
	/* not cached: write straight to the lexicon file */
	return fLexPush(RIVout);
}
/* fLexPush writes one dense vector to the file "<RIVKey.lexName>/<name>"
 * file layout (native byte order):
 *   size_t count        number of sparse entries, or 0 for dense storage
 *   int    frequency
 *   int    contextSize
 *   float  magnitude    (the double is narrowed to float for storage)
 *   then either  count locations followed by count values   (sparse)
 *   or           RIVSIZE values                              (dense)
 * ownership: on success (return 0) the vector has been written and
 * output is freed. on failure (return 1) output is left untouched and
 * still belongs to the caller
 */
int fLexPush(denseRIV* output){
	/* word data will be placed in a (new?) file under the lexicon directory
	 * in a file named after the word itself */
	char pathString[200] = {0};
	int pathLength = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, output->name);
	if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
		/* a truncated path could overwrite some other word's file */
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
		return 1;
	}

	FILE *lexWord = fopen(pathString, "wb");
	if(!lexWord){
		printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
		return 1;
	}

	sparseRIV temp = consolidateD2S(output->values);

	/* vectors under half saturation are smaller stored sparse; a count
	 * of 0 in the header marks dense storage */
	int storeSparse = temp.count < (RIVSIZE/2);
	size_t countField = storeSparse ? temp.count : 0;

	/* the header slot for magnitude is float-sized: convert the value
	 * rather than writing the first bytes of the double */
	float magnitude = (float)output->magnitude;

	int ok = 1;
	ok &= (fwrite(&countField, sizeof countField, 1, lexWord) == 1);
	ok &= (fwrite(&output->frequency, sizeof(int), 1, lexWord) == 1);
	ok &= (fwrite(&output->contextSize, sizeof(int), 1, lexWord) == 1);
	ok &= (fwrite(&magnitude, sizeof magnitude, 1, lexWord) == 1);
	if(storeSparse){
		ok &= (fwrite(temp.locations, sizeof(int), temp.count, lexWord) == temp.count);
		ok &= (fwrite(temp.values, sizeof(int), temp.count, lexWord) == temp.count);
	}else{
		ok &= (fwrite(output->values, sizeof(int), RIVSIZE, lexWord) == (size_t)RIVSIZE);
	}
	/* fclose flushes buffered data, so its result matters too */
	ok &= (fclose(lexWord) == 0);

	/* temp.values is freed together with temp.locations, as before */
	free(temp.locations);

	if(!ok){
		printf("lexicon write has failed for word: %s\n", pathString);
		return 1;
	}
	free(output);
	return 0;
}
/* fLexPull decodes one lexicon file (layout described at fLexPush) into
 * a newly allocated dense vector with cached set to 0; the name field
 * is left empty for the caller to fill
 * returns NULL only when memory cannot be allocated
 * a file whose header is incomplete, or whose payload is truncated or
 * holds locations outside the vector, yields an all-zero vector (and a
 * warning on stderr for a bad payload) instead of undefined behavior
 */
denseRIV* fLexPull(FILE* lexWord){
	denseRIV *output = calloc(1, sizeof *output);
	if(!output){
		return NULL;
	}

	/* get metadata for vector */
	size_t typeCheck = 0;
	float magnitude = 0;
	if(fread(&typeCheck, sizeof typeCheck, 1, lexWord) != 1
	|| fread(&output->frequency, sizeof(int), 1, lexWord) != 1
	|| fread(&output->contextSize, sizeof(int), 1, lexWord) != 1
	|| fread(&magnitude, sizeof magnitude, 1, lexWord) != 1){
		/* incomplete header: nothing usable, hand back a zero vector */
		memset(output, 0, sizeof *output);
		return output;
	}
	/* magnitude is stored as a float, widen it back to the double field */
	output->magnitude = magnitude;

	int corrupt = 0;
	/* first value stored is the value count if sparse, and 0 if dense */
	if(typeCheck){
		if(typeCheck > (size_t)RIVSIZE){
			/* more entries than the vector has dimensions */
			corrupt = 1;
		}else{
			/* locations and values share one allocation, values second */
			int *locations = malloc(typeCheck*2*sizeof(int));
			if(!locations){
				free(output);
				return NULL;
			}
			int *values = locations+typeCheck;
			size_t gotLocations = fread(locations, sizeof(int), typeCheck, lexWord);
			size_t gotValues = fread(values, sizeof(int), typeCheck, lexWord);
			if(gotLocations != typeCheck || gotValues != typeCheck){
				corrupt = 1;
			}
			/* addS2D is given locations straight from the file, so
			 * make sure each one is a valid index into values */
			for(size_t i = 0; !corrupt && i < typeCheck; i++){
				if(locations[i] < 0 || locations[i] >= RIVSIZE){
					corrupt = 1;
				}
			}
			if(!corrupt){
				sparseRIV temp = {0};
				temp.count = typeCheck;
				temp.locations = locations;
				temp.values = values;
				addS2D(output->values, temp);
			}
			free(locations);
		}
	}else{
		/* dense storage. an all-zero vector is written by fLexPush as a
		 * sparse record with count 0 and no payload, so reading nothing
		 * here is legitimate and leaves the values at 0 */
		size_t gotValues = fread(output->values, sizeof(int), RIVSIZE, lexWord);
		if(gotValues != 0 && gotValues != (size_t)RIVSIZE){
			corrupt = 1;
		}
	}

	if(corrupt){
		fputs("lexicon entry is corrupt, treating word as new\n", stderr);
		memset(output, 0, sizeof *output);
	}
	output->cached = 0;
	return output;
}
/* cacheDump writes every cached vector to its lexicon file
 * returns the sum of the fLexPush results, 0 meaning every entry was written
 * fLexPush frees a vector it has written, so the slot is cleared on
 * success; otherwise the cache would keep a dangling pointer that a
 * later lexPull or a second dump would dereference or free again.
 * an entry whose push failed is kept in its slot
 */
int cacheDump(){
	int flag = 0;
	for(int i = 0; i < CACHESIZE; i++){
		if(!RIVKey.RIVCache[i]){
			continue;
		}
		int failed = fLexPush(RIVKey.RIVCache[i]);
		if(failed){
			flag += failed;
		}else{
			/* the vector was freed by fLexPush, forget the pointer */
			RIVKey.RIVCache[i] = NULL;
		}
	}
	return flag;
}
/*TODO add a simplified free function*/
/* signalSecure is the handler lexOpen installs: it dumps the cache,
 * reports the outcome on stdout, then restores the signal's default
 * disposition and sends the same signal to this process again
 * si and arg are required by the SA_SIGINFO handler signature and unused
 * NOTE(review): cacheDump does file I/O and frees memory from inside a
 * signal handler; confirm this is acceptable for the signals handled
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
	const char* report = cacheDump()
		? "cache dump failed, some lexicon data lost"
		: "cache dumped successfully";
	puts(report);

	/* hand the signal back to the default action */
	signal(signum, SIG_DFL);
	kill(getpid(), signum);
}
#endif
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h>
#define CACHESIZE 15000
#include <setjmp.h>
#include <signal.h>
#include "RIVtools.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#include "../../RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one
void fileGrind(FILE* textFile); void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount); void addContext(denseRIV* lexRIV, sparseRIV context);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void readdirContingency(int sigNumber);
jmp_buf readdirRecov;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
clock_t begintotal = clock();
lexOpen("/home/drbob/Documents/lexicon");
char pathString[1000]; char pathString[1000];
//we open the lexicon, if it does not yet exist, it will be created
lexOpen("lexicon");
//we format the root directory, preparing to scan its contents
strcpy(pathString, argv[1]); strcpy(pathString, argv[1]);
strcat(pathString, "/"); strcat(pathString, "/");
struct stat st = {0}; //ensure that the targeted root directory exists
struct stat st;
if(stat(pathString, &st) == -1) { if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist");
return 1; return 1;
} }
//we will scan the directory, adding all data to our lexicon, as seen inside
directoryGrind(pathString); directoryGrind(pathString);
clock_t endtotal = clock(); //we close the lexicon again, ensuring all data is secured
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
lexClose(); lexClose();
return 0; return 0;
} }
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){ //mostly a standard recursive Dirent-walk
denseRIV *denseSet_slider = denseSet;
denseRIV *dense_stop = denseSet+RIVCount;
while(denseSet_slider<dense_stop){
addS2D((*denseSet_slider).values, additive);
*(denseSet_slider->contextSize) += additive.frequency;
denseSet_slider++;
}
}
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
denseRIV* RIVStop = RIVSet+wordCount;
while(RIVSet<RIVStop){
if(!strcmp(word, RIVSet->name)){
return 1;
}
RIVSet++;
}
return 0;
}
void directoryGrind(char *rootString){ void directoryGrind(char *rootString){
/* *** begin Dirent walk *** */
char pathString[2000]; char pathString[2000];
DIR *directory; DIR *directory;
struct dirent *files = 0; struct dirent *files = 0;
...@@ -74,15 +53,13 @@ void directoryGrind(char *rootString){ ...@@ -74,15 +53,13 @@ void directoryGrind(char *rootString){
} }
while((files=readdir(directory))){ while((files=readdir(directory))){
if(setjmp(readdirRecov)){
continue;
}
//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name)); if(!files->d_name[0]) break;
while(*(files->d_name)=='.'){ while(*(files->d_name)=='.'){
files = readdir(directory); files = readdir(directory);
} }
//signal(SIGSEGV, signalSecure);
if(files->d_type == DT_DIR){ if(files->d_type == DT_DIR){
strcpy(pathString, rootString); strcpy(pathString, rootString);
...@@ -90,63 +67,87 @@ void directoryGrind(char *rootString){ ...@@ -90,63 +67,87 @@ void directoryGrind(char *rootString){
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
strcat(pathString, "/"); strcat(pathString, "/");
directoryGrind(pathString); directoryGrind(pathString);
continue;
} }
strcpy(pathString, rootString); strcpy(pathString, rootString);
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
printf("%s\n", pathString); printf("%s\n", pathString);
FILE *input = fopen(pathString, "r+"); /* *** end dirent walk, begin meat of function *** */
//check for non-txt files
char *fileEnding = pathString+strlen(pathString)-4;
if(strcmp(fileEnding, ".txt")){
printf("skipped: %s\n", files->d_name);
continue;
}
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){ if(input){
//process this file and add it's data to lexicon
fileGrind(input); fileGrind(input);
fclose(input); fclose(input);
} }
} }
} }
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
void fileGrind(FILE* textFile){ void fileGrind(FILE* textFile){
sparseRIV aggregateRIV = fileToL2Clean(textFile); //form a context vector. "clean" indicates that it will ignore any word which
fseek(textFile, 0, SEEK_SET); //contains unwanted characters
sparseRIV contextVector = fileToL2Clean(textFile);
int wordCount = 0; //an array of denseRIVs, large enough to hold all vectors
denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV)); //(we don't yet know how many vectors there will be, so we make it big enough for the maximum)
char word[200]; denseRIV* lexiconRIV;
char word[100] = {0};
while(fscanf(textFile, "%99s", word)){ while(fscanf(textFile, "%99s", word)){
//we ensure that each word exists, and is free of unwanted characters
if(feof(textFile)) break; if(feof(textFile)) break;
if(!(*word))continue; if(!(*word))continue;
if(!isWordClean((char*)word)){ if(!isWordClean((char*)word)){
continue; continue;
} }
if(checkDupe(RIVArray, word, wordCount)){
continue;
}
RIVArray[wordCount] = lexPull(word);
if(!*((RIVArray[wordCount].name))) break;
*(RIVArray[wordCount].frequency)+= 1;; //we pull the vector corresponding to each word from the lexicon
//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing); //if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(word);
wordCount++; //we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector);
} //we remove the sub-vector corresponding to the word itself
//printf("%d\n", wordCount); subtractThisWord(lexiconRIV);
addS2Ds(RIVArray, aggregateRIV, wordCount); //we log that this word has been encountered one more time
denseRIV* RIVArray_slider = RIVArray; lexiconRIV->frequency += 1;
denseRIV* RIVArray_stop = RIVArray+wordCount;
while(RIVArray_slider<RIVArray_stop){
lexPush(*RIVArray_slider); //and finally we push it back to the lexicon for permanent storage
RIVArray_slider++; lexPush(lexiconRIV);
}
free(RIVArray);
free(aggregateRIV.locations);
}
free(contextVector.locations);
} }
void readdirContingency(int sigNumber){
puts("readdir segfaulted, trying to recover"); void addContext(denseRIV* lexRIV, sparseRIV context){
longjmp(readdirRecov, 1);
//add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context);
//log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analises
lexRIV->contextSize += context.contextSize;
} }
#ifndef RIVTOOLS_H_ #ifndef RIVTOOLS_H_
#define RIVTOOLS_H_ #define RIVTOOLS_H_
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
#include "RIVLower.h" #include "RIVLower.h"
#include "RIVaccessories.h" #include "RIVaccessories.h"
#include "RIVlexicon.h"
/* lexPush writes a denseRIV to a file for permanent storage */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
* and creates a denseRIV with those attributes.
* if the file does not exist, it creates a 0 vector with the name of word
*/
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n") /* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each * and returns a sparse RIV which is the vector sum of the base RIVs of each
* word contained * word contained
...@@ -29,35 +22,29 @@ sparseRIV fileToL2(FILE *input); ...@@ -29,35 +22,29 @@ sparseRIV fileToL2(FILE *input);
*/ */
sparseRIV fileToL2Clean(FILE *data); sparseRIV fileToL2Clean(FILE *data);
/*filetoL2direct is an experiment in simplifying the process. it's slow */ /* like fileToL2 but takes a block of text */
sparseRIV fileToL2direct(FILE *data); sparseRIV textToL2(char *text);
/*cosine determines the "similarity" between two RIVs. */ /*cosine determines the "similarity" between two RIVs. */
double cosCompare(denseRIV baseRIV, sparseRIV comparator); double cosCompare(denseRIV baseRIV, sparseRIV comparator);
/*currently unused */ /*used for analysis of lexicon vectors (not simply accumulation)
sparseRIV wordtoL2(char* word); * to avoid overflow of even a 64 bit integer, vectors must be normalized
* this is an experimental approximation of true normal, which should yield
/* converts an implicit RIV (a set of unvalued locations) into a formal * some extra data about the nature of this word's context
* sparse RIV. this chooses the best method to perform the consolidation */
* and launches that function defunct right now for memory usage reasons*/
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
sparseRIV normalizeFloored(denseRIV input, int factor);
sparseRIV normalize(denseRIV input, int factor); sparseRIV normalize(denseRIV input, int factor);
int roundMultiply(int base, float divisor);
/* like fileToL2 but takes a block of text */
sparseRIV text2L2(char *text);
/* calculates the magnitude of a sparseVector */ //TODO contain integer overflow in square process /* calculates the magnitude of a sparseVector */ //TODO contain integer overflow in square process
double getMagnitudeSparse(sparseRIV input); double getMagnitudeSparse(sparseRIV input);
/* same for denseVector */
double getMagnitudeDense(denseRIV *input); //TODO consolidate these into one function
sparseRIV text2L2(char *text){ sparseRIV textToL2(char *text){
int wordCount = 0; int wordCount = 0;
unsigned char word[100] = {0}; char word[100] = {0};
int denseTemp[RIVSIZE] = {0}; int denseTemp[RIVSIZE] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved /* locations (implicit RIV) are temp stored in temp block, and moved
...@@ -71,7 +58,6 @@ sparseRIV text2L2(char *text){ ...@@ -71,7 +58,6 @@ sparseRIV text2L2(char *text){
if(!displacement){ if(!displacement){
break; break;
} }
if(!(*word)){ if(!(*word)){
break; break;
} }
...@@ -90,18 +76,16 @@ sparseRIV text2L2(char *text){ ...@@ -90,18 +76,16 @@ sparseRIV text2L2(char *text){
addI2D(denseTemp, locations, locationCount); addI2D(denseTemp, locations, locationCount);
sparseRIV output = consolidateD2S(denseTemp); sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file, untill frequency /* contextSize stores the number of words read */
* is needed to hold some more useful data point */ output.contextSize = wordCount;
output.frequency = wordCount;
output.boolean = 1;
return output; return output;
} }
sparseRIV fileToL2(FILE *data){ sparseRIV fileToL2(FILE *data){
unsigned char word[100] = {0}; char word[100] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved /* locations (implicit RIV) are temporarily stored in temp block,
* to permanent home in consolidation */ * and moved to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
int locationCount = 0; int locationCount = 0;
int denseTemp[RIVSIZE] = {0}; int denseTemp[RIVSIZE] = {0};
...@@ -129,17 +113,16 @@ sparseRIV fileToL2(FILE *data){ ...@@ -129,17 +113,16 @@ sparseRIV fileToL2(FILE *data){
addI2D(denseTemp, locations, locationCount); addI2D(denseTemp, locations, locationCount);
sparseRIV output = consolidateD2S(denseTemp); sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file */ /* contextSize records the number of words in this file */
output.frequency = wordCount; output.contextSize = wordCount;
output.boolean = 1; fseek(data, 0, SEEK_SET);
return output; return output;
} }
sparseRIV fileToL2Clean(FILE *data){ sparseRIV fileToL2Clean(FILE *data){
int denseTemp[RIVSIZE] = {0}; int denseTemp[RIVSIZE] = {0};
unsigned char word[100] = {0}; char word[100] = {0};
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
unsigned int wordCount = 0; unsigned int wordCount = 0;
...@@ -172,44 +155,24 @@ sparseRIV fileToL2Clean(FILE *data){ ...@@ -172,44 +155,24 @@ sparseRIV fileToL2Clean(FILE *data){
sparseRIV output = consolidateD2S(denseTemp); sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file */ /* frequency records the number of words in this file */
output.frequency = locationCount/NONZEROS; output.contextSize = locationCount/NONZEROS;
output.boolean = 1; fseek(data, 0, SEEK_SET);
return output; return output;
} }
//defunct temporarily, might make a return
/*sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount<RIVKey.I2SThreshold){
//direct method is faster on small datasets, but has geometric scaling on large datasets
return consolidateI2SDirect(implicit, valueCount);
}else{
// optimized for large datasets
return consolidateI2SIndirect(implicit, valueCount);
}
}*/
void aggregateWord2D(denseRIV destination, char* word){
srand(wordtoSeed((unsigned char*)word));
for(int i=0; i<NONZEROS; i++){
destination.values[(rand()%RIVSIZE)] +=1;
destination.values[(rand()%RIVSIZE)] -= 1;
}
}
double cosCompare(denseRIV baseRIV, sparseRIV comparator){ double cosCompare(denseRIV baseRIV, sparseRIV comparator){
int dot = 0; long long int dot = 0;
int n = comparator.count; int* locations_stop = comparator.locations+comparator.count;
while(n){ int* locations_slider = comparator.locations;
n--; int* values_slider = comparator.values;
while(locations_slider<locations_stop){
/* we calculate the dot-product to derive the cosine /* we calculate the dot-product to derive the cosine
* comparing sparse to dense by index*/ * comparing sparse to dense by index*/
//dot += values[i]*baseRIV.values[locations[i]]; dot += *values_slider * baseRIV.values[*locations_slider];
dot += comparator.values[n] * baseRIV.values[comparator.locations[n]]; locations_slider++;
values_slider++;
//printf("%d, %d, %d\n",baseRIV.values[comparator.locations[n]],comparator.values[n] , n);
} }
/*dot divided by product of magnitudes */ /*dot divided by product of magnitudes */
...@@ -222,181 +185,65 @@ double getMagnitudeSparse(sparseRIV input){ ...@@ -222,181 +185,65 @@ double getMagnitudeSparse(sparseRIV input){
int *values = input.values; int *values = input.values;
int *values_stop = values+input.count; int *values_stop = values+input.count;
while(values<values_stop){ while(values<values_stop){
/* we sum the squares of all elements */
temp += (*values)*(*values); temp += (*values)*(*values);
//if(temp> 0x0AFFFFFFFFFFFFFF) printf("%s, fuuuuuuuuuuuuck*****************************************",input.name );
values++; values++;
} }
/* we take the root of that sum */
return sqrt(temp); return sqrt(temp);
} }
denseRIV lexPull(char* word){ double getMagnitudeDense(denseRIV *input){
#if CACHESIZE > 0 size_t temp = 0;
int *values = input->values;
/* if there is a cache, first check if the word is cached */ int *values_stop = values+RIVSIZE;
srand(wordtoSeed((unsigned char*)word)); while(values<values_stop){
int hash = rand()%CACHESIZE; if(*values){
if(!strcmp(word, RIVKey.RIVCache[hash].name)){ temp += (*values)*(*values);
/* if word is cached, pull from cache and exit */
return RIVKey.RIVCache[hash];
}
#endif /* CACHESIZE > 0 */
/* if not, attempt to pull the word data from lexicon file */
denseRIV output;
char pathString[200];
sprintf(pathString, "%s/%s", RIVKey.lexName, word);
FILE *lexWord = fopen(pathString, "rb");
/* if this lexicon file already exists */
if(lexWord){
/* pull data from file */
output = fLexPull(lexWord);
fclose(lexWord);
}else{
/*if file does not exist, return a 0 vector (word is new to the lexicon */ //#TODO enable NO-NEW features to protect mature lexicons?
output = denseAllocate();
}
strcpy(output.name, word);
return output;
}
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
/* if there is no cache, simply push to file */
fLexPush(RIVout);
return 0;
#else /* CACHESIZE != 0 */
/* if our RIV was cached, there are two options (hopefully)
* either the RIV is still cached, and the data has been updated
* to the cache or the RIV was pushed out from under it,
* in which case it has already been pushed! move on*/
if(RIVout.cached){
return 0;
} }
values++;
srand(wordtoSeed((unsigned char*)RIVout.name));
int hash = rand()%CACHESIZE;
if(!RIVKey.RIVCache[hash].cached){
/* if there is no word in this cache slot, push to cache instead of file */
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1;
return 0;
/*if the current RIV is more frequent than the RIV holding its slot */
}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
/* push the current cache entry to a file */
int diag = fLexPush(RIVKey.RIVCache[hash]);
/* push the current RIV to cache */
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1;
return diag;
}else{
/* push current RIV to file */
fLexPush(RIVout);
} }
return 0; return sqrt(temp);
#endif /* CACHESIZE == 0 */
} }
sparseRIV fileToL2direct(FILE *data){;
unsigned char word[100] = {0};
denseRIV denseTemp;
// a temporary dense RIV is stored in the tempBlock
denseTemp.values = RIVKey.h_tempBlock;
memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
int count = 0;
while(fscanf(data, "%99s", word)){
count++;
if(feof(data)){
break;
}
if(!(*word)){
break;
}
// add word's L1 RIV to the accumulating implicit RIV
aggregateWord2D(denseTemp, (char*)word);
}
sparseRIV output = consolidateD2S(denseTemp.values);
// frequency records the number of words in this file sparseRIV normalize(denseRIV input, int factor){
output.frequency = count; /* multiplier is the scaling factor we need to bring our vector to the right size */
output.boolean = 1; float multiplier = (float)factor/(input.contextSize);
return output;
}
sparseRIV normalizeFloored(denseRIV input, int factor){ /* write to temp slot, data will go to a permanent home lower in function */
float divisor = (float)factor/(*input.contextSize); int* locations = RIVKey.h_tempBlock+RIVSIZE;
// printf("in norm: %d, %d, %f\n", *input.contextSize, factor, divisor);
int* locations = RIVKey.h_tempBlock;
int* values = locations+RIVSIZE; int* values = locations+RIVSIZE;
int count = 0;
for(int i=0; i<RIVSIZE; i++){
if(!input.values[i]) continue;
locations[count] = i;
values[count]= input.values[i]*divisor;
if(values[count])count++;
}
sparseRIV output;
output.locations = (int*) malloc(count*2*sizeof(int));
output.values = output.locations+count;
memcpy(output.locations, locations, count*sizeof(int));
memcpy(output.values, values, count*sizeof(int));
strcpy(output.name, input.name);
output.count = count;
output.magnitude = getMagnitudeSparse(output);
output.contextSize = *input.contextSize;
output.frequency = *input.frequency;
return output;
}
sparseRIV normalize(denseRIV input, int factor){
float divisor = (float)factor/(*input.contextSize);
// printf("in norm: %d, %d, %f\n", *input.contextSize, factor, divisor);
int* locations = RIVKey.h_tempBlock;
int* values = locations+RIVSIZE;
int count = 0; int count = 0;
for(int i=0; i<RIVSIZE; i++){ for(int i=0; i<RIVSIZE; i++){
/* if this point is 0, skip it */
if(!input.values[i]) continue; if(!input.values[i]) continue;
/* record position and value in the forming sparse vector */
locations[count] = i; locations[count] = i;
values[count]= roundMultiply(input.values[i], divisor); values[count]= round(input.values[i]*multiplier);
if(values[count])count++;
/* drop any 0 values */
if(values[count] > 1)count++;
} }
sparseRIV output; sparseRIV output;
output.count = count;
/* for memory conservation, both datasets are put inline with each other */
output.locations = (int*) malloc(count*2*sizeof(int)); output.locations = (int*) malloc(count*2*sizeof(int));
output.values = output.locations+count; output.values = output.locations+count;
/* copy the data from tempBlock into permanent home */
memcpy(output.locations, locations, count*sizeof(int)); memcpy(output.locations, locations, count*sizeof(int));
memcpy(output.values, values, count*sizeof(int)); memcpy(output.values, values, count*sizeof(int));
/* carry metadata */
strcpy(output.name, input.name); strcpy(output.name, input.name);
output.count = count;
output.magnitude = getMagnitudeSparse(output); output.magnitude = getMagnitudeSparse(output);
output.contextSize = *input.contextSize; output.contextSize = input.contextSize;
output.frequency = *input.frequency; output.frequency = input.frequency;
return output;
}
int roundMultiply(int base, float divisor){
float temp = base*divisor;
int output = temp*2;
if (output%2){
output/=2;
output+=1;
}else{
output/=2;
}
return output; return output;
} }
#endif #endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment