Commit 08c0a7e5 by simetk

updated

parent f258c284
File added
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <math.h>
#include "RIVLower.h"
/* Adds a sparse RIV onto a dense vector, in place.
 * destination: dense array that is indexed by every entry of input.locations
 * input: sparse vector; its first input.count location/value pairs are applied
 * returns: destination, unchanged as a pointer, so calls can be chained
 * #TODO fix destination parameter vs calloc of destination */
int* addS2D(int* destination, sparseRIV input){
    int *const stop = input.locations + input.count;
    int *value = input.values;
    /* locations and values are parallel arrays: advance both together */
    for(int *location = input.locations; location < stop; location++, value++){
        destination[*location] += *value;
    }
    return destination;
}
/* Builds a new dense vector from an implicit RIV (a list of bare locations).
 * locations: indices into the dense vector; entries alternate +1, -1
 * valueCount: number of entries in locations
 * returns: a zero-initialised array of RIVSIZE ints with the +1/-1 values
 *   applied, owned by the caller (release with free), or NULL when the
 *   allocation fails
 * #TODO fix destination parameter vs calloc of destination */
int* mapI2D(int *locations, size_t valueCount){
    int *destination = calloc(RIVSIZE, sizeof *destination);
    if(!destination){
        return NULL;
    }
    size_t i = 0;
    /* apply +1 then -1, but only while a complete pair remains */
    for(; i+1 < valueCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* odd count: the unpaired last location still gets its +1, without
     * reading past the end of the array */
    if(i < valueCount){
        destination[locations[i]] += 1;
    }
    return destination;
}
/* Adds an implicit RIV (a list of bare locations) onto a dense vector.
 * destination: dense array indexed by every entry of locations
 * locations: indices into destination; entries alternate +1, -1
 * valueCount: number of entries in locations
 * returns: destination
 * #TODO fix destination parameter vs calloc of destination */
int* addI2D(int* destination, int *locations, size_t valueCount){
    size_t i = 0;
    /* apply +1 then -1, but only while a complete pair remains */
    for(; i+1 < valueCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* odd count: the unpaired last location still gets its +1, without
     * reading past the end of the array */
    if(i < valueCount){
        destination[locations[i]] += 1;
    }
    return destination;
}
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
int *denseTemp = mapI2D(implicit, valueCount);
sparseRIV sparseOut = consolidateD2S(denseTemp);
free(denseTemp);
return sparseOut;
}
/* Converts an implicit RIV to a sparse RIV by direct search: each location is
 * looked up among the locations already collected (quadratic in valueCount).
 * implicit: list of locations; successive entries carry +1, -1, +1, ...
 * valueCount: number of entries in implicit
 * returns: a sparse RIV whose locations and values share one malloc'd block
 *   (free output.locations only); count is 0 with NULL arrays when the
 *   allocation fails */
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
    sparseRIV sparseOut;
    /* scratch space: second and third RIVSIZE-sized thirds of the temp block */
    int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
    int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
    sparseOut.count = 0;
    int add = 1;
    for(size_t i=0; i<valueCount; i++){
        int found = 0;
        for(int j=0; j<sparseOut.count; j++){
            if(implicit[i] == locationsTemp[j]){
                /* accumulate into the slot that already holds this location
                 * (index j), not the position in the implicit list (index i) */
                valuesTemp[j] += add;
                found = 1;
                break;
            }
        }
        if(!found){
            locationsTemp[sparseOut.count] = implicit[i];
            valuesTemp[sparseOut.count] = add;
            sparseOut.count++;
        }
        /* the sign alternates once per implicit entry */
        add = -add;
    }
    sparseOut.locations = malloc(2*sparseOut.count*sizeof(int));
    if(!sparseOut.locations){
        /* allocation failed (or nothing to store): report an empty vector */
        sparseOut.count = 0;
        sparseOut.values = NULL;
        return sparseOut;
    }
    sparseOut.values = sparseOut.locations+sparseOut.count;
    memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
    memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
    return sparseOut;
}
/* Converts a dense vector of RIVSIZE ints into a sparse RIV.
 * denseInput: dense array of RIVSIZE values
 * returns: a sparse RIV holding every non-zero entry; locations and values
 *   share one malloc'd block (free output.locations only). When the block
 *   cannot be allocated the result has count 0 and NULL arrays. */
sparseRIV consolidateD2S(int *denseInput){
    sparseRIV output;
    /* key/value pairs are staged in a worst-case sized temporary slot */
    int* locations = RIVKey.h_tempBlock+RIVSIZE;
    int* values = locations+RIVSIZE;
    int found = 0;
    for(int i=0; i<RIVSIZE; i++){
        /* act only on non-zeros */
        if(denseInput[i]){
            locations[found] = i;
            values[found] = denseInput[i];
            found++;
        }
    }
    output.count = found;
    /* a slot is opened for the locations/values pair */
    output.locations = malloc(2*(size_t)found*sizeof(int));
    if(!output.locations){
        /* malloc(0) may legitimately return NULL; only a non-empty request
         * that fails is an error */
        if(found){
            printf("memory allocation failed"); //*TODO enable fail point knowledge
            output.count = 0;
        }
        output.values = NULL;
        return output;
    }
    output.values = output.locations + found;
    /* copy the staged pairs into the opened slot */
    memcpy(output.locations, locations, (size_t)found*sizeof(int));
    memcpy(output.values, values, (size_t)found*sizeof(int));
    return output;
}
/* Prepares the global RIVKey state: the implicit-to-sparse threshold, the
 * SIGSEGV handler, the shared temporary block and an emptied RIV cache.
 * Exits the process if the temporary block cannot be allocated. */
void RIVInit(){
    RIVKey.I2SThreshold = sqrt(RIVSIZE);
    /* on a segmentation fault, signalSecure runs before the default action.
     * The struct is fully initialised first so that sa_mask and the
     * remaining fields are not stack garbage. Only SIGSEGV is hooked. */
    struct sigaction action;
    memset(&action, 0, sizeof action);
    sigemptyset(&action.sa_mask);
    action.sa_sigaction = signalSecure;
    action.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &action, NULL);
    /* open a slot at least large enough for worst case handling of
     * sparse to dense conversion. may be enlarged by filetoL2 functions */
    RIVKey.h_tempBlock = malloc(3*RIVSIZE*sizeof(int));
    if(!RIVKey.h_tempBlock){
        fputs("RIVInit: could not allocate the temporary block\n", stderr);
        exit(EXIT_FAILURE);
    }
    RIVKey.tempSize = 3*RIVSIZE;
    RIVKey.thing = 0;
    /* start with an empty cache of dense RIVs */
    memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
/* Flushes the RIV cache through cacheDump, reports a failed flush, then
 * releases the shared temporary block. */
void RIVCleanup(){
    int failedPushes = cacheDump();
    if(failedPushes != 0){
        puts("cache dump failed, some lexicon data was lost");
    }
    free(RIVKey.h_tempBlock);
}
/* Derives a pseudo-random seed from a NUL-terminated word.
 * word: the word, as unsigned bytes
 * returns: the sum of each byte shifted left by 5 bits per position, i.e. the
 *   letters read as digits counted in base 32.
 *
 * The arithmetic is done in unsigned int with the shift count reduced modulo
 * 32, so words longer than six characters no longer shift by the width of the
 * type or overflow a signed int (both undefined behaviour). Assumes a 32-bit
 * unsigned int. NOTE(review): the mod-32 shift count mirrors what x86 shift
 * instructions typically do, which should keep seeds for long words stable on
 * that platform; confirm against existing lexicon data. */
int wordtoSeed(unsigned char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;
    for(; *word; word++){
        seed += (unsigned int)*word << (shift & 31u);
        shift += 5;
    }
    return (int)seed;
}
/* Writes NONZEROS pseudo-random locations for a word into a locations array.
 * word: seeds the generator via wordtoSeed, so a given word always yields the
 *   same locations
 * locations: destination array
 * count: offset into locations at which writing starts */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    srand(wordtoSeed(word));
    int *slot = locations+count;
    int *const stop = slot+NONZEROS;
    /* two locations per pass: NONZEROS is expected to be even */
    while(slot<stop){
        slot[0] = rand()%RIVSIZE;
        slot[1] = rand()%RIVSIZE;
        slot += 2;
    }
}
/* Writes a dense RIV to the file lexicon/<name> and frees RIVout.values.
 * File layout: one int header (value count if stored sparse, 0 if stored
 * dense), the frequency (int), the magnitude (sizeof(float) bytes), then
 * either count locations followed by count values, or RIVSIZE dense values.
 * returns: 0 on success, 1 when the path is too long, the file cannot be
 *   opened, or a write fails.
 * NOTE(review): RIVout.values is only freed once the file has been opened,
 * as before; confirm who owns it on the early-failure paths. */
int fLexPush(denseRIV RIVout){
    char pathString[200] = {0};
    /* word data will be placed in a (new?) file under the lexicon directory
     * in a file named after the word itself */
    int pathLength = snprintf(pathString, sizeof pathString, "lexicon/%s", RIVout.name);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    sparseRIV temp = consolidateD2S(RIVout.values);
    /* low saturation is stored sparse, high saturation dense */
    int isSparse = temp.count<(RIVSIZE/2);
    /* the header is always exactly one int wide, which is what fLexPull reads */
    int header = isSparse ? (int)temp.count : 0;
    int failed = 0;
    failed |= fwrite(&header, sizeof header, 1, lexWord) != 1;
    failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
    failed |= fwrite(&RIVout.magnitude, sizeof(float), 1, lexWord) != 1;
    if(isSparse){
        size_t pairs = (size_t)header;
        if(pairs){
            failed |= fwrite(temp.locations, sizeof(int), pairs, lexWord) != pairs;
            failed |= fwrite(temp.values, sizeof(int), pairs, lexWord) != pairs;
        }
    }else{
        failed |= fwrite(RIVout.values, sizeof(int), RIVSIZE, lexWord) != RIVSIZE;
    }
    /* fclose flushes, so it can be the call that reports the write error */
    failed |= fclose(lexWord) != 0;
    free(RIVout.values);
    free(temp.locations);
    return failed ? 1 : 0;
}
/* Reads one RIV from an open lexicon file into a newly allocated dense RIV.
 * lexWord: stream positioned at the start of the record
 * returns: a dense RIV with cached = 0. The values stay all-zero unless the
 *   whole payload was read and every sparse location lies inside the vector;
 *   frequency and magnitude are zero when the metadata itself cannot be read.
 *   output.values is NULL if the allocation in denseAllocate failed. */
denseRIV fLexPull(FILE* lexWord){
    denseRIV output = denseAllocate();
    output.cached = 0;
    if(!output.values){
        return output;
    }
    int typeCheck = 0;
    /* get metadata for vector */
    if(fread(&typeCheck, sizeof(unsigned int), 1, lexWord) != 1
    || fread(output.frequency, sizeof(int), 1, lexWord) != 1
    || fread(&(output.magnitude), sizeof(int), 1, lexWord) != 1){
        *output.frequency = 0;
        output.magnitude = 0;
        return output;
    }
    /* first value stored is the value count if sparse, and 0 if dense */
    if(typeCheck){
        /* a count outside 1..RIVSIZE can only come from a damaged file */
        if(typeCheck < 0 || typeCheck > RIVSIZE){
            return output;
        }
        size_t count = (size_t)typeCheck;
        int *pairs = malloc(2*count*sizeof(int));
        if(!pairs){
            return output;
        }
        int complete = fread(pairs, sizeof(int), count, lexWord) == count
                    && fread(pairs+count, sizeof(int), count, lexWord) == count;
        /* every location is used as an index into output.values */
        for(size_t i=0; complete && i<count; i++){
            if(pairs[i] < 0 || pairs[i] >= RIVSIZE){
                complete = 0;
            }
        }
        if(complete){
            sparseRIV temp;
            temp.count = typeCheck;
            temp.locations = pairs;
            temp.values = pairs+count;
            addS2D(output.values, temp);
        }
        free(pairs);
    }else{
        /* typecheck is thrown away, just a flag in this case */
        if(fread(output.values, sizeof(int), RIVSIZE, lexWord) != RIVSIZE){
            /* short read: do not keep a partially filled vector */
            memset(output.values, 0, RIVSIZE*sizeof(int));
        }
    }
    return output;
}
/* Signal handler installed by RIVInit: dumps the cache, reports the outcome,
 * then restores the default action for the signal and re-sends it to this
 * process.
 * signum: the signal being handled
 * si, arg: part of the SA_SIGINFO handler signature, unused here
 * NOTE(review): cacheDump and puts are not async-signal-safe; confirm this
 * best-effort dump is acceptable inside a handler. */
void signalSecure(int signum, siginfo_t *si, void* arg){
    int dumpFailed = cacheDump();
    puts(dumpFailed ? "cache dump failed, some lexicon data lost"
                    : "cache dumped successfully");
    signal(signum, SIG_DFL);
    kill(getpid(), signum);
}
/* Pushes every cached entry of RIVKey.RIVCache to its lexicon file.
 * Prints how many cache slots were unused and used.
 * returns: the sum of the fLexPush return values (0 when every push worked) */
int cacheDump(){
    int unusedSlots = 0;
    int usedSlots = 0;
    int failures = 0;
    for(int slot = 0; slot < CACHESIZE; slot++){
        denseRIV *entry = &RIVKey.RIVCache[slot];
        if(!entry->cached){
            unusedSlots++;
            continue;
        }
        usedSlots++;
        failures += fLexPush(*entry);
    }
    printf("%d cacheslots unused\n%d, cacheslots used", unusedSlots, usedSlots);
    return failures;
}
/* Allocates a zero vector.
 * returns: a dense RIV whose values array holds RIVSIZE zeroed ints plus one
 *   extra slot that frequency points at; magnitude and cached are 0 and all
 *   remaining fields are zero-initialised. When the allocation fails both
 *   values and frequency are NULL. Release with free(output.values). */
denseRIV denseAllocate(){
    denseRIV output = {0};
    output.values = calloc(RIVSIZE+1, sizeof(int));
    /* for compact memory use, frequency is placed immediately after values;
     * no pointer arithmetic is done on a failed allocation */
    output.frequency = output.values ? output.values+RIVSIZE : NULL;
    output.magnitude = 0;
    output.cached = 0;
    return output;
}
/*TODO add a simplified free function*/
File added
/* Reports whether a character is allowed inside a clean word.
 * c: the character to test
 * returns: 1 for an ASCII lowercase letter, a space or an underscore,
 *   0 for anything else */
int isLetter(char c){
    int lowercase = (c >= 'a') && (c <= 'z');
    return lowercase || c == ' ' || c == '_';
}
/* Checks that a word consists only of characters accepted by isLetter.
 * word: NUL-terminated string; at most its first 99 characters are examined
 * returns: 1 when every examined character is accepted (an empty word
 *   included), 0 at the first rejected character */
int isWordClean(char* word){
    for(int i = 0; i < 99 && word[i]; i++){
        if(!isLetter(word[i])){
            return 0;
        }
    }
    return 1;
}
#include <stdio.h>
#include "RIVtools.h"
#include <dirent.h>
#include <sys/types.h>
#include <time.h>
/* Reads whitespace-separated words from wordList.txt, echoes each one, pulls
 * its dense RIV from the lexicon and converts it to sparse form.
 * returns: 0 on success, 1 when wordList.txt cannot be opened */
int main(){
    lexOpen("/home/drbob/Documents/lexicon");
    FILE *wordList = fopen("wordList.txt", "r");
    if(!wordList){
        puts("could not open wordList.txt");
        lexClose();
        return 1;
    }
    char word[100];
    denseRIV accept;
    sparseRIV analyzefloor;
    sparseRIV analyzerounded;
    sparseRIV other;
    /* only the experiment commented out below uses these two */
    (void)analyzefloor;
    (void)analyzerounded;
    /* %99s keeps a long token inside word[100]; looping on the conversion
     * count also processes a last word that is not followed by whitespace */
    while(fscanf(wordList, "%99s", word) == 1){
        puts(word);
        // sleep(1);
        accept = lexPull(word);
        other = consolidateD2S(accept.values);
        //other.magnitude = getMagnitudeSparse(other);
        // accept.magnitude = other.magnitude;
        // analyzerounded = normalize(accept, 2000);
        // analyzefloor = normalizeFloored(accept, 2000);
        // if(cosCompare(accept, analyzefloor)>1.00){
        // printf("floored: %f rounded: %f\tcontextSize: %d\tfrequency: %d\tsaturationbase %d, saturationFloored %d, saturationRounded %d\n", analyzefloor.magnitude, analyzerounded.magnitude, *(accept.contextSize), *(accept.frequency), other.count, analyzefloor.count, analyzerounded.count);
        ////}
        // free(analyzefloor.locations);
        // free(analyzerounded.locations);
        free(other.locations);
        free(accept.values);
    }
    fclose(wordList);
    lexClose();
    return 0;
}
File added
...@@ -15,7 +15,7 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount); ...@@ -15,7 +15,7 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
clock_t begintotal = clock(); clock_t begintotal = clock();
int fileCount = 0; int fileCount = 0;
RIVInit(); //RIVInit();
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV)); sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000]; char rootString[2000];
if(argc <2){ if(argc <2){
...@@ -44,6 +44,7 @@ int main(int argc, char *argv[]){ ...@@ -44,6 +44,7 @@ int main(int argc, char *argv[]){
baseDense.values = malloc(RIVSIZE*sizeof(int)); baseDense.values = malloc(RIVSIZE*sizeof(int));
fileRIVs_slider = fileRIVs; fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider; sparseRIV* comparators_slider;
int thing = 0;
int count = 0; int count = 0;
while(fileRIVs_slider<fileRIVs_stop){ while(fileRIVs_slider<fileRIVs_stop){
comparators_slider = fileRIVs; comparators_slider = fileRIVs;
...@@ -60,7 +61,7 @@ int main(int argc, char *argv[]){ ...@@ -60,7 +61,7 @@ int main(int argc, char *argv[]){
if(cosine>THRESHOLD){ if(cosine>THRESHOLD){
printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine); printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine);
(*comparators_slider).boolean = 0; (*comparators_slider).boolean = 0;
RIVKey.thing++; thing++;
} }
} }
...@@ -75,7 +76,7 @@ int main(int argc, char *argv[]){ ...@@ -75,7 +76,7 @@ int main(int argc, char *argv[]){
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC; double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("\nnsquared time:%lf\n\n", time); printf("\nnsquared time:%lf\n\n", time);
printf("\ncosines: %d \n", count); printf("\ncosines: %d \n", count);
printf("\nsims: %d \n", RIVKey.thing); printf("\nsims: %d \n", thing);
clock_t endtotal = clock(); clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC; double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent); printf("total time:%lf\n\n", time_spent);
......
File deleted
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 25000
#define CACHESIZE 0
#define NONZEROS 2
#define THRESHOLD 0.98
#include "RIVtools.h"
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
/* Duplicate finder: builds a sparse RIV for every file under argv[1], then
 * compares each file against the files before it. A pair whose magnitudes are
 * within 15% and whose cosine exceeds THRESHOLD is reported, and the earlier
 * file of the pair is DELETED from disk with remove().
 * returns: 0 on success, 1 on a missing/over-long directory argument or a
 *   failed allocation */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();
    int fileCount = 0;
    if(argc <2){
        printf("give me a directory");
        return 1;
    }
    /* build "<dir>/" without overrunning the buffer */
    char rootString[2000];
    int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
    if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
        printf("directory path is too long");
        return 1;
    }
    sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
    directoryToL2s(rootString, &fileRIVs, &fileCount);
    printf("fileCount: %d\n", fileCount);
    sparseRIV* fileRIVs_slider = fileRIVs;
    sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
    while(fileRIVs_slider <fileRIVs_stop){
        (*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
        fileRIVs_slider++;
    }
    clock_t beginnsquared = clock();
    int thing = 0;
    float cosine;
    float minmag;
    float maxmag;
    denseRIV baseDense;
    baseDense.values = malloc(RIVSIZE*sizeof(int));
    if(!baseDense.values){
        printf("memory allocation failed");
        free(fileRIVs);
        return 1;
    }
    sparseRIV* comparators_slider;
    int count = 0;
    for(fileRIVs_slider = fileRIVs; fileRIVs_slider<fileRIVs_stop; fileRIVs_slider++){
        /* expand the current file into the reusable dense vector */
        memset(baseDense.values, 0, RIVSIZE*sizeof(int));
        baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
        baseDense.magnitude = (*fileRIVs_slider).magnitude;
        minmag = baseDense.magnitude*.85;
        maxmag = baseDense.magnitude*1.15;
        /* compare only against earlier files that are still flagged */
        for(comparators_slider = fileRIVs; comparators_slider < fileRIVs_slider; comparators_slider++){
            if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){
                cosine = cosCompare(baseDense, *comparators_slider);
                count++;
                if(cosine>THRESHOLD){
                    printf("%s\t%f\n",(*comparators_slider).name, cosine);
                    /* the near-duplicate is removed from disk */
                    if(remove((*comparators_slider).name)){
                        printf(" well shit");
                    }
                    (*comparators_slider).boolean = 0;
                    thing++;
                }
            }
        }
    }
    clock_t endnsquared = clock();
    double nsquaredTime = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
    printf("\nnsquared time:%lf\n\n", nsquaredTime);
    printf("\ncosines: %d \n", count);
    printf("\nsims: %d \n", thing);
    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);
    free(baseDense.values);
    free(fileRIVs);
    return 0;
}
/* Recursively walks a directory and appends one sparse RIV per regular entry.
 * rootString: directory path, ending in '/'
 * fileRIVs: in/out, growable array of sparse RIVs (reallocated as needed)
 * fileCount: in/out, number of entries in *fileRIVs
 * Entries whose name starts with '.' are skipped. The walk of a directory
 * stops at the first file that cannot be opened or the first failed
 * reallocation; entries collected so far are kept.
 * NOTE(review): the copy into .name is unbounded because the size of that
 * field is not visible here; confirm it can hold a full path. */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files = 0;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        /* subdirectories get a trailing '/', as the recursive call expects */
        int pathLength = snprintf(pathString, sizeof pathString, "%s%s%s",
                                  rootString, files->d_name, isDirectory ? "/" : "");
        if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            /* a directory is only descended into, never read as a file */
            continue;
        }
        FILE *input = fopen(pathString, "r");
        puts(pathString);
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }
        /* grow through a temporary so a failed realloc keeps the old array */
        sparseRIV *grown = realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("memory allocation failed");
            fclose(input);
            break;
        }
        (*fileRIVs) = grown;
        (*fileRIVs)[(*fileCount)] = fileToL2(input);
        strcpy((*fileRIVs)[(*fileCount)].name, pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
File added
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 25000
#define CACHESIZE 0
#define NONZEROS 2
#define THRESHOLD 0.90
#include "RIVtools.h"
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
/* Lexicon comparer: loads a normalized sparse RIV for every word file under
 * argv[1], then compares every still-flagged word against every other
 * still-flagged word and prints the pairs whose cosine exceeds THRESHOLD.
 * returns: 0 on success, 1 on a missing/over-long directory argument or a
 *   failed allocation */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();
    int fileCount = 0;
    if(argc <2){
        printf("give me a directory");
        return 1;
    }
    /* build "<dir>/" without overrunning the buffer */
    char rootString[2000];
    int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
    if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
        printf("directory path is too long");
        return 1;
    }
    lexOpen("/home/drbob/Documents/lexicon");
    sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
    int thing = 0;
    directoryToL2s(rootString, &fileRIVs, &fileCount);
    printf("fileCount: %d\n", fileCount);
    sparseRIV* fileRIVs_slider = fileRIVs;
    sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
    while(fileRIVs_slider <fileRIVs_stop){
        (*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
        fileRIVs_slider++;
    }
    clock_t beginnsquared = clock();
    float cosine;
    denseRIV baseDense;
    baseDense.values = malloc(RIVSIZE*sizeof(int));
    if(!baseDense.values){
        printf("memory allocation failed");
        free(fileRIVs);
        lexClose();
        return 1;
    }
    sparseRIV* comparators_slider;
    int count = 0;
    /* for loops advance the slider on every path, so skipping an entry with
     * continue can never spin on the same element */
    for(fileRIVs_slider = fileRIVs; fileRIVs_slider<fileRIVs_stop; fileRIVs_slider++){
        if(!fileRIVs_slider->boolean) continue;
        if(fileRIVs_slider->magnitude == 0) continue;
        /* expand the current word into the reusable dense vector */
        memset(baseDense.values, 0, RIVSIZE*sizeof(int));
        baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
        baseDense.magnitude =(*fileRIVs_slider).magnitude;
        for(comparators_slider = fileRIVs; comparators_slider < fileRIVs_stop; comparators_slider++){
            /* skip unflagged entries and the word itself */
            if(!(comparators_slider->boolean&&strcmp(comparators_slider->name, fileRIVs_slider->name))) continue;
            if(comparators_slider->magnitude==0) continue;
            cosine = cosCompare(baseDense, *comparators_slider);
            count++;
            if(cosine>THRESHOLD){
                printf("%s\t%s\n%f\n", fileRIVs_slider->name , comparators_slider->name, cosine);
                comparators_slider->boolean = 0;
                thing++;
            }
        }
    }
    clock_t endnsquared = clock();
    double nsquaredTime = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
    printf("\nnsquared time:%lf\n\n", nsquaredTime);
    printf("\ncosines: %d \n", count);
    printf("\nsims: %d \n", thing);
    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);
    free(baseDense.values);
    free(fileRIVs);
    lexClose();
    return 0;
}
/* Pulls the lexicon entry for every name found in a directory and keeps a
 * normalized sparse copy of those whose frequency exceeds 2000.
 * rootString: directory whose entry names are used as lexicon words
 * fileRIVs: in/out, growable array of sparse RIVs (reallocated as needed)
 * fileCount: in/out, number of entries in *fileRIVs
 * Entries whose name starts with '.' are skipped; the walk stops early if the
 * array cannot be grown, keeping what was collected so far. */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files = 0;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        denseRIV temp = lexPull(files->d_name);
        if(*temp.frequency >2000){
            /* grow through a temporary so a failed realloc keeps the old array */
            sparseRIV *grown = realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
            if(!grown){
                printf("memory allocation failed");
                free(temp.values);
                break;
            }
            (*fileRIVs) = grown;
            (*fileRIVs)[(*fileCount)] = normalize(temp, 500);
            strcpy((*fileRIVs)[(*fileCount)].name, files->d_name);
            (*fileCount)++;
        }
        free(temp.values);
    }
    closedir(directory);
}
File added
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#define CACHESIZE 10000 #define CACHESIZE 15000
//#define RIVSIZE 5
#include <setjmp.h> #include <setjmp.h>
#include <signal.h> #include <signal.h>
#include "RIVtoolsCPUlinux.h" #include "RIVtools.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
...@@ -21,16 +20,21 @@ void readdirContingency(int sigNumber); ...@@ -21,16 +20,21 @@ void readdirContingency(int sigNumber);
jmp_buf readdirRecov; jmp_buf readdirRecov;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
clock_t begintotal = clock(); clock_t begintotal = clock();
RIVInit(); lexOpen("/home/drbob/Documents/lexicon");
char pathString[1000]; char pathString[1000];
strcpy(pathString, argv[1]); strcpy(pathString, argv[1]);
strcat(pathString, "/"); strcat(pathString, "/");
struct stat st = {0};
if(stat(pathString, &st) == -1) {
return 1;
}
directoryGrind(pathString); directoryGrind(pathString);
clock_t endtotal = clock(); clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC; double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent); printf("total time:%lf\n\n", time_spent);
RIVCleanup(); lexClose();
return 0; return 0;
} }
...@@ -39,14 +43,14 @@ void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){ ...@@ -39,14 +43,14 @@ void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
denseRIV *dense_stop = denseSet+RIVCount; denseRIV *dense_stop = denseSet+RIVCount;
//int *target;
while(denseSet_slider<dense_stop){ while(denseSet_slider<dense_stop){
addS2D((*denseSet_slider).values, additive); addS2D((*denseSet_slider).values, additive);
*(denseSet_slider->contextSize) += additive.frequency;
denseSet_slider++; denseSet_slider++;
} }
} }
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){ int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
denseRIV* RIVStop = RIVSet+wordCount; denseRIV* RIVStop = RIVSet+wordCount;
...@@ -73,12 +77,12 @@ void directoryGrind(char *rootString){ ...@@ -73,12 +77,12 @@ void directoryGrind(char *rootString){
if(setjmp(readdirRecov)){ if(setjmp(readdirRecov)){
continue; continue;
} }
signal(SIGSEGV, readdirContingency);
//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name)); //printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
while(*(files->d_name)=='.'){ while(*(files->d_name)=='.'){
files = readdir(directory); files = readdir(directory);
} }
//signal(SIGSEGV, SIG_DFL); //signal(SIGSEGV, signalSecure);
if(files->d_type == DT_DIR){ if(files->d_type == DT_DIR){
strcpy(pathString, rootString); strcpy(pathString, rootString);
...@@ -121,8 +125,7 @@ void fileGrind(FILE* textFile){ ...@@ -121,8 +125,7 @@ void fileGrind(FILE* textFile){
if(!*((RIVArray[wordCount].name))) break; if(!*((RIVArray[wordCount].name))) break;
int* thing = RIVArray[wordCount].frequency; *(RIVArray[wordCount].frequency)+= 1;;
*thing = *thing + 1;
//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing); //printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
wordCount++; wordCount++;
......
...@@ -33,24 +33,33 @@ sparseRIV fileToL2Clean(FILE *data); ...@@ -33,24 +33,33 @@ sparseRIV fileToL2Clean(FILE *data);
sparseRIV fileToL2direct(FILE *data); sparseRIV fileToL2direct(FILE *data);
/*cosine determines the "similarity" between two RIVs. */ /*cosine determines the "similarity" between two RIVs. */
float cosCompare(denseRIV baseRIV, sparseRIV comparator); double cosCompare(denseRIV baseRIV, sparseRIV comparator);
/*currently unused */ /*currently unused */
sparseRIV wordtoL2(char* word); sparseRIV wordtoL2(char* word);
/* converts an implicit RIV (a set of unvalued locations) into a formal /* converts an implicit RIV (a set of unvalued locations) into a formal
* sparse RIV. this chooses the best method to perform the consolidation * sparse RIV. this chooses the best method to perform the consolidation
* and launches that function */ * and launches that function defunct right now for memory usage reasons*/
sparseRIV consolidateI2S(int *implicit, size_t valueCount); sparseRIV consolidateI2S(int *implicit, size_t valueCount);
sparseRIV normalizeFloored(denseRIV input, int factor);
sparseRIV normalize(denseRIV input, int factor);
int roundMultiply(int base, float divisor);
/* like fileToL2 but takes a block of text */ /* like fileToL2 but takes a block of text */
sparseRIV text2L2(char *text); sparseRIV text2L2(char *text);
float getMagnitudeSparse(sparseRIV input);
/* calculates the magnitude of a sparseVector */ //TODO contain integer overflow in square process
double getMagnitudeSparse(sparseRIV input);
sparseRIV text2L2(char *text){ sparseRIV text2L2(char *text){
unsigned int blockSize; int wordCount = 0;
char word[100] = {0}; unsigned char word[100] = {0};
int denseTemp[RIVSIZE] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved /* locations (implicit RIV) are temp stored in temp block, and moved
* to permanent home in consolidation */ * to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
...@@ -67,38 +76,36 @@ sparseRIV text2L2(char *text){ ...@@ -67,38 +76,36 @@ sparseRIV text2L2(char *text){
break; break;
} }
blockSize = locationCount+NONZEROS; /* if this word would overflow the locations block, map it to the denseVector */
/* if this word would overflow the locations block, grow it */ if((locationCount+NONZEROS)>TEMPSIZE){
if(blockSize>RIVKey.tempSize){ addI2D(denseTemp, locations, locationCount);
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); locationCount = 0;
locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=NONZEROS;
} }
/* add word's L1 RIV to the accumulating implicit RIV */ /* add word's L1 RIV to the accumulating implicit RIV */
makeSparseLocations((unsigned char*)word, locations, locationCount); makeSparseLocations(word, locations, locationCount);
locationCount+= NONZEROS; locationCount+= NONZEROS;
wordCount++;
} }
sparseRIV output = consolidateI2S(locations, locationCount); /* map remaining locations to the denseTemp */
addI2D(denseTemp, locations, locationCount);
sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file, untill frequency /* frequency records the number of words in this file, untill frequency
* is needed to hold some more useful data point */ * is needed to hold some more useful data point */
output.frequency = locationCount/NONZEROS; output.frequency = wordCount;
output.boolean = 1; output.boolean = 1;
return output; return output;
} }
sparseRIV fileToL2(FILE *data){ sparseRIV fileToL2(FILE *data){
unsigned int blockSize;
unsigned char word[100] = {0}; unsigned char word[100] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved /* locations (implicit RIV) are temp stored in temp block, and moved
* to permanent home in consolidation */ * to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
int locationCount = 0; int locationCount = 0;
int denseTemp[RIVSIZE] = {0};
int wordCount = 0;
while(fscanf(data, "%99s", word)){ while(fscanf(data, "%99s", word)){
if(feof(data)){ if(feof(data)){
...@@ -108,24 +115,22 @@ sparseRIV fileToL2(FILE *data){ ...@@ -108,24 +115,22 @@ sparseRIV fileToL2(FILE *data){
break; break;
} }
blockSize = locationCount+NONZEROS; /* if this word would overflow the locations block, map it to the denseVector */
/* if this word would overflow the locations block, grow it */ if((locationCount+NONZEROS)>TEMPSIZE){
if(blockSize>RIVKey.tempSize){ addI2D(denseTemp, locations, locationCount);
RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); locationCount = 0;
locations = RIVKey.h_tempBlock;
RIVKey.tempSize+=NONZEROS;
} }
/* add word's L1 RIV to the accumulating implicit RIV */ /* add word's L1 RIV to the accumulating implicit RIV */
makeSparseLocations(word, locations, locationCount); makeSparseLocations(word, locations, locationCount);
locationCount+= NONZEROS; locationCount+= NONZEROS;
wordCount++;
} }
/* map remaining locations to the denseTemp */
sparseRIV output = consolidateI2S(locations, locationCount); addI2D(denseTemp, locations, locationCount);
sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file */ /* frequency records the number of words in this file */
output.frequency = locationCount/NONZEROS; output.frequency = wordCount;
output.boolean = 1; output.boolean = 1;
return output; return output;
...@@ -133,13 +138,12 @@ sparseRIV fileToL2(FILE *data){ ...@@ -133,13 +138,12 @@ sparseRIV fileToL2(FILE *data){
sparseRIV fileToL2Clean(FILE *data){ sparseRIV fileToL2Clean(FILE *data){
int denseTemp[RIVSIZE] = {0};
unsigned char word[100] = {0}; unsigned char word[100] = {0};
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
unsigned int blockSize; unsigned int wordCount = 0;
int locationCount = 0; int locationCount = 0;
while(fscanf(data, "%99s", word)){ while(fscanf(data, "%99s", word)){
if(feof(data)){ if(feof(data)){
...@@ -153,36 +157,36 @@ sparseRIV fileToL2Clean(FILE *data){ ...@@ -153,36 +157,36 @@ sparseRIV fileToL2Clean(FILE *data){
if(!isWordClean((char*)word)){ if(!isWordClean((char*)word)){
continue; continue;
} }
blockSize = locationCount+NONZEROS; /* if this word would overflow the locations block, map it to the denseVector */
if(blockSize>RIVKey.tempSize){ if((locationCount+NONZEROS)>TEMPSIZE){
RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int)); addI2D(denseTemp, locations, locationCount);
locations = RIVKey.h_tempBlock; locationCount = 0;
RIVKey.tempSize+=NONZEROS;
} }
/* add word's L1 RIV to the accumulating implicit RIV */
makeSparseLocations(word, locations, locationCount); makeSparseLocations(word, locations, locationCount);
locationCount+= NONZEROS; locationCount+= NONZEROS;
wordCount++;
} }
/* map remaining locations to the denseTemp */
sparseRIV output = consolidateI2S(locations, locationCount); addI2D(denseTemp, locations, locationCount);
sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file */ /* frequency records the number of words in this file */
output.frequency = locationCount/NONZEROS; output.frequency = locationCount/NONZEROS;
output.boolean = 1; output.boolean = 1;
return output; return output;
} }
//defunct temporarily, might make a return
sparseRIV consolidateI2S(int *implicit, size_t valueCount){ /*sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount<RIVKey.I2SThreshold){ if(valueCount<RIVKey.I2SThreshold){
/*direct method is faster on small datasets, but has geometric scaling on large datasets */ //direct method is faster on small datasets, but has geometric scaling on large datasets
return consolidateI2SDirect(implicit, valueCount); return consolidateI2SDirect(implicit, valueCount);
}else{ }else{
/* optimized for large datasets */ // optimized for large datasets
return consolidateI2SIndirect(implicit, valueCount); return consolidateI2SIndirect(implicit, valueCount);
} }
} }*/
void aggregateWord2D(denseRIV destination, char* word){ void aggregateWord2D(denseRIV destination, char* word){
...@@ -194,7 +198,7 @@ void aggregateWord2D(denseRIV destination, char* word){ ...@@ -194,7 +198,7 @@ void aggregateWord2D(denseRIV destination, char* word){
} }
} }
float cosCompare(denseRIV baseRIV, sparseRIV comparator){ double cosCompare(denseRIV baseRIV, sparseRIV comparator){
int dot = 0; int dot = 0;
int n = comparator.count; int n = comparator.count;
...@@ -209,22 +213,20 @@ float cosCompare(denseRIV baseRIV, sparseRIV comparator){ ...@@ -209,22 +213,20 @@ float cosCompare(denseRIV baseRIV, sparseRIV comparator){
} }
/*dot divided by product of magnitudes */ /*dot divided by product of magnitudes */
float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
return cosine; return dot/(baseRIV.magnitude*comparator.magnitude);
} }
float getMagnitudeSparse(sparseRIV input){ double getMagnitudeSparse(sparseRIV input){
unsigned long long int temp = 0; size_t temp = 0;
int *values = input.values; int *values = input.values;
int *values_stop = values+input.count; int *values_stop = values+input.count;
while(values<values_stop){ while(values<values_stop){
temp += (*values)*(*values); temp += (*values)*(*values);
//if(temp> 0x0AFFFFFFFFFFFFFF) printf("%s, fuuuuuuuuuuuuck*****************************************",input.name );
values++; values++;
} }
return sqrt(temp);
input.magnitude = sqrt(temp);
return input.magnitude;
} }
denseRIV lexPull(char* word){ denseRIV lexPull(char* word){
...@@ -245,7 +247,7 @@ denseRIV lexPull(char* word){ ...@@ -245,7 +247,7 @@ denseRIV lexPull(char* word){
char pathString[200]; char pathString[200];
sprintf(pathString, "lexicon/%s", word); sprintf(pathString, "%s/%s", RIVKey.lexName, word);
FILE *lexWord = fopen(pathString, "rb"); FILE *lexWord = fopen(pathString, "rb");
/* if this lexicon file already exists */ /* if this lexicon file already exists */
...@@ -285,7 +287,6 @@ int lexPush(denseRIV RIVout){ ...@@ -285,7 +287,6 @@ int lexPush(denseRIV RIVout){
RIVKey.RIVCache[hash] = RIVout; RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1; RIVKey.RIVCache[hash].cached = 1;
return 0; return 0;
/*if the current RIV is more frequent than the RIV holding its slot */ /*if the current RIV is more frequent than the RIV holding its slot */
}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){ }else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
/* push the current cache entry to a file */ /* push the current cache entry to a file */
...@@ -332,4 +333,70 @@ sparseRIV fileToL2direct(FILE *data){; ...@@ -332,4 +333,70 @@ sparseRIV fileToL2direct(FILE *data){;
return output; return output;
} }
/* Scales a dense RIV by factor/contextSize, truncating each scaled value
 * toward zero, and returns the non-zero results as a sparse RIV.
 * input: dense RIV; *input.contextSize and *input.frequency are read
 * factor: numerator of the scaling ratio
 * returns: a sparse RIV carrying input's name, contextSize and frequency,
 *   its own magnitude, and boolean = 1 (the flag the other sparse-RIV
 *   builders set and callers test). locations and values share one malloc'd
 *   block (free output.locations only). The result is empty when contextSize
 *   is 0 or the block cannot be allocated. */
sparseRIV normalizeFloored(denseRIV input, int factor){
    /* a zero context size would make the ratio infinite; scale to nothing */
    float divisor = *input.contextSize ? (float)factor/(*input.contextSize) : 0;
    // printf("in norm: %d, %d, %f\n", *input.contextSize, factor, divisor);
    /* scaled pairs are staged in the shared temp block */
    int* locations = RIVKey.h_tempBlock;
    int* values = locations+RIVSIZE;
    int count = 0;
    for(int i=0; i<RIVSIZE; i++){
        if(!input.values[i]) continue;
        locations[count] = i;
        values[count]= input.values[i]*divisor;
        /* a value that scaled down to zero is dropped by not advancing */
        if(values[count])count++;
    }
    sparseRIV output;
    output.locations = (int*) malloc(count*2*sizeof(int));
    if(!output.locations){
        /* allocation failed (or nothing to store): report an empty vector */
        count = 0;
        output.values = NULL;
    }else{
        output.values = output.locations+count;
        memcpy(output.locations, locations, count*sizeof(int));
        memcpy(output.values, values, count*sizeof(int));
    }
    strcpy(output.name, input.name);
    output.count = count;
    output.magnitude = getMagnitudeSparse(output);
    output.contextSize = *input.contextSize;
    output.frequency = *input.frequency;
    output.boolean = 1;
    return output;
}
/* Scale a dense RIV by factor/contextSize, rounding each product with
 * roundMultiply, and pack the result as a sparse RIV.
 *
 * Entries whose rounded value is zero are left out of the output.
 *
 * The returned locations and values share ONE malloc'd buffer: values points
 * just past the last location, so only output.locations is a separately
 * allocated pointer.
 * NOTE(review): the malloc result is not checked here.
 */
sparseRIV normalize(denseRIV input, int factor){
	float scale = (float)factor/(*input.contextSize);

	/* scratch area: indices first, scaled values RIVSIZE ints later */
	int* scratchLocations = RIVKey.h_tempBlock;
	int* scratchValues = scratchLocations+RIVSIZE;

	int kept = 0;
	int index = 0;
	while(index<RIVSIZE){
		if(input.values[index]){
			scratchLocations[kept] = index;
			scratchValues[kept] = roundMultiply(input.values[index], scale);
			/* only advance when the rounded value survived; otherwise
			 * the slot is reused by the next candidate */
			if(scratchValues[kept]){
				kept++;
			}
		}
		index++;
	}

	size_t halfBytes = kept*sizeof(int);

	sparseRIV output;
	output.locations = (int*) malloc(2*halfBytes);
	output.values = output.locations+kept;
	memcpy(output.locations, scratchLocations, halfBytes);
	memcpy(output.values, scratchValues, halfBytes);
	output.count = kept;
	strcpy(output.name, input.name);
	output.contextSize = *input.contextSize;
	output.frequency = *input.frequency;
	output.magnitude = getMagnitudeSparse(output);
	return output;
}
/* Multiply base by divisor and round the product to the nearest integer,
 * with halves rounded away from zero.
 *
 * The product is doubled and truncated to an int; an odd doubled value means
 * the fractional part was at least one half, so the truncated half is pushed
 * one step further from zero.
 *
 * The step must follow the sign of the product: in C, % yields a negative
 * remainder for negative operands, so always adding +1 would turn -0.6
 * into +1 and -1.5 into 0.
 *
 * base:    integer value to scale
 * divisor: scale factor applied to base (despite its name it is multiplied)
 * returns: the rounded product
 */
int roundMultiply(int base, float divisor){
	float product = base*divisor;
	int doubled = product*2;
	int rounded = doubled/2;
	if(doubled%2){
		rounded += (doubled > 0) ? 1 : -1;
	}
	return rounded;
}
#endif #endif
File added
#include <stdio.h>
#include "RIVtools.h"
/* Smoke test: open the lexicon directory, then pull one word from it.
 * The RIV returned by lexPull is discarded. */
int main(void){
	char *lexiconPath = "/home/drbob/Documents/lexicon";
	char *word = "unseemli";

	lexOpen(lexiconPath);
	lexPull(word);
	return 0;
}
This diff is collapsed. Click to expand it.
# Run ./RIVread once per argument, in order.
# Processing stops at the first empty argument (or when none are left).
clean(){
	until [ -z "$1" ]; do
		./RIVread "$1"
		shift
	done
}

# Feed every entry of the cleaned-books directory through RIVread.
clean ../bookCleaner/cleanbooks/*
#include <stdio.h>
#define RIVSIZE 25000
#include "RIVtoolsCPUlinux.h"
#include <time.h>
#define iterations 500
/* Benchmark: read two vectors from text files, then time `iterations`
 * calls of cosCompare between the dense form of the second vector and the
 * sparse form of the first.
 *
 * Returns 0 after printing the elapsed CPU time, or 1 if either input file
 * cannot be opened.
 */
int main(){
	RIVInit();
	FILE* numba1 = fopen("testfolder/numba1.txt", "r");
	FILE* numba2 = fopen("testfolder/numba2.txt", "r");

	/* both files are required; a NULL stream must never reach fileToL2 */
	if(!numba1 || !numba2){
		fputs("could not open testfolder/numba1.txt and testfolder/numba2.txt\n", stderr);
		if(numba1) fclose(numba1);
		if(numba2) fclose(numba2);
		return 1;
	}
	puts("numba1 opened successfully");
	puts("numba2 opened successfully");

	/* NOTE(review): the streams are not closed below because it is not
	 * visible here whether fileToL2 takes ownership of them - confirm. */
	sparseRIV first = fileToL2(numba1);
	sparseRIV second = fileToL2(numba2);
	first.magnitude = getMagnitudeSparse(first);

	/* expand the second vector into a dense RIV; it holds the same values
	 * as the sparse one, so the sparse magnitude is reused */
	denseRIV second2 = denseAllocate();
	second2.values = addS2D(second2.values, second);
	second2.magnitude = getMagnitudeSparse(second);

	clock_t begintotal = clock();
	for(int i=0; i<iterations; i++){
		cosCompare(second2, first);
	}
	clock_t endtotal = clock();

	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
	printf("%d comparisons took %f seconds\n", iterations, time_spent);
	return 0;
}
rabi_noun s._noun de_noun rabide@yahoo.com prospect_noun place_noun home_noun bellaire_noun tx_noun work_noun
objective_noun financial_adjective engineering_noun position_noun energy_noun trading_noun finance_noun
profile_noun over_other ten_noun year_noun diverse_adjective experience_noun risk_noun analysis_noun management_noun energy_noun sector_noun last_adjective four_noun which_other be_verb trading_noun finance_noun
analytical_noun quantitative_adjective skill_noun structuring_noun pricing_noun energy_noun derivative_noun
expertise_noun trading_noun derivative_noun development_noun trade_noun analytic_noun exposure_noun management_noun risk_verb structure_verb e&p_noun project_noun finance_noun transaction_noun
experience_noun shell_noun capital_noun inc._noun houston_noun tx_noun
present_adjective vice_noun president_noun reports_noun chief_noun financial_noun officer_noun responsible_adjective devise_verb strategy_noun manage_verb price_verb market_noun credit_noun risk_noun within_other structured_adjective transaction_noun
design_verb execute_verb oil_noun gas_noun hedge_noun eight_noun domestic_adjective two_noun international_adjective transaction_noun involve_verb over_other million_noun capital_noun risk_noun
develop_verb implement_verb framework_noun identification_noun mitigation_noun pricing_noun risk_noun producer_noun finance_noun transaction_noun
provide_verb sophisticated_adjective simulation_noun modeling_noun support_noun financial_adjective engineering_noun solution_noun e&p_noun finance_noun leasing_noun small_adjective business_noun finance_noun
led_verb development_noun computational_adjective infrastructure_noun risk_noun modeling_noun pricing_noun
shell_noun oil_noun products_noun company_noun houston_noun tx_noun
trade_noun analytics_noun developer_noun derivatives_noun trader_noun traded_noun future_noun option_noun otc_noun derivative_noun crude_adjective oil_noun heating_noun oil_noun gasoline_noun
manage_verb net_adjective hydrocarbon_noun exposure_noun company_noun
develop_verb analytic_noun identify_verb speculative_adjective program_noun trading_noun opportunity_noun e.g._other refinery_noun margin_noun protection_noun
carry_verb out_adverb simulation_noun back_adverb testing_noun risk_noun adjusted_adjective performance_noun measurement_noun trading_noun strategy_noun
price_verb embedded_adjective cap_noun devise_verb strategy_noun option_noun replication_noun dynamic_adjective hedging_noun
shell_noun e&p_noun technology_noun company_noun houston_noun tx_noun
senior_noun research_noun engineer_noun research_noun engineer_noun use_verb reliability_noun analysis_noun solve_verb wide_adjective variety_noun engineering_noun problem_noun
model_verb environmental_adjective structural_adjective response_noun develop_verb design_noun code_noun criterion_noun carry_verb out_adverb decision_noun analysis_noun under_other uncertainty_noun surface_noun system_noun selection_noun etc._other
deliver_verb enable_verb technology_noun risk-based_adjective design_noun recipe_noun development_noun complex_adjective engineering_noun system_noun range_verb billion-dollar_adjective tension_noun leg_noun platform_noun requalification_noun aging_noun fleet_noun offshore_adjective jacket_noun structure_noun
brown_adjective root_noun inc._noun houston_noun tx_noun
naval_noun architect_noun software_noun troubleshooter_noun carry_verb out_adverb naval_noun architectural_noun design_noun motion_noun response_noun modeling_noun downtime_noun analysis_noun environmental_adjective datum_noun base_noun management_noun software_noun development_noun maintenance_noun support_noun offshore_adjective structure_noun design_noun construction_noun
education_noun university_noun california_noun berkeley_noun ca_noun
ph.d._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun offshore_noun structural_noun system_noun reliability_noun wave-load_noun modeling_noun system_noun behavior_noun analysis_noun
probabilistically_adverb model_verb multidimensional_adjective hazard_noun effect_noun performance_noun complicated_adjective system_noun develop_verb methodology_noun characterize_verb system_noun failure_noun risk_noun
work_verb research_noun associate_noun reliability_noun marine_noun structures_noun center_noun stanford_noun university_noun consultant_noun offshore_adjective oil_noun gas_noun industry_noun
university_noun california_noun berkeley_noun ca_noun
m.s._noun naval_noun architecture_noun offshore_noun engineering_noun minor_noun statistics_noun structure_noun thesis_noun simulation_noun random_noun seaway_noun towing_noun tank_noun random_noun walk_verb frequency_noun method_noun
work_verb research_noun assistant_noun develop_verb software_noun time_noun series_noun analysis_noun model_noun testing_noun calibration_noun
indian_noun institute_noun technology_noun kharagpur_noun india_noun
b.tech._noun naval_noun architecture_noun graduate_verb first_adjective class_noun honor_noun rank_verb first_adverb class_noun
relevant_adjective training_noun credit_noun risk_noun modeling_noun stanford_noun university_noun stanford_noun ca_noun
october_noun finance_noun accounting_noun executive_noun rice_noun university_noun houston_noun tx_noun
august_noun training_noun modules_noun product_noun knowledge_noun structured_noun project_noun finance_noun securitization_noun credit_noun strategy_noun in-house_noun training_noun dc_noun gardener_noun euromoney_noun
january_noun april_noun risk_noun risk_noun conference_noun washington_noun d.c._noun june_noun economics_noun supply_noun refining_noun marketing_noun stone_noun bond_noun corp._noun houston_noun tx_noun
april_noun understand_verb apply_verb financial_adjective mathematics_noun energy_noun derivative_noun efficient_adjective pricing_noun trading_noun risk_noun management_noun risk_noun conferences_noun new_noun york_noun ny_noun
march_noun practical_noun strategic_noun application_noun var_noun energy_noun industries_noun risk_noun conferences_noun houston_noun tx_noun
december_noun financial_noun modeling_noun s-plus_noun mathsoft_noun new_noun york_noun ny_noun
october_noun option_noun analytic_noun pricing_noun option_noun exotic_noun options_noun cibc_noun school_noun financial_noun products_noun houston_noun tx_noun
september_noun fundamental_noun energy_noun basis_noun trading_noun princeton_noun energy_noun houston_noun tx_noun
feb_noun energy_noun derivatives_noun price_noun risk_noun management_noun energy_noun institute_noun univ._noun houston_noun houston_noun tx_noun
january_noun april_noun latest_adjective development_noun advanced_noun mathematics_noun derivative_noun risk_noun conference_noun new_noun york_noun ny_noun
december_noun options_noun seminar_noun nymex_noun houston_noun tx_noun
october_noun
select_verb honors_noun activities_noun present_verb seminar_noun credit_noun risk_noun e&p_noun mezzanine_noun finance_noun global_noun association_noun risk_noun professional_noun houston_noun chapter_noun tx_noun
june_noun special_adjective recognition_noun award_noun shell_noun oil_noun products_noun company_noun
committee_noun membership_noun panelist_noun author_noun lecturer_noun publication_noun reviewer_noun etc._other
asce_noun api_noun otc_noun asme_noun omae_noun etc._other
receive_verb omae_noun award_noun american_noun society_noun mechanical_adjective engineering_noun recognition_noun outstanding_adjective originality_noun significance_noun paper_noun title_verb development_noun reliability-based_adjective global_adjective design_noun equation_noun tension_noun leg_noun platforms_noun
short_adjective course_noun instructor_noun seminar_noun speaker_noun university_noun texas_noun austin_noun rice_noun university_noun university_noun houston_noun
sea_noun grant_noun association_noun award_noun excellence_noun research_noun sea_noun grant_noun association_noun usa_noun
institute_noun silver_noun medal_noun indian_noun institute_noun technology_noun kharagpur_noun india_noun
national_noun science_noun talent_noun search_verb scholarship_noun government_noun india_noun
personal_adjective data_noun date_noun birth_noun september_noun us_noun citizen_noun marry_verb one_noun child_noun
reference_noun available_adjective upon_other request_noun
document_noun properties_noun title_noun rabi_noun s_noun author_noun shell_noun chemical_noun company_noun template_noun normal_adjective last_adjective save_verb grady_adjective revision_noun number_noun application_noun microsoft_noun word_noun
total_adjective editing_noun time_noun last_adjective print_verb create_verb last_adjective save_verb company_noun shell_noun chemical_noun company_noun
meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun
calendar_noun entry_noun appointment_noun
description_noun meet_verb discuss_verb non-grid_adjective am_noun process_verb attendee_noun julia_noun lynn_noun steve_noun sheila_noun jerry_noun conference_noun room_noun
date_noun time_noun pm_noun pm_noun central_noun standard_noun time_noun
detailed_adjective description_noun united_noun states_noun license_noun
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment