Commit 60856c1d by etcart

Added a sorted cache for slower hard drives

parent 47297b52
File deleted
...@@ -60,7 +60,7 @@ typedef struct{ ...@@ -60,7 +60,7 @@ typedef struct{
char name[100]; char name[100];
int *values; int *values;
int *locations; int *locations;
int count; size_t count;
int frequency; int frequency;
int contextSize; int contextSize;
float magnitude; float magnitude;
......
#ifndef RIVACCESS_H_ #ifndef RIVACCESS_H_
#define RIVACCESS_H_ #define RIVACCESS_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct treenode{
void* data;
struct treenode* links[26];
int downstream;
};
int treecut(struct treenode* node, char* letter);
void stemInsert(struct treenode* node, char* letter, char* data);
void RIVinsert(struct treenode* node, char* letter, void* data);
void* treeSearch(struct treenode* node, char* letter);
struct treenode* stemTreeSetup();
/*isWordClean filters words that contain non-letter characters, and /*isWordClean filters words that contain non-letter characters, and
...@@ -16,7 +32,7 @@ int wordtoSeed(char* word); ...@@ -16,7 +32,7 @@ int wordtoSeed(char* word);
int isLetter(char c){ int isLetter(char c){
if((c>96 && c<123)||(c == 32) || (c == '_')) return 1; if((c>96 && c<123)||(c == 32)) return 1;
else return 0; else return 0;
} }
int isWordClean(char* word){ int isWordClean(char* word){
...@@ -47,5 +63,106 @@ int wordtoSeed(char* word){ ...@@ -47,5 +63,106 @@ int wordtoSeed(char* word){
} }
return seed; return seed;
} }
/*
 * stemTreeSetup: build the word -> stem lookup trie from "stemnet2.txt".
 *
 * The file is read as whitespace-separated "word stem" pairs, and every
 * complete pair is handed to stemInsert. Reading stops at end of file or at
 * the first incomplete pair.
 *
 * Returns the root of the new trie (owned by the caller), or NULL when the
 * file cannot be opened or the root node cannot be allocated.
 */
struct treenode* stemTreeSetup(){
	FILE* netfile = fopen("stemnet2.txt", "r");
	if(!netfile){
		printf("no stemnet file");
		return 0;
	}

	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
	if(!rootNode){
		fclose(netfile);
		return NULL;
	}

	char word[100];
	char stem[100];
	/* %99s bounds each token to the 100-byte buffers (99 chars + NUL).
	 * Comparing against 2 stops cleanly on EOF (fscanf returns EOF, which is
	 * non-zero) and still accepts a final pair with no trailing newline. */
	while(fscanf(netfile, "%99s %99s", word, stem) == 2){
		stemInsert(rootNode, word, stem);
	}

	fclose(netfile);
	return rootNode;
}
/*
 * treeSearch: look up the key `letter` in the trie rooted at `node`.
 *
 * Follows one link per character of the key. Returns the payload stored at
 * the node where the key ends, or NULL when
 *   - `node` or `letter` is NULL,
 *   - the key contains a character outside 'a'..'z' (such a key cannot be
 *     represented in the 26-slot links array), or
 *   - the path for the key does not exist / carries no payload.
 */
void* treeSearch(struct treenode* node, char* letter){
	if(!node || !letter){
		return NULL;
	}
	while(*letter){
		/* links[] only has slots for 'a'..'z'; indexing with anything else
		 * would read outside the array */
		if(*letter < 'a' || *letter > 'z'){
			return NULL;
		}
		node = node->links[*letter - 'a'];
		if(!node){
			return NULL;
		}
		letter++;
	}
	return node->data;
}
/*
 * RIVinsert: store the pointer `data` under the key `letter` in the trie
 * rooted at `node`. The pointer is stored as-is (no copy is made).
 *
 * The call is a no-op, leaving the trie untouched, when
 *   - `node`, `letter` or `data` is NULL (a NULL payload would be
 *     indistinguishable from "key absent" on lookup),
 *   - the key contains a character outside 'a'..'z', or
 *   - the key already has a payload (the first insertion wins).
 *
 * On a successful insert, `downstream` is incremented once on every node
 * from the root to the key's terminal node, so the counters reflect the
 * number of stored keys passing through each node.
 */
void RIVinsert(struct treenode* node, char* letter, void* data){
	if(!node || !letter || !data){
		return;
	}

	/* validate the key and find out whether it is already stored, before
	 * anything is modified */
	struct treenode* probe = node;
	for(char* c = letter; *c; c++){
		if(*c < 'a' || *c > 'z'){
			return;
		}
		if(probe){
			probe = probe->links[*c - 'a'];
		}
	}
	if(probe && probe->data){
		return;
	}

	/* walk the path, creating missing nodes and counting the new key */
	struct treenode* cur = node;
	char* c;
	for(c = letter; *c; c++){
		struct treenode** slot = &cur->links[*c - 'a'];
		if(!*slot){
			*slot = calloc(1, sizeof(struct treenode));
			if(!*slot){
				break;
			}
		}
		cur->downstream++;
		cur = *slot;
	}

	if(*c){
		/* allocation failed part-way: take back the counts added so far */
		cur = node;
		for(char* undo = letter; undo < c; undo++){
			cur->downstream--;
			cur = cur->links[*undo - 'a'];
		}
		return;
	}

	cur->downstream++;
	cur->data = data;
}
/*
 * stemInsert: store a private copy of the string `data` under the key
 * `letter` in the trie rooted at `node`.
 *
 * The call is a no-op, leaving the trie untouched, when
 *   - `node`, `letter` or `data` is NULL,
 *   - the key contains a character outside 'a'..'z',
 *   - the key already has a payload (the first insertion wins), or
 *   - memory for the copy or for a new node cannot be allocated.
 *
 * On a successful insert, `downstream` is incremented once on every node
 * from the root to the key's terminal node.
 */
void stemInsert(struct treenode* node, char* letter, char* data){
	if(!node || !letter || !data){
		return;
	}

	/* validate the key and find out whether it is already stored, before
	 * anything is modified */
	struct treenode* probe = node;
	for(char* c = letter; *c; c++){
		if(*c < 'a' || *c > 'z'){
			return;
		}
		if(probe){
			probe = probe->links[*c - 'a'];
		}
	}
	if(probe && probe->data){
		return;
	}

	/* make the copy first so an allocation failure costs nothing */
	char* copy = calloc(strlen(data)+1, sizeof(char));
	if(!copy){
		return;
	}
	strcpy(copy, data);

	/* walk the path, creating missing nodes and counting the new key */
	struct treenode* cur = node;
	char* c;
	for(c = letter; *c; c++){
		struct treenode** slot = &cur->links[*c - 'a'];
		if(!*slot){
			*slot = calloc(1, sizeof(struct treenode));
			if(!*slot){
				break;
			}
		}
		cur->downstream++;
		cur = *slot;
	}

	if(*c){
		/* allocation failed part-way: take back the counts added so far */
		cur = node;
		for(char* undo = letter; undo < c; undo++){
			cur->downstream--;
			cur = cur->links[*undo - 'a'];
		}
		free(copy);
		return;
	}

	cur->downstream++;
	cur->data = copy;
}
int treecut(struct treenode* node, char* letter){
node->downstream--;
int flag;
if(*(letter)){
if(node->links[*(letter)-'a']){
flag = treecut(node->links[*(letter)-'a'], letter+1);
if(flag){
node->links[*(letter)-'a'] = NULL;
}
}
if(!node->downstream){
free(node);
return 1;
}
}else{
free(node);
return 1;
}
return 0;
}
#endif #endif
File deleted
...@@ -2,82 +2,74 @@ ...@@ -2,82 +2,74 @@
#define RIVSIZE 50000 #define RIVSIZE 50000
#include "RIVtools.h" #include "RIVtools.h"
char* clean(char* word); char* clean(char* word);
char* stem(char* word); char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
typedef char label[200]; typedef char label[200];
struct RIVclass{ struct RIVclass{
label name; label name;
sparseRIV* set; sparseRIV* set;
int setSize; int setSize;
}; };
LEXICON* lexicon;
int main(){ int main(){
struct treenode* searchRoot = stemTreeSetup();
lexOpen("lexicon", "rx"); lexicon = lexOpen("consolidatedLexicon", "rx");
int classNo = 0; int classNo = 0;
label className = "tempName";
label* classNames = calloc(1, sizeof(label)); label* classNames = calloc(1, sizeof(label));
int classCount = 0; int classCount = 0;
struct RIVclass* classes = malloc(sizeof(struct RIVclass)); struct RIVclass* classes = malloc(sizeof(struct RIVclass));
strcpy(classes[classCount].name, className);
strcpy(classNames[classCount], className);
classCount++;
while(1){
FILE* text = fopen("../bookCleaner/cleanbooks/pg56902clean.txt", "r");
if(!text){
puts("no file");
return 1;
}
denseRIV accumulate = {0};
sparseRIV temp;
char word[100];
while(fscanf(text, "%99s", word)){
if(feof(text)) break;
if(!*word) break;
if(!*clean(word)) continue;
//if(stem(word)){
denseRIV* wordRIV = lexPull(word);
if(!wordRIV){
//printf("%s, not in lexicon\n", word);
continue;
}else{
//printf("%s, succesfully pulled\n", word);
temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp);
free(temp.locations);
free(wordRIV);
}
}
struct RIVclass* class = classes+classNo;
class->set = malloc(sizeof(sparseRIV));
class->setSize = 0;
class->set[class->setSize] = consolidateD2S(accumulate.values);
class->setSize++;
FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r");
if(!textSet){
puts("no file");
return 1;
}
struct RIVclass* class;
char text[20000];
label className;
while(fscanf(textSet, "%s\t%s", text, className)){
char* labelTemp = strstr(*classNames, className); char* labelTemp = strstr(*classNames, className);
if(!labelTemp){ if(!labelTemp){
classNames = realloc(classNames, classCount*sizeof(label)); /* reinitialize the classnames with a new member */
classNames = realloc(classNames, (classCount+1)*sizeof(label));
strcpy(classNames[classCount], className); strcpy(classNames[classCount], className);
/* reinitialize the classes with a new member */
classes = realloc(classes, (classCount+1)*sizeof(struct RIVclass));
class = classes+classCount;
class->set = malloc(sizeof(sparseRIV));
strcpy(class->name, className);
class->setSize = 0;
classNo = classCount;
classCount++; classCount++;
}else{ }else{
classNo = (labelTemp-*classNames); classNo = (labelTemp-*classNames);
class = classes+classNo;
} }
class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
sparseRIV thing= line2L3(text, searchRoot);
class->set[class->setSize] = thing;
class->setSize++;
}
for(int i=0; i<classCount; i++){
puts(classNames[i]);
printf("%d\n\n", classes[i].setSize);
} }
...@@ -103,44 +95,63 @@ char* clean(char* word){ ...@@ -103,44 +95,63 @@ char* clean(char* word){
strcpy(word,output); strcpy(word,output);
return word; return word;
} }
char* stem(char* word){
char pathString[200]; char* stemmy(struct treenode* searchRoot, char* word){
int WNdata;
sprintf(pathString, "WN/%s", word);
FILE* WNfile = fopen(pathString, "r");
if(!WNfile) return NULL; return treeSearch(searchRoot , word);
fscanf(WNfile, "%d", &WNdata); }
sparseRIV line2L3(char* text, struct treenode* searchRoot){
if(!WNdata) {
fclose(WNfile);
return NULL;
}
if(WNdata == 1) { denseRIV accumulate = {0};
fclose(WNfile); sparseRIV temp;
return word; char* textEnd = text+strlen(text);
} char word[100];
if(WNdata == 2){ int displacement;
fscanf(WNfile, "%s", word); while(text<textEnd){
fclose(WNfile); sscanf(text, "%99s%n", word, &displacement);
sprintf(pathString, "WN/%s", word); text += displacement+1;
WNfile = fopen(pathString, "r"); if(!displacement){
if(!WNfile) return NULL; break;
}
fscanf(WNfile, "%*d%s", word); if(!(*word)){
return word; break;
}
if(!*clean(word)) continue;
char* stem = stemmy(searchRoot, word);
if(stem){
denseRIV* wordRIV = lexPull(lexicon, stem);
if(!wordRIV){
//printf("%s, not in lexicon\n", stem);
continue;
}else{
//printf("%s, succesfully pulled\n", stem);
temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp);
free(temp.locations);
free(wordRIV);
}
}
} }
return NULL; temp = consolidateD2S(accumulate.values);
return temp;
} }
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#include <dirent.h> #include <dirent.h>
int main(int argc, char* argv[]){ int main(int argc, char* argv[]){
lexOpen(argv[1]); LEXICON* lexicon = lexOpen(argv[1], "rx");
denseRIV* intake; denseRIV* intake;
sparseRIV examine; sparseRIV examine;
static denseRIV *output[60000] = {0}; static denseRIV *output[60000] = {0};
...@@ -26,13 +26,13 @@ int main(int argc, char* argv[]){ ...@@ -26,13 +26,13 @@ int main(int argc, char* argv[]){
continue; continue;
} }
j++; j++;
intake = lexPull(files->d_name); intake = lexPull(lexicon, files->d_name);
/* if the vector has been encountered more than MINSIZE times /* if the vector has been encountered more than MINSIZE times
* then it should be statistically significant, and useful */ * then it should be statistically significant, and useful */
if(intake->contextSize<7000){ /*if(intake->contextSize<7000){
free(intake); free(intake);
continue; continue;
} }*/
examine = normalize(*intake, 10000); examine = normalize(*intake, 10000);
strcpy(examine.name, files->d_name); strcpy(examine.name, files->d_name);
printf("%d,%d,%lf,%d,%d\n", examine.frequency, examine.contextSize, examine.magnitude, i, j); printf("%d,%d,%lf,%d,%d\n", examine.frequency, examine.contextSize, examine.magnitude, i, j);
...@@ -46,14 +46,14 @@ int main(int argc, char* argv[]){ ...@@ -46,14 +46,14 @@ int main(int argc, char* argv[]){
free(examine.locations); free(examine.locations);
i++; i++;
} }
lexClose(); lexClose(lexicon);
lexOpen("consolidatedLexicon50-8"); lexicon = lexOpen("consolidatedLexicon", "wx");
for(int j=0; j<i; j++){ for(int j=0; j<i; j++){
lexPush(output[j]); lexPush(lexicon, output[j]);
} }
lexClose(); lexClose(lexicon);
return 0; return 0;
} }
File deleted
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#include <dirent.h> #include <dirent.h>
int main(int argc, char* argv[]){ int main(int argc, char* argv[]){
lexOpen(argv[1]); LEXICON* lexicon = lexOpen(argv[1], "r");
denseRIV* intake; denseRIV* intake;
DIR *directory; DIR *directory;
struct dirent *files = 0; struct dirent *files = 0;
...@@ -23,7 +23,7 @@ int main(int argc, char* argv[]){ ...@@ -23,7 +23,7 @@ int main(int argc, char* argv[]){
continue; continue;
} }
intake = lexPull(files->d_name); intake = lexPull(lexicon, files->d_name);
/* if the vector has been encountered more than MINSIZE times /* if the vector has been encountered more than MINSIZE times
* then it should be statistically significant, and useful */ * then it should be statistically significant, and useful */
...@@ -34,7 +34,7 @@ int main(int argc, char* argv[]){ ...@@ -34,7 +34,7 @@ int main(int argc, char* argv[]){
i++; i++;
} }
lexClose(); lexClose(lexicon);
return 0; return 0;
} }
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#ifndef SORTCACHE #ifndef SORTCACHE
#ifndef HASHCACHE #ifndef HASHCACHE
#define HASHCACHE #define SORTCACHE
#endif #endif
#endif #endif
typedef struct{ typedef struct{
...@@ -31,6 +31,9 @@ typedef struct{ ...@@ -31,6 +31,9 @@ typedef struct{
denseRIV* *cache; denseRIV* *cache;
struct cacheList* listPoint; struct cacheList* listPoint;
char flags; char flags;
#ifdef SORTCACHE
struct treenode* treeRoot;
#endif /* SORTCACHE */
}LEXICON; }LEXICON;
struct cacheList{ struct cacheList{
denseRIV* *cache; denseRIV* *cache;
...@@ -145,7 +148,14 @@ LEXICON* lexOpen(const char* lexName, const char* flags){ ...@@ -145,7 +148,14 @@ LEXICON* lexOpen(const char* lexName, const char* flags){
/* if we will be reading and writing the same lexicon, setup a /* if we will be reading and writing the same lexicon, setup a
* cache for this lexicon to speed up rewrites */ * cache for this lexicon to speed up rewrites */
struct cacheList* newCache = calloc(1, sizeof(struct cacheList)); struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
#ifdef HASHCACHE
newCache->cache = calloc(CACHESIZE, sizeof(denseRIV*)); newCache->cache = calloc(CACHESIZE, sizeof(denseRIV*));
#else
#ifdef SORTCACHE
newCache->cache = calloc(CACHESIZE+1, sizeof(denseRIV*));
output->treeRoot = calloc(1, sizeof(struct treenode));
#endif
#endif
output->flags |= CACHEFLAG; output->flags |= CACHEFLAG;
output->cache = newCache->cache; output->cache = newCache->cache;
...@@ -188,22 +198,7 @@ void lexClose(LEXICON* toClose){ ...@@ -188,22 +198,7 @@ void lexClose(LEXICON* toClose){
free(toClose); free(toClose);
} }
int cacheDump(denseRIV* *toDump){
int flag = 0;
denseRIV* *toDump_slider = toDump;
denseRIV* *toDump_stop = toDump+CACHESIZE;
while(toDump_slider<toDump_stop){
if(*toDump_slider){
flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
}
toDump_slider++;
}
free(toDump);
return flag;
}
#if CACHESIZE > 0 #if CACHESIZE > 0
denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
...@@ -212,7 +207,6 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){ ...@@ -212,7 +207,6 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
int hash = rand()%CACHESIZE; int hash = rand()%CACHESIZE;
if(lexicon->cache[hash]){ if(lexicon->cache[hash]){
if(!strcmp(word, lexicon->cache[hash]->name)){ if(!strcmp(word, lexicon->cache[hash]->name)){
/* if word is cached, pull from cache and exit */ /* if word is cached, pull from cache and exit */
return lexicon->cache[hash]; return lexicon->cache[hash];
} }
...@@ -221,25 +215,18 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){ ...@@ -221,25 +215,18 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
#endif #endif
#ifdef SORTCACHE #ifdef SORTCACHE
return treeSearch(lexicon->treeRoot, word);
#endif #endif
} }
#if CACHESIZE > 0
int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
/* if our RIV was cached already, no need to play with it */ /* if our RIV was cached already, no need to play with it */
if(RIVout->cached == lexicon){ if(RIVout->cached == lexicon){
return 1; return 1;
} }
#if HASHCACHE #ifdef HASHCACHE
srand(wordtoSeed(RIVout->name)); srand(wordtoSeed(RIVout->name));
int hash = rand()%CACHESIZE; int hash = rand()%CACHESIZE;
...@@ -262,15 +249,40 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){ ...@@ -262,15 +249,40 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
return 1; return 1;
} }
return 0; return 0;
#endif #endif /* HASHCACHE */
#if SORTCACHE #ifdef SORTCACHE
denseRIV* *cache_slider = lexicon->cache;
while(*cache_slider){
if(RIVout->frequency > (*cache_slider)->frequency){
memcpy(cache_slider+1, cache_slider, CACHESIZE-(cache_slider-lexicon->cache));
#endif if(lexicon->cache[CACHESIZE]){
fLexPush(lexicon, lexicon->cache[CACHESIZE]);
//remove tree element
treecut(lexicon->treeRoot, RIVout->name);
lexicon->cache[CACHESIZE] = NULL;
}
RIVout->cached = lexicon;
*cache_slider = RIVout;
//add tree element
RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
return 1;
}
cache_slider++;
}
if(cache_slider-lexicon->cache < CACHESIZE){
RIVout->cached = lexicon;
*cache_slider = RIVout;
RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
//add tree element
return 1;
}
return 0;
#endif /* SORTCACHE */
} }
#endif
#endif #endif
denseRIV* lexPull(LEXICON* lexicon, char* word){ denseRIV* lexPull(LEXICON* lexicon, char* word){
...@@ -302,6 +314,7 @@ denseRIV* lexPull(LEXICON* lexicon, char* word){ ...@@ -302,6 +314,7 @@ denseRIV* lexPull(LEXICON* lexicon, char* word){
}else{ }else{
/* if lexicon is set to inclusive (can gain new words) */ /* if lexicon is set to inclusive (can gain new words) */
if(lexicon->flags & INCFLAG){ if(lexicon->flags & INCFLAG){
/*if file does not exist, return a 0 vector (word is new to the lexicon) */ /*if file does not exist, return a 0 vector (word is new to the lexicon) */
output = calloc(1, sizeof(denseRIV)); output = calloc(1, sizeof(denseRIV));
strcpy(output->name, word); strcpy(output->name, word);
...@@ -339,11 +352,12 @@ int saturationForStaging(denseRIV* output){ ...@@ -339,11 +352,12 @@ int saturationForStaging(denseRIV* output){
int* count = IOstagingSlot; int* count = IOstagingSlot;
*count = 0; *count = 0;
*(count+1) = output->frequency; *(count+1) = 0;
*(count+2) = output->contextSize; *(count+2) = output->frequency;
*(float*)(count+3) = output->magnitude; *(count+3) = output->contextSize;
*(float*)(count+4) = output->magnitude;
int* locations = IOstagingSlot+4; int* locations = IOstagingSlot+5;
int* values = IOstagingSlot-RIVSIZE;; int* values = IOstagingSlot-RIVSIZE;;
int* locations_slider = locations; int* locations_slider = locations;
int* values_slider = values; int* values_slider = values;
...@@ -384,7 +398,7 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){ ...@@ -384,7 +398,7 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name); printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
return 1; return 1;
} }
fwrite(IOstagingSlot, (saturation*2)+4, sizeof(int), lexWord); fwrite(IOstagingSlot, (saturation*2)+5, sizeof(int), lexWord);
fclose(lexWord); fclose(lexWord);
}else{ }else{
output->cached = 0; output->cached = 0;
...@@ -393,45 +407,10 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){ ...@@ -393,45 +407,10 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name); printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
return 1; return 1;
} }
fwrite(((int*)&output->cached)+1, sizeof(int), RIVSIZE+4, lexWord); fwrite(((int*)&output->cached), sizeof(int), RIVSIZE+5, lexWord);
fclose(lexWord); fclose(lexWord);
} }
/* older way of writing, kept while debugging new */
//~ if(temp.count<(RIVSIZE/2)){
//~ /* smaller stored as sparse vector */
//~ *writeStaging = temp.count;
//~ stagingSize = sizeof(temp.count);
//~ memcpy(writeStaging+stagingSize, &RIVout.frequency, sizeof(int)*3);
//~ stagingSize += sizeof(int)*3;
//~ memcpy(writeStaging+stagingSize, temp.locations, temp.count*2*sizeof(int));
//~ stagingSize += temp.count*2*sizeof(int);
//~ fwrite(writeStaging, 1, stagingSize, lexWord);
//~ /*fwrite(&temp.count, 1, sizeof(size_t), lexWord);
//~ fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
//~ fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
//~ fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
//~ fwrite(temp.locations, temp.count, sizeof(int), lexWord);
//~ fwrite(temp.values, temp.count, sizeof(int), lexWord);*/
//~ }else{
//~ /* saturation is too high, better to store dense */
//~ /* there's gotta be a better way to do this */
//~ *writeStaging = 0;
//~ stagingSize = sizeof(temp.count);
//~ memcpy(writeStaging+stagingSize, &RIVout.frequency, sizeof(int)*3);
//~ stagingSize += sizeof(int)*3;
//~ memcpy(writeStaging+stagingSize, RIVout.values, sizeof(int)*RIVSIZE);
//~ stagingSize +=sizeof(int)*RIVSIZE;
//~ fwrite(writeStaging, 1, stagingSize, lexWord);
//~ /*fwrite(&temp.count, 1, sizeof(size_t), lexWord);
//~ fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
//~ fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
//~ fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
//~ fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);*/
//~ }
free(output); free(output);
...@@ -440,47 +419,47 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){ ...@@ -440,47 +419,47 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
denseRIV* fLexPull(FILE* lexWord){ denseRIV* fLexPull(FILE* lexWord){
denseRIV *output = calloc(1,sizeof(denseRIV)); denseRIV *output = calloc(1,sizeof(denseRIV));
int typeCheck; size_t typeCheck;
/* get metadata for vector */ /* get metadata for vector */
if(!fread(&typeCheck, 1, sizeof(int), lexWord)){ if(!fread(&typeCheck, 1, sizeof(size_t), lexWord)){
return NULL; return NULL;
} }
int flag = 0;
/* first value stored is the value count if sparse, and 0 if dense */ /* first value stored is the value count if sparse, and 0 if dense */
if (typeCheck){ if (typeCheck){
/* pull as sparseVector */ /* pull as sparseVector */
sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT)); /*sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT));
assert(&temp->count == IOstagingSlot);
temp->count = typeCheck; temp->count = typeCheck;
temp->locations = IOstagingSlot+4; temp->locations = IOstagingSlot+5;
temp->values = temp->locations+temp->count; temp->values = temp->locations+temp->count;
if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){ if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){
printf("vector read failure"); printf("vector read failure");
return NULL; return NULL;
} }*/
/*sparseRIV temp; sparseRIV temp;
temp.count = typeCheck; temp.count = typeCheck;
temp.locations = malloc(temp.count*2*sizeof(int)); temp.locations = malloc(temp.count*2*sizeof(int));
temp.values = temp.locations+temp.count; temp.values = temp.locations+temp.count;
fread(&output->frequency, 1, sizeof(int), lexWord); flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
fread(&output->contextSize, 1, sizeof(unsigned int), lexWord); flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
fread(&output->magnitude, 1, sizeof(float), lexWord); flag+= fread(&output->magnitude, 1, sizeof(float), lexWord);
fread(temp.locations, temp.count, sizeof(int), lexWord); flag += fread(temp.locations, temp.count, sizeof(int), lexWord);
fread(temp.values, temp.count, sizeof(int), lexWord); flag+= fread(temp.values, temp.count, sizeof(int), lexWord);
*/
addS2D(output->values, *temp); addS2D(output->values, temp);
}else{ }else{
/* typecheck is thrown away, just a flag in this case */ /* typecheck is thrown away, just a flag in this case */
//~ fread(&output->frequency, 1, sizeof(int), lexWord); flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
//~ fread(&output->contextSize, 1, sizeof(unsigned int), lexWord); flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
//~ fread(&output->magnitude, 1, sizeof(float), lexWord); flag +=fread(&output->magnitude, 1, sizeof(float), lexWord);
if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){ /*if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
printf("vector read failure"); printf("vector read failure");
return NULL; return NULL;
} }*/
} }
...@@ -496,10 +475,36 @@ void signalSecure(int signum, siginfo_t *si, void* arg){ ...@@ -496,10 +475,36 @@ void signalSecure(int signum, siginfo_t *si, void* arg){
puts("cache dump failed, some lexicon data lost"); puts("cache dump failed, some lexicon data lost");
} }
rootCache = rootCache->next; rootCache = rootCache->next;
free(rootCache->prev);
} }
signal(signum, SIG_DFL); signal(signum, SIG_DFL);
kill(getpid(), signum); kill(getpid(), signum);
} }
/*
 * cacheDump: push every vector held in the cache array `toDump` back out
 * through fLexPush, then release the array itself.
 *
 * With HASHCACHE the array is scanned over all CACHESIZE slots and empty
 * slots are skipped; with SORTCACHE the entries are walked from the front
 * until the first NULL slot.
 *
 * Returns the sum of the fLexPush return values (0 when every push
 * reported success).
 */
int cacheDump(denseRIV* *toDump){
	int failures = 0;
	denseRIV* *entry = toDump;
#if defined(HASHCACHE)
	for(; entry < toDump+CACHESIZE; entry++){
		if(!*entry){
			continue;
		}
		failures += fLexPush((LEXICON*)(*entry)->cached, *entry);
	}
#elif defined(SORTCACHE)
	for(; *entry; entry++){
		failures += fLexPush((LEXICON*)(*entry)->cached, *entry);
	}
#endif
	free(toDump);
	return failures;
}
#endif #endif
File deleted
...@@ -6,7 +6,10 @@ ...@@ -6,7 +6,10 @@
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#include <string.h> #include <string.h>
#define CACHESIZE 10000 //#define HASHCACHE
#define RIVSIZE 50000
#define NONZEROS 4
#define CACHESIZE 27000
#include "RIVtools.h" #include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context) //this program reads a directory full of files, and adds all context vectors (considering file as context)
...@@ -17,6 +20,7 @@ void addContext(denseRIV* lexRIV, sparseRIV context); ...@@ -17,6 +20,7 @@ void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void lineGrind(char* textLine); void lineGrind(char* textLine);
LEXICON* lp; LEXICON* lp;
//int COUNTY = 0;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
char pathString[1000]; char pathString[1000];
...@@ -71,11 +75,12 @@ void directoryGrind(char *rootString){ ...@@ -71,11 +75,12 @@ void directoryGrind(char *rootString){
printf("skipped: %s\n", files->d_name); printf("skipped: %s\n", files->d_name);
continue; continue;
} }
puts(files->d_name); //puts(files->d_name);
//open a file within root directory //open a file within root directory
FILE *input = fopen(pathString, "r"); FILE *input = fopen(pathString, "r");
if(input){ if(input){
//process this file and add it's data to lexicon //process this file and add it's data to lexicon
//fprintf(stderr, "***%d", COUNTY++);
fileGrind(input); fileGrind(input);
fclose(input); fclose(input);
...@@ -88,9 +93,9 @@ void directoryGrind(char *rootString){ ...@@ -88,9 +93,9 @@ void directoryGrind(char *rootString){
void fileGrind(FILE* textFile){ void fileGrind(FILE* textFile){
char textLine[10000]; char textLine[10000];
// included python script separates paragraphs into lines // included python script separates paragraphs into lines
int i=0; //int i=0;
while(fgets(textLine, 9999, textFile)){ while(fgets(textLine, 9999, textFile)){
printf("line: %d\n", i++); //printf("line: %d\n", i++);
if(!strlen(textLine)) continue; if(!strlen(textLine)) continue;
if(feof(textFile)) break; if(feof(textFile)) break;
......
File deleted
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
RIVcull: RIVcullDestructive.c
gcc -o RIVcull RIVcullDestructive.c -lm -O3
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -19,8 +19,7 @@ while(1): ...@@ -19,8 +19,7 @@ while(1):
freq = int(segments[1]) freq = int(segments[1])
mag = float(segments[2]) mag = float(segments[2])
name = segments[4]; name = segments[4];
if(freq>40000):
continue;
core = fit(freq) core = fit(freq)
fitmax = core*(1+range); fitmax = core*(1+range);
fitmin = core*(1-range); fitmin = core*(1-range);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment