Commit 60856c1d by etcart

added sorted cache for slower harddrives

parent 47297b52
File deleted
......@@ -60,7 +60,7 @@ typedef struct{
char name[100];
int *values;
int *locations;
int count;
size_t count;
int frequency;
int contextSize;
float magnitude;
......
#ifndef RIVACCESS_H_
#define RIVACCESS_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* One node of a 26-way letter tree: each step down the tree consumes one
 * character of the key word. Nodes are created zero-filled with calloc. */
struct treenode{
/* payload attached to the node where a key ends; NULL until an insert sets it */
void* data;
/* child nodes, indexed by (letter - 'a') */
struct treenode* links[26];
/* incremented on every node an insert walks through, decremented by treecut */
int downstream;
};
int treecut(struct treenode* node, char* letter);
void stemInsert(struct treenode* node, char* letter, char* data);
void RIVinsert(struct treenode* node, char* letter, void* data);
void* treeSearch(struct treenode* node, char* letter);
struct treenode* stemTreeSetup();
/*isWordClean filters words that contain non-letter characters, and
......@@ -16,7 +32,7 @@ int wordtoSeed(char* word);
/* isLetter returns 1 when c has a byte value in 97..122, or is 32 (space),
 * or is '_'; it returns 0 for every other value.
 * NOTE(review): the second test is a strict subset of the first, so it can
 * never change the result. The two lines look like the before/after sides
 * of a single edit -- confirm which condition is intended and drop the other. */
int isLetter(char c){
if((c>96 && c<123)||(c == 32) || (c == '_')) return 1;
if((c>96 && c<123)||(c == 32)) return 1;
else return 0;
}
int isWordClean(char* word){
......@@ -47,5 +63,106 @@ int wordtoSeed(char* word){
}
return seed;
}
/* stemTreeSetup builds the word -> stem lookup tree from "stemnet2.txt".
 *
 * The file is read as whitespace-separated "word stem" pairs and each pair
 * is handed to stemInsert. Reading stops at the first point where a full
 * pair can no longer be parsed, so a final pair without a trailing newline
 * is still inserted.
 *
 * Each token is limited to 99 characters so it cannot overrun the local
 * buffers; a longer token is split across reads. Pairs whose word contains
 * a byte outside 'a'..'z' are skipped, because the tree can only index
 * those 26 letters.
 *
 * returns: the root of the new tree (caller owns it), or NULL when the file
 *          cannot be opened or the root cannot be allocated. */
struct treenode* stemTreeSetup(){
	FILE* netfile = fopen("stemnet2.txt", "r");
	if(!netfile){
		printf("no stemnet file");
		return NULL;
	}

	struct treenode* rootNode = calloc(1, sizeof *rootNode);
	if(!rootNode){
		fclose(netfile);
		return NULL;
	}

	char word[100];
	char stem[100];
	/* fscanf reports how many fields it filled; anything short of 2 means
	 * end of file or a dangling word with no stem */
	while(fscanf(netfile, "%99s %99s", word, stem) == 2){
		int indexable = 1;
		for(const char* c = word; *c; c++){
			if(*c < 'a' || *c > 'z'){
				indexable = 0;
				break;
			}
		}
		if(!indexable){
			continue;
		}
		stemInsert(rootNode, word, stem);
	}

	fclose(netfile);
	return rootNode;
}
/* treeSearch looks a word up in the letter tree.
 *
 * node:   root of the tree (or of a subtree) to search
 * letter: NUL-terminated key; only the bytes 'a'..'z' can be indexed
 *
 * returns: the data pointer stored on the node where the key ends, or NULL
 *          when the path does not exist, when nothing is stored there, when
 *          the key contains a byte outside 'a'..'z', or when either
 *          argument is NULL. */
void* treeSearch(struct treenode* node, char* letter){
	if(!letter){
		return NULL;
	}
	while(node && *letter){
		/* links[] has exactly 26 slots; any other byte would index out of bounds */
		if(*letter < 'a' || *letter > 'z'){
			return NULL;
		}
		node = node->links[*letter - 'a'];
		letter++;
	}
	return node ? node->data : NULL;
}
/* RIVinsert stores a caller-owned pointer in the letter tree under a word.
 *
 * node:   root of the tree
 * letter: NUL-terminated key; must consist only of the bytes 'a'..'z'
 * data:   pointer to attach to the node where the key ends (not copied)
 *
 * The call leaves the tree untouched when either pointer argument is NULL,
 * when the key contains a byte outside 'a'..'z' (links[] has only 26
 * slots), or when the key already carries data (the first insert wins).
 * downstream counters are therefore only advanced for inserts that actually
 * add an entry, which keeps them in step with treecut.
 * If a node cannot be allocated, the counters already advanced are rolled
 * back and the entry is not added. */
void RIVinsert(struct treenode* node, char* letter, void* data){
	if(!node || !letter){
		return;
	}

	/* pass 1: validate the key and look for an existing entry, read-only */
	struct treenode* probe = node;
	for(const char* c = letter; *c; c++){
		if(*c < 'a' || *c > 'z'){
			return;
		}
		if(probe){
			probe = probe->links[*c - 'a'];
		}
	}
	if(probe && probe->data){
		return;
	}

	/* pass 2: walk again, creating missing nodes and counting this entry */
	struct treenode* cursor = node;
	for(const char* c = letter; *c; c++){
		struct treenode** slot = &cursor->links[*c - 'a'];
		if(!*slot){
			*slot = calloc(1, sizeof **slot);
			if(!*slot){
				/* undo the counts taken on the nodes above the failure */
				const char* undoKey = letter;
				for(struct treenode* undo = node; undo != cursor; undo = undo->links[*undoKey++ - 'a']){
					undo->downstream--;
				}
				return;
			}
		}
		cursor->downstream++;
		cursor = *slot;
	}
	cursor->downstream++;
	cursor->data = data;
}
/* stemInsert stores a private copy of a string in the letter tree under a word.
 *
 * node:   root of the tree
 * letter: NUL-terminated key; must consist only of the bytes 'a'..'z'
 * data:   NUL-terminated string; a heap copy of it is attached to the node
 *         where the key ends
 *
 * The call leaves the tree untouched when any argument is NULL, when the
 * key contains a byte outside 'a'..'z' (links[] has only 26 slots), or when
 * the key already carries data (the first insert wins). downstream counters
 * are only advanced for inserts that actually add an entry.
 * If memory runs out, the counters already advanced are rolled back and the
 * entry is not added. */
void stemInsert(struct treenode* node, char* letter, char* data){
	if(!node || !letter || !data){
		return;
	}

	/* pass 1: validate the key and look for an existing entry, read-only */
	struct treenode* probe = node;
	for(const char* c = letter; *c; c++){
		if(*c < 'a' || *c > 'z'){
			return;
		}
		if(probe){
			probe = probe->links[*c - 'a'];
		}
	}
	if(probe && probe->data){
		return;
	}

	/* copy the payload before touching the tree so a failure here needs no undo */
	size_t length = strlen(data) + 1;
	char* copy = malloc(length);
	if(!copy){
		return;
	}
	memcpy(copy, data, length);

	/* pass 2: walk again, creating missing nodes and counting this entry */
	struct treenode* cursor = node;
	for(const char* c = letter; *c; c++){
		struct treenode** slot = &cursor->links[*c - 'a'];
		if(!*slot){
			*slot = calloc(1, sizeof **slot);
			if(!*slot){
				/* undo the counts taken on the nodes above the failure */
				const char* undoKey = letter;
				for(struct treenode* undo = node; undo != cursor; undo = undo->links[*undoKey++ - 'a']){
					undo->downstream--;
				}
				free(copy);
				return;
			}
		}
		cursor->downstream++;
		cursor = *slot;
	}
	cursor->downstream++;
	cursor->data = copy;
}
/* treecutDescend removes one already-verified key below node.
 * It decrements downstream on every node of the path, clears the data
 * pointer on the node where the key ends, and frees each node whose
 * counter reaches zero, unhooking it from its parent.
 * returns 1 when node itself was freed, 0 when it is still alive. */
static int treecutDescend(struct treenode* node, const char* letter){
	node->downstream--;
	if(*letter){
		int slot = *letter - 'a';
		if(treecutDescend(node->links[slot], letter+1)){
			node->links[slot] = NULL;
		}
	}else{
		/* other keys may still pass through this node, so only the entry goes */
		node->data = NULL;
	}
	if(!node->downstream){
		free(node);
		return 1;
	}
	return 0;
}

/* treecut removes a word from the letter tree.
 *
 * node:   root of the tree
 * letter: NUL-terminated key to remove
 *
 * The tree is left untouched (and 0 is returned) when either argument is
 * NULL, when the key contains a byte outside 'a'..'z', or when the key is
 * not present with data attached. This keeps the downstream counters
 * correct when a caller asks for a word that was never inserted.
 *
 * The data pointer itself is never freed here; a caller that owns it must
 * fetch it (e.g. with treeSearch) before cutting.
 *
 * returns: 1 when the node passed in was freed because nothing else
 *          remained beneath it -- the caller must then stop using that
 *          pointer -- and 0 otherwise. */
int treecut(struct treenode* node, char* letter){
	if(!node || !letter){
		return 0;
	}

	/* verify the whole path first so a miss cannot disturb any counter */
	struct treenode* probe = node;
	for(const char* c = letter; *c; c++){
		if(*c < 'a' || *c > 'z'){
			return 0;
		}
		probe = probe->links[*c - 'a'];
		if(!probe){
			return 0;
		}
	}
	if(!probe->data){
		return 0;
	}

	return treecutDescend(node, letter);
}
#endif
File deleted
......@@ -2,82 +2,74 @@
#define RIVSIZE 50000
#include "RIVtools.h"
char* clean(char* word);
char* stem(char* word);
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
typedef char label[200];
/* A named class together with the sparse vectors collected for it. */
struct RIVclass{
/* class label text (label is a char[200]) */
label name;
/* heap array of vectors; main grows it by one slot with realloc before each append */
sparseRIV* set;
/* number of vectors currently stored in set */
int setSize;
};
LEXICON* lexicon;
int main(){
lexOpen("lexicon", "rx");
struct treenode* searchRoot = stemTreeSetup();
lexicon = lexOpen("consolidatedLexicon", "rx");
int classNo = 0;
label className = "tempName";
label* classNames = calloc(1, sizeof(label));
int classCount = 0;
struct RIVclass* classes = malloc(sizeof(struct RIVclass));
strcpy(classes[classCount].name, className);
strcpy(classNames[classCount], className);
classCount++;
while(1){
FILE* text = fopen("../bookCleaner/cleanbooks/pg56902clean.txt", "r");
if(!text){
puts("no file");
return 1;
}
denseRIV accumulate = {0};
sparseRIV temp;
char word[100];
while(fscanf(text, "%99s", word)){
if(feof(text)) break;
if(!*word) break;
if(!*clean(word)) continue;
//if(stem(word)){
denseRIV* wordRIV = lexPull(word);
if(!wordRIV){
//printf("%s, not in lexicon\n", word);
continue;
}else{
//printf("%s, succesfully pulled\n", word);
temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp);
free(temp.locations);
free(wordRIV);
}
}
struct RIVclass* class = classes+classNo;
class->set = malloc(sizeof(sparseRIV));
class->setSize = 0;
class->set[class->setSize] = consolidateD2S(accumulate.values);
class->setSize++;
FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r");
if(!textSet){
puts("no file");
return 1;
}
struct RIVclass* class;
char text[20000];
label className;
while(fscanf(textSet, "%s\t%s", text, className)){
char* labelTemp = strstr(*classNames, className);
if(!labelTemp){
classNames = realloc(classNames, classCount*sizeof(label));
/* reinitialize the classnames with a new member */
classNames = realloc(classNames, (classCount+1)*sizeof(label));
strcpy(classNames[classCount], className);
/* reinitialize the classes with a new member */
classes = realloc(classes, (classCount+1)*sizeof(struct RIVclass));
class = classes+classCount;
class->set = malloc(sizeof(sparseRIV));
strcpy(class->name, className);
class->setSize = 0;
classNo = classCount;
classCount++;
}else{
classNo = (labelTemp-*classNames);
class = classes+classNo;
}
class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
sparseRIV thing= line2L3(text, searchRoot);
class->set[class->setSize] = thing;
class->setSize++;
}
for(int i=0; i<classCount; i++){
puts(classNames[i]);
printf("%d\n\n", classes[i].setSize);
}
......@@ -103,44 +95,63 @@ char* clean(char* word){
strcpy(word,output);
return word;
}
char* stem(char* word){
char pathString[200];
int WNdata;
sprintf(pathString, "WN/%s", word);
FILE* WNfile = fopen(pathString, "r");
char* stemmy(struct treenode* searchRoot, char* word){
if(!WNfile) return NULL;
return treeSearch(searchRoot , word);
fscanf(WNfile, "%d", &WNdata);
}
sparseRIV line2L3(char* text, struct treenode* searchRoot){
if(!WNdata) {
fclose(WNfile);
return NULL;
}
if(WNdata == 1) {
fclose(WNfile);
return word;
}
if(WNdata == 2){
fscanf(WNfile, "%s", word);
fclose(WNfile);
sprintf(pathString, "WN/%s", word);
WNfile = fopen(pathString, "r");
if(!WNfile) return NULL;
fscanf(WNfile, "%*d%s", word);
return word;
denseRIV accumulate = {0};
sparseRIV temp;
char* textEnd = text+strlen(text);
char word[100];
int displacement;
while(text<textEnd){
sscanf(text, "%99s%n", word, &displacement);
text += displacement+1;
if(!displacement){
break;
}
if(!(*word)){
break;
}
if(!*clean(word)) continue;
char* stem = stemmy(searchRoot, word);
if(stem){
denseRIV* wordRIV = lexPull(lexicon, stem);
if(!wordRIV){
//printf("%s, not in lexicon\n", stem);
continue;
}else{
//printf("%s, succesfully pulled\n", stem);
temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp);
free(temp.locations);
free(wordRIV);
}
}
}
return NULL;
temp = consolidateD2S(accumulate.values);
return temp;
}
......
......@@ -5,7 +5,7 @@
#include <dirent.h>
int main(int argc, char* argv[]){
lexOpen(argv[1]);
LEXICON* lexicon = lexOpen(argv[1], "rx");
denseRIV* intake;
sparseRIV examine;
static denseRIV *output[60000] = {0};
......@@ -26,13 +26,13 @@ int main(int argc, char* argv[]){
continue;
}
j++;
intake = lexPull(files->d_name);
intake = lexPull(lexicon, files->d_name);
/* if the vector has been encountered more than MINSIZE times
* then it should be statistically significant, and useful */
if(intake->contextSize<7000){
/*if(intake->contextSize<7000){
free(intake);
continue;
}
}*/
examine = normalize(*intake, 10000);
strcpy(examine.name, files->d_name);
printf("%d,%d,%lf,%d,%d\n", examine.frequency, examine.contextSize, examine.magnitude, i, j);
......@@ -46,14 +46,14 @@ int main(int argc, char* argv[]){
free(examine.locations);
i++;
}
lexClose();
lexOpen("consolidatedLexicon50-8");
lexClose(lexicon);
lexicon = lexOpen("consolidatedLexicon", "wx");
for(int j=0; j<i; j++){
lexPush(output[j]);
lexPush(lexicon, output[j]);
}
lexClose();
lexClose(lexicon);
return 0;
}
File deleted
......@@ -5,7 +5,7 @@
#include <dirent.h>
int main(int argc, char* argv[]){
lexOpen(argv[1]);
LEXICON* lexicon = lexOpen(argv[1], "r");
denseRIV* intake;
DIR *directory;
struct dirent *files = 0;
......@@ -23,7 +23,7 @@ int main(int argc, char* argv[]){
continue;
}
intake = lexPull(files->d_name);
intake = lexPull(lexicon, files->d_name);
/* if the vector has been encountered more than MINSIZE times
* then it should be statistically significant, and useful */
......@@ -34,7 +34,7 @@ int main(int argc, char* argv[]){
i++;
}
lexClose();
lexClose(lexicon);
return 0;
}
......@@ -23,7 +23,7 @@
#ifndef SORTCACHE
#ifndef HASHCACHE
#define HASHCACHE
#define SORTCACHE
#endif
#endif
typedef struct{
......@@ -31,6 +31,9 @@ typedef struct{
denseRIV* *cache;
struct cacheList* listPoint;
char flags;
#ifdef SORTCACHE
struct treenode* treeRoot;
#endif /* SORTCACHE */
}LEXICON;
struct cacheList{
denseRIV* *cache;
......@@ -145,7 +148,14 @@ LEXICON* lexOpen(const char* lexName, const char* flags){
/* if we will be reading and writing the same lexicon, setup a
* cache for this lexicon to speed up rewrites */
struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
#ifdef HASHCACHE
newCache->cache = calloc(CACHESIZE, sizeof(denseRIV*));
#else
#ifdef SORTCACHE
newCache->cache = calloc(CACHESIZE+1, sizeof(denseRIV*));
output->treeRoot = calloc(1, sizeof(struct treenode));
#endif
#endif
output->flags |= CACHEFLAG;
output->cache = newCache->cache;
......@@ -188,22 +198,7 @@ void lexClose(LEXICON* toClose){
free(toClose);
}
/* cacheDump writes every occupied slot of a cache table out through
 * fLexPush, using the lexicon recorded in each entry's cached field, and
 * then releases the table itself.
 *
 * toDump: table of CACHESIZE entry pointers; empty slots are NULL
 *
 * returns: the sum of the fLexPush return values for the entries written */
int cacheDump(denseRIV* *toDump){
	int failures = 0;
	for(int slot = 0; slot < CACHESIZE; slot++){
		denseRIV* entry = toDump[slot];
		if(!entry){
			continue;
		}
		failures += fLexPush((LEXICON*)entry->cached, entry);
	}
	free(toDump);
	return failures;
}
#if CACHESIZE > 0
denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
......@@ -212,7 +207,6 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
int hash = rand()%CACHESIZE;
if(lexicon->cache[hash]){
if(!strcmp(word, lexicon->cache[hash]->name)){
/* if word is cached, pull from cache and exit */
return lexicon->cache[hash];
}
......@@ -221,25 +215,18 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
#endif
#ifdef SORTCACHE
return treeSearch(lexicon->treeRoot, word);
#endif
}
#if CACHESIZE > 0
int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
/* if our RIV was cached already, no need to play with it */
if(RIVout->cached == lexicon){
return 1;
}
#if HASHCACHE
#ifdef HASHCACHE
srand(wordtoSeed(RIVout->name));
int hash = rand()%CACHESIZE;
......@@ -262,15 +249,40 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
return 1;
}
return 0;
#endif
#if SORTCACHE
#endif
#endif /* HASHCACHE */
#ifdef SORTCACHE
denseRIV* *cache_slider = lexicon->cache;
while(*cache_slider){
if(RIVout->frequency > (*cache_slider)->frequency){
memcpy(cache_slider+1, cache_slider, CACHESIZE-(cache_slider-lexicon->cache));
if(lexicon->cache[CACHESIZE]){
fLexPush(lexicon, lexicon->cache[CACHESIZE]);
//remove tree element
treecut(lexicon->treeRoot, RIVout->name);
lexicon->cache[CACHESIZE] = NULL;
}
RIVout->cached = lexicon;
*cache_slider = RIVout;
//add tree element
RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
return 1;
}
cache_slider++;
}
if(cache_slider-lexicon->cache < CACHESIZE){
RIVout->cached = lexicon;
*cache_slider = RIVout;
RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
//add tree element
return 1;
}
return 0;
#endif /* SORTCACHE */
}
#endif
#endif
denseRIV* lexPull(LEXICON* lexicon, char* word){
......@@ -302,6 +314,7 @@ denseRIV* lexPull(LEXICON* lexicon, char* word){
}else{
/* if lexicon is set to inclusive (can gain new words) */
if(lexicon->flags & INCFLAG){
/*if file does not exist, return a 0 vector (word is new to the lexicon) */
output = calloc(1, sizeof(denseRIV));
strcpy(output->name, word);
......@@ -339,11 +352,12 @@ int saturationForStaging(denseRIV* output){
int* count = IOstagingSlot;
*count = 0;
*(count+1) = output->frequency;
*(count+2) = output->contextSize;
*(float*)(count+3) = output->magnitude;
*(count+1) = 0;
*(count+2) = output->frequency;
*(count+3) = output->contextSize;
*(float*)(count+4) = output->magnitude;
int* locations = IOstagingSlot+4;
int* locations = IOstagingSlot+5;
int* values = IOstagingSlot-RIVSIZE;;
int* locations_slider = locations;
int* values_slider = values;
......@@ -384,7 +398,7 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
return 1;
}
fwrite(IOstagingSlot, (saturation*2)+4, sizeof(int), lexWord);
fwrite(IOstagingSlot, (saturation*2)+5, sizeof(int), lexWord);
fclose(lexWord);
}else{
output->cached = 0;
......@@ -393,45 +407,10 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
return 1;
}
fwrite(((int*)&output->cached)+1, sizeof(int), RIVSIZE+4, lexWord);
fwrite(((int*)&output->cached), sizeof(int), RIVSIZE+5, lexWord);
fclose(lexWord);
}
/* older way of writing, kept while debugging new */
//~ if(temp.count<(RIVSIZE/2)){
//~ /* smaller stored as sparse vector */
//~ *writeStaging = temp.count;
//~ stagingSize = sizeof(temp.count);
//~ memcpy(writeStaging+stagingSize, &RIVout.frequency, sizeof(int)*3);
//~ stagingSize += sizeof(int)*3;
//~ memcpy(writeStaging+stagingSize, temp.locations, temp.count*2*sizeof(int));
//~ stagingSize += temp.count*2*sizeof(int);
//~ fwrite(writeStaging, 1, stagingSize, lexWord);
//~ /*fwrite(&temp.count, 1, sizeof(size_t), lexWord);
//~ fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
//~ fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
//~ fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
//~ fwrite(temp.locations, temp.count, sizeof(int), lexWord);
//~ fwrite(temp.values, temp.count, sizeof(int), lexWord);*/
//~ }else{
//~ /* saturation is too high, better to store dense */
//~ /* there's gotta be a better way to do this */
//~ *writeStaging = 0;
//~ stagingSize = sizeof(temp.count);
//~ memcpy(writeStaging+stagingSize, &RIVout.frequency, sizeof(int)*3);
//~ stagingSize += sizeof(int)*3;
//~ memcpy(writeStaging+stagingSize, RIVout.values, sizeof(int)*RIVSIZE);
//~ stagingSize +=sizeof(int)*RIVSIZE;
//~ fwrite(writeStaging, 1, stagingSize, lexWord);
//~ /*fwrite(&temp.count, 1, sizeof(size_t), lexWord);
//~ fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
//~ fwrite(&RIVout.contextSize, 1, sizeof(unsigned int), lexWord);
//~ fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
//~ fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);*/
//~ }
free(output);
......@@ -440,47 +419,47 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
denseRIV* fLexPull(FILE* lexWord){
denseRIV *output = calloc(1,sizeof(denseRIV));
int typeCheck;
size_t typeCheck;
/* get metadata for vector */
if(!fread(&typeCheck, 1, sizeof(int), lexWord)){
if(!fread(&typeCheck, 1, sizeof(size_t), lexWord)){
return NULL;
}
int flag = 0;
/* first value stored is the value count if sparse, and 0 if dense */
if (typeCheck){
/* pull as sparseVector */
sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT));
assert(&temp->count == IOstagingSlot);
/*sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT));
temp->count = typeCheck;
temp->locations = IOstagingSlot+4;
temp->locations = IOstagingSlot+5;
temp->values = temp->locations+temp->count;
if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){
printf("vector read failure");
return NULL;
}
/*sparseRIV temp;
}*/
sparseRIV temp;
temp.count = typeCheck;
temp.locations = malloc(temp.count*2*sizeof(int));
temp.values = temp.locations+temp.count;
fread(&output->frequency, 1, sizeof(int), lexWord);
fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
fread(&output->magnitude, 1, sizeof(float), lexWord);
fread(temp.locations, temp.count, sizeof(int), lexWord);
fread(temp.values, temp.count, sizeof(int), lexWord);
*/
addS2D(output->values, *temp);
flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
flag+= fread(&output->magnitude, 1, sizeof(float), lexWord);
flag += fread(temp.locations, temp.count, sizeof(int), lexWord);
flag+= fread(temp.values, temp.count, sizeof(int), lexWord);
addS2D(output->values, temp);
}else{
/* typecheck is thrown away, just a flag in this case */
//~ fread(&output->frequency, 1, sizeof(int), lexWord);
//~ fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
//~ fread(&output->magnitude, 1, sizeof(float), lexWord);
if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
flag +=fread(&output->magnitude, 1, sizeof(float), lexWord);
/*if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
printf("vector read failure");
return NULL;
}
}*/
}
......@@ -496,10 +475,36 @@ void signalSecure(int signum, siginfo_t *si, void* arg){
puts("cache dump failed, some lexicon data lost");
}
rootCache = rootCache->next;
free(rootCache->prev);
}
signal(signum, SIG_DFL);
kill(getpid(), signum);
}
/* cacheDump writes the entries of a cache table out through fLexPush,
 * using the lexicon recorded in each entry's cached field, and then
 * releases the table itself.
 *
 * With HASHCACHE the table is scanned across all CACHESIZE slots and empty
 * (NULL) slots are skipped. Otherwise, with SORTCACHE, entries are written
 * from the front until the first NULL slot is reached.
 *
 * returns: the sum of the fLexPush return values for the entries written */
int cacheDump(denseRIV* *toDump){
	int failures = 0;
#if defined(HASHCACHE)
	for(int slot = 0; slot < CACHESIZE; slot++){
		denseRIV* entry = toDump[slot];
		if(!entry){
			continue;
		}
		failures += fLexPush((LEXICON*)entry->cached, entry);
	}
#elif defined(SORTCACHE)
	for(denseRIV* *entry = toDump; *entry; entry++){
		failures += fLexPush((LEXICON*)(*entry)->cached, *entry);
	}
#endif
	free(toDump);
	return failures;
}
#endif
File deleted
......@@ -6,7 +6,10 @@
#include <dirent.h>
#include <error.h>
#include <string.h>
#define CACHESIZE 10000
//#define HASHCACHE
#define RIVSIZE 50000
#define NONZEROS 4
#define CACHESIZE 27000
#include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
......@@ -17,6 +20,7 @@ void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
void lineGrind(char* textLine);
LEXICON* lp;
//int COUNTY = 0;
int main(int argc, char *argv[]){
char pathString[1000];
......@@ -71,11 +75,12 @@ void directoryGrind(char *rootString){
printf("skipped: %s\n", files->d_name);
continue;
}
puts(files->d_name);
//puts(files->d_name);
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){
//process this file and add its data to lexicon
//fprintf(stderr, "***%d", COUNTY++);
fileGrind(input);
fclose(input);
......@@ -88,9 +93,9 @@ void directoryGrind(char *rootString){
void fileGrind(FILE* textFile){
char textLine[10000];
// included python script separates paragraphs into lines
int i=0;
//int i=0;
while(fgets(textLine, 9999, textFile)){
printf("line: %d\n", i++);
//printf("line: %d\n", i++);
if(!strlen(textLine)) continue;
if(feof(textFile)) break;
......
File deleted
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
RIVcull: RIVcullDestructive.c
gcc -o RIVcull RIVcullDestructive.c -lm -O3
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -19,8 +19,7 @@ while(1):
freq = int(segments[1])
mag = float(segments[2])
name = segments[4];
if(freq>40000):
continue;
core = fit(freq)
fitmax = core*(1+range);
fitmin = core*(1-range);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment