Commit 47297b52 by etcart

improved lex pointer system

parent ad4b27c9
...@@ -47,6 +47,7 @@ ...@@ -47,6 +47,7 @@
/* the size of the tempBlock used in consolidation and implicit RIVs */ /* the size of the tempBlock used in consolidation and implicit RIVs */
#define TEMPSIZE 3*RIVSIZE #define TEMPSIZE 3*RIVSIZE
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s /* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is advisable as the default * as this is often an ideal case, it is advisable as the default
* unless we are doing long term RIV aggregation. * unless we are doing long term RIV aggregation.
...@@ -54,6 +55,7 @@ ...@@ -54,6 +55,7 @@
* containing locations and values, where pairs are found in like array * containing locations and values, where pairs are found in like array
* indices. * indices.
*/ */
typedef struct{ typedef struct{
char name[100]; char name[100];
int *values; int *values;
...@@ -70,7 +72,7 @@ typedef struct{ ...@@ -70,7 +72,7 @@ typedef struct{
*/ */
typedef struct{ typedef struct{
char name[100]; char name[100];
int cached; void* cached;
int frequency; int frequency;
int contextSize; int contextSize;
float magnitude; float magnitude;
...@@ -85,7 +87,8 @@ struct RIVData{ ...@@ -85,7 +87,8 @@ struct RIVData{
int h_tempBlock[TEMPSIZE]; int h_tempBlock[TEMPSIZE];
int tempSize; int tempSize;
char lexName[255]; char lexName[255];
denseRIV* RIVCache[CACHESIZE]; denseRIV** RIVCache;
char flags;
}static RIVKey; }static RIVKey;
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with /*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
......
No preview for this file type
No preview for this file type
#include <stdio.h> #include <stdio.h>
#define CACHESIZE 0
#define CACHEEXCLUSIVE 1
#define RIVSIZE 50000 #define RIVSIZE 50000
#include "RIVtools.h" #include "RIVtools.h"
char* clean(char* word);
char* stem(char* word); char* stem(char* word);
typedef char label[200];
/* RIVclass groups a set of sparse RIVs under a single class label.
 * name holds the label text (label is a char[200] typedef),
 * set points to a heap-allocated array of sparseRIVs belonging to the class,
 * and setSize counts how many entries of set are currently filled.
 */
struct RIVclass{
label name;
sparseRIV* set;
int setSize;
};
int main(){ int main(){
lexOpen("consolidatedLexicon50-8");
FILE* text = fopen("../books/pg56902.txt", "r"); lexOpen("lexicon", "rx");
if(!text){
puts("no file"); int classNo = 0;
return 1; label className = "tempName";
} label* classNames = calloc(1, sizeof(label));
denseRIV accumulate = {0}; int classCount = 0;
sparseRIV temp;
char word[100]; struct RIVclass* classes = malloc(sizeof(struct RIVclass));
while(fscanf(text, "%99s", word)){
if(feof(text)) break; strcpy(classes[classCount].name, className);
if(!*word) break; strcpy(classNames[classCount], className);
classCount++;
while(1){
if(stem(word)){ FILE* text = fopen("../bookCleaner/cleanbooks/pg56902clean.txt", "r");
if(!text){
puts("no file");
return 1;
}
denseRIV accumulate = {0};
sparseRIV temp;
char word[100];
while(fscanf(text, "%99s", word)){
if(feof(text)) break;
if(!*word) break;
if(!*clean(word)) continue;
//if(stem(word)){
denseRIV* wordRIV = lexPull(word); denseRIV* wordRIV = lexPull(word);
if(!wordRIV){ if(!wordRIV){
printf("%s, not in lexicon\n", word); //printf("%s, not in lexicon\n", word);
continue; continue;
}else{ }else{
//printf("%s, succesfully pulled\n", word);
temp = consolidateD2S(wordRIV->values); temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp); addS2D(accumulate.values, temp);
free(temp.locations); free(temp.locations);
free(wordRIV); free(wordRIV);
} }
}else{ }
printf("%s, not in wordNet\n", word);
struct RIVclass* class = classes+classNo;
class->set = malloc(sizeof(sparseRIV));
class->setSize = 0;
class->set[class->setSize] = consolidateD2S(accumulate.values);
class->setSize++;
char* labelTemp = strstr(*classNames, className);
if(!labelTemp){
classNames = realloc(classNames, classCount*sizeof(label));
strcpy(classNames[classCount], className);
classCount++;
}else{
classNo = (labelTemp-*classNames);
} }
} }
return 0; return 0;
} }
/* clean strips every character that is not an ASCII letter from word and
 * lowercases the letters that remain. The string is rewritten in place, so
 * the result is never longer than the input and no scratch buffer (and thus
 * no fixed length limit) is needed.
 *
 * word: NUL-terminated, writable string to be cleaned; may be NULL
 * returns: word itself (possibly now the empty string), or NULL if word
 *          was NULL
 */
char* clean(char* word){
	if(!word) return NULL;

	/* out trails in: it only advances when a letter is kept, so writes
	 * never overtake the read position */
	char *out = word;
	for(const char *in = word; *in; in++){
		if(*in >= 'A' && *in <= 'Z'){
			*out++ = (char)(*in - 'A' + 'a');
		}else if(*in >= 'a' && *in <= 'z'){
			*out++ = *in;
		}
	}
	*out = '\0';
	return word;
}
char* stem(char* word){ char* stem(char* word){
...@@ -61,10 +115,15 @@ char* stem(char* word){ ...@@ -61,10 +115,15 @@ char* stem(char* word){
fscanf(WNfile, "%d", &WNdata); fscanf(WNfile, "%d", &WNdata);
if(!WNdata) return NULL; if(!WNdata) {
if(WNdata == 1) return word;
fclose(WNfile);
return NULL;
}
if(WNdata == 1) {
fclose(WNfile);
return word;
}
if(WNdata == 2){ if(WNdata == 2){
fscanf(WNfile, "%s", word); fscanf(WNfile, "%s", word);
fclose(WNfile); fclose(WNfile);
......
No preview for this file type
No preview for this file type
No preview for this file type
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#include <string.h> #include <string.h>
#define CACHESIZE 100000 #define CACHESIZE 10000
#include "RIVtools.h" #include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context) //this program reads a directory full of files, and adds all context vectors (considering file as context)
...@@ -16,13 +16,13 @@ void fileGrind(FILE* textFile); ...@@ -16,13 +16,13 @@ void fileGrind(FILE* textFile);
void addContext(denseRIV* lexRIV, sparseRIV context); void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void lineGrind(char* textLine); void lineGrind(char* textLine);
LEXICON* lp;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
char pathString[1000]; char pathString[1000];
lp = lexOpen("lexicon", "rw");
//we open the lexicon, if it does not yet exist, it will be created //we open the lexicon, if it does not yet exist, it will be created
lexOpen("lexicon");
//we format the root directory, preparing to scan its contents //we format the root directory, preparing to scan its contents
...@@ -39,7 +39,7 @@ int main(int argc, char *argv[]){ ...@@ -39,7 +39,7 @@ int main(int argc, char *argv[]){
directoryGrind(pathString); directoryGrind(pathString);
//we close the lexicon again, ensuring all data is secured //we close the lexicon again, ensuring all data is secured
lexClose(); lexClose(lp);
return 0; return 0;
} }
...@@ -81,6 +81,7 @@ void directoryGrind(char *rootString){ ...@@ -81,6 +81,7 @@ void directoryGrind(char *rootString){
fclose(input); fclose(input);
} }
} }
closedir(directory);
} }
...@@ -126,7 +127,7 @@ void lineGrind(char* textLine){ ...@@ -126,7 +127,7 @@ void lineGrind(char* textLine){
//we pull the vector corresponding to each word from the lexicon //we pull the vector corresponding to each word from the lexicon
//if it's a new word, lexPull returns a 0 vector //if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(word); lexiconRIV= lexPull(lp, word);
//we add the context of this file to this wordVector //we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector); addContext(lexiconRIV, contextVector);
...@@ -138,7 +139,7 @@ void lineGrind(char* textLine){ ...@@ -138,7 +139,7 @@ void lineGrind(char* textLine){
lexiconRIV->frequency += 1; lexiconRIV->frequency += 1;
//and finally we push it back to the lexicon for permanent storage //and finally we push it back to the lexicon for permanent storage
lexPush(lexiconRIV); lexPush(lp, lexiconRIV);
} }
......
No preview for this file type
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment