Commit 47297b52 by etcart

improved lex pointer system

parent ad4b27c9
......@@ -47,6 +47,7 @@
/* the size of the tempBlock used in consolidation and implicit RIVs */
#define TEMPSIZE 3*RIVSIZE
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is advisable as the default
* unless we are doing long term RIV aggregation.
......@@ -54,6 +55,7 @@
* containing locations and values, where pairs are found in like array
* indices.
*/
typedef struct{
char name[100];
int *values;
......@@ -70,7 +72,7 @@ typedef struct{
*/
typedef struct{
char name[100];
int cached;
void* cached;
int frequency;
int contextSize;
float magnitude;
......@@ -85,7 +87,8 @@ struct RIVData{
int h_tempBlock[TEMPSIZE];
int tempSize;
char lexName[255];
denseRIV* RIVCache[CACHESIZE];
denseRIV** RIVCache;
char flags;
}static RIVKey;
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
......
No preview for this file type
No preview for this file type
#include <stdio.h>
#define CACHESIZE 0
#define CACHEEXCLUSIVE 1
#define RIVSIZE 50000
#include "RIVtools.h"
char* clean(char* word);
char* stem(char* word);
typedef char label[200];
/* RIVclass groups a set of sparse RIVs under one class label */
struct RIVclass{
	/* label text identifying this class */
	label name;
	/* heap-allocated array holding the sparse RIVs of this class */
	sparseRIV *set;
	/* count of entries currently stored in set */
	int setSize;
};
int main(){
lexOpen("consolidatedLexicon50-8");
FILE* text = fopen("../books/pg56902.txt", "r");
if(!text){
puts("no file");
return 1;
}
denseRIV accumulate = {0};
sparseRIV temp;
char word[100];
while(fscanf(text, "%99s", word)){
if(feof(text)) break;
if(!*word) break;
lexOpen("lexicon", "rx");
int classNo = 0;
label className = "tempName";
label* classNames = calloc(1, sizeof(label));
int classCount = 0;
struct RIVclass* classes = malloc(sizeof(struct RIVclass));
strcpy(classes[classCount].name, className);
strcpy(classNames[classCount], className);
classCount++;
while(1){
if(stem(word)){
FILE* text = fopen("../bookCleaner/cleanbooks/pg56902clean.txt", "r");
if(!text){
puts("no file");
return 1;
}
denseRIV accumulate = {0};
sparseRIV temp;
char word[100];
while(fscanf(text, "%99s", word)){
if(feof(text)) break;
if(!*word) break;
if(!*clean(word)) continue;
//if(stem(word)){
denseRIV* wordRIV = lexPull(word);
if(!wordRIV){
printf("%s, not in lexicon\n", word);
//printf("%s, not in lexicon\n", word);
continue;
}else{
//printf("%s, succesfully pulled\n", word);
temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp);
free(temp.locations);
free(wordRIV);
}
}else{
printf("%s, not in wordNet\n", word);
}
struct RIVclass* class = classes+classNo;
class->set = malloc(sizeof(sparseRIV));
class->setSize = 0;
class->set[class->setSize] = consolidateD2S(accumulate.values);
class->setSize++;
char* labelTemp = strstr(*classNames, className);
if(!labelTemp){
classNames = realloc(classNames, classCount*sizeof(label));
strcpy(classNames[classCount], className);
classCount++;
}else{
classNo = (labelTemp-*classNames);
}
}
return 0;
}
/* clean normalises a word in place so that it holds only lowercase ASCII
 * letters: 'A'-'Z' are folded to 'a'-'z', 'a'-'z' are kept as they are, and
 * every other byte (digits, punctuation, whitespace, non-ASCII) is dropped.
 *
 * word:    writable, NUL-terminated string of any length; a null pointer is
 *          tolerated and handed straight back.
 * returns: word itself, which may now be the empty string if it contained
 *          no letters at all.
 *
 * The filtering happens in place. The result is never longer than the
 * input, so the write cursor can never overtake the read cursor and no
 * scratch buffer is required; this removes the fixed 100-byte temporary
 * that overflowed on words of 100 characters or more.
 */
char* clean(char* word){
	if(!word) return word;

	const char* letter = word;   /* read cursor  */
	char* outLetter = word;      /* write cursor, always <= letter */

	while(*letter){
		if(*letter >= 'A' && *letter <= 'Z'){
			/* fold uppercase ASCII to lowercase */
			*outLetter = (char)(*letter - 'A' + 'a');
			outLetter++;
		}else if(*letter >= 'a' && *letter <= 'z'){
			*outLetter = *letter;
			outLetter++;
		}
		/* anything else is skipped */
		letter++;
	}
	/* terminate the (possibly shortened) result */
	*outLetter = '\0';
	return word;
}
char* stem(char* word){
......@@ -61,10 +115,15 @@ char* stem(char* word){
fscanf(WNfile, "%d", &WNdata);
if(!WNdata) return NULL;
if(WNdata == 1) return word;
if(!WNdata) {
fclose(WNfile);
return NULL;
}
if(WNdata == 1) {
fclose(WNfile);
return word;
}
if(WNdata == 2){
fscanf(WNfile, "%s", word);
fclose(WNfile);
......
No preview for this file type
No preview for this file type
No preview for this file type
......@@ -6,7 +6,7 @@
#include <dirent.h>
#include <error.h>
#include <string.h>
#define CACHESIZE 100000
#define CACHESIZE 10000
#include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
......@@ -16,13 +16,13 @@ void fileGrind(FILE* textFile);
void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
void lineGrind(char* textLine);
LEXICON* lp;
int main(int argc, char *argv[]){
char pathString[1000];
lp = lexOpen("lexicon", "rw");
//we open the lexicon; if it does not yet exist, it will be created
lexOpen("lexicon");
//we format the root directory, preparing to scan its contents
......@@ -39,7 +39,7 @@ int main(int argc, char *argv[]){
directoryGrind(pathString);
//we close the lexicon again, ensuring all data is secured
lexClose();
lexClose(lp);
return 0;
}
......@@ -81,6 +81,7 @@ void directoryGrind(char *rootString){
fclose(input);
}
}
closedir(directory);
}
......@@ -126,7 +127,7 @@ void lineGrind(char* textLine){
//we pull the vector corresponding to each word from the lexicon
//if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(word);
lexiconRIV= lexPull(lp, word);
//we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector);
......@@ -138,7 +139,7 @@ void lineGrind(char* textLine){
lexiconRIV->frequency += 1;
//and finally we push it back to the lexicon for permanent storage
lexPush(lexiconRIV);
lexPush(lp, lexiconRIV);
}
......
No preview for this file type
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment