Commit 9abb263f by amberhosen

refactored RIVread for less fragmentation

parent 34c65893
File added
...@@ -5,7 +5,8 @@ ...@@ -5,7 +5,8 @@
#include <unistd.h> #include <unistd.h>
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#include "../../RIVtools.h" #include <string.h>
#include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context) //this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one //to all words found in these files. this is used to create a lexicon, or add to an existing one
...@@ -13,7 +14,7 @@ ...@@ -13,7 +14,7 @@
void fileGrind(FILE* textFile); void fileGrind(FILE* textFile);
void addContext(denseRIV* lexRIV, sparseRIV context); void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void lineGrind(char* textLine);
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
char pathString[1000]; char pathString[1000];
...@@ -40,9 +41,9 @@ int main(int argc, char *argv[]){ ...@@ -40,9 +41,9 @@ int main(int argc, char *argv[]){
return 0; return 0;
} }
//mostly a standard recursive Dirent-walk //mostly a standard Dirent-walk
void directoryGrind(char *rootString){ void directoryGrind(char *rootString){
/* *** begin Dirent walk *** */ /* *** begin Dirent walk *** */
char pathString[2000]; char pathString[2000];
DIR *directory; DIR *directory;
struct dirent *files = 0; struct dirent *files = 0;
...@@ -54,27 +55,12 @@ void directoryGrind(char *rootString){ ...@@ -54,27 +55,12 @@ void directoryGrind(char *rootString){
while((files=readdir(directory))){ while((files=readdir(directory))){
if(!files->d_name[0]) break;
while(*(files->d_name)=='.'){
files = readdir(directory);
}
if(files->d_type == DT_DIR){ if(files->d_type == DT_DIR){
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
strcat(pathString, "/");
directoryGrind(pathString);
continue; continue;
} }
sprintf(pathString, "%s/%s", rootString, files->d_name);
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
printf("%s\n", pathString);
/* *** end dirent walk, begin meat of function *** */ /* *** end dirent walk, begin meat of function *** */
//check for non-txt files //check for non-txt files
...@@ -95,21 +81,34 @@ void directoryGrind(char *rootString){ ...@@ -95,21 +81,34 @@ void directoryGrind(char *rootString){
} }
} }
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
void fileGrind(FILE* textFile){ void fileGrind(FILE* textFile){
//form a context vector. "clean" indicates that it will ignore any word which char textLine[5000];
//contains unwanted characters // included python script separates paragraphs into lines
sparseRIV contextVector = fileToL2Clean(textFile);
//an array of denseRIVs, large enough to hold all vectors while(fgets(textLine, 4999, textFile)){
//(we don't yet know how many vectors there will be, so we make it big enough for the maximum)
denseRIV* lexiconRIV; if(!strlen(textLine)) continue;
if(feof(textFile)) break;
//process each line as a context set
lineGrind(textLine);
}
}
//form context vector from contents of text, then add that vector to
//all lexicon entries of the words contained
void lineGrind(char* textLine){
//extract a context vector from this text set
sparseRIV contextVector = textToL2(textLine);
denseRIV* lexiconRIV;
//identify stopping point in line read
char* textEnd = textLine + strlen(textLine)-1;
int displacement = 0;
char word[100] = {0}; char word[100] = {0};
while(fscanf(textFile, "%99s", word)){ while(textLine<textEnd){
sscanf(textLine, "%99s%n", word, &displacement);
//we ensure that each word exists, and is free of unwanted characters //we ensure that each word exists, and is free of unwanted characters
if(feof(textFile)) break;
if(!(*word))continue; if(!(*word))continue;
...@@ -133,9 +132,19 @@ void fileGrind(FILE* textFile){ ...@@ -133,9 +132,19 @@ void fileGrind(FILE* textFile){
//and finally we push it back to the lexicon for permanent storage //and finally we push it back to the lexicon for permanent storage
lexPush(lexiconRIV); lexPush(lexiconRIV);
textLine += displacement+1;
} }
//free the heap allocated context vector data
free(contextVector.locations); free(contextVector.locations);
} }
void addContext(denseRIV* lexRIV, sparseRIV context){ void addContext(denseRIV* lexRIV, sparseRIV context){
......
File added
...@@ -51,9 +51,11 @@ sparseRIV textToL2(char *text){ ...@@ -51,9 +51,11 @@ sparseRIV textToL2(char *text){
* to permanent home in consolidation */ * to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock; int *locations = RIVKey.h_tempBlock;
int locationCount = 0; int locationCount = 0;
int displacement; int displacement = 0;;
char* textEnd = text+strlen(text)-1;
while(sscanf(text, "%99s%n", word, &displacement)){ while(text<textEnd){
sscanf(text, "%99s%n", word, &displacement);
text += displacement+1; text += displacement+1;
if(!displacement){ if(!displacement){
break; break;
......
File added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment