Commit 9abb263f by amberhosen

refactored RIVread for less fragmentation

parent 34c65893
File added
......@@ -5,7 +5,8 @@
#include <unistd.h>
#include <dirent.h>
#include <error.h>
#include "../../RIVtools.h"
#include <string.h>
#include "RIVtools.h"
//This program reads a directory full of text files and adds each context vector (treating each line of a file as one context)
//to every word found in that context. It is used to create a lexicon, or to add to an existing one.
......@@ -13,7 +14,7 @@
void fileGrind(FILE* textFile);
void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
void lineGrind(char* textLine);
int main(int argc, char *argv[]){
char pathString[1000];
......@@ -40,9 +41,9 @@ int main(int argc, char *argv[]){
return 0;
}
//mostly a standard recursive Dirent-walk
//mostly a standard Dirent-walk
void directoryGrind(char *rootString){
/* *** begin Dirent walk *** */
/* *** begin Dirent walk *** */
char pathString[2000];
DIR *directory;
struct dirent *files = 0;
......@@ -54,27 +55,12 @@ void directoryGrind(char *rootString){
while((files=readdir(directory))){
if(!files->d_name[0]) break;
while(*(files->d_name)=='.'){
files = readdir(directory);
}
if(files->d_type == DT_DIR){
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
strcat(pathString, "/");
directoryGrind(pathString);
continue;
}
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
printf("%s\n", pathString);
sprintf(pathString, "%s/%s", rootString, files->d_name);
/* *** end dirent walk, begin meat of function *** */
//check for non-txt files
......@@ -95,21 +81,34 @@ void directoryGrind(char *rootString){
}
}
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
void fileGrind(FILE* textFile){
//form a context vector. "clean" indicates that it will ignore any word which
//contains unwanted characters
sparseRIV contextVector = fileToL2Clean(textFile);
char textLine[5000];
// the included Python script separates paragraphs into lines
//an array of denseRIVs, large enough to hold all vectors
//(we don't yet know how many vectors there will be, so we make it big enough for the maximum)
denseRIV* lexiconRIV;
while(fgets(textLine, 4999, textFile)){
if(!strlen(textLine)) continue;
if(feof(textFile)) break;
//process each line as a context set
lineGrind(textLine);
}
}
//form a context vector from the contents of the text, then add that vector to
//the lexicon entries of all the words the text contains
void lineGrind(char* textLine){
//extract a context vector from this text set
sparseRIV contextVector = textToL2(textLine);
denseRIV* lexiconRIV;
//identify stopping point in line read
char* textEnd = textLine + strlen(textLine)-1;
int displacement = 0;
char word[100] = {0};
while(fscanf(textFile, "%99s", word)){
while(textLine<textEnd){
sscanf(textLine, "%99s%n", word, &displacement);
//we ensure that each word exists, and is free of unwanted characters
if(feof(textFile)) break;
if(!(*word))continue;
......@@ -133,9 +132,19 @@ void fileGrind(FILE* textFile){
//and finally we push it back to the lexicon for permanent storage
lexPush(lexiconRIV);
textLine += displacement+1;
}
//free the heap allocated context vector data
free(contextVector.locations);
}
void addContext(denseRIV* lexRIV, sparseRIV context){
......
File added
......@@ -51,9 +51,11 @@ sparseRIV textToL2(char *text){
* to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock;
int locationCount = 0;
int displacement;
int displacement = 0;;
char* textEnd = text+strlen(text)-1;
while(sscanf(text, "%99s%n", word, &displacement)){
while(text<textEnd){
sscanf(text, "%99s%n", word, &displacement);
text += displacement+1;
if(!displacement){
break;
......
File added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment