Commit 47297b52 by etcart

improved lex pointer system

parent ad4b27c9
......@@ -47,6 +47,7 @@
/* the size of the tempBlock used in consolidation and implicit RIVs */
#define TEMPSIZE 3*RIVSIZE
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is advisable as the default
* unless we are doing long term RIV aggregation.
......@@ -54,6 +55,7 @@
* containing locations and values, where pairs are found in like array
* indices.
*/
typedef struct{
char name[100];
int *values;
......@@ -70,7 +72,7 @@ typedef struct{
*/
typedef struct{
char name[100];
int cached;
void* cached;
int frequency;
int contextSize;
float magnitude;
......@@ -85,7 +87,8 @@ struct RIVData{
int h_tempBlock[TEMPSIZE];
int tempSize;
char lexName[255];
denseRIV* RIVCache[CACHESIZE];
denseRIV** RIVCache;
char flags;
}static RIVKey;
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
......
No preview for this file type
No preview for this file type
#include <stdio.h>
#define CACHESIZE 0
#define CACHEEXCLUSIVE 1
#define RIVSIZE 50000
#include "RIVtools.h"
char* clean(char* word);
char* stem(char* word);
typedef char label[200];
/* RIVclass groups a named class with the sparse vectors collected for it.
 * main() fills one of these per class label it encounters.
 */
struct RIVclass{
/* class label; label is a char[200] typedef */
label name;
/* heap array of sparse RIVs belonging to this class; main() allocates it
 * with malloc(sizeof(sparseRIV)), i.e. room for a single entry */
sparseRIV* set;
/* number of entries of set that are currently filled */
int setSize;
};
int main(){
lexOpen("consolidatedLexicon50-8");
FILE* text = fopen("../books/pg56902.txt", "r");
lexOpen("lexicon", "rx");
int classNo = 0;
label className = "tempName";
label* classNames = calloc(1, sizeof(label));
int classCount = 0;
struct RIVclass* classes = malloc(sizeof(struct RIVclass));
strcpy(classes[classCount].name, className);
strcpy(classNames[classCount], className);
classCount++;
while(1){
FILE* text = fopen("../bookCleaner/cleanbooks/pg56902clean.txt", "r");
if(!text){
puts("no file");
return 1;
......@@ -17,39 +37,73 @@ int main(){
while(fscanf(text, "%99s", word)){
if(feof(text)) break;
if(!*word) break;
if(!*clean(word)) continue;
if(stem(word)){
//if(stem(word)){
denseRIV* wordRIV = lexPull(word);
if(!wordRIV){
printf("%s, not in lexicon\n", word);
//printf("%s, not in lexicon\n", word);
continue;
}else{
//printf("%s, succesfully pulled\n", word);
temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp);
free(temp.locations);
free(wordRIV);
}
}
struct RIVclass* class = classes+classNo;
class->set = malloc(sizeof(sparseRIV));
class->setSize = 0;
class->set[class->setSize] = consolidateD2S(accumulate.values);
class->setSize++;
}
char* labelTemp = strstr(*classNames, className);
if(!labelTemp){
classNames = realloc(classNames, classCount*sizeof(label));
strcpy(classNames[classCount], className);
classCount++;
}else{
printf("%s, not in wordNet\n", word);
classNo = (labelTemp-*classNames);
}
}
}
return 0;
}
char* clean(char* word){
char* letter = word;
char output[100] = {0};
char *outLetter = output;
while(*letter){
if(*letter >= 'A' && *letter <= 'Z'){
*outLetter = *letter + 32;
outLetter++;
}else if( *letter >= 'a' && *letter <= 'z'){
*outLetter = *letter;
outLetter++;
}
letter++;
}
strcpy(word,output);
return word;
}
char* stem(char* word){
char pathString[200];
......@@ -61,10 +115,15 @@ char* stem(char* word){
fscanf(WNfile, "%d", &WNdata);
if(!WNdata) return NULL;
if(WNdata == 1) return word;
if(!WNdata) {
fclose(WNfile);
return NULL;
}
if(WNdata == 1) {
fclose(WNfile);
return word;
}
if(WNdata == 2){
fscanf(WNfile, "%s", word);
fclose(WNfile);
......
No preview for this file type
......@@ -5,9 +5,39 @@
#include "RIVaccessories.h"
#include "assert.h"
#ifndef CACHEEXCLUSIVE
#define CACHEEXCLUSIVE 0
#ifndef READFLAG
#define READFLAG 0x01
#endif
#ifndef WRITEFLAG
#define WRITEFLAG 0x02
#endif
#ifndef INCFLAG
#define INCFLAG 0x04
#endif
#ifndef CACHEFLAG
#define CACHEFLAG 0x08
#endif
#ifndef SORTCACHE
#ifndef HASHCACHE
#define HASHCACHE
#endif
#endif
/* LEXICON is the handle returned by lexOpen and passed to lexPull, lexPush
 * and lexClose. lexOpen allocates it zeroed, so fields it does not set are 0/NULL.
 */
typedef struct{
/* name of the lexicon directory; used as the "%s/%s" path prefix for word files */
char lexName[100];
/* CACHESIZE-slot array of cached vectors; only set up by lexOpen when the
 * lexicon is opened for both reading and writing and CACHESIZE > 0 */
denseRIV* *cache;
/* this lexicon's node in the rootCache list (set together with cache) */
struct cacheList* listPoint;
/* bitwise OR of READFLAG, WRITEFLAG, INCFLAG and CACHEFLAG */
char flags;
}LEXICON;
/* cacheList is a node of the doubly linked list of live lexicon caches.
 * rootCache is the head of that list: lexOpen inserts new nodes at the front,
 * and signalSecure walks the list from rootCache to dump every cache.
 */
struct cacheList{
/* the cache array owned by one open lexicon */
denseRIV* *cache;
/* neighbouring nodes; NULL at either end of the list */
struct cacheList* next;
struct cacheList* prev;
}*rootCache = NULL;
#define IODISPLACEMENT (sizeof(((sparseRIV*)0)->count)\
+ sizeof(((sparseRIV*)0)->frequency)\
+ sizeof(((sparseRIV*)0)->contextSize)\
......@@ -15,17 +45,18 @@
/ sizeof(int)
int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE; //#TODO format this better
/* lexOpen is called to "open the lexicon", setting up for later calls to
* lexPush and lexPull. if the lexicon has not been opened before calls
* to these functions, their behavior can be unpredictable, most likely crashing
*/
void lexOpen();
LEXICON* lexOpen(const char* lexName, const char* flags);
/* lexClose should always be called after the last lex push or lex pull call
* if the lexicon is left open, some vector data may be lost due to
* un-flushed RIV cache
*/
void lexClose();
void lexClose(LEXICON*);
/* both lexPush and lexPull must be called *after* the lexOpen() function
......@@ -33,34 +64,34 @@ void lexClose();
* data security */
/* lexPush writes a denseRIV to the lexicon for permanent storage */
int lexPush(denseRIV* RIVout);
int lexPush(LEXICON* lexicon, denseRIV* RIVout);
/* cacheCheckOnPush tests the state of this vector in our lexicon cache
* and returns 1 on "success" indicating cache storage and no need to push to file
* or returns 0 on "failure" indicating that the vector need be pushed to file
*/
int cacheCheckOnPush(denseRIV* RIVout);
int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
/* lexPull reads a denseRIV from the lexicon, under "word"
* if the file does not exist, it creates a 0 vector with the name of word
* lexPull returns a denseRIV *pointer* because its data must be tracked
* globally for key optimizations
*/
denseRIV* lexPull(char* word);
denseRIV* lexPull(LEXICON* lexicon, char* word);
/* cacheCheckOnPull checks if the word's vector is stored in cache,
* and returns a pointer to that vector on success
* or returns a NULL pointer if the word is not cached, indicating a need
* to pull from file
*/
denseRIV* cacheCheckOnPull(char* word);
denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexPush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV* RIVout);
int fLexPush(LEXICON* lexicon, denseRIV* RIVout);
/* fLexPull pulls data directly from a file and converts it (if necessary)
* to a denseRIV. function is called by "lexPull" which is what users
......@@ -77,63 +108,187 @@ void signalSecure(int signum, siginfo_t *si, void* arg);
*/
int saturationForStaging(denseRIV* output);
/* begin definitions */
void lexOpen(char* lexName){
/* lexOpen opens the lexicon directory lexName and returns a heap-allocated
 * handle for later lexPull/lexPush calls; release it with lexClose.
 *
 * flags is scanned for these characters:
 *   'w'  write: the directory is created if it does not exist yet
 *   'r'  read: only consulted when 'w' is absent; a missing directory
 *        makes lexOpen return NULL
 *   'x'  exclusive: when absent, INCFLAG is set (lexPull may hand out
 *        zero vectors for words that have no file yet)
 * When both 'r' and 'w' are present and CACHESIZE > 0, a cache is created,
 * linked at the head of rootCache, and signalSecure is installed as handler
 * for signals 1-26.
 *
 * returns: the new handle, or NULL when an argument is NULL, lexName does
 *          not fit in LEXICON.lexName, allocation of the handle fails, or a
 *          read-only lexicon does not exist.
 *
 * NOTE(review): "rw" sets WRITEFLAG but not READFLAG because the read branch
 * is an else-if; left unchanged here - confirm that is intended.
 */
LEXICON* lexOpen(const char* lexName, const char* flags){
	if(!lexName || !flags){
		return NULL;
	}

	LEXICON* output = calloc(1, sizeof(LEXICON));
	if(!output){
		return NULL;
	}

	/* refuse a name that would overflow the fixed-size lexName field */
	if(strlen(lexName) >= sizeof output->lexName){
		free(output);
		return NULL;
	}

	/* identify the presence of read, write, and exclusive flags */
	const char* r = strstr(flags, "r");
	const char* w = strstr(flags, "w");
	const char* x = strstr(flags, "x");
	struct stat st = {0};

	if(w){
		/* if set to write, we check and create if necessary, the lexicon */
		if (stat(lexName, &st) == -1) {
			mkdir(lexName, 0777);
		}
		output->flags |= WRITEFLAG;
	}else if(r){
		/* if set to read and not write, return null if lexicon does not exist */
		if (stat(lexName, &st) == -1) {
			free(output);
			return NULL;
		}
		output->flags |= READFLAG;
	}

	/* if not set to exclusive, set the inclusive flag */
	if(!x){
		output->flags |= INCFLAG;
	}

	/* length was verified above, so this copy cannot overflow */
	strcpy(output->lexName, lexName);

#if CACHESIZE > 0
	if(r && w){
		//#TODO include hash vs sort cache logic flags
		/* if we will be reading and writing the same lexicon, setup a
		 * cache for this lexicon to speed up rewrites */
		struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
		denseRIV** slots = calloc(CACHESIZE, sizeof(denseRIV*));
		if(!newCache || !slots){
			/* no memory for a cache: hand back a working, uncached lexicon
			 * (CACHEFLAG stays clear, so lexPull/lexPush go straight to file) */
			free(newCache);
			free(slots);
			return output;
		}
		newCache->cache = slots;
		output->flags |= CACHEFLAG;
		output->cache = slots;

		/* link the new node in at the head of the global cache list */
		newCache->next = rootCache;
		if(rootCache){
			rootCache->prev = newCache;
		}
		rootCache = newCache;
		output->listPoint = newCache;

		/* have signalSecure dump the caches if the process is signalled;
		 * the return value of sigaction is deliberately not checked */
		struct sigaction action = {0};
		action.sa_sigaction = signalSecure;
		action.sa_flags = SA_SIGINFO;
		for(int i=1; i<27; i++){
			sigaction(i,&action,NULL);
		}
	}
#endif
	return output;
}
/* lexClose releases a handle obtained from lexOpen.
 * If the lexicon owns a cache (CACHEFLAG), the cache is dumped through
 * cacheDump, its node is unlinked from the rootCache list and freed;
 * the handle itself is then freed.
 *
 * toClose: handle to close; NULL (e.g. a failed lexOpen) is ignored.
 */
void lexClose(LEXICON* toClose){
	if(!toClose){
		return;
	}
#if CACHESIZE>0
	if(toClose->flags & CACHEFLAG){
		if(cacheDump(toClose->cache)){
			puts("cache dump failed, some lexicon data was lost");
		}

		struct cacheList* listPoint = toClose->listPoint;
		if(listPoint->prev){
			listPoint->prev->next = listPoint->next;
		}else{
			/* this node is the list head: move rootCache forward so that
			 * signalSecure never walks into the node freed below */
			rootCache = listPoint->next;
		}
		if(listPoint->next){
			listPoint->next->prev = listPoint->prev;
		}
		free(listPoint);
	}
#endif
	free(toClose);
}
void lexClose(){
int cacheDump(denseRIV* *toDump){
if(cacheDump()){
puts("cache dump failed, some lexicon data was lost");
int flag = 0;
denseRIV* *toDump_slider = toDump;
denseRIV* *toDump_stop = toDump+CACHESIZE;
while(toDump_slider<toDump_stop){
if(*toDump_slider){
flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
}
toDump_slider++;
}
free(toDump);
return flag;
}
#if CACHESIZE > 0
denseRIV* cacheCheckOnPull(char* word){
denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
#ifdef HASHCACHE
srand(wordtoSeed(word));
int hash = rand()%CACHESIZE;
if(RIVKey.RIVCache[hash]){
if(!strcmp(word, RIVKey.RIVCache[hash]->name)){
if(lexicon->cache[hash]){
if(!strcmp(word, lexicon->cache[hash]->name)){
/* if word is cached, pull from cache and exit */
return RIVKey.RIVCache[hash];
return lexicon->cache[hash];
}
}
return NULL;
#endif
#ifdef SORTCACHE
#endif
}
#if CACHESIZE > 0
/* cacheCheckOnPush decides whether RIVout is kept in lexicon's cache
 * instead of being written to file.
 *
 * returns 1 when the vector is (now) held in the cache, so no file push is needed
 * returns 0 when the vector was not cached and must be pushed to file
 *
 * With HASHCACHE, the slot is chosen by seeding rand() from the word
 * (wordtoSeed) and reducing modulo CACHESIZE; an occupied slot is only
 * taken over by a vector with a strictly higher frequency, and the previous
 * occupant is written out with fLexPush first.
 */
int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
	/* if our RIV was cached already, no need to play with it */
	if(RIVout->cached == lexicon){
		return 1;
	}

/* HASHCACHE/SORTCACHE are defined without a value, so they must be tested
 * with #ifdef (as cacheCheckOnPull does); "#if HASHCACHE" does not compile */
#ifdef HASHCACHE
	srand(wordtoSeed(RIVout->name));
	int hash = rand()%CACHESIZE;
	denseRIV** slot = &lexicon->cache[hash];

	/* if there is no word in this cache slot */
	if(!*slot){
		/* push to cache instead of file */
		*slot = RIVout;
		RIVout->cached = lexicon;
		return 1;
	}

	/* if the current RIV is more frequent than the RIV holding its slot */
	if(RIVout->frequency > (*slot)->frequency){
		/* push the lower frequency cache entry to a file
		 * NOTE(review): the result of fLexPush is ignored, as before */
		fLexPush(lexicon, *slot);
		/* replace this cache-slot with the current vector */
		*slot = RIVout;
		RIVout->cached = lexicon;
		return 1;
	}
#endif

#ifdef SORTCACHE
	/* no sorted-cache strategy is implemented yet */
#endif

	/* not cached: the caller must push this vector to file */
	return 0;
}
#endif
denseRIV* lexPull(char* word){
#endif
denseRIV* lexPull(LEXICON* lexicon, char* word){
denseRIV* output = NULL;
#if CACHESIZE > 0
if(lexicon->flags & CACHEFLAG){
/* if there is a cache, first check if the word is cached */
if((output = cacheCheckOnPull(word))){
if((output = cacheCheckOnPull(lexicon, word))){
return output;
}
}
#endif /* CACHESIZE > 0 */
/* if not, attempt to pull the word data from lexicon file */
char pathString[200];
sprintf(pathString, "%s/%s", RIVKey.lexName, word);
sprintf(pathString, "%s/%s", lexicon->lexName, word);
FILE *lexWord = fopen(pathString, "rb");
......@@ -145,11 +300,12 @@ denseRIV* lexPull(char* word){
strcpy(output->name, word);
fclose(lexWord);
}else{
#if CACHEEXCLUSIVE == 0
/*if file does not exist, return a 0 vector (word is new to the lexicon */
/* if lexicon is set to inclusive (can gain new words) */
if(lexicon->flags & INCFLAG){
/*if file does not exist, return a 0 vector (word is new to the lexicon) */
output = calloc(1, sizeof(denseRIV));
strcpy(output->name, word);
#endif
}
/*if lexicon is set to exclusive, will return a NULL pointer instead of a 0 vector */
}
......@@ -157,53 +313,23 @@ denseRIV* lexPull(char* word){
return output;
}
#if CACHESIZE > 0
int cacheCheckOnPush(denseRIV* RIVout){
/* if our RIV was cached already, no need to play with it */
if(RIVout->cached){
return 1;
}
srand(wordtoSeed(RIVout->name));
int hash = rand()%CACHESIZE;
/* if there is no word in this cache slot */
if(!RIVKey.RIVCache[hash]){
/* push to cache instead of file */
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash]->cached = 1;
return 1;
/*if the current RIV is more frequent than the RIV holding its slot */
}
if(RIVout->frequency > RIVKey.RIVCache[hash]->frequency ){
/* push the lower frequency cache entry to a file */
fLexPush(RIVKey.RIVCache[hash]);
/* replace this cache-slot with the current vector */
RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash]->cached = 1;
return 1;
}
return 0;
}
#endif
int lexPush(denseRIV* RIVout){
int lexPush(LEXICON* lexicon, denseRIV* RIVout){
#if CACHESIZE > 0
if(lexicon->flags & CACHEFLAG){
/* check the cache to see if it belongs in cache */
if(cacheCheckOnPush(RIVout)){
if(cacheCheckOnPush(lexicon, RIVout)){
/* if the cache check returns 1, it has been dealt with in cache */
return 0;
}
}
#endif /* CACHESIZE != 0 */
#endif
/* push to the lexicon */
return fLexPush(RIVout);
return fLexPush(lexicon, RIVout);
}
......@@ -242,12 +368,12 @@ int saturationForStaging(denseRIV* output){
return *count;
}
int fLexPush(denseRIV* output){
int fLexPush(LEXICON* lexicon, denseRIV* output){
char pathString[200] = {0};
/* word data will be placed in a (new?) file under the lexicon directory
* in a file named after the word itself */
sprintf(pathString, "%s/%s", RIVKey.lexName, output->name);
sprintf(pathString, "%s/%s", lexicon->lexName, output->name);
int saturation = saturationForStaging(output);
......@@ -267,7 +393,7 @@ int fLexPush(denseRIV* output){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
return 1;
}
fwrite(&output->cached, sizeof(int), RIVSIZE+4, lexWord);
fwrite(((int*)&output->cached)+1, sizeof(int), RIVSIZE+4, lexWord);
fclose(lexWord);
}
......@@ -364,26 +490,14 @@ denseRIV* fLexPull(FILE* lexWord){
}
int cacheDump(){
int flag = 0;
for(int i = 0; i < CACHESIZE; i++){
if(RIVKey.RIVCache[i]){
flag += fLexPush(RIVKey.RIVCache[i]);
}
}
return flag;
}
/*TODO add a simplified free function*/
/* signalSecure is the signal handler installed by lexOpen.
 * It dumps every cache on the rootCache list through cacheDump, frees the
 * list nodes, then restores the default disposition for signum and
 * re-raises the signal against this process.
 *
 * signum: the signal being handled
 * si,arg: required by the SA_SIGINFO handler signature; unused
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
	(void)si;
	(void)arg;

	while(rootCache){
		/* remember the successor before this node is released; reading
		 * rootCache->prev after advancing would dereference NULL on the
		 * last node and leave that node unfreed */
		struct cacheList* next = rootCache->next;
		if(cacheDump(rootCache->cache)){
			puts("cache dump failed, some lexicon data lost");
		}
		free(rootCache);
		rootCache = next;
	}

	signal(signum, SIG_DFL);
	kill(getpid(), signum);
}
......
No preview for this file type
No preview for this file type
......@@ -6,7 +6,7 @@
#include <dirent.h>
#include <error.h>
#include <string.h>
#define CACHESIZE 100000
#define CACHESIZE 10000
#include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
......@@ -16,13 +16,13 @@ void fileGrind(FILE* textFile);
void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
void lineGrind(char* textLine);
LEXICON* lp;
int main(int argc, char *argv[]){
char pathString[1000];
lp = lexOpen("lexicon", "rw");
//we open the lexicon, if it does not yet exist, it will be created
lexOpen("lexicon");
//we format the root directory, preparing to scan its contents
......@@ -39,7 +39,7 @@ int main(int argc, char *argv[]){
directoryGrind(pathString);
//we close the lexicon again, ensuring all data is secured
lexClose();
lexClose(lp);
return 0;
}
......@@ -81,6 +81,7 @@ void directoryGrind(char *rootString){
fclose(input);
}
}
closedir(directory);
}
......@@ -126,7 +127,7 @@ void lineGrind(char* textLine){
//we pull the vector corresponding to each word from the lexicon
//if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(word);
lexiconRIV= lexPull(lp, word);
//we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector);
......@@ -138,7 +139,7 @@ void lineGrind(char* textLine){
lexiconRIV->frequency += 1;
//and finally we push it back to the lexicon for permanent storage
lexPush(lexiconRIV);
lexPush(lp, lexiconRIV);
}
......
No preview for this file type
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment