Commit 3179d5fd by etcart

added comments and explanations

parent d78631fd
...@@ -191,7 +191,7 @@ sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){ ...@@ -191,7 +191,7 @@ sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
int *denseTemp = mapI2D(implicit, valueCount); int *denseTemp = mapI2D(implicit, valueCount);
sparseRIV sparseOut = consolidateD2S(denseTemp); sparseRIV sparseOut = consolidateD2S(denseTemp);
/* sparseOut is flagged as sparse in consolidate step */
free(denseTemp); free(denseTemp);
...@@ -362,6 +362,7 @@ denseRIV fLexPull(FILE* lexWord){ ...@@ -362,6 +362,7 @@ denseRIV fLexPull(FILE* lexWord){
output.magnitude = -1; output.magnitude = -1;
} }
output.cached = 0; output.cached = 0;
output.flags &= ~SPARSE;
return output; return output;
} }
...@@ -397,6 +398,7 @@ denseRIV denseAllocate(){ ...@@ -397,6 +398,7 @@ denseRIV denseAllocate(){
output.frequency = (unsigned int*)(output.values+RIVSIZE); output.frequency = (unsigned int*)(output.values+RIVSIZE);
output.magnitude = 0; output.magnitude = 0;
output.cached = 0; output.cached = 0;
output.flags &= ~SPARSE;
return output; return output;
} }
......
No preview for this file type
No preview for this file type
No preview for this file type
...@@ -5,6 +5,95 @@ ...@@ -5,6 +5,95 @@
#include "RIVLower.h" #include "RIVLower.h"
#include "RIVaccessories.h" #include "RIVaccessories.h"
/* RIV stands for Random Index Vector, referring to the method of generating
* the basic vectors that correspond to each word. each word has an algorithmically
* generated vector which represents it in this mathematical model, such that a word
* will produce the same vector each time it is encountered*[1]. this base
* vector will be referred to as a L1 vector or a barcode vector
*
* by summing these vectors, we can get a mathematical representation of
* a set of text. this summed vector will be referred to as an L2 vector
* or aggregate vector. in its simplest implementation, an L2 vector
* representation of a document contains a model of the contents of the
* document, enabling us to compare direction and magnitude of document
* vectors to understand their relationships to each other.
*
* but the system we are really interested in is the ability to form
* context vectors
* a context vector is the sum of all (L1?) vectors that the word
* has been encountered in context with*[2]. from these context vectors
* certain patterns and relationships between words should emerge.
* what patterns? that is the key question we will try to answer
*
* [1] a word produces the same vector each time it is encountered only
* if the environment is the same, i.e. the RIVs have the same dimensionality
* and the nonzero count is the same. comparing vectors produced in different
* environments yields meaningless drivel and should be avoided
*
* [2] what exactly "context" means remains a major stumbling point.
* paragraphs? sentences? some potential analyses would expect a static
* sized context (the nearest 10 words?) in order to be sensible, but
* it may be that some other definition of context is the most valid for
* this model. we will have to find out.
*
* some notes:
*
* -sparseRIV vs. denseRIV (sparse vector vs. dense vector)
* the two primary data structures we will use to analyze these vectors
* each vector type is packed with some metadata
* (name, magnitude, frequency, flags)
*
* -denseRIV is a standard vector representation.
* each array index corresponds to a dimension
* each value corresponds to a measurement in that dimension
*
* -sparseRIV is a vector representation optimized for largely empty vectors
* each data point is a location/value pair where the
* location represents array index
* value represents value in that array index
*
* if we have a sparsely populated dense vector (mostly 0s) such as:
*
* |0|0|5|0|0|0|0|0|4|0|
*
* there are only 2 values in a ten element array. this could, instead
* be represented as
*
* |2|8| array indexes
* |5|4| array values
* |2| record of size
*
* and so, a 10 element vector has been represented in only 5 integers
*
* this is important for memory use, of course, but also for rapid calculations
* if we have two vectors
*
* |0|0|5|0|0|0|0|0|4|0|
* |0|0|0|0|0|0|7|0|3|-2|
* and we wish to perform the dot product this will take 10 steps,
* 9 of which are either 0*0 = 0, or 0*x = 0
* if we instead have these represented as sparse vectors
* |2|8|
* |5|4|
* |2|
*
* |6|8|9|
* |7|3|-2|
* |3|
*
* we only need to search for matching location values
* or, better yet, if we use a hybrid analysis:
* |0|0|5|0|0|0|0|0|4|0|
* ___________/__/_/
* / / /
* |6|8|9|
* |7|3|-2|
* |3|
* we can simply access the dense vector by indexes held in the sparse vector
* reducing this operation to only 3 steps
*/
/* lexPush writes a denseRIV to a file for permanent storage */ /* lexPush writes a denseRIV to a file for permanent storage */
int lexPush(denseRIV RIVout); int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon") /* lexPull reads an existing lexicon entry (under directory "lexicon")
...@@ -18,19 +107,28 @@ denseRIV lexPull(char* word); ...@@ -18,19 +107,28 @@ denseRIV lexPull(char* word);
* word contained * word contained
*/ */
sparseRIV fileToL2(FILE *input); sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only words /* fileToL2Clean operates the same as fileToL2 but keeps only words
* containing lowercase letters and the '_' symbol * containing lowercase letters and the '_' symbol
* this is important if you will be lexPush-ing those words later * this is important if you will be lexPush-ing those words later
*/ */
sparseRIV fileToL2Clean(FILE *data); sparseRIV fileToL2Clean(FILE *data);
/*fileToL2direct is an experiment in simplifying the process. it's slow */
sparseRIV fileToL2direct(FILE *data); sparseRIV fileToL2direct(FILE *data);
/*cosine determines the "similarity" between two RIVs. */ /*cosine determines the "similarity" between two RIVs. */
float cosCompare(denseRIV baseRIV, sparseRIV comparator); float cosCompare(denseRIV baseRIV, sparseRIV comparator);
/*currently unused */
sparseRIV wordtoL2(char* word); sparseRIV wordtoL2(char* word);
/* converts an implicit RIV (a set of unvalued locations) into a formal
* sparse RIV. this chooses the best method to perform the consolidation
* and launches that function */
sparseRIV consolidateI2S(int *implicit, size_t valueCount); sparseRIV consolidateI2S(int *implicit, size_t valueCount);
/* like fileToL2 but takes a block of text */
sparseRIV text2L2(char *text); sparseRIV text2L2(char *text);
sparseRIV text2L2(char *text){ sparseRIV text2L2(char *text){
unsigned int blockSize; unsigned int blockSize;
...@@ -67,7 +165,8 @@ sparseRIV text2L2(char *text){ ...@@ -67,7 +165,8 @@ sparseRIV text2L2(char *text){
} }
sparseRIV output = consolidateI2S(locations, locationCount); sparseRIV output = consolidateI2S(locations, locationCount);
/* frequency records the number of words in this file */ /* frequency records the number of words in this file, until frequency
* is needed to hold some more useful data point */
output.frequency = locationCount/NONZEROS; output.frequency = locationCount/NONZEROS;
output.boolean = 1; output.boolean = 1;
return output; return output;
...@@ -158,16 +257,18 @@ sparseRIV fileToL2Clean(FILE *data){ ...@@ -158,16 +257,18 @@ sparseRIV fileToL2Clean(FILE *data){
} }
sparseRIV consolidateI2S(int *implicit, size_t valueCount){ sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount>RIVKey.I2SThreshold){ if(valueCount<RIVKey.I2SThreshold){
return consolidateI2SIndirect(implicit, valueCount); /* direct method is faster on small datasets, but has geometric scaling on large datasets */
}else{
return consolidateI2SDirect(implicit, valueCount); return consolidateI2SDirect(implicit, valueCount);
}else{
/* optimized for large datasets */
return consolidateI2SIndirect(implicit, valueCount);
} }
} }
void aggregateWord2D(denseRIV destination, char* word){ void aggregateWord2D(denseRIV destination, char* word){
//makeSparseLocations((unsigned char*)word, locationSlot, 0);
srand(wordtoSeed((unsigned char*)word)); srand(wordtoSeed((unsigned char*)word));
for(int i=0; i<NONZEROS; i++){ for(int i=0; i<NONZEROS; i++){
...@@ -185,11 +286,13 @@ float cosCompare(denseRIV baseRIV, sparseRIV comparator){ ...@@ -185,11 +286,13 @@ float cosCompare(denseRIV baseRIV, sparseRIV comparator){
int *locations_Stop = locations+comparator.count; int *locations_Stop = locations+comparator.count;
while(locations<locations_Stop){ while(locations<locations_Stop){
/* we calculate the dot-product to derive the cosine */ /* we calculate the dot-product to derive the cosine
* comparing sparse to dense by index*/
dot += (*values)*(*(baseRIV.values+(*locations))); dot += (*values)*(*(baseRIV.values+(*locations)));
locations++; locations++;
values++; values++;
} }
/*dot divided by product of magnitudes */
float cosine = dot/(baseRIV.magnitude*comparator.magnitude); float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
return cosine; return cosine;
...@@ -221,9 +324,9 @@ denseRIV lexPull(char* word){ ...@@ -221,9 +324,9 @@ denseRIV lexPull(char* word){
return RIVKey.RIVCache[hash]; return RIVKey.RIVCache[hash];
} }
#endif /* CACHESIZE > 0 */ #endif /* CACHESIZE > 0 */
denseRIV output;
/* if not, attempt to pull the word data from lexicon file */
denseRIV output;
char pathString[200]; char pathString[200];
...@@ -236,7 +339,7 @@ denseRIV lexPull(char* word){ ...@@ -236,7 +339,7 @@ denseRIV lexPull(char* word){
output = fLexPull(lexWord); output = fLexPull(lexWord);
fclose(lexWord); fclose(lexWord);
}else{ }else{
/*if file does not exist, return a 0 vector */ /*if file does not exist, return a 0 vector (word is new to the lexicon) */ //#TODO enable NO-NEW features to protect mature lexicons?
output = denseAllocate(); output = denseAllocate();
} }
...@@ -244,15 +347,16 @@ denseRIV lexPull(char* word){ ...@@ -244,15 +347,16 @@ denseRIV lexPull(char* word){
return output; return output;
} }
int lexPush(denseRIV RIVout){ int lexPush(denseRIV RIVout){
//printf("%s\n", (*RIVout).name);
#if CACHESIZE == 0 #if CACHESIZE == 0
/* if there is no cache, simply push to file */
fLexPush(RIVout); fLexPush(RIVout);
return 0; return 0;
#else /* CACHESIZE != 0 */ #else /* CACHESIZE != 0 */
/* if our RIV was cached, there are two options (hopefully) /* if our RIV was cached, there are two options (hopefully)
* either the RIV is still cached, and the data has been updated to the cache * either the RIV is still cached, and the data has been updated
* or the RIV was pushed out from under it, in which case it has already been pushed*/ * to the cache or the RIV was pushed out from under it,
* in which case it has already been pushed! move on*/
if(RIVout.cached){ if(RIVout.cached){
return 0; return 0;
...@@ -262,17 +366,16 @@ int lexPush(denseRIV RIVout){ ...@@ -262,17 +366,16 @@ int lexPush(denseRIV RIVout){
int hash = rand()%CACHESIZE; int hash = rand()%CACHESIZE;
if(!RIVKey.RIVCache[hash].cached){ if(!RIVKey.RIVCache[hash].cached){
/* if there is no word in this cache slot, push to cache instead of file */
RIVKey.RIVCache[hash] = RIVout; RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1; RIVKey.RIVCache[hash].cached = 1;
return 0; return 0;
/*if the current RIV is more frequent than the RIV holding it's slot */ /*if the current RIV is more frequent than the RIV holding its slot */
}else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){ }else if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency) ){
//scanf("%f", &(*RIVout).magnitude);
//printf("%s replacing %s\n", (*RIVout).name, RIVKey.RIVCache[hash].name);
/* push the current cache entry to a file */ /* push the current cache entry to a file */
int diag = fLexPush(RIVKey.RIVCache[hash]); int diag = fLexPush(RIVKey.RIVCache[hash]);
/* replace the cache entry with the currrent RIV */ /* push the current RIV to cache */
RIVKey.RIVCache[hash] = RIVout; RIVKey.RIVCache[hash] = RIVout;
RIVKey.RIVCache[hash].cached = 1; RIVKey.RIVCache[hash].cached = 1;
......
File added
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtoolsCPUlinux.h"
void directoryToL2s(char *rootString);
/* program entry point: initializes the RIV environment, then reports on
 * every entry found under the "lexicon/" directory tree */
int main(){
	/* root path carries its trailing '/' because directoryToL2s builds
	 * child paths by appending entry names directly to it */
	char lexiconRoot[] = "lexicon/";

	RIVInit();
	directoryToL2s(lexiconRoot);

	return 0;
}
/* directoryToL2s walks the directory tree rooted at rootString and, for each
 * non-directory entry, pulls the entry with lexPull, consolidates the dense
 * values into a sparseRIV and prints the entry's path and its nonzero count
 * (labelled "saturation").
 *
 * rootString must end with '/', child paths are built by appending the
 * entry name directly to it.
 * entries whose name begins with '.' (including "." and "..") are skipped.
 * subdirectories are descended into recursively and are not themselves
 * treated as lexicon files.
 *
 * nothing is returned; results are reported on stdout only
 */
void directoryToL2s(char *rootString){
	sparseRIV fileRIV;
	char pathString[2000];
	DIR *directory;
	struct dirent *files = 0;

	if(!(directory = opendir(rootString))){
		printf("location not found, %s\n", rootString);
		return;
	}

	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;

		/* NOTE(review): d_type may be DT_UNKNOWN on filesystems that do
		 * not report entry types; such entries are treated as files here,
		 * as before */
		int isDirectory = (files->d_type == DT_DIR);

		/* build the child path with a bounded write; directories get a
		 * trailing '/' so they can serve as the next rootString */
		int written = snprintf(pathString, sizeof pathString, "%s%s%s",
				rootString, files->d_name, isDirectory ? "/" : "");
		if(written < 0 || (size_t)written >= sizeof pathString){
			printf("path too long, skipping %s%s\n", rootString, files->d_name);
			continue;
		}

		if(isDirectory){
			directoryToL2s(pathString);
			/* a directory is not a lexicon entry, move on to the next one */
			continue;
		}

		/* the file is opened only to confirm that it is readable */
		FILE *input = fopen(pathString, "r");
		if(!input){
			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
			/* release the directory handle before abandoning the walk */
			closedir(directory);
			return;
		}

		/* NOTE(review): lexPull is documented as reading an entry under
		 * directory "lexicon" by word; confirm that it accepts a full path
		 * such as this one */
		denseRIV temp = lexPull(pathString);
		fileRIV = consolidateD2S(temp.values);

		/* NOTE(review): assumes fileRIV.name can hold a path of up to
		 * sizeof pathString bytes - TODO confirm against sparseRIV */
		strcpy(fileRIV.name, pathString);

		float count = fileRIV.count;
		printf("%s, saturation: %f\n", fileRIV.name, count);

		fclose(input);
		/* NOTE(review): presumes temp.values is owned by this caller;
		 * verify this holds when lexPull serves the RIV from its cache */
		free(temp.values);
		/* NOTE(review): fileRIV.locations is not released here; confirm
		 * whether consolidateD2S hands ownership of it to the caller */
	}

	closedir(directory);
}
File added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment