Commit 9c6ceacf by simetk

added direct addition of barcode vectors to dense vectors, began experimentation with morphic RIVs

parent 49b28180
Showing with 626 additions and 196 deletions

Too many changes to show.

To preserve performance only 1000 of 1000+ files are displayed.

...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <string.h> #include <string.h>
#include <signal.h> #include <signal.h>
#include <unistd.h> #include <unistd.h>
#include <math.h>
/* RIVSIZE macro defines the dimensionality off the RIVs we will use /* RIVSIZE macro defines the dimensionality off the RIVs we will use
* 25000 is the standard, but can be redefined specifically * 25000 is the standard, but can be redefined specifically
*/ */
...@@ -64,6 +65,7 @@ typedef struct{ ...@@ -64,6 +65,7 @@ typedef struct{
struct RIVData{ struct RIVData{
size_t RIVsize; size_t RIVsize;
int nonZeros; int nonZeros;
int I2SThreshold;
int *h_tempBlock; int *h_tempBlock;
int tempSize; int tempSize;
int thing; int thing;
...@@ -75,7 +77,6 @@ struct RIVData{ ...@@ -75,7 +77,6 @@ struct RIVData{
* it sets global variables that practically all functions will reference, * it sets global variables that practically all functions will reference,
* it checks that your base parameters are valid, and allocates memory for * it checks that your base parameters are valid, and allocates memory for
* the functions to use, so that we can move fast with rare allocations. * the functions to use, so that we can move fast with rare allocations.
* #TODO add signal redefinitions so that cache is saved even on crash
*/ */
void RIVInit(); void RIVInit();
...@@ -85,7 +86,7 @@ void RIVInit(); ...@@ -85,7 +86,7 @@ void RIVInit();
void RIVCleanup(); void RIVCleanup();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with /*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does nto automatically carry metadata, which must be assigned * all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't * to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* need to carry metadata * need to carry metadata
*/ */
...@@ -94,7 +95,7 @@ sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion ...@@ -94,7 +95,7 @@ sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* mapS2D expands a sparseRIV out to denseRIV values, filling array locations /* mapS2D expands a sparseRIV out to denseRIV values, filling array locations
* based on location-value pairs * based on location-value pairs
*/ */
int* mapS2D(int* destination, sparseRIV input); //#TODO fix int*/denseRIV confusion
/* makeSparseLocations must be called repeatedly in the processing of a /* makeSparseLocations must be called repeatedly in the processing of a
* file to produce a series of locations from the words of the file * file to produce a series of locations from the words of the file
...@@ -110,7 +111,7 @@ void makesparseLocations(unsigned char* word, int *seeds, size_t seedCount); ...@@ -110,7 +111,7 @@ void makesparseLocations(unsigned char* word, int *seeds, size_t seedCount);
*/ */
int fLexPush(denseRIV RIVout); int fLexPush(denseRIV RIVout);
denseRIV fLexPull(FILE* lexWord, denseRIV output); denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */ /* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word); int wordtoSeed(unsigned char* word);
...@@ -121,29 +122,16 @@ int wordtoSeed(unsigned char* word); ...@@ -121,29 +122,16 @@ int wordtoSeed(unsigned char* word);
*/ */
int* mapI2D(int *locations, size_t seedCount); int* mapI2D(int *locations, size_t seedCount);
int* addI2D(int* destination, int* locations, size_t seedCount);
int cacheDump();
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
int cacheDump();
int* addI2D(int* destination, int* locations, size_t seedCount);
denseRIV denseAllocate();
void signalSecure(int signum, siginfo_t *si, void* arg); void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */ /* begin definitions */
int* mapS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
	/* wipe the destination so that every index not named by the sparse
	 * input ends up as 0 */
	memset(destination, 0, RIVKey.RIVsize*sizeof(int));
	/* walk the location/value pairs in lockstep and drop each value
	 * at the index its location names */
	int *location = input.locations;
	int *value = input.values;
	for(int *location_end = location+input.count; location<location_end; location++, value++){
		destination[*location] = *value;
	}
	return destination;
}
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
...@@ -160,6 +148,8 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete ...@@ -160,6 +148,8 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
return destination; return destination;
} }
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int *destination = (int*)calloc(RIVKey.RIVsize,sizeof(int)); int *destination = (int*)calloc(RIVKey.RIVsize,sizeof(int));
int *locations_slider = locations; int *locations_slider = locations;
...@@ -193,12 +183,54 @@ int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix de ...@@ -193,12 +183,54 @@ int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix de
return destination; return destination;
} }
/* consolidateI2SIndirect turns an implicit RIV (a list of locations) into a
 * sparseRIV by way of a temporary dense vector: the locations are expanded
 * with mapI2D, the dense result is consolidated with consolidateD2S, and the
 * temporary dense vector is released before returning
 */
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
	int *expanded = mapI2D(implicit, valueCount);
	sparseRIV consolidated = consolidateD2S(expanded);
	free(expanded);
	return consolidated;
}
/* consolidateI2SDirect turns an implicit RIV (a list of locations, where
 * even positions contribute +1 and odd positions contribute -1) into a
 * sparseRIV without building a dense vector.
 * each incoming location is searched for among the locations already seen;
 * the cost grows with the square of valueCount, so this is meant for short
 * inputs only.
 * scratch space is taken from the second and third RIVsize-sized segments
 * of RIVKey.h_tempBlock.
 * the returned locations/values share one malloc'd block (values start
 * right after locations); on allocation failure, or when nothing survives,
 * count is 0 and values is NULL
 */
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
	sparseRIV sparseOut;
	int *locationsTemp = RIVKey.h_tempBlock+RIVKey.RIVsize;
	int *valuesTemp = RIVKey.h_tempBlock+2*RIVKey.RIVsize;
	sparseOut.count = 0;
	int add = 1;
	for(size_t i=0; i<valueCount; i++){
		int found = 0;
		for(int j=0; j<sparseOut.count; j++){
			if(implicit[i] == locationsTemp[j]){
				/* accumulate into the slot of the matching location (j),
				 * not the position of the incoming value (i) */
				valuesTemp[j] += add;
				found = 1;
				/* stored locations are unique, no need to look further */
				break;
			}
		}
		if(!found){
			locationsTemp[sparseOut.count] = implicit[i];
			valuesTemp[sparseOut.count] = add;
			sparseOut.count++;
		}
		/* the sign alternates with every implicit entry, found or not */
		add *= -1;
	}
	/* drop any location whose contributions cancelled out to 0, so the
	 * result holds no zero values (as consolidateD2S describes for its output) */
	int kept = 0;
	for(int j=0; j<sparseOut.count; j++){
		if(valuesTemp[j]){
			locationsTemp[kept] = locationsTemp[j];
			valuesTemp[kept] = valuesTemp[j];
			kept++;
		}
	}
	sparseOut.count = kept;
	sparseOut.locations = malloc(2*kept*sizeof(int));
	if(!sparseOut.locations){
		sparseOut.count = 0;
		sparseOut.values = NULL;
		return sparseOut;
	}
	sparseOut.values = sparseOut.locations+kept;
	/* the two scratch arrays are RIVsize ints apart, not adjacent, so they
	 * must be copied separately into the compact output block */
	memcpy(sparseOut.locations, locationsTemp, kept*sizeof(int));
	memcpy(sparseOut.values, valuesTemp, kept*sizeof(int));
	return sparseOut;
}
sparseRIV consolidateD2S(int *denseInput){ sparseRIV consolidateD2S(int *denseInput){
sparseRIV output; sparseRIV output;
output.count = 0; output.count = 0;
/* key/value pairs will be loaded to a worst-case sized temporary slot */ /* key/value pairs will be loaded to a worst-case sized temporary slot */
int* locations = RIVKey.h_tempBlock; int* locations = RIVKey.h_tempBlock+RIVKey.RIVsize;
int* values = RIVKey.h_tempBlock+RIVKey.RIVsize; int* values = locations+RIVKey.RIVsize;
int* locations_slider = locations; int* locations_slider = locations;
int* values_slider = values; int* values_slider = values;
for(int i=0; i<RIVKey.RIVsize; i++){ for(int i=0; i<RIVKey.RIVsize; i++){
...@@ -236,7 +268,7 @@ sparseRIV consolidateD2S(int *denseInput){ ...@@ -236,7 +268,7 @@ sparseRIV consolidateD2S(int *denseInput){
void RIVInit(){ void RIVInit(){
RIVKey.RIVsize = RIVSIZE; //#TODO decide about macros vs global variables RIVKey.RIVsize = RIVSIZE; //#TODO decide about macros vs global variables
RIVKey.nonZeros = NONZEROS; RIVKey.nonZeros = NONZEROS;
RIVKey.I2SThreshold = sqrt(RIVSIZE);
if(RIVKey.nonZeros%2){ if(RIVKey.nonZeros%2){
printf("your NONZEROS value must be an even number"); printf("your NONZEROS value must be an even number");
RIVKey.nonZeros++; RIVKey.nonZeros++;
...@@ -248,9 +280,9 @@ void RIVInit(){ ...@@ -248,9 +280,9 @@ void RIVInit(){
struct sigaction action; struct sigaction action;
action.sa_sigaction = signalSecure; action.sa_sigaction = signalSecure;
action.sa_flags = SA_SIGINFO; action.sa_flags = SA_SIGINFO;
for(int i=1; i<27; i++){ //for(int i=1; i<27; i++){
sigaction(i,&action,NULL); sigaction(11,&action,NULL);
} //}
RIVKey.h_tempBlock = (int*)malloc(3*RIVKey.RIVsize*sizeof(int)); RIVKey.h_tempBlock = (int*)malloc(3*RIVKey.RIVsize*sizeof(int));
RIVKey.tempSize = 3*RIVKey.RIVsize; RIVKey.tempSize = 3*RIVKey.RIVsize;
...@@ -286,11 +318,11 @@ int wordtoSeed(unsigned char* word){ ...@@ -286,11 +318,11 @@ int wordtoSeed(unsigned char* word){
void makeSparseLocations(unsigned char* word, int *locations, size_t count){ void makeSparseLocations(unsigned char* word, int *locations, size_t count){
locations+=count; locations+=count;
srand(wordtoSeed(word)); srand(wordtoSeed(word));
for(int i=0; i<RIVKey.nonZeros; i++){ int *locations_stop = locations+RIVKey.nonZeros;
/* unrolled for speed, gauranteed to be an even number of steps */ while(locations<locations_stop){
/* unrolled for speed, guaranteed to be an even number of steps */
*locations = rand()%RIVKey.RIVsize; *locations = rand()%RIVKey.RIVsize;
locations++; locations++;
i++;
*locations = rand()%RIVKey.RIVsize; *locations = rand()%RIVKey.RIVsize;
locations++; locations++;
} }
...@@ -319,17 +351,19 @@ int fLexPush(denseRIV RIVout){ ...@@ -319,17 +351,19 @@ int fLexPush(denseRIV RIVout){
return 0; return 0;
} }
denseRIV fLexPull(FILE* lexWord, denseRIV output){ denseRIV fLexPull(FILE* lexWord){
denseRIV output;
output.values = malloc( (RIVKey.RIVsize+1) *sizeof(int));
output.frequency = output.values+RIVKey.RIVsize;
int diagnostic = 0; int diagnostic = 0;
diagnostic += fread(output.frequency, 1, sizeof(int), lexWord); diagnostic += fread(output.frequency, 1, sizeof(int), lexWord);
diagnostic += fread(&(output.magnitude), 1, sizeof(int), lexWord); diagnostic += fread(&(output.magnitude), 1, sizeof(int), lexWord);
diagnostic += fread(output.values, RIVKey.RIVsize, sizeof(int), lexWord); diagnostic += fread(output.values, RIVKey.RIVsize, sizeof(int), lexWord);
fclose(lexWord);
if(diagnostic != (RIVKey.RIVsize+2)){ if(diagnostic != (RIVKey.RIVsize+2)){
output.magnitude = -1; output.magnitude = -1;
} }
output.cached = 0;
return output; return output;
} }
...@@ -350,9 +384,22 @@ int cacheDump(){ ...@@ -350,9 +384,22 @@ int cacheDump(){
denseRIV* cache_stop = RIVKey.RIVCache+RIVKey.cacheSize; denseRIV* cache_stop = RIVKey.RIVCache+RIVKey.cacheSize;
while(cache_slider<cache_stop){ while(cache_slider<cache_stop){
if((*cache_slider).cached){ if((*cache_slider).cached){
fLexPush(*cache_slider); flag += fLexPush(*cache_slider);
} }
cache_slider++; cache_slider++;
} }
return flag; return flag;
} }
/* denseAllocate builds an empty denseRIV: all values 0, magnitude 0,
 * not cached. the name field is left untouched
 */
denseRIV denseAllocate(){
	denseRIV fresh;
	/* a single zeroed allocation holds RIVsize values plus one extra int */
	int *block = calloc(RIVKey.RIVsize+1, sizeof(int));
	fresh.cached = 0;
	fresh.magnitude = 0;
	fresh.values = block;
	/* for compact memory use, the frequency counter lives in the extra
	 * slot immediately after the values */
	fresh.frequency = block+RIVKey.RIVsize;
	return fresh;
}
/*TODO add a simplified free function*/
No preview for this file type
/* isWordClean rejects any word that contains a character other than a
 * lowercase letter, the space character, or the '_' symbol
 * (uppercase letters, digits and punctuation all make a word unclean)
 */
int isWordClean(char* word);
/* used by wordClean */
int isLetter(char c);
/* isLetter reports whether c is acceptable inside a clean word:
 * a lowercase ASCII letter, a space, or an underscore.
 * returns 1 when accepted, 0 otherwise
 */
int isLetter(char c){
	int lowercase = (c >= 'a') && (c <= 'z');
	int separator = (c == ' ') || (c == '_');
	return (lowercase || separator) ? 1 : 0;
}
/* isWordClean returns 1 when every inspected character of word passes
 * isLetter, and 0 as soon as one does not.
 * inspection stops at the string terminator or after 99 characters,
 * whichever comes first
 */
int isWordClean(char* word){
	for(int i=0; i<99 && word[i]; i++){
		if(!isLetter(word[i])){
			return 0;
		}
	}
	return 1;
}
No preview for this file type
No preview for this file type
...@@ -5,7 +5,8 @@ ...@@ -5,7 +5,8 @@
#define RIVSIZE 5000 #define RIVSIZE 5000
#define CACHESIZE 0 #define CACHESIZE 0
#define THRESHOLD 0.70 #define NONZEROS 2
#define THRESHOLD 0.7
#define COSINEACTION do {\ #define COSINEACTION do {\
if(cosine > THRESHOLD){ \ if(cosine > THRESHOLD){ \
printf("%s\t%s\n%f\n", baseRIV.name, (*multipliers).name, cosine);\ printf("%s\t%s\n%f\n", baseRIV.name, (*multipliers).name, cosine);\
...@@ -33,14 +34,49 @@ int main(int argc, char *argv[]){ ...@@ -33,14 +34,49 @@ int main(int argc, char *argv[]){
directoryToL2s(rootString, &fileRIVs, &fileCount); directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount); printf("fileCount: %d\n", fileCount);
getMagnitudes(fileRIVs, fileCount);
sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
fileRIVs_slider++;
}
clock_t beginnsquared = clock(); clock_t beginnsquared = clock();
for(int i=1; i<fileCount; i++){ float cosine;
if(fileRIVs[i].boolean){ float minmag;
cosineCompare(fileRIVs[i], fileRIVs, i); float maxmag;
denseRIV baseDense;
baseDense.values = malloc(RIVKey.RIVsize*sizeof(int));
fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider;
while(fileRIVs_slider<fileRIVs_stop){
comparators_slider = fileRIVs;
memset(baseDense.values, 0, RIVKey.RIVsize*sizeof(int));
baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
baseDense.magnitude = (*fileRIVs_slider).magnitude;
minmag = baseDense.magnitude*.85;
maxmag = baseDense.magnitude*1.15;
while(comparators_slider < fileRIVs_slider){
if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){
cosine = cosCompare(baseDense, *comparators_slider);
if(cosine>THRESHOLD){
printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine);
(*comparators_slider).boolean = 0;
RIVKey.thing++;
}
}
comparators_slider++;
//cosineCompare(fileRIVs[i], fileRIVs, i);
} }
fileRIVs_slider++;
} }
clock_t endnsquared = clock(); clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC; double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
...@@ -85,7 +121,7 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){ ...@@ -85,7 +121,7 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
}else{ }else{
(*fileRIVs) = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV)); (*fileRIVs) = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
(*fileRIVs)[(*fileCount)] = fileToL2Clean(input); (*fileRIVs)[(*fileCount)] = fileToL2(input);
strcpy((*fileRIVs)[(*fileCount)].name, pathString); strcpy((*fileRIVs)[(*fileCount)].name, pathString);
fclose(input); fclose(input);
......
No preview for this file type
No preview for this file type
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#define CACHESIZE 1000 #define CACHESIZE 10000
#include "RIVtoolsCPUlinux.h" #include "RIVtoolsCPUlinux.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
......
File deleted
No preview for this file type
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLower.h"
#include "RIVaccessories.h"
/* lexPush writes a denseRIV to a file for permanent storage */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
* and creates a denseRIV with those attributes.
* if the file does not exist, it creates a 0 vector with the name of word
*/
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each
* word contained
*/
sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only words
* containing lowercase letters and the '_' symbol
* this is important if you will be lexPush-ing those words later
*/
sparseRIV fileToL2Clean(FILE *data);
sparseRIV fileToL2direct(FILE *data);
/*cosine determines the "similarity" between two RIVs. */
float cosCompare(denseRIV baseRIV, sparseRIV comparator);
sparseRIV wordtoL2(char* word);
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
sparseRIV text2L2(char *text);
/* text2L2 reads whitespace-delimited words from a string and returns a
 * sparse RIV which is the vector sum of the base RIVs of each word.
 * words longer than 99 characters are split into 99-character pieces.
 * output.frequency is the number of words processed; output.boolean is 1
 */
sparseRIV text2L2(char *text){
	char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	int displacement = 0;
	/* sscanf yields 1 only when a word was stored; at the end of the text
	 * it yields EOF (non-zero), so the result must be compared against 1 */
	while(sscanf(text, "%99s%n", word, &displacement) == 1){
		/* %n counts the skipped whitespace plus the word, so this lands on
		 * the character right after the word and can never step past the
		 * string terminator */
		text += displacement;
		int blockSize = locationCount+RIVKey.nonZeros;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				/* the old block is still valid: keep what was accumulated */
				fprintf(stderr, "text2L2: out of memory, remaining words ignored\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			RIVKey.tempSize = blockSize;
			locations = grown;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations((unsigned char*)word, locations, locationCount);
		locationCount += RIVKey.nonZeros;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this text */
	output.frequency = locationCount/RIVKey.nonZeros;
	output.boolean = 1;
	return output;
}
/* fileToL2 reads whitespace-delimited words from data until no further
 * word can be read, and returns a sparse RIV which is the vector sum of
 * the base RIVs of each word.
 * words longer than 99 characters are split into 99-character pieces.
 * output.frequency is the number of words processed; output.boolean is 1.
 * the stream is left open for the caller
 */
sparseRIV fileToL2(FILE *data){
	unsigned char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	/* fscanf yields 1 only when a word was stored. testing for that (rather
	 * than testing feof after the call) keeps the last word of a file that
	 * does not end in whitespace, and also ends the loop on a read error */
	while(fscanf(data, "%99s", word) == 1){
		int blockSize = locationCount+RIVKey.nonZeros;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				/* the old block is still valid: keep what was accumulated */
				fprintf(stderr, "fileToL2: out of memory, remaining words ignored\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			RIVKey.tempSize = blockSize;
			locations = grown;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount += RIVKey.nonZeros;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file */
	output.frequency = locationCount/RIVKey.nonZeros;
	output.boolean = 1;
	return output;
}
/* fileToL2Clean reads whitespace-delimited words from data until no
 * further word can be read, skips every word that isWordClean rejects,
 * and returns a sparse RIV which is the vector sum of the base RIVs of the
 * remaining words.
 * output.frequency is the number of words kept; output.boolean is 1.
 * the stream is left open for the caller
 */
sparseRIV fileToL2Clean(FILE *data){
	unsigned char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	/* fscanf yields 1 only when a word was stored. testing for that (rather
	 * than testing feof after the call) keeps the last word of a file that
	 * does not end in whitespace, and also ends the loop on a read error */
	while(fscanf(data, "%99s", word) == 1){
		/* if the word is not clean, skip it */
		if(!isWordClean((char*)word)){
			continue;
		}
		int blockSize = locationCount+RIVKey.nonZeros;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			int *grown = realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			if(!grown){
				/* the old block is still valid: keep what was accumulated */
				fprintf(stderr, "fileToL2Clean: out of memory, remaining words ignored\n");
				break;
			}
			RIVKey.h_tempBlock = grown;
			RIVKey.tempSize = blockSize;
			locations = grown;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount += RIVKey.nonZeros;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file */
	output.frequency = locationCount/RIVKey.nonZeros;
	output.boolean = 1;
	return output;
}
/* consolidateI2S converts an implicit RIV (a list of locations) into a
 * sparseRIV, picking the strategy by input length: inputs no longer than
 * RIVKey.I2SThreshold use consolidateI2SDirect, longer inputs use
 * consolidateI2SIndirect
 */
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
	if(valueCount <= RIVKey.I2SThreshold){
		return consolidateI2SDirect(implicit, valueCount);
	}
	return consolidateI2SIndirect(implicit, valueCount);
}
/* aggregateWord2D adds a word's base RIV straight into a dense vector,
 * without storing the word's locations first.
 * the generator is seeded from the word, then RIVKey.nonZeros locations
 * are drawn in pairs: the first of each pair gets +1, the second gets -1.
 * this matches makeSparseLocations followed by addI2D, which also produce
 * RIVKey.nonZeros entries per word.
 * destination is passed by value but its values pointer is written through
 */
void aggregateWord2D(denseRIV destination, char* word){
	srand(wordtoSeed((unsigned char*)word));
	/* each step consumes two of the word's nonZeros entries */
	for(int i=0; i<RIVKey.nonZeros; i+=2){
		destination.values[(rand()%RIVKey.RIVsize)] += 1;
		destination.values[(rand()%RIVKey.RIVsize)] -= 1;
	}
}
/* cosCompare returns the cosine between a dense RIV and a sparse RIV:
 * their dot product divided by the product of their magnitudes.
 * both magnitude fields must already be filled in by the caller.
 * if either magnitude is 0 the cosine is undefined and 0 is returned
 */
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
	/* accumulate in a wide type so that large values cannot overflow */
	long long dot = 0;
	int *values = comparator.values;
	int *locations = comparator.locations;
	int *locations_Stop = locations+comparator.count;
	while(locations<locations_Stop){
		/* only the sparse RIV's locations can contribute to the dot-product */
		dot += (long long)(*values) * baseRIV.values[*locations];
		locations++;
		values++;
	}
	float denominator = baseRIV.magnitude*comparator.magnitude;
	if(denominator == 0){
		return 0;
	}
	return dot/denominator;
}
/* getMagnitudeSparse returns the euclidean magnitude of a sparse RIV:
 * the square root of the sum of its squared values.
 * input is passed by value, so the caller must store the result in the
 * RIV's magnitude field itself
 */
float getMagnitudeSparse(sparseRIV input){
	unsigned long long int temp = 0;
	int *values = input.values;
	int *values_stop = values+input.count;
	while(values<values_stop){
		/* widen before multiplying so that a large value cannot overflow int */
		temp += (long long)(*values) * (*values);
		values++;
	}
	return sqrt(temp);
}
/* lexPull fetches the denseRIV for word.
 * when a cache is compiled in (CACHESIZE > 0) and the word's slot holds an
 * entry with the same name, that cache entry is returned.
 * otherwise the file "lexicon/<word>" is read with fLexPull; if it cannot be
 * opened (or the path does not fit the path buffer) a 0 vector from
 * denseAllocate is returned instead. in both non-cache cases the name field
 * is set to word
 */
denseRIV lexPull(char* word){
#if CACHESIZE > 0
	/* if there is a cache, first check if the word is cached */
	srand(wordtoSeed((unsigned char*)word));
	int hash = rand()%RIVKey.cacheSize;
	if(!strcmp(word, RIVKey.RIVCache[hash].name)){
		/* if word is cached, pull from cache and exit */
		return RIVKey.RIVCache[hash];
	}
#endif /* CACHESIZE > 0 */
	denseRIV output;
	char pathString[200];
	FILE *lexWord = NULL;
	/* bounded formatting: an over-long word must not overrun pathString.
	 * a truncated path would name the wrong file, so treat it as missing */
	int pathLength = snprintf(pathString, sizeof pathString, "lexicon/%s", word);
	if(pathLength >= 0 && (size_t)pathLength < sizeof pathString){
		lexWord = fopen(pathString, "rb");
	}
	/* if this lexicon file already exists */
	if(lexWord){
		/* pull data from file */
		output = fLexPull(lexWord);
		/* NOTE(review): confirm fLexPull does not close lexWord itself,
		 * otherwise this is a double close */
		fclose(lexWord);
	}else{
		/* if file does not exist, return a 0 vector */
		output = denseAllocate();
	}
	strcpy(output.name, word);
	return output;
}
/* lexPush stores a denseRIV.
 * without a cache (CACHESIZE == 0) it is written straight to file with
 * fLexPush. with a cache, the RIV takes its hash slot if the slot is free
 * or if it is more frequent than the current occupant (which is then
 * written to file); otherwise the RIV itself is written to file.
 * returns 0 when nothing had to be written, otherwise the status returned
 * by fLexPush for the write that was performed
 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
	return fLexPush(RIVout);
#else /* CACHESIZE != 0 */
	/* if our RIV was cached, there are two options (hopefully)
	 * either the RIV is still cached, and the data has been updated to the cache
	 * or the RIV was pushed out from under it, in which case it has already been pushed */
	if(RIVout.cached){
		return 0;
	}
	srand(wordtoSeed((unsigned char*)RIVout.name));
	int hash = rand()%RIVKey.cacheSize;
	if(!RIVKey.RIVCache[hash].cached){
		/* the slot is free: take it */
		RIVKey.RIVCache[hash] = RIVout;
		RIVKey.RIVCache[hash].cached = 1;
		return 0;
	}
	/* if the current RIV is more frequent than the RIV holding its slot */
	if(*(RIVout.frequency) > *(RIVKey.RIVCache[hash].frequency)){
		/* push the current cache entry to a file */
		int diag = fLexPush(RIVKey.RIVCache[hash]);
		/* replace the cache entry with the current RIV */
		RIVKey.RIVCache[hash] = RIVout;
		RIVKey.RIVCache[hash].cached = 1;
		return diag;
	}
	/* push current RIV to file, reporting whether the write succeeded */
	return fLexPush(RIVout);
#endif /* CACHESIZE == 0 */
}
/* fileToL2direct reads whitespace-delimited words from data until no
 * further word can be read, adds each word's base RIV directly into a
 * temporary dense vector with aggregateWord2D, and consolidates that
 * vector into the returned sparse RIV.
 * the dense vector occupies the first RIVsize ints of RIVKey.h_tempBlock.
 * output.frequency is the number of words processed; output.boolean is 1.
 * the stream is left open for the caller
 */
sparseRIV fileToL2direct(FILE *data){
	unsigned char word[100] = {0};
	denseRIV denseTemp;
	/* a temporary dense RIV is stored in the tempBlock */
	denseTemp.values = RIVKey.h_tempBlock;
	/* sized from RIVKey.RIVsize, the same size the helpers below work with */
	memset(denseTemp.values, 0, RIVKey.RIVsize*sizeof(int));
	int count = 0;
	/* fscanf yields 1 only when a word was stored, so count matches the
	 * words actually aggregated and the final word of a file without
	 * trailing whitespace is not lost */
	while(fscanf(data, "%99s", word) == 1){
		count++;
		/* add word's L1 RIV to the accumulating dense RIV */
		aggregateWord2D(denseTemp, (char*)word);
	}
	sparseRIV output = consolidateD2S(denseTemp.values);
	/* frequency records the number of words in this file */
	output.frequency = count;
	output.boolean = 1;
	return output;
}
int* mapS2D(int* destination, sparseRIV input); //#TODO fix int*/denseRIV confusion
int* addI2D(int* destination, int* locations, size_t seedCount);
/* cosine determines the "similarity" between two RIVs. */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount);
/* magnitudes will be used later in cosine comparison */
void getMagnitudes(sparseRIV *inputs, size_t RIVCount);
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unused except in text2l2
sparseRIV text2L2(unsigned char *text);//unused
float* cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount);
/*lexPush writes a denseRIV to a file of the same name, under the directory "lexicon"
* it is up to the programmer to ensure that the name of the RIV is a valid filename
* although it will of course attempt to create the file if it does not exist
*/
/* mapS2D expands a sparseRIV into the value array of a denseRIV:
 * the destination values are reset to 0, then each sparse value is stored
 * at the index named by its location. the sparse frequency is written
 * through destination.frequency.
 * returns the destination's value array.
 * NOTE(review): destination is passed by value, so the magnitude assignment
 * below (and the name copy, if name is an array member) only changes the
 * local copy - confirm whether callers expect to see them.
 * NOTE(review): a prototype elsewhere in this file declares the first
 * parameter as int*; the two need to be reconciled
 */
int* mapS2D(denseRIV destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
	/* make sure our destination is a 0 vector */
	memset(destination.values, 0, RIVKey.RIVsize*sizeof(int));
	int *locations_slider = input.locations;
	int *values_slider = input.values;
	int *locations_stop = locations_slider+input.count;
	/* apply values at an index based on locations */
	while(locations_slider<locations_stop){
		/* index the value array, not the struct itself */
		destination.values[*locations_slider] = *values_slider;
		locations_slider++;
		values_slider++;
	}
	strcpy(destination.name, input.name);
	*(destination.frequency) = input.frequency;
	destination.magnitude = input.magnitude;
	return destination.values;
}
/* addI2D adds an implicit RIV (a list of locations) onto a dense vector.
 * locations are consumed in pairs: the first of each pair adds 1 at its
 * index, the second subtracts 1 at its index.
 * valueCount is the number of locations and is expected to be even.
 * returns destination
 */
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
	for(size_t pair=0; pair<valueCount; pair+=2){
		destination[locations[pair]] += 1;
		destination[locations[pair+1]] -= 1;
	}
	return destination;
}
/* cosineCompare compares one sparse RIV against an array of sparse RIVs.
 * baseRIV is expanded into the start of RIVKey.h_tempBlock, then for each
 * multiplier that passes the filter below, the cosine is computed, stored
 * in the results array, and handed to the COSINEACTION macro.
 * returns a calloc'd array of multiplierCount floats. results are written
 * consecutively, one per multiplier that passed the filter, so a result's
 * index does not in general match the multiplier's index; unused trailing
 * entries stay 0.
 * NOTE(review): nothing here frees the array - presumably the caller must.
 * NOTE(review): the calloc result is not checked before it is written to.
 */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount){
float *results = calloc(multiplierCount, sizeof(float));
float* results_slider = results;
/* the dense form of baseRIV lives in the shared temp block */
int *baseDenseRIV = RIVKey.h_tempBlock;
memset(RIVKey.h_tempBlock, 0, RIVKey.RIVsize*sizeof(int));
addS2D(baseDenseRIV, baseRIV);
/* the name "cosine" (like baseRIV and multipliers) is referenced by the
 * COSINEACTION macro, so these names must not change */
float cosine;
sparseRIV *multipliersStop = multipliers+multiplierCount;
/* if two vectors are too different in size, we can ignore the risk of similarity */
/* the accepted window is 85% to 115% of the base magnitude */
float minsize = baseRIV.magnitude * .85;
float maxsize = baseRIV.magnitude * 1.15;
int dot = 0;
int *values;
int *locations;
int *locations_Stop;
/* check the baseRIV against each multiplier */
while(multipliers<multipliersStop){
/* skip a pair if the multiplier has already been culled, or if
* the size difference is too great */
if(((*multipliers).boolean)
&& (((*multipliers).magnitude < maxsize)
&& ((*multipliers).magnitude > minsize))){
dot = 0;
values = (*multipliers).values;
locations = (*multipliers).locations;
locations_Stop = locations+(*multipliers).count;
while(locations<locations_Stop){
/* we calculate the dot-product to derive the cosine */
dot += (*values)*(*(baseDenseRIV+(*locations)));
locations++;
values++;
}
/* magnitudes had better already be calculated at this point*/
/* NOTE(review): a magnitude of 0 here divides by zero */
cosine = dot/((baseRIV.magnitude)*((*multipliers).magnitude));
*results_slider = cosine;
results_slider++;
/* perform the action defined by the COSINEACTION macro */
COSINEACTION;
}
multipliers++;
}
return results;
}
/* cosineCompareUnbound compares one sparse RIV against every RIV in an
 * array, with no filtering on magnitude or on the boolean flag.
 * baseRIV is expanded into the start of RIVKey.h_tempBlock, then for each
 * multiplier the cosine is computed, stored at the multiplier's index in
 * the results array, and handed to the COSINEACTION macro.
 * returns a calloc'd array of multiplierCount floats, or NULL if the
 * allocation failed.
 * NOTE(review): nothing here frees the array - presumably the caller must.
 */
float* cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount){
	float *results = calloc(multiplierCount, sizeof(float));
	if(!results){
		return NULL;
	}
	float* results_slider = results;
	/* the dense form of baseRIV lives in the shared temp block */
	int *baseDenseRIV = RIVKey.h_tempBlock;
	memset(RIVKey.h_tempBlock, 0, RIVKey.RIVsize*sizeof(int));
	addS2D(baseDenseRIV, baseRIV);
	/* the names cosine, baseRIV and multipliers are referenced by the
	 * COSINEACTION macro, so they must not change */
	float cosine;
	sparseRIV *multipliersStop = multipliers+multiplierCount;
	int dot = 0;
	int *values;
	int *locations;
	int *locations_Stop;
	/* check the baseRIV against each multiplier */
	while(multipliers<multipliersStop){
		dot = 0;
		values = (*multipliers).values;
		locations = (*multipliers).locations;
		locations_Stop = locations+(*multipliers).count;
		while(locations<locations_Stop){
			/* we calculate the dot-product to derive the cosine */
			dot += (*values)*(*(baseDenseRIV+(*locations)));
			locations++;
			values++;
		}
		/* magnitudes had better already be calculated at this point*/
		cosine = dot/((baseRIV.magnitude)*((*multipliers).magnitude));
		*results_slider = cosine;
		results_slider++;
		/* perform the action defined by the COSINEACTION macro */
		COSINEACTION;
		/* advance inside the loop: without this the loop never terminates
		 * and results is written past its end */
		multipliers++;
	}
	return results;
}
/* getMagnitudes fills in the magnitude field of each of the RIVCount
 * sparse RIVs in inputs: the square root of the sum of its squared values
 */
void getMagnitudes(sparseRIV *inputs, size_t RIVCount){
	for(size_t i=0; i<RIVCount; i++){
		unsigned long long int temp = 0;
		int *values = inputs[i].values;
		int *values_stop = values+inputs[i].count;
		while(values<values_stop){
			/* widen before multiplying so that a large value cannot overflow int */
			temp += (long long)(*values) * (*values);
			values++;
		}
		inputs[i].magnitude = sqrt(temp);
	}
}
File added
File added
project gutenberg scientific american supplement no
\ No newline at end of file
title scientific american supplement no february
\ No newline at end of file
rate feet work
\ No newline at end of file
cities herculaneum pompeii several smaller towns on slope
\ No newline at end of file
mountain destroyed lava buried under mass
\ No newline at end of file
pumice stones ashes second
\ No newline at end of file
contiguous mountains in iceland in two enormous lava
\ No newline at end of file
streams one miles wide over ft deep other
\ No newline at end of file
scarcely inferior flowed first miles other till
\ No newline at end of file
reached sea pouring flood white hot lava
\ No newline at end of file
ocean destroying in paths killing in water
\ No newline at end of file
ocean fish mainstay inhabitants who
\ No newline at end of file
reduced disaster directly indirectly less
\ No newline at end of file
former strength third
\ No newline at end of file
in devastated such an immense area in java but all
\ No newline at end of file
eruptions known besides as mere childs play terrible one
\ No newline at end of file
krakatoa in
\ No newline at end of file
reader will examine map east indies will find
\ No newline at end of file
represented in straits lie between sumatra
\ No newline at end of file
java little island krakatoa in maps make before will
\ No newline at end of file
hunt in vain name like bull run before then
\ No newline at end of file
unknown fame though navigators who passed through straits knew
\ No newline at end of file
vi educational competitive examinations
\ No newline at end of file
as beautiful tropical isle an extinct volcanic cone in
\ No newline at end of file
center in beginning however little well behaved
\ No newline at end of file
island showed symptoms wrath boded no good larger
\ No newline at end of file
islands in vicinity noted fine fruits
\ No newline at end of file
abounded famous picnic ground towns cities even
\ No newline at end of file
miles away subterranean rumblings mutterings
\ No newline at end of file
wrath became conspicuous people capital java
\ No newline at end of file
put steamboat requisition visited island in large
\ No newline at end of file
number time island constantly in slight tremor
\ No newline at end of file
subterranean roar like continue but distant mutterings
\ No newline at end of file
interesting details famous examinations
\ No newline at end of file
thunder but crisis reached august am
\ No newline at end of file
beautiful sunday morning water straits
\ No newline at end of file
like sea glass as clear as crystal john in
\ No newline at end of file
apocalyptic vision speaks beauty morning enhanced
\ No newline at end of file
extraordinary transparency tropical air distant mountain
\ No newline at end of file
ranges seem so near seem possible strike
\ No newline at end of file
stone cast hand only mysterious rumblings mutterings
\ No newline at end of file
pent up forces beneath island disturbed breathless calm
\ No newline at end of file
silence lay on calm before terrible
\ No newline at end of file
mightiest most awful on record burst forth sudden
\ No newline at end of file
consequences overworked competitors
\ No newline at end of file
night snatched away day eyes terrified beholders on
\ No newline at end of file
mainland but vivid play lightnings around ascending
\ No newline at end of file
column dust penetrated even deep obscurity distance
\ No newline at end of file
miles awful darkness stretched within circle diameter
\ No newline at end of file
miles while more less darkness reigned within circle
\ No newline at end of file
diameter three times as great within latter area dust
\ No newline at end of file
fall like snow sky breaking off limbs trees its weight
\ No newline at end of file
miles distant while in miles away scene
\ No newline at end of file
disaster fall depth several inches explosions
\ No newline at end of file
so loud as distinctly heard in miles away
\ No newline at end of file
sound like constant roar cannon in field
\ No newline at end of file
battle finally whole island blown pieces now came
\ No newline at end of file
most awful contest battle death between neptune
\ No newline at end of file
vulcan sea poured down chasm millions tons only
\ No newline at end of file
first converted vapor millions tons
\ No newline at end of file
seething white hot lava beneath over shores miles away waves
\ No newline at end of file
over ft high rolled such fury even
\ No newline at end of file
part bedrock swept away blocks stone tons
\ No newline at end of file
weight carry two miles inland on sumatra side
\ No newline at end of file
straits large vessel carry three miles inland wave
\ No newline at end of file
vii electrical speed engine
\ No newline at end of file
course growing less in intensity traveled across whole indian
\ No newline at end of file
ocean miles cape good hope around
\ No newline at end of file
atlantic waves in atmosphere traveled around globe three
\ No newline at end of file
times rate miles hour dust volcano
\ No newline at end of file
carry up atmosphere fully twenty miles finest
\ No newline at end of file
distribute through whole body air reader doubtless
\ No newline at end of file
remembers beautiful reddish purple glow sunrise sunset
\ No newline at end of file
fully six months after august glow caused
\ No newline at end of file
volcanic dust in atmosphere interfering passage
\ No newline at end of file
suns rays upper part solar spectrum more manifest
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment