Commit 7c37cc43 by Ethan

updated and commented

parent 41cdd603
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* RIVSIZE macro defines the dimensionality off the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 20
#endif
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is adviseable as the default
* unless we are doing long term RIV aggregation.
* specifically, a sparseRIV contains a pair of arrays,
* containing locations and values, where pairs are found in like array
* indices.
*/
typedef struct{
char name[100];
int *values;
int *locations;
size_t count;
unsigned int frequency;
float magnitude;
int boolean;
}sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic)
*/
typedef struct{
char name[100];
int* values;
int* frequency;
float magnitude;
}denseRIV;
/*RIVKey, holds globally important data that should not be changed partway through
* first function call in the program should always be:
* RIVinit();
* this will set these variables, check for incompatible choices, and open up
* memory blocks which the system will use in the background
*/
struct RIVData{
size_t RIVsize;
int nonZeros;
int *h_tempBlock;
int tempSize;
int thing;
denseRIV* RIVCache;
int cacheSize;
}static RIVKey;
/* RIVinit should be the first function called in any usage of this library
* it sets global variables that practically all functions will reference,
* it checks that your base parameters are valid, and allocates memory for
* the functions to use, so that we can move fast with rare allocations.
* #TODO add signal redefinitions so that cache is saved even on crash
*/
void RIVinit();
/* RIVCleanup should always be called to close a RIV program. it frees
* blocks allocated by RIVinit, and dumps the cached data to appropriate lexicon files
*/
void RIVCleanup();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does nto automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* need to carry metadata
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* mapS2D expands a sparseRIV out to denseRIV values, filling array locations
* based on location-value pairs
*/
int* mapS2D(int* destination, sparseRIV input); //#TODO fix int*/denseRIV confusion
/* makeSparseLocations must be called repeatedly in the processing of a
* file to produce a series of locations from the words of the file
* this produces an "implicit" RIV which can be used with the mapI2D function
* to create a denseRIV.
*/
void makesparseLocations(unsigned char* word, int *seeds, size_t seedCount);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexpush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV RIVout);
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
/* begin definitions */
int* mapS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
/* make sure our destination is a 0 vector */
memset(destination, 0, RIVKey.RIVsize*sizeof(int));
int *locations_slider = input.locations;
int *values_slider = input.values;
int *locations_stop = locations_slider+input.count;
/* apply values at an index based on locations */
while(locations_slider<locations_stop){
destination[*locations_slider] = *values_slider;
locations_slider++;
values_slider++;
}
return destination;
}
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int *destination = (int*)calloc(RIVKey.RIVsize,sizeof(int));
int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
/*apply values +1 or -1 at an index based on locations */
while(locations_slider<locations_stop){
destination[*locations_slider] +=1;
locations_slider++;
destination[*locations_slider] -= 1;
locations_slider++;
}
return destination;
}
sparseRIV consolidateD2S(int *denseInput){
sparseRIV output;
output.count = 0;
/* key/value pairs will be loaded to a worst-case sized temporary slot */
int* locations = RIVKey.h_tempBlock;
int* values = RIVKey.h_tempBlock+RIVKey.RIVsize;
int* locations_slider = locations;
int* values_slider = values;
for(int i=0; i<RIVKey.RIVsize; i++){
/* act only on non-zeros */
if(denseInput[i]){
/* assign index to locations */
*(locations_slider++) = i;
/* assign value to values */
*(values_slider++) = denseInput[i];
/* track size of forming sparseRIV */
output.count++;
}
}
/* a slot is opened for the locations/values pair */
output.locations = (int*) malloc(output.count*2*sizeof(int));
if(!output.locations){
printf("memory allocation failed"); //*TODO enable fail point knowledge
}
/* copy locations values into opened slot */
memcpy(output.locations, locations, output.count*sizeof(int));
output.values = output.locations + output.count;
/* copy values into opened slot */
memcpy(output.values, values, output.count*sizeof(int));
return output;
}
void RIVinit(){
RIVKey.RIVsize = RIVSIZE; //#TODO decide about macros vs global variables
RIVKey.nonZeros = NONZEROS;
if(RIVKey.nonZeros%2){
printf("your NONZEROS value must be an even number");
RIVKey.nonZeros++;
printf(", changed to %d", RIVKey.nonZeros);
}
/* open a slot at least large enough for worst case handling of
* sparse to dense conversion. may be enlarged by filetoL2 functions */
RIVKey.h_tempBlock = (int*)malloc(3*RIVKey.RIVsize*sizeof(int));
RIVKey.tempSize = 3*RIVKey.RIVsize;
RIVKey.thing = 0;
RIVKey.cacheSize = CACHESIZE;
/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
RIVKey.RIVCache = (denseRIV*)calloc(RIVKey.cacheSize,sizeof(denseRIV));
}
void RIVCleanup(){
for(int i=0; i<RIVKey.cacheSize; i++){
fLexPush(RIVKey.RIVCache[i]);
}
#if CACHESIZE > 0
free(RIVKey.RIVCache);
#endif
free(RIVKey.h_tempBlock);
}
int wordtoSeed(unsigned char* word){
int i=0;
int seed = 0;
while(*word){
/* left-shift 5 each time *should* make seeds unique to words */
seed += (*(word))<<(i*5);
word++;
i++;
}
return seed;
}
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
locations+=count;
srand(wordtoSeed(word));
for(int i=0; i<RIVKey.nonZeros; i++){
/* unrolled for speed, gauranteed to be an even number of steps */
*locations = rand()%RIVKey.RIVsize;
locations++;
i++;
*locations = rand()%RIVKey.RIVsize;
locations++;
}
return;
}
int fLexPush(denseRIV RIVout){
char pathString[500] = {0};
/* word data will be placed in a (new?) file under the lexicon directory
* and named after the word itself */
sprintf(pathString, "lexicon/%s", RIVout.name);
FILE *lexWord = fopen(pathString, "wb");
if(!lexWord){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
return 1;
}
fwrite(RIVout.frequency, 1, 4, lexWord);
fwrite(&RIVout.magnitude, 1, 4, lexWord);
fwrite(RIVout.values, RIVKey.RIVsize, 4, lexWord);
fclose(lexWord);
free(RIVout.values);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
#include <time.h>
#define RIVSIZE 5000
#define CACHESIZE 0
#include "RIVtoolsCPUlinux.h"
#define THRESHOLD .80f
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int printname(float cosine, sparseRIV base, sparseRIV multiplier);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
RIVinit();
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <1){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
getMagnitudes(fileRIVs, fileCount);
clock_t beginnsquared = clock();
printf("got past magnitudes");
for(int i=0; i<fileCount; i++){
if(fileRIVs[i].boolean){
cosineCompare(fileRIVs[i], fileRIVs+i+1, fileCount-(i+1), printname);
}
}
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("nsquared time:%lf\n\n", time);
printf("%d <", RIVKey.thing);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
free(fileRIVs);
return 0;
}
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
char pathString[2000];
DIR *directory;
struct dirent *files = 0;
if(!(directory = opendir(rootString))){
printf("location not found, %s\n", rootString);
return;
}
while((files=readdir(directory))){
if(*(files->d_name) == '.') continue;
if(files->d_type == DT_DIR){
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
strcat(pathString, "/");
directoryToL2s(pathString, fileRIVs, fileCount);
}
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
FILE *input = fopen(pathString, "r");
if(!input){
printf("file %s doesn't seem to exist, breaking out of loop", pathString);
return;
}else{
(*fileRIVs) = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
(*fileRIVs)[(*fileCount)] = fileToL2Clean(input);
strcpy((*fileRIVs)[(*fileCount)].name, pathString);
fclose(input);
(*fileCount)++;
}
}
}
int printname(float cosine, sparseRIV base, sparseRIV multiplier){
if(cosine>= THRESHOLD){
printf("%s\t%s\n%f\n", base.name, multiplier.name, cosine);
multiplier.boolean = 0;
RIVKey.thing++;
return 0;
}
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define CACHESIZE 100
#include "RIVtoolsCPUlinux.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
setKeyData();
char pathString[1000];
strcpy(pathString, argv[1]);
strcat(pathString, "/");
directoryGrind(pathString);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
for( int i=0; i<RIVKey.cacheSize; i++){
printf("%s, %d", RIVKey.RIVCache[i].name, *(RIVKey.RIVCache[i].frequency));
printf("\n");
}
RIVCleanup();
return 0;
}
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
denseRIV *denseSet_slider;
denseRIV *dense_stop = denseSet+RIVCount;
int *locations = additive.locations;
int *locations_stop = locations+additive.count;
int *values = additive.values;
//int *target;
while(locations<locations_stop){
denseSet_slider = denseSet;
while(denseSet_slider<dense_stop){
(*denseSet_slider).values[*locations]+= *values;
//*target+=*values;
denseSet_slider++;
}
locations++;
values++;
}
}
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
denseRIV* RIVStop = RIVSet+wordCount;
while(RIVSet<RIVStop){
if(!strcmp(word, RIVSet->name)){
return 1;
}
RIVSet++;
}
return 0;
}
void directoryGrind(char *rootString){
char pathString[2000];
DIR *directory;
struct dirent *files = 0;
if(!(directory = opendir(rootString))){
printf("location not found, %s\n", rootString);
return;
}
while((files=readdir(directory))){
while(!strcmp(files->d_name, ".") || !strcmp(files->d_name, "..")){
files = readdir(directory);
}
if(files->d_type == DT_DIR){
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
strcat(pathString, "/");
directoryGrind(pathString);
}
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
printf("%s\n", pathString);
FILE *input = fopen(pathString, "r+");
if(input){
fileGrind(input);
fclose(input);
}
}
}
void fileGrind(FILE* textFile){
sparseRIV aggregateRIV = fileToL2Clean(textFile);
fseek(textFile, 0, SEEK_SET);
int wordCount = 0;
denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
char word[200];
while(fscanf(textFile, "%99s", word)){
if(feof(textFile)) break;
if(!(*word))continue;
if(!isWordClean((char*)word)){
continue;
}
if(!checkDupe(RIVArray, word, wordCount)){
RIVArray[wordCount] = lexPull(word);
if(!*((RIVArray[wordCount].name))) break;
int* thing = RIVArray[wordCount].frequency;
*thing = *thing + 1;
//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
wordCount++;
}
}
//printf("%d\n", wordCount);
addS2Ds(RIVArray, aggregateRIV, wordCount);
denseRIV* RIVArray_slider = RIVArray;
denseRIV* RIVArray_stop = RIVArray+wordCount;
while(RIVArray_slider<RIVArray_stop){
lexPush(*RIVArray_slider);
RIVArray_slider++;
}
free(RIVArray);
free(aggregateRIV.locations);
free(aggregateRIV.values);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment