Commit c4c7beaa by birdperson

merging

parent ad4b27c9
File deleted
...@@ -5,9 +5,9 @@ ...@@ -5,9 +5,9 @@
#include <unistd.h> #include <unistd.h>
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#define RIVSIZE 200000 #define RIVSIZE 100000
#define NONZEROS 2 #define NONZEROS 8
#define CACHESIZE 1000 #define CACHESIZE 10000
#include "../RIVtools.h" #include "../RIVtools.h"
...@@ -23,7 +23,7 @@ int main(int argc, char *argv[]){ ...@@ -23,7 +23,7 @@ int main(int argc, char *argv[]){
char pathString[1000]; char pathString[1000];
//we open the lexicon, if it does not yet exist, it will be created //we open the lexicon, if it does not yet exist, it will be created
lexOpen("lexicon200-2"); lexOpen("lexicon100-8");
//we format the root directory, preparing to scan its contents //we format the root directory, preparing to scan its contents
...@@ -58,7 +58,7 @@ void directoryGrind(char *rootString){ ...@@ -58,7 +58,7 @@ void directoryGrind(char *rootString){
while((files=readdir(directory))){ while((files=readdir(directory))){
if(!files->d_name[0]) break; if(!files->d_name) break;
while(*(files->d_name)=='.'){ while(*(files->d_name)=='.'){
files = readdir(directory); files = readdir(directory);
} }
......
...@@ -5,13 +5,13 @@ clean(){ ...@@ -5,13 +5,13 @@ clean(){
else else
python shittyballs.py "$1" python shittyballs.py "$1"
./RIVread1 cleanbooks/ ./RIVread25-2 cleanbooks/ > somewhere.txt
./RIVread2 cleanbooks/ ./RIVread50-2 cleanbooks/ > somewhere.txt
./RIVread3 cleanbooks/ ./RIVread50-8 cleanbooks/ > somewhere.txt
./RIVread4 cleanbooks/ ./RIVread75-4 cleanbooks/ > somewhere.txt
./RIVread5 cleanbooks/ ./RIVread100-2 cleanbooks/ > somewhere.txt
./RIVread6 cleanbooks/ ./RIVread100-8 cleanbooks/ > somewhere.txt
./RIVread7 cleanbooks/ ./RIVread125-2 cleanbooks/ > somewhere.txt
rm -r cleanbooks/ rm -r cleanbooks/
#rm "$1" #rm "$1"
...@@ -22,4 +22,4 @@ clean(){ ...@@ -22,4 +22,4 @@ clean(){
clean ../../books/* clean ../../../Documents/PGunzips/*
...@@ -102,7 +102,7 @@ if not os.path.exists('lexicon'): ...@@ -102,7 +102,7 @@ if not os.path.exists('lexicon'):
if not os.path.exists(pathString): if not os.path.exists(pathString):
os.makedirs(pathString) os.makedirs(pathString)
call(["python", "blacklist.py"])
i=0 i=0
skip = 1 skip = 1
with open(sourceString, 'U') as fileIn: with open(sourceString, 'U') as fileIn:
......
File deleted
...@@ -3,10 +3,11 @@ ...@@ -3,10 +3,11 @@
#define CACHEEXCLUSIVE 1 #define CACHEEXCLUSIVE 1
#define RIVSIZE 50000 #define RIVSIZE 50000
#include "RIVtools.h" #include "RIVtools.h"
char* clean(char* word);
char* stem(char* word); char* stem(char* word);
int main(){ int main(){
lexOpen("consolidatedLexicon50-8"); lexOpen("consolidatedLexicon50-4");
FILE* text = fopen("../books/pg56902.txt", "r"); FILE* text = fopen("../../Downloads/pg62.txt", "r");
if(!text){ if(!text){
puts("no file"); puts("no file");
return 1; return 1;
...@@ -17,14 +18,17 @@ int main(){ ...@@ -17,14 +18,17 @@ int main(){
while(fscanf(text, "%99s", word)){ while(fscanf(text, "%99s", word)){
if(feof(text)) break; if(feof(text)) break;
if(!*word) break; if(!*word) break;
if(!*(clean(word))) continue;
if(stem(word)){ if(stem(word)){
denseRIV* wordRIV = lexPull(word); denseRIV* wordRIV = lexPull(word);
if(!wordRIV){ if(!wordRIV){
printf("%s, not in lexicon\n", word); //printf("%s, not in lexicon\n", word);
continue; continue;
}else{ }else{
temp = consolidateD2S(wordRIV->values); temp = consolidateD2S(wordRIV->values);
addS2D(accumulate.values, temp); addS2D(accumulate.values, temp);
...@@ -37,12 +41,13 @@ int main(){ ...@@ -37,12 +41,13 @@ int main(){
} }
}else{ }else{
printf("%s, not in wordNet\n", word); // printf("%s, not in wordNet\n", word);
} }
} }
temp = consolidateD2S(accumulate.values);
printf("saturatoins, %d", temp.count);
return 0; return 0;
...@@ -54,35 +59,59 @@ char* stem(char* word){ ...@@ -54,35 +59,59 @@ char* stem(char* word){
char pathString[200]; char pathString[200];
int WNdata; int WNdata;
sprintf(pathString, "WN/%s", word); sprintf(pathString, "../bookCleaner/WN/%s", word);
FILE* WNfile = fopen(pathString, "r"); FILE* WNfile = fopen(pathString, "r");
if(!WNfile) return NULL; if(!WNfile) return NULL;
fscanf(WNfile, "%d", &WNdata); fscanf(WNfile, "%d", &WNdata);
if(!WNdata) return NULL; if(!WNdata) {
fclose(WNfile);
if(WNdata == 1) return word; return NULL;
}
if(WNdata == 1){
fclose(WNfile);
return word;
}
if(WNdata == 2){ if(WNdata == 2){
fscanf(WNfile, "%s", word); fscanf(WNfile, "%s", word);
fclose(WNfile); fclose(WNfile);
sprintf(pathString, "WN/%s", word); sprintf(pathString, "../bookCleaner/WN/%s", word);
WNfile = fopen(pathString, "r"); WNfile = fopen(pathString, "r");
if(!WNfile) return NULL; if(!WNfile) return NULL;
fscanf(WNfile, "%*d%s", word); fscanf(WNfile, "%*d%s", word);
fclose(WNfile);
return word; return word;
} }
return NULL; return NULL;
} }
/* clean: reduce a word to its lowercase ASCII letters, in place.
 *
 * Every 'a'-'z' is kept, every 'A'-'Z' is folded to lowercase, and every
 * other byte (digits, punctuation, non-ASCII) is dropped.  The result is
 * written back into the caller's buffer and NUL-terminated; it is never
 * longer than the input, so no scratch buffer is needed and there is no
 * limit on the length of the word.
 *
 * word: NUL-terminated, writable string; must not be NULL.
 * returns: word itself, which is an empty string if it held no letters.
 */
char* clean(char* word){
    /* the write cursor never overtakes the read cursor, so compacting
     * in place cannot clobber unread input */
    char* out = word;
    for(const char* in = word; *in; in++){
        if(*in >= 'a' && *in <= 'z'){
            *out++ = *in;
        }else if(*in >= 'A' && *in <= 'Z'){
            *out++ = (char)(*in + ('a' - 'A'));
        }
    }
    *out = '\0';
    return word;
}
......
...@@ -47,7 +47,7 @@ int main(int argc, char* argv[]){ ...@@ -47,7 +47,7 @@ int main(int argc, char* argv[]){
i++; i++;
} }
lexClose(); lexClose();
lexOpen("consolidatedLexicon50-8"); lexOpen("consolidatedLexicon50-4");
for(int j=0; j<i; j++){ for(int j=0; j<i; j++){
lexPush(output[j]); lexPush(output[j]);
......
File deleted
...@@ -3,10 +3,10 @@ ...@@ -3,10 +3,10 @@
#include <dirent.h> #include <dirent.h>
#include <time.h> #include <time.h>
#define RIVSIZE 25000 #define RIVSIZE 4000
#define CACHESIZE 0 #define CACHESIZE 0
#define NONZEROS 2 #define NONZEROS 2
#define THRESHOLD 0.99 #define THRESHOLD 0.97
#include "RIVtools.h" #include "RIVtools.h"
...@@ -30,49 +30,63 @@ int main(int argc, char *argv[]){ ...@@ -30,49 +30,63 @@ int main(int argc, char *argv[]){
sparseRIV* fileRIVs_slider = fileRIVs; sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount; sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
while(fileRIVs_slider <fileRIVs_stop){ while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider); fileRIVs_slider->magnitude = getMagnitudeSparse(*fileRIVs_slider);
if(fileRIVs_slider->magnitude == 0){
if(remove(fileRIVs_slider->name)){
printf("%s failed", fileRIVs_slider->name);
}
}
fileRIVs_slider++; fileRIVs_slider++;
} }
clock_t beginnsquared = clock(); clock_t beginnsquared = clock();
int thing = 0; int thing = 0;
float cosine; float cosine;
float minmag; float minmag;
float maxmag; float maxmag;
denseRIV baseDense; denseRIV baseDense;
baseDense.values = malloc(RIVSIZE*sizeof(int)); char booleans[fileCount];
char* booleans_slider = booleans;
memset(booleans, 0, fileCount);
fileRIVs_slider = fileRIVs; fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider; sparseRIV* comparators_slider;
int count = 0; int count = 0;
while(fileRIVs_slider<fileRIVs_stop){ while(fileRIVs_slider<fileRIVs_stop){
//the comparators slider and booleans slider are both set back to the beginning of their scan
comparators_slider = fileRIVs; comparators_slider = fileRIVs;
booleans_slider = booleans;
memset(baseDense.values, 0, RIVSIZE*sizeof(int)); memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, *fileRIVs_slider); addS2D(baseDense.values, *fileRIVs_slider);
baseDense.magnitude = (*fileRIVs_slider).magnitude; baseDense.magnitude = (*fileRIVs_slider).magnitude;
minmag = baseDense.magnitude*.85; minmag = baseDense.magnitude*.85;
maxmag = baseDense.magnitude*1.15; maxmag = baseDense.magnitude*1.15;
while(comparators_slider < fileRIVs_slider){ while(comparators_slider < fileRIVs_slider){
if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){
if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && !*booleans_slider){
cosine = cosCompare(baseDense, *comparators_slider); cosine = cosCompare(baseDense, *comparators_slider);
count++; count++;
if(cosine>THRESHOLD){ if(cosine>THRESHOLD){
printf("%s\t%f\n",(*comparators_slider).name, cosine); printf("%s\t%s\t%f\n",fileRIVs_slider->name, comparators_slider->name, cosine);
if(remove((*comparators_slider).name)){ if(remove((*comparators_slider).name)){
printf(" well shit"); printf(" well shit");
} }
(*comparators_slider).boolean = 0; *booleans_slider += 1;
thing++; thing++;
} }
} }
comparators_slider++; comparators_slider++;
booleans_slider++;
} }
fileRIVs_slider++; fileRIVs_slider++;
} }
clock_t endnsquared = clock(); clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC; double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
...@@ -106,13 +120,14 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){ ...@@ -106,13 +120,14 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
strcat(pathString, "/"); strcat(pathString, "/");
directoryToL2s(pathString, fileRIVs, fileCount); directoryToL2s(pathString, fileRIVs, fileCount);
continue;
} }
strcpy(pathString, rootString); strcpy(pathString, rootString);
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
FILE *input = fopen(pathString, "r"); FILE *input = fopen(pathString, "r");
puts(pathString); //puts(pathString);
if(!input){ if(!input){
printf("file %s doesn't seem to exist, breaking out of loop", pathString); printf("file %s doesn't seem to exist, breaking out of loop", pathString);
return; return;
...@@ -126,4 +141,5 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){ ...@@ -126,4 +141,5 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
(*fileCount)++; (*fileCount)++;
} }
} }
closedir(directory);
} }
File deleted
File deleted
...@@ -228,7 +228,7 @@ sparseRIV normalize(denseRIV input, int factor){ ...@@ -228,7 +228,7 @@ sparseRIV normalize(denseRIV input, int factor){
values[count]= round(input.values[i]*multiplier); values[count]= round(input.values[i]*multiplier);
/* drop any 0 values */ /* drop any 0 values */
if(values[count])count++; if(values[count]>1)count++;
} }
sparseRIV output; sparseRIV output;
output.count = count; output.count = count;
......
This source diff could not be displayed because it is too large. You can view the blob instead.
RIVcull: RIVcullDestructive.c
gcc -o RIVcull RIVcullDestructive.c -lm -O3
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment