Commit d06a64b7 by simetk

added simplified usage package

parent 80b16c56
This diff is collapsed. Click to expand it.
No preview for this file type
......@@ -2,16 +2,13 @@
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 5000
#define RIVSIZE 50000
#define CACHESIZE 0
#define NONZEROS 2
#define EPSILON 0.8
#define MINPOINTS 15
#define EPSILON 0.95
#define MINPOINTS 20
#define UNCHECKED 0
#define NOISE -1
#define MINSIZE 3000
#include "RIVtools.h"
......@@ -29,7 +26,7 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
lexOpen("/run/media/etcart/UUI/lexicon");
lexOpen("/home/drbob/Documents/lexicon8-50");
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
......@@ -64,7 +61,6 @@ int main(int argc, char *argv[]){
baseDense.values = malloc(RIVSIZE*sizeof(int));
for(int i=0; i<fileCount; i++){
memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, DBset[i].RIV);
baseDense.magnitude = DBset[i].RIV.magnitude;
......@@ -79,7 +75,6 @@ int main(int argc, char *argv[]){
DBset[i].indexes[DBset[i].indexCount++] = j;
DBset[j].indexes = realloc(DBset[j].indexes, (DBset[j].indexCount+1)*sizeof(int));
DBset[j].indexes[DBset[j].indexCount++] = i;
}
}
}
......@@ -107,19 +102,19 @@ int main(int argc, char *argv[]){
return 0;
}
void DBdive(int C, int i){
printf("starting at: %s\n", DBset[i].RIV.name);
printf("root: %s\n", DBset[i].RIV.name);
struct DBnode *DBnet = malloc(sizeof(struct DBnode));
DBnet[0] = DBset[i];
int nodeCount = 1;
for(int j=0; j<nodeCount; j++){
for(int k=0; k<DBnet[j].indexCount;k++){
int index = DBnet[j].indexes[k];
if(DBset[index].status) continue;
if(DBset[index].status>0) continue;
printf(">>%s\n", DBset[index].RIV.name);
DBset[index].status = C;
if(DBset[index].indexCount> MINPOINTS){
DBnet = realloc(DBnet, (nodeCount+1)*sizeof(struct DBnode));
printf("diving into: %s\n", DBset[index].RIV.name);
DBnet[nodeCount++] = DBset[index];
}
}
......@@ -148,10 +143,8 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
strcat(pathString, "/");
directoryToL2s(pathString, fileRIVs, fileCount);
}
denseRIV temp = lexPull(files->d_name);
if(*temp.frequency >2000){
if(*temp.frequency >MINSIZE){
(*fileRIVs) = (sparseRIV*)realloc((*fileRIVs), ((*fileCount)+1)*sizeof(sparseRIV));
(*fileRIVs)[(*fileCount)] = normalize(temp, 500);
......
#ifndef RIVLOWER_H_
#define RIVLOWER_H_
#include <stdio.h>
......@@ -313,9 +314,9 @@ void lexOpen(char* lexName){
strcpy(RIVKey.lexName, lexName);
/* open a slot at least large enough for worst case handling of
* sparse to dense conversion. may be enlarged by filetoL2 functions */
signal(11, signalSecure);
for(int i=1; i<20; i++){
signal(i, signalSecure);
}
/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
......
This diff is collapsed. Click to expand it.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define CACHESIZE 15000
#define RIVSIZE 50000
#define NONZEROS 8
#include <setjmp.h>
#include <signal.h>
#include "../RIVet/RIVtools.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString);
void readdirContingency(int sigNumber);
jmp_buf readdirRecov;
/*
 * Entry point: feeds every readable file under the directory named by
 * argv[1] through directoryGrind() and prints the CPU time consumed.
 *
 * Returns 0 on success, 1 when the directory argument is missing, does not
 * fit the path buffer, or cannot be stat()ed.
 */
int main(int argc, char *argv[]){
	clock_t begintotal = clock();
	char pathString[1000];

	/* The argument is validated before the lexicon is opened, so the
	 * early exits below leave nothing behind that would need lexClose(). */
	if(argc < 2){
		fprintf(stderr, "usage: %s <directory>\n",
		        (argc > 0 && argv[0]) ? argv[0] : "program");
		return 1;
	}

	/* Bounded replacement for strcpy+strcat: the trailing '/' is what
	 * directoryGrind() expects when it appends entry names. */
	int written = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
	if(written < 0 || (size_t)written >= sizeof pathString){
		fprintf(stderr, "path too long: %s\n", argv[1]);
		return 1;
	}

	struct stat st = {0};
	if(stat(pathString, &st) == -1) {
		perror(pathString);
		return 1;
	}

	lexOpen("/home/drbob/Documents/lexicon8-50");
	directoryGrind(pathString);

	clock_t endtotal = clock();
	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
	printf("total time:%lf\n\n", time_spent);

	lexClose();
	return 0;
}
/*
 * Applies one sparse vector to each of the first RIVCount entries of
 * denseSet: addS2D() is called on the entry's values, and the integer its
 * contextSize points at is increased by additive.frequency.
 */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
	for(int i = 0; i < RIVCount; i++){
		denseRIV *target = &denseSet[i];

		addS2D(target->values, additive);
		*(target->contextSize) += additive.frequency;
	}
}
/*
 * Linear search of the first wordCount entries of RIVSet for one whose
 * name equals word.
 *
 * Returns 1 when a match exists, 0 otherwise.
 */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
	for(int i = 0; i < wordCount; i++){
		if(strcmp(word, RIVSet[i].name) == 0){
			return 1;
		}
	}
	return 0;
}
/*
 * Walks the directory rootString (which must end in '/'), recursing into
 * sub-directories and handing every file that can be opened "r+" to
 * fileGrind(). Entries whose name starts with '.' are skipped, which covers
 * ".", ".." and hidden files.
 *
 * Prints "location not found" and returns when the directory cannot be
 * opened. Entries whose full path does not fit the local buffer are
 * reported and skipped.
 */
void directoryGrind(char *rootString){
	char pathString[2000];
	struct dirent *files = 0;
	DIR *directory = opendir(rootString);

	if(!directory){
		printf("location not found, %s\n", rootString);
		return;
	}

	while((files = readdir(directory))){
		/* Recovery point for readdirContingency().
		 * NOTE(review): a recursive call below overwrites readdirRecov
		 * with a frame that is gone once the call returns; it is only
		 * re-armed here on the next iteration. */
		if(setjmp(readdirRecov)){
			continue;
		}

		/* Skipping via the outer loop keeps the end-of-directory NULL
		 * from readdir() from ever being dereferenced. */
		if(files->d_name[0] == '.'){
			continue;
		}

		int isDirectory = (files->d_type == DT_DIR);
		int length = snprintf(pathString, sizeof pathString, "%s%s%s",
		                      rootString, files->d_name,
		                      isDirectory ? "/" : "");
		if(length < 0 || (size_t)length >= sizeof pathString){
			printf("path too long, skipping: %s%s\n", rootString, files->d_name);
			continue;
		}

		if(isDirectory){
			/* A directory is only descended into; it is never
			 * opened as a text file. */
			directoryGrind(pathString);
			continue;
		}

		printf("%s\n", pathString);
		FILE *input = fopen(pathString, "r+");
		if(input){
			fileGrind(input);
			fclose(input);
		}
	}

	closedir(directory);
}
/*
 * Processes one open text file:
 *   1. fileToL2Clean() produces an aggregate sparse vector for the file.
 *   2. The file is rewound and scanned token by token; each distinct token
 *      accepted by isWordClean() is pulled from the lexicon with lexPull()
 *      and has the integer behind its frequency pointer incremented.
 *   3. The aggregate is added to every pulled entry (addS2Ds) and each
 *      entry is written back with lexPush().
 *
 * The caller keeps ownership of textFile. The pulled-entry array and
 * aggregateRIV.locations are released before returning.
 */
void fileGrind(FILE* textFile){
	sparseRIV aggregateRIV = fileToL2Clean(textFile);
	fseek(textFile, 0, SEEK_SET);

	/* One slot per counted token is the capacity the scan may fill. */
	denseRIV *RIVArray = malloc(aggregateRIV.frequency*sizeof(denseRIV));
	if(!RIVArray){
		free(aggregateRIV.locations);
		return;
	}

	int wordCount = 0;
	char word[200];

	/* fscanf() returns EOF (non-zero) on end of input or a read error, so
	 * the result is compared against 1 rather than tested for truth. The
	 * capacity test stops the scan before it could write past RIVArray. */
	while(wordCount < aggregateRIV.frequency
	      && fscanf(textFile, "%99s", word) == 1){
		/* NOTE(review): a final token that ends exactly at EOF (no
		 * trailing whitespace) is discarded here; kept as it was
		 * because fileToL2Clean()'s handling of that case is not
		 * visible from this file. */
		if(feof(textFile)) break;

		if(!isWordClean((char*)word)){
			continue;
		}
		if(checkDupe(RIVArray, word, wordCount)){
			continue;
		}

		RIVArray[wordCount] = lexPull(word);
		/* An entry with an empty name ends the scan. */
		if(!*((RIVArray[wordCount].name))) break;

		*(RIVArray[wordCount].frequency) += 1;
		wordCount++;
	}

	addS2Ds(RIVArray, aggregateRIV, wordCount);

	for(int i = 0; i < wordCount; i++){
		lexPush(RIVArray[i]);
	}

	free(RIVArray);
	free(aggregateRIV.locations);
}
/*
 * Handler with a signal-handler signature: reports the failure and jumps
 * back to the setjmp(readdirRecov) recovery point.
 *
 * NOTE(review): nothing in the visible code installs this handler.
 * The message is emitted with write() because stdio calls such as puts()
 * are not async-signal-safe.
 */
void readdirContingency(int sigNumber){
	static const char message[] = "readdir segfaulted, trying to recover\n";

	(void)sigNumber;
	if(write(STDOUT_FILENO, message, sizeof message - 1) < 0){
		/* nothing useful can be done about a failed diagnostic here */
	}
	longjmp(readdirRecov, 1);
}
# clean PATH...
# Walks the given paths. Directories are descended into; every other
# existing path is run through the Python cleaner, the resulting cleanbooks/
# tree is fed to each enabled RIVread variant, and cleanbooks/ is removed.
# Paths that do not exist are skipped: an empty directory leaves the glob
# unexpanded as a literal "dir/*", which must not reach the cleaner.
clean(){
	for entry in "$@"; do
		if [ -d "$entry" ]; then
			# The argument list is expanded before the recursive call,
			# so the inner loop reusing $entry does not disturb this one.
			clean "$entry"/*
		elif [ -e "$entry" ]; then
			python shittyballs.py "$entry"
			./RIVread cleanbooks/
			# ./RIVread1 cleanbooks/
			./RIVread2 cleanbooks/
			#./RIVread3 cleanbooks/
			#./RIVread4 cleanbooks/
			./RIVread5 cleanbooks/
			./RIVread6 cleanbooks/
			rm -r cleanbooks/
		fi
	done
}

clean ../bookCleaner/books/*
import requests
import re
import string
import os
import sys
from subprocess import call
import nltk
from nltk.corpus import wordnet as wn
import pdb
from nltk.stem import PorterStemmer
def adverbFix(word):
    """Redirect an adverb to its adjective and record both in ``lexicon/``.

    ``word`` is returned unchanged unless it is tagged 'RB' and the word with
    its last two characters removed (presumably the "ly" ending) is tagged
    'JJ'. In that case two lexicon files are written and the adjective is
    returned:

    * ``lexicon/<word>``      -> "2" followed by the adjective (the code
      fileCheck treats as "replace with the rest of the file")
    * ``lexicon/<adjective>`` -> "1" (the code fileCheck treats as accepted)
    """
    # pos_tag expects a list of tokens; a bare string would be tagged
    # character by character.
    if nltk.pos_tag([word])[0][1] != 'RB':
        return word

    adjective = word[:-2]
    # The candidate adjective is what must be tagged here, not the adverb.
    if nltk.pos_tag([adjective])[0][1] != 'JJ':
        return word

    with open("lexicon/" + word, "w") as redirect:
        redirect.write("2" + adjective)
    with open("lexicon/" + adjective, "w") as accepted:
        accepted.write("1")
    return adjective
def strip(word):
    """Remove the first matching suffix from ``word``.

    Suffixes are tried in a fixed order and only the first hit is removed;
    a word with no listed suffix is returned unchanged.
    """
    suffixes = ('ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment')
    matched = next((ending for ending in suffixes if word.endswith(ending)), None)
    if matched is None:
        return word
    return word[:len(word) - len(matched)]
def cleanWord(word):
    """Lower-case ``word`` and drop every character outside a-z.

    The result may be the empty string.
    """
    return re.sub('[^a-z]+', '', word.lower())
def fileCheck(word):
    """Look ``word`` up in the on-disk ``lexicon/`` directory.

    The first character of ``lexicon/<word>`` is a numeric code:

    * 2 -> return the remainder of the file (a replacement word)
    * 1 -> return ``word`` itself
    * 0 -> return -1
    * any other digit -> return None

    Returns 0 when the file cannot be read or does not start with a digit.
    """
    try:
        # The context manager closes the file on every path, including a
        # failed int() conversion and unrecognised codes.
        with open("lexicon/{}".format(word), "r") as wordFile:
            code = int(wordFile.read(1))
            replacement = wordFile.read() if code == 2 else None
    except (IOError, OSError, ValueError):
        # Missing/unreadable file, or an empty or non-numeric first character.
        return 0

    if code == 2:
        return replacement
    if code == 1:
        return word
    if code == 0:
        return -1
    return None
def morphyTest(word):
    """Return WordNet's ``morphy`` result for ``word``, or 0 when that result is falsy."""
    return wn.morphy(word) or 0
# Stems that are never written to the cleaned output.
# A comma was missing after "not", so Python joined it with the following
# literal into the single entry "notbe" and neither word was filtered.
# Repeated entries ("as", "mr") are listed once.
blacklist = ["a", "an", "the", "so", "as", "how",
             "i", "me", "we", "they", "you", "it", "he", "she",
             "but", "have", "had",
             "for", "by", "in", "out", "not",
             "be", "were", "was", "am", "are", "is",
             "mr", "mrs", "and"]
# ---------------------------------------------------------------------------
# Script body: split one Project Gutenberg text (sys.argv[1]) into cleaned,
# stemmed paragraph files under cleanbooks/<basename>clean/<n>.txt.
# ---------------------------------------------------------------------------
ps = PorterStemmer()

sourceString = sys.argv[1]
# Output directory is named after the source file's basename up to its first '.'.
cutDirectories = sourceString.split('/')[-1]
pathString = cutDirectories.split('.')[0]
pathString = "cleanbooks/" + pathString + "clean/"
print(sourceString + "\n")

if not os.path.exists('cleanbooks'):
    os.makedirs('cleanbooks')
if not os.path.exists(pathString):
    os.makedirs(pathString)

i = 0      # number of paragraph files written so far
skip = 1   # stays 1 until a start-of-book marker has been seen

# NOTE(review): mode 'U' (universal newlines) was removed in Python 3.11.
# It is kept because the interpreter this script targets is not visible here.
with open(sourceString, 'U') as fileIn:
    text = fileIn.read()

# Universal-newline text only ever contains "\n", so paragraphs and lines are
# split on "\n" rather than os.linesep (identical on POSIX, correct elsewhere).
for paragraph in text.split("\n\n"):
    if not paragraph:
        continue
    if "*** START OF " in paragraph or "*END THE SMALL PRINT" in paragraph:
        skip = 0
        continue
    if ("*** END OF " in paragraph
            or "End of Project Gutenberg's" in paragraph
            or "End of the Project Gutenberg" in paragraph):
        # Stop processing; falling out of the loop (instead of exiting on the
        # spot) lets the "badly parsed" report below still fire when an end
        # marker appears before any start marker.
        break
    if skip:
        continue

    cleanString = ''
    for line in paragraph.split("\n"):
        for tempWord in line.split():
            word = cleanWord(tempWord)
            if not word:
                continue
            temp = morphyTest(word)
            if temp:
                stem = ps.stem(temp)
                if stem and stem not in blacklist:
                    cleanString = cleanString + ' ' + stem
        # NOTE(review): one terminator per source line is assumed; the
        # original indentation was lost. Text-mode files translate "\n".
        cleanString = cleanString + "\n"

    # Only paragraphs above the size threshold are written, so short ones no
    # longer create a file that is immediately deleted again.
    if len(cleanString.split(' ')) > 10:
        i += 1
        with open("{}{}.txt".format(pathString, i), "w") as fileOut:
            fileOut.write(cleanString)

if skip == 1:
    print(sourceString + " was badly parsed, no output")
No preview for this file type
......@@ -15,7 +15,7 @@ void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
lexOpen("/home/drbob/Documents/lexicon");
lexOpen("/home/drbob/Documents/lexicon2-25");
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment