Commit 34c65893 by amberhosen

updated RIVreads

parent 9d2c0fed
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h>
#define CACHESIZE 15000
#define RIVSIZE 50000
#define NONZEROS 8
#include <setjmp.h>
#include <signal.h>
#include "../RIVet/RIVtools.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#define RIVSIZE 200000
#define NONZEROS 2
#define CACHESIZE 1000
#include "../RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one
void fileGrind(FILE* textFile); void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount); void addContext(denseRIV* lexRIV, sparseRIV context);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void readdirContingency(int sigNumber);
jmp_buf readdirRecov;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
clock_t begintotal = clock();
lexOpen("/home/drbob/Documents/lexicon8-50");
char pathString[1000]; char pathString[1000];
//we open the lexicon, if it does not yet exist, it will be created
lexOpen("lexicon200-2");
//we format the root directory, preparing to scan its contents
strcpy(pathString, argv[1]); strcpy(pathString, argv[1]);
strcat(pathString, "/"); strcat(pathString, "/");
struct stat st = {0}; //ensure that the targeted root directory exists
struct stat st;
if(stat(pathString, &st) == -1) { if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist");
return 1; return 1;
} }
//we will scan the directory, adding all data to our lexicon, as seen inside
directoryGrind(pathString); directoryGrind(pathString);
clock_t endtotal = clock(); //we close the lexicon again, ensuring all data is secured
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
lexClose(); lexClose();
return 0; return 0;
} }
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){ //mostly a standard recursive Dirent-walk
denseRIV *denseSet_slider = denseSet;
denseRIV *dense_stop = denseSet+RIVCount;
while(denseSet_slider<dense_stop){
addS2D((*denseSet_slider).values, additive);
*(denseSet_slider->contextSize) += additive.frequency;
denseSet_slider++;
}
}
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
denseRIV* RIVStop = RIVSet+wordCount;
while(RIVSet<RIVStop){
if(!strcmp(word, RIVSet->name)){
return 1;
}
RIVSet++;
}
return 0;
}
void directoryGrind(char *rootString){ void directoryGrind(char *rootString){
/* *** begin Dirent walk *** */
char pathString[2000]; char pathString[2000];
DIR *directory; DIR *directory;
struct dirent *files = 0; struct dirent *files = 0;
...@@ -76,15 +57,13 @@ void directoryGrind(char *rootString){ ...@@ -76,15 +57,13 @@ void directoryGrind(char *rootString){
} }
while((files=readdir(directory))){ while((files=readdir(directory))){
if(setjmp(readdirRecov)){
continue;
}
//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name)); if(!files->d_name[0]) break;
while(*(files->d_name)=='.'){ while(*(files->d_name)=='.'){
files = readdir(directory); files = readdir(directory);
} }
//signal(SIGSEGV, signalSecure);
if(files->d_type == DT_DIR){ if(files->d_type == DT_DIR){
strcpy(pathString, rootString); strcpy(pathString, rootString);
...@@ -92,63 +71,87 @@ void directoryGrind(char *rootString){ ...@@ -92,63 +71,87 @@ void directoryGrind(char *rootString){
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
strcat(pathString, "/"); strcat(pathString, "/");
directoryGrind(pathString); directoryGrind(pathString);
continue;
} }
strcpy(pathString, rootString); strcpy(pathString, rootString);
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
printf("%s\n", pathString); printf("%s\n", pathString);
FILE *input = fopen(pathString, "r+"); /* *** end dirent walk, begin meat of function *** */
//check for non-txt files
char *fileEnding = pathString+strlen(pathString)-4;
if(strcmp(fileEnding, ".txt")){
printf("skipped: %s\n", files->d_name);
continue;
}
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){ if(input){
//process this file and add it's data to lexicon
fileGrind(input); fileGrind(input);
fclose(input); fclose(input);
} }
} }
} }
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
void fileGrind(FILE* textFile){ void fileGrind(FILE* textFile){
sparseRIV aggregateRIV = fileToL2Clean(textFile); //form a context vector. "clean" indicates that it will ignore any word which
fseek(textFile, 0, SEEK_SET); //contains unwanted characters
sparseRIV contextVector = fileToL2Clean(textFile);
int wordCount = 0; //an array of denseRIVs, large enough to hold all vectors
denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV)); //(we don't yet know how many vectors there will be, so we make it big enough for the maximum)
char word[200]; denseRIV* lexiconRIV;
char word[100] = {0};
while(fscanf(textFile, "%99s", word)){ while(fscanf(textFile, "%99s", word)){
//we ensure that each word exists, and is free of unwanted characters
if(feof(textFile)) break; if(feof(textFile)) break;
if(!(*word))continue; if(!(*word))continue;
if(!isWordClean((char*)word)){ if(!isWordClean((char*)word)){
continue; continue;
} }
if(checkDupe(RIVArray, word, wordCount)){
continue;
}
RIVArray[wordCount] = lexPull(word);
if(!*((RIVArray[wordCount].name))) break;
*(RIVArray[wordCount].frequency)+= 1;; //we pull the vector corresponding to each word from the lexicon
//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing); //if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(word);
wordCount++; //we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector);
} //we remove the sub-vector corresponding to the word itself
//printf("%d\n", wordCount); subtractThisWord(lexiconRIV);
addS2Ds(RIVArray, aggregateRIV, wordCount); //we log that this word has been encountered one more time
denseRIV* RIVArray_slider = RIVArray; lexiconRIV->frequency += 1;
denseRIV* RIVArray_stop = RIVArray+wordCount;
while(RIVArray_slider<RIVArray_stop){
lexPush(*RIVArray_slider); //and finally we push it back to the lexicon for permanent storage
RIVArray_slider++; lexPush(lexiconRIV);
}
free(RIVArray);
free(aggregateRIV.locations);
}
free(contextVector.locations);
} }
void readdirContingency(int sigNumber){
puts("readdir segfaulted, trying to recover"); void addContext(denseRIV* lexRIV, sparseRIV context){
longjmp(readdirRecov, 1);
//add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context);
//log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analyses
lexRIV->contextSize += context.contextSize;
} }
...@@ -5,15 +5,16 @@ clean(){ ...@@ -5,15 +5,16 @@ clean(){
else else
python shittyballs.py "$1" python shittyballs.py "$1"
./RIVread cleanbooks/ ./RIVread1 cleanbooks/
# ./RIVread1 cleanbooks/
./RIVread2 cleanbooks/ ./RIVread2 cleanbooks/
#./RIVread3 cleanbooks/ ./RIVread3 cleanbooks/
#./RIVread4 cleanbooks/ ./RIVread4 cleanbooks/
./RIVread5 cleanbooks/ ./RIVread5 cleanbooks/
./RIVread6 cleanbooks/ ./RIVread6 cleanbooks/
./RIVread7 cleanbooks/
rm -r cleanbooks/ rm -r cleanbooks/
#rm "$1"
fi fi
shift shift
done done
...@@ -21,4 +22,4 @@ clean(){ ...@@ -21,4 +22,4 @@ clean(){
clean ../bookCleaner/books/* clean ../../books/*
import requests #import requests
import re import re
import string import string
import os import os
...@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn ...@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn
import pdb import pdb
from nltk.stem import PorterStemmer from nltk.stem import PorterStemmer
def adverbFix(word):
if not nltk.pos_tag(word)[0][1] == 'RB':
return word
adjective = word[:-2] def writeWord(cleanString, word, stem, blacklist):
if not nltk.pos_tag(word)[0][1] == 'JJ': if word == stem:
return word; FILE = open("lexicon/" + word, "w")
FILE.write("1");
FILE.close();
return (cleanString + " " + word)
elif stem not in blacklist:
if len(stem) > 2:
FILE = open("lexicon/" + word, "w") FILE = open("lexicon/" + word, "w")
FILE.write("2" + temp) FILE.write("2"+stem);
FILE.close() FILE.close();
FILE = open("lexicon/" + adjective, "w") FILE = open("lexicon/" + stem, "w")
FILE.write("1") FILE.write("1")
FILE.close() FILE.close();
return adjective return (cleanString + " " + stem)
return cleanString
def strip(word):
for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']: def liFix(word):
if word.endswith(suffix): if not word[len(word)-2:] == "li":
return word[:-len(suffix)]
return word return word
temp = ps.stem(word[:-2])
if temp:
return temp
return word
def cleanWord(word): def cleanWord(word):
#if(len(word) == 0):
#print("\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************")
word = word.lower(); word = word.lower();
regex = re.compile('[^a-z]+') regex = re.compile('[^a-z]+')
word = regex.sub('', word) word = regex.sub('', word)
...@@ -44,13 +50,11 @@ def cleanWord(word): ...@@ -44,13 +50,11 @@ def cleanWord(word):
def fileCheck(word): def fileCheck(word):
try: try:
#print("trying")
wordFile = open("lexicon/{}".format(word), "r") wordFile = open("lexicon/{}".format(word), "r")
code = int(wordFile.read(1)) code = int(wordFile.read(1))
except: except:
#print("file does not exist")
return 0 return 0
#print("fileCode{}".format(code))
if code == 2: if code == 2:
word = wordFile.read() word = wordFile.read()
...@@ -74,6 +78,8 @@ def morphyTest(word): ...@@ -74,6 +78,8 @@ def morphyTest(word):
return morphyTemp; return morphyTemp;
#begin mainfunction
blacklist = ["a", "an", "the", "so", "as", "how", blacklist = ["a", "an", "the", "so", "as", "how",
"i", "me", "we", "they", "you", "it", "he", "she", "i", "me", "we", "they", "you", "it", "he", "she",
"but", "have", "had", "but", "have", "had",
...@@ -90,13 +96,13 @@ print(sourceString + "\n") ...@@ -90,13 +96,13 @@ print(sourceString + "\n")
if not os.path.exists('cleanbooks'): if not os.path.exists('cleanbooks'):
os.makedirs('cleanbooks') os.makedirs('cleanbooks')
# if not os.path.exists('lexicon'): if not os.path.exists('lexicon'):
# os.makedirs('lexicon') os.makedirs('lexicon')
if not os.path.exists(pathString): if not os.path.exists(pathString):
os.makedirs(pathString) os.makedirs(pathString)
#call(["python", "blacklist.py"]) call(["python", "blacklist.py"])
i=0 i=0
skip = 1 skip = 1
with open(sourceString, 'U') as fileIn: with open(sourceString, 'U') as fileIn:
...@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn: ...@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn:
for tempWord in line.split(): for tempWord in line.split():
word=cleanWord(tempWord) word=cleanWord(tempWord)
if not word: if not word:
continue continue
if len(word) < 3:
continue;
if word in blacklist:
continue;
# temp = fileCheck(word)
#
# if temp == -1:
# continue
# if temp == 0:
temp = morphyTest(word)
if temp:
stem = ps.stem(temp)
if stem and not stem in blacklist:
cleanString = cleanString + ' ' + stem
temp = fileCheck(word)
if temp == -1:
continue
if temp:
cleanString = (cleanString + " " + temp);
continue
else:
morphy = morphyTest(word)
if morphy:
stem = ps.stem(morphy)
if stem:
stem = liFix(stem)
cleanString = writeWord(cleanString, word, stem, blacklist)
#if temp == 0:
# catchAll(word)
cleanString = cleanString + os.linesep cleanString = cleanString + os.linesep
if len(cleanString.split(' ')) > 10: if len(cleanString.split(' ')) > 2:
fileOut.write(cleanString) fileOut.write(cleanString)
fileOut.close() fileOut.close()
......
#ifndef RIVACCESS_H_ #ifndef RIVACCESS_H_
#define RIVACCESS_H_ #define RIVACCESS_H_
/*isWordClean filters words that contain non-letter characters, and /*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through * upperCase letters, allowing only the '_' symbol through
*/ */
int isWordClean(char* word); int isWordClean(char* word);
/* used by wordClean */ /* used by wordClean */
int isLetter(char c); int isLetter(char c);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(char* word);
int isLetter(char c){ int isLetter(char c){
if((c>96 && c<123)||(c == 32) || (c == '_')) return 1; if((c>96 && c<123)||(c == 32) || (c == '_')) return 1;
...@@ -26,5 +33,19 @@ int isWordClean(char* word){ ...@@ -26,5 +33,19 @@ int isWordClean(char* word){
return 1; return 1;
} }
/* Derive a deterministic integer seed from a word, for seeding rand()
 * when hashing words to cache slots.
 * Each character contributes 5 bits (base-32 packing), so short English
 * words map to distinct seeds.
 * param word: NUL-terminated word; must not be NULL
 * returns: the packed seed (0 for the empty string)
 *
 * Fix: the original accumulated in signed int and shifted by i*5, which
 * is undefined behavior once i >= 7 (shift count >= width of int) and on
 * signed overflow. We accumulate in unsigned arithmetic (well-defined
 * wraparound) and reduce the shift count mod 32; results are identical
 * for words of 6 characters or fewer. */
int wordtoSeed(char* word){
	int i = 0;
	unsigned seed = 0;
	while(*word){
		/* left-shift 5 each time *should* make seeds unique to words:
		 * letters are treated as digits in base 32, large enough to
		 * hold all english characters plus a few outliers */
		seed += (unsigned)(unsigned char)(*word) << ((i * 5) % 32);
		word++;
		i++;
	}
	return (int)seed;
}
#endif #endif
#ifndef RIV_LEXICON_H
#define RIV_LEXICON_H
#include "RIVLower.h"
#include "RIVaccessories.h"
/* lexOpen is called to "open the lexicon", setting up for later calls to
* lexPush and lexPull. if the lexicon has not been opened before calls
* to these functions, their behavior can be unpredictable, most likely crashing
*/
void lexOpen();
/* lexClose should always be called after the last lex push or lex pull call
* if the lexicon is left open, some vector data may be lost due to
* un-flushed RIV cache
*/
void lexClose();
/* both lexPush and lexPull must be called *after* the lexOpen() function
* and after using them the lexClose() function must be called to ensure
* data security */
/* lexPush writes a denseRIV to the lexicon for permanent storage */
int lexPush(denseRIV* RIVout);
int cacheCheckOnPush(denseRIV* RIVout);
/* lexPull reads a denseRIV from the lexicon, under "word"
* if the file does not exist, it creates a 0 vector with the name of word
* lexPull returns a denseRIV *pointer* because its data must be tracked
* globally for key optimizations
*/
denseRIV* lexPull(char* word);
denseRIV* cacheCheckOnPull(char* word);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexPush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV* RIVout);
/* flexPull pulls data directly from a file and converts it (if necessary)
* to a denseRIV. function is called by "lexPull" which is what users
* should actually use. lexPull, unlike FlexPull, has cache logic under
* the hood for speed and harddrive optimization
*/
denseRIV* fLexPull(FILE* lexWord);
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
/* Opens the lexicon named lexName: ensures its directory exists, records
 * the name in the global RIVKey, installs the cache-rescue signal handler,
 * and zeroes the in-memory RIV cache.
 * Must be called before any lexPull/lexPush; pair with lexClose(). */
void lexOpen(char* lexName){
struct stat st = {0};
/* create the lexicon directory if it does not already exist */
if (stat(lexName, &st) == -1) {
mkdir(lexName, 0777);
}
/* NOTE(review): no bounds check — assumes lexName fits in RIVKey.lexName */
strcpy(RIVKey.lexName, lexName);
/* open a slot at least large enough for ;worst case handling of
* sparse to dense conversion. may be enlarged by filetoL2 functions */
struct sigaction action = {0};
action.sa_sigaction = signalSecure;
action.sa_flags = SA_SIGINFO;
/* route signals 1..26 through signalSecure so the cache is flushed to
 * disk before the process dies (covers SIGSEGV, SIGINT, etc.) */
for(int i=1; i<27; i++){
sigaction(i,&action,NULL);
}
/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
memset(RIVKey.RIVCache, 0, sizeof(denseRIV*)*CACHESIZE);
}
/* Closes the lexicon: flushes every cached vector to its file.
 * Must be the last lexicon call; skipping it loses un-flushed cache data. */
void lexClose(){
	int dumpFailed = cacheDump();
	if(dumpFailed != 0){
		puts("cache dump failed, some lexicon data was lost");
	}
}
#if CACHESIZE > 0
/* Looks up word in the in-memory cache.
 * The slot is chosen by seeding rand() with a word-derived seed, so the
 * same word always hashes to the same slot.
 * returns: the cached vector on a hit, NULL on a miss. */
denseRIV* cacheCheckOnPull(char* word){
	srand(wordtoSeed(word));
	int slot = rand()%CACHESIZE;
	denseRIV* candidate = RIVKey.RIVCache[slot];
	/* a hit requires the slot to be occupied by this exact word */
	if(candidate && !strcmp(word, candidate->name)){
		return candidate;
	}
	return NULL;
}
#endif
/* Fetches the dense vector for word: from the cache when present,
 * otherwise from its lexicon file, otherwise a fresh zero vector.
 * returns: a heap/cached denseRIV labelled with word; NULL only on
 * allocation failure (callers should check).
 * Ownership: the caller must hand the vector back via lexPush. */
denseRIV* lexPull(char* word){
	denseRIV* output;
#if CACHESIZE > 0
	/* if there is a cache, first check if the word is cached */
	if((output = cacheCheckOnPull(word))){
		return output;
	}
#endif /* CACHESIZE > 0 */
	/* if not, attempt to pull the word data from lexicon file */
	char pathString[200];
	/* snprintf bounds the write: the original sprintf could overflow
	 * pathString on pathologically long words */
	snprintf(pathString, sizeof(pathString), "%s/%s", RIVKey.lexName, word);
	FILE *lexWord = fopen(pathString, "rb");
	/* if this lexicon file already exists */
	if(lexWord){
		/* pull data from file */
		output = fLexPull(lexWord);
		fclose(lexWord);
	}else{
		/*if file does not exist, return a 0 vector (word is new to the lexicon) */ //#TODO enable NO-NEW features to protect mature lexicons?
		output = calloc(1, sizeof(denseRIV));
		if(!output){
			/* allocation failed: the original would have crashed in
			 * strcpy below; report and bail instead */
			fprintf(stderr, "lexPull: allocation failed for word: %s\n", word);
			return NULL;
		}
	}
	/* NOTE(review): assumes word fits in output->name — confirm name size */
	strcpy(output->name, word);
	return output;
}
#if CACHESIZE > 0
/* Tries to park an outgoing vector in the cache instead of writing it
 * to disk. A slot is claimed when empty, or stolen when this vector is
 * more frequent than the current occupant (which is flushed to file).
 * returns: 1 if the vector now resides in the cache, 0 if the caller
 * must push it to file itself. */
int cacheCheckOnPush(denseRIV* RIVout){
	/* already resident in the cache: nothing to do */
	if(RIVout->cached){
		return 1;
	}
	/* same word-seeded hash as cacheCheckOnPull, so pulls find it again */
	srand(wordtoSeed(RIVout->name));
	int slot = rand()%CACHESIZE;
	denseRIV* occupant = RIVKey.RIVCache[slot];
	if(!occupant){
		/* empty slot: claim it outright */
		RIVKey.RIVCache[slot] = RIVout;
		RIVout->cached = 1;
		return 1;
	}
	if(RIVout->frequency > occupant->frequency){
		/* evict the less-frequent occupant to its file and take the slot */
		fLexPush(occupant);
		RIVKey.RIVCache[slot] = RIVout;
		RIVout->cached = 1;
		return 1;
	}
	return 0;
}
#endif
/* Stores a vector for the long term: cached in memory when possible,
 * otherwise written straight to its lexicon file.
 * returns: 0 on success (or successful caching), nonzero on file failure. */
int lexPush(denseRIV* RIVout){
#if CACHESIZE > 0
	/* a cached entry counts as stored; skip the file write */
	if(cacheCheckOnPush(RIVout)){
		return 0;
	}
#endif /* CACHESIZE != 0 */
	/* no cache slot available: write through to disk */
	return fLexPush(RIVout);
}
/* Writes a denseRIV to its lexicon file (named after the word) and frees it.
 * On-disk layout: [count:size_t][frequency:int][contextSize:int]
 * [magnitude:float] then either sparse (locations[], values[]) when the
 * vector is under half-saturated, or the full dense array with count==0
 * as the "dense" flag.
 * returns: 0 on success, 1 if the file could not be opened.
 * Ownership: consumes (frees) output on success.
 * NOTE(review): on fopen failure, output is NOT freed — leak on that path.
 * NOTE(review): fwrite/fread pass (count, sizeof) as (nmemb, size) in
 * swapped order relative to convention; the byte total is the same but
 * short-write detection is affected — confirm intent. */
int fLexPush(denseRIV* output){
char pathString[200] = {0};
denseRIV RIVout = *output;
/* word data will be placed in a (new?) file under the lexicon directory
* in a file named after the word itself */
sprintf(pathString, "%s/%s", RIVKey.lexName, RIVout.name);
FILE *lexWord = fopen(pathString, "wb");
if(!lexWord){
printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
return 1;
}
/* collapse the dense vector to sparse form to measure saturation */
sparseRIV temp = consolidateD2S(RIVout.values);
if(temp.count<(RIVSIZE/2)){
/* smaller stored as sparse vector */
fwrite(&temp.count, 1, sizeof(size_t), lexWord);
fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
fwrite(&RIVout.contextSize, 1, sizeof(int), lexWord);
fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
fwrite(temp.locations, temp.count, sizeof(int), lexWord);
fwrite(temp.values, temp.count, sizeof(int), lexWord);
}else{
/* saturation is too high, better to store dense */
/* there's gotta be a better way to do this */
/* count==0 marks the record as dense for fLexPull */
temp.count = 0;
fwrite(&temp.count, 1, sizeof(size_t), lexWord);
fwrite(&RIVout.frequency, 1, sizeof(int), lexWord);
fwrite(&RIVout.contextSize, 1, sizeof(int), lexWord);
fwrite(&RIVout.magnitude, 1, sizeof(float), lexWord);
fwrite(RIVout.values, RIVSIZE, sizeof(int), lexWord);
}
fclose(lexWord);
free(output);
free(temp.locations);
return 0;
}
/* Reads one vector record from an open lexicon file into a fresh denseRIV.
 * Mirrors the layout written by fLexPush: a leading size_t is the sparse
 * value count, or 0 meaning the record is stored dense.
 * returns: a heap-allocated denseRIV; caller owns it (handed to lexPush).
 * NOTE(review): fread return values are unchecked — a truncated or
 * corrupt file yields a silently garbage vector; calloc is also
 * unchecked. Consider validating. */
denseRIV* fLexPull(FILE* lexWord){
denseRIV *output = calloc(1,sizeof(denseRIV));
size_t typeCheck;
/* get metadata for vector */
fread(&typeCheck, 1, sizeof(size_t), lexWord);
fread(&output->frequency, 1, sizeof(int), lexWord);
fread(&output->contextSize, 1, sizeof(int), lexWord);
fread(&output->magnitude, 1, sizeof(float), lexWord);
/* first value stored is the value count if sparse, and 0 if dense */
if (typeCheck){
/* pull as sparseVector */
sparseRIV temp;
/* value was not 0, so it's the value count */
temp.count = typeCheck;
/* single allocation holds locations then values back-to-back */
temp.locations = (int*)malloc(temp.count*2*sizeof(int));
temp.values = temp.locations+temp.count;
fread(temp.locations, temp.count, sizeof(int), lexWord);
fread(temp.values, temp.count, sizeof(int), lexWord);
/* expand the sparse record into the zeroed dense array */
addS2D(output->values, temp);
free(temp.locations);
}else{
/* typecheck is thrown away, just a flag in this case */
fread(output->values, RIVSIZE, sizeof(int), lexWord);
}
/* freshly pulled vectors are not yet cache-resident */
output->cached = 0;
return output;
}
/* Flushes every resident cache entry to its lexicon file.
 * returns: 0 on success; otherwise the number of failed pushes.
 * Fix: fLexPush frees each entry, but the original left the freed
 * pointer in the cache slot — any later cache lookup or second dump
 * would touch freed memory (use-after-free / double-free). Clear each
 * slot after pushing so the cache is left in a valid empty state. */
int cacheDump(){
	int flag = 0;
	for(int i = 0; i < CACHESIZE; i++){
		if(RIVKey.RIVCache[i]){
			flag += fLexPush(RIVKey.RIVCache[i]);
			/* entry is freed (or its push failed); drop the dangling pointer */
			RIVKey.RIVCache[i] = NULL;
		}
	}
	return flag;
}
/*TODO add a simplified free function*/
/* Signal handler installed by lexOpen: flushes the cache to disk before
 * letting the process die of the original signal. Re-raises the signal
 * with default disposition so exit status/core behavior is preserved.
 * NOTE(review): puts/printf and the file I/O in cacheDump are not
 * async-signal-safe; this is best-effort rescue, not guaranteed. */
void signalSecure(int signum, siginfo_t *si, void* arg){
if(cacheDump()){
puts("cache dump failed, some lexicon data lost");
}else{
puts("cache dumped successfully");
}
/* restore default handling and re-deliver, so the process terminates
 * as it normally would for this signal */
signal(signum, SIG_DFL);
kill(getpid(), signum);
}
#endif
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h>
#define CACHESIZE 15000
#include <setjmp.h>
#include <signal.h>
#include "RIVtools.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <dirent.h> #include <dirent.h>
#include <error.h> #include <error.h>
#include "../../RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one
void fileGrind(FILE* textFile); void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount); void addContext(denseRIV* lexRIV, sparseRIV context);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString); void directoryGrind(char *rootString);
void readdirContingency(int sigNumber);
jmp_buf readdirRecov;
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
clock_t begintotal = clock();
lexOpen("/home/drbob/Documents/lexicon");
char pathString[1000]; char pathString[1000];
//we open the lexicon, if it does not yet exist, it will be created
lexOpen("lexicon");
//we format the root directory, preparing to scan its contents
strcpy(pathString, argv[1]); strcpy(pathString, argv[1]);
strcat(pathString, "/"); strcat(pathString, "/");
struct stat st = {0}; //ensure that the targeted root directory exists
struct stat st;
if(stat(pathString, &st) == -1) { if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist");
return 1; return 1;
} }
//we will scan the directory, adding all data to our lexicon, as seen inside
directoryGrind(pathString); directoryGrind(pathString);
clock_t endtotal = clock(); //we close the lexicon again, ensuring all data is secured
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
lexClose(); lexClose();
return 0; return 0;
} }
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){ //mostly a standard recursive Dirent-walk
denseRIV *denseSet_slider = denseSet;
denseRIV *dense_stop = denseSet+RIVCount;
while(denseSet_slider<dense_stop){
addS2D((*denseSet_slider).values, additive);
*(denseSet_slider->contextSize) += additive.frequency;
denseSet_slider++;
}
}
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
denseRIV* RIVStop = RIVSet+wordCount;
while(RIVSet<RIVStop){
if(!strcmp(word, RIVSet->name)){
return 1;
}
RIVSet++;
}
return 0;
}
void directoryGrind(char *rootString){ void directoryGrind(char *rootString){
/* *** begin Dirent walk *** */
char pathString[2000]; char pathString[2000];
DIR *directory; DIR *directory;
struct dirent *files = 0; struct dirent *files = 0;
...@@ -74,15 +53,13 @@ void directoryGrind(char *rootString){ ...@@ -74,15 +53,13 @@ void directoryGrind(char *rootString){
} }
while((files=readdir(directory))){ while((files=readdir(directory))){
if(setjmp(readdirRecov)){
continue;
}
//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name)); if(!files->d_name[0]) break;
while(*(files->d_name)=='.'){ while(*(files->d_name)=='.'){
files = readdir(directory); files = readdir(directory);
} }
//signal(SIGSEGV, signalSecure);
if(files->d_type == DT_DIR){ if(files->d_type == DT_DIR){
strcpy(pathString, rootString); strcpy(pathString, rootString);
...@@ -90,63 +67,87 @@ void directoryGrind(char *rootString){ ...@@ -90,63 +67,87 @@ void directoryGrind(char *rootString){
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
strcat(pathString, "/"); strcat(pathString, "/");
directoryGrind(pathString); directoryGrind(pathString);
continue;
} }
strcpy(pathString, rootString); strcpy(pathString, rootString);
strcat(pathString, files->d_name); strcat(pathString, files->d_name);
printf("%s\n", pathString); printf("%s\n", pathString);
FILE *input = fopen(pathString, "r+"); /* *** end dirent walk, begin meat of function *** */
//check for non-txt files
char *fileEnding = pathString+strlen(pathString)-4;
if(strcmp(fileEnding, ".txt")){
printf("skipped: %s\n", files->d_name);
continue;
}
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){ if(input){
//process this file and add it's data to lexicon
fileGrind(input); fileGrind(input);
fclose(input); fclose(input);
} }
} }
} }
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
void fileGrind(FILE* textFile){ void fileGrind(FILE* textFile){
sparseRIV aggregateRIV = fileToL2Clean(textFile); //form a context vector. "clean" indicates that it will ignore any word which
fseek(textFile, 0, SEEK_SET); //contains unwanted characters
sparseRIV contextVector = fileToL2Clean(textFile);
int wordCount = 0; //an array of denseRIVs, large enough to hold all vectors
denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV)); //(we don't yet know how many vectors there will be, so we make it big enough for the maximum)
char word[200]; denseRIV* lexiconRIV;
char word[100] = {0};
while(fscanf(textFile, "%99s", word)){ while(fscanf(textFile, "%99s", word)){
//we ensure that each word exists, and is free of unwanted characters
if(feof(textFile)) break; if(feof(textFile)) break;
if(!(*word))continue; if(!(*word))continue;
if(!isWordClean((char*)word)){ if(!isWordClean((char*)word)){
continue; continue;
} }
if(checkDupe(RIVArray, word, wordCount)){
continue;
}
RIVArray[wordCount] = lexPull(word);
if(!*((RIVArray[wordCount].name))) break;
*(RIVArray[wordCount].frequency)+= 1;; //we pull the vector corresponding to each word from the lexicon
//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing); //if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(word);
wordCount++; //we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector);
} //we remove the sub-vector corresponding to the word itself
//printf("%d\n", wordCount); subtractThisWord(lexiconRIV);
addS2Ds(RIVArray, aggregateRIV, wordCount); //we log that this word has been encountered one more time
denseRIV* RIVArray_slider = RIVArray; lexiconRIV->frequency += 1;
denseRIV* RIVArray_stop = RIVArray+wordCount;
while(RIVArray_slider<RIVArray_stop){
lexPush(*RIVArray_slider); //and finally we push it back to the lexicon for permanent storage
RIVArray_slider++; lexPush(lexiconRIV);
}
free(RIVArray);
free(aggregateRIV.locations);
}
free(contextVector.locations);
} }
void readdirContingency(int sigNumber){
puts("readdir segfaulted, trying to recover"); void addContext(denseRIV* lexRIV, sparseRIV context){
longjmp(readdirRecov, 1);
//add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context);
//log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analyses
lexRIV->contextSize += context.contextSize;
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment