Commit 34c65893 by amberhosen

updated RIVreads

parent 9d2c0fed
#ifndef RIVLOWER_H_
#define RIVLOWER_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <math.h>
#include <sys/stat.h>
#include <sys/types.h>
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
/* zero is rejected as well as negatives: RIVSIZE is used as a modulus
* (rand()%RIVSIZE), so a value of 0 would divide by zero
*/
#if RIVSIZE<1
#error "RIVSIZE must be a positive number (preferably a large positive)"
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
#if NONZEROS%2 || NONZEROS<1
#error "NONZEROS must be an even, greater than 0 number"
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 20
#endif
#if CACHESIZE<0
#error "CACHESIZE cannot be a negative number"
#endif
/* the size of the tempBlock used in consolidation and implicit RIVs.
* parenthesized so the macro keeps its value inside larger expressions
* (for example x / TEMPSIZE)
*/
#define TEMPSIZE (3*RIVSIZE)
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is adviseable as the default
* unless we are doing long term RIV aggregation.
* specifically, a sparseRIV contains a pair of arrays,
* containing locations and values, where pairs are found in like array
* indices.
*/
/* sparse RIV: parallel locations/values arrays plus per-vector metadata */
typedef struct{
/* label of the vector; presumably the word or document it represents -- TODO confirm */
char name[100];
/* values[i] is the value stored at dense index locations[i] */
int *values;
/* dense indices; consolidateD2S allocates locations and values as one block
* with values starting right after the last location */
int *locations;
/* number of location/value pairs */
size_t count;
/* NOTE(review): never assigned in this header; read by addS2Ds in the reader program */
int frequency;
/* NOTE(review): never computed in this header; presumably a cached vector magnitude */
double magnitude;
/* NOTE(review): purpose not visible in this file */
int boolean;
/* NOTE(review): never assigned in this header; meaning to be confirmed */
int contextSize;
}sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic)
*/
/* dense RIV: one heap block holding the vector and two trailing metadata ints */
typedef struct{
/* fLexPush uses this as the file name inside the lexicon directory */
char name[100];
/* start of the block made by denseAllocate: RIVSIZE vector entries followed by two ints */
int* values;
/* points at values[RIVSIZE], inside the same block */
int* frequency;
/* stored next to the vector by fLexPush/fLexPull; not computed in this file */
double magnitude;
/* nonzero marks a cache slot that cacheDump pushes to disk */
int cached;
/* points at values[RIVSIZE+1], inside the same block */
int *contextSize;
}denseRIV;
/*RIVKey, holds global variables used under the hood, primarily for the lexicon
* it also holds a "temp block" that will be used by the dense to sparse
* conversion and implicit RIV aggregation
*/
/* RIVKey is declared static in this header, so every translation unit that
* includes it gets its own private copy */
struct RIVData{
/* scratch space; consolidateD2S stages locations at offset RIVSIZE and
* values at offset 2*RIVSIZE */
int h_tempBlock[TEMPSIZE];
/* NOTE(review): not read or written anywhere in this file */
int tempSize;
/* lexicon directory path, set by lexOpen and used by fLexPush */
char lexName[255];
/* cache slots; zeroed by lexOpen and flushed by cacheDump */
denseRIV RIVCache[CACHESIZE];
}static RIVKey;
/* lexOpen is called to "open the lexicon", setting up for later calls to
* lexPush and lexPull. if the lexicon has not been opened before calls
* to these functions, their behavior can be unpredictable, most likely crashing
*/
void lexOpen();
/* lexClose should always be called after the last lex push or lex pull call
* if the lexicon is left open, some vector data may be lost due to
* un-flushed RIV cache
*/
void lexClose();
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* contain any metadata
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* makeSparseLocations must be called repeatedly in the processing of a
* file to produce a series of locations from the words of the file
* this produces an "implicit" RIV which can be used with the mapI2D function
* to create a denseRIV.
*/
void makeSparseLocations(unsigned char* word, int *seeds, size_t seedCount);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexPush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV RIVout);
/* fLexPull pulls data directly from a file and converts it (if necessary)
* to a denseRIV. function is called by "lexPull" which is what users
* should actually use. lexPull, unlike fLexPull, has cache logic under
* the hood for speed and harddrive optimization
*/
denseRIV fLexPull(FILE* lexWord);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(unsigned char* word);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
/* highly optimized method for adding vectors. there is no method
* included for adding D2D or S2S, as this system is faster-enough
* to be more than worth using
*/
int* addS2D(int* destination, sparseRIV input);
/*
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount);
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount);
* consolidate I2S is temporarily deprecated. may be brought back.
* in tandem they are much faster, but less careful with RAM */
/* cacheDump flushes the RIV cache out to relevant files, backing up all
* data. this is called by the lexClose and signalSecure functions
*/
int cacheDump();
/* adds all elements of an implicit RIV (a sparseRIV represented without values)
* to a denseRIV. used by the file2L2 functions in aggregating a document vector
*/
int* addI2D(int* destination, int* locations, size_t seedCount);
/* allocates a denseRIV filled with 0s
*/
denseRIV denseAllocate();
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum);
/* begin definitions */
/* addS2D folds a sparse RIV into a dense value array in place.
* destination: dense array indexed by the sparse locations
* input: sparse vector whose pairs are added
* returns destination, to allow chaining
*/
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
    /* pair k contributes values[k] at dense index locations[k] */
    for(size_t pair = 0; pair < input.count; pair++){
        destination[input.locations[pair]] += input.values[pair];
    }
    return destination;
}
/* mapI2D builds a freshly allocated dense vector from an implicit RIV.
* locations: indices taken in pairs; the first of each pair gets +1,
*            the second gets -1
* valueCount: number of entries in locations
* returns a calloc'd array of RIVSIZE ints owned by the caller,
*         or NULL when the allocation fails
*/
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    int *destination = (int*)calloc(RIVSIZE,sizeof(int));
    if(!destination){
        return NULL;
    }
    size_t pairedCount = valueCount - (valueCount % 2);
    for(size_t i = 0; i < pairedCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* an odd count leaves one unpaired index: apply its +1 but never read
    * the partner that would lie past the end of the input */
    if(valueCount % 2){
        destination[locations[valueCount-1]] += 1;
    }
    return destination;
}
/* addI2D adds an implicit RIV into an existing dense array in place.
* destination: dense array indexed by the given locations
* locations: indices taken in pairs; the first of each pair gets +1,
*            the second gets -1
* valueCount: number of entries in locations
* returns destination
*/
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    size_t pairedCount = valueCount - (valueCount % 2);
    for(size_t i = 0; i < pairedCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* an odd count leaves one unpaired index: apply its +1 but never read
    * the partner that would lie past the end of the input */
    if(valueCount % 2){
        destination[locations[valueCount-1]] += 1;
    }
    return destination;
}
/*
sparseRIV consolidateI2SIndirect(int *implicit, size_t valueCount){
int *denseTemp = mapI2D(implicit, valueCount);
sparseRIV sparseOut = consolidateD2S(denseTemp);
free(denseTemp);
return sparseOut;
}
sparseRIV consolidateI2SDirect(int *implicit, size_t valueCount){
sparseRIV sparseOut;
int *locationsTemp = RIVKey.h_tempBlock+RIVSIZE;
int *valuesTemp = RIVKey.h_tempBlock+2*RIVSIZE;
sparseOut.count = 0;
int add = 1;
int found;
for(int i=0; i<valueCount; i++){
found = 0;
for(int j=0; j<sparseOut.count; j++){
if(implicit[i] == locationsTemp[j]){
valuesTemp[i] += add;
add *= -1;
found = 1;
}
}
if(!found){
locationsTemp[sparseOut.count] = implicit[i];
valuesTemp[sparseOut.count] = add;
sparseOut.count++;
add*= -1;
}
}
sparseOut.locations = (int*)malloc(2*sparseOut.count*sizeof(int));
sparseOut.values = sparseOut.locations+sparseOut.count;
memcpy(sparseOut.locations, locationsTemp, sparseOut.count*sizeof(int));
memcpy(sparseOut.values, valuesTemp, sparseOut.count*sizeof(int));
return sparseOut;
}*/
/* consolidateD2S copies the non-zero entries of a dense array into a sparseRIV.
* denseInput: array of RIVSIZE ints
* returns a sparseRIV whose locations and values share one malloc'd block
*         (free locations only). metadata fields are zeroed. on allocation
*         failure the result has count 0 and NULL locations/values.
*/
sparseRIV consolidateD2S(int *denseInput){
    /* zero everything so callers never see indeterminate metadata */
    sparseRIV output = {0};
    /* pairs are staged in the worst-case sized temporary block */
    int* locations = RIVKey.h_tempBlock+RIVSIZE;
    int* values = locations+RIVSIZE;
    size_t found = 0;
    for(int i=0; i<RIVSIZE; i++){
        /* act only on non-zeros */
        if(denseInput[i]){
            locations[found] = i;
            values[found] = denseInput[i];
            found++;
        }
    }
    /* always request at least one int: malloc(0) may return NULL, which
    * would otherwise look like an allocation failure for an all-zero input */
    output.locations = (int*) malloc((found ? found*2 : 1)*sizeof(int));
    if(!output.locations){
        printf("memory allocation failed"); //*TODO enable fail point knowledge and security
        /* nothing may be copied into a NULL block */
        return output;
    }
    output.count = found;
    output.values = output.locations + found;
    memcpy(output.locations, locations, found*sizeof(int));
    memcpy(output.values, values, found*sizeof(int));
    return output;
}
/* lexOpen prepares the lexicon for lexPush/lexPull.
* lexName: path of the lexicon directory; created if stat cannot find it
* side effects: stores the path in RIVKey.lexName, installs signalSecure
* for signal numbers 1 through 19 and zeroes the RIV cache
*/
void lexOpen(char* lexName){
    /* RIVKey.I2SThreshold = sqrt(RIVSIZE);*/ //deprecate?
    struct stat st;
    if (stat(lexName, &st) == -1) {
        if (mkdir(lexName, 0777) == -1) {
            perror("lexOpen: could not create lexicon directory");
        }
    }
    /* bounded copy: the destination holds 255 bytes and the name is caller supplied */
    int needed = snprintf(RIVKey.lexName, sizeof(RIVKey.lexName), "%s", lexName);
    if (needed < 0 || (size_t)needed >= sizeof(RIVKey.lexName)) {
        fprintf(stderr, "lexOpen: lexicon name too long, truncated\n");
    }
    /* route signals through signalSecure so the cache is dumped first */
    for(int i=1; i<20; i++){
        signal(i, signalSecure);
    }
    /* start with an empty cache of dense RIVs */
    memset(RIVKey.RIVCache, 0, sizeof(denseRIV)*CACHESIZE);
}
/* lexClose flushes the cache through cacheDump and reports on stdout
* when any push failed
*/
void lexClose(){
    int failedPushes = cacheDump();
    if(failedPushes != 0){
        puts("cache dump failed, some lexicon data was lost");
    }
}
/* wordtoSeed derives an integer seed from the characters of a word.
* word: NUL-terminated string
* returns the sum of each character shifted left by 5 bits per position
*
* the arithmetic is done in unsigned int and the shift amount is reduced
* modulo 32. the earlier form shifted a signed int by i*5 directly, which is
* undefined once the shift reaches the width of int (8 or more characters
* with a 32-bit int) and can overflow a signed int before that.
* NOTE(review): reducing the shift modulo 32 mirrors what x86 shift
* instructions do with oversized counts, so seeds for long words presumably
* stay what they were on such builds -- confirm before relying on it.
*/
int wordtoSeed(unsigned char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;
    for(; *word; word++){
        /* characters are taken as digits counted in base 32 */
        seed += (unsigned int)*word << (shift & 31u);
        shift += 5;
    }
    /* conversion of values above INT_MAX is implementation-defined, not undefined */
    return (int)seed;
}
/* makeSparseLocations appends the pseudo-random locations of one word to an
* implicit RIV.
* word: seeds rand() through wordtoSeed, so a word always yields the same locations
* locations: buffer being filled
* count: number of entries already present; writing starts at locations[count]
*/
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    int *slot = locations + count;
    srand(wordtoSeed(word));
    /* two draws per step, as NONZEROS is required to be even */
    for(int written = 0; written < NONZEROS; written += 2){
        slot[written] = rand()%RIVSIZE;
        slot[written+1] = rand()%RIVSIZE;
    }
}
/* fLexPush writes one denseRIV to its file inside the lexicon directory.
* file layout: size_t pair count (0 means dense), int frequency,
* int contextSize, 4-byte magnitude, then either locations followed by
* values (sparse) or RIVSIZE values (dense).
* RIVout: vector to store; its values block is freed once the file was opened
* returns 0 on success, 1 when the path does not fit, the file cannot be
*         opened, or any write or the close fails
*/
int fLexPush(denseRIV RIVout){
    /* large enough for a full lexName (255) plus '/' plus a full name (100) */
    char pathString[400] = {0};
    /* word data will be placed in a (new?) file under the lexicon directory
    * in a file named after the word itself */
    int pathLength = snprintf(pathString, sizeof(pathString), "%s/%s", RIVKey.lexName, RIVout.name);
    if(pathLength < 0 || (size_t)pathLength >= sizeof(pathString)){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", RIVout.name);
        return 1;
    }
    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }
    sparseRIV temp = consolidateD2S(RIVout.values);
    /* sparse storage needs a usable sparse copy and less than half saturation */
    int storeSparse = temp.locations && temp.count<(RIVSIZE/2);
    size_t typeCheck = storeSparse ? temp.count : 0;
    /* the magnitude field on disk is 4 bytes wide; convert instead of copying
    * the first 4 bytes of the double.
    * NOTE(review): the reader has to load this field as a float -- confirm */
    float magnitude = (float)RIVout.magnitude;
    int failed = 0;
    failed |= fwrite(&typeCheck, sizeof(size_t), 1, lexWord) != 1;
    failed |= fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1;
    failed |= fwrite(RIVout.contextSize, sizeof(int), 1, lexWord) != 1;
    failed |= fwrite(&magnitude, sizeof(float), 1, lexWord) != 1;
    if(storeSparse){
        if(temp.count){
            failed |= fwrite(temp.locations, sizeof(int), temp.count, lexWord) != temp.count;
            failed |= fwrite(temp.values, sizeof(int), temp.count, lexWord) != temp.count;
        }
    }else{
        /* saturation is too high, better to store dense */
        failed |= fwrite(RIVout.values, sizeof(int), RIVSIZE, lexWord) != (size_t)RIVSIZE;
    }
    /* fclose flushes, so a failure here also means lost data */
    failed |= fclose(lexWord) != 0;
    free(RIVout.values);
    free(temp.locations);
    return failed;
}
/* fLexPull reads one vector from an open lexicon file into a new denseRIV.
* lexWord: stream positioned at the start of a file written by fLexPush
* returns the vector from denseAllocate with metadata and values filled in.
*         a truncated or inconsistent file leaves the values at zero instead
*         of using data that was never read.
*/
denseRIV fLexPull(FILE* lexWord){
    denseRIV output = denseAllocate();
    if(!output.values){
        /* allocation failed inside denseAllocate; nothing can be stored */
        return output;
    }
    output.cached = 0;
    size_t typeCheck = 0;
    /* the magnitude field on disk is 4 bytes wide, so it is read as a float.
    * NOTE(review): the writer has to store a float in this field -- confirm */
    float magnitude = 0;
    /* get metadata for vector */
    if(fread(&typeCheck, sizeof(size_t), 1, lexWord) != 1
    || fread(output.frequency, sizeof(int), 1, lexWord) != 1
    || fread(output.contextSize, sizeof(int), 1, lexWord) != 1
    || fread(&magnitude, sizeof(float), 1, lexWord) != 1){
        return output;
    }
    output.magnitude = magnitude;
    /* first value stored is the value count if sparse, and 0 if dense */
    if(!typeCheck){
        /* a short read simply leaves the remaining entries at zero */
        size_t got = fread(output.values, sizeof(int), RIVSIZE, lexWord);
        (void)got;
        return output;
    }
    /* a sparse vector cannot hold more pairs than the vector has dimensions */
    if(typeCheck > (size_t)RIVSIZE){
        return output;
    }
    int *pairs = (int*)malloc(typeCheck*2*sizeof(int));
    if(!pairs){
        return output;
    }
    int *locations = pairs;
    int *values = pairs+typeCheck;
    if(fread(locations, sizeof(int), typeCheck, lexWord) == typeCheck
    && fread(values, sizeof(int), typeCheck, lexWord) == typeCheck){
        for(size_t pair = 0; pair < typeCheck; pair++){
            /* the index comes from disk: ignore anything outside the vector */
            if(locations[pair] >= 0 && locations[pair] < RIVSIZE){
                output.values[locations[pair]] += values[pair];
            }
        }
    }
    free(pairs);
    return output;
}
/* signalSecure is the handler installed by lexOpen: it dumps the cache,
* reports the outcome on stdout, restores the default action for the
* signal and exits with status 1
*/
void signalSecure(int signum){
    const char *report = cacheDump()
        ? "cache dump failed, some lexicon data lost"
        : "cache dumped successfully";
    puts(report);
    signal(signum, SIG_DFL);
    exit(1);
}
/* cacheDump pushes every cache slot marked as cached through fLexPush.
* returns the sum of the fLexPush results, so 0 means every push succeeded
*/
int cacheDump(){
    int failures = 0;
    for(int slot = 0; slot < CACHESIZE; slot++){
        /* empty slots have nothing to save */
        if(!RIVKey.RIVCache[slot].cached){
            continue;
        }
        failures += fLexPush(RIVKey.RIVCache[slot]);
    }
    return failures;
}
/* denseAllocate creates a zero vector.
* returns a denseRIV whose values block holds RIVSIZE zeroed entries followed
*         by the frequency and contextSize ints; name, magnitude and cached
*         are zeroed. when the allocation fails, values, frequency and
*         contextSize are all NULL.
*/
denseRIV denseAllocate(){
    /* zero the whole struct so name is never returned uninitialized */
    denseRIV output = {0};
    output.values = (int*)calloc(RIVSIZE+2, sizeof(int));
    if(!output.values){
        /* no pointer arithmetic on a NULL block */
        return output;
    }
    /* for compact memory use, frequency is placed immediately after values */
    output.frequency = output.values+RIVSIZE;
    output.contextSize = output.frequency+1;
    return output;
}
/*TODO add a simplified free function*/
#endif
#ifndef RIVLOWER_H_
#define RIVLOWER_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <sys/stat.h>
#include "RIVaccessories.h"
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
/* zero is rejected as well as negatives: RIVSIZE is used as a modulus
* (rand()%RIVSIZE) and as an array length
*/
#if RIVSIZE<1
#error "RIVSIZE must be a positive number (preferably a large positive)"
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
#if NONZEROS%2 || NONZEROS<1
#error "NONZEROS must be an even, greater than 0 number"
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 5000
#endif
#if CACHESIZE<0
#error "CACHESIZE cannot be a negative number"
#endif
/* the size of the tempBlock used in consolidation and implicit RIVs.
* parenthesized so the macro keeps its value inside larger expressions
* (for example x / TEMPSIZE)
*/
#define TEMPSIZE (3*RIVSIZE)
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is adviseable as the default
* unless we are doing long term RIV aggregation.
* specifically, a sparseRIV contains a pair of arrays,
* containing locations and values, where pairs are found in like array
* indices.
*/
/* sparse RIV: parallel locations/values arrays plus per-vector metadata */
typedef struct{
/* label of the vector; presumably the word or document it represents -- TODO confirm */
char name[100];
/* values[i] is the value stored at dense index locations[i] */
int *values;
/* dense indices; consolidateD2S allocates locations and values as one block
* with values starting right after the last location */
int *locations;
/* number of location/value pairs */
size_t count;
/* NOTE(review): never computed in this header; presumably a cached vector magnitude */
double magnitude;
/* added to a lexicon vector's contextSize by addContext in the reader program */
int contextSize;
/* NOTE(review): never assigned in this header; meaning to be confirmed */
int frequency;
}sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic)
*/
/* dense RIV with the vector stored inline, so the struct itself holds RIVSIZE ints */
typedef struct{
/* NOTE(review): presumably marks a vector held in RIVKey.RIVCache -- confirm in the lexicon code */
int cached;
/* the word; subtractThisWord seeds rand() from it */
char name[100];
/* incremented once per encounter by the reader program */
int frequency;
/* NOTE(review): never computed in this header */
double magnitude;
/* raised by addContext, lowered by one in subtractThisWord */
int contextSize;
/* the vector itself */
int values[RIVSIZE];
}denseRIV;
/*RIVKey, holds global variables used under the hood, primarily for the lexicon
* it also holds a "temp block" that will be used by the dense to sparse
* conversion and implicit RIV aggregation
*/
/* RIVKey is declared static in this header, so every translation unit that
* includes it gets its own private copy */
struct RIVData{
/* scratch space; consolidateD2S stages locations at offset RIVSIZE and
* values at offset 2*RIVSIZE */
int h_tempBlock[TEMPSIZE];
/* NOTE(review): not read or written anywhere in this header */
int tempSize;
/* lexicon directory path, set by lexOpen */
char lexName[255];
/* cache slots, held as pointers to denseRIVs */
denseRIV* RIVCache[CACHESIZE];
}static RIVKey;
/*consolidateD2S takes a denseRIV value-set input, and returns a sparse RIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned
* to a denseRIV after the fact. often denseRIVs are only temporary, and don't
* contain any metadata
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* makeSparseLocations must be called repeatedly in the processing of a
* file to produce a series of locations from the words of the file
* this produces an "implicit" RIV which can be used with the mapI2D function
* to create a denseRIV.
*/
void makeSparseLocations(char* word, int *seeds, size_t seedCount);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
/* highly optimized method for adding vectors. there is no method
* included for adding D2D or S2S, as this system is faster-enough
* to be more than worth using
*/
int* addS2D(int* destination, sparseRIV input);
/* cacheDump flushes the RIV cache out to relevant files, backing up all
* data. this is called by the lexClose and signalSecure functions
*/
int cacheDump();
/* adds all elements of an implicit RIV (a sparseRIV represented without values)
* to a denseRIV. used by the file2L2 functions in aggregating a document vector
*/
int* addI2D(int* destination, int* locations, size_t seedCount);
/*subtracts a words vector from its own context. regularly used in lex building
*/
void subtractThisWord(denseRIV* vector);
/* begin definitions */
/* addS2D folds a sparse RIV into a dense value array in place.
* destination: dense array indexed by the sparse locations
* input: sparse vector whose pairs are added
* returns destination, to allow chaining
*/
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
    size_t remaining = input.count;
    const int *where = input.locations;
    const int *amount = input.values;
    /* consume one location/value pair per step */
    for(; remaining > 0; remaining--, where++, amount++){
        destination[*where] += *amount;
    }
    return destination;
}
/* mapI2D builds a freshly allocated dense vector from an implicit RIV.
* locations: indices taken in pairs; the first of each pair gets +1,
*            the second gets -1
* valueCount: number of entries in locations
* returns a calloc'd array of RIVSIZE ints owned by the caller,
*         or NULL when the allocation fails
*/
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    int *destination = (int*)calloc(RIVSIZE,sizeof(int));
    if(!destination){
        return NULL;
    }
    size_t pairedCount = valueCount - (valueCount % 2);
    for(size_t i = 0; i < pairedCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* an odd count leaves one unpaired index: apply its +1 but never read
    * the partner that would lie past the end of the input */
    if(valueCount % 2){
        destination[locations[valueCount-1]] += 1;
    }
    return destination;
}
/* addI2D adds an implicit RIV into an existing dense array in place.
* destination: dense array indexed by the given locations
* locations: indices taken in pairs; the first of each pair gets +1,
*            the second gets -1
* valueCount: number of entries in locations
* returns destination
*/
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
    size_t pairedCount = valueCount - (valueCount % 2);
    for(size_t i = 0; i < pairedCount; i += 2){
        destination[locations[i]] += 1;
        destination[locations[i+1]] -= 1;
    }
    /* an odd count leaves one unpaired index: apply its +1 but never read
    * the partner that would lie past the end of the input */
    if(valueCount % 2){
        destination[locations[valueCount-1]] += 1;
    }
    return destination;
}
/* consolidateD2S copies the non-zero entries of a dense array into a sparseRIV.
* denseInput: array of RIVSIZE ints
* returns a sparseRIV whose locations and values share one malloc'd block
*         (free locations only). metadata fields are zeroed. on allocation
*         failure the result has count 0 and NULL locations/values.
*/
sparseRIV consolidateD2S(int *denseInput){
    /* zero everything so callers never see indeterminate metadata */
    sparseRIV output = {0};
    /* pairs are staged in the worst-case sized temporary block */
    int* locations = RIVKey.h_tempBlock+RIVSIZE;
    int* values = locations+RIVSIZE;
    size_t found = 0;
    for(int i=0; i<RIVSIZE; i++){
        /* act only on non-zeros */
        if(denseInput[i]){
            locations[found] = i;
            values[found] = denseInput[i];
            found++;
        }
    }
    /* always request at least one int: malloc(0) may return NULL, which
    * would otherwise look like an allocation failure for an all-zero input */
    output.locations = (int*) malloc((found ? found*2 : 1)*sizeof(int));
    if(!output.locations){
        printf("memory allocation failed"); //*TODO enable fail point knowledge and security
        /* nothing may be copied into a NULL block */
        return output;
    }
    output.count = found;
    output.values = output.locations + found;
    memcpy(output.locations, locations, found*sizeof(int));
    memcpy(output.values, values, found*sizeof(int));
    return output;
}
/* makeSparseLocations appends the pseudo-random locations of one word to an
* implicit RIV.
* word: seeds rand() through wordtoSeed, so a word always yields the same locations
* locations: buffer being filled
* count: number of entries already present; writing starts at locations[count]
*/
void makeSparseLocations(char* word, int *locations, size_t count){
    int *slot = locations + count;
    srand(wordtoSeed(word));
    /* two draws per step, as NONZEROS is required to be even */
    for(int written = 0; written < NONZEROS; written += 2){
        slot[written] = rand()%RIVSIZE;
        slot[written+1] = rand()%RIVSIZE;
    }
}
/* sparseAllocateFormatted returns one zero-filled sparseRIV from the heap,
* or NULL when calloc fails; the caller owns the result
*/
sparseRIV* sparseAllocateFormatted(){
    return (sparseRIV*)calloc(1, sizeof(sparseRIV));
}
/* subtractThisWord removes a word's own base vector from its context vector.
* vector: lexicon entry; name selects the base vector, values is adjusted
* in place and contextSize is lowered by one
*/
void subtractThisWord(denseRIV* vector){
    /* seeding from the word replays the locations generated for it */
    srand(wordtoSeed(vector->name));
    /* the base vector holds +1 then -1 per pair of draws; applying the
    * opposite signs subtracts it */
    for(int undone = 0; undone < NONZEROS; undone += 2){
        int plusSlot = rand()%RIVSIZE;
        vector->values[plusSlot] -= 1;
        int minusSlot = rand()%RIVSIZE;
        vector->values[minusSlot] += 1;
    }
    /* record a context size 1 smaller */
    vector->contextSize -= 1;
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define CACHESIZE 15000
#define RIVSIZE 50000
#define NONZEROS 8
#include <setjmp.h>
#include <signal.h>
#include "../RIVet/RIVtools.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
#define RIVSIZE 200000
#define NONZEROS 2
#define CACHESIZE 1000
#include "../RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one
void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
void readdirContingency(int sigNumber);
jmp_buf readdirRecov;
/* main opens the lexicon, walks the directory named by argv[1] through
* directoryGrind, prints the elapsed CPU time and closes the lexicon.
* NOTE(review): this text appears to interleave lines from two revisions
* (two lexOpen calls, two declarations of st); confirm which lines belong to
* the current file before building.
* NOTE(review): argv[1] is used without checking argc, and is copied into
* pathString with an unbounded strcpy.
*/
int main(int argc, char *argv[]){
clock_t begintotal = clock();
lexOpen("/home/drbob/Documents/lexicon8-50");
char pathString[1000];
//we open the lexicon, if it does not yet exist, it will be created
lexOpen("lexicon200-2");
//we format the root directory, preparing to scan its contents
strcpy(pathString, argv[1]);
strcat(pathString, "/");
struct stat st = {0};
//ensure that the targeted root directory exists
struct stat st;
if(stat(pathString, &st) == -1) {
printf("directory doesn't seem to exist");
return 1;
}
//we will scan the directory, adding all data to our lexicon, as seen inside
directoryGrind(pathString);
clock_t endtotal = clock();
//clock() measures processor time, so this is CPU seconds rather than wall time
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
//we close the lexicon again, ensuring all data is secured
lexClose();
return 0;
}
/* addS2Ds adds one sparse vector to each of the first RIVCount dense vectors
* and raises each vector's contextSize by the sparse vector's frequency
*/
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
    for(int which = 0; which < RIVCount; which++){
        denseRIV *target = &denseSet[which];
        addS2D(target->values, additive);
        *(target->contextSize) += additive.frequency;
    }
}
/* checkDupe reports whether word already names one of the first wordCount
* vectors in RIVSet.
* returns 1 when a vector with that name exists, 0 otherwise
*/
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
    for(int seen = 0; seen < wordCount; seen++){
        if(strcmp(word, RIVSet[seen].name) == 0){
            return 1;
        }
    }
    return 0;
}
//mostly a standard recursive Dirent-walk
void directoryGrind(char *rootString){
/* *** begin Dirent walk *** */
char pathString[2000];
DIR *directory;
struct dirent *files = 0;
......@@ -76,79 +57,101 @@ void directoryGrind(char *rootString){
}
while((files=readdir(directory))){
if(setjmp(readdirRecov)){
continue;
}
//printf("reclen: %d, d_name pointer: %p, firstDigit, %d", files->d_reclen,files->d_name,*(files->d_name));
if(!files->d_name[0]) break;
while(*(files->d_name)=='.'){
files = readdir(directory);
}
//signal(SIGSEGV, signalSecure);
if(files->d_type == DT_DIR){
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
strcat(pathString, "/");
directoryGrind(pathString);
continue;
}
strcpy(pathString, rootString);
strcat(pathString, files->d_name);
printf("%s\n", pathString);
FILE *input = fopen(pathString, "r+");
/* *** end dirent walk, begin meat of function *** */
//check for non-txt files
char *fileEnding = pathString+strlen(pathString)-4;
if(strcmp(fileEnding, ".txt")){
printf("skipped: %s\n", files->d_name);
continue;
}
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){
//process this file and add it's data to lexicon
fileGrind(input);
fclose(input);
}
}
}
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
/* fileGrind builds a context vector for one text file and applies it to the
* lexicon entry of the words read from that file.
* NOTE(review): this text appears to interleave lines from two revisions: one
* keeps a local RIVArray of denseRIV values and pushes them after the read
* loop, the other works on a denseRIV* returned by lexPull. word is declared
* twice and fileToL2Clean is called twice. confirm which lines belong to the
* current file before relying on any of it.
*/
void fileGrind(FILE* textFile){
sparseRIV aggregateRIV = fileToL2Clean(textFile);
fseek(textFile, 0, SEEK_SET);
int wordCount = 0;
denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
char word[200];
//form a context vector. "clean" indicates that it will ignore any word which
//contains unwanted characters
sparseRIV contextVector = fileToL2Clean(textFile);
//an array of denseRIVs, large enough to hold all vectors
//(we don't yet know how many vectors there will be, so we make it big enough for the maximum)
denseRIV* lexiconRIV;
char word[100] = {0};
//NOTE(review): fscanf returns EOF (non-zero) at end of input, so the loop relies on the feof check below to stop
while(fscanf(textFile, "%99s", word)){
//we ensure that each word exists, and is free of unwanted characters
if(feof(textFile)) break;
if(!(*word))continue;
if(!isWordClean((char*)word)){
continue;
}
if(checkDupe(RIVArray, word, wordCount)){
continue;
}
RIVArray[wordCount] = lexPull(word);
if(!*((RIVArray[wordCount].name))) break;
*(RIVArray[wordCount].frequency)+= 1;;
//printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
wordCount++;
}
//printf("%d\n", wordCount);
addS2Ds(RIVArray, aggregateRIV, wordCount);
denseRIV* RIVArray_slider = RIVArray;
denseRIV* RIVArray_stop = RIVArray+wordCount;
while(RIVArray_slider<RIVArray_stop){
lexPush(*RIVArray_slider);
RIVArray_slider++;
//we pull the vector corresponding to each word from the lexicon
//if it's a new word, lexPull returns a 0 vector
lexiconRIV= lexPull(word);
//we add the context of this file to this wordVector
addContext(lexiconRIV, contextVector);
//we remove the sub-vector corresponding to the word itself
subtractThisWord(lexiconRIV);
//we log that this word has been encountered one more time
lexiconRIV->frequency += 1;
//and finally we push it back to the lexicon for permanent storage
lexPush(lexiconRIV);
}
free(RIVArray);
free(aggregateRIV.locations);
free(contextVector.locations);
}
void readdirContingency(int sigNumber){
puts("readdir segfaulted, trying to recover");
longjmp(readdirRecov, 1);
void addContext(denseRIV* lexRIV, sparseRIV context){
//add context to the lexRIV, (using sparse-dense vector comparison)
addS2D(lexRIV->values, context);
//log the "size" of the vector which was added
//this is not directly necessary, but is useful metadata for some analises
lexRIV->contextSize += context.contextSize;
}
......@@ -5,15 +5,16 @@ clean(){
else
python shittyballs.py "$1"
./RIVread cleanbooks/
# ./RIVread1 cleanbooks/
./RIVread1 cleanbooks/
./RIVread2 cleanbooks/
#./RIVread3 cleanbooks/
#./RIVread4 cleanbooks/
./RIVread3 cleanbooks/
./RIVread4 cleanbooks/
./RIVread5 cleanbooks/
./RIVread6 cleanbooks/
./RIVread7 cleanbooks/
rm -r cleanbooks/
#rm "$1"
fi
shift
done
......@@ -21,4 +22,4 @@ clean(){
clean ../bookCleaner/books/*
clean ../../books/*
import requests
#import requests
import re
import string
import os
......@@ -9,31 +9,37 @@ from nltk.corpus import wordnet as wn
import pdb
from nltk.stem import PorterStemmer
# adverbFix drops the last two characters of a word and records the original
# and shortened forms as files under lexicon/.
# NOTE(review): this text appears to interleave lines from two revisions: it
# tests the tag against both 'RB' and 'JJ', and writes "2" + temp although
# temp is never assigned here. confirm against the actual file.
def adverbFix(word):
if not nltk.pos_tag(word)[0][1] == 'RB':
return word
# the candidate base form is the word without its final two characters
adjective = word[:-2]
if not nltk.pos_tag(word)[0][1] == 'JJ':
return word;
FILE = open("lexicon/" + word, "w")
FILE.write("2" + temp)
FILE.close()
FILE = open("lexicon/" + adjective, "w")
FILE.write("1")
FILE.close()
return adjective
def strip(word):
    """Remove the first matching suffix from word.

    Suffixes are tried in a fixed order; the word without the first one that
    matches is returned. When none matches the result is None.
    """
    suffixes = ('ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment')
    matched = next((ending for ending in suffixes if word.endswith(ending)), None)
    if matched is None:
        return None
    return word[:-len(matched)]
def writeWord(cleanString, word, stem, blacklist):
    """Record a word/stem pair under lexicon/ and extend the cleaned text.

    cleanString: text accumulated so far
    word: the cleaned word
    stem: its stem
    blacklist: stems that must not be recorded

    A word that is its own stem gets a file containing "1" and is appended.
    Otherwise, when the stem is not blacklisted and is longer than two
    characters, the word's file gets "2" followed by the stem, the stem's
    file gets "1", and the stem is appended. In every other case cleanString
    is returned unchanged.

    Files are written inside with-blocks so the handle is closed even when
    the write raises.
    """
    if word == stem:
        with open("lexicon/" + word, "w") as wordFile:
            wordFile.write("1")
        return (cleanString + " " + word)
    if stem not in blacklist and len(stem) > 2:
        with open("lexicon/" + word, "w") as wordFile:
            wordFile.write("2" + stem)
        with open("lexicon/" + stem, "w") as stemFile:
            stemFile.write("1")
        return (cleanString + " " + stem)
    return cleanString
def liFix(word):
    """Re-stem a word that ends in "li".

    Words without that ending are returned unchanged. Otherwise the word
    minus its last two characters is passed to ps.stem; the stem is returned
    when it is non-empty, else the original word.
    """
    if word[len(word)-2:] != "li":
        return word
    restemmed = ps.stem(word[:-2])
    return restemmed if restemmed else word
def cleanWord(word):
#if(len(word) == 0):
#print("\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************\n\n***************")
word = word.lower();
regex = re.compile('[^a-z]+')
word = regex.sub('', word)
......@@ -44,13 +50,11 @@ def cleanWord(word):
def fileCheck(word):
try:
#print("trying")
wordFile = open("lexicon/{}".format(word), "r")
code = int(wordFile.read(1))
except:
#print("file does not exist")
return 0
#print("fileCode{}".format(code))
if code == 2:
word = wordFile.read()
......@@ -74,6 +78,8 @@ def morphyTest(word):
return morphyTemp;
#begin mainfunction
blacklist = ["a", "an", "the", "so", "as", "how",
"i", "me", "we", "they", "you", "it", "he", "she",
"but", "have", "had",
......@@ -90,13 +96,13 @@ print(sourceString + "\n")
if not os.path.exists('cleanbooks'):
os.makedirs('cleanbooks')
# if not os.path.exists('lexicon'):
# os.makedirs('lexicon')
if not os.path.exists('lexicon'):
os.makedirs('lexicon')
if not os.path.exists(pathString):
os.makedirs(pathString)
#call(["python", "blacklist.py"])
call(["python", "blacklist.py"])
i=0
skip = 1
with open(sourceString, 'U') as fileIn:
......@@ -127,27 +133,31 @@ with open(sourceString, 'U') as fileIn:
for tempWord in line.split():
word=cleanWord(tempWord)
if not word:
continue
# temp = fileCheck(word)
#
# if temp == -1:
# continue
# if temp == 0:
temp = morphyTest(word)
if temp:
stem = ps.stem(temp)
if stem and not stem in blacklist:
cleanString = cleanString + ' ' + stem
if len(word) < 3:
continue;
if word in blacklist:
continue;
temp = fileCheck(word)
if temp == -1:
continue
if temp:
cleanString = (cleanString + " " + temp);
continue
else:
morphy = morphyTest(word)
if morphy:
stem = ps.stem(morphy)
if stem:
stem = liFix(stem)
cleanString = writeWord(cleanString, word, stem, blacklist)
#if temp == 0:
# catchAll(word)
cleanString = cleanString + os.linesep
if len(cleanString.split(' ')) > 10:
if len(cleanString.split(' ')) > 2:
fileOut.write(cleanString)
fileOut.close()
......
#ifndef RIVACCESS_H_
#define RIVACCESS_H_
/*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through
*/
int isWordClean(char* word);
/* used by wordClean */
int isLetter(char c);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(char* word);
int isLetter(char c){
if((c>96 && c<123)||(c == 32) || (c == '_')) return 1;
......@@ -26,5 +33,19 @@ int isWordClean(char* word){
return 1;
}
/* wordtoSeed derives an integer seed from the characters of a word.
* word: NUL-terminated string
* returns the sum of each character shifted left by 5 bits per position
*
* the arithmetic is done in unsigned int and the shift amount is reduced
* modulo 32. the earlier form shifted a signed int by i*5 directly, which is
* undefined once the shift reaches the width of int (8 or more characters
* with a 32-bit int) and can overflow a signed int before that.
* NOTE(review): reducing the shift modulo 32 mirrors what x86 shift
* instructions do with oversized counts, so seeds for long words presumably
* stay what they were on such builds -- confirm before relying on it.
*/
int wordtoSeed(char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;
    for(; *word; word++){
        /* characters are taken as digits counted in base 32; converting the
        * char straight to unsigned keeps the bit pattern a negative char
        * would have had as an int */
        seed += (unsigned int)*word << (shift & 31u);
        shift += 5;
    }
    /* conversion of values above INT_MAX is implementation-defined, not undefined */
    return (int)seed;
}
#endif
#ifndef RIV_LEXICON_H
#define RIV_LEXICON_H
#include "RIVLower.h"
#include "RIVaccessories.h"
/* lexOpen is called to "open the lexicon", setting up for later calls to
* lexPush and lexPull. if the lexicon has not been opened before calls
* to these functions, their behavior can be unpredictable, most likely crashing
*/
void lexOpen();
/* lexClose should always be called after the last lex push or lex pull call
* if the lexicon is left open, some vector data may be lost due to
* un-flushed RIV cache
*/
void lexClose();
/* both lexPush and lexPull must be called *after* the lexOpen() function
* and after using them the lexClose() function must be called to ensure
* data security */
/* lexPush writes a denseRIV to the lexicon for permanent storage */
int lexPush(denseRIV* RIVout);
int cacheCheckOnPush(denseRIV* RIVout);
/* lexPull reads a denseRIV from the lexicon, under "word"
* if the file does not exist, it creates a 0 vector with the name of word
* lexPull returns a denseRIV *pointer* because its data must be tracked
* globally for key optimizations
*/
denseRIV* lexPull(char* word);
denseRIV* cacheCheckOnPull(char* word);
/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
* saving it for long-term aggregation. function is called by "lexPush",
* which is what users should actually use. lexPush, unlike fLexPush,
* has cache logic under the hood for speed and harddrive optimization
*/
int fLexPush(denseRIV* RIVout);
/* fLexPull pulls data directly from a file and converts it (if necessary)
* to a denseRIV. This function is called by "lexPull", which is what users
* should actually use. lexPull, unlike fLexPull, has cache logic under
* the hood for speed and hard-drive optimization
*/
denseRIV* fLexPull(FILE* lexWord);
/* redefines signal behavior to protect cached data against seg-faults etc*/
void signalSecure(int signum, siginfo_t *si, void* arg);
/* begin definitions */
void lexOpen(char* lexName){
struct stat st = {0};
if (stat(lexName, &st) == -1) {
mkdir(lexName, 0777);
}
strcpy(RIVKey.lexName, lexName);
/* open a slot at least large enough for ;worst case handling of
* sparse to dense conversion. may be enlarged by filetoL2 functions */
struct sigaction action = {0};
action.sa_sigaction = signalSecure;
action.sa_flags = SA_SIGINFO;
for(int i=1; i<27; i++){
sigaction(i,&action,NULL);
}
/* open a slot for a cache of dense RIVs, optimized for frequent accesses */
memset(RIVKey.RIVCache, 0, sizeof(denseRIV*)*CACHESIZE);
}
/* lexClose flushes every cached RIV to its lexicon file through cacheDump
 * and reports on stdout when any of those pushes failed */
void lexClose(){
    int failedPushes = cacheDump();
    if(failedPushes){
        puts("cache dump failed, some lexicon data was lost");
    }
}
#if CACHESIZE > 0
denseRIV* cacheCheckOnPull(char* word){
srand(wordtoSeed(word));
int hash = rand()%CACHESIZE;
if(RIVKey.RIVCache[hash]){
if(!strcmp(word, RIVKey.RIVCache[hash]->name)){
/* if word is cached, pull from cache and exit */
return RIVKey.RIVCache[hash];
}
}
return NULL;
}
#endif
/* lexPull fetches the denseRIV stored under word.
 * Order of lookup:
 *  1. the RIV cache (when CACHESIZE > 0); a hit returns the cached pointer
 *  2. the lexicon file "<RIVKey.lexName>/<word>", read through fLexPull
 *  3. otherwise a freshly zeroed denseRIV (word is new to the lexicon)
 * In cases 2 and 3 the name field is set to word.
 *
 * returns: pointer to the denseRIV, or NULL when the file path does not fit
 *          the path buffer or memory could not be allocated
 */
denseRIV* lexPull(char* word){
    denseRIV* output;
#if CACHESIZE > 0
    /* if there is a cache, first check if the word is cached */
    if((output = cacheCheckOnPull(word))){
        return output;
    }
#endif /* CACHESIZE > 0 */

    /* if not, attempt to pull the word data from lexicon file */
    char pathString[200];
    int pathLength = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, word);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        /* the path would have overflowed pathString */
        return NULL;
    }

    FILE *lexWord = fopen(pathString, "rb");
    if(lexWord){
        /* this lexicon file already exists: pull data from file */
        output = fLexPull(lexWord);
        fclose(lexWord);
    }else{
        /* file does not exist, return a 0 vector (word is new to the lexicon) */
        /* #TODO enable NO-NEW features to protect mature lexicons? */
        output = calloc(1, sizeof(denseRIV));
    }
    if(!output){
        return NULL;
    }
    strcpy(output->name, word);
    return output;
}
#if CACHESIZE > 0
/* cacheCheckOnPush tries to keep RIVout in the RIV cache instead of
 * writing it to its file.
 * returns 1 when RIVout is (now) held by the cache:
 *   - it was already flagged as cached, or
 *   - its slot was empty, or
 *   - its frequency is greater than that of the slot's occupant, which is
 *     then written out through fLexPush and replaced
 * returns 0 when the occupant stays and the caller must write RIVout itself
 */
int cacheCheckOnPush(denseRIV* RIVout){
    if(RIVout->cached){
        return 1;
    }

    srand(wordtoSeed(RIVout->name));
    denseRIV** slot = &RIVKey.RIVCache[rand()%CACHESIZE];

    if(*slot){
        if(!(RIVout->frequency > (*slot)->frequency)){
            /* the occupant is at least as frequent, it keeps its place */
            return 0;
        }
        /* evict the lower frequency occupant to its file */
        fLexPush(*slot);
    }

    *slot = RIVout;
    (*slot)->cached = 1;
    return 1;
}
#endif
/* lexPush stores RIVout in the lexicon.
 * With a cache (CACHESIZE > 0) the vector is first offered to
 * cacheCheckOnPush; when the cache takes it, nothing is written and 0 is
 * returned. Otherwise the result of fLexPush is returned.
 */
int lexPush(denseRIV* RIVout){
    int heldByCache = 0;
#if CACHESIZE > 0
    heldByCache = cacheCheckOnPush(RIVout);
#endif /* CACHESIZE > 0 */
    return heldByCache ? 0 : fLexPush(RIVout);
}
/* fLexPush writes one denseRIV to the file "<RIVKey.lexName>/<name>".
 *
 * File layout (unchanged):
 *   size_t count      number of sparse pairs, or 0 when stored dense
 *   int    frequency
 *   int    contextSize
 *   sizeof(float) bytes of magnitude
 *   then either count locations followed by count values (sparse form,
 *   used when fewer than RIVSIZE/2 values are non-zero),
 *   or RIVSIZE values (dense form)
 *
 * Ownership: once the file has been opened, output is freed before
 * returning, whatever the outcome of the write. When the path does not fit
 * or the file cannot be opened, output is left untouched.
 *
 * returns: 0 on success, 1 when the path is too long, the file cannot be
 *          opened, or any write / the final close fails
 */
int fLexPush(denseRIV* output){
    char pathString[200] = {0};

    /* word data will be placed in a (new?) file under the lexicon directory
     * in a file named after the word itself */
    int pathLength = snprintf(pathString, sizeof pathString, "%s/%s", RIVKey.lexName, output->name);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
        return 1;
    }

    FILE *lexWord = fopen(pathString, "wb");
    if(!lexWord){
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    /* the vector is read through the pointer; no by-value copy of the
     * whole denseRIV is made */
    sparseRIV temp = consolidateD2S(output->values);
    int failed = 0;

    /* NOTE(review): only sizeof(float) bytes of magnitude are stored, which
     * matches what fLexPull reads back -- confirm against the declared type
     * of denseRIV.magnitude */
    if(temp.count<(RIVSIZE/2)){
        /* smaller stored as sparse vector */
        failed |= fwrite(&temp.count, sizeof(size_t), 1, lexWord) != 1;
        failed |= fwrite(&output->frequency, sizeof(int), 1, lexWord) != 1;
        failed |= fwrite(&output->contextSize, sizeof(int), 1, lexWord) != 1;
        failed |= fwrite(&output->magnitude, sizeof(float), 1, lexWord) != 1;
        failed |= fwrite(temp.locations, sizeof(int), temp.count, lexWord) != temp.count;
        failed |= fwrite(temp.values, sizeof(int), temp.count, lexWord) != temp.count;
    }else{
        /* saturation is too high, better to store dense;
         * a count of 0 flags the dense form */
        size_t denseFlag = 0;
        failed |= fwrite(&denseFlag, sizeof(size_t), 1, lexWord) != 1;
        failed |= fwrite(&output->frequency, sizeof(int), 1, lexWord) != 1;
        failed |= fwrite(&output->contextSize, sizeof(int), 1, lexWord) != 1;
        failed |= fwrite(&output->magnitude, sizeof(float), 1, lexWord) != 1;
        failed |= fwrite(output->values, sizeof(int), RIVSIZE, lexWord) != (size_t)RIVSIZE;
    }

    /* fclose flushes buffered data, so its result matters too */
    if(fclose(lexWord) != 0){
        failed = 1;
    }
    if(failed){
        printf("lexicon write has failed for word: %s\n", pathString);
    }

    free(output);
    free(temp.locations);
    return failed;
}
/* fLexPull reads one lexicon entry from the open file lexWord into a newly
 * allocated, zero-initialized denseRIV.
 *
 * The file starts with a size_t that is the number of sparse pairs, or 0
 * when the vector was stored dense, followed by frequency, contextSize and
 * sizeof(float) bytes of magnitude, then the vector data.
 *
 * returns: the new denseRIV (cached flag cleared, name not set), or NULL
 *          when memory cannot be allocated. When the header is incomplete,
 *          the stored count is larger than RIVSIZE, or the sparse data is
 *          cut short, a message is printed on stderr and the vector values
 *          are left at zero.
 */
denseRIV* fLexPull(FILE* lexWord){
    denseRIV *output = calloc(1,sizeof(denseRIV));
    if(!output){
        return NULL;
    }

    /* get metadata for vector */
    size_t typeCheck = 0;
    size_t headerItems = 0;
    headerItems += fread(&typeCheck, sizeof(size_t), 1, lexWord);
    headerItems += fread(&output->frequency, sizeof(int), 1, lexWord);
    headerItems += fread(&output->contextSize, sizeof(int), 1, lexWord);
    headerItems += fread(&output->magnitude, sizeof(float), 1, lexWord);

    /* the count comes from the file: never trust it as an allocation size
     * or loop bound without checking it first */
    if(headerItems != 4 || typeCheck > (size_t)RIVSIZE){
        fprintf(stderr, "lexicon entry is truncated or corrupt, vector data not loaded\n");
        output->cached = 0;
        return output;
    }

    /* first value stored is the value count if sparse, and 0 if dense */
    if (typeCheck){
        /* pull as sparseVector; locations and values share one allocation */
        sparseRIV temp = {0};
        temp.count = typeCheck;
        temp.locations = (int*)malloc(temp.count*2*sizeof(int));
        if(!temp.locations){
            free(output);
            return NULL;
        }
        temp.values = temp.locations+temp.count;

        size_t gotLocations = fread(temp.locations, sizeof(int), temp.count, lexWord);
        size_t gotValues = fread(temp.values, sizeof(int), temp.count, lexWord);
        if(gotLocations == temp.count && gotValues == temp.count){
            addS2D(output->values, temp);
        }else{
            /* unread slots hold garbage locations: adding them would write
             * outside the vector */
            fprintf(stderr, "lexicon entry is truncated or corrupt, vector data not loaded\n");
        }
        free(temp.locations);
    }else{
        /* typecheck is thrown away, just a flag in this case.
         * a short read is not reported here: fLexPush stores an all-zero
         * vector as count 0 with no data behind it */
        fread(output->values, sizeof(int), RIVSIZE, lexWord);
    }

    output->cached = 0;
    return output;
}
/* cacheDump pushes every cached RIV to its lexicon file and empties the
 * cache.
 * Each slot is cleared after its push: fLexPush frees the vector it
 * writes, so a slot left pointing at it would be pushed and freed a second
 * time by the next dump (for example signalSecure running after lexClose).
 * returns: the number of pushes that reported failure (0 means all stored)
 */
int cacheDump(){
    int flag = 0;
    for(int i = 0; i < CACHESIZE; i++){
        if(RIVKey.RIVCache[i]){
            flag += fLexPush(RIVKey.RIVCache[i]);
            RIVKey.RIVCache[i] = NULL;
        }
    }
    return flag;
}
/*TODO add a simplified free function*/
/* signalSecure is the signal handler installed by lexOpen.
 * It dumps the RIV cache, reports the outcome on stdout, then restores the
 * default action for signum and sends the signal to this process again.
 * si and arg are not used.
 */
void signalSecure(int signum, siginfo_t *si, void* arg){
    const char *report = cacheDump()
        ? "cache dump failed, some lexicon data lost"
        : "cache dumped successfully";
    puts(report);

    signal(signum, SIG_DFL);
    kill(getpid(), signum);
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define CACHESIZE 15000
#include <setjmp.h>
#include <signal.h>
#include "RIVtools.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
void fileGrind(FILE* textFile);
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);
void directoryGrind(char *rootString);
void readdirContingency(int sigNumber);
jmp_buf readdirRecov;
/* reads every file under the directory given as the first argument into
 * the lexicon, then prints the total processor time used.
 * returns 1 when no directory is given, its path is too long for the path
 * buffer, or stat() cannot find it; 0 otherwise
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();

    /* argv[1] is only valid when an argument was actually passed */
    if(argc < 2){
        puts("no root directory given");
        return 1;
    }

    /* root path with a trailing '/', refused when it would not fit */
    char pathString[1000];
    int pathLength = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        puts("root directory path is too long");
        return 1;
    }

    lexOpen("/home/drbob/Documents/lexicon");

    struct stat st = {0};
    if(stat(pathString, &st) == -1) {
        return 1;
    }

    directoryGrind(pathString);

    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);

    lexClose();
    return 0;
}
/* adds the sparse vector additive to each of the first RIVCount dense
 * vectors in denseSet, and adds additive.frequency to the integer each
 * vector's contextSize points at */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
    for(int i = 0; i < RIVCount; i++){
        addS2D(denseSet[i].values, additive);
        *(denseSet[i].contextSize) += additive.frequency;
    }
}
/* returns 1 when one of the first wordCount entries of RIVSet is named
 * exactly word, 0 otherwise */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
    for(int i = 0; i < wordCount; i++){
        if(strcmp(word, RIVSet[i].name) == 0){
            return 1;
        }
    }
    return 0;
}
/* directoryGrind walks the directory rootString (which must end in '/').
 * Entries whose name starts with '.' are skipped, sub-directories are
 * walked recursively, and every entry that can be opened with "r+" is
 * handed to fileGrind. Each visited path is printed.
 * Paths that do not fit in the path buffer are skipped.
 */
void directoryGrind(char *rootString){
    char pathString[2000];
    DIR *directory;
    struct dirent *files = 0;

    if(!(directory = opendir(rootString))){
        printf("location not found, %s\n", rootString);
        return;
    }

    while((files=readdir(directory))){
        /* recovery point used by readdirContingency.
         * NOTE(review): nothing visible here installs that handler */
        if(setjmp(readdirRecov)){
            continue;
        }

        /* skip dot entries; going back to the loop condition lets the end
         * of the directory (NULL from readdir) stop the walk instead of
         * being dereferenced */
        if(*(files->d_name)=='.'){
            continue;
        }

        if(files->d_type == DT_DIR){
            int dirLength = snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
            if(dirLength >= 0 && (size_t)dirLength < sizeof pathString){
                directoryGrind(pathString);
            }
            /* falls through as before: the directory path is printed and
             * the open attempt below is left to fail */
        }

        int pathLength = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
        if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
            continue;
        }
        printf("%s\n", pathString);

        FILE *input = fopen(pathString, "r+");
        if(input){
            fileGrind(input);
            fclose(input);
        }
    }

    /* release the directory stream; it was previously never closed */
    closedir(directory);
}
/* fileGrind builds the aggregate vector of textFile, pulls the lexicon
 * vector of every distinct clean word in the file, adds the aggregate to
 * each of them and pushes them back to the lexicon.
 */
void fileGrind(FILE* textFile){
    sparseRIV aggregateRIV = fileToL2Clean(textFile);
    fseek(textFile, 0, SEEK_SET);

    int wordCount = 0;
    /* one slot per word counted in the aggregate */
    denseRIV *RIVArray = (denseRIV*)malloc(aggregateRIV.frequency*sizeof(denseRIV));
    if(!RIVArray){
        free(aggregateRIV.locations);
        return;
    }

    char word[200];
    /* fscanf returns EOF (non-zero) on end of file AND on read errors;
     * only a return of 1 means a word was actually read */
    while(fscanf(textFile, "%99s", word) == 1){
        /* a word that ends exactly at end-of-file is not processed.
         * NOTE(review): presumably this mirrors how the aggregate was
         * counted -- confirm before changing */
        if(feof(textFile)) break;
        if(!(*word))continue;
        if(!isWordClean((char*)word)){
            continue;
        }
        if(checkDupe(RIVArray, word, wordCount)){
            continue;
        }
        /* never write past the slots that were allocated */
        if(wordCount >= aggregateRIV.frequency) break;

        RIVArray[wordCount] = lexPull(word);
        if(!*((RIVArray[wordCount].name))) break;
        *(RIVArray[wordCount].frequency)+= 1;
        wordCount++;
    }

    addS2Ds(RIVArray, aggregateRIV, wordCount);

    denseRIV* RIVArray_slider = RIVArray;
    denseRIV* RIVArray_stop = RIVArray+wordCount;
    while(RIVArray_slider<RIVArray_stop){
        lexPush(*RIVArray_slider);
        RIVArray_slider++;
    }

    free(RIVArray);
    free(aggregateRIV.locations);
}
/* announces the failure on stdout, then jumps back to the point saved in
 * readdirRecov (setjmp there returns 1). sigNumber is not used. */
void readdirContingency(int sigNumber){
    static const char notice[] = "readdir segfaulted, trying to recover";
    puts(notice);
    longjmp(readdirRecov, 1);
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
#include "../../RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one
void fileGrind(FILE* textFile);
void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
//entry point: argv[1] names the root directory whose files are read into the lexicon
//returns 1 when no directory is given, its path is too long, or it does not exist
int main(int argc, char *argv[]){
    char pathString[1000];

    //argv[1] is only valid when an argument was actually passed
    if(argc < 2){
        printf("no root directory given");
        return 1;
    }

    //we open the lexicon, if it does not yet exist, it will be created
    lexOpen("lexicon");

    //we format the root directory, preparing to scan its contents
    //(refused when it would not fit in pathString)
    int pathLength = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
    if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
        printf("directory path is too long");
        return 1;
    }

    //ensure that the targeted root directory exists
    struct stat st;
    if(stat(pathString, &st) == -1) {
        printf("directory doesn't seem to exist");
        return 1;
    }

    //we will scan the directory, adding all data to our lexicon, as seen inside
    directoryGrind(pathString);

    //we close the lexicon again, ensuring all data is secured
    lexClose();
    return 0;
}
//mostly a standard recursive Dirent-walk
//mostly a standard recursive Dirent-walk over rootString (which must end in '/')
//entries starting with '.' are skipped, sub-directories are walked recursively,
//and every file whose path ends in ".txt" is handed to fileGrind
void directoryGrind(char *rootString){
    /* *** begin Dirent walk *** */
    char pathString[2000];
    DIR *directory;
    struct dirent *files = 0;

    if(!(directory = opendir(rootString))){
        printf("location not found, %s\n", rootString);
        return;
    }

    while((files=readdir(directory))){
        if(!files->d_name[0]) break;

        //skip dot entries; returning to the loop condition lets the end of
        //the directory (NULL from readdir) stop the walk instead of being dereferenced
        if(*(files->d_name)=='.'){
            continue;
        }

        if(files->d_type == DT_DIR){
            int dirLength = snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
            if(dirLength >= 0 && (size_t)dirLength < sizeof pathString){
                directoryGrind(pathString);
            }
            continue;
        }

        //paths that do not fit the buffer are skipped
        int pathLength = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
        if(pathLength < 0 || (size_t)pathLength >= sizeof pathString){
            continue;
        }
        printf("%s\n", pathString);
        /* *** end dirent walk, begin meat of function *** */

        //check for non-txt files; a path shorter than the ending cannot match,
        //and must not be indexed before its first character
        if(pathLength < 4 || strcmp(pathString+pathLength-4, ".txt")){
            printf("skipped: %s\n", files->d_name);
            continue;
        }

        //open a file within root directory
        FILE *input = fopen(pathString, "r");
        if(input){
            //process this file and add its data to lexicon
            fileGrind(input);
            fclose(input);
        }
    }

    //release the directory stream; it was previously never closed
    closedir(directory);
}
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
//form a context vector from the contents of textFile, then add that vector
//to the lexicon entry of every clean word the file contains
void fileGrind(FILE* textFile){
    //form a context vector. "clean" indicates that it will ignore any word which
    //contains unwanted characters
    sparseRIV contextVector = fileToL2Clean(textFile);

    denseRIV* lexiconRIV;
    char word[100] = {0};

    //fscanf returns EOF (non-zero) on end of file AND on read errors;
    //only a return of 1 means a word was actually read
    while(fscanf(textFile, "%99s", word) == 1){
        //a word that ends exactly at end-of-file is not processed
        //NOTE(review): presumably this mirrors how the context vector was built - confirm before changing
        if(feof(textFile)) break;
        //we ensure that each word exists, and is free of unwanted characters
        if(!(*word))continue;
        if(!isWordClean((char*)word)){
            continue;
        }

        //we pull the vector corresponding to each word from the lexicon
        //if it's a new word, lexPull returns a 0 vector
        lexiconRIV= lexPull(word);
        //nothing can be added to a vector that could not be pulled
        if(!lexiconRIV) continue;

        //we add the context of this file to this wordVector
        addContext(lexiconRIV, contextVector);

        //we remove the sub-vector corresponding to the word itself
        subtractThisWord(lexiconRIV);

        //we log that this word has been encountered one more time
        lexiconRIV->frequency += 1;

        //and finally we push it back to the lexicon for permanent storage
        lexPush(lexiconRIV);
    }
    free(contextVector.locations);
}
//adds one context vector to a lexicon vector
void addContext(denseRIV* lexRIV, sparseRIV context){
    //log the "size" of the context being added
    //this is not directly necessary, but is useful metadata for some analyses
    lexRIV->contextSize += context.contextSize;

    //sparse-to-dense vector addition of the context into the lexicon vector
    addS2D(lexRIV->values, context);
}
#ifndef RIVTOOLS_H_
#define RIVTOOLS_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLower.h"
#include "RIVaccessories.h"
#include "RIVlexicon.h"
/* lexPush writes a denseRIV to a file for permanent storage */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
* and creates a denseRIV with those attributes.
* if the file does not exist, it creates a 0 vector with the name of word
*/
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each
* word contained
......@@ -29,35 +22,29 @@ sparseRIV fileToL2(FILE *input);
*/
sparseRIV fileToL2Clean(FILE *data);
/*filetoL2direct is an experiment in simplifying the process. it's slow */
sparseRIV fileToL2direct(FILE *data);
/* like fileToL2 but takes a block of text */
sparseRIV textToL2(char *text);
/*cosine determines the "similarity" between two RIVs. */
double cosCompare(denseRIV baseRIV, sparseRIV comparator);
/*currently unused */
sparseRIV wordtoL2(char* word);
/* converts an implicit RIV (a set of unvalued locations) into a formal
* sparse RIV. This chooses the best method to perform the consolidation
* and launches that function. Defunct right now for memory usage reasons. */
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
sparseRIV normalizeFloored(denseRIV input, int factor);
/*used for analysis of lexicon vectors (not simply accumulation)
* to avoid overflow of even a 64 bit integer, vectors must be normalized
* this is an experimental approximation of true normal, which should yield
* some extra data about the nature of this word's context
*/
sparseRIV normalize(denseRIV input, int factor);
int roundMultiply(int base, float divisor);
/* like fileToL2 but takes a block of text */
sparseRIV text2L2(char *text);
/* calculates the magnitude of a sparseVector */ //TODO contain integer overflow in square process
double getMagnitudeSparse(sparseRIV input);
sparseRIV text2L2(char *text){
/* same for denseVector */
double getMagnitudeDense(denseRIV *input); //TODO consolidate these into one function
sparseRIV textToL2(char *text){
int wordCount = 0;
unsigned char word[100] = {0};
char word[100] = {0};
int denseTemp[RIVSIZE] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved
......@@ -71,7 +58,6 @@ sparseRIV text2L2(char *text){
if(!displacement){
break;
}
if(!(*word)){
break;
}
......@@ -90,18 +76,16 @@ sparseRIV text2L2(char *text){
addI2D(denseTemp, locations, locationCount);
sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file, until frequency
* is needed to hold some more useful data point */
output.frequency = wordCount;
output.boolean = 1;
/* contextSize stores the number of words read */
output.contextSize = wordCount;
return output;
}
sparseRIV fileToL2(FILE *data){
unsigned char word[100] = {0};
char word[100] = {0};
/* locations (implicit RIV) are temp stored in temp block, and moved
* to permanent home in consolidation */
/* locations (implicit RIV) are temporarily stored in temp block,
* and moved to permanent home in consolidation */
int *locations = RIVKey.h_tempBlock;
int locationCount = 0;
int denseTemp[RIVSIZE] = {0};
......@@ -129,17 +113,16 @@ sparseRIV fileToL2(FILE *data){
addI2D(denseTemp, locations, locationCount);
sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file */
output.frequency = wordCount;
output.boolean = 1;
/* contextSize records the number of words in this file */
output.contextSize = wordCount;
fseek(data, 0, SEEK_SET);
return output;
}
sparseRIV fileToL2Clean(FILE *data){
int denseTemp[RIVSIZE] = {0};
unsigned char word[100] = {0};
char word[100] = {0};
int *locations = RIVKey.h_tempBlock;
unsigned int wordCount = 0;
......@@ -172,44 +155,24 @@ sparseRIV fileToL2Clean(FILE *data){
sparseRIV output = consolidateD2S(denseTemp);
/* frequency records the number of words in this file */
output.frequency = locationCount/NONZEROS;
output.boolean = 1;
output.contextSize = locationCount/NONZEROS;
fseek(data, 0, SEEK_SET);
return output;
}
//defunct temporarily, might make a return
/*sparseRIV consolidateI2S(int *implicit, size_t valueCount){
if(valueCount<RIVKey.I2SThreshold){
//direct method is faster on small datasets, but has geometric scaling on large datasets
return consolidateI2SDirect(implicit, valueCount);
}else{
// optimized for large datasets
return consolidateI2SIndirect(implicit, valueCount);
}
}*/
/* adds the pseudo-random pattern derived from word to destination.values:
 * rand() is seeded from wordtoSeed(word), then NONZEROS times one randomly
 * chosen position is incremented and another one is decremented */
void aggregateWord2D(denseRIV destination, char* word){
    srand(wordtoSeed((unsigned char*)word));
    int remaining = NONZEROS;
    while(remaining-- > 0){
        int raised = rand()%RIVSIZE;
        destination.values[raised] += 1;
        int lowered = rand()%RIVSIZE;
        destination.values[lowered] -= 1;
    }
}
double cosCompare(denseRIV baseRIV, sparseRIV comparator){
int dot = 0;
int n = comparator.count;
while(n){
n--;
long long int dot = 0;
int* locations_stop = comparator.locations+comparator.count;
int* locations_slider = comparator.locations;
int* values_slider = comparator.values;
while(locations_slider<locations_stop){
/* we calculate the dot-product to derive the cosine
* comparing sparse to dense by index*/
//dot += values[i]*baseRIV.values[locations[i]];
dot += comparator.values[n] * baseRIV.values[comparator.locations[n]];
//printf("%d, %d, %d\n",baseRIV.values[comparator.locations[n]],comparator.values[n] , n);
dot += *values_slider * baseRIV.values[*locations_slider];
locations_slider++;
values_slider++;
}
/*dot divided by product of magnitudes */
......@@ -222,181 +185,65 @@ double getMagnitudeSparse(sparseRIV input){
int *values = input.values;
int *values_stop = values+input.count;
while(values<values_stop){
/* we sum the squares of all elements */
temp += (*values)*(*values);
//if(temp> 0x0AFFFFFFFFFFFFFF) printf("%s, fuuuuuuuuuuuuck*****************************************",input.name );
values++;
}
/* we take the root of that sum */
return sqrt(temp);
}
/* lexPull returns the denseRIV stored under word.
 * With a cache, the slot picked from the word's seed is returned when its
 * name matches. Otherwise the entry is read from the file
 * "<RIVKey.lexName>/<word>", or freshly allocated through denseAllocate
 * when no such file can be opened; its name is then set to word.
 */
denseRIV lexPull(char* word){
#if CACHESIZE > 0
    /* if there is a cache, first check if the word is cached */
    srand(wordtoSeed((unsigned char*)word));
    int slot = rand()%CACHESIZE;
    if(strcmp(word, RIVKey.RIVCache[slot].name) == 0){
        /* if word is cached, pull from cache and exit */
        return RIVKey.RIVCache[slot];
    }
#endif /* CACHESIZE > 0 */

    /* if not, attempt to pull the word data from lexicon file */
    char lexPath[200];
    sprintf(lexPath, "%s/%s", RIVKey.lexName, word);

    denseRIV pulled;
    FILE *entry = fopen(lexPath, "rb");
    if(!entry){
        /* no file for this word: it is new to the lexicon */
        /* #TODO enable NO-NEW features to protect mature lexicons? */
        pulled = denseAllocate();
    }else{
        /* pull data from the existing lexicon file */
        pulled = fLexPull(entry);
        fclose(entry);
    }

    strcpy(pulled.name, word);
    return pulled;
}
/* lexPush stores RIVout in the lexicon.
 * Without a cache the vector is written straight to its file.
 * With a cache:
 *  - a vector already flagged as cached needs no work
 *  - an unoccupied slot takes the vector
 *  - an occupied slot is taken over when RIVout is more frequent than its
 *    occupant, which is written to its file first; the result of that
 *    write is returned
 *  - otherwise RIVout itself is written to its file
 * returns 0 in every case except the take-over, see above
 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
    /* if there is no cache, simply push to file */
    fLexPush(RIVout);
    return 0;
#else /* CACHESIZE != 0 */
    if(RIVout.cached){
        return 0;
    }

    srand(wordtoSeed((unsigned char*)RIVout.name));
    denseRIV *slot = &RIVKey.RIVCache[rand()%CACHESIZE];
    int diag = 0;

    if(slot->cached){
        if(!(*(RIVout.frequency) > *(slot->frequency))){
            /* the occupant stays, push current RIV to file */
            fLexPush(RIVout);
            return 0;
        }
        /* push the current cache entry to a file */
        diag = fLexPush(*slot);
    }

    /* push the current RIV to cache */
    *slot = RIVout;
    slot->cached = 1;
    return diag;
#endif /* CACHESIZE == 0 */
}
sparseRIV fileToL2direct(FILE *data){;
unsigned char word[100] = {0};
denseRIV denseTemp;
// a temporary dense RIV is stored in the tempBlock
denseTemp.values = RIVKey.h_tempBlock;
memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
int count = 0;
while(fscanf(data, "%99s", word)){
count++;
if(feof(data)){
break;
}
if(!(*word)){
break;
double getMagnitudeDense(denseRIV *input){
size_t temp = 0;
int *values = input->values;
int *values_stop = values+RIVSIZE;
while(values<values_stop){
if(*values){
temp += (*values)*(*values);
}
// add word's L1 RIV to the accumulating implicit RIV
aggregateWord2D(denseTemp, (char*)word);
values++;
}
sparseRIV output = consolidateD2S(denseTemp.values);
// frequency records the number of words in this file
output.frequency = count;
output.boolean = 1;
return output;
return sqrt(temp);
}
/* normalizeFloored scales every non-zero value of input by
 * factor / (*input.contextSize), truncating the result to an integer, and
 * returns the values that are still non-zero as a sparse RIV.
 * Locations and values of the result share one allocation (locations first);
 * name, contextSize and frequency are carried over and magnitude is
 * computed from the scaled values.
 */
sparseRIV normalizeFloored(denseRIV input, int factor){
    float divisor = (float)factor/(*input.contextSize);

    /* staging area: locations in the first RIVSIZE ints of the temp block,
     * values in the next RIVSIZE */
    int* stagedLocations = RIVKey.h_tempBlock;
    int* stagedValues = stagedLocations+RIVSIZE;
    int kept = 0;

    int position = 0;
    while(position < RIVSIZE){
        if(input.values[position]){
            stagedLocations[kept] = position;
            stagedValues[kept] = input.values[position]*divisor;
            /* a value scaled down to 0 is overwritten by the next one */
            if(stagedValues[kept]){
                kept++;
            }
        }
        position++;
    }

    sparseRIV output;
    output.locations = (int*) malloc(kept*2*sizeof(int));
    output.values = output.locations+kept;
    memcpy(output.locations, stagedLocations, kept*sizeof(int));
    memcpy(output.values, stagedValues, kept*sizeof(int));

    strcpy(output.name, input.name);
    output.count = kept;
    output.magnitude = getMagnitudeSparse(output);
    output.contextSize = *input.contextSize;
    output.frequency = *input.frequency;
    return output;
}
/* normalize scales every non-zero value of input by
 * factor / input.contextSize, rounds the result to the nearest integer,
 * and returns the values that are still non-zero as a sparse RIV.
 * Locations and values of the result share one allocation (locations first);
 * name, contextSize and frequency are carried over and magnitude is
 * computed from the scaled values.
 */
sparseRIV normalize(denseRIV input, int factor){
    /* multiplier is the scaling factor we need to bring our vector to the right size */
    float multiplier = (float)factor/(input.contextSize);

    /* write to temp slot, data will go to a permanent home lower in function */
    int* locations = RIVKey.h_tempBlock+RIVSIZE;
    int* values = locations+RIVSIZE;
    int count = 0;
    for(int i=0; i<RIVSIZE; i++){
        /* if this point is 0, skip it */
        if(!input.values[i]) continue;

        /* record position and value in the forming sparse vector */
        locations[count] = i;
        values[count]= round(input.values[i]*multiplier);

        /* drop any 0 values: only a value that rounded to 0 is discarded,
         * negative values and values of 1 are kept */
        if(values[count])count++;
    }

    sparseRIV output;
    output.count = count;

    /* for memory conservation, both datasets are put inline with each other */
    output.locations = (int*) malloc(count*2*sizeof(int));
    output.values = output.locations+count;

    /* copy the data from tempBlock into permanent home */
    memcpy(output.locations, locations, count*sizeof(int));
    memcpy(output.values, values, count*sizeof(int));

    /* carry metadata */
    strcpy(output.name, input.name);
    output.magnitude = getMagnitudeSparse(output);
    output.contextSize = input.contextSize;
    output.frequency = input.frequency;
    return output;
}
/* roundMultiply returns base*divisor rounded to an integer.
 * The product is doubled and truncated; an odd doubled value means the
 * product lies half a unit or more past the truncated half, so the result
 * is moved one unit away from zero. Negative products therefore round the
 * same way as positive ones (-1.5 gives -2, 1.5 gives 2).
 */
int roundMultiply(int base, float divisor){
    float temp = base*divisor;
    int doubled = temp*2;
    int output = doubled/2;
    if(doubled%2){
        /* step away from zero, in the direction of the product's sign */
        output += (doubled > 0) ? 1 : -1;
    }
    return output;
}
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment