Commit 79f1845b by birdperson

merging

parent 1d617988
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 50000
#define CACHESIZE 0
#define EPSILON 0.95
#define MINPOINTS 20
#define UNCHECKED 0
#define NOISE -1
#define MINSIZE 3000
#include "RIVtools.h"
/* one node per loaded vector; DBset is the global array of all nodes,
 * allocated in main and read by DBdive */
struct DBnode{
/* the vector this node wraps, copied by value from the loaded set */
sparseRIV RIV;
/* positions within DBset of the nodes found close enough to this one */
int* indexes;
/* number of entries currently stored in indexes */
int indexCount;
/* 0 while unvisited, NOISE, or the positive number of the owning cluster */
int status;
}*DBset;
void DBdive(int C, int i);
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
lexOpen("/home/drbob/Documents/lexicon8-50");
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
DBset = malloc(fileCount*sizeof(struct DBnode));
struct DBnode* DBset_slider = DBset;
while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
(*DBset_slider).RIV = *fileRIVs_slider;
(*DBset_slider).indexes = malloc(sizeof(int));
(*DBset_slider).indexCount = 0;
(*DBset_slider++).status = 0;
fileRIVs_slider++;
}
free(fileRIVs);
clock_t beginnsquared = clock();
float cosine;
denseRIV baseDense;
baseDense.values = malloc(RIVSIZE*sizeof(int));
for(int i=0; i<fileCount; i++){
memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, DBset[i].RIV);
baseDense.magnitude = DBset[i].RIV.magnitude;
for(int j=i+1; j<fileCount; j++){
cosine = cosCompare(baseDense, DBset[j].RIV);
if(cosine>EPSILON){
DBset[i].indexes = realloc(DBset[i].indexes, (DBset[i].indexCount+1)*sizeof(int));
DBset[i].indexes[DBset[i].indexCount++] = j;
DBset[j].indexes = realloc(DBset[j].indexes, (DBset[j].indexCount+1)*sizeof(int));
DBset[j].indexes[DBset[j].indexCount++] = i;
}
}
}
int C = 0;
printf("got here\n");
for(int i=0; i<fileCount; i++){
if(DBset[i].status) continue;
if(DBset[i].indexCount <MINPOINTS){
DBset[i].status = NOISE;
}
C++;
DBset[i].status = C;
DBdive(C, i);
}
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("\nnsquared time:%lf\n\n", time);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
return 0;
}
/* spreads cluster number C outward from DBset[i].
 * a growing work list of node copies is scanned front to back; every
 * neighbor not yet owned by a cluster is claimed for C, and joins the work
 * list when it has more than MINPOINTS neighbors of its own */
void DBdive(int C, int i){
	printf("root: %s\n", DBset[i].RIV.name);
	int queued = 1;
	struct DBnode *frontier = malloc(sizeof(struct DBnode));
	frontier[0] = DBset[i];
	int cursor = 0;
	while(cursor < queued){
		int slot = 0;
		while(slot < frontier[cursor].indexCount){
			int member = frontier[cursor].indexes[slot];
			slot++;
			/* only unvisited or noise nodes may still be claimed */
			if(DBset[member].status <= 0){
				printf(">>%s\n", DBset[member].RIV.name);
				DBset[member].status = C;
				if(DBset[member].indexCount > MINPOINTS){
					/* dense enough to keep spreading: queue it */
					frontier = realloc(frontier, (queued+1)*sizeof(struct DBnode));
					frontier[queued] = DBset[member];
					queued++;
				}
			}
		}
		cursor++;
	}
	free(frontier);
}
/* walks rootString (which must end in '/') and, for every entry whose
 * lexicon vector has been seen more than MINSIZE times, appends a normalized
 * sparse copy of that vector to *fileRIVs and bumps *fileCount.
 * sub-directories are descended into; hidden entries are skipped.
 * fileRIVs and fileCount are passed by pointer so the caller sees the growth */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
	char pathString[2000];
	struct dirent *files = 0;
	DIR *directory = opendir(rootString);
	if(!directory){
		printf("location not found, %s\n", rootString);
		return;
	}
	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;
		if(files->d_type == DT_DIR){
			int written = snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
			/* a path that does not fit is skipped rather than truncated */
			if(written >= 0 && (size_t)written < sizeof pathString){
				directoryToL2s(pathString, fileRIVs, fileCount);
			}
			/* a directory is not itself a lexicon entry */
			continue;
		}
		denseRIV temp = lexPull(files->d_name);
		if(*temp.frequency >MINSIZE){
			sparseRIV *grown = realloc(*fileRIVs, ((*fileCount)+1)*sizeof(sparseRIV));
			if(!grown){
				printf("memory allocation failed\n");
				free(temp.values);
				break;
			}
			*fileRIVs = grown;
			sparseRIV *slot = &grown[*fileCount];
			*slot = normalize(temp, 500);
			/* d_name may be longer than the name field: copy with a bound */
			snprintf(slot->name, sizeof slot->name, "%s", files->d_name);
			(*fileCount)++;
		}
		free(temp.values);
	}
	closedir(directory);
}
/* this DBSCAN implementation is not meant to be an example of an easily
* written program. rather, it is a tool for validating the contents
* of a lexicon: using a density-based algorithm, it identifies
* clusters of vectors. if the lexicon is well formed, these clusters should
* be numerous, and each should contain closely related words */
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
//RIVSIZE macro must be set to the size of the RIVs in the lexicon
#define RIVSIZE 25000
#define CACHESIZE 0
#define EPSILON 0.98
#define MINPOINTS 1
#define UNCHECKED 0
#define NOISE -1
#define MINSIZE 10000
#include "RIVtools.h"
/* the node holds a vector, and metadata:
* -neighbors will hold pointers to the nodes of its neighbors
* -neighborCount will hold the number of neighbors
* -status will hold its cluster: a cluster number, "unchecked", or "noise"
*/
struct DBnode{
/* the vector this node wraps, copied by value from the loaded set */
sparseRIV RIV;
/* pointers to the nodes found close enough to this one (grown by realloc) */
struct DBnode** neighbors;
/* number of pointers currently stored in neighbors */
int neighborCount;
/* UNCHECKED, NOISE, or the positive number of the owning cluster */
int status;
};
void intercompare(struct DBnode* DBset, int nodeCount);
void DBdive(struct DBnode* root, struct DBnode *DBset, int C);
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
if(argc <2){
printf("give me a directory");
return 1;
}
int fileCount = 0;
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[1000];
lexOpen(argv[1]);
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
/* an array of nodes, one for each vector */
struct DBnode DBset[fileCount];
/* fill the node array with vectors and initialize metadata */
for(int i = 0; i < fileCount; i++){
fileRIVs[i].magnitude = getMagnitudeSparse(fileRIVs[i]);
DBset[i].RIV = fileRIVs[i];
/* a single malloc for later realloc'ing */
DBset[i].neighbors = malloc(sizeof(struct DBnode*));
DBset[i].neighborCount = 0;
DBset[i].status = UNCHECKED;
}
/* fileRIVs was only temporary */
free(fileRIVs);
intercompare(DBset, fileCount);
int C = 0;
for(int i=0; i<fileCount; i++){
if(DBset[i].status) continue;
if(DBset[i].neighborCount <MINPOINTS){
DBset[i].status = NOISE;
continue;
}
C++;
printf("\ncluster %d\n", C);
DBset[i].status = C;
printf("root: %s, %d, %lf\n", DBset[i].RIV.name, DBset[i].RIV.frequency, DBset[i].RIV.magnitude);
DBdive(&DBset[i], DBset, C);
}
return 0;
}
/* recursively claims for cluster C every node reachable from root through
 * neighbors that are dense enough to spread.
 * root:  node whose neighbors are examined
 * DBset: the full node array, only forwarded to the recursive call
 * C:     cluster number being assigned
 * NOTE(review): spreading requires neighborCount > MINPOINTS while main
 * treats neighborCount < MINPOINTS as noise, so a node with exactly
 * MINPOINTS neighbors can root a cluster but never extends one - confirm
 * this asymmetry is intended */
void DBdive(struct DBnode* root, struct DBnode *DBset, int C){
	struct DBnode **cursor = root->neighbors;
	struct DBnode **stop = cursor + root->neighborCount;
	for(; cursor < stop; cursor++){
		struct DBnode *branch = *cursor;
		/* already claimed by a cluster: leave it alone */
		if(branch->status > 0){
			continue;
		}
		printf(">>%s, %d, %lf\n", branch->RIV.name, branch->RIV.frequency, branch->RIV.magnitude);
		branch->status = C;
		/* only sufficiently connected branches keep the dive going */
		if(branch->neighborCount > MINPOINTS){
			DBdive(branch, DBset, C);
		}
	}
}
/* fileRIVs and fileCount are accessed as pointers, so that we can find them changed outside this function
*/
/* scans the lexicon directory rootString and appends to *fileRIVs a
 * normalized sparse copy of every vector whose contextSize exceeds MINSIZE,
 * incrementing *fileCount for each. hidden entries and sub-directories
 * are skipped. */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
	struct dirent *files = 0;
	DIR *directory = opendir(rootString);
	if(!directory){
		printf("location not found, %s\n", rootString);
		return;
	}
	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;
		if(files->d_type == DT_DIR){
			/* the lexicon should not have valid sub-directories */
			continue;
		}
		denseRIV* temp = lexPull(files->d_name);
		/* NOTE(review): guards against a failed pull; confirm whether lexPull can return NULL */
		if(!temp) continue;
		/* if the vector has been encountered more than MINSIZE times
		 * then it should be statistically significant, and useful */
		if(temp->contextSize >MINSIZE){
			sparseRIV *grown = realloc(*fileRIVs, ((*fileCount)+1)*sizeof(sparseRIV));
			if(!grown){
				printf("memory allocation failed\n");
				free(temp);
				break;
			}
			*fileRIVs = grown;
			sparseRIV *slot = &grown[*fileCount];
			*slot = normalize(*temp, 500);
			slot->magnitude = temp->magnitude;
			/* d_name can be longer than the name field: copy with a bound */
			snprintf(slot->name, sizeof slot->name, "%s", files->d_name);
			(*fileCount)++;
		}
		free(temp);
	}
	closedir(directory);
}
/* records `neighbor` in node's neighbor list, growing the list by one.
 * exits the program if the list cannot be grown */
static void DBlinkNeighbor(struct DBnode* node, struct DBnode* neighbor){
	struct DBnode** grown = realloc(node->neighbors, (node->neighborCount+1)*sizeof *grown);
	if(!grown){
		printf("memory allocation failed\n");
		exit(EXIT_FAILURE);
	}
	node->neighbors = grown;
	node->neighbors[node->neighborCount++] = neighbor;
}
/* compares every pair of nodes once and, when their cosine exceeds
 * EPSILON, records each node in the other's neighbor list.
 * DBset:     array of nodes, each with an allocated neighbors list
 * nodeCount: number of nodes in DBset */
void intercompare(struct DBnode* DBset, int nodeCount){
	double cosine;
	/* zeroed so the fields cosCompare may not need are still defined when
	 * the struct is passed by value */
	denseRIV baseDense = {0};
	for(int i=0; i<nodeCount; i++){
		/* map the RIV in question to a dense for comparison */
		memset(baseDense.values, 0, RIVSIZE*sizeof(int));
		addS2D(baseDense.values, DBset[i].RIV);
		baseDense.magnitude = DBset[i].RIV.magnitude;
		/* for each later vector (earlier pairs were handled when the
		 * earlier vector was the base) */
		for(int j=i+1; j<nodeCount; j++){
			/* get cosine similarity to that vector */
			cosine = cosCompare(baseDense, DBset[j].RIV);
			/* if this pair is close enough */
			if(cosine>EPSILON){
				/* add the pairing to each node's list of neighbors */
				DBlinkNeighbor(&DBset[i], &DBset[j]);
				DBlinkNeighbor(&DBset[j], &DBset[i]);
			}
		}
	}
}
/* RIV stands for Random Index Vector, referring to the method of generating
* the basic vectors that correspond to each word. each word has an algorithmically
* generated vector which represents it in this mathematical model, such that a word
* will produce the same vector each time it is encountered*[1]. this base
* vector will be referred to as a L1 vector or a barcode vector
*
* by summing these vectors, we can get a mathematical representation of
* a set of text. this summed vector will be referred to as an L2 vector
* or aggregate vector. in its simplest implementation, an L2 vector
* representation of a document contains a model of the contents of the
* document, enabling us to compare direction and magnitude of document
* vectors to understand their relationships to each other.
*
* but the system we are really interested in is the ability to form
* context vectors
* a context vector is the sum of all (L1?) vectors that the word
* has been encountered in context with. from these context vectors
* certain patterns and relationships between words should emerge.
* what patterns? that is the key question we will try to answer
*
* [1] a word produces the same vector each time it is encountered only
* if the environment is the same, i.e. the RIVs have the same dimensionality
* and the same nonzero count. comparing vectors produced in different
* environments yields meaningless drivel and should be avoided
*
* [2] what exactly "context" means remains a major stumbling point.
* paragraphs? sentences? some potential analyses would expect a static
* sized context (the nearest 10 words?) in order to be sensible, but
* it may be that some other definition of context is the most valid for
* this model. we will have to find out.
*
* some notes:
*
* -sparseRIV vs. denseRIV (sparse vector vs. dense vector)
* the two primary data structures we will use to analyze these vectors
* each vector type is packed with some metadata
* (name, magnitude, frequency, flags)
*
* -denseRIV is a standard vector representation.
* each array index corresponds to a dimension
* each value corresponds to a measurement in that dimension
*
* -sparseRIV is vector representation optimized for largely empty vectors
* each data point is a location/value pair where the
* location represents array index
* value represents value in that array index
*
* if we have a sparsely populated dense vector (mostly 0s) such as:
*
* |0|0|5|0|0|0|0|0|4|0|
*
* there are only 2 values in a ten element array. this could, instead
* be represented as
*
* |2|8| array indexes
* |5|4| array values
* |2| record of size
*
* and so, a 10 element vector has been represented in only 5 integers
*
* this is important for memory use, of course, but also for rapid calculations
* if we have two vectors
*
* |0|0|5|0|0|0|0|0|4|0|
* |0|0|0|0|0|0|7|0|3|-2|
* and we wish to perform the dot product this will take 10 steps,
* 9 of which are either 0*0 = 0, or 0*x = 0
* if we instead have these represented as sparse vectors
* |2|8|
* |5|4|
* |2|
*
* |6|8|9|
* |7|3|-2|
* |3|
*
* we only need to search for matching location values
* or, better yet, if we use a hybrid analysis:
* |0|0|5|0|0|0|0|0|4|0|
* ___________/__/_/
* / / /
* |6|8|9|
* |7|3|-2|
* |3|
* we can simply access the dense vector by indexes held in the sparse vector
* reducing this operation to only 3 steps
*/
#ifndef RIVLOWER_H_
#define RIVLOWER_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <sys/stat.h>
#include "RIVaccessories.h"
#include "assert.h"
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
#if RIVSIZE<4
#error "RIVSIZE must be a positive number, greater than 4 (preferably a large positive)"
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
* for any level one (barcode) RIV. 2 is simple and lightweight to begin
*/
#ifndef NONZEROS
#define NONZEROS 2
#endif
#if NONZEROS%2 || NONZEROS<1
#error "NONZEROS must be an even, greater than 0 number"
#endif
/* CACHESIZE macro defines the number of RIVs the system will cache.
* a larger cache means more memory consumption, but will also be significantly
* faster in aggregation and reading applications. doesn't affect systems
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 10000
#endif
#if CACHESIZE<0
#error "CACHESIZE cannot be a negative number"
#endif
/* the size of the tempBlock used in consolidation and implicit RIVs */
#define TEMPSIZE 3*RIVSIZE
/* the sparseRIV is a RIV form optimized for RIVs that will be mostly 0s
* as this is often an ideal case, it is adviseable as the default
* unless we are doing long term RIV aggregation.
* specifically, a sparseRIV contains a pair of arrays,
* containing locations and values, where pairs are found in like array
* indices.
*/
typedef struct{
/* label for the vector; callers in this project copy a word or file name in */
char name[100];
/* values[i] is the value found at dimension locations[i] */
int *values;
/* indexes of the non-zero dimensions; consolidateD2S allocates this and
 * values as one block, with values starting count ints after locations */
int *locations;
/* number of location/value pairs */
size_t count;
/* NOTE(review): presumably the number of times the word was encountered - confirm */
int frequency;
/* NOTE(review): presumably the accumulated size of the contexts summed in - confirm */
int contextSize;
/* vector magnitude, filled in by the caller (e.g. from getMagnitudeSparse) */
float magnitude;
}sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
* math, as comparisons and arithmetic between vectors are ideally
* performed between sparse and dense (hetero-arithmetic)
*/
typedef struct{
/* label for the vector; subtractThisWord seeds rand() from this name */
char name[100];
/* NOTE(review): purpose not visible here; presumably cache bookkeeping - confirm */
void* cached;
/* incremented once per encounter of the word by the lexicon builder */
int frequency;
/* running total of the contextSize of every context added to this vector */
int contextSize;
/* vector magnitude, filled in by whoever computes it */
float magnitude;
/* one value per dimension, stored inline (RIVSIZE ints) */
int values[RIVSIZE];
}denseRIV;
/*RIVKey, holds global variables used under the hood, primarily for the lexicon
* it also holds a "temp block" that will be used by the dense to sparse
* conversion and implicit RIV aggregation
*/
struct RIVData{
/* scratch space of TEMPSIZE ints; consolidateD2S uses the second RIVSIZE
 * ints for locations and the third for values */
int h_tempBlock[TEMPSIZE];
/* NOTE(review): use not visible in this header section - confirm */
int tempSize;
/* NOTE(review): presumably the name/path of the open lexicon - confirm */
char lexName[255];
/* NOTE(review): presumably the table of cached denseRIVs, sized by CACHESIZE - confirm */
denseRIV** RIVCache;
/* NOTE(review): flag bits; meaning not visible here */
char flags;
}static RIVKey;
/* consolidateD2S takes a denseRIV value-set input, and returns a sparseRIV with
* all 0s removed. it does not automatically carry metadata, which must be assigned
* to the resulting sparseRIV after the fact. often denseRIVs are only temporary,
* and don't contain any metadata
*/
sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
/* makeSparseLocations must be called repeatedly in the processing of a
* file to produce a series of locations from the words of the file
* this produces an "implicit" RIV which can be used with the mapI2D function
* to create a denseRIV.
*/
void makeSparseLocations(char* word, int *seeds, int seedCount);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, int seedCount);
/* highly optimized method for adding vectors. there is no method
* included for adding D2D or S2S, as this system is faster-enough
* to be more than worth using
*/
int* addS2D(int* destination, sparseRIV input);
/* cacheDump flushes the RIV cache out to relevant files, backing up all
* data. this is called by the lexClose and signalSecure functions
*/
int cacheDump();
/* adds all elements of an implicit RIV (a sparseRIV represented without values)
* to a denseRIV. used by the file2L2 functions in aggregating a document vector
*/
int* addI2D(int* destination, int* locations, int seedCount);
/*subtracts a words vector from its own context. regularly used in lex building
*/
void subtractThisWord(denseRIV* vector);
/* begin definitions */
/* adds a sparse vector into a dense value array, in place.
 * destination: dense array, indexed by dimension
 * input:       sparse vector whose pairs are added in
 * returns destination */
int* addS2D(int* destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
	/* each pair names a dimension and the amount to add there */
	for(size_t pair = 0; pair < input.count; pair++){
		destination[input.locations[pair]] += input.values[pair];
	}
	return destination;
}
/* builds a freshly allocated, zeroed dense array of RIVSIZE ints and applies
 * an implicit RIV to it: locations are taken in pairs, the first of each pair
 * receiving +1 and the second -1.
 * locations:  array of dimension indexes
 * valueCount: number of entries in locations (expected to be even)
 * returns the new array (caller frees), or NULL if allocation fails */
int* mapI2D(int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
	int *destination = calloc(RIVSIZE, sizeof *destination);
	if(!destination){
		return NULL;
	}
	int *locations_slider = locations;
	int *locations_stop = locations_slider+valueCount;
	/*apply values +1 or -1 at an index based on locations */
	while(locations_slider<locations_stop){
		destination[*locations_slider] +=1;
		locations_slider++;
		/* an odd count leaves no partner: stop instead of reading past the input */
		if(locations_slider >= locations_stop) break;
		destination[*locations_slider] -= 1;
		locations_slider++;
	}
	return destination;
}
/* applies an implicit RIV to an existing dense array, in place: locations
 * are taken in pairs, the first of each pair receiving +1 and the second -1.
 * destination: dense array, indexed by dimension
 * locations:   array of dimension indexes
 * valueCount:  number of entries in locations (expected to be even)
 * returns destination */
int* addI2D(int* destination, int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
	int *locations_slider = locations;
	int *locations_stop = locations_slider+valueCount;
	/*apply values +1 or -1 at an index based on locations */
	while(locations_slider<locations_stop){
		destination[*locations_slider] +=1;
		locations_slider++;
		/* an odd count leaves no partner: stop instead of reading past the input */
		if(locations_slider >= locations_stop) break;
		destination[*locations_slider] -= 1;
		locations_slider++;
	}
	return destination;
}
/* converts a dense value array of RIVSIZE ints into a sparseRIV holding only
 * the non-zero entries. locations and values share one allocation, with values
 * starting count ints after locations, so freeing locations releases both.
 * metadata fields are zeroed and left for the caller to fill in.
 * if allocation fails the result has count 0 and NULL locations/values */
sparseRIV consolidateD2S(int *denseInput){
	sparseRIV output = {0};
	/* key/value pairs will be loaded to a worst-case sized temporary slot */
	int* locations = RIVKey.h_tempBlock+RIVSIZE;
	int* values = locations+RIVSIZE;
	size_t found = 0;
	for(int i=0; i<RIVSIZE; i++){
		/* act only on non-zeros */
		if(denseInput[i]){
			locations[found] = i;
			values[found] = denseInput[i];
			found++;
		}
	}
	/* a slot is opened for the locations/values pair */
	output.locations = malloc(found*2*sizeof(int));
	if(!output.locations){
		/* malloc(0) may legitimately return NULL: only report real failures */
		if(found){
			printf("memory allocation failed"); //*TODO enable fail point knowledge and security
		}
		/* report an empty vector rather than copying into a NULL block */
		return output;
	}
	output.count = found;
	/* copy locations values into opened slot */
	memcpy(output.locations, locations, found*sizeof(int));
	output.values = output.locations + found;
	/* copy values into opened slot */
	memcpy(output.values, values, found*sizeof(int));
	return output;
}
/* writes the pseudo-random dimension indexes for `word` into locations,
 * starting `count` ints in. rand() is seeded from the word, so the same
 * word yields the same indexes.
 * word:      the word to generate indexes for
 * locations: output array
 * count:     offset, in ints, at which this word's indexes are written */
void makeSparseLocations(char* word, int *locations, int count){
	int *slot = locations + count;
	srand(wordtoSeed(word));
	/* indexes are produced two at a time, as in the original unrolled loop */
	for(int written = 0; written < NONZEROS; written += 2){
		slot[written] = rand()%RIVSIZE;
		slot[written+1] = rand()%RIVSIZE;
	}
}
/* returns a heap-allocated sparseRIV with every byte zeroed
 * (NULL if the allocation fails); the caller owns the result */
sparseRIV* sparseAllocateFormatted(){
	return (sparseRIV*)calloc(1, sizeof(sparseRIV));
}
/* removes a word's own base vector from its accumulated context vector.
 * rand() is re-seeded from the vector's name, replaying the same index
 * sequence; at each pair of indexes the first is decremented and the second
 * incremented. contextSize is then reduced by one. */
void subtractThisWord(denseRIV* vector){
	srand(wordtoSeed(vector->name));
	for(int remaining = NONZEROS; remaining > 0; remaining -= 2){
		/* draw the two indexes in order so the rand() sequence is unchanged */
		int minusAt = rand()%RIVSIZE;
		vector->values[minusAt] -= 1;
		int plusAt = rand()%RIVSIZE;
		vector->values[plusAt] += 1;
	}
	/* record a context size 1 smaller */
	vector->contextSize -= 1;
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
#define RIVSIZE 200000
#define NONZEROS 2
#define CACHESIZE 1000
#include "../RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one
void fileGrind(FILE* textFile);
void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
/* reads the directory named by argv[1] and folds every .txt file found
 * beneath it into the lexicon "lexicon200-2".
 * returns 0 on success, 1 if the argument is missing, too long, or does
 * not name an existing location */
int main(int argc, char *argv[]){
	char pathString[1000];
	if(argc < 2){
		printf("give me a directory");
		return 1;
	}
	//we format the root directory, preparing to scan its contents
	int written = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
	if(written < 0 || (size_t)written >= sizeof pathString){
		printf("directory path too long");
		return 1;
	}
	//ensure that the targeted root directory exists
	struct stat st;
	if(stat(pathString, &st) == -1) {
		printf("directory doesn't seem to exist");
		return 1;
	}
	//we open the lexicon only once the input is known to be usable;
	//if it does not yet exist, it will be created
	lexOpen("lexicon200-2");
	//we will scan the directory, adding all data to our lexicon, as seen inside
	directoryGrind(pathString);
	//we close the lexicon again, ensuring all data is secured
	lexClose();
	return 0;
}
//mostly a standard recursive Dirent-walk
//recursive directory walk: every .txt file beneath rootString (which must
//end in '/') is opened and handed to fileGrind; hidden entries are skipped
void directoryGrind(char *rootString){
	char pathString[2000];
	struct dirent *files = 0;
	DIR *directory = opendir(rootString);
	if(!directory){
		printf("location not found, %s\n", rootString);
		return;
	}
	while((files=readdir(directory))){
		if(!files->d_name[0]) break;
		//skip hidden entries, "." and ".."; the loop condition handles
		//running out of entries while skipping
		if(files->d_name[0] == '.') continue;
		if(files->d_type == DT_DIR){
			int dirLength = snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
			//a path that does not fit is skipped rather than truncated
			if(dirLength >= 0 && (size_t)dirLength < sizeof pathString){
				directoryGrind(pathString);
			}
			continue;
		}
		int length = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
		if(length < 0 || (size_t)length >= sizeof pathString){
			printf("skipped: %s\n", files->d_name);
			continue;
		}
		printf("%s\n", pathString);
		//check for non-txt files; paths shorter than the suffix cannot match
		if(length < 4 || strcmp(pathString+length-4, ".txt")){
			printf("skipped: %s\n", files->d_name);
			continue;
		}
		//open a file within root directory
		FILE *input = fopen(pathString, "r");
		if(input){
			//process this file and add its data to lexicon
			fileGrind(input);
			fclose(input);
		}
	}
	closedir(directory);
}
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
//form context vector from contents of file, then add that vector to
//all lexicon entries of the words contained
void fileGrind(FILE* textFile){
	//form a context vector. "clean" indicates that it will ignore any word which
	//contains unwanted characters
	//NOTE(review): the loop below reads textFile again from its current
	//position; presumably fileToL2Clean rewinds the stream - confirm
	sparseRIV contextVector = fileToL2Clean(textFile);
	//the lexicon entry of the word currently being processed
	denseRIV* lexiconRIV;
	char word[100] = {0};
	//loop only while a word was actually read; this also processes a final
	//word that is not followed by whitespace
	while(fscanf(textFile, "%99s", word) == 1){
		//we ensure that each word is free of unwanted characters
		if(!isWordClean((char*)word)){
			continue;
		}
		//we pull the vector corresponding to each word from the lexicon
		//if it's a new word, lexPull returns a 0 vector
		lexiconRIV= lexPull(word);
		//NOTE(review): defensive; confirm whether lexPull can return NULL
		if(!lexiconRIV) continue;
		//we add the context of this file to this wordVector
		addContext(lexiconRIV, contextVector);
		//we remove the sub-vector corresponding to the word itself
		subtractThisWord(lexiconRIV);
		//we log that this word has been encountered one more time
		lexiconRIV->frequency += 1;
		//and finally we push it back to the lexicon for permanent storage
		lexPush(lexiconRIV);
	}
	free(contextVector.locations);
}
//folds one context vector into a word's lexicon vector
//lexRIV:  the word's dense vector, updated in place
//context: the sparse context vector to add
void addContext(denseRIV* lexRIV, sparseRIV context){
	//keep the running total of context sizes; not needed for the
	//arithmetic, but useful metadata for some analyses
	lexRIV->contextSize += context.contextSize;
	//sparse-into-dense addition of the context values
	addS2D(lexRIV->values, context);
}
# clean PATH...
# Walks each argument: directories are descended into; every other path is
# run through the cleaning script and then the seven RIVread passes over the
# cleanbooks/ output, which is removed afterwards.
clean(){
	for target in "$@"; do
		# processing stops at the first empty argument
		[ "$target" ] || break
		if [ -d "$target" ]; then
			clean "$target"/*
			continue
		fi
		python shittyballs.py "$target"
		for pass in 1 2 3 4 5 6 7; do
			./RIVread"$pass" cleanbooks/
		done
		rm -r cleanbooks/
		#rm "$target"
	done
}
clean ../../books/*
#import requests
import re
import string
import os
import sys
from subprocess import call
import nltk
from nltk.corpus import wordnet as wn
import pdb
from nltk.stem import PorterStemmer
# Records how `word` maps to `stem` in the lexicon/ directory and returns
# cleanString with the accepted form appended after a space.
#   cleanString: text accumulated so far
#   word:        the cleaned word as read
#   stem:        its stemmed form
#   blacklist:   stems that must not be written
# File contents written: "1" marks an accepted word, "2<stem>" redirects a
# word to its stem.
# NOTE(review): indentation was lost in this copy; whether the second return
# sits inside the `len(stem) > 2` test must be confirmed against the original.
def writeWord(cleanString, word, stem, blacklist):
# the word is its own stem: mark it accepted
if word == stem:
FILE = open("lexicon/" + word, "w")
FILE.write("1");
FILE.close();
return (cleanString + " " + word)
elif stem not in blacklist:
if len(stem) > 2:
# point the word at its stem, then mark the stem accepted
FILE = open("lexicon/" + word, "w")
FILE.write("2"+stem);
FILE.close();
FILE = open("lexicon/" + stem, "w")
FILE.write("1")
FILE.close();
return (cleanString + " " + stem)
# nothing accepted: the text is returned unchanged
return cleanString
def liFix(word):
    """Re-stem a word that ends in "li" with that suffix removed.

    Returns the stem of word[:-2] when the word ends in "li" and that stem
    is non-empty; otherwise returns the word unchanged.
    """
    if word.endswith("li"):
        restemmed = ps.stem(word[:-2])
        return restemmed if restemmed else word
    return word
def cleanWord(word):
    """Lower-case the word and drop every character outside a-z."""
    lowered = word.lower()
    #print(word)
    return re.sub('[^a-z]+', '', lowered)
def fileCheck(word):
    """Look the word up in the lexicon/ directory.

    The first character of the word's file is a code:
      1 -> the word is accepted; the word itself is returned
      2 -> the rest of the file names the stem to use; that stem is returned
      0 -> the word is rejected; -1 is returned
    Returns 0 when the file is missing, unreadable, or does not start with a
    digit, and None for any other digit.
    """
    try:
        # the with-block closes the file on every path, including failures
        with open("lexicon/{}".format(word), "r") as wordFile:
            code = int(wordFile.read(1))
            if code == 2:
                #print("file flipped to: " + word)
                return wordFile.read()
    except (OSError, ValueError):
        return 0
    if code == 1:
        #print("file accepted: " + word)
        return word
    if code == 0:
        return -1
    return None
def morphyTest(word):
    """Return WordNet's morphy() base form of the word, or 0 when it has none."""
    return wn.morphy(word) or 0
#begin mainfunction
# words (and stems) that are never written to the cleaned output
blacklist = ["a", "an", "the", "so", "as", "how",
             "i", "me", "we", "they", "you", "it", "he", "she",
             "but", "have", "had",
             # the comma after "not" matters: without it Python joins the
             # adjacent literals into the single entry "notbe"
             "for", "by", "in", "out", "as", "not",
             "be", "were", "was", "am", "are", "is",
             "mr", "mrs", "mr", "and"]
# Main script: splits the book named by sys.argv[1] into paragraphs and
# writes one cleaned text file per paragraph under cleanbooks/<book>clean/.
# NOTE(review): indentation was lost in this copy; the nesting of the
# statements below must be confirmed against the original file.
word = {}
# stemmer shared with liFix
ps = PorterStemmer()
sourceString = sys.argv[1]
# output directory is derived from the source file name, minus its extension
cutDirectories = sourceString.split('/')[-1]
pathString = cutDirectories.split('.')[0]
pathString = "cleanbooks/" + pathString + "clean/"
print(sourceString + "\n")
# make sure the working directories exist
if not os.path.exists('cleanbooks'):
os.makedirs('cleanbooks')
if not os.path.exists('lexicon'):
os.makedirs('lexicon')
if not os.path.exists(pathString):
os.makedirs(pathString)
# runs the separate blacklist.py script before processing
call(["python", "blacklist.py"])
# i numbers the output files; skip stays 1 until a start marker is seen
i=0
skip = 1
# NOTE(review): the 'U' open mode was removed in Python 3.11 - confirm the target interpreter
with open(sourceString, 'U') as fileIn:
text = fileIn.read()
# paragraphs are separated by a blank line
for paragraph in text.split(2*os.linesep):
if not paragraph:
continue
# a start marker switches output on
elif "*** START OF " in paragraph or "*END THE SMALL PRINT" in paragraph:
skip = 0
continue
# any end marker stops the whole script
elif "*** END OF " in paragraph:
fileIn.close()
sys.exit()
elif "End of Project Gutenberg's" in paragraph:
fileIn.close()
sys.exit()
elif "End of the Project Gutenberg" in paragraph:
fileIn.close()
sys.exit()
if not skip:
cleanString = ''
i += 1
fileOut = open("{}{}.txt".format(pathString, i), "w")
for line in paragraph.split(os.linesep):
for tempWord in line.split():
word=cleanWord(tempWord)
# drop empty, very short and blacklisted words
if not word:
continue
if len(word) < 3:
continue;
if word in blacklist:
continue;
# consult the lexicon first: -1 rejects the word, a string is its accepted form
temp = fileCheck(word)
if temp == -1:
continue
if temp:
cleanString = (cleanString + " " + temp);
continue
else:
# unknown word: derive a stem and record it through writeWord
morphy = morphyTest(word)
if morphy:
stem = ps.stem(morphy)
if stem:
stem = liFix(stem)
cleanString = writeWord(cleanString, word, stem, blacklist)
cleanString = cleanString + os.linesep
# keep the file only when the space-split text has more than two pieces
if len(cleanString.split(' ')) > 2:
fileOut.write(cleanString)
fileOut.close()
else:
# otherwise delete the file and reuse its number
fileOut.close()
os.remove("{}{}.txt".format(pathString, i))
i -= 1
# no start marker was ever found
if skip==1:
print(sourceString + " was badly parsed, no output");
#ifndef RIVACCESS_H_
#define RIVACCESS_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* one node of a letter tree keyed on lowercase words */
struct treenode{
/* payload stored at the node that ends a word; NULL when no word ends here */
void* data;
/* child per letter, indexed by (letter - 'a') */
struct treenode* links[26];
/* incremented on every insert that passes through or ends at this node,
 * decremented by treecut */
int downstream;
};
int treecut(struct treenode* node, char* letter);
void stemInsert(struct treenode* node, char* letter, char* data);
void RIVinsert(struct treenode* node, char* letter, void* data);
void* treeSearch(struct treenode* node, char* letter);
struct treenode* stemTreeSetup();
/* isWordClean filters out words that contain anything other than
* lowercase letters, allowing only the ' ' (space) character through
*/
int isWordClean(char* word);
/* used by wordClean */
int isLetter(char c);
/* creates a standard seed from the characters in a word, hopefully unique */
int wordtoSeed(char* word);
/* returns 1 when c is a lowercase ASCII letter or a space, 0 otherwise */
int isLetter(char c){
	int lowercase = (c >= 'a' && c <= 'z');
	int space = (c == ' ');
	return (lowercase || space) ? 1 : 0;
}
/* returns 1 when every character of word (up to its terminator, or the
 * first 99 characters) passes isLetter, 0 as soon as one does not */
int isWordClean(char* word){
	for(int offset = 0; offset < 99 && word[offset]; offset++){
		if(!isLetter(word[offset])){
			return 0;
		}
	}
	return 1;
}
/* creates a seed from the characters of a word: character i is shifted left
 * by 5*i bits and the results are summed.
 * the arithmetic is done in unsigned int so that overflow wraps instead of
 * being undefined, and the shift count is reduced modulo 32 because shifting
 * by the type width or more is undefined in C.
 * NOTE(review): masking the count mirrors what x86 shift instructions do, so
 * seeds for long words should match those produced by existing builds -
 * confirm against a stored lexicon before relying on it */
int wordtoSeed(char* word){
	unsigned int seed = 0;
	unsigned int shift = 0;
	while(*word){
		/* left-shift 5 each time *should* make seeds unique to words
		 * this means letters are taken as characters counted in base 32, which
		 * should be large enough to hold all english characters plus a few outliers
		 * */
		seed += (unsigned int)(int)(*word) << (shift & 31u);
		shift += 5;
		word++;
	}
	return (int)seed;
}
/* builds a letter tree from "stemnet2.txt", a file of whitespace separated
 * word/stem pairs, storing each stem under its word.
 * returns the root of the tree, or 0 if the file cannot be opened or the
 * root cannot be allocated */
struct treenode* stemTreeSetup(){
	FILE* netfile = fopen("stemnet2.txt", "r");
	if(!netfile){
		printf("no stemnet file");
		return 0;
	}
	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
	if(!rootNode){
		fclose(netfile);
		return 0;
	}
	char word[100];
	char stem[100];
	/* widths keep over-long tokens inside the buffers; requiring both
	 * conversions also keeps a final pair that lacks a trailing newline */
	while(fscanf(netfile, "%99s %99s", word, stem) == 2){
		stemInsert(rootNode, word, stem);
	}
	fclose(netfile);
	return rootNode;
}
/* looks a word up in the letter tree.
 * node:   root of the (sub)tree to search
 * letter: NUL terminated word, expected to be lowercase a-z
 * returns the data stored for the word, or NULL when the word is absent
 * or contains a character outside a-z */
void* treeSearch(struct treenode* node, char* letter){
	while(node && *letter){
		int slot = *letter - 'a';
		/* anything outside a-z would index outside links */
		if(slot < 0 || slot >= 26){
			return NULL;
		}
		node = node->links[slot];
		letter++;
	}
	return node ? node->data : NULL;
}
/* stores data under a word in the letter tree, creating nodes as needed.
 * every node on the path, including the last, has its downstream count
 * incremented. data already stored for the word is left untouched.
 * node:   root of the tree
 * letter: NUL terminated lowercase word
 * data:   pointer to store; the tree keeps the pointer itself */
void RIVinsert(struct treenode* node, char* letter, void* data){
	struct treenode* cursor = node;
	for(;;){
		cursor->downstream++;
		if(!*letter) break;
		int slot = *letter - 'a';
		if(!cursor->links[slot]){
			cursor->links[slot] = calloc(1, sizeof(struct treenode));
		}
		cursor = cursor->links[slot];
		letter++;
	}
	/* first insertion wins */
	if(!cursor->data){
		cursor->data = data;
	}
}
/* stores a private copy of the string data under a word in the letter tree,
 * creating nodes as needed. every node on the path, including the last, has
 * its downstream count incremented. a stem already stored for the word is
 * left untouched.
 * node:   root of the tree
 * letter: NUL terminated lowercase word
 * data:   NUL terminated string to copy */
void stemInsert(struct treenode* node, char* letter, char* data){
	struct treenode* cursor = node;
	for(;;){
		cursor->downstream++;
		if(!*letter) break;
		int slot = *letter - 'a';
		if(!cursor->links[slot]){
			cursor->links[slot] = calloc(1, sizeof(struct treenode));
		}
		cursor = cursor->links[slot];
		letter++;
	}
	/* first insertion wins */
	if(cursor->data) return;
	cursor->data = calloc(strlen(data)+1, sizeof(char));
	strcpy((char*)cursor->data, data);
}
/* removes a word's path from the letter tree.
 * node:   current node (the root on the first call)
 * letter: remaining characters of the word
 * returns 1 when node itself was freed, so the caller must clear its link
 * to it; returns 0 when node was kept.
 * NOTE(review): downstream is decremented on every node visited even when
 * the word turns out not to be in the tree - confirm callers only cut
 * words that were inserted.
 * NOTE(review): the node that ends the word is freed unconditionally, even
 * if longer words continue through it, and its data pointer is not freed
 * here - confirm this is intended. */
int treecut(struct treenode* node, char* letter){
node->downstream--;
int flag;
if(*(letter)){
/* descend along the word while a child exists for the next letter */
if(node->links[*(letter)-'a']){
flag = treecut(node->links[*(letter)-'a'], letter+1);
/* the child freed itself: drop the dangling link */
if(flag){
node->links[*(letter)-'a'] = NULL;
}
}
/* no insertions are counted through this node any more: release it */
if(!node->downstream){
free(node);
return 1;
}
}else{
/* end of the word: this is the node that ended it */
free(node);
return 1;
}
return 0;
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 5
#define CACHESIZE 0
#define THRESHOLD 0.70
#include "RIVtoolsCPUlinux.h"
void getcentroids(sparseRIV* centroids, sparseRIV* vectorSet, int centroidCount, int vectorCount);
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
/* loads a vector for every file under argv[1], builds five seed-word
 * centroid vectors, and hands both sets to getcentroids.
 * returns 0 on success, 1 when the directory argument is missing or too long */
int main(int argc, char *argv[]){
	clock_t begintotal = clock();
	int fileCount = 0;
	RIVInit();
	char rootString[2000];
	if(argc <2){
		printf("give me a directory");
		return 1;
	}
	/* bounded formatting: an over-long argument must not overrun rootString */
	int written = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
	if(written < 0 || (size_t)written >= sizeof rootString){
		printf("directory path too long\n");
		return 1;
	}
	sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
	directoryToL2s(rootString, &fileRIVs, &fileCount);
	printf("fileCount: %d\n", fileCount);
	getMagnitudes(fileRIVs, fileCount);
	clock_t beginnsquared = clock();
	/* each centroid starts as the vector of a fixed seed word */
	sparseRIV centroids[5];
	strcpy(centroids[0].name, "boobs");
	strcpy(centroids[1].name, "ass");
	strcpy(centroids[2].name, "shit");
	strcpy(centroids[3].name, "cocks");
	strcpy(centroids[4].name, "fuck");
	for(int i=0; i<5; i++){
		centroids[i] = wordtoL2(centroids[i].name);
	}
	getMagnitudes(centroids, 5);
	getcentroids(centroids, fileRIVs, 5, fileCount);
	clock_t endnsquared = clock();
	double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
	printf("nsquared time:%lf\n\n", time);
	printf("%d <", RIVKey.thing);
	clock_t endtotal = clock();
	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
	printf("total time:%lf\n\n", time_spent);
	free(fileRIVs);
	return 0;
}
/* walks rootString (which must end in '/') recursively and appends to
 * *fileRIVs one vector per regular entry, built by fileToL2Clean and named
 * after the entry's path; *fileCount is incremented for each.
 * the walk of a directory stops at the first entry that cannot be opened */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
	char pathString[2000];
	struct dirent *files = 0;
	DIR *directory = opendir(rootString);
	if(!directory){
		printf("location not found, %s\n", rootString);
		return;
	}
	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;
		if(files->d_type == DT_DIR){
			int dirLength = snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
			/* a path that does not fit is skipped rather than truncated */
			if(dirLength >= 0 && (size_t)dirLength < sizeof pathString){
				directoryToL2s(pathString, fileRIVs, fileCount);
			}
			/* the directory itself is not a text file */
			continue;
		}
		int length = snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
		if(length < 0 || (size_t)length >= sizeof pathString){
			continue;
		}
		FILE *input = fopen(pathString, "r");
		if(!input){
			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
			break;
		}
		sparseRIV *grown = realloc(*fileRIVs, ((*fileCount)+1)*sizeof(sparseRIV));
		if(!grown){
			printf("memory allocation failed\n");
			fclose(input);
			break;
		}
		*fileRIVs = grown;
		sparseRIV *slot = &grown[*fileCount];
		*slot = fileToL2Clean(input);
		/* the path can be longer than the name field: copy with a bound */
		snprintf(slot->name, sizeof slot->name, "%s", pathString);
		fclose(input);
		(*fileCount)++;
	}
	/* reached on every exit path once the directory was opened */
	closedir(directory);
}
/* Assigns every vector in vectorSet to one of the centroids and sums the
 * members of each centroid into a dense vector, printing the cosines and
 * the accumulating dense vectors to stdout.
 *
 * centroids      centroidCount vectors to compare against
 * vectorSet      vectorCount vectors to be assigned
 *
 * For each vector the centroid with the SMALLEST absolute cosine is chosen.
 * NOTE(review): nearest-centroid assignment would normally pick the largest
 * cosine; the original selection rule is preserved here - confirm intent.
 * NOTE(review): the arrays returned by cosineCompare() are not freed because
 * their ownership is not visible from this file - confirm. */
void getcentroids(sparseRIV* centroids, sparseRIV* vectorSet, int centroidCount, int vectorCount){
    if(centroidCount <= 0 || vectorCount <= 0) return;

    float** cosines = malloc(centroidCount * sizeof *cosines);
    /* one contiguous block per table; calloc zeroes every int, which the
     * original memset (missing a sizeof(int) factor) did not */
    int* memberBlock = calloc((size_t)vectorCount * centroidCount, sizeof(int));
    int* denseBlock = calloc((size_t)RIVKey.RIVsize * centroidCount, sizeof(int));
    if(!cosines || !memberBlock || !denseBlock){
        printf("getcentroids: out of memory\n");
        free(cosines);
        free(memberBlock);
        free(denseBlock);
        return;
    }

    int* centroidIndexes[centroidCount];
    int indexCounts[centroidCount];
    int* denses[centroidCount];
    for(int i=0; i<centroidCount; i++){
        cosines[i] = cosineCompare(centroids[i], vectorSet, vectorCount);
        centroidIndexes[i] = memberBlock + (size_t)i*vectorCount;
        denses[i] = denseBlock + (size_t)i*RIVKey.RIVsize;
        /* member counts were previously read uninitialised */
        indexCounts[i] = 0;
    }

    float token = 2.0;
    int counter = 0;
    for(int i=0; i<vectorCount; i++){
        token = 2.0;
        printf("\nfor vector %d:\n", i);
        for(int j = 0; j<centroidCount; j++){
            printf("centroid %d: %f", j, cosines[j][i]);
            float closeness = fabsf(cosines[j][i]);
            if(closeness < token){
                token = closeness;
                counter = j;
            }
        }
        centroidIndexes[counter][indexCounts[counter]] = i;
        indexCounts[counter] += 1;
    }

    for(int i=0; i<centroidCount; i++){
        printf("\n\nnumber %d\n", i);
        /* walk this centroid's members (the original incremented i here and
         * added vectorSet[j] rather than the assigned member) */
        for(int j=0; j<indexCounts[i]; j++){
            addS2D(denses[i], vectorSet[centroidIndexes[i][j]]);
            for(int k=0; k<RIVKey.RIVsize; k++){
                printf("%d, ", denses[i][k]);
            }
        }
    }

    free(denseBlock);
    free(memberBlock);
    free(cosines);
}
#include <stdio.h>
#define RIVSIZE 50000
#include "RIVtools.h"
char* clean(char* word);
char* stemmy(struct treenode* searchRoot, char* word);
sparseRIV line2L3(char* text, struct treenode* searchRoot);
/* fixed-size buffer for one class label: 200 bytes including the terminator */
typedef char label[200];
/* one labeled class together with the vectors gathered for it */
struct RIVclass{
/* label text identifying this class */
label name;
/* heap array of the vectors assigned to this class (grown with realloc in main) */
sparseRIV* set;
/* number of vectors currently stored in set */
int setSize;
};
LEXICON* lexicon;
int main(){
struct treenode* searchRoot = stemTreeSetup();
lexicon = lexOpen("consolidatedLexicon", "rx");
int classNo = 0;
label* classNames = calloc(1, sizeof(label));
int classCount = 0;
struct RIVclass* classes = malloc(sizeof(struct RIVclass));
FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r");
if(!textSet){
puts("no file");
return 1;
}
struct RIVclass* class;
char text[20000];
label className;
while(fscanf(textSet, "%s\t%s", text, className)){
char* labelTemp = strstr(*classNames, className);
if(!labelTemp){
/* reinitialize the classnames with a new member */
classNames = realloc(classNames, (classCount+1)*sizeof(label));
strcpy(classNames[classCount], className);
/* reinitialize the classes with a new member */
classes = realloc(classes, (classCount+1)*sizeof(struct RIVclass));
class = classes+classCount;
class->set = malloc(sizeof(sparseRIV));
strcpy(class->name, className);
class->setSize = 0;
classNo = classCount;
classCount++;
}else{
classNo = (labelTemp-*classNames);
class = classes+classNo;
}
class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
sparseRIV thing= line2L3(text, searchRoot);
class->set[class->setSize] = thing;
class->setSize++;
}
for(int i=0; i<classCount; i++){
puts(classNames[i]);
printf("%d\n\n", classes[i].setSize);
}
return 0;
}
/* Strips everything but ASCII letters from word, in place, and lowercases
 * what remains.  Returns word; the result is the empty string when the input
 * contained no letters.
 *
 * The filtering is done in place (the write position never passes the read
 * position), so inputs of any length are safe; the previous version copied
 * through a fixed 100-byte scratch buffer and overflowed on longer words. */
char* clean(char* word){
    char *out = word;
    for(const char *in = word; *in; in++){
        if(*in >= 'A' && *in <= 'Z'){
            /* fold upper case to lower case */
            *out++ = (char)(*in + ('a' - 'A'));
        }else if(*in >= 'a' && *in <= 'z'){
            *out++ = *in;
        }
    }
    *out = '\0';
    return word;
}
/* Looks word up in the stem tree rooted at searchRoot and hands back
 * whatever treeSearch() reports for it. */
char* stemmy(struct treenode* searchRoot, char* word){
    char* stem = treeSearch(searchRoot, word);
    return stem;
}
/* Builds one vector for a piece of text: every whitespace-separated word is
 * cleaned, stemmed, looked up in the global lexicon, and the vectors of the
 * words that are found are summed.  Words that clean to nothing, have no
 * stem, or are missing from the lexicon are skipped.
 *
 * Returns the sum in sparse form, as produced by consolidateD2S(). */
sparseRIV line2L3(char* text, struct treenode* searchRoot){
    denseRIV accumulate = {0};
    char* textEnd = text+strlen(text);
    char word[100];
    while(text<textEnd){
        int displacement = 0;
        /* sscanf fails once only whitespace is left; stop there instead of
         * reading an unset displacement and re-using the previous word */
        if(sscanf(text, "%99s%n", word, &displacement) != 1){
            break;
        }
        /* step over the word and the separator that follows it */
        text += displacement+1;
        if(!*clean(word)) continue;
        char* stem = stemmy(searchRoot, word);
        if(!stem) continue;
        denseRIV* wordRIV = lexPull(lexicon, stem);
        if(!wordRIV){
            //printf("%s, not in lexicon\n", stem);
            continue;
        }
        //printf("%s, succesfully pulled\n", stem);
        sparseRIV wordSparse = consolidateD2S(wordRIV->values);
        addS2D(accumulate.values, wordSparse);
        free(wordSparse.locations);
        free(wordRIV);
    }
    return consolidateD2S(accumulate.values);
}
#include <stdio.h>
#include "RIVtools.h"
#include <dirent.h>
#include <sys/types.h>
#include <time.h>
/* Reads words from wordList.txt, echoes each one, pulls its vector from the
 * lexicon and consolidates it to sparse form, then releases both.
 *
 * Returns 1 when the word list cannot be opened, 0 otherwise. */
int main(){
    lexOpen("/home/drbob/Documents/lexicon");
    FILE *wordList = fopen("wordList.txt", "r");
    if(!wordList){
        puts("could not open wordList.txt");
        lexClose();
        return 1;
    }
    char word[100];
    /* the width keeps the read inside word[]; comparing against 1 ends the
     * loop on EOF and still processes a final word with no trailing newline */
    while(fscanf(wordList, "%99s", word) == 1){
        puts(word);
        denseRIV accept = lexPull(word);
        sparseRIV other = consolidateD2S(accept.values);
        free(other.locations);
        free(accept.values);
    }
    fclose(wordList);
    lexClose();
    return 0;
}
#include <stdio.h>
#define RIVSIZE 50000
#define CACHESIZE 0
#include "RIVtools.h"
#include <dirent.h>
/* Reads every entry of the lexicon directory given as argv[1], normalizes
 * each vector with factor 10000, prints one CSV line of statistics per word,
 * and writes the normalized vectors to a new lexicon named
 * "consolidatedLexicon".
 *
 * Returns 1 when no directory is given or it cannot be opened, 0 otherwise. */
int main(int argc, char* argv[]){
    if(argc < 2){
        printf("give me a directory");
        return 1;
    }
    LEXICON* lexicon = lexOpen(argv[1], "rx");
    static denseRIV *output[60000] = {0};
    const int outputCapacity = (int)(sizeof output / sizeof output[0]);
    DIR *directory = opendir(argv[1]);
    if(!directory){
        printf("location not found, %s\n", argv[1]);
        return 1;
    }
    struct dirent *files;
    int i=0;
    int j=0;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        if(files->d_type == DT_DIR){
            /* the lexicon should not have valid sub-directories */
            continue;
        }
        if(i >= outputCapacity){
            /* output[] is a fixed table; never write past its end */
            fprintf(stderr, "more than %d entries, ignoring the rest\n", outputCapacity);
            break;
        }
        j++;
        denseRIV* intake = lexPull(lexicon, files->d_name);
        if(!intake){
            /* nothing could be pulled for this name */
            continue;
        }
        sparseRIV examine = normalize(*intake, 10000);
        strcpy(examine.name, files->d_name);
        printf("%d,%d,%lf,%d,%d\n", examine.frequency, examine.contextSize, examine.magnitude, i, j);
        output[i] = calloc(1, sizeof(denseRIV));
        if(!output[i]){
            fprintf(stderr, "out of memory\n");
            free(intake);
            free(examine.locations);
            break;
        }
        /* rebuild a dense vector from the normalized sparse one */
        addS2D(output[i]->values, examine);
        output[i]->magnitude = examine.magnitude;
        strcpy(output[i]->name, files->d_name);
        output[i]->frequency = intake->frequency;
        output[i]->contextSize = intake->contextSize;
        free(intake);
        free(examine.locations);
        i++;
    }
    closedir(directory);
    lexClose(lexicon);
    lexicon = lexOpen("consolidatedLexicon", "wx");
    for(int k=0; k<i; k++){
        lexPush(lexicon, output[k]);
    }
    lexClose(lexicon);
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtools.h"
#define THRESHOLD 0.70
/* this program identifies all near-duplicates among the documents in the
* chosen root directory, using RIV comparison */
// fills the fileRIVs array with a vector for each file in the root directory
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
int fileCount = 0;
//initializes the fileRIVs array to be reallocced by later function
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
//gather all vectors ino the fileRIVs array and count them in fileCount
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
//first calculate all magnitudes for later use
for(int i = 0; i < fileCount; i++){
fileRIVs[i].magnitude = getMagnitudeSparse(fileRIVs[i]);
}
clock_t begintotal = clock();
double cosine;
double minmag;
double maxmag;
//all cosines need a sparse-dense comparison. so we will create a
denseRIV baseDense;
for(int i = 0; i < fileCount; i++){
//0 out the denseVector, and map the next sparseVector to it
memset(&baseDense, 0, sizeof(denseRIV));
addS2D(baseDense.values, fileRIVs[i]);
//pass magnitude to the to the dense vector
baseDense.magnitude = fileRIVs[i].magnitude;
//if these two vectors are too different in size, we can know that they are not duplicates
minmag = baseDense.magnitude*.85;
maxmag = baseDense.magnitude*1.15;
for(int j = 0; j < i; j++){
//if this vector is within magnitude threshold
if(fileRIVs[j].magnitude < maxmag
&& fileRIVs[j].magnitude > minmag){
//identify the similarity of these two vectors
cosine = cosCompare(baseDense, fileRIVs[j]);
//if the two are similar enough to be flagged
if(cosine>THRESHOLD){
printf("%s\t%s\n%f\n", fileRIVs[i].name , fileRIVs[j].name, cosine);
}
}
}
}
printf("fileCount: %d", fileCount);
free(fileRIVs);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
return 0;
}
/* Mostly a standard recursive dirent walk: descends from rootString (expected
 * to end in '/') and appends one sparseRIV per readable file to *fileRIVs,
 * growing the array with realloc and counting entries in *fileCount.
 *
 * Entries whose name starts with '.' are skipped, sub-directories are
 * recursed into, and files that cannot be opened are silently ignored. */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    /* *** begin Dirent walk *** */
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(!files->d_name[0]) break;
        /* skip dot entries one at a time; the previous inner loop kept calling
         * readdir and dereferenced NULL when the directory ended on one */
        if(*(files->d_name)=='.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        int written = snprintf(pathString, sizeof pathString, "%s%s%s",
                               rootString, files->d_name, isDirectory ? "/" : "");
        if(written < 0 || (size_t)written >= sizeof pathString){
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            continue;
        }
        /* *** end dirent walk, begin meat of function *** */
        FILE *input = fopen(pathString, "r");
        if(!input) continue;
        /* grow through a temporary so the old array survives a failed realloc */
        sparseRIV *grown = realloc(*fileRIVs, ((*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("out of memory while reading %s\n", pathString);
            fclose(input);
            break;
        }
        *fileRIVs = grown;
        grown[*fileCount] = fileToL2(input);
        strcpy(grown[*fileCount].name, pathString);
        fclose(input);
        *fileCount += 1;
    }
    closedir(directory);
}
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 25000
#define CACHESIZE 0
#define NONZEROS 2
#define THRESHOLD 0.70
#include "RIVtools.h"
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
//RIVInit();
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
fileRIVs_slider++;
}
clock_t beginnsquared = clock();
float cosine;
float minmag;
float maxmag;
denseRIV baseDense;
baseDense.values = malloc(RIVSIZE*sizeof(int));
fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider;
int thing = 0;
int count = 0;
while(fileRIVs_slider<fileRIVs_stop){
comparators_slider = fileRIVs;
memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
baseDense.magnitude = (*fileRIVs_slider).magnitude;
minmag = baseDense.magnitude*.85;
maxmag = baseDense.magnitude*1.15;
while(comparators_slider < fileRIVs_slider){
if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){
cosine = cosCompare(baseDense, *comparators_slider);
count++;
if(cosine>THRESHOLD){
printf("%s\t%s\n%f\n", (*fileRIVs_slider).name , (*comparators_slider).name, cosine);
(*comparators_slider).boolean = 0;
thing++;
}
}
comparators_slider++;
}
fileRIVs_slider++;
}
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("\nnsquared time:%lf\n\n", time);
printf("\ncosines: %d \n", count);
printf("\nsims: %d \n", thing);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
free(fileRIVs);
return 0;
}
/* Recursively walks rootString (expected to end in '/') and appends one
 * sparseRIV per file to *fileRIVs, growing the array with realloc and
 * incrementing *fileCount for every vector stored.
 *
 * Entries whose name starts with '.' are skipped.  Sub-directories are
 * descended into and are not themselves treated as documents.  Vectors are
 * built with fileToL2() and named after the file's path.  The walk of the
 * current directory stops at the first file that cannot be opened. */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        int written = snprintf(pathString, sizeof pathString, "%s%s%s",
                               rootString, files->d_name, isDirectory ? "/" : "");
        if(written < 0 || (size_t)written >= sizeof pathString){
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            /* without this the directory itself was opened as a file */
            continue;
        }
        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }
        /* grow through a temporary so the old array survives a failed realloc */
        sparseRIV *grown = realloc(*fileRIVs, ((*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("out of memory while reading %s\n", pathString);
            fclose(input);
            break;
        }
        *fileRIVs = grown;
        grown[*fileCount] = fileToL2(input);
        strcpy(grown[*fileCount].name, pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 25000
#define CACHESIZE 0
#define NONZEROS 2
#define THRESHOLD 0.99
#include "RIVtools.h"
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
printf("give me a directory");
return 1;
}
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
fileRIVs_slider++;
}
clock_t beginnsquared = clock();
int thing = 0;
float cosine;
float minmag;
float maxmag;
denseRIV baseDense;
baseDense.values = malloc(RIVSIZE*sizeof(int));
fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider;
int count = 0;
while(fileRIVs_slider<fileRIVs_stop){
comparators_slider = fileRIVs;
memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
baseDense.magnitude = (*fileRIVs_slider).magnitude;
minmag = baseDense.magnitude*.85;
maxmag = baseDense.magnitude*1.15;
while(comparators_slider < fileRIVs_slider){
if((*comparators_slider).magnitude < maxmag && (*comparators_slider).magnitude > minmag && (*comparators_slider).boolean){
cosine = cosCompare(baseDense, *comparators_slider);
count++;
if(cosine>THRESHOLD){
printf("%s\t%f\n",(*comparators_slider).name, cosine);
if(remove((*comparators_slider).name)){
printf(" well shit");
}
(*comparators_slider).boolean = 0;
thing++;
}
}
comparators_slider++;
}
fileRIVs_slider++;
}
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("\nnsquared time:%lf\n\n", time);
printf("\ncosines: %d \n", count);
printf("\nsims: %d \n", thing);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
free(fileRIVs);
return 0;
}
/* Recursively walks rootString (expected to end in '/') and appends one
 * sparseRIV per file to *fileRIVs, growing the array with realloc and
 * incrementing *fileCount for every vector stored.  The path of every file
 * that is attempted is echoed to stdout.
 *
 * Entries whose name starts with '.' are skipped.  Sub-directories are
 * descended into and are not themselves treated as documents.  The walk of
 * the current directory stops at the first file that cannot be opened. */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        int written = snprintf(pathString, sizeof pathString, "%s%s%s",
                               rootString, files->d_name, isDirectory ? "/" : "");
        if(written < 0 || (size_t)written >= sizeof pathString){
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            /* without this the directory itself was opened as a file */
            continue;
        }
        FILE *input = fopen(pathString, "r");
        puts(pathString);
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }
        /* grow through a temporary so the old array survives a failed realloc */
        sparseRIV *grown = realloc(*fileRIVs, ((*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("out of memory while reading %s\n", pathString);
            fclose(input);
            break;
        }
        *fileRIVs = grown;
        grown[*fileCount] = fileToL2(input);
        strcpy(grown[*fileCount].name, pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 25000
#define CACHESIZE 0
#define NONZEROS 2
#define THRESHOLD 0.70
#include "RIVtools.h"
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
/* Aborts the program when a CUDA runtime call failed: prints the runtime's
 * error string with the file and line supplied by the caller, then exits
 * with EXIT_FAILURE.  Does nothing when err is cudaSuccess. */
static void HandleError(cudaError_t err, const char *file, int line){
    if(err == cudaSuccess){
        return;
    }
    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
/* Kernel: scatters one sparse vector into a dense one, one thread per
 * non-zero.  d_sparseSlot holds count locations followed by count values;
 * thread id writes value[id] to d_denseSlot[location[id]]. */
__global__ void d_mapS2D(int *d_denseSlot, int *d_sparseSlot, int count){
    int id = blockIdx.x*blockDim.x + threadIdx.x;
    /* threads past the end have no element to map.  The previous test,
     * !id<count, parsed as (!id)<count and returned for every thread
     * whenever count was 2 or more. */
    if(id >= count) return;
    int *target = d_sparseSlot+id;
    d_denseSlot[*target] = *(target+count);
}
/* Kernel: one thread per sparse vector.  Thread id reads its record from
 * d_sparseBlock (a count, then count locations, then count values) and
 * writes the dot product of that vector with d_denseBase to output[id].
 * No division by magnitudes is performed here.
 *
 * NOTE(review): records are addressed at a fixed stride of RIVSIZE ints,
 * while cosineMatrix() in this file uploads them packed back-to-back -
 * the two layouts need to be reconciled. */
__global__ void cosines(int* d_denseBase, int* d_sparseBlock, int* output, int RIVcount){
    int id =blockIdx.x*blockDim.x + threadIdx.x;
    if(id>=RIVcount) return;
    int *record = d_sparseBlock+RIVSIZE*id;
    int count = record[0];
    /* locations follow the count inside the record; the previous code took
     * the address of the local copy of count, which points nowhere useful */
    int *locations = record+1;
    int *values = locations+count;
    int dot = 0;
    for(int k = count-1; k >= 0; k--){
        dot += values[k]*d_denseBase[locations[k]];
    }
    output[id] = dot;
}
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
float** cosineMatrix(sparseRIV* RIVs, int RIVcount);
/* Loads one vector per file under argv[1], computes their magnitudes, hands
 * the set to cosineMatrix(), and prints timing and counters.
 *
 * Returns 1 when no directory is given or its path is too long, 0 otherwise. */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();
    int fileCount = 0;
    RIVInit();
    char rootString[2000];
    if(argc <2){
        printf("give me a directory");
        return 1;
    }
    /* bounded copy: a long argument must not overflow rootString */
    int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
    if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
        printf("directory path too long");
        return 1;
    }
    sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
    directoryToL2s(rootString, &fileRIVs, &fileCount);
    printf("fileCount: %d\n", fileCount);
    for(int i=0; i<fileCount; i++){
        fileRIVs[i].magnitude = getMagnitudeSparse(fileRIVs[i]);
    }
    clock_t beginnsquared = clock();
    /* no comparisons are counted on the host; kept for the report below */
    int count = 0;
    cosineMatrix(fileRIVs, fileCount);
    clock_t endnsquared = clock();
    double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
    printf("\nnsquared time:%lf\n\n", time);
    printf("\ncosines: %d \n", count);
    printf("\nsims: %d \n", RIVKey.thing);
    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);
    free(fileRIVs);
    return 0;
}
/* Recursively walks rootString (expected to end in '/') and appends one
 * sparseRIV per file to *fileRIVs, growing the array with realloc and
 * incrementing *fileCount for every vector stored.
 *
 * Entries whose name starts with '.' are skipped.  Sub-directories are
 * descended into and are not themselves treated as documents.  Vectors are
 * built with fileToL2() and named after the file's path.  The walk of the
 * current directory stops at the first file that cannot be opened. */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        int isDirectory = (files->d_type == DT_DIR);
        int written = snprintf(pathString, sizeof pathString, "%s%s%s",
                               rootString, files->d_name, isDirectory ? "/" : "");
        if(written < 0 || (size_t)written >= sizeof pathString){
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }
        if(isDirectory){
            directoryToL2s(pathString, fileRIVs, fileCount);
            /* without this the directory itself was opened as a file */
            continue;
        }
        FILE *input = fopen(pathString, "r");
        if(!input){
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }
        /* grow through a temporary so the old array survives a failed realloc */
        sparseRIV *grown = (sparseRIV*)realloc(*fileRIVs, ((*fileCount)+1)*sizeof(sparseRIV));
        if(!grown){
            printf("out of memory while reading %s\n", pathString);
            fclose(input);
            break;
        }
        *fileRIVs = grown;
        grown[*fileCount] = fileToL2(input);
        strcpy(grown[*fileCount].name, pathString);
        fclose(input);
        (*fileCount)++;
    }
    closedir(directory);
}
/* Uploads RIVcount sparse vectors to a single device allocation laid out as
 *   [ output slots | one dense vector | packed sparse records ]
 * where each sparse record is its count followed by 2*count ints copied
 * from the vector's locations pointer.
 *
 * No kernel is launched and no result is read back yet, so the function
 * releases the device block again and returns NULL.  It also returns NULL,
 * without touching the device, when the data would not fit in the block. */
float** cosineMatrix(sparseRIV* RIVs, int RIVcount){
    const size_t blockInts = 100000000;
    /* size_t arithmetic: RIVcount*RIVcount overflows int for large sets */
    size_t outputInts = (size_t)RIVcount*(size_t)RIVcount/2;
    size_t neededInts = outputInts + RIVSIZE;
    for(int i=0; i<RIVcount; i++){
        neededInts += 1 + 2*(size_t)RIVs[i].count;
    }
    if(neededInts > blockInts){
        printf("cosineMatrix: %d vectors do not fit in the device block\n", RIVcount);
        return NULL;
    }
    int *d_massiveBlock;
    HANDLE_ERROR (cudaMalloc((void**)&d_massiveBlock, blockInts*sizeof(int)));
    int *d_outputSlot = d_massiveBlock;
    int *d_denseSlot = d_outputSlot+outputInts;
    int *d_sparseSection = d_denseSlot+RIVSIZE;
    int *d_sparse_slider = d_sparseSection;
    for(int i=0; i<RIVcount; i++){
        HANDLE_ERROR (cudaMemcpy (d_sparse_slider++, &RIVs[i].count, sizeof(int), cudaMemcpyHostToDevice));
        HANDLE_ERROR (cudaMemcpy (d_sparse_slider, RIVs[i].locations, RIVs[i].count*2*sizeof(int), cudaMemcpyHostToDevice));
        d_sparse_slider+=RIVs[i].count*2;
    }
    /* nothing keeps a handle to the block after this point, so free it */
    HANDLE_ERROR (cudaFree(d_massiveBlock));
    /* the function previously fell off the end without returning a value */
    return NULL;
}
#include <stdio.h>
#define RIVSIZE 50000
#define CACHESIZE 0
#include "RIVtools.h"
#include <dirent.h>
/* Prints one CSV line (frequency, contextSize, magnitude, running index,
 * word) for every entry of the lexicon directory given as argv[1].
 *
 * Returns 1 when no directory is given or it cannot be opened, 0 otherwise. */
int main(int argc, char* argv[]){
    if(argc < 2){
        printf("give me a directory");
        return 1;
    }
    LEXICON* lexicon = lexOpen(argv[1], "r");
    DIR *directory = opendir(argv[1]);
    if(!directory){
        printf("location not found, %s\n", argv[1]);
        return 1;
    }
    struct dirent *files;
    int i=0;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        if(files->d_type == DT_DIR){
            /* the lexicon should not have valid sub-directories */
            continue;
        }
        denseRIV* intake = lexPull(lexicon, files->d_name);
        if(!intake){
            /* nothing could be pulled for this name */
            continue;
        }
        printf("%d,%d,%lf,%d,%s\n", intake->frequency, intake->contextSize, intake->magnitude, i, files->d_name);
        free(intake);
        i++;
    }
    closedir(directory);
    lexClose(lexicon);
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#define RIVSIZE 25000
#define CACHESIZE 0
#define NONZEROS 2
#define THRESHOLD 0.90
#include "RIVtools.h"
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
int main(int argc, char *argv[]){
clock_t begintotal = clock();
int fileCount = 0;
lexOpen("/home/drbob/Documents/lexicon2-25");
sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
char rootString[2000];
if(argc <2){
printf("give me a directory");
return 1;
}
int thing = 0;
strcpy(rootString, argv[1]);
strcat(rootString, "/");
directoryToL2s(rootString, &fileRIVs, &fileCount);
printf("fileCount: %d\n", fileCount);
sparseRIV* fileRIVs_slider = fileRIVs;
sparseRIV* fileRIVs_stop = fileRIVs+fileCount;
while(fileRIVs_slider <fileRIVs_stop){
(*fileRIVs_slider).magnitude = getMagnitudeSparse(*fileRIVs_slider);
fileRIVs_slider++;
}
clock_t beginnsquared = clock();
float cosine;
denseRIV baseDense;
baseDense.values = malloc(RIVSIZE*sizeof(int));
fileRIVs_slider = fileRIVs;
sparseRIV* comparators_slider;
int count = 0;
while(fileRIVs_slider<fileRIVs_stop){
if(!fileRIVs_slider->boolean){
fileRIVs_slider++;
continue;
}
if(fileRIVs_slider->magnitude == 0) continue;
comparators_slider = fileRIVs;
memset(baseDense.values, 0, RIVSIZE*sizeof(int));
baseDense.values = addS2D(baseDense.values, *fileRIVs_slider);
baseDense.magnitude =(*fileRIVs_slider).magnitude;
while(comparators_slider < fileRIVs_stop){
if(!(comparators_slider->boolean&&strcmp(comparators_slider->name, fileRIVs_slider->name))){
comparators_slider++;
continue;
}
if(comparators_slider->magnitude==0) continue;
cosine = cosCompare(baseDense, *comparators_slider);
count++;
if(cosine>THRESHOLD){
printf("%s\t%s\n%f\n", fileRIVs_slider->name , comparators_slider->name, cosine);
comparators_slider->boolean = 0;
thing++;
}
comparators_slider++;
}
fileRIVs_slider++;
}
clock_t endnsquared = clock();
double time = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
printf("\nnsquared time:%lf\n\n", time);
printf("\ncosines: %d \n", count);
printf("\nsims: %d \n", thing);
clock_t endtotal = clock();
double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
printf("total time:%lf\n\n", time_spent);
free(fileRIVs);
lexClose();
return 0;
}
/* For every entry of rootString whose name does not start with '.', pulls
 * the lexicon vector of that name and, when its frequency exceeds 2000,
 * appends the vector normalized with factor 500 to *fileRIVs (grown with
 * realloc) and increments *fileCount.  The stored vector is named after the
 * directory entry.  This version does not recurse into sub-directories. */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(*(files->d_name) == '.') continue;
        denseRIV temp = lexPull(files->d_name);
        if(*temp.frequency >2000){
            /* grow through a temporary so the old array survives a failed realloc */
            sparseRIV *grown = realloc(*fileRIVs, ((*fileCount)+1)*sizeof(sparseRIV));
            if(!grown){
                printf("out of memory while reading %s\n", files->d_name);
                free(temp.values);
                break;
            }
            *fileRIVs = grown;
            grown[*fileCount] = normalize(temp, 500);
            strcpy(grown[*fileCount].name, files->d_name);
            (*fileCount)++;
        }
        free(temp.values);
    }
    /* the directory handle was previously never released */
    closedir(directory);
}
/* This diff is collapsed. Click to expand it. (placeholder left by the web diff viewer; the contents of this file were not captured) */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
#include <string.h>
//#define HASHCACHE
#define RIVSIZE 50000
#define NONZEROS 4
#define CACHESIZE 27000
#include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
//to all words found in these files. this is used to create a lexicon, or add to an existing one
void fileGrind(FILE* textFile);
void addContext(denseRIV* lexRIV, sparseRIV context);
void directoryGrind(char *rootString);
void lineGrind(char* textLine);
LEXICON* lp;
//int COUNTY = 0;
/* Builds or extends the lexicon named "lexicon" from the text files in the
 * directory given as argv[1].
 *
 * Returns 1 when no directory is given, its path is too long, or it does
 * not exist; 0 otherwise. */
int main(int argc, char *argv[]){
    char pathString[1000];
    if(argc < 2){
        printf("give me a directory");
        return 1;
    }
    //we format the root directory, preparing to scan its contents
    //(bounded: a long argument must not overflow pathString)
    int written = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
    if(written < 0 || (size_t)written >= sizeof pathString){
        printf("directory path too long");
        return 1;
    }
    //we open the lexicon, if it does not yet exist, it will be created
    lp = lexOpen("lexicon", "rw");
    //ensure that the targeted root directory exists
    struct stat st;
    if(stat(pathString, &st) == -1) {
        printf("directory doesn't seem to exist");
        return 1;
    }
    //we will scan the directory, adding all data to our lexicon, as seen inside
    directoryGrind(pathString);
    //we close the lexicon again, ensuring all data is secured
    lexClose(lp);
    return 0;
}
//mostly a standard Dirent-walk (not recursive: sub-directories are skipped)
//every entry of rootString whose name ends in ".txt" is opened and handed
//to fileGrind; everything else is reported as skipped
void directoryGrind(char *rootString){
    /* *** begin Dirent walk *** */
    char pathString[2000];
    DIR *directory = opendir(rootString);
    if(!directory){
        printf("location not found, %s\n", rootString);
        return;
    }
    struct dirent *files;
    while((files=readdir(directory))){
        if(files->d_type == DT_DIR){
            continue;
        }
        /* *** end dirent walk, begin meat of function *** */
        //check for non-txt files; the length test keeps the suffix pointer
        //inside the name for entries shorter than ".txt"
        size_t nameLength = strlen(files->d_name);
        if(nameLength < 4 || strcmp(files->d_name+nameLength-4, ".txt")){
            printf("skipped: %s\n", files->d_name);
            continue;
        }
        //bounded path build; an over-long path is skipped, not truncated
        int written = snprintf(pathString, sizeof pathString, "%s/%s", rootString, files->d_name);
        if(written < 0 || (size_t)written >= sizeof pathString){
            printf("skipped: %s\n", files->d_name);
            continue;
        }
        //open a file within root directory
        FILE *input = fopen(pathString, "r");
        if(input){
            //process this file and add its data to lexicon
            fileGrind(input);
            fclose(input);
        }
    }
    closedir(directory);
}
/* Feeds textFile to lineGrind one line at a time; each line is treated as
 * one context.  Lines longer than the buffer are delivered in pieces, as
 * fgets returns them. */
void fileGrind(FILE* textFile){
    char textLine[10000];
    // included python script separates paragraphs into lines
    // fgets returning NULL is the only end condition: breaking on feof()
    // after a successful read discarded a last line with no trailing newline
    while(fgets(textLine, sizeof textLine, textFile)){
        if(!*textLine) continue;
        //process each line as a context set
        lineGrind(textLine);
    }
}
//form context vector from contents of text, then add that vector to
//all lexicon entries of the words contained
//lines whose context holds one word or fewer are ignored
void lineGrind(char* textLine){
    //extract a context vector from this text set
    sparseRIV contextVector = textToL2(textLine);
    if(contextVector.contextSize <= 1){
        free(contextVector.locations);
        return;
    }
    //identify stopping point in line read
    size_t lineLength = strlen(textLine);
    char* textEnd = textLine + (lineLength ? lineLength-1 : 0);
    char word[100] = {0};
    while(textLine<textEnd){
        int displacement = 0;
        //sscanf fails once only whitespace remains; stopping here keeps the
        //previous word from being counted a second time
        if(sscanf(textLine, "%99s%n", word, &displacement) != 1){
            break;
        }
        //we ensure that each word exists, and is free of unwanted characters
        textLine += displacement+1;
        if(!(*word))continue;
        if(!isWordClean((char*)word)){
            continue;
        }
        //we pull the vector corresponding to each word from the lexicon
        //if it's a new word, lexPull returns a 0 vector
        denseRIV* lexiconRIV = lexPull(lp, word);
        if(!lexiconRIV){
            //nothing to update if the pull did not produce a vector
            continue;
        }
        //we add the context of this file to this wordVector
        addContext(lexiconRIV, contextVector);
        //we remove the sub-vector corresponding to the word itself
        subtractThisWord(lexiconRIV);
        //we log that this word has been encountered one more time
        lexiconRIV->frequency += 1;
        //and finally we push it back to the lexicon for permanent storage
        lexPush(lp, lexiconRIV);
    }
    //free the heap allocated context vector data
    free(contextVector.locations);
}
/* Folds one context into a word's lexicon vector: the sparse context is
 * added onto the dense values, and the word's running contextSize grows by
 * the size of the context that was added. */
void addContext(denseRIV* lexRIV, sparseRIV context){
    //sparse-onto-dense addition of the context
    int* wordValues = lexRIV->values;
    addS2D(wordValues, context);
    //keep a tally of how much context has been folded in; not strictly
    //needed, but useful metadata for some analyses
    lexRIV->contextSize = lexRIV->contextSize + context.contextSize;
}
#ifndef RIVTOOLS_H_
#define RIVTOOLS_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLower.h"
#include "RIVaccessories.h"
#include "RIVlexicon.h"
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each
* word contained
*/
sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only words
 * containing lowercase letters and the '_' symbol.
 * This is important if you will be lexPush-ing those words later.
 */
sparseRIV fileToL2Clean(FILE *data);
/* like fileToL2 but takes a block of text */
sparseRIV textToL2(char *text);
/*cosine determines the "similarity" between two RIVs. */
double cosCompare(denseRIV baseRIV, sparseRIV comparator);
/*used for analysis of lexicon vectors (not simply accumulation)
* to avoid overflow of even a 64 bit integer, vectors must be normalized
* this is an experimental approximation of true normal, which should yield
* some extra data about the nature of this word's context
*/
sparseRIV normalize(denseRIV input, int factor);
/* calculates the magnitude of a sparseVector */ //TODO contain integer overflow in square process
double getMagnitudeSparse(sparseRIV input);
/* same for denseVector */
double getMagnitudeDense(denseRIV *input); //TODO consolidate these into one function
/* Builds the sparse vector for a block of text: the sum of the base vectors
 * of every whitespace-separated word in it.  The returned vector's
 * contextSize is the number of words that were read. */
sparseRIV textToL2(char *text){
    int wordCount = 0;
    char word[100] = {0};
    int denseTemp[RIVSIZE] = {0};
    /* locations (implicit RIV) are temp stored in temp block, and moved
     * to permanent home in consolidation */
    int *locations = RIVKey.h_tempBlock;
    int locationCount = 0;
    /* the scan stops one character short of the end of the text; an empty
     * string yields an empty range instead of a pointer before the buffer */
    size_t textLength = strlen(text);
    char* textEnd = text + (textLength ? textLength-1 : 0);
    while(text<textEnd){
        int displacement = 0;
        /* sscanf fails once only whitespace remains; stop there so the
         * previous word is not added a second time */
        if(sscanf(text, "%99s%n", word, &displacement) != 1){
            break;
        }
        text += displacement+1;
        /* if this word would overflow the locations block, map it to the denseVector */
        if((locationCount+NONZEROS)>TEMPSIZE){
            addI2D(denseTemp, locations, locationCount);
            locationCount = 0;
        }
        /* add word's L1 RIV to the accumulating implicit RIV */
        makeSparseLocations(word, locations, locationCount);
        locationCount+= NONZEROS;
        wordCount++;
    }
    /* map remaining locations to the denseTemp */
    addI2D(denseTemp, locations, locationCount);
    sparseRIV output = consolidateD2S(denseTemp);
    /* contextSize stores the number of words read */
    output.contextSize = wordCount;
    return output;
}
/* Builds the sparse vector for a file: the sum of the base vectors of every
 * whitespace-separated word in it.  The returned vector's contextSize is the
 * number of words read.  The file is rewound to its start before returning. */
sparseRIV fileToL2(FILE *data){
    char word[100] = {0};
    /* locations (implicit RIV) are temporarily stored in temp block,
     * and moved to permanent home in consolidation */
    int *locations = RIVKey.h_tempBlock;
    int locationCount = 0;
    int denseTemp[RIVSIZE] = {0};
    int wordCount = 0;
    /* a successful read returns 1; anything else (EOF or a read error) ends
     * the loop.  The earlier feof() test dropped the final word of a file
     * with no trailing whitespace and could not detect read errors. */
    while(fscanf(data, "%99s", word) == 1){
        /* if this word would overflow the locations block, map it to the denseVector */
        if((locationCount+NONZEROS)>TEMPSIZE){
            addI2D(denseTemp, locations, locationCount);
            locationCount = 0;
        }
        /* add word's L1 RIV to the accumulating implicit RIV */
        makeSparseLocations(word, locations, locationCount);
        locationCount+= NONZEROS;
        wordCount++;
    }
    /* map remaining locations to the denseTemp */
    addI2D(denseTemp, locations, locationCount);
    sparseRIV output = consolidateD2S(denseTemp);
    /* contextSize records the number of words in this file */
    output.contextSize = wordCount;
    fseek(data, 0, SEEK_SET);
    return output;
}
/* Same as fileToL2, but words rejected by isWordClean() are left out.  The
 * returned vector's contextSize is the number of words that were kept.
 * The file is rewound to its start before returning. */
sparseRIV fileToL2Clean(FILE *data){
    int denseTemp[RIVSIZE] = {0};
    char word[100] = {0};
    int *locations = RIVKey.h_tempBlock;
    unsigned int wordCount = 0;
    int locationCount = 0;
    /* a successful read returns 1; anything else (EOF or a read error) ends
     * the loop.  The earlier feof() test dropped the final word of a file
     * with no trailing whitespace and could not detect read errors. */
    while(fscanf(data, "%99s", word) == 1){
        /* if the word is not clean, skip it */
        if(!isWordClean((char*)word)){
            continue;
        }
        /* if this word would overflow the locations block, map it to the denseVector */
        if((locationCount+NONZEROS)>TEMPSIZE){
            addI2D(denseTemp, locations, locationCount);
            locationCount = 0;
        }
        /* add word's L1 RIV to the accumulating implicit RIV */
        makeSparseLocations(word, locations, locationCount);
        locationCount+= NONZEROS;
        wordCount++;
    }
    /* map remaining locations to the denseTemp */
    addI2D(denseTemp, locations, locationCount);
    sparseRIV output = consolidateD2S(denseTemp);
    /* contextSize records the number of words kept.  locationCount cannot be
     * used for this: it is reset whenever the temp block is flushed. */
    output.contextSize = wordCount;
    fseek(data, 0, SEEK_SET);
    return output;
}
/* cosCompare returns the cosine between a dense RIV and a sparse RIV:
 * their dot product divided by the product of their magnitudes.
 * both magnitude fields must already be filled in by the caller */
double cosCompare(denseRIV baseRIV, sparseRIV comparator){
	long long int dot = 0;
	int* locations_stop = comparator.locations+comparator.count;
	int* locations_slider = comparator.locations;
	int* values_slider = comparator.values;
	while(locations_slider<locations_stop){
		/* sparse entries are matched to the dense vector by index.
		 * widen before multiplying so the product of two large ints
		 * cannot overflow int arithmetic */
		dot += (long long int)*values_slider * baseRIV.values[*locations_slider];
		locations_slider++;
		values_slider++;
	}
	/* dot divided by product of magnitudes */
	return dot/(baseRIV.magnitude*comparator.magnitude);
}
/* getMagnitudeSparse returns the euclidean length of a sparse RIV:
 * the square root of the sum of the squares of its stored values */
double getMagnitudeSparse(sparseRIV input){
	size_t temp = 0;
	int *values = input.values;
	int *values_stop = values+input.count;
	while(values<values_stop){
		/* square in a wide type: (*values)*(*values) in int overflows
		 * once a value exceeds 46340 in absolute size */
		long long int value = *values;
		temp += (size_t)(value*value);
		values++;
	}
	/* we take the root of that sum */
	return sqrt(temp);
}
/* getMagnitudeDense returns the euclidean length of a dense RIV,
 * summing the squares of all RIVSIZE values and taking the root */
double getMagnitudeDense(denseRIV *input){
	size_t temp = 0;
	int *values = input->values;
	int *values_stop = values+RIVSIZE;
	while(values<values_stop){
		/* zero entries add nothing, so no test is needed. the square is
		 * taken in a wide type so large values cannot overflow int */
		long long int value = *values;
		temp += (size_t)(value*value);
		values++;
	}
	return sqrt(temp);
}
/* normalize scales every value of a dense RIV by factor/contextSize,
 * rounds, discards entries that round to 0, and returns the result as a
 * freshly allocated sparse RIV. name, contextSize and frequency are
 * carried over from the input; magnitude is recomputed */
sparseRIV normalize(denseRIV input, int factor){
	/* scaling ratio applied to each element */
	const float scale = (float)factor/(input.contextSize);
	/* staging area inside the temp block, past the first RIVSIZE ints */
	int *scratchLocations = RIVKey.h_tempBlock+RIVSIZE;
	int *scratchValues = scratchLocations+RIVSIZE;
	int kept = 0;
	int index = 0;
	while(index<RIVSIZE){
		if(input.values[index]){
			int scaled = round(input.values[index]*scale);
			scratchLocations[kept] = index;
			scratchValues[kept] = scaled;
			/* only advance when the scaled value survived rounding */
			if(scaled){
				kept++;
			}
		}
		index++;
	}
	sparseRIV output;
	output.count = kept;
	/* one allocation holds locations followed directly by values */
	output.locations = (int*) malloc(kept*2*sizeof(int));
	output.values = output.locations+kept;
	memcpy(output.locations, scratchLocations, kept*sizeof(int));
	memcpy(output.values, scratchValues, kept*sizeof(int));
	/* metadata */
	strcpy(output.name, input.name);
	output.magnitude = getMagnitudeSparse(output);
	output.contextSize = input.contextSize;
	output.frequency = input.frequency;
	return output;
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLower.h"
#include "RIVaccessories.h"
/* lexPush writes a denseRIV to a file for permanent storage
 * (or to the cache, when CACHESIZE is non-zero) */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
 * and creates a denseRIV with those attributes.
 * if the file does not exist, it creates a 0 vector with the name of word
 */
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads whitespace-delimited words
 * (at most 99 characters each) and returns a sparse RIV built from the
 * accumulated locations of every word contained
 */
sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only the words
 * accepted by isWordClean (per the original note: lowercase letters and
 * the '_' symbol).
 * this is important if you will be lexPush-ing those words later
 */
sparseRIV fileToL2Clean(FILE *data);
/* fileToL2direct is an experiment in simplifying the process. it's slow */
sparseRIV fileToL2direct(FILE *data);
/* cosCompare determines the "similarity" between two RIVs: the cosine
 * of a dense base vector against a sparse comparator */
float cosCompare(denseRIV baseRIV, sparseRIV comparator);
/* currently unused */
sparseRIV wordtoL2(char* word);
/* converts an implicit RIV (a set of unvalued locations) into a formal
 * sparse RIV. this picks between the direct and indirect consolidation
 * routines based on RIVKey.I2SThreshold and launches that function */
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
/* like fileToL2 but takes a block of text */
sparseRIV text2L2(char *text);
/* text2L2 reads whitespace-delimited words from a 0-terminated block of
 * text and returns the sparse RIV consolidated from their locations.
 * output.frequency is set to the number of words read and
 * output.boolean to 1 */
sparseRIV text2L2(char *text){
	unsigned int blockSize;
	char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	int displacement = 0;
	/* sscanf returns 1 only when a word was read. at the end of the text
	 * it returns EOF, which is non-zero and leaves displacement
	 * untouched, so the result must be compared against 1 */
	while(sscanf(text, "%99s%n", word, &displacement) == 1){
		/* %n is the offset just past the word. %s skips leading
		 * whitespace on the next pass, so no extra +1 is taken: it would
		 * step over the terminating 0 after the final word */
		text += displacement;
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			locations = RIVKey.h_tempBlock;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations((unsigned char*)word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this text, until frequency
	 * is needed to hold some more useful data point */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2 reads whitespace-delimited words from data and returns the
 * sparse RIV consolidated from their locations. output.frequency is set
 * to the number of words read and output.boolean to 1 */
sparseRIV fileToL2(FILE *data){
	unsigned int blockSize;
	unsigned char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	/* continue only while a word was really read: this keeps the final
	 * word of a file without trailing whitespace and ends the loop on a
	 * read error instead of looping forever */
	while(fscanf(data, "%99s", (char*)word) == 1){
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			locations = RIVKey.h_tempBlock;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of words in this file */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* fileToL2Clean works like fileToL2 but skips every word rejected by
 * isWordClean. output.frequency is set to the number of kept words and
 * output.boolean to 1 */
sparseRIV fileToL2Clean(FILE *data){
	unsigned char word[100] = {0};
	int *locations = RIVKey.h_tempBlock;
	unsigned int blockSize;
	int locationCount = 0;
	/* continue only while a word was really read (see fscanf's return
	 * value): keeps the last word of a file without trailing whitespace
	 * and stops on read errors */
	while(fscanf(data, "%99s", (char*)word) == 1){
		/* if the word is not clean, skip it */
		if(!isWordClean((char*)word)){
			continue;
		}
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			locations = RIVKey.h_tempBlock;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency records the number of kept words in this file */
	output.frequency = locationCount/NONZEROS;
	output.boolean = 1;
	return output;
}
/* consolidateI2S turns an implicit RIV (a list of locations) into a
 * sparse RIV, dispatching on the number of locations supplied */
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
	/* at or above the threshold: use the routine optimized for large datasets */
	if(valueCount>=RIVKey.I2SThreshold){
		return consolidateI2SIndirect(implicit, valueCount);
	}
	/* below the threshold the direct method is faster, though it scales
	 * poorly on large datasets */
	return consolidateI2SDirect(implicit, valueCount);
}
/* aggregateWord2D seeds rand() from the word and then applies NONZEROS
 * (+1, -1) pairs to pseudo-random positions of destination.values */
void aggregateWord2D(denseRIV destination, char* word){
	srand(wordtoSeed((unsigned char*)word));
	int pairsLeft = NONZEROS;
	while(pairsLeft > 0){
		/* the first draw of each pair is incremented, the second decremented */
		int plusIndex = rand()%RIVSIZE;
		destination.values[plusIndex] += 1;
		int minusIndex = rand()%RIVSIZE;
		destination.values[minusIndex] -= 1;
		pairsLeft--;
	}
}
/* cosCompare returns the cosine between a dense RIV and a sparse RIV:
 * their dot product divided by the product of their magnitudes.
 * both magnitude fields must already be filled in by the caller */
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
	/* accumulate in a wide type: a sum of int products can exceed int */
	long long int dot = 0;
	int n = comparator.count;
	while(n){
		n--;
		/* we calculate the dot-product to derive the cosine
		 * comparing sparse to dense by index.
		 * (a per-element debug printf used to sit here; it flooded
		 * stdout on every comparison and has been removed) */
		dot += (long long int)comparator.values[n] * baseRIV.values[comparator.locations[n]];
	}
	/* dot divided by product of magnitudes */
	float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
	return cosine;
}
/* getMagnitudeSparse returns the euclidean length of a sparse RIV: the
 * root of the sum of the squares of its stored values.
 * input is passed by value, so the caller must store the returned
 * magnitude itself */
float getMagnitudeSparse(sparseRIV input){
	unsigned long long int temp = 0;
	int *values = input.values;
	int *values_stop = values+input.count;
	while(values<values_stop){
		/* square in a wide type so large values cannot overflow int */
		long long int value = *values;
		temp += (unsigned long long int)(value*value);
		values++;
	}
	return sqrt(temp);
}
/* lexPull returns the denseRIV stored for word.
 * with a cache (CACHESIZE > 0) a matching cache slot is returned first.
 * otherwise the entry is read from the file "lexicon/<word>"; if that
 * file cannot be opened, a freshly allocated vector from denseAllocate
 * is returned. the result's name is set to word */
denseRIV lexPull(char* word){
#if CACHESIZE > 0
	/* if there is a cache, first check if the word is cached */
	srand(wordtoSeed((unsigned char*)word));
	int hash = rand()%CACHESIZE;
	if(!strcmp(word, RIVKey.RIVCache[hash].name)){
		/* if word is cached, pull from cache and exit */
		return RIVKey.RIVCache[hash];
	}
#endif /* CACHESIZE > 0 */
	/* if not, attempt to pull the word data from lexicon file */
	denseRIV output;
	char pathString[200];
	FILE *lexWord = NULL;
	/* bounded formatting: a word too long for pathString is treated as
	 * "no such file" instead of overrunning the buffer */
	int written = snprintf(pathString, sizeof pathString, "lexicon/%s", word);
	if(written > 0 && (size_t)written < sizeof pathString){
		lexWord = fopen(pathString, "rb");
	}
	/* if this lexicon file already exists */
	if(lexWord){
		/* pull data from file */
		output = fLexPull(lexWord);
		fclose(lexWord);
	}else{
		/* if file does not exist, return a 0 vector (word is new to the lexicon) */
		//#TODO enable NO-NEW features to protect mature lexicons?
		output = denseAllocate();
	}
	/* NOTE(review): output.name's capacity is not visible here; confirm
	 * word always fits */
	strcpy(output.name, word);
	return output;
}
/* lexPush stores a denseRIV.
 * without a cache (CACHESIZE == 0) it is written with fLexPush and 0 is
 * returned. with a cache, the RIV either takes a cache slot or is
 * written to file; the fLexPush result is returned only when an evicted
 * cache entry is written out, otherwise 0 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
	/* no cache configured: write straight to file */
	fLexPush(RIVout);
	return 0;
#else /* CACHESIZE != 0 */
	/* a RIV flagged as cached is either still in the cache (already up
	 * to date) or was evicted and pushed at that time: nothing to do */
	if(RIVout.cached){
		return 0;
	}
	srand(wordtoSeed((unsigned char*)RIVout.name));
	int slot = rand()%CACHESIZE;
	/* empty slot: keep the RIV in the cache instead of writing a file */
	if(!RIVKey.RIVCache[slot].cached){
		RIVKey.RIVCache[slot] = RIVout;
		RIVKey.RIVCache[slot].cached = 1;
		return 0;
	}
	/* occupied slot, and this RIV is more frequent than its occupant:
	 * write the occupant out, then take over the slot */
	if(*(RIVout.frequency) > *(RIVKey.RIVCache[slot].frequency) ){
		int diag = fLexPush(RIVKey.RIVCache[slot]);
		RIVKey.RIVCache[slot] = RIVout;
		RIVKey.RIVCache[slot].cached = 1;
		return diag;
	}
	/* occupant stays: write this RIV to file */
	fLexPush(RIVout);
	return 0;
#endif /* CACHESIZE == 0 */
}
/* fileToL2direct reads whitespace-delimited words from data, adds each
 * word straight into a dense vector held in the temp block, and returns
 * the consolidated sparse RIV. output.frequency is set to the number of
 * words read and output.boolean to 1 */
sparseRIV fileToL2direct(FILE *data){
	unsigned char word[100] = {0};
	denseRIV denseTemp;
	// a temporary dense RIV is stored in the tempBlock
	denseTemp.values = RIVKey.h_tempBlock;
	memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
	int count = 0;
	// loop only while a word was really read, so the last word of a file
	// without trailing whitespace is kept and count matches the words used
	while(fscanf(data, "%99s", (char*)word) == 1){
		count++;
		// add word's L1 RIV to the accumulating dense RIV
		aggregateWord2D(denseTemp, (char*)word);
	}
	sparseRIV output = consolidateD2S(denseTemp.values);
	// frequency records the number of words in this file
	output.frequency = count;
	output.boolean = 1;
	return output;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLowerMorphic.h"
#include "RIVaccessories.h"
/* lexPush writes a denseRIV to a file for permanent storage
 * (or to the cache, when CACHESIZE is non-zero) */
int lexPush(denseRIV RIVout);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
 * and creates a denseRIV with those attributes.
 * if the file does not exist, it creates a 0 vector with the name of word
 */
denseRIV lexPull(char* word);
/* fileToL2 takes an input file, reads whitespace-delimited words
 * (at most 99 characters each) and returns a sparse RIV built from the
 * accumulated locations of every word contained
 */
sparseRIV fileToL2(FILE *input);
/* fileToL2Clean operates the same as fileToL2 but keeps only the words
 * accepted by isWordClean (per the original note: lowercase letters and
 * the '_' symbol).
 * this is important if you will be lexPush-ing those words later
 */
sparseRIV fileToL2Clean(FILE *data);
/* fileToL2direct accumulates words directly into a dense vector held in
 * the temp block before consolidating */
sparseRIV fileToL2direct(FILE *data);
/* cosCompare determines the "similarity" between two RIVs: the cosine
 * of a dense base vector against a sparse comparator */
float cosCompare(denseRIV baseRIV, sparseRIV comparator);
/* NOTE(review): no definition of wordtoL2 is visible in this section */
sparseRIV wordtoL2(char* word);
/* converts an implicit RIV (a list of locations) into a sparse RIV,
 * choosing the direct or indirect routine by RIVKey.I2SThreshold */
sparseRIV consolidateI2S(int *implicit, size_t valueCount);
/* like fileToL2 but takes a 0-terminated block of text */
sparseRIV text2L2(char *text);
/* text2L2 reads whitespace-delimited words from a 0-terminated block of
 * text and returns the sparse RIV consolidated from their locations.
 * the word count is stored through output.frequency and output.boolean
 * is set to 1 */
sparseRIV text2L2(char *text){
	unsigned int blockSize;
	char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	unsigned int locationCount = 0;
	int displacement = 0;
	/* sscanf returns 1 only when a word was read. at the end of the text
	 * it returns EOF, which is non-zero and leaves displacement
	 * untouched, so the result must be compared against 1 */
	while(sscanf(text, "%99s%n", word, &displacement) == 1){
		/* %n is the offset just past the word. %s skips leading
		 * whitespace on the next pass, so no extra +1 is taken: it would
		 * step over the terminating 0 after the final word */
		text += displacement;
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			locations = RIVKey.h_tempBlock;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations((unsigned char*)word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency is a pointer here. fileToL2 allocates it before writing
	 * the count, so the same is done here rather than writing through
	 * whatever consolidateI2S left in the field */
	output.frequency = malloc(1*sizeof(int));
	if(output.frequency){
		/* frequency records the number of words in this text */
		*(output.frequency) = locationCount/NONZEROS;
	}
	output.boolean = 1;
	return output;
}
/* fileToL2 reads whitespace-delimited words from data and returns the
 * sparse RIV consolidated from their locations. a one-int block is
 * allocated for output.frequency and filled with the number of words
 * read; output.boolean is set to 1 */
sparseRIV fileToL2(FILE *data){
	unsigned int blockSize;
	unsigned char word[100] = {0};
	/* locations (implicit RIV) are temp stored in temp block, and moved
	 * to permanent home in consolidation */
	int *locations = RIVKey.h_tempBlock;
	int locationCount = 0;
	/* continue only while a word was really read: this keeps the final
	 * word of a file without trailing whitespace and ends the loop on a
	 * read error instead of looping forever */
	while(fscanf(data, "%99s", (char*)word) == 1){
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			RIVKey.h_tempBlock = (int*) realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			locations = RIVKey.h_tempBlock;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	output.frequency = malloc(1*sizeof(int));
	/* only write the count if the allocation succeeded */
	if(output.frequency){
		/* frequency records the number of words in this file */
		*(output.frequency) = locationCount/NONZEROS;
	}
	output.boolean = 1;
	return output;
}
/* fileToL2Clean works like fileToL2 but skips every word rejected by
 * isWordClean. the count of kept words is stored through
 * output.frequency and output.boolean is set to 1 */
sparseRIV fileToL2Clean(FILE *data){
	unsigned char word[100] = {0};
	int *locations = RIVKey.h_tempBlock;
	unsigned int blockSize;
	int locationCount = 0;
	/* continue only while a word was really read: keeps the last word of
	 * a file without trailing whitespace and stops on read errors */
	while(fscanf(data, "%99s", (char*)word) == 1){
		/* if the word is not clean, skip it */
		if(!isWordClean((char*)word)){
			continue;
		}
		blockSize = locationCount+NONZEROS;
		/* if this word would overflow the locations block, grow it */
		if(blockSize>RIVKey.tempSize){
			RIVKey.h_tempBlock = (int*)realloc(RIVKey.h_tempBlock, blockSize*sizeof(int));
			locations = RIVKey.h_tempBlock;
			/* record the size that was actually allocated */
			RIVKey.tempSize = blockSize;
		}
		/* add word's L1 RIV to the accumulating implicit RIV */
		makeSparseLocations(word, locations, locationCount);
		locationCount+= NONZEROS;
	}
	sparseRIV output = consolidateI2S(locations, locationCount);
	/* frequency is a pointer here. fileToL2 allocates it before writing
	 * the count, so the same is done here rather than writing through
	 * whatever consolidateI2S left in the field */
	output.frequency = malloc(1*sizeof(int));
	if(output.frequency){
		/* frequency records the number of kept words in this file */
		*(output.frequency) = locationCount/NONZEROS;
	}
	output.boolean = 1;
	return output;
}
/* consolidateI2S turns an implicit RIV (a list of locations) into a
 * sparse RIV, dispatching on the number of locations supplied */
sparseRIV consolidateI2S(int *implicit, size_t valueCount){
	/* up to and including the threshold: direct consolidation */
	if(valueCount<=RIVKey.I2SThreshold){
		return consolidateI2SDirect(implicit, valueCount);
	}
	/* above the threshold: indirect consolidation */
	return consolidateI2SIndirect(implicit, valueCount);
}
/* aggregateWord2D seeds rand() from the word and then applies NONZEROS
 * (+1, -1) pairs to pseudo-random positions of destination.values */
void aggregateWord2D(denseRIV destination, char* word){
	srand(wordtoSeed((unsigned char*)word));
	for(int remaining=NONZEROS; remaining>0; remaining--){
		/* the first draw of each pair is incremented, the second decremented */
		int up = rand()%RIVSIZE;
		destination.values[up] += 1;
		int down = rand()%RIVSIZE;
		destination.values[down] -= 1;
	}
}
/* cosCompare returns the cosine between a dense RIV and a sparse RIV:
 * their dot product divided by the product of their magnitudes.
 * both magnitude fields must already be filled in by the caller */
float cosCompare(denseRIV baseRIV, sparseRIV comparator){
	/* accumulate in a wide type: a sum of int products can exceed int */
	long long int dot = 0;
	int *values = comparator.values;
	int *locations = comparator.locations;
	int *locations_Stop = locations+comparator.count;
	while(locations<locations_Stop){
		/* we calculate the dot-product to derive the cosine, matching
		 * each sparse entry to the dense vector by index */
		dot += (long long int)(*values)*(*(baseRIV.values+(*locations)));
		locations++;
		values++;
	}
	float cosine = dot/(baseRIV.magnitude*comparator.magnitude);
	return cosine;
}
/* getMagnitudeSparse returns the euclidean length of a RIV: the root of
 * the sum of the squares of its values. when the SPARSE flag is set,
 * input.count values are read; otherwise RIVSIZE values are read.
 * input is passed by value, so the caller must store the returned
 * magnitude itself */
float getMagnitudeSparse(RIV input){
	size_t count;
	if(input.flags & SPARSE){
		count = input.count;
	}else{
		count = RIVSIZE;
	}
	unsigned long long int temp = 0;
	int *values = input.values;
	int *values_stop = values+count;
	while(values<values_stop){
		/* square in a wide type so large values cannot overflow int */
		long long int value = *values;
		temp += (unsigned long long int)(value*value);
		values++;
	}
	float magnitude = sqrt(temp);
	return magnitude;
}
/* lexPull returns the denseRIV stored for word.
 * with a cache (CACHESIZE > 0) a matching cache slot is returned first.
 * otherwise the entry is read from the file "lexicon/<word>"; if that
 * file cannot be opened, a freshly allocated vector from denseAllocate
 * is returned. the result's name is set to word */
denseRIV lexPull(char* word){
#if CACHESIZE > 0
	/* if there is a cache, first check if the word is cached */
	srand(wordtoSeed((unsigned char*)word));
	int hash = rand()%RIVKey.cacheSize;
	if(!strcmp(word, RIVKey.RIVCache[hash].name)){
		/* if word is cached, pull from cache and exit */
		return RIVKey.RIVCache[hash];
	}
#endif /* CACHESIZE > 0 */
	denseRIV output;
	char pathString[200];
	FILE *lexWord = NULL;
	/* bounded formatting: a word too long for pathString is treated as
	 * "no such file" instead of overrunning the buffer */
	int written = snprintf(pathString, sizeof pathString, "lexicon/%s", word);
	if(written > 0 && (size_t)written < sizeof pathString){
		lexWord = fopen(pathString, "rb");
	}
	/* if this lexicon file already exists */
	if(lexWord){
		/* pull data from file */
		output = fLexPull(lexWord);
		fclose(lexWord);
	}else{
		/* if file does not exist, return a 0 vector */
		output = denseAllocate();
	}
	/* NOTE(review): output.name's capacity is not visible here; confirm
	 * word always fits */
	strcpy(output.name, word);
	return output;
}
/* lexPush stores a denseRIV.
 * without a cache (CACHESIZE == 0) it is written with fLexPush and 0 is
 * returned. with a cache, the RIV either takes a cache slot or is
 * written to file; the fLexPush result is returned only when an evicted
 * cache entry is written out, otherwise 0 */
int lexPush(denseRIV RIVout){
#if CACHESIZE == 0
	/* no cache configured: write straight to file */
	fLexPush(RIVout);
	return 0;
#else /* CACHESIZE != 0 */
	/* a RIV flagged as cached is either still in the cache (already up
	 * to date) or was evicted and pushed at that time: nothing to do */
	if(RIVout.cached){
		return 0;
	}
	srand(wordtoSeed((unsigned char*)RIVout.name));
	int slot = rand()%RIVKey.cacheSize;
	/* empty slot: keep the RIV in the cache instead of writing a file */
	if(!RIVKey.RIVCache[slot].cached){
		RIVKey.RIVCache[slot] = RIVout;
		RIVKey.RIVCache[slot].cached = 1;
		return 0;
	}
	/* occupied slot, and this RIV is more frequent than its occupant:
	 * write the occupant out, then take over the slot */
	if(*(RIVout.frequency) > *(RIVKey.RIVCache[slot].frequency) ){
		int diag = fLexPush(RIVKey.RIVCache[slot]);
		RIVKey.RIVCache[slot] = RIVout;
		RIVKey.RIVCache[slot].cached = 1;
		return diag;
	}
	/* occupant stays: write this RIV to file */
	fLexPush(RIVout);
	return 0;
#endif /* CACHESIZE == 0 */
}
/* fileToL2direct reads whitespace-delimited words from data, adds each
 * word straight into a dense vector held in the temp block, and returns
 * the consolidated sparse RIV. the word count is written through
 * output.frequency and output.boolean is set to 1 */
sparseRIV fileToL2direct(FILE *data){
	unsigned char word[100] = {0};
	denseRIV denseTemp;
	// a temporary dense RIV is stored in the tempBlock
	denseTemp.values = RIVKey.h_tempBlock;
	memset(RIVKey.h_tempBlock, 0, RIVSIZE*sizeof(int));
	int count = 0;
	// loop only while a word was really read, so the last word of a file
	// without trailing whitespace is kept and count matches the words used
	while(fscanf(data, "%99s", (char*)word) == 1){
		count++;
		// add word's L1 RIV to the accumulating dense RIV
		aggregateWord2D(denseTemp, (char*)word);
	}
	sparseRIV output = consolidateD2S(denseTemp.values);
	// frequency records the number of words in this file
	// NOTE(review): this writes through output.frequency as left by
	// consolidateD2S; fileToL2 allocates the field first - confirm that
	// consolidateD2S provides valid storage here
	*(output.frequency) = count;
	output.boolean = 1;
	return output;
}
/* writes a sparse RIV into a zeroed dense destination.
 * NOTE(review): the definition that follows takes a denseRIV destination,
 * which does not match this int* prototype */
int* mapS2D(int* destination, sparseRIV input); //#TODO fix int*/denseRIV confusion
/* applies alternating +1 / -1 to destination at the listed locations */
int* addI2D(int* destination, int* locations, size_t seedCount);
/* cosine determines the "similarity" between two RIVs. */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount);
/* magnitudes will be used later in cosine comparison */
void getMagnitudes(sparseRIV *inputs, size_t RIVCount);
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);//unused except in text2L2
sparseRIV text2L2(unsigned char *text);//unused
/* like cosineCompare, but without the magnitude-range and boolean filter */
float* cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount);
/* NOTE(review): the comment below describes lexPush, but no lexPush
 * declaration follows it in this part of the file.
 * lexPush writes a denseRIV to a file of the same name, under the directory "lexicon"
 * it is up to the programmer to ensure that the name of the RIV is a valid filename
 * although it will of course attempt to create the file if it does not exist
 */
/* mapS2D zeroes destination.values (RIVKey.RIVsize ints) and writes each
 * value of the sparse input at its location. the frequency is copied
 * through destination.frequency. returns destination.values.
 * NOTE(review): destination is passed by value, so the magnitude written
 * below (and the name, if it is an array member) does not reach the
 * caller's struct */
int* mapS2D(denseRIV destination, sparseRIV input){// #TODO fix destination parameter vs calloc of destination
	// make sure our destination is a 0 vector
	memset(destination.values, 0, RIVKey.RIVsize*sizeof(int));
	int *locations_slider = input.locations;
	int *values_slider = input.values;
	int *locations_stop = locations_slider+input.count;
	// apply values at an index based on locations
	while(locations_slider<locations_stop){
		// index the values array: the struct itself cannot be subscripted
		destination.values[*locations_slider] = *values_slider;
		locations_slider++;
		values_slider++;
	}
	strcpy(destination.name, input.name);
	*(destination.frequency) = input.frequency;
	destination.magnitude = input.magnitude;
	// the declared return type is int*, so hand back the dense values
	return destination.values;
}
/* addI2D applies an implicit RIV to a dense vector: locations are taken
 * in pairs, the first of each pair is incremented in destination and
 * the second decremented. an unpaired trailing location (odd
 * valueCount) is incremented only. returns destination */
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
	size_t i = 0;
	/* whole (+1, -1) pairs */
	while(i+1 < valueCount){
		destination[locations[i]] += 1;
		destination[locations[i+1]] -= 1;
		i += 2;
	}
	/* odd count: do not read past the end of locations for a partner */
	if(i < valueCount){
		destination[locations[i]] += 1;
	}
	return destination;
}
/* cosineCompare compares baseRIV against each of multiplierCount sparse
 * RIVs and returns a calloc'd array of multiplierCount floats.
 * a multiplier is only compared when its boolean is non-zero and its
 * magnitude lies strictly between 85% and 115% of baseRIV's magnitude.
 * cosines are written to the array in the order they are computed, so
 * entries are NOT aligned with the multipliers array when any multiplier
 * is skipped; unused trailing entries stay 0.
 * the COSINEACTION macro is expanded after each computed cosine.
 * NOTE(review): COSINEACTION is defined elsewhere and may refer to the
 * local names used here (cosine, multipliers, baseRIV) - confirm before
 * renaming anything. the result of calloc is not checked */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount){
float *results = calloc(multiplierCount, sizeof(float));
float* results_slider = results;
/* the base vector is expanded to dense form in the shared temp block */
int *baseDenseRIV = RIVKey.h_tempBlock;
memset(RIVKey.h_tempBlock, 0, RIVKey.RIVsize*sizeof(int));
addS2D(baseDenseRIV, baseRIV);
float cosine;
sparseRIV *multipliersStop = multipliers+multiplierCount;
/* if two vectors are too different in size, we can ignore the risk of similarity */
float minsize = baseRIV.magnitude * .85;
float maxsize = baseRIV.magnitude * 1.15;
/* NOTE(review): dot is an int and could overflow for large values */
int dot = 0;
int *values;
int *locations;
int *locations_Stop;
/* check the baseRIV against each multiplier */
while(multipliers<multipliersStop){
/* skip a pair if the multiplier has already been culled, or if
 * the size difference is too great */
if(((*multipliers).boolean)
&& (((*multipliers).magnitude < maxsize)
&& ((*multipliers).magnitude > minsize))){
dot = 0;
values = (*multipliers).values;
locations = (*multipliers).locations;
locations_Stop = locations+(*multipliers).count;
while(locations<locations_Stop){
/* we calculate the dot-product to derive the cosine */
dot += (*values)*(*(baseDenseRIV+(*locations)));
locations++;
values++;
}
/* magnitudes had better already be calculated at this point */
cosine = dot/((baseRIV.magnitude)*((*multipliers).magnitude));
*results_slider = cosine;
results_slider++;
/* perform the action defined by the COSINEACTION macro */
COSINEACTION;
}
multipliers++;
}
return results;
}
/* cosineCompareUnbound compares baseRIV against every one of
 * multiplierCount sparse RIVs (no magnitude or boolean filter) and
 * returns a calloc'd array holding one cosine per multiplier, in order.
 * the COSINEACTION macro is expanded after each computed cosine.
 * local names are kept as they were: COSINEACTION is defined elsewhere
 * and may refer to them */
float* cosineCompareUnbound(sparseRIV baseRIV, sparseRIV *multipliers, size_t multiplierCount){
	float *results = calloc(multiplierCount, sizeof(float));
	float* results_slider = results;
	/* the base vector is expanded to dense form in the shared temp block */
	int *baseDenseRIV = RIVKey.h_tempBlock;
	memset(RIVKey.h_tempBlock, 0, RIVKey.RIVsize*sizeof(int));
	addS2D(baseDenseRIV, baseRIV);
	float cosine;
	sparseRIV *multipliersStop = multipliers+multiplierCount;
	int dot = 0;
	int *values;
	int *locations;
	int *locations_Stop;
	/* check the baseRIV against each multiplier */
	while(multipliers<multipliersStop){
		dot = 0;
		values = (*multipliers).values;
		locations = (*multipliers).locations;
		locations_Stop = locations+(*multipliers).count;
		while(locations<locations_Stop){
			/* we calculate the dot-product to derive the cosine */
			dot += (*values)*(*(baseDenseRIV+(*locations)));
			locations++;
			values++;
		}
		/* magnitudes had better already be calculated at this point */
		cosine = dot/((baseRIV.magnitude)*((*multipliers).magnitude));
		*results_slider = cosine;
		results_slider++;
		/* perform the action defined by the COSINEACTION macro */
		COSINEACTION;
		/* advance inside the loop: with the increment after the closing
		 * brace the loop never ended and results was overrun */
		multipliers++;
	}
	return results;
}
/* getMagnitudes fills in the magnitude field of each of the RIVCount
 * sparse RIVs: the root of the sum of the squares of its values */
void getMagnitudes(sparseRIV *inputs, size_t RIVCount){
	/* size_t index matches RIVCount's type (no signed/unsigned compare) */
	for(size_t i=0; i<RIVCount; i++){
		unsigned long long int temp = 0;
		int *values = inputs[i].values;
		int *values_stop = values+inputs[i].count;
		while(values<values_stop){
			/* square in a wide type so large values cannot overflow int */
			long long int value = *values;
			temp += (unsigned long long int)(value*value);
			values++;
		}
		float magnitude = sqrt(temp);
		inputs[i].magnitude = magnitude;
	}
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# clean runs ./RIVread once for every path passed as an argument.
# Iterating over "$@" (rather than testing "$1" and shifting) means an
# empty argument no longer ends the loop early.
clean(){
	for book in "$@"; do
		./RIVread "$book"
	done
}
clean ../bookCleaner/cleanbooks/*
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtoolsCPUlinux.h"
void directoryToL2s(char *rootString);
/* entry point: initialise the RIV library, then walk the "lexicon/"
 * directory and report on every entry found there */
int main(){
	char rootString[] = "lexicon/";
	RIVInit();
	directoryToL2s(rootString);
	return 0;
}
/* directoryToL2s walks the directory named by rootString (which must end
 * in '/'). entries whose name starts with '.' are skipped; directories
 * are descended into recursively; for every other entry the RIV is
 * pulled with lexPull, consolidated to sparse form, and the count of its
 * non-zero entries is printed as "saturation".
 * the walk of this directory stops at the first entry that cannot be
 * opened */
void directoryToL2s(char *rootString){
	sparseRIV fileRIV;
	char pathString[2000];
	DIR *directory;
	struct dirent *files = 0;
	if(!(directory = opendir(rootString))){
		printf("location not found, %s\n", rootString);
		return;
	}
	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;
		if(files->d_type == DT_DIR){
			/* bounded path construction instead of strcpy/strcat */
			snprintf(pathString, sizeof pathString, "%s%s/", rootString, files->d_name);
			directoryToL2s(pathString);
			/* a directory has been handled by the recursion: do not
			 * fall through and treat it as a lexicon file */
			continue;
		}
		snprintf(pathString, sizeof pathString, "%s%s", rootString, files->d_name);
		FILE *input = fopen(pathString, "r");
		if(!input){
			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
			break;
		}
		/* NOTE(review): lexPull is handed the full path, not a bare
		 * word - confirm this matches how lexPull builds its file name */
		denseRIV temp = lexPull(pathString);
		fileRIV = consolidateD2S(temp.values);
		/* NOTE(review): fileRIV.name's capacity is not visible here */
		strcpy(fileRIV.name, pathString);
		float count = fileRIV.count;
		printf("%s, saturation: %f\n", fileRIV.name, count);
		fclose(input);
		free(temp.values);
		/* NOTE(review): fileRIV.locations is not released here (the
		 * original free was commented out) - confirm ownership */
	}
	/* release the directory handle on every exit path past opendir */
	closedir(directory);
}
This source diff could not be displayed because it is too large. You can view the blob instead.
import numpy as np
import matplotlib.pyplot as plt
import math
def fit(x):
    """Reference curve evaluated at frequency x; magnitudes read from the
    data file are compared against this value below."""
    return 1*(1067+94500000/x)


x = 7
# relative tolerance around the fit curve (named so the builtin `range`
# is not shadowed)
tolerance = 0.15
while(1):
    # input() returns a string on Python 3; convert it so the arithmetic
    # on the tolerance below works
    tolerance = float(input("gimmerange"))
    frequencies = []
    mags = []
    fitline = []
    i = 0
    # the with-block closes the file again on every pass of the loop
    with open("graphdata.txt", "r") as data:
        for line in data:
            # comma-separated record: field 1 is the frequency, field 2
            # the magnitude, field 4 the name
            segments = line.split(",")
            freq = int(segments[1])
            mag = float(segments[2])
            name = segments[4]
            core = fit(freq)
            fitmax = core*(1+tolerance)
            fitmin = core*(1-tolerance)
            # keep only the points inside the tolerance band
            if(mag > fitmax or mag < fitmin):
                continue
            frequencies.append(freq)
            mags.append(mag)
            fitline.append(core)
            print("{} {} {}".format(name, freq, mag))
            i += 1
    #plt.scatter(frequencies, mags)
    plt.plot(frequencies, fitline, 'r^', frequencies, mags, 'bs')
    plt.show()
    x += 1
#include <stdio.h>
#include <stdlib.h>
#include <strsafe.h>
#define SEEDMASK 25214903917
/* RIVData holds the library-wide settings and scratch buffers, kept in
 * the single global instance RIVKeyData and filled in by setKeyData */
struct RIVData{
/* length of a dense vector, in ints */
int RIVsize;
/* number of seeds/locations generated per word (forced even by setKeyData) */
int nonZeros;
/* nonZeros masks, each SEEDMASK shifted right by 5 bits per index */
long long int *masks;
/* scratch block of blockSize ints allocated by setKeyData */
int *h_tempBlock;
/* the remaining pointers are not used by the code visible in this
 * section (their allocation in setKeyData is commented out)
 * - TODO confirm purpose */
int *h_stagingBlock;
int *h_staging_slider;
int *h_staging_stop;
int *h_displacements;
int *d_OpenSlot;
int *d_SlotEnd;
float *d_magnitudes;
/* counter: zeroed by setKeyData, incremented by each FileToL2 call */
int thing;
}RIVKeyData;
/* sparseRIV stores only the non-zero entries of a vector as two
 * parallel arrays of count elements */
typedef struct{
/* label printed by cosineCompare when a match is reported */
char name[100];
/* values[i] is the value held at index locations[i] of the dense vector */
int *values;
int *locations;
/* number of entries in values and locations */
int count;
/* not written by the code visible in this section - TODO confirm meaning */
int frequency;
/* root of the sum of squared values, filled in by getMagnitudes */
float magnitude;
/* set to 1 by FileToL2; cleared by cosineCompare once the RIV has
 * matched at or above the threshold, after which it is skipped */
int boolean;
}sparseRIV;
/* reads words from a file and builds the file's sparse RIV */
sparseRIV FileToL2(FILE *data);
/* copies the non-zero entries of a dense vector into destination */
void consolidateD2S(sparseRIV *destination, int *denseInput);
/* stores the global settings and allocates the mask table and temp block */
void setKeyData(int RIVsize, int nonZeros, int blockSize);
/* zeroes destination and writes the sparse input's values into it */
int* mapS2D(int * destination, sparseRIV input);
/* converts seeds to vector locations, written into the temp block */
int* makeSparseLocations(int *seeds, int seedCount);
/* appends nonZeros seeds derived from word to *seeds, advancing *seedCount */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount);
/* cosine of baseRIV against each multiplier; matches at or above
 * threshold are printed and culled */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold);
/* fills in the magnitude field of each input */
void getMagnitudes(sparseRIV *inputs, int RIVCount);
/* builds a newly allocated dense vector from alternating +1/-1 locations */
int *mapI2D(int *locations, int seedCount);
/* like FileToL2, but reads space-separated words from a string */
sparseRIV text2L2(unsigned char *text);
/* copies the next space-delimited word of *string into word and advances *string */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);
/* FileToL2 reads whitespace-delimited words from data, turns each into
 * seeds (stored in the temp block), converts the seeds to locations,
 * and returns the consolidated sparse RIV with boolean set to 1.
 * only locations, values, count and boolean of the result are set here.
 * RIVKeyData.thing is incremented once per call.
 * NOTE(review): seeds are appended to h_tempBlock without a bounds
 * check; the block size passed to setKeyData must cover the input */
sparseRIV FileToL2(FILE *data){
	/* stack buffer: nothing to free, and the scan width below keeps a
	 * long token from overrunning it */
	unsigned char word[2000] = {0};
	int *seeds = RIVKeyData.h_tempBlock;
	int seedCount = 0;
	/* loop only while a word was really read: this keeps the last word
	 * of a file without trailing whitespace and stops on read errors */
	while(fscanf(data, "%1999s", (char*)word) == 1){
		makeSeeds(word, &seeds, &seedCount);
	}
	int *locations = makeSparseLocations(seeds, seedCount);
	int *L2dense = mapI2D(locations, seedCount);
	sparseRIV output;
	consolidateD2S( &output, L2dense);
	free(L2dense);
	output.boolean = 1;
	RIVKeyData.thing++;
	return output;
}
/* cosineCompare computes the cosine of baseRIV against each of the
 * multiplierCount multipliers and returns a newly allocated array with
 * one float per multiplier, aligned with the multipliers array.
 * multipliers whose boolean is 0 are skipped and their slot is 0.
 * when a cosine reaches threshold, both names and the cosine are
 * printed and the multiplier's boolean is cleared.
 * returns NULL if the result array cannot be allocated */
float* cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCount, float threshold){
	/* the base vector is expanded to dense form in the shared temp block */
	int *baseDenseRIV = RIVKeyData.h_tempBlock;
	mapS2D(baseDenseRIV, baseRIV);
	/* calloc so that slots of skipped multipliers hold 0, not garbage */
	float *outputs = (float*)calloc(multiplierCount, sizeof(float));
	if(!outputs){
		return NULL;
	}
	float *output_slider = outputs;
	sparseRIV *multipliersStop = multipliers+multiplierCount;
	while(multipliers<multipliersStop){
		if((*multipliers).boolean){
			/* wide accumulator: a sum of int products can exceed int */
			long long int dot = 0;
			int *values = (*multipliers).values;
			int *locations = (*multipliers).locations;
			int *locations_Stop = locations+(*multipliers).count;
			while(locations<locations_Stop){
				dot += (long long int)(*values)*(*(baseDenseRIV+(*locations)));
				locations++;
				values++;
			}
			*output_slider= dot/((baseRIV.magnitude)*((*multipliers).magnitude));
			if(*output_slider>=threshold){
				printf("%s\t%s\n%f\n", (*multipliers).name, baseRIV.name, *output_slider);
				/* cull the matched multiplier from later comparisons */
				(*multipliers).boolean = 0;
			}
		}
		multipliers++;
		output_slider++;
	}
	return outputs;
}
/* getMagnitudes fills in the magnitude field of each of the RIVCount
 * sparse RIVs: the root of the sum of the squares of its values */
void getMagnitudes(sparseRIV *inputs, int RIVCount){
	for(int i=0; i<RIVCount; i++){
		/* wide accumulator: the sum of squares overflows an int quickly */
		unsigned long long int temp = 0;
		int *values = inputs[i].values;
		int *values_stop = values+inputs[i].count;
		while(values<values_stop){
			long long int value = *values;
			temp += (unsigned long long int)(value*value);
			values++;
		}
		float magnitude = sqrt(temp);
		inputs[i].magnitude = magnitude;
	}
}
/* mapS2D clears destination (RIVKeyData.RIVsize ints) and then writes
 * every value of the sparse input at its location. returns destination */
int* mapS2D(int* destination, sparseRIV input){
	memset(destination, 0, RIVKeyData.RIVsize*sizeof(int));
	for(int entry = 0; entry < input.count; entry++){
		destination[input.locations[entry]] = input.values[entry];
	}
	return destination;
}
/* mapI2D allocates a zeroed dense vector of RIVKeyData.RIVsize ints and
 * applies the listed locations to it with alternating sign: +1 for the
 * first, -1 for the second, and so on. the caller owns the result */
int* mapI2D(int *locations, int valueCount){
	int *dense = (int*)calloc(RIVKeyData.RIVsize,sizeof(int));
	for(int n = 0; n < valueCount; n++){
		/* even positions add 1, odd positions subtract 1 */
		dense[locations[n]] += (n % 2 == 0) ? 1 : -1;
	}
	return dense;
}
/* consolidateD2S copies the non-zero entries of a dense vector
 * (RIVKeyData.RIVsize ints) into destination's locations and values
 * arrays and sets destination->count. both arrays are newly allocated
 * and owned by the caller. if allocation fails, both pointers are NULL
 * and count is 0 */
void consolidateD2S(sparseRIV *destination, int *denseInput){
	int count = 0;
	int *locations = (int*) malloc(RIVKeyData.RIVsize*sizeof(int));
	int *values = (int*) malloc(RIVKeyData.RIVsize*sizeof(int));
	if(!locations || !values){
		free(locations);
		free(values);
		destination->locations = NULL;
		destination->values = NULL;
		destination->count = 0;
		return;
	}
	for(int i=0; i<RIVKeyData.RIVsize; i++){
		if(denseInput[i]){
			locations[count] = i;
			values[count] = denseInput[i];
			count++;
		}
	}
	/* shrink to fit. never request 0 bytes (the result of a zero-size
	 * realloc is not portable), and keep the original block if the
	 * shrink fails */
	size_t keep = (count > 0) ? (size_t)count : 1;
	int *shrunk = (int*) realloc(locations, keep*sizeof(int));
	if(shrunk){
		locations = shrunk;
	}
	shrunk = (int*) realloc(values, keep*sizeof(int));
	if(shrunk){
		values = shrunk;
	}
	destination->locations = locations;
	destination->values = values;
	destination->count = count;
}
/* setKeyData initialises the global RIVKeyData: it stores the vector
 * size, rounds an odd nonZeros up by one (printing a notice), builds the
 * mask table, allocates the temp block of blockSize ints, and zeroes
 * the thing counter */
void setKeyData(int RIVsize, int nonZeros, int blockSize){
	RIVKeyData.RIVsize = RIVsize;
	/* nonZeros has to be even */
	const int remainder = nonZeros%2;
	if(remainder != 0){
		printf("your nonZeros must be an even number");
		nonZeros++;
		printf(", changed to %d", nonZeros);
	}
	RIVKeyData.nonZeros = nonZeros;
	/* one mask per non-zero: SEEDMASK shifted a further 5 bits each time */
	long long int *maskTable = (long long int*)malloc(nonZeros*sizeof(long long int));
	RIVKeyData.masks = maskTable;
	int shift = 0;
	for(int k = 0; k<nonZeros; k++){
		maskTable[k] = SEEDMASK>>shift;
		shift += 5;
	}
	RIVKeyData.h_tempBlock = (int*)malloc(blockSize*sizeof(int));
	RIVKeyData.thing = 0;
}
/* makeSeeds derives RIVKeyData.nonZeros seeds from a 0-terminated word
 * and appends them to the seed array at (*seeds)+*seedCount, then adds
 * nonZeros to *seedCount.
 * no bounds check is made here: the caller must make sure the array has
 * room for nonZeros more ints */
void makeSeeds(unsigned char* word, int **seeds, int *seedCount){
int i=0;
int seedbase = 0;
/* fold the word into one int: each byte is shifted left by 5 bits per
 * character position and added in.
 * NOTE(review): from the 7th character on, the shift count i*5 reaches
 * 35 or more, which exceeds the width of a 32-bit int (undefined in C).
 * the seeds produced for longer words presumably depend on the target's
 * shift behaviour; changing this would change every such seed - confirm
 * before touching */
while(*word){
seedbase += (*(word))<<(i*5);
word++;
i++;
}
/* seed i is the base shifted right by i bits, plus 3*i */
int *seedTrack = (*seeds)+*seedCount;
for(i =0 ; i<RIVKeyData.nonZeros; i++){
*seedTrack = (seedbase>>i)+(3*i);
seedTrack++;
}
*seedCount+=RIVKeyData.nonZeros;
return;
}
/* makeSparseLocations converts seedCount seeds into vector locations.
 * each seed is XORed with the next mask (the masks are used in rotation),
 * masked to 31 bits and reduced modulo RIVKeyData.RIVsize. the locations
 * are written into RIVKeyData.h_tempBlock, which is returned; seeds may
 * be that same block, since each seed is read before its slot is
 * written */
int* makeSparseLocations(int* seeds, int seedCount){
	int *out = RIVKeyData.h_tempBlock;
	long long int *maskTable = RIVKeyData.masks;
	int maskIndex = 0;
	for(int n = 0; n < seedCount; n++){
		long long int mixed = (seeds[n] ^ maskTable[maskIndex]) & 2147483647;
		out[n] = mixed % (RIVKeyData.RIVsize);
		/* step to the next mask, wrapping after the last one */
		maskIndex++;
		if(maskIndex >= RIVKeyData.nonZeros) maskIndex -= RIVKeyData.nonZeros;
	}
	return out;
}
/* sscanAdvance copies characters from *string into word up to the next
 * space or the end of the string, 0-terminates word, and leaves *string
 * pointing just past the consumed space (or at the terminating 0).
 * word must be large enough for the token; no bound is checked.
 * returns word */
unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){
	unsigned char *cursor = *string;
	unsigned char *out = word;
	for(; *cursor; cursor++){
		if(*cursor == ' '){
			/* consume the separator and stop */
			cursor++;
			break;
		}
		*out++ = *cursor;
	}
	*out = 0;
	*string = cursor;
	return word;
}
sparseRIV text2L2(unsigned char *text){
unsigned char *word = (unsigned char*)calloc(2000, 1);
int *seeds = ( int*)malloc(RIVKeyData.nonZeros*sizeof( int));
unsigned char *text_slider = text;
int seedCount = 0;
while(*text_slider){
sscanAdvance(&text_slider, word);
makeSeeds(word, &seeds, &seedCount);
memset(word, 0, 2000);
}
int *locations = makeSparseLocations(seeds, seedCount);
int *L2dense;
L2dense = mapI2D(locations, seedCount);
free(locations);
sparseRIV output;
consolidateD2S(&output, L2dense);
free(seeds);
return output;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment