Commit ad4b27c9 by etcart

secured push against cache

parent 9fd65b3a
......@@ -7,6 +7,7 @@
#include <unistd.h>
#include <sys/stat.h>
#include "RIVaccessories.h"
#include "assert.h"
/* RIVSIZE macro defines the dimensionality of the RIVs we will use
* 25000 is the standard, but can be redefined specifically
*/
......@@ -14,8 +15,8 @@
#define RIVSIZE 25000
#endif
#if RIVSIZE<0
#error "RIVSIZE must be a positive number (preferably a large positive)"
#if RIVSIZE<4
#error "RIVSIZE must be a positive number, greater than 4 (preferably a large positive)"
#endif
/* NONZeros macro defines the number of non-zero values that will be generated
......@@ -36,7 +37,7 @@
* that do not use lexpull/push
*/
#ifndef CACHESIZE
#define CACHESIZE 5000
#define CACHESIZE 10000
#endif
#if CACHESIZE<0
......@@ -57,10 +58,10 @@ typedef struct{
char name[100];
int *values;
int *locations;
size_t count;
float magnitude;
int contextSize;
int count;
int frequency;
int contextSize;
float magnitude;
}sparseRIV;
/* the denseRIV is a RIV form optimized for overwhelmingly non-0 vectors
* this is rarely the case, but its primary use is for performing vector
......@@ -68,11 +69,11 @@ typedef struct{
* performed between sparse and dense (hetero-arithmetic)
*/
typedef struct{
int cached;
char name[100];
int cached;
int frequency;
float magnitude;
int contextSize;
float magnitude;
int values[RIVSIZE];
}denseRIV;
......@@ -99,13 +100,13 @@ sparseRIV consolidateD2S(int *denseInput); //#TODO fix int*/denseRIV confusion
* this produces an "implicit" RIV which can be used with the mapI2D function
* to create a denseRIV.
*/
void makeSparseLocations(char* word, int *seeds, size_t seedCount);
void makeSparseLocations(char* word, int *seeds, int seedCount);
/* mapI2D maps an "implicit RIV" that is, an array of index values,
* arranged by chronological order of generation (as per makesparseLocations)
* it assigns, in the process of mapping, values according to ordering
*/
int* mapI2D(int *locations, size_t seedCount);
int* mapI2D(int *locations, int seedCount);
/* highly optimized method for adding vectors. there is no method
* included for adding D2D or S2S, as this system is faster-enough
......@@ -121,7 +122,7 @@ int cacheDump();
/* adds all elements of an implicit RIV (a sparseRIV represented without values)
* to a denseRIV. used by the file2L2 functions in aggregating a document vector
*/
int* addI2D(int* destination, int* locations, size_t seedCount);
int* addI2D(int* destination, int* locations, int seedCount);
/*subtracts a words vector from its own context. regularly used in lex building
*/
......@@ -136,6 +137,7 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
/* apply values at an index based on locations */
while(locations_slider<locations_stop){
destination[*locations_slider] += *values_slider;
locations_slider++;
values_slider++;
......@@ -144,7 +146,7 @@ int* addS2D(int* destination, sparseRIV input){// #TODO fix destination paramete
return destination;
}
int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int* mapI2D(int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
int *destination = (int*)calloc(RIVSIZE,sizeof(int));
int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
......@@ -160,7 +162,7 @@ int* mapI2D(int *locations, size_t valueCount){// #TODO fix destination paramete
return destination;
}
int* addI2D(int* destination, int *locations, size_t valueCount){// #TODO fix destination parameter vs calloc of destination
int* addI2D(int* destination, int *locations, int valueCount){// #TODO fix destination parameter vs calloc of destination
int *locations_slider = locations;
int *locations_stop = locations_slider+valueCount;
......@@ -203,6 +205,7 @@ sparseRIV consolidateD2S(int *denseInput){
}
}
/* a slot is opened for the locations/values pair */
output.locations = (int*) malloc(output.count*2*sizeof(int));
if(!output.locations){
printf("memory allocation failed"); //*TODO enable fail point knowledge and security
......@@ -220,7 +223,7 @@ sparseRIV consolidateD2S(int *denseInput){
void makeSparseLocations(char* word, int *locations, size_t count){
void makeSparseLocations(char* word, int *locations, int count){
locations+=count;
srand(wordtoSeed(word));
int *locations_stop = locations+NONZEROS;
......
No preview for this file type
File added
#include <stdio.h>
#define CACHESIZE 0
#define CACHEEXCLUSIVE 1
#define RIVSIZE 50000
#include "RIVtools.h"
char* stem(char* word);
/* Builds one aggregate vector for the book at ../books/pg56902.txt:
 * every whitespace-delimited token is stemmed with stem(), pulled from the
 * "consolidatedLexicon50-8" lexicon, and its vector is added into accumulate.
 * Tokens that cannot be stemmed, or whose stem is missing from the lexicon,
 * are reported on stdout.
 * returns 1 if the book cannot be opened, 0 otherwise
 * NOTE(review): accumulate is filled but never read or written out here.
 */
int main(){
	lexOpen("consolidatedLexicon50-8");
	FILE* text = fopen("../books/pg56902.txt", "r");
	if(!text){
		puts("no file");
		lexClose();
		return 1;
	}
	denseRIV accumulate = {0};
	sparseRIV temp;
	char word[100];

	/* fscanf returns EOF (which is non-zero) at end of input, so the result
	 * is compared against 1; this also keeps a final word that is not
	 * followed by whitespace, which a feof() test would discard */
	while(fscanf(text, "%99s", word) == 1){
		/* stem() rewrites word in place when it finds a stem */
		if(!stem(word)){
			printf("%s, not in wordNet\n", word);
			continue;
		}
		denseRIV* wordRIV = lexPull(word);
		if(!wordRIV){
			printf("%s, not in lexicon\n", word);
			continue;
		}
		/* the sparse form is only needed for the addition, so it is
		 * released immediately afterwards together with the pulled vector */
		temp = consolidateD2S(wordRIV->values);
		addS2D(accumulate.values, temp);
		free(temp.locations);
		free(wordRIV);
	}

	fclose(text);
	/* every other program in this project pairs lexOpen with lexClose */
	lexClose();
	return 0;
}
/* stem looks a word up in the per-word files under "WN/" and rewrites it,
 * in place, to its stem.
 *
 * The file "WN/<word>" starts with an integer code:
 *   1       - the word is accepted as it is
 *   2       - the next token names another entry; that entry's file is
 *             opened and the token following its leading integer (if any)
 *             replaces word
 *   other   - the word is rejected
 *
 * word must point to a writable buffer of at least 100 bytes (the caller in
 * this file uses char word[100]); at most 99 characters are stored in it.
 *
 * returns word on success, NULL if the word has no usable entry
 */
char* stem(char* word){
	char pathString[200];
	int WNdata = 0;
	int length;
	FILE* WNfile;

	length = snprintf(pathString, sizeof pathString, "WN/%s", word);
	if(length < 0 || (size_t)length >= sizeof pathString) return NULL;

	WNfile = fopen(pathString, "r");
	if(!WNfile) return NULL;

	/* an entry without a leading integer is treated as "no entry" rather
	 * than acting on an uninitialized code */
	if(fscanf(WNfile, "%d", &WNdata) != 1){
		fclose(WNfile);
		return NULL;
	}
	if(WNdata == 1){
		fclose(WNfile);
		return word;
	}
	if(WNdata != 2){
		fclose(WNfile);
		return NULL;
	}

	/* code 2: the rest of the entry names the word to follow */
	int gotTarget = fscanf(WNfile, "%99s", word);
	fclose(WNfile);
	if(gotTarget != 1) return NULL;

	length = snprintf(pathString, sizeof pathString, "WN/%s", word);
	if(length < 0 || (size_t)length >= sizeof pathString) return NULL;

	WNfile = fopen(pathString, "r");
	if(!WNfile) return NULL;

	/* if the target entry carries no token after its integer, word keeps
	 * the target name read above, so the result is deliberately ignored */
	(void)fscanf(WNfile, "%*d%99s", word);
	fclose(WNfile);
	return word;
}
File added
#include <stdio.h>
#define RIVSIZE 50000
#define CACHESIZE 0
#include "RIVtools.h"
#include <dirent.h>
/* Consolidates the lexicon directory given as argv[1] into the
 * "consolidatedLexicon50-8" lexicon.
 *
 * Every regular, non-dot entry of the directory is pulled as a vector.
 * Vectors whose contextSize is below 7000 are dropped; the rest are
 * normalized (factor 10000), copied into a fresh denseRIV and, once the
 * source lexicon is closed, pushed into the consolidated lexicon.
 * One CSV line "frequency,contextSize,magnitude,kept,seen" is printed to
 * stdout per kept vector; diagnostics go to stderr so that stream stays clean.
 *
 * returns 1 on a missing argument or unreadable directory, 0 otherwise
 */
int main(int argc, char* argv[]){
	if(argc < 2){
		fprintf(stderr, "give me a directory\n");
		return 1;
	}
	lexOpen(argv[1]);
	denseRIV* intake;
	sparseRIV examine;
	static denseRIV *output[60000] = {0};
	const int outputLimit = (int)(sizeof output / sizeof output[0]);
	DIR *directory;
	struct dirent *files = 0;

	if(!(directory = opendir(argv[1]))){
		printf("location not found, %s\n", argv[1]);
		lexClose();
		return 1;
	}

	int i=0;	/* vectors kept so far */
	int j=0;	/* lexicon entries examined so far */
	while((files=readdir(directory))){
		if(*(files->d_name) == '.') continue;
		if(files->d_type == DT_DIR){
			/* the lexicon should not have valid sub-directories */
			continue;
		}
		j++;

		/* the name is copied into fixed-size name fields below */
		if(strlen(files->d_name) >= sizeof examine.name){
			fprintf(stderr, "skipped (name too long): %s\n", files->d_name);
			continue;
		}
		intake = lexPull(files->d_name);
		if(!intake){
			fprintf(stderr, "%s, not in lexicon\n", files->d_name);
			continue;
		}
		/* vectors seen in too small a context are dropped as not
		 * statistically useful */
		if(intake->contextSize<7000){
			free(intake);
			continue;
		}
		if(i >= outputLimit){
			fprintf(stderr, "output table full, stopping at %d vectors\n", i);
			free(intake);
			break;
		}
		output[i] = calloc(1, sizeof(denseRIV));
		if(!output[i]){
			fprintf(stderr, "memory allocation failed\n");
			free(intake);
			break;
		}

		examine = normalize(*intake, 10000);
		strcpy(examine.name, files->d_name);
		printf("%d,%d,%lf,%d,%d\n", examine.frequency, examine.contextSize, examine.magnitude, i, j);

		/* rebuild a dense vector from the normalized sparse form */
		addS2D(output[i]->values, examine);
		output[i]->magnitude = examine.magnitude;
		strcpy(output[i]->name, files->d_name);
		output[i]->frequency = intake->frequency;
		output[i]->contextSize = intake->contextSize;
		free(intake);
		free(examine.locations);
		i++;
	}
	closedir(directory);
	lexClose();

	/* the kept vectors are written only after the source lexicon is closed */
	lexOpen("consolidatedLexicon50-8");
	for(int k=0; k<i; k++){
		lexPush(output[k]);
	}
	lexClose();
	return 0;
}
File added
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <time.h>
#include "RIVtools.h"
#define THRESHOLD 0.70
/* this program identifies all near-duplicates among the documents in the
* chosen root directory, using RIV comparison */
// fills the fileRIVs array with a vector for each file in the root directory
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);
/* Reports near-duplicate documents under the directory given as argv[1].
 *
 * One sparse vector is built per file (directoryToL2s). Each vector is then
 * compared against every earlier one: pairs whose magnitudes lie within 15%
 * of each other and whose cosine exceeds THRESHOLD are printed as
 * "<name>\t<name>\n<cosine>".
 *
 * returns 1 on a missing/over-long argument or allocation failure, 0 otherwise
 * NOTE(review): the per-vector locations buffers are not released before
 * exit; only the array holding the vectors is freed.
 */
int main(int argc, char *argv[]){
	if(argc <2){
		printf("give me a directory");
		return 1;
	}

	/* build "<dir>/" with a bounds check rather than strcpy/strcat */
	char rootString[2000];
	int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
	if(rootLength < 0 || (size_t)rootLength >= sizeof rootString){
		printf("directory path too long\n");
		return 1;
	}

	int fileCount = 0;
	//initializes the fileRIVs array to be realloced by the later function
	sparseRIV *fileRIVs = (sparseRIV*) malloc(1*sizeof(sparseRIV));
	if(!fileRIVs){
		printf("memory allocation failed");
		return 1;
	}

	//gather all vectors into the fileRIVs array and count them in fileCount
	directoryToL2s(rootString, &fileRIVs, &fileCount);
	printf("fileCount: %d\n", fileCount);

	//first calculate all magnitudes for later use
	for(int i = 0; i < fileCount; i++){
		fileRIVs[i].magnitude = getMagnitudeSparse(fileRIVs[i]);
	}

	clock_t begintotal = clock();
	double cosine;
	double minmag;
	double maxmag;
	//all cosines need a sparse-dense comparison, so one dense vector is
	//reused as the base of every comparison
	denseRIV baseDense;
	for(int i = 0; i < fileCount; i++){
		//zero out the dense vector, and map the next sparse vector to it
		memset(&baseDense, 0, sizeof(denseRIV));
		addS2D(baseDense.values, fileRIVs[i]);
		//pass the magnitude to the dense vector
		baseDense.magnitude = fileRIVs[i].magnitude;

		//if two vectors are too different in size, they cannot be duplicates
		minmag = baseDense.magnitude*.85;
		maxmag = baseDense.magnitude*1.15;
		for(int j = 0; j < i; j++){
			//only vectors within the magnitude window are compared
			if(fileRIVs[j].magnitude < maxmag
			&& fileRIVs[j].magnitude > minmag){
				//identify the similarity of these two vectors
				cosine = cosCompare(baseDense, fileRIVs[j]);
				//flag the pair if it is similar enough
				if(cosine>THRESHOLD){
					printf("%s\t%s\n%f\n", fileRIVs[i].name , fileRIVs[j].name, cosine);
				}
			}
		}
	}
	printf("fileCount: %d", fileCount);
	free(fileRIVs);
	clock_t endtotal = clock();
	double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
	printf("total time:%lf\n\n", time_spent);
	return 0;
}
/* directoryToL2s walks rootString recursively and appends one sparseRIV per
 * readable file to the array behind *fileRIVs.
 *
 * rootString - directory path, expected to end with '/'
 * fileRIVs   - address of a heap array of sparseRIVs; it is grown with
 *              realloc, so the caller's pointer may change
 * fileCount  - number of vectors currently in the array; incremented for
 *              every file added
 *
 * Entries whose name starts with '.' are ignored, which covers ".", ".."
 * and hidden files. Each vector's name is set to the file's path; a path
 * that does not fit the name field keeps its tail (the file-name end).
 * The walk of a directory stops early if the array cannot be grown.
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
	/* *** begin Dirent walk *** */
	char pathString[2000];
	DIR *directory;
	struct dirent *files = 0;

	if(!(directory = opendir(rootString))){
		printf("location not found, %s\n", rootString);
		return;
	}
	while((files=readdir(directory))){
		if(!files->d_name[0]) break;
		/* skip dot entries one at a time so that the end of the directory
		 * (readdir returning NULL) is always seen by the loop condition */
		if(*(files->d_name)=='.') continue;

		int isDirectory = (files->d_type == DT_DIR);
		int length = snprintf(pathString, sizeof pathString, "%s%s%s",
				rootString, files->d_name, isDirectory ? "/" : "");
		if(length < 0 || (size_t)length >= sizeof pathString){
			printf("skipped: %s\n", files->d_name);
			continue;
		}
		if(isDirectory){
			directoryToL2s(pathString, fileRIVs, fileCount);
			continue;
		}
		/* *** end dirent walk, begin meat of function *** */
		FILE *input = fopen(pathString, "r");
		if(input){
			/* grow through a temporary so the existing array survives a
			 * failed realloc */
			sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((size_t)(*fileCount)+1)*sizeof(sparseRIV));
			if(!grown){
				printf("memory allocation failed");
				fclose(input);
				break;
			}
			*fileRIVs = grown;

			sparseRIV *slot = &grown[*fileCount];
			*slot = fileToL2(input);

			/* the name field is much smaller than pathString; keep the
			 * tail of an over-long path */
			const char *label = pathString;
			if((size_t)length >= sizeof slot->name){
				label += (size_t)length - (sizeof slot->name - 1);
			}
			snprintf(slot->name, sizeof slot->name, "%s", label);

			fclose(input);
			*fileCount += 1;
		}
	}
	closedir(directory);
}
#include <stdio.h>
#define RIVSIZE 25000
#define RIVSIZE 50000
#define CACHESIZE 0
#include "RIVtools.h"
#include <dirent.h>
......@@ -7,8 +7,6 @@
int main(int argc, char* argv[]){
lexOpen(argv[1]);
denseRIV* intake;
sparseRIV examine;
static denseRIV *output[60000] = {0};
DIR *directory;
struct dirent *files = 0;
......@@ -28,27 +26,15 @@ int main(int argc, char* argv[]){
intake = lexPull(files->d_name);
/* if the vector has been encountered more than MINSIZE times
* then it should be statistically significant, and useful */
if(intake->contextSize<10000)continue;
examine = normalize(*intake, 500);
strcpy(examine.name, files->d_name);
printf("%d,%d,%lf,%s\n", examine.frequency, examine.contextSize, examine.magnitude, examine.name);
output[i] = calloc(1, sizeof(denseRIV));
addS2D(output[i]->values, examine);
output[i]->magnitude = examine.magnitude;
strcpy(output[i]->name, files->d_name);
output[i]->frequency = intake->frequency;
free(intake);
free(examine.locations);
i++;
}
lexClose();
/*lexOpen("consolidatedLexiconAggressive");
for(int j=0; j<i; j++){
lexPush(output[j]);
printf("%d,%d,%lf,%d,%s\n", intake->frequency, intake->contextSize, intake->magnitude, i, files->d_name);
free(intake);
i++;
}
lexClose();*/
lexClose();
return 0;
}
No preview for this file type
No preview for this file type
......@@ -6,6 +6,7 @@
#include <dirent.h>
#include <error.h>
#include <string.h>
#define CACHESIZE 100000
#include "RIVtools.h"
//this program reads a directory full of files, and adds all context vectors (considering file as context)
......@@ -17,6 +18,7 @@ void directoryGrind(char *rootString);
void lineGrind(char* textLine);
int main(int argc, char *argv[]){
char pathString[1000];
//we open the lexicon, if it does not yet exist, it will be created
......@@ -69,7 +71,7 @@ void directoryGrind(char *rootString){
printf("skipped: %s\n", files->d_name);
continue;
}
puts(files->d_name);
//open a file within root directory
FILE *input = fopen(pathString, "r");
if(input){
......@@ -83,11 +85,11 @@ void directoryGrind(char *rootString){
void fileGrind(FILE* textFile){
char textLine[5000];
char textLine[10000];
// included python script separates paragraphs into lines
while(fgets(textLine, 4999, textFile)){
int i=0;
while(fgets(textLine, 9999, textFile)){
printf("line: %d\n", i++);
if(!strlen(textLine)) continue;
if(feof(textFile)) break;
......@@ -100,7 +102,11 @@ void fileGrind(FILE* textFile){
void lineGrind(char* textLine){
//extract a context vector from this text set
sparseRIV contextVector = textToL2(textLine);
if(contextVector.contextSize <= 1){
free(contextVector.locations);
return;
}
denseRIV* lexiconRIV;
//identify stopping point in line read
char* textEnd = textLine + strlen(textLine)-1;
......@@ -110,6 +116,7 @@ void lineGrind(char* textLine){
sscanf(textLine, "%99s%n", word, &displacement);
//we ensure that each word exists, and is free of unwanted characters
textLine += displacement+1;
if(!(*word))continue;
if(!isWordClean((char*)word)){
......@@ -132,7 +139,7 @@ void lineGrind(char* textLine){
//and finally we push it back to the lexicon for permanent storage
lexPush(lexiconRIV);
textLine += displacement+1;
}
//free the heap allocated context vector data
......
No preview for this file type
No preview for this file type
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
import numpy as np
import matplotlib.pyplot as plt
import math
def fit(x):
    # Reference curve the script compares each magnitude against:
    # a constant offset plus a term inversely proportional to x.
    # The leading factor of 1 is kept as a placeholder for rescaling.
    offset = 1067
    numerator = 94500000
    scale = 1
    return scale * (offset + numerator / x)
x = 7
range = 0.15
while(1):
range = input("gimmerange");
data = open("graphdata.txt", "r");
frequencies = [];
mags = [];
fitline = [];
i = 0;
for line in data:
segments = line.split(",")
freq = int(segments[1])
mag = float(segments[2])
name = segments[4];
if(freq>40000):
continue;
core = fit(freq)
fitmax = core*(1+range);
fitmin = core*(1-range);
if(mag >fitmax or mag < fitmin):
continue
frequencies.append(freq)
mags.append(mag)
fitline.append(fit(freq));
print("{} {} {}".format(name, freq, mag))
i+=1
data = open("../code/RIVet/graphdata.txt", "r");
frequencies = [];
mags = [];
i = 0;
for line in data:
if(int(line.split(",")[1])>40000):
continue;
frequencies.append(int(line.split(",")[1]))
mags.append(float(line.split(",")[2]))
if(mags[i]>80 and frequencies[i]>7000 and frequencies[i]<15000):
print(line)
i+=1
plt.scatter(frequencies, mags)
plt.show()
#plt.scatter(frequencies, mags)
plt.plot(frequencies, fitline, 'r^', frequencies, mags, 'bs')
plt.show()
x+=1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment