updated lots of stuff

fe20c6f5 · etcart · 60856c1d · fe20c6f5 · fe20c6f5 · fe20c6f5
Commit fe20c6f5 authored May 09, 2018 by etcart
Showing with 586 additions and 231 deletions
.stemnet2.txt.swp
.stemnete.txt.swp
RIVaccessories.h
RIVaccessories.h.gch
RIVclasses
RIVclasses.c
RIVclasses.o
RIVlexicon.h
RIVlexicon.h.gch
RIVread.c
runscriptUb.sh
saturation.c
someshit.c
stemconfig/dbtools.py
stemconfig/dbtools.pyc
stemconfig/stemconf
stemconfig/stemconf.c
stemconfig/stemconf.o
stemconfig/stemconfig
stemconfig/stemconfig.c
--- a/.stemnet2.txt.swp
+++ b/.stemnet2.txt.swp
--- a/.stemnete.txt.swp
+++ b/.stemnete.txt.swp
--- a/RIVaccessories.h
+++ b/RIVaccessories.h
@@ -4,6 +4,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include "stemconfig/stemset.h"
 struct treenode{
 	void* data;
@@ -11,14 +12,14 @@ struct treenode{
 	struct treenode* links[26];
 	int downstream;
-};
+}*nextNode;
+void stemInsert(struct treenode* node, char* letter, void* data);
 int treecut(struct treenode* node, char* letter);
-void stemInsert(struct treenode* node, char* letter, char* data);
-void RIVinsert(struct treenode* node, char* letter, void* data);
+void treeInsert(struct treenode* node, char* letter, void* data);
 void* treeSearch(struct treenode* node, char* letter);
 struct treenode* stemTreeSetup();
 /*isWordClean filters words that contain non-letter characters, and 
 * upperCase letters, allowing only the '_' symbol through
 */
@@ -64,27 +65,34 @@ int wordtoSeed(char* word){
 	return seed;
 }
 struct treenode* stemTreeSetup(){
-	FILE* netfile = fopen("stemnet2.txt", "r");
+	FILE* wordFile = fopen("stemconfig/wordset.txt", "r");
-	if(!netfile){
+	if(!wordFile){
-		printf("no stemnet file");
+		printf("no wordnet file");
 		return 0;
 	}
-	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
+	struct treenode* rootNode = calloc(treesize, sizeof(struct treenode));
+	nextNode = rootNode+1;
 	char word[100];
-	char stem[100];
+	char* stem = (char*)stemset;
+	int displacement;
+	while(fscanf(wordFile, "%s", word)){
-	while(fscanf(netfile, "%s %s", word, stem)){
+		sscanf(stem, "%*s%n", &displacement);
+		stem[displacement] = '\0';
-		if(feof(netfile)){
-			break;
-		}
 		stemInsert(rootNode, word, stem);
+		if(feof(wordFile)){
+			break;
+		}
+		stem += displacement+1;
 	}
+	fclose(wordFile);
 	return rootNode;
 }
 void* treeSearch(struct treenode* node, char* letter){
@@ -100,15 +108,15 @@ void* treeSearch(struct treenode* node, char* letter){
 		return node->data;
 	}
 }
-void RIVinsert(struct treenode* node, char* letter, void* data){
+void stemInsert(struct treenode* node, char* letter, void* data){
 	node->downstream++;
 	if(*(letter)){
 		if(!node->links[*(letter)-'a']){
-			node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
+			node->links[*(letter)-'a'] = nextNode++;
 		}
-		RIVinsert(node->links[*(letter)-'a'], letter+1, data);
+		treeInsert(node->links[*(letter)-'a'], letter+1, data);
 	}else{
@@ -119,43 +127,46 @@ void RIVinsert(struct treenode* node, char* letter, void* data){
 	}
 }
-void stemInsert(struct treenode* node, char* letter, char* data){
+void treeInsert(struct treenode* node, char* letter, void* data){
 	node->downstream++;
 	if(*(letter)){
 		if(!node->links[*(letter)-'a']){
 			node->links[*(letter)-'a'] = calloc(1, sizeof(struct treenode));
 		}
-		stemInsert(node->links[*(letter)-'a'], letter+1, data);
+		treeInsert(node->links[*(letter)-'a'], letter+1, data);
 	}else{
 		if(node->data) return;
-		node->data = calloc(strlen(data)+1, sizeof(char));
+		node->data = data;
-		strcpy((char*)node->data, data);
 	}
 }
 int treecut(struct treenode* node, char* letter){
 	node->downstream--;
 	int flag;
+	//continue searching downstream if there is a letter
 	if(*(letter)){
 		if(node->links[*(letter)-'a']){
+			//propagate to next section
 			flag = treecut(node->links[*(letter)-'a'], letter+1);
+			//if next section returned a "cut" flag, 0 it out
 			if(flag){
 				node->links[*(letter)-'a'] = NULL;
 			}
 		}
-		if(!node->downstream){
+	//there are no more letters, we've reached our destination
+	}else{
-			free(node);
+		node->data = NULL;
-			return 1;
 	}
-	}else{
+	//this is on a branch that leads nowhere, free it and return "cut" flag
+	if(!node->downstream){
 		free(node);
 		return 1;
@@ -164,5 +175,17 @@ int treecut(struct treenode* node, char* letter){
 }
+void destroyTree(struct treenode* node){
+	if(node->data) free(node->data);
+	for(int i=0; i<26; i++){
+		if(node->links[i]){
+			destroyTree(node->links[i]);
+		}
+	}
+	free(node);
+}
 #endif
--- a/RIVaccessories.h.gch
+++ b/RIVaccessories.h.gch
--- a/RIVclasses
+++ b/RIVclasses
--- a/RIVclasses.c
+++ b/RIVclasses.c
 #include <stdio.h>
 #define RIVSIZE 50000
+#define CACHESIZE 20000
 #include "RIVtools.h"
-char* clean(char* word);
+#define k 5
-char* stemmy(struct treenode* searchRoot, char* word);
-sparseRIV line2L3(char* text, struct treenode* searchRoot);
 typedef char label[200];
 struct RIVclass{
 	label name;
 	sparseRIV* set;
 	int setSize;
 };
+char* clean(char* word);
+char* stemmy(struct treenode* searchRoot, char* word);
+sparseRIV line2L3(char* text, struct treenode* searchRoot);
+int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion);
 LEXICON* lexicon;
 int main(){
 	struct treenode* searchRoot = stemTreeSetup();
-	lexicon = lexOpen("consolidatedLexicon", "rx");
+	lexicon = lexOpen("lexiconEnron50-4", "rx");
 	int classNo = 0;
@@ -25,18 +30,38 @@ int main(){
-	FILE* textSet = fopen("../../Downloads/labeledText.tsv", "r");
+	FILE* textSet = fopen("../../Downloads/trainingText.tsv", "r");
 	if(!textSet){
 		puts("no file");
 		return 1;
 	}
-	struct RIVclass* class;
+	struct RIVclass* class = 0;
 	char text[20000];
 	label className;
-	while(fscanf(textSet, "%s\t%s", text, className)){
+	//int j=0;
+	while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
+		//if(j++>100) break;
+		if(feof(textSet)) break;
-		char* labelTemp = strstr(*classNames, className);
-		if(!labelTemp){
+		sparseRIV temp = line2L3(text, searchRoot);
+		temp.magnitude = getMagnitudeSparse(temp);
+		if(temp.magnitude == 0){
+			printf("%s, empty\n", text);
+			continue;
+		}
+		//printf("%s, %s", text, className);
+		int i=0;
+		for(; i< classCount; i++){
+			if(!strcmp(className, classNames[i])){
+				classNo = i;
+				class = classes+classNo;
+				break;
+			}
+		}
+		if(i == classCount){
 			/* reinitialize the classnames with a new member */
 			classNames = realloc(classNames, (classCount+1)*sizeof(label));
 			strcpy(classNames[classCount], className);
@@ -53,14 +78,10 @@ int main(){
 			classNo = classCount;
 			classCount++;
-		}else{
-			classNo = (labelTemp-*classNames);
-			class = classes+classNo;
 		}
 		class->set = realloc(class->set, (class->setSize+1) *sizeof(sparseRIV));
-		sparseRIV thing= line2L3(text, searchRoot);
+		sparseRIV thing= temp;
 		class->set[class->setSize] = thing;
 		class->setSize++;
@@ -69,10 +90,71 @@ int main(){
 	for(int i=0; i<classCount; i++){
 		puts(classNames[i]);
+		puts(classes[i].name);
 		printf("%d\n\n", classes[i].setSize);
 	}
+	fclose(textSet);
+	textSet = fopen("../../Downloads/validationText.tsv", "r");
+	if(!textSet) return 1;
+	int won = 0;
+	int docTotal = 0;
+	//scanf("%d", &won);
+	//j=0;
+	while(fscanf(textSet, "%[^\t]\t%[^\n]", text, className)){
+		if(feof(textSet)) break;
+		//if(j++>30) break;
+		int i=0;
+		for(; i< classCount; i++){
+			if(!strcmp(className, classNames[i])){
+				classNo = i;
+				class = classes+classNo;
+				break;
+			}
+		}if(i == classCount){
+			printf("unclassifiable\n");
+			continue;
+		}
+		sparseRIV thing= line2L3(text, searchRoot);
+		if(thing.count ==0){
+			continue;
+		}
+		docTotal++;
+		denseRIV inQuestion = {0};
+		addS2D(inQuestion.values, thing);
+		inQuestion.magnitude = getMagnitudeDense(&inQuestion);
+		double weights[classCount];
+		int choice = kNearest(weights, classes, classCount, inQuestion);
+		if(choice == -1){
+			printf("classificationFailed");
+		}else{
+			//puts(text);
+			printf("survey says! %s  ", className);
+			printf("your asnwer was...%d, %s\n", choice, classes[choice].name);
+		}
+		if(choice == classNo){
+			won++;
+		}
+		free(thing.locations);
+	}
+	printf("\n\n we got %d/%d ", won, docTotal);
+	for(int i=0; i<classCount; i++){
+		for(int j=0; j<classes[i].setSize; j++){
+			free(classes[i].set[j].locations);
+		}
+		free(classes[i].set);
+	}
+	free(classes);
+	free(classNames);
+	destroyTree(searchRoot);
+	lexClose(lexicon);
+	fclose(textSet);
 	return 0;
 }
@@ -132,24 +214,72 @@ sparseRIV line2L3(char* text, struct treenode* searchRoot){
 				continue;
 			}else{
 				//printf("%s, succesfully pulled\n", stem);
-				temp = consolidateD2S(wordRIV->values);
+				temp = normalize(*wordRIV, 10000);
+				//temp = consolidateD2S(wordRIV->values);
 				addS2D(accumulate.values, temp);
 				free(temp.locations);
-				free(wordRIV);
+				//free(wordRIV);
+				lexPush(lexicon, wordRIV);
 			}
 		}
 	}
 	temp = consolidateD2S(accumulate.values);
 	return temp;
+}
+int kNearest(double* weights, struct RIVclass* classes, int classCount, denseRIV inQuestion){
+	int choice = -1;
+	memset(weights, 0, classCount*sizeof(double));
+	double distances[k] = {-2};
+	int labels[k] = {0};
+	int fill = 0;
+	for(int i=0; i<classCount; i++){
+		for(int j=0; j<classes[i].setSize; j++){
+			double cosine = cosCompare(inQuestion, classes[i].set[j]);
+			if(fill < k){
+				distances[fill] = cosine;
+				fill++;
+				continue;
+			}
+			for(int x = 0; x<k; x++){
-}
+				if(cosine>distances[x]){
+					distances[x] = cosine;
+					labels[x] = i;
+					break;
+				}
+			}
+		}
+	}
+	double totalweight = 0;
+	for(int i=0; i<classCount; i++){
+		for(int j = 0; j<k; j++){
+			if(labels[j] == i){
+				weights[i] += distances[j];
+				totalweight += distances[j];
+			}
+		}
+	}
+	double tempmax = -2;
+	for(int i=0; i<classCount; i++){
+		weights[i] /= totalweight;
+		if(weights[i] > tempmax){
+			choice = i;
+			tempmax = weights[i];
+		}
+	}
+	return choice;
+}

--- a/RIVclasses.o
+++ b/RIVclasses.o
--- a/RIVlexicon.h
+++ b/RIVlexicon.h
@@ -5,6 +5,7 @@
 #include "RIVaccessories.h"
 #include "assert.h"
+/* these flags will be used by the lexicon to know its permissions and states */
 #ifndef READFLAG
 #define READFLAG 0x01
 #endif
@@ -21,60 +22,71 @@
 #define CACHEFLAG 0x08
 #endif
+/* if the user has specified neither a hashed nor a sorted cache, we assume sorted.
+ * the hashed strategy is extremely CPU and memory light, but very ineffective 
+ * at ensuring the most important vectors are cached. as such it is better
+ * suited to RAMdisks and unusually fast SSDs.  the sorted strategy
+ * is much more expensive for the CPU, but keeps hard-drive reads and 
+ * writes to a minimum far more effectively */
 #ifndef SORTCACHE
 	#ifndef HASHCACHE
 		#define SORTCACHE
 	#endif
 #endif
+/* the LEXICON struct is used much like a FILE (through a pointer); it
+ * contains all metadata that a lexicon needs in order to be read from and written to safely */
 typedef struct{
 	char lexName[100];
 	denseRIV* *cache;
 	struct cacheList* listPoint;
 	char flags;
 	#ifdef SORTCACHE
+	/* if our cache is sorted, we will need a search tree and a saturation */
 	struct treenode* treeRoot;
+	int cacheSaturation;
+	denseRIV* *cache_slider;
 	#endif /* SORTCACHE */
 }LEXICON;
+/* this will form a linked list of caches, so that all data can be safely dumped
+ * in event of an error, no matter how many or how strangely lexica have
+ * been opened and closed */
 struct cacheList{
 	denseRIV* *cache;
 	struct cacheList* next;
 	struct cacheList* prev;
 }*rootCache = NULL;
-#define IODISPLACEMENT   (sizeof(((sparseRIV*)0)->count)\
+/* IOstagingSlot is used by fLexPush to preformat data to be written in a single
-						+ sizeof(((sparseRIV*)0)->frequency)\
+ * fwrite() call.  it has room for RIVSIZE integers behind it and 2*RIVSIZE
-						+ sizeof(((sparseRIV*)0)->contextSize)\
+ * integers ahead of it, which the function saturationForStaging() will need */
-						+ sizeof(((sparseRIV*)0)->magnitude))\
+int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE;
-						/ sizeof(int)
-int* IOstagingSlot = RIVKey.h_tempBlock+RIVSIZE; //#TODO format this better
 /* lexOpen is called to "open the lexicon", setting up for later calls to
 * lexPush and lexPull. if the lexicon has not been opened before calls
 * to these functions, their behavior can be unpredictable, most likely crashing
+ * lexOpen accepts flags: r, w, x.
+ * r: for reading. currently meaningless: it won't stop you reading if you don't have this flag
+ * w: for writing. if a readonly lexicon is "written to" no data will be saved in hardcopy
+ * although it will be cached if possible, so that later pulls will be optimized
+ * x: exclusive. will not accept new words, lexPull returns a NULL pointer
+ * and lexPush simply frees any word which is not already in the lexicon
 */
 LEXICON* lexOpen(const char* lexName, const char* flags);
 /* lexClose should always be called after the last lex push or lex pull call
 * if the lexicon is left open, some vector data may be lost due to 
- * un-flushed RIV cache
+ * un-flushed RIV cache.  it also frees the lexicon's data; memory leaks if the lexicon is not closed
 */
 void lexClose(LEXICON*);
 /* both lexPush and lexPull must be called *after* the lexOpen() function
 * and after using them the lexClose() function must be called to ensure
- * data security */
+ * data security (only after the final push or pull, not regularly during operation) */
 /* lexPush writes a denseRIV to the lexicon for permanent storage */
 int lexPush(LEXICON* lexicon, denseRIV* RIVout);
-/* cacheCheckOnPush tests the state of this vector in our lexicon cache
- * and returns 1 on "success" indicating cache storage and no need to push to file
- * or returns 0 on "failure" indicating that the vector need be pushed to file 
- */
-int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
 /* lexPull reads a denseRIV from the lexicon, under "word"
 * if the file does not exist, it creates a 0 vector with the name of word
 * lexPull returns a denseRIV *pointer* because its data must be tracked 
@@ -82,6 +94,12 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
 */
 denseRIV* lexPull(LEXICON* lexicon, char* word);
+/* cacheCheckOnPush tests the state of this vector in our lexicon cache
+ * and returns 1 on "success" indicating cache storage and no need to push to file
+ * or returns 0 on "failure" indicating that the vector need be pushed to file 
+ */
+int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout);
 /* cacheCheckOnPull checks if the word's vector is stored in cache,
 * and returns a pointer to that vector on success
 * or returns a NULL pointer if the word is not cached, indicating a need 
@@ -96,8 +114,8 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word);
 */
 int fLexPush(LEXICON* lexicon, denseRIV* RIVout);
-/* flexPull pulls data directly from a file and converts it (if necessary)
+/* fLexPull pulls data directly from a file and outputs it as a denseRIV.
- * to a denseRIV.  function is called by "lexPull" which is what users 
+ * function is called by "lexPull" which is what users 
 * should actually use.  lexPull, unlike fLexPull, has cache logic under
 * the hood for speed and harddrive optimization 
 */
@@ -105,6 +123,7 @@ denseRIV* fLexPull(FILE* lexWord);
 /* redefines signal behavior to protect cached data against seg-faults etc*/
 void signalSecure(int signum, siginfo_t *si, void* arg);
+int cacheDump(denseRIV* *toDump);
 /* used exclusively by flexpush to determine write-style (sparse or dense)
 * and also formats the "IOstagingSlot" for fwrite as a single block if sparse
@@ -125,6 +144,7 @@ LEXICON* lexOpen(const char* lexName, const char* flags){
 		if (stat(lexName, &st) == -1) {
 			mkdir(lexName, 0777);
 		}
+		/* flag for writing*/
 		output->flags |= WRITEFLAG;
 	}else if(r){
 		/* if set to read and not write, return null if lexicon does not exist */
@@ -132,33 +152,35 @@ LEXICON* lexOpen(const char* lexName, const char* flags){
 			free(output);
 			return NULL;
 		}	
+		/* flag for reading */
 		output->flags |= READFLAG;
 	}
 		/* if not set to exclusive, set the inclusive flag */
 	if(!x){
+		/* flag inclusive (will return unknown words as 0 vector */
 		output->flags |= INCFLAG;
 	}
+	/* record the name of the lexicon */
 	strcpy(output->lexName, lexName);
 	#if CACHESIZE > 0
+	output->cache = calloc(CACHESIZE, sizeof(denseRIV*));
-	if(r && w){
-		//#TODO include hash vs sort cache logic flags
-		/* if we will be reading and writing the same lexicon, setup a
-		 * cache for this lexicon to speed up rewrites */
-		struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
-		#ifdef HASHCACHE
-		newCache->cache = calloc(CACHESIZE, sizeof(denseRIV*));
-		#else
 	#ifdef SORTCACHE
-		newCache->cache = calloc(CACHESIZE+1, sizeof(denseRIV*));
+	/* a sorted cache needs a search tree for finding RIVs by name */
 	output->treeRoot = calloc(1, sizeof(struct treenode));
-		#endif
+	output->cacheSaturation = 0;
-		#endif
+	output->cache_slider = output->cache+CACHESIZE;
+	#endif /* SORTCACHE */
+	/* flag cached ?? */ 
 	output->flags |= CACHEFLAG;
+	if(w){
+		/* setup cache-list element for break dumping */
+		struct cacheList* newCache = calloc(1, sizeof(struct cacheList));
+		newCache->cache = output->cache;
-		output->cache = newCache->cache;
 		newCache->next = rootCache;
 		if(rootCache){
 			rootCache->prev = newCache;
@@ -174,14 +196,15 @@ LEXICON* lexOpen(const char* lexName, const char* flags){
 			sigaction(i,&action,NULL);
 		}
 	}
-	#endif
+	#endif /* CACHESIZE > 0 */
 	return output;
 }
 void lexClose(LEXICON* toClose){
 #if CACHESIZE>0 
-	if(toClose->flags & CACHEFLAG){
+	if(toClose->flags & WRITEFLAG){
 		if(cacheDump(toClose->cache)){
 			puts("cache dump failed, some lexicon data was lost");
 		}
@@ -203,6 +226,7 @@ void lexClose(LEXICON* toClose){
 #if CACHESIZE > 0
 denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
 	#ifdef HASHCACHE
+	/* we find which cache entry this word belongs in by simple hashing */
 	srand(wordtoSeed(word));
 	int hash = rand()%CACHESIZE;
 	if(lexicon->cache[hash]){
@@ -214,7 +238,7 @@ denseRIV* cacheCheckOnPull(LEXICON* lexicon, char* word){
 	return NULL;
 	#endif
 	#ifdef SORTCACHE
+	/* use a treeSearch (found in RIVaccessories) to find the denseRIV* in the cache */
 	return treeSearch(lexicon->treeRoot, word);
 	#endif
@@ -224,6 +248,7 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
 	/* if our RIV was cached already, no need to play with it */
 	if(RIVout->cached == lexicon){
+		/* return "success" the vector is already in cache and updated */
 		return 1;
 	}
 	#ifdef HASHCACHE
@@ -235,6 +260,7 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
 		/* push to cache instead of file */
 		lexicon->cache[hash] = RIVout;
 		lexicon->cache[hash]->cached = lexicon;
+		/* return "success" */
 		return 1;
 	/*if the current RIV is more frequent than the RIV holding its slot */
 	}
@@ -245,41 +271,54 @@ int cacheCheckOnPush(LEXICON* lexicon, denseRIV* RIVout){
 		lexicon->cache[hash] = RIVout;
 		lexicon->cache[hash]->cached = lexicon;
+		/* return "success" */
 		return 1;
 	}
 	return 0;
 	#endif /* HASHCACHE */
 	#ifdef SORTCACHE
-	denseRIV* *cache_slider = lexicon->cache;
-	while(*cache_slider){
-		if(RIVout->frequency > (*cache_slider)->frequency){
-			memcpy(cache_slider+1, cache_slider, CACHESIZE-(cache_slider-lexicon->cache));
-			if(lexicon->cache[CACHESIZE]){
-				fLexPush(lexicon, lexicon->cache[CACHESIZE]);
+	/* if the cache is not yet full, append this vector to the accumulating list */
-				//remove tree element
+	if (lexicon->cacheSaturation < CACHESIZE){
-				treecut(lexicon->treeRoot, RIVout->name);
-				lexicon->cache[CACHESIZE] = NULL;
-			}
 		RIVout->cached = lexicon;
-			*cache_slider = RIVout;
+		lexicon->cache[lexicon->cacheSaturation] = RIVout;
-			//add tree element
+		treeInsert(lexicon->treeRoot, RIVout->name, RIVout);	
-			RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
+		lexicon->cacheSaturation = lexicon->cacheSaturation+1;
+		/* return "success" */
 		return 1;
-		}
+	}else{ /* if cache is full */
-		cache_slider++;
-	}
-	if(cache_slider-lexicon->cache < CACHESIZE){
 		RIVout->cached = lexicon;
-		*cache_slider = RIVout;
+		denseRIV* toCheck = RIVout;
-		RIVinsert(lexicon->treeRoot, RIVout->name, RIVout);
+		denseRIV* temp;
-		//add tree element
+		while(1){
+			if(lexicon->cache_slider == lexicon->cache){
+				lexicon->cache_slider += CACHESIZE;
+			}		
+			(lexicon->cache_slider)--;
+			if(toCheck->frequency > (*lexicon->cache_slider)->frequency){
+				temp = (*lexicon->cache_slider);
+				(*lexicon->cache_slider) = toCheck;
+				toCheck = temp;
+			}else{
+				if(toCheck == RIVout){
+					return 0;
+				}else{
+					treecut(lexicon->treeRoot, toCheck->name);
+					fLexPush(lexicon, toCheck);
+					treeInsert(lexicon->treeRoot, RIVout->name, RIVout);
 					return 1;
 				}
+				break;
+			}			
+		}
+	}
+	/* return "failure" */
 	return 0;
 	#endif /* SORTCACHE */
 }
@@ -309,6 +348,10 @@ denseRIV* lexPull(LEXICON* lexicon, char* word){
 		/* pull data from file */
 		output = fLexPull(lexWord);
+		if(!output){
+			return NULL;
+		}
+		/* record the "name" of the vector, as the word */
 		strcpy(output->name, word);
 		fclose(lexWord);
 	}else{
@@ -317,13 +360,14 @@ denseRIV* lexPull(LEXICON* lexicon, char* word){
 			/*if file does not exist, return a 0 vector (word is new to the lexicon) */
 			output = calloc(1, sizeof(denseRIV));
+			/* record the "name" of the vector, as the word */
 			strcpy(output->name, word);
-		}
+		}else{
 			/*if lexicon is set to exclusive, will return a NULL pointer instead of a 0 vector */
+			return NULL;
 		}
+	}
 	return output;
 }
@@ -340,24 +384,39 @@ int lexPush(LEXICON* lexicon, denseRIV* RIVout){
 	#endif
+	if(lexicon->flags & WRITEFLAG){
 		/* push to the lexicon */
 		return fLexPush(lexicon, RIVout);
+	}else{
+		/* free and return */
+		free(RIVout);
+		return 0;
+	}
 }
 int saturationForStaging(denseRIV* output){
-	/* key/value pairs will be loaded to a worst-case sized temporary slot */
+	/* IOstagingSlot is a reserved block of heap memory used for this (and other)
+	 * purposes. in this function, all of the metadata to be written along with a
+	 * sparse representation of the vector, will be laid into the IOstagingSlot
+	 * in the necessary format for writing and reading again */	
 	int* count = IOstagingSlot;
+	/* count, requires an 8 byte slot for reasons of compatibility between 
+	 * dense and sparse. it takes up two integers (int* count and count+1); */
 	*count = 0;
 	*(count+1) = 0;
 	*(count+2) = output->frequency;
 	*(count+3) = output->contextSize;
+	/* TODO fix this to allow magnitude to be changed to double easily */
 	*(float*)(count+4) = output->magnitude;
+	/* locations will be laid in immediately after the metadata */
 	int* locations = IOstagingSlot+5;
+	/* values will be laid in *before* metadata, to be copied after locations,
+	 * once the size of the values and locations arrays are known.  there is,
+	 * by description of the stagingSlot, enough room for a 
+	 * completely saturated vector without conflict */
 	int* values = IOstagingSlot-RIVSIZE;;
 	int* locations_slider = locations;
 	int* values_slider = values;
@@ -380,6 +439,7 @@ int saturationForStaging(denseRIV* output){
 	/* copy values into slot immediately after locations */
 	memcpy(locations+*count, values, (*count)*sizeof(int));
+	/* return number of non-zeros */
 	return *count;
 }
 int fLexPush(LEXICON* lexicon, denseRIV* output){	
@@ -389,29 +449,39 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
 	 * in a file named after the word itself */
 	sprintf(pathString, "%s/%s", lexicon->lexName, output->name);
+	/* saturationForStaging returns the number of non-zero elements in the vector
+	 * and, in the process, places the data of the vector, in sparse format, in the
+	 * preallocated "IOstagingSlot" */
 	int saturation = saturationForStaging(output);
+	/* if our vector is less than half full, it is lighter to save it as a sparseRIV */
 	if( saturation < RIVSIZE/2){
 		FILE *lexWord = fopen(pathString, "wb");
 		if(!lexWord){
-			printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
+			fprintf(stderr,"lexicon push has failed for word: %s\n", output->name);
 			return 1;
 		}
+		/* IOstagingSlot is formatted for immediate writing */
 		fwrite(IOstagingSlot, (saturation*2)+5, sizeof(int), lexWord);
 		fclose(lexWord);
 	}else{
+		/* the "cached" datapoint will be erased, a typecheck flag (0) for
+		 * the fLexPull function to know that this is a denseVector put 
+		 * in its place */
 		output->cached = 0;
 		FILE *lexWord = fopen(pathString, "wb");
 		if(!lexWord){
-			printf("lexicon push has failed for word: %s\nconsider cleaning inputs", output->name);
+			fprintf(stderr, "lexicon push has failed for word: %s\n", output->name);
 			return 1;
 		}
+		/* from the type flag forward, all metadata is preformatted, we simpy write */
 		fwrite(((int*)&output->cached), sizeof(int), RIVSIZE+5, lexWord);
 		fclose(lexWord);
 	}
+	/* and free the memory */
 	free(output);
 	return 0;
@@ -420,91 +490,85 @@ int fLexPush(LEXICON* lexicon, denseRIV* output){
 denseRIV* fLexPull(FILE* lexWord){
 	denseRIV *output = calloc(1,sizeof(denseRIV));
 	size_t typeCheck;
-	/* get metadata for vector */
+	/* the first 8 byte value in the file will be either 0 (indicating storage as a dense vector)
+	 * or a positive number, the number of values in a sparse-vector */
 	if(!fread(&typeCheck, 1, sizeof(size_t), lexWord)){
 		return NULL;
 	}
-	int flag = 0;
 	/* first value stored is the value count if sparse, and 0 if dense */
-	if (typeCheck){
+	if (typeCheck){ /* pull as sparseVector */
-		/* pull as sparseVector */
-		/*sparseRIV* temp = (sparseRIV*) (IOstagingSlot-(sizeof(sparseRIV)/sizeof(int)-IODISPLACEMENT));
+		/*create a sparseVector pointer, pointing to a prealloccated slot */
+		sparseRIV* temp = (sparseRIV*)RIVKey.h_tempBlock;
+		/* typecheck, non-zero, is the number of values in our vector */
 		temp->count = typeCheck;
-		temp->locations = IOstagingSlot+5;
+		/* locations slot comes immediately after the magnitude */
+		temp->locations = (int*)&(temp->magnitude) + 1;
+		/* and values slot comes immediately after locations */
 		temp->values = temp->locations+temp->count;		
 		if (fread(&(temp->frequency), sizeof(int), (typeCheck* 2)+3, lexWord) != typeCheck*2 + 3){
 			printf("vector read failure");
 			return NULL;
-		}*/
+		}
-		sparseRIV temp;
-		temp.count = typeCheck;
-		temp.locations = malloc(temp.count*2*sizeof(int));
-		temp.values = temp.locations+temp.count;
-		flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
-		flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
-		flag+= fread(&output->magnitude, 1, sizeof(float), lexWord);
-		flag += fread(temp.locations, temp.count, sizeof(int), lexWord);
-		flag+= fread(temp.values, temp.count, sizeof(int), lexWord);
+		/* add our temporary sparseVector to the empty denseVector, for output */
+		addS2D(output->values, *temp);
+	}else{ /* typecheck is thrown away, just a flag in this case */
-		addS2D(output->values, temp);
+		/*  read into our denseVector pre-formatted to fit */
-	}else{
+		if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
-		/* typecheck is thrown away, just a flag in this case */
-		flag+= fread(&output->frequency, 1, sizeof(int), lexWord);
-		flag += fread(&output->contextSize, 1, sizeof(unsigned int), lexWord);
-		flag +=fread(&output->magnitude, 1, sizeof(float), lexWord);
-		/*if(fread(&output->frequency, sizeof(int), RIVSIZE+3, lexWord) != RIVSIZE+3){
 			printf("vector read failure");
 			return NULL;
-		}*/
 		}
+	}
-	output->cached = 0;
 	return output;
 }
+/* if our data is cached, it cannot be allowed to be lost in event of an issue */
 void signalSecure(int signum, siginfo_t *si, void* arg){
+	/* descend linked list */
 	while(rootCache){
+		/* dumping all caches contained */
 		if(cacheDump(rootCache->cache)){
-			puts("cache dump failed, some lexicon data lost");
+			fprintf(stderr, "cache dump failed, some lexicon data lost");
 		}
 		rootCache = rootCache->next;
 	}
+	/* end with normal behavior of error */
 	signal(signum, SIG_DFL);
 	kill(getpid(), signum);
 }
 int cacheDump(denseRIV* *toDump){
+	/* flag will record if there are any errors and alert */
 	int flag = 0;
+	/* iterate through the elements of our cache */
 	denseRIV* *toDump_slider = toDump;
-	#ifdef HASHCACHE
 	denseRIV* *toDump_stop = toDump+CACHESIZE;
 	while(toDump_slider<toDump_stop){
+		#ifdef HASHCACHE
+		/* if our cache is hashed, there may be null vectors to be skipped */
 		if(*toDump_slider){
 			flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
 		}
-		toDump_slider++;
+		#else /* HASHCAVHE */
-	}
-	#else
 		#ifdef SORTCACHE
-	while(*toDump_slider){
+		/* if our cache is sorted, a null vector represents the end of the cache */
+		if(!*toDump_slider)break;
 		flag += fLexPush((LEXICON*)(*toDump_slider)->cached,*toDump_slider);
+		#endif /* SORTCACHE */
+		#endif
 		toDump_slider++;
 	}
-	#endif
-	#endif
 	free(toDump);
 	return flag;
 }
-#endif
+#endif /* RIV_LEXICON_H */
--- a/RIVlexicon.h.gch
+++ b/RIVlexicon.h.gch
--- a/RIVread.c
+++ b/RIVread.c
@@ -6,10 +6,11 @@
 #include <dirent.h>
 #include <error.h>
 #include <string.h>
-//#define HASHCACHE
 #define RIVSIZE 50000
 #define NONZEROS 4
-#define CACHESIZE 27000
+#define CACHESIZE 25000
+#define SORTCACHE
 #include "RIVtools.h"
 //this program reads a directory full of files, and adds all context vectors (considering file as context)
@@ -20,11 +21,11 @@ void addContext(denseRIV* lexRIV, sparseRIV context);
 void directoryGrind(char *rootString);
 void lineGrind(char* textLine);
 LEXICON* lp;
-//int COUNTY = 0;
+int COUNTY = 0;
 int main(int argc, char *argv[]){
 	char pathString[1000];
-	lp = lexOpen("lexicon", "rw");
+	lp = lexOpen("lexiconshitty", "r");
 	//we open the lexicon, if it does not yet exist, it will be created
@@ -33,7 +34,6 @@ int main(int argc, char *argv[]){
 	strcpy(pathString, argv[1]);
 	strcat(pathString, "/");
 	//ensure that the targeted root directory exists
 	struct stat st;
 	if(stat(pathString, &st) == -1) {
 		printf("directory doesn't seem to exist");
@@ -79,8 +79,10 @@ void directoryGrind(char *rootString){
 		//open a file within root directory
 		FILE *input = fopen(pathString, "r");
 		if(input){
+			if(COUNTY++>1000) return;
 			//process this file and add it's data to lexicon
 			//fprintf(stderr, "***%d", COUNTY++);
 			fileGrind(input);
 			fclose(input);
@@ -133,7 +135,10 @@ void lineGrind(char* textLine){
 		//we pull the vector corresponding to each word from the lexicon
 		//if it's a new word, lexPull returns a 0 vector
 		lexiconRIV= lexPull(lp, word);
+		if(!lexiconRIV){
+			printf("Fuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuucked");
+			continue;
+		}
 		//we add the context of this file to this wordVector
 		addContext(lexiconRIV, contextVector);
@@ -150,20 +155,13 @@ void lineGrind(char* textLine){
 	}
 	//free the heap allocated context vector data
 	free(contextVector.locations);
 }
 void addContext(denseRIV* lexRIV, sparseRIV context){
 		//add context to the lexRIV, (using sparse-dense vector comparison)
-		addS2D(lexRIV->values, context);
+		sparseRIV thing = context;
+		addS2D(lexRIV->values, thing);
 		//log the "size" of the vector which was added
 		//this is not directly necessary, but is useful metadata for some analyses

--- a/runscriptUb.sh
+++ b/runscriptUb.sh
-clean(){
-	while [ "$1" ]; do
-		./RIVread "$1"
-		shift
-	done
-}
-clean ../bookCleaner/cleanbooks/*
--- a/saturation.c
+++ b/saturation.c
-#include <stdio.h>
-#include <stdlib.h>
-#include <dirent.h>
-#include <time.h>
-#include "RIVtoolsCPUlinux.h"
-void directoryToL2s(char *rootString);
-int main(){
-	RIVInit();
-	char rootString[] = "lexicon/";
-	directoryToL2s(rootString);
-}
-void directoryToL2s(char *rootString){
-	sparseRIV fileRIV;
-	char pathString[2000];
-	DIR *directory;
-    struct dirent *files = 0;
-	if(!(directory = opendir(rootString))){
-		printf("location not found, %s\n", rootString);
-		return;
-	}
-	while((files=readdir(directory))){
-		if(*(files->d_name) == '.') continue;
-		if(files->d_type == DT_DIR){
-			strcpy(pathString, rootString);
-			strcat(pathString, files->d_name);
-			strcat(pathString, "/");
-			directoryToL2s(pathString);
-		}
-		strcpy(pathString, rootString);
-		strcat(pathString, files->d_name);
-		FILE *input = fopen(pathString, "r");
-		if(!input){
-			printf("file %s doesn't seem to exist, breaking out of loop", pathString);
-			return;
-		}else{
-			denseRIV temp = lexPull(pathString);
-			fileRIV = consolidateD2S(temp.values);
-			strcpy(fileRIV.name, pathString);
-			float count = fileRIV.count;
-			printf("%s, saturation: %f\n", fileRIV.name, count);
-			fclose(input);
-			free(temp.values);
-			//free(fileRIV.locations);
-		}
-	}
-}
--- a/someshit.c
+++ b/someshit.c
+#include <stdio.h>
+#include "RIVaccessories.h"
+#include <time.h>
+/* Interactive lookup loop: builds the stem tree, then reads whitespace
+ * separated words from stdin. For each word it prints the string that
+ * treeSearch returns (or "no entry") followed by the CPU time the lookup
+ * took. Returns 1 if the tree could not be built, 0 once stdin is exhausted.
+ */
+int main(){
+	struct treenode* root = stemTreeSetup();
+	if(!root){
+		/* stemTreeSetup returns 0 when it cannot open its word file */
+		return 1;
+	}
+	char word[100];
+	char* stem;
+	clock_t start, end;
+	puts("tree ready");
+	/* %99s keeps the read inside word[]; stopping when scanf fails ends the
+	 * loop on EOF instead of re-searching a stale buffer forever */
+	while(scanf("%99s", word) == 1){
+		start = clock();
+		stem = treeSearch(root, word);
+		end = clock();
+		if(stem){
+			puts(stem);
+		}else{
+			puts("no entry");
+		}
+		printf("took: %lf\n", (double)(end-start)/CLOCKS_PER_SEC);
+	}
+	return 0;
+}
--- a/stemconfig/dbtools.py
+++ b/stemconfig/dbtools.py
+import pymongo
+from pymongo import MongoClient
+def dbSetup():
+    """Connect to the rivwordnet database and return its ``stems`` collection.
+
+    An index on the "from" field is requested on every call before the
+    collection is handed back.
+    """
+    # NOTE(review): the connection URI embeds a username and password in
+    # source control -- consider loading it from configuration instead.
+    stems = MongoClient("mongodb://etcart:Argelfraster1@ds261969.mlab.com:61969/rivwordnet").rivwordnet.stems
+    stems.create_index("from")
+    return stems
+def dbPost(wordset, collection):
+    """Insert every key/value pair of ``wordset`` into ``collection``.
+
+    Each pair becomes one document of the form {"from": key, "to": value};
+    all documents are sent in a single ``insert_many`` call. An empty
+    ``wordset`` is a no-op, so ``insert_many`` is never called with an
+    empty list.
+    """
+    if not wordset:
+        return
+    # .items() exists on both Python 2 and Python 3 dicts, whereas
+    # .iteritems() is Python 2 only.
+    posts = [{"from": key, "to": value} for key, value in wordset.items()]
+    collection.insert_many(posts)
+def cleanDbSetup():
+    """Connect to the rivetcleandocs database and return its ``cleaned`` collection.
+
+    An index on the "file" field is requested on every call before the
+    collection is handed back.
+    """
+    # NOTE(review): the connection URI embeds a username and password in
+    # source control -- consider loading it from configuration instead.
+    cleaned = MongoClient("mongodb://etcart:Argelfraster1@ds163119.mlab.com:63119/rivetcleandocs").rivetcleandocs.cleaned
+    cleaned.create_index("file")
+    return cleaned
+def dbPostCleaned(text, file, collection):
+    """Store one document {"text": text, "file": file} in ``collection``.
+
+    Nothing is inserted when ``text`` has length zero.
+    """
+    if len(text) == 0:
+        return
+    collection.insert_one({"text": text, "file": file})
+def dbGet(words, collection):
+    """Look up the "to" value stored for ``words`` in ``collection``.
+
+    Returns the "to" field of the first document whose "from" field equals
+    ``words``, or 0 when no such document exists.
+    """
+    # NOTE(review): the previous body read two names that were never
+    # assigned, so it always raised NameError. The query below is
+    # reconstructed from the {"from": ..., "to": ...} documents that dbPost
+    # writes -- confirm this is the intended lookup.
+    mebeword = collection.find_one({"from": words})
+    if mebeword:
+        return mebeword["to"]
+    return 0
\ No newline at end of file
--- a/stemconfig/dbtools.pyc
+++ b/stemconfig/dbtools.pyc
--- a/stemconfig/stemconf
+++ b/stemconfig/stemconf
--- a/stemconfig/stemconf.c
+++ b/stemconfig/stemconf.c
+#include <stdio.h>
+#include "../RIVaccessories.h"
+int configInsert(struct treenode* node, char* letter, int treeSize);
+int stemTreeConfig();
+/* Prints the value returned by stemTreeConfig() to stdout as a bare
+ * decimal number, with no trailing newline. */
+int main(){
+	printf("%d", stemTreeConfig());
+}
+/* Walks the path spelled by `letter` below `node`, creating any missing
+ * child nodes, and increments `downstream` on every node it visits
+ * (including `node` itself).
+ *
+ * node:     node to start from; must not be NULL
+ * letter:   NUL-terminated word; each character selects links[c - 'a']
+ * treeSize: running node count so far
+ *
+ * Returns treeSize plus the number of nodes that had to be created.
+ * The walk stops early at the first character that does not map into
+ * links[] or if a node cannot be allocated; the count reached so far is
+ * still returned.
+ */
+int configInsert(struct treenode* node, char* letter, int treeSize){
+	const int fanout = (int)(sizeof node->links / sizeof node->links[0]);
+	node->downstream++;
+	for(; *letter; letter++){
+		int slot = *letter - 'a';
+		if(slot < 0 || slot >= fanout){
+			/* anything outside 'a'..'z' would index past links[] */
+			break;
+		}
+		if(!node->links[slot]){
+			struct treenode* fresh = calloc(1, sizeof(struct treenode));
+			if(!fresh){
+				break;
+			}
+			node->links[slot] = fresh;
+			treeSize++;
+		}
+		node = node->links[slot];
+		node->downstream++;
+	}
+	return treeSize;
+}
+/* Releases a tree built by configInsert, children before parent. */
+static void configTreeFree(struct treenode* node){
+	if(!node){
+		return;
+	}
+	for(size_t i = 0; i < sizeof node->links / sizeof node->links[0]; i++){
+		configTreeFree(node->links[i]);
+	}
+	free(node);
+}
+/* Counts how many tree nodes are needed to hold every word in
+ * "wordset.txt" (opened relative to the working directory).
+ *
+ * Each whitespace-separated word is inserted into a scratch tree with
+ * configInsert; the scratch tree is freed before returning.
+ *
+ * Returns the node count (the root counts as 1), or 0 if the word file
+ * cannot be opened or the root node cannot be allocated.
+ */
+int stemTreeConfig(){
+	FILE* wordFile = fopen("wordset.txt", "r");
+	if(!wordFile){
+		printf("no wordnet file");
+		return 0;
+	}
+	struct treenode* rootNode = calloc(1, sizeof(struct treenode));
+	if(!rootNode){
+		fclose(wordFile);
+		return 0;
+	}
+	int treeSize = 1;
+	char word[100];
+	/* fscanf returns EOF (non-zero) at end of input, so compare against 1
+	 * rather than testing truthiness; %99s keeps the read inside word[].
+	 * The stem buffer is not needed to count nodes, so it is not walked. */
+	while(fscanf(wordFile, "%99s", word) == 1){
+		treeSize = configInsert(rootNode, word, treeSize);
+	}
+	fclose(wordFile);
+	configTreeFree(rootNode);
+	return treeSize;
+}
--- a/stemconfig/stemconf.o
+++ b/stemconfig/stemconf.o
--- a/stemconfig/stemconfig
+++ b/stemconfig/stemconfig
--- a/stemconfig/stemconfig.c
+++ b/stemconfig/stemconfig.c
+#include <stdio.h>
+#include "../RIVaccessories.h"
+/* Prints the value returned by stemTreeConfig() to stdout as a bare
+ * decimal number, with no trailing newline. */
+int main(){
+	printf("%d", stemTreeConfig());
+}
--- a/stemconfig/stemconfig.py
+++ b/stemconfig/stemconfig.py
+import dbtools
+from subprocess import call
+# Regenerates wordset.txt and stemset.h from the stems collection:
+#   1. dump every word to wordset.txt and every stem into the stemset[] string
+#   2. write stemset.h with a placeholder treesize so stemconf.c can be built
+#   3. build and run the helper, which prints the real tree size
+#   4. rewrite stemset.h with that size
+collection = dbtools.dbSetup()
+# Keyed by word, so a later document for the same word replaces an earlier one.
+wordToStem = {}
+for doc in collection.find():
+	wordToStem[doc["from"]] = doc["to"]
+# words[i] and stems[i] stay paired: both lists are filled in one pass.
+words = []
+stems = []
+for key, value in wordToStem.items():
+	words.append(key)
+	stems.append(value)
+with open("wordset.txt", "w") as wordFile:
+	wordFile.write(' '.join(words))
+header = 'char stemset[] = "' + ' '.join(stems) + ' ";' + '\nint treesize = '
+with open("stemset.h", "w") as stemFile:
+	stemFile.write(header + '0;')
+if call(["gcc", "stemconf.c", "-o", "stemconfig"]) != 0:
+	raise SystemExit("gcc failed to build stemconf.c")
+with open("tempfile.txt", "w") as tempfile:
+	call(["./stemconfig"], stdout=tempfile)
+with open("tempfile.txt", "r") as tempfile:
+	treesize = tempfile.read()
+# The helper prints a bare integer; anything else means it failed.
+if not treesize.strip().isdigit():
+	raise SystemExit("stemconfig did not report a tree size: %r" % treesize)
+with open("stemset.h", "w") as stemFile:
+	stemFile.write(header + treesize + ';')
--- a/stemconfig/stemset.h
+++ b/stemconfig/stemset.h
--- a/stemconfig/tempfile.txt
+++ b/stemconfig/tempfile.txt
+279920
\ No newline at end of file
--- a/stemconfig/wordset.txt
+++ b/stemconfig/wordset.txt
--- a/stemnet2.txt
+++ b/stemnet2.txt
--- a/treetest.c
+++ b/treetest.c
+#include <stdio.h>
+#include "RIVtools.h"
+/* Manual test driver for the stem tree. Alternates between two modes,
+ * reading whitespace-separated words from stdin:
+ *   - search mode: prints the treeSearch result for each word (or
+ *     "NULL return") until a word starting with '1' has been read;
+ *   - cut mode: passes each word to treecut until a word starting with
+ *     '0' has been read.
+ * Returns 1 if the tree could not be built, 0 once stdin is exhausted.
+ */
+int main(){
+	struct treenode* root = stemTreeSetup();
+	if(!root){
+		/* stemTreeSetup returns 0 when it cannot open its word file */
+		return 1;
+	}
+	/* start empty so the first mode test reads initialised memory */
+	char word[100] = "";
+	char* stem;
+	while(1){
+		/* NOTE(review): the word that ends each mode is itself passed to
+		 * treeSearch / treecut before the mode switches -- confirm those
+		 * functions tolerate digits. */
+		while(*word != '1'){
+			if(scanf("%99s", word) != 1){
+				return 0;
+			}
+			stem = treeSearch(root, word);
+			if(stem){
+				puts(stem);
+			}else{
+				puts("NULL return");
+			}
+		}
+		while(*word != '0'){
+			if(scanf("%99s", word) != 1){
+				return 0;
+			}
+			treecut(root, word);
+		}
+	}
+	return 0;
+}