Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
etcart
/
RIVet
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
7c37cc43
authored
Feb 11, 2018
by
Ethan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated and commented
parent
41cdd603
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
672 additions
and
249 deletions
RIVLower.h
RIVcullCPUlinux.c
RIVread.c
RIVtoolsCPUlinux.h
RIVtoolsCPU.h → RIVtoolsCPUwindows.h
RIVLower.h
0 → 100644
View file @
7c37cc43
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* RIVSIZE defines the dimensionality of the RIVs we will use.
 * 25000 is the standard; because of the #ifndef guard it can be overridden
 * by defining RIVSIZE before this header is included.
 */
#ifndef RIVSIZE
#define RIVSIZE 25000
#endif
/* NONZEROS defines the number of non-zero values that will be generated
 * for any level one (barcode) RIV. 2 is simple and lightweight to begin.
 * RIVinit rounds an odd value up to the next even number.
 */
#ifndef NONZEROS
#define NONZEROS 2
#endif
/* CACHESIZE defines the number of RIVs the system will cache.
 * A larger cache means more memory consumption, but will also be
 * significantly faster in aggregation and reading applications. It doesn't
 * affect systems that do not use lexPull/lexPush.
 */
#ifndef CACHESIZE
#define CACHESIZE 20
#endif
/* sparseRIV: a RIV form optimized for vectors that are mostly 0s.
 * As that is the common case it is advisable as the default, unless we are
 * doing long term RIV aggregation.
 * It is stored as two parallel arrays: locations[i] is an index into the
 * dense vector and values[i] is the value held at that index.
 */
typedef struct {
    char name[100];          /* label for this RIV (callers store a path or word here) */
    int *values;             /* non-zero values, paired by index with locations */
    int *locations;          /* dense-vector indices of those values */
    size_t count;            /* number of location/value pairs */
    unsigned int frequency;  /* occurrence count; presumably words per source - TODO confirm */
    float magnitude;         /* vector magnitude, filled in by getMagnitudes */
    int boolean;             /* general-purpose flag; tested by cosineCompare callers */
} sparseRIV;
/* denseRIV: a RIV form optimized for overwhelmingly non-0 vectors.
 * This is rarely the case, but its primary use is vector math, since
 * comparisons and arithmetic between vectors are ideally performed between
 * a sparse and a dense operand (hetero-arithmetic).
 */
typedef struct {
    char name[100];   /* label for this RIV (the word it represents) */
    int *values;      /* dense value array, one int per dimension */
    int *frequency;   /* pointer to the occurrence count; NOTE(review): lexPull appears
                       * to point this at the extra slot past the values - confirm */
    float magnitude;  /* stored vector magnitude */
} denseRIV;
/* RIVKey holds globally important data that should not be changed partway
 * through a program. The first function call in the program should always be:
 *     RIVinit();
 * which sets these variables, checks for incompatible choices, and opens up
 * memory blocks which the system will use in the background.
 *
 * The storage-class specifier is written first: placing `static` after the
 * struct specifier is legal but obsolescent C.
 */
static struct RIVData {
    size_t RIVsize;      /* dimensionality of every RIV (from RIVSIZE) */
    int nonZeros;        /* non-zero entries per barcode RIV (from NONZEROS, forced even) */
    int *h_tempBlock;    /* scratch buffer shared by the conversion routines */
    int tempSize;        /* size of h_tempBlock, counted in ints */
    int thing;           /* general-purpose counter, reset to 0 by RIVinit */
    denseRIV *RIVCache;  /* array of cacheSize cached dense RIVs */
    int cacheSize;       /* number of slots in RIVCache (from CACHESIZE) */
} RIVKey;
/* RIVinit should be the first function called in any usage of this library.
 * It sets global variables that practically all functions will reference,
 * checks that your base parameters are valid, and allocates memory for
 * the functions to use, so that we can move fast with rare allocations.
 * #TODO add signal redefinitions so that cache is saved even on crash
 */
void RIVinit();

/* RIVCleanup should always be called to close a RIV program. It frees
 * blocks allocated by RIVinit, and dumps the cached data to appropriate
 * lexicon files.
 */
void RIVCleanup();

/* consolidateD2S takes a dense value-set input, and returns a sparseRIV with
 * all 0s removed. It does not automatically carry metadata, which must be
 * assigned to the result after the fact. Often dense RIVs are only
 * temporary, and don't need to carry metadata.
 */
sparseRIV consolidateD2S(int *denseInput);
//#TODO fix int*/denseRIV confusion

/* mapS2D expands a sparseRIV out to dense values, filling array locations
 * based on location-value pairs.
 */
int *mapS2D(int *destination, sparseRIV input);
//#TODO fix int*/denseRIV confusion

/* makeSparseLocations must be called repeatedly in the processing of a
 * file to produce a series of locations from the words of the file.
 * This produces an "implicit" RIV which can be used with the mapI2D function
 * to create a dense RIV.
 * (The name previously read "makesparseLocations" here, which did not match
 * the definition or its callers.)
 */
void makeSparseLocations(unsigned char *word, int *seeds, size_t seedCount);

/* fLexPush pushes the data contained in a denseRIV out to a lexicon file,
 * saving it for long-term aggregation. It is called by "lexPush", which is
 * what users should actually use. lexPush, unlike fLexPush, has cache logic
 * under the hood for speed and hard drive optimization.
 */
int fLexPush(denseRIV RIVout);

/* wordtoSeed folds the bytes of a word into an integer seed */
int wordtoSeed(unsigned char *word);

/* mapI2D maps an "implicit RIV", that is, an array of index values arranged
 * by chronological order of generation (as per makeSparseLocations).
 * In the process of mapping it assigns values according to that ordering.
 */
int *mapI2D(int *locations, size_t seedCount);
/* begin definitions */
/* mapS2D: expand a sparseRIV into the caller-supplied dense array.
 * destination must have room for RIVKey.RIVsize ints; it is zeroed and then
 * each value is stored at its paired location. Returns destination.
 */
int *mapS2D(int *destination, sparseRIV input){
    // #TODO fix destination parameter vs calloc of destination

    /* start from an all-zero vector */
    memset(destination, 0, RIVKey.RIVsize * sizeof(int));

    /* scatter every value to the index named by its paired location */
    for (size_t pair = 0; pair < input.count; pair++) {
        destination[input.locations[pair]] = input.values[pair];
    }

    return destination;
}
/* mapI2D: build a dense vector from an "implicit" RIV.
 * locations holds valueCount indices in generation order; indices at even
 * positions contribute +1 and indices at odd positions contribute -1.
 *
 * Returns a freshly calloc'd array of RIVKey.RIVsize ints that the caller
 * must free, or NULL if the allocation fails.
 */
int *mapI2D(int *locations, size_t valueCount){
    // #TODO fix destination parameter vs calloc of destination
    int *destination = (int*)calloc(RIVKey.RIVsize, sizeof(int));
    if (!destination) {
        return NULL;
    }

    /* apply values +1 or -1 at an index based on locations, a pair at a time */
    size_t i = 0;
    while (i + 1 < valueCount) {
        destination[locations[i]] += 1;
        destination[locations[i + 1]] -= 1;
        i += 2;
    }

    /* an odd count leaves one unpaired location: give it its +1 instead of
     * reading a partner index from past the end of the array */
    if (i < valueCount) {
        destination[locations[i]] += 1;
    }

    return destination;
}
/* consolidateD2S: compress a dense value array into a sparseRIV holding only
 * its non-zero entries.
 *
 * denseInput must hold RIVKey.RIVsize ints. RIVKey.h_tempBlock is used as
 * scratch space: its first RIVsize ints collect locations and the next
 * RIVsize ints collect values.
 *
 * The returned locations and values share ONE allocation (values points
 * count ints into the locations block), so the owner releases both with a
 * single free(output.locations). All metadata fields are zeroed; callers set
 * name/frequency/boolean afterwards. If the allocation fails a message is
 * printed and an empty RIV (count 0, NULL pointers) is returned.
 */
sparseRIV consolidateD2S(int *denseInput){
    sparseRIV output = {0};

    /* key/value pairs will be loaded to a worst-case sized temporary slot */
    int *locations = RIVKey.h_tempBlock;
    int *values = RIVKey.h_tempBlock + RIVKey.RIVsize;

    size_t found = 0;
    for (size_t i = 0; i < RIVKey.RIVsize; i++) {
        /* act only on non-zeros */
        if (denseInput[i]) {
            locations[found] = (int)i;
            values[found] = denseInput[i];
            found++;
        }
    }

    /* a slot is opened for the locations/values pair; at least one element is
     * requested so that a NULL result always means failure, never "empty" */
    size_t slots = found ? found : 1;
    output.locations = (int*)malloc(slots * 2 * sizeof(int));
    if (!output.locations) {
        printf("memory allocation failed"); //*TODO enable fail point knowledge
        return output;
    }

    output.count = found;
    output.values = output.locations + found;

    /* copy locations, then values, into the opened slot */
    memcpy(output.locations, locations, found * sizeof(int));
    memcpy(output.values, values, found * sizeof(int));

    return output;
}
/* RIVinit: populate RIVKey from the RIVSIZE / NONZEROS / CACHESIZE macros and
 * allocate the scratch block and the dense-RIV cache.
 * An odd NONZEROS is rounded up to the next even number (with a notice).
 * If either allocation fails the program is terminated, since no other
 * routine in this library can work without these buffers.
 */
void RIVinit(){
    RIVKey.RIVsize = RIVSIZE; //#TODO decide about macros vs global variables
    RIVKey.nonZeros = NONZEROS;
    if (RIVKey.nonZeros % 2) {
        printf("your NONZEROS value must be an even number");
        RIVKey.nonZeros++;
        printf(", changed to %d", RIVKey.nonZeros);
    }

    /* open a slot at least large enough for worst case handling of
     * sparse to dense conversion. may be enlarged by filetoL2 functions */
    RIVKey.tempSize = 3 * RIVKey.RIVsize;
    RIVKey.h_tempBlock = (int*)malloc(3 * RIVKey.RIVsize * sizeof(int));
    if (!RIVKey.h_tempBlock) {
        fprintf(stderr, "RIVinit: could not allocate the temporary block\n");
        exit(EXIT_FAILURE);
    }

    RIVKey.thing = 0;
    RIVKey.cacheSize = CACHESIZE;

    /* open a slot for a cache of dense RIVs, optimized for frequent accesses.
     * With an empty cache nothing is allocated: RIVCleanup only frees the
     * cache when CACHESIZE > 0, so a zero-sized block would never be released */
    RIVKey.RIVCache = NULL;
    if (RIVKey.cacheSize > 0) {
        RIVKey.RIVCache = (denseRIV*)calloc(RIVKey.cacheSize, sizeof(denseRIV));
        if (!RIVKey.RIVCache) {
            fprintf(stderr, "RIVinit: could not allocate the RIV cache\n");
            free(RIVKey.h_tempBlock);
            RIVKey.h_tempBlock = NULL;
            exit(EXIT_FAILURE);
        }
    }
}
/* RIVCleanup: flush every occupied cache slot to its lexicon file via
 * fLexPush (which also releases that slot's values), then free the blocks
 * allocated by RIVinit.
 */
void RIVCleanup(){
    for (int i = 0; i < RIVKey.cacheSize; i++) {
        /* slots that were never filled are still zeroed from calloc: there is
         * nothing to write, and pushing them would only target "lexicon/" */
        if (RIVKey.RIVCache[i].name[0] == '\0' || !RIVKey.RIVCache[i].values) {
            continue;
        }
        fLexPush(RIVKey.RIVCache[i]);
    }
#if CACHESIZE > 0
    free(RIVKey.RIVCache);
    RIVKey.RIVCache = NULL;
#endif
    free(RIVKey.h_tempBlock);
    RIVKey.h_tempBlock = NULL;
}
/* wordtoSeed: fold the bytes of a NUL-terminated word into an integer seed.
 * Byte i is shifted left by 5*i bits and added to the running total.
 *
 * The arithmetic is done on unsigned int with the shift count reduced modulo
 * 32. Shifting an int by 32 or more (any word of 7+ characters) and
 * overflowing a signed sum are both undefined behaviour; the reduced count
 * reproduces what a 32-bit x86 shift instruction does with an oversized
 * count, so the result is now well defined.
 * NOTE(review): assumes a 32-bit int.
 */
int wordtoSeed(unsigned char* word){
    unsigned int seed = 0;
    unsigned int shift = 0;

    for (; *word; word++) {
        /* left-shift 5 more each time *should* make seeds unique to words */
        seed += (unsigned int)(*word) << (shift % 32u);
        shift += 5u;
    }

    return (int)seed;
}
/* makeSparseLocations: append the pseudo-random locations for one word to an
 * implicit RIV.
 * The generator is seeded from the word, so the same word always yields the
 * same locations. RIVKey.nonZeros indices (each below RIVKey.RIVsize) are
 * written starting at locations[count]; the caller must have room for them.
 */
void makeSparseLocations(unsigned char* word, int *locations, size_t count){
    int *cursor = locations + count;

    srand(wordtoSeed(word));

    /* two locations per pass; guaranteed to be an even number of steps */
    for (int written = 0; written < RIVKey.nonZeros; written += 2) {
        cursor[0] = rand() % RIVKey.RIVsize;
        cursor[1] = rand() % RIVKey.RIVsize;
        cursor += 2;
    }
}
/* fLexPush: write a denseRIV to the binary file "lexicon/<name>".
 * Layout: one int (the frequency), one float (the magnitude), then
 * RIVKey.RIVsize ints of values.
 *
 * Returns 0 on success and 1 if the RIV is empty, the file cannot be opened,
 * or any write (including the flush performed by fclose) comes up short.
 * Once the file has been opened RIVout.values is freed, whatever the
 * outcome of the writes.
 * NOTE(review): when the open fails the buffer is left untouched, as before;
 * confirm whether callers expect to keep ownership in that case.
 */
int fLexPush(denseRIV RIVout){
    char pathString[500] = {0};

    /* an unfilled cache slot has nothing to write */
    if (!RIVout.values || !RIVout.frequency) {
        return 1;
    }

    /* word data will be placed in a (new?) file under the lexicon directory
     * and named after the word itself */
    snprintf(pathString, sizeof pathString, "lexicon/%s", RIVout.name);
    FILE *lexWord = fopen(pathString, "wb");
    if (!lexWord) {
        printf("lexicon push has failed for word: %s\nconsider cleaning inputs", pathString);
        return 1;
    }

    int failed = 0;
    if (fwrite(RIVout.frequency, sizeof(int), 1, lexWord) != 1) {
        failed = 1;
    }
    if (fwrite(&RIVout.magnitude, sizeof(float), 1, lexWord) != 1) {
        failed = 1;
    }
    if (fwrite(RIVout.values, sizeof(int), RIVKey.RIVsize, lexWord) != RIVKey.RIVsize) {
        failed = 1;
    }
    /* fclose flushes buffered data, so its result is part of the write */
    if (fclose(lexWord) != 0) {
        failed = 1;
    }

    free(RIVout.values);
    return failed;
}
RIVcullCPUlinux.c
0 → 100644
View file @
7c37cc43
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
#include <time.h>
#define RIVSIZE 5000
#define CACHESIZE 0
#include "RIVtoolsCPUlinux.h"
#define THRESHOLD .80f
/* walks rootString recursively, appending one sparseRIV per readable file to
 * *fileRIVs and counting them in *fileCount */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount);

/* cosineCompare callback: reports pairs whose cosine reaches THRESHOLD */
int printname(float cosine, sparseRIV base, sparseRIV multiplier);
/* Entry point of the cull tool.
 * argv[1] names a directory; every file beneath it is converted to a
 * sparseRIV, magnitudes are computed, and each still-flagged RIV is compared
 * against the ones after it with printname as the reporting callback.
 * Timing for the comparison phase and for the whole run is printed.
 * Returns 1 when no usable directory argument is given, 0 otherwise.
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();
    int fileCount = 0;
    char rootString[2000];

    /* argv[0] is the program name, so a directory means argc >= 2 */
    if (argc < 2) {
        printf("give me a directory");
        return 1;
    }
    int rootLength = snprintf(rootString, sizeof rootString, "%s/", argv[1]);
    if (rootLength < 0 || (size_t)rootLength >= sizeof rootString) {
        printf("directory name is too long");
        return 1;
    }

    RIVinit();
    /* directoryToL2s grows this with realloc, which also accepts NULL */
    sparseRIV *fileRIVs = (sparseRIV*)malloc(1 * sizeof(sparseRIV));

    directoryToL2s(rootString, &fileRIVs, &fileCount);
    printf("fileCount: %d\n", fileCount);

    getMagnitudes(fileRIVs, fileCount);

    clock_t beginnsquared = clock();
    printf("got past magnitudes");
    for (int i = 0; i < fileCount; i++) {
        if (fileRIVs[i].boolean) {
            cosineCompare(fileRIVs[i], fileRIVs + i + 1, fileCount - (i + 1), printname);
        }
    }
    clock_t endnsquared = clock();

    double nsquaredTime = (double)(endnsquared - beginnsquared) / CLOCKS_PER_SEC;
    printf("nsquared time:%lf\n\n", nsquaredTime);
    printf("%d <", RIVKey.thing);

    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);

    free(fileRIVs);
    /* release the blocks RIVinit allocated */
    RIVCleanup();
    return 0;
}
/* directoryToL2s: recursively convert every file under rootString into a
 * sparseRIV.
 *
 * rootString must end in '/'. Entries whose name starts with '.' are
 * skipped. Sub-directories are descended into; every other entry is opened
 * for reading, converted with fileToL2Clean, named after its path and
 * appended to *fileRIVs (grown with realloc), incrementing *fileCount.
 * The walk of the current directory stops at the first file that cannot be
 * opened or when the array cannot be grown.
 */
void directoryToL2s(char *rootString, sparseRIV** fileRIVs, int *fileCount){
    char pathString[2000];
    DIR *directory;
    struct dirent *files = 0;

    if (!(directory = opendir(rootString))) {
        printf("location not found, %s\n", rootString);
        return;
    }

    while ((files = readdir(directory))) {
        if (*(files->d_name) == '.') continue;

        int isDirectory = (files->d_type == DT_DIR);
        int length = snprintf(pathString, sizeof pathString, "%s%s%s",
                              rootString, files->d_name, isDirectory ? "/" : "");
        if (length < 0 || (size_t)length >= sizeof pathString) {
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }

        if (isDirectory) {
            directoryToL2s(pathString, fileRIVs, fileCount);
            /* a directory is not itself a document: do not fall through and
             * open it as if it were a file */
            continue;
        }

        FILE *input = fopen(pathString, "r");
        if (!input) {
            printf("file %s doesn't seem to exist, breaking out of loop", pathString);
            break;
        }

        /* grow through a temporary so the existing array survives a failure */
        sparseRIV *grown = (sparseRIV*)realloc((*fileRIVs), ((*fileCount) + 1) * sizeof(sparseRIV));
        if (!grown) {
            printf("memory allocation failed");
            fclose(input);
            break;
        }
        (*fileRIVs) = grown;

        grown[(*fileCount)] = fileToL2Clean(input);
        /* name holds 100 bytes while paths may be far longer: truncate
         * instead of overrunning the field */
        snprintf(grown[(*fileCount)].name, sizeof grown[(*fileCount)].name, "%s", pathString);

        fclose(input);
        (*fileCount)++;
    }

    closedir(directory);
}
/* printname: cosineCompare callback.
 * When cosine reaches THRESHOLD it prints both names and the cosine and
 * bumps the RIVKey.thing counter. Always returns 0.
 */
int printname(float cosine, sparseRIV base, sparseRIV multiplier){
    /* written as a negated >= so that a NaN cosine is rejected too */
    if (!(cosine >= THRESHOLD)) {
        return 0;
    }

    printf("%s\t%s\n%f\n", base.name, multiplier.name, cosine);

    /* NOTE(review): multiplier is received by value, so this store only
     * changes the local copy; the caller's RIV keeps its flag */
    multiplier.boolean = 0;
    RIVKey.thing++;

    return 0;
}
RIVread.c
0 → 100644
View file @
7c37cc43
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define CACHESIZE 100
#include "RIVtoolsCPUlinux.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
#include <error.h>
/* reads one text file and folds its aggregate RIV into the lexicon entry of
 * each distinct clean word it contains */
void fileGrind(FILE* textFile);

/* adds a sparse RIV onto each of the first RIVCount dense RIVs of denseSet */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount);

/* returns 1 if word names one of the first wordCount entries of RIVSet */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount);

/* runs fileGrind on every file found beneath rootString */
void directoryGrind(char *rootString);
/* Entry point of the lexicon reader.
 * argv[1] names a directory whose files are ground into the lexicon. After
 * the run the total time and the occupied cache slots (name, frequency) are
 * printed, and RIVCleanup flushes the cache to disk.
 * Returns 1 when no usable directory argument is given, 0 otherwise.
 */
int main(int argc, char *argv[]){
    clock_t begintotal = clock();

    if (argc < 2) {
        printf("give me a directory");
        return 1;
    }

    /* RIVinit is the library's initializer; the setKeyData call that used to
     * be here passed no arguments to a three-argument function */
    RIVinit();

    char pathString[1000];
    int pathLength = snprintf(pathString, sizeof pathString, "%s/", argv[1]);
    if (pathLength < 0 || (size_t)pathLength >= sizeof pathString) {
        printf("directory name is too long");
        RIVCleanup();
        return 1;
    }

    directoryGrind(pathString);

    clock_t endtotal = clock();
    double time_spent = (double)(endtotal - begintotal) / CLOCKS_PER_SEC;
    printf("total time:%lf\n\n", time_spent);

    for (int i = 0; i < RIVKey.cacheSize; i++) {
        /* a slot that was never filled has no frequency to dereference */
        if (!RIVKey.RIVCache[i].frequency) {
            continue;
        }
        printf("%s, %d", RIVKey.RIVCache[i].name, *(RIVKey.RIVCache[i].frequency));
        printf("\n");
    }

    RIVCleanup();
    return 0;
}
/* addS2Ds: add one sparse RIV onto each of the first RIVCount dense RIVs in
 * denseSet. For every location/value pair of additive, the value is added
 * at that location in every dense vector in turn.
 */
void addS2Ds(denseRIV *denseSet, sparseRIV additive, int RIVCount){
    for (size_t pair = 0; pair < additive.count; pair++) {
        for (int target = 0; target < RIVCount; target++) {
            denseSet[target].values[additive.locations[pair]] += additive.values[pair];
        }
    }
}
/* checkDupe: linear search of the first wordCount entries of RIVSet.
 * Returns 1 if any entry's name equals word, 0 otherwise.
 */
int checkDupe(denseRIV* RIVSet, char* word, int wordCount){
    for (int entry = 0; entry < wordCount; entry++) {
        if (strcmp(word, RIVSet[entry].name) == 0) {
            return 1;
        }
    }
    return 0;
}
/* directoryGrind: recursively run fileGrind on every file under rootString.
 *
 * rootString must end in '/'. The "." and ".." entries are skipped,
 * sub-directories are descended into, and every other entry has its path
 * printed and is opened "r+"; entries that cannot be opened are silently
 * passed over.
 */
void directoryGrind(char *rootString){
    char pathString[2000];
    DIR *directory;
    struct dirent *files = 0;

    if (!(directory = opendir(rootString))) {
        printf("location not found, %s\n", rootString);
        return;
    }

    while ((files = readdir(directory))) {
        /* skipping with continue lets the loop condition re-check readdir's
         * result, which is NULL once the directory is exhausted */
        if (!strcmp(files->d_name, ".") || !strcmp(files->d_name, "..")) {
            continue;
        }

        int isDirectory = (files->d_type == DT_DIR);
        int length = snprintf(pathString, sizeof pathString, "%s%s%s",
                              rootString, files->d_name, isDirectory ? "/" : "");
        if (length < 0 || (size_t)length >= sizeof pathString) {
            printf("path too long, skipping %s%s\n", rootString, files->d_name);
            continue;
        }

        if (isDirectory) {
            directoryGrind(pathString);
            /* nothing to grind in the directory entry itself */
            continue;
        }

        printf("%s\n", pathString);
        FILE *input = fopen(pathString, "r+");
        if (input) {
            fileGrind(input);
            fclose(input);
        }
    }

    closedir(directory);
}
/* fileGrind: fold one text file into the lexicon.
 *
 * The file is first reduced to a single aggregate sparseRIV with
 * fileToL2Clean, then rewound and re-read word by word. Each distinct clean
 * word has its lexicon entry pulled (lexPull) and its frequency incremented;
 * the aggregate is then added to every pulled entry and each entry is pushed
 * back with lexPush.
 *
 * NOTE(review): a word that ends exactly at end-of-file (no trailing
 * whitespace) is still dropped by the feof check, matching the reader loop
 * in fileToL2Clean; confirm whether that is intended.
 */
void fileGrind(FILE* textFile){
    sparseRIV aggregateRIV = fileToL2Clean(textFile);
    fseek(textFile, 0, SEEK_SET);

    int wordCount = 0;
    /* the aggregate's frequency is used as the upper bound on distinct words */
    size_t capacity = aggregateRIV.frequency;
    denseRIV *RIVArray = (denseRIV*)malloc((capacity ? capacity : 1) * sizeof(denseRIV));
    if (!RIVArray) {
        printf("memory allocation failed");
        free(aggregateRIV.locations);
        return;
    }

    char word[200];
    /* == 1 also ends the loop on a read error, where fscanf returns EOF
     * without ever setting the end-of-file indicator */
    while (fscanf(textFile, "%99s", word) == 1) {
        if (feof(textFile)) break;
        if (!(*word)) continue;
        if (!isWordClean((char*)word)) {
            continue;
        }
        if (checkDupe(RIVArray, word, wordCount)) {
            continue;
        }
        /* never write past the slots that were allocated */
        if ((size_t)wordCount >= capacity) break;

        RIVArray[wordCount] = lexPull(word);
        if (!*((RIVArray[wordCount].name))) break;

        *(RIVArray[wordCount].frequency) += 1;
        //printf("%s, %d, %d\n", RIVArray[wordCount].name, *(RIVArray[wordCount].frequency), *thing);
        wordCount++;
    }
    //printf("%d\n", wordCount);

    addS2Ds(RIVArray, aggregateRIV, wordCount);

    for (int entry = 0; entry < wordCount; entry++) {
        lexPush(RIVArray[entry]);
    }

    free(RIVArray);
    /* consolidateD2S stores values inside the locations allocation, so this
     * one free releases both arrays; values itself must not be freed */
    free(aggregateRIV.locations);
}
RIVtoolsCPUlinux.h
View file @
7c37cc43
...
...
@@ -2,65 +2,50 @@
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "RIVLower.h"
#define SEEDMASK 25214903917
struct
RIVData
{
int
RIVsize
;
int
nonZeros
;
long
long
int
*
masks
;
int
*
h_tempBlock
;
int
*
h_stagingBlock
;
int
*
h_staging_slider
;
int
*
h_staging_stop
;
int
*
h_displacements
;
int
*
d_OpenSlot
;
int
*
d_SlotEnd
;
float
*
d_magnitudes
;
int
thing
;
}
RIVKey
;
typedef
struct
{
char
name
[
100
];
int
*
values
;
int
*
locations
;
int
count
;
int
frequency
;
float
magnitude
;
int
boolean
;
}
sparseRIV
;
typedef
struct
{
char
name
[
100
];
int
*
values
;
int
frequency
;
float
magnitude
;
}
denseRIV
;
void
cosineCompareUnbound
(
sparseRIV
baseRIV
,
sparseRIV
*
multipliers
,
size_t
multiplierCount
,
float
threshold
);
/*lexPush writes a denseRIV to a file of the same name, under the directory "lexicon"
* it is up to the programmer to ensure that the name of the RIV is a valid filename
* although it will of course attempt to create the file if it does not exist
*/
int
lexPush
(
denseRIV
RIVout
);
denseRIV
lexPull
(
int
*
valuesOut
,
char
*
word
);
/* lexPull reads an existing lexicon entry (under directory "lexicon")
* and creates a denseRIV with those attributes.
* if the file does not exist, it creates a 0 vector with the name of word
*/
denseRIV
lexPull
(
char
*
word
);
/*isWordClean filters words that contain non-letter characters, and
* upperCase letters, allowing only the '_' symbol through
*/
int
cacheHash
(
char
*
word
);
int
isWordClean
(
char
*
word
);
int
isLetter
(
char
c
);
sparseRIV
FileToL2
(
FILE
*
data
);
sparseRIV
FileToL2Clean
(
FILE
*
data
);
sparseRIV
consolidateD2S
(
int
*
denseInput
);
void
setKeyData
(
int
RIVsize
,
int
nonZeros
,
int
blockSize
);
int
*
mapS2D
(
int
*
destination
,
sparseRIV
input
);
int
*
makeSparseLocations
(
int
*
seeds
,
int
seedCount
);
void
makeSeeds
(
unsigned
char
*
word
,
int
**
seeds
,
int
*
seedCount
);
void
cosineCompare
(
sparseRIV
baseRIV
,
sparseRIV
*
multipliers
,
int
multiplierCount
,
float
threshold
);
void
getMagnitudes
(
sparseRIV
*
inputs
,
int
RIVCount
);
int
*
mapI2D
(
int
*
locations
,
int
seedCount
);
sparseRIV
text2L2
(
unsigned
char
*
text
);
unsigned
char
*
sscanAdvance
(
unsigned
char
**
string
,
unsigned
char
*
word
);
sparseRIV
FileToL2
(
FILE
*
data
){
unsigned
char
word
[
2000
]
=
{
0
};
int
*
seeds
=
RIVKey
.
h_tempBlock
;
/* fileToL2 takes an input file, reads words (delimiting on " " and "\n")
* and returns a sparse RIV which is the vector sum of the base RIVs of each
* word contained
*/
sparseRIV
fileToL2
(
FILE
*
input
);
/* fileToL2Clean operates the same as fileToL2 butkeeps only words
* containing lowercase letters and the '_' symbol
* this is important if you will be lexPush-ing those words later
*/
sparseRIV
fileToL2Clean
(
FILE
*
data
);
void
cosineCompare
(
sparseRIV
baseRIV
,
sparseRIV
*
multipliers
,
size_t
multiplierCount
,
int
(
*
action
)(
float
cosine
,
sparseRIV
base
,
sparseRIV
multiplier
));
void
getMagnitudes
(
sparseRIV
*
inputs
,
size_t
RIVCount
);
sparseRIV
text2L2
(
unsigned
char
*
text
);
//unused
unsigned
char
*
sscanAdvance
(
unsigned
char
**
string
,
unsigned
char
*
word
);
//unused
sparseRIV
fileToL2
(
FILE
*
data
){
unsigned
int
blockSize
;
int
seedCount
=
0
;
unsigned
char
word
[
100
]
=
{
0
};
int
*
locations
=
RIVKey
.
h_tempBlock
;
int
locationCount
=
0
;
while
(
fscanf
(
data
,
"%s"
,
word
)){
while
(
fscanf
(
data
,
"%
99
s"
,
word
)){
if
(
feof
(
data
)){
break
;
...
...
@@ -68,72 +53,75 @@ unsigned char *sscanAdvance(unsigned char **string, unsigned char *word);
if
(
!
(
*
word
)){
break
;
}
makeSeeds
(
word
,
&
seeds
,
&
seedCount
);
blockSize
=
locationCount
+
RIVKey
.
nonZeros
;
if
(
blockSize
>
RIVKey
.
tempSize
){
RIVKey
.
h_tempBlock
=
(
int
*
)
realloc
(
RIVKey
.
h_tempBlock
,
blockSize
*
sizeof
(
int
));
locations
=
RIVKey
.
h_tempBlock
;
RIVKey
.
tempSize
+=
RIVKey
.
nonZeros
;
}
int
*
locations
=
makeSparseLocations
(
seeds
,
seedCount
);
makeSparseLocations
(
word
,
locations
,
locationCount
);
locationCount
++
;
}
int
*
L2dense
;
L2dense
=
mapI2D
(
locations
,
seedCount
);
L2dense
=
mapI2D
(
locations
,
locationCount
);
sparseRIV
output
=
consolidateD2S
(
L2dense
);
free
(
L2dense
);
output
.
frequency
=
seed
Count
/
RIVKey
.
nonZeros
;
output
.
frequency
=
location
Count
/
RIVKey
.
nonZeros
;
output
.
boolean
=
1
;
return
output
;
}
sparseRIV
FileToL2Clean
(
FILE
*
data
){
sparseRIV
fileToL2Clean
(
FILE
*
data
){
unsigned
char
word
[
100
]
=
{
0
};
int
*
seeds
=
RIVKey
.
h_tempBlock
;
unsigned
char
word
[
100
]
=
{
0
};
int
*
locations
=
RIVKey
.
h_tempBlock
;
unsigned
int
blockSize
;
int
seed
Count
=
0
;
int
location
Count
=
0
;
while
(
fscanf
(
data
,
"%
100
s"
,
word
)){
while
(
fscanf
(
data
,
"%
99
s"
,
word
)){
if
(
feof
(
data
)){
break
;
}
if
(
!
(
*
word
)){
break
;
}
if
(
!
isWordClean
((
char
*
)
word
))
continue
;
makeSeeds
(
word
,
&
seeds
,
&
seedCount
);
blockSize
=
locationCount
+
RIVKey
.
nonZeros
;
if
(
blockSize
>
RIVKey
.
tempSize
){
RIVKey
.
h_tempBlock
=
(
int
*
)
realloc
(
RIVKey
.
h_tempBlock
,
blockSize
*
sizeof
(
int
));
locations
=
RIVKey
.
h_tempBlock
;
RIVKey
.
tempSize
+=
RIVKey
.
nonZeros
;
}
int
*
locations
=
makeSparseLocations
(
seeds
,
seedCount
);
makeSparseLocations
(
word
,
locations
,
locationCount
);
locationCount
+=
RIVKey
.
nonZeros
;
}
int
*
L2dense
;
L2dense
=
mapI2D
(
locations
,
seedCount
);
L2dense
=
mapI2D
(
locations
,
locationCount
);
sparseRIV
output
=
consolidateD2S
(
L2dense
);
free
(
L2dense
);
output
.
frequency
=
seed
Count
/
RIVKey
.
nonZeros
;
output
.
frequency
=
location
Count
/
RIVKey
.
nonZeros
;
output
.
boolean
=
1
;
return
output
;
}
void
cosineCompare
(
sparseRIV
baseRIV
,
sparseRIV
*
multipliers
,
int
multiplierCount
,
float
threshold
){
void
cosineCompareUnbound
(
sparseRIV
baseRIV
,
sparseRIV
*
multipliers
,
size_t
multiplierCount
,
float
threshold
){
int
*
baseDenseRIV
=
RIVKey
.
h_tempBlock
;
mapS2D
(
baseDenseRIV
,
baseRIV
);
float
cosSim
;
sparseRIV
*
multipliersStop
=
multipliers
+
multiplierCount
;
float
minsize
=
baseRIV
.
magnitude
*
.
75
;
float
maxsize
=
baseRIV
.
magnitude
*
1
.
25
;
while
(
multipliers
<
multipliersStop
){
if
((
(
*
multipliers
).
boolean
)
/* && (((*multipliers).magnitude < maxsize) && ((*multipliers).magnitude > minsize))*/
){
if
((
*
multipliers
).
boolean
){
int
dot
=
0
;
int
*
values
=
(
*
multipliers
).
values
;
int
*
locations
=
(
*
multipliers
).
locations
;
...
...
@@ -146,156 +134,68 @@ void cosineCompare(sparseRIV baseRIV, sparseRIV *multipliers, int multiplierCoun
values
++
;
}
cosSim
=
dot
/
((
baseRIV
.
magnitude
)
*
((
*
multipliers
).
magnitude
));
//
if(cosSim>=threshold){
printf
(
"
#######
%s
\t
%s
\n
%f
\n
"
,
(
*
multipliers
).
name
,
baseRIV
.
name
,
cosSim
);
if
(
cosSim
>=
threshold
){
printf
(
"%s
\t
%s
\n
%f
\n
"
,
(
*
multipliers
).
name
,
baseRIV
.
name
,
cosSim
);
(
*
multipliers
).
boolean
=
0
;
RIVKey
.
thing
++
;
scanf
(
"%d"
,
&
RIVKey
.
thing
);
//}
}
multipliers
++
;
}
}
void
getMagnitudes
(
sparseRIV
*
inputs
,
int
RIVCount
){
for
(
int
i
=
0
;
i
<
RIVCount
;
i
++
){
int
temp
=
0
;
int
*
values
=
inputs
[
i
].
values
;
int
*
values_stop
=
values
+
inputs
[
i
].
count
;
while
(
values
<
values_stop
){
temp
+=
(
*
values
)
*
(
*
values
);
values
++
;
}
float
magnitude
=
sqrt
(
temp
);
inputs
[
i
].
magnitude
=
magnitude
;
//printf("magnitude = %f, \n", magnitude);
}
}
int
*
mapS2D
(
int
*
destination
,
sparseRIV
input
){
memset
(
destination
,
0
,
RIVKey
.
RIVsize
*
sizeof
(
int
));
int
*
locations_slider
=
input
.
locations
;
int
*
values_slider
=
input
.
values
;
int
*
locations_stop
=
locations_slider
+
input
.
count
;
while
(
locations_slider
<
locations_stop
){
destination
[
*
locations_slider
]
=
*
values_slider
;
locations_slider
++
;
values_slider
++
;
multipliers
++
;
}
return
destination
;
}
int
*
mapI2D
(
int
*
locations
,
int
valueCount
){
int
*
destination
=
(
int
*
)
calloc
(
RIVKey
.
RIVsize
,
sizeof
(
int
));
int
*
locations_slider
=
locations
;
int
*
locations_stop
=
locations_slider
+
valueCount
;
int
value
=
1
;
while
(
locations_slider
<
locations_stop
){
destination
[
*
locations_slider
]
+=
value
;
locations_slider
++
;
value
=
(
value
==
1
)
?
-
1
:
1
;
}
return
destination
;
}
void
cosineCompare
(
sparseRIV
baseRIV
,
sparseRIV
*
multipliers
,
size_t
multiplierCount
,
int
(
*
action
)(
float
cosine
,
sparseRIV
base
,
sparseRIV
multiplier
)){
int
*
baseDenseRIV
=
RIVKey
.
h_tempBlock
;
mapS2D
(
baseDenseRIV
,
baseRIV
);
float
cosSim
;
sparseRIV
*
multipliersStop
=
multipliers
+
multiplierCount
;
float
minsize
=
baseRIV
.
magnitude
*
.
85
;
float
maxsize
=
baseRIV
.
magnitude
*
1
.
15
;
int
dot
=
0
;
int
*
values
;
int
*
locations
;
int
*
locations_Stop
;
while
(
multipliers
<
multipliersStop
){
if
(((
*
multipliers
).
boolean
)
&&
(((
*
multipliers
).
magnitude
<
maxsize
)
&&
((
*
multipliers
).
magnitude
>
minsize
))){
dot
=
0
;
values
=
(
*
multipliers
).
values
;
locations
=
(
*
multipliers
).
locations
;
locations_Stop
=
locations
+
(
*
multipliers
).
count
;
while
(
locations
<
locations_Stop
){
sparseRIV
consolidateD2S
(
int
*
denseInput
){
sparseRIV
output
;
output
.
count
=
0
;
int
*
locations
=
RIVKey
.
h_tempBlock
;
int
*
values
=
RIVKey
.
h_tempBlock
+
RIVKey
.
RIVsize
;
int
*
locations_slider
=
locations
;
int
*
values_slider
=
values
;
for
(
int
i
=
0
;
i
<
RIVKey
.
RIVsize
;
i
++
){
if
(
denseInput
[
i
]){
*
(
locations_slider
++
)
=
i
;
*
(
values_slider
++
)
=
denseInput
[
i
];
output
.
count
++
;
}
dot
+=
(
*
values
)
*
(
*
(
baseDenseRIV
+
(
*
locations
)));
locations
++
;
values
++
;
}
output
.
locations
=
(
int
*
)
malloc
(
output
.
count
*
sizeof
(
int
));
memcpy
(
output
.
locations
,
locations
,
output
.
count
*
sizeof
(
int
));
output
.
values
=
(
int
*
)
malloc
(
output
.
count
*
sizeof
(
int
));
memcpy
(
output
.
values
,
values
,
output
.
count
*
sizeof
(
int
));
return
output
;
}
cosSim
=
dot
/
((
baseRIV
.
magnitude
)
*
((
*
multipliers
).
magnitude
));
void
setKeyData
(
int
RIVsize
,
int
nonZeros
,
int
blockSize
){
RIVKey
.
RIVsize
=
RIVsize
;
if
(
nonZeros
%
2
){
printf
(
"your nonZeros must be an even number"
);
nonZeros
++
;
printf
(
", changed to %d"
,
nonZeros
);
action
(
cosSim
,
baseRIV
,
(
*
multipliers
));
}
RIVKey
.
nonZeros
=
nonZeros
;
RIVKey
.
masks
=
(
long
long
int
*
)
malloc
(
nonZeros
*
sizeof
(
long
long
int
));
multipliers
++
;
for
(
int
i
=
0
;
i
<
nonZeros
;
i
++
){
RIVKey
.
masks
[
i
]
=
SEEDMASK
>>
(
5
*
i
);
}
RIVKey
.
h_tempBlock
=
(
int
*
)
malloc
(
blockSize
*
sizeof
(
int
));
RIVKey
.
h_stagingBlock
=
(
int
*
)
malloc
(
blockSize
*
sizeof
(
int
));
RIVKey
.
h_staging_slider
=
RIVKey
.
h_stagingBlock
;
RIVKey
.
thing
=
0
;
}
void
makeSeeds
(
unsigned
char
*
word
,
int
**
seeds
,
int
*
seedCount
){
int
i
=
0
;
int
seedbase
=
0
;
while
(
*
word
){
seedbase
+=
(
*
(
word
))
<<
(
i
*
5
);
word
++
;
i
++
;
}
int
*
seedTrack
=
(
*
seeds
)
+*
seedCount
;
for
(
i
=
0
;
i
<
RIVKey
.
nonZeros
;
i
++
){
*
seedTrack
=
(
seedbase
>>
i
)
+
(
3
*
i
);
seedTrack
++
;
void
getMagnitudes
(
sparseRIV
*
inputs
,
size_t
RIVCount
){
for
(
int
i
=
0
;
i
<
RIVCount
;
i
++
){
unsigned
int
temp
=
0
;
int
*
values
=
inputs
[
i
].
values
;
int
*
values_stop
=
values
+
inputs
[
i
].
count
;
while
(
values
<
values_stop
){
temp
+=
(
*
values
)
*
(
*
values
);
values
++
;
}
*
seedCount
+=
RIVKey
.
nonZeros
;
return
;
}
int
*
makeSparseLocations
(
int
*
seeds
,
int
seedCount
){
int
*
locations
=
RIVKey
.
h_tempBlock
;
int
*
locations_slider
=
locations
;
int
*
seeds_stop
=
seeds
+
seedCount
;
long
long
int
*
mask
=
RIVKey
.
masks
;
long
long
int
*
mask_stop
=
mask
+
RIVKey
.
nonZeros
;
while
(
seeds
<
seeds_stop
){
*
locations_slider
=
(((
*
seeds
)
^
(
*
mask
))
&
2147483647
)
%
(
RIVKey
.
RIVsize
);
mask
++
;
locations_slider
++
;
seeds
++
;
if
(
!
(
mask
<
mask_stop
))
mask
-=
RIVKey
.
nonZeros
;
float
magnitude
=
sqrt
(
temp
);
inputs
[
i
].
magnitude
=
magnitude
;
}
return
locations
;
}
unsigned
char
*
sscanAdvance
(
unsigned
char
**
string
,
unsigned
char
*
word
){
unsigned
char
*
word_slider
=
word
;
...
...
@@ -312,25 +212,26 @@ unsigned char *sscanAdvance(unsigned char **string, unsigned char *word){
return
word
;
}
sparseRIV
text2L2
(
unsigned
char
*
text
){
unsigned
char
word
[
2000
]
=
{
0
};
int
*
seed
s
=
RIVKey
.
h_tempBlock
;
int
*
location
s
=
RIVKey
.
h_tempBlock
;
unsigned
char
*
text_slider
=
text
;
int
seed
Count
=
0
;
int
location
Count
=
0
;
while
(
*
text_slider
){
sscanAdvance
(
&
text_slider
,
word
);
if
(
word
[
0
]){
makeSeeds
(
word
,
&
seeds
,
&
seedCount
);
makeSparseLocations
(
word
,
locations
,
locationCount
);
locationCount
+=
RIVKey
.
nonZeros
;
}
}
int
*
locations
=
makeSparseLocations
(
seeds
,
seedCount
);
int
*
L2dense
;
L2dense
=
mapI2D
(
locations
,
seed
Count
);
L2dense
=
mapI2D
(
locations
,
location
Count
);
sparseRIV
output
=
consolidateD2S
(
L2dense
);
free
(
L2dense
);
...
...
@@ -358,61 +259,69 @@ int isWordClean(char* word){
return
1
;
}
denseRIV
lexPull
(
int
*
valuesOut
,
char
*
word
){
denseRIV
lexPull
(
char
*
word
){
#if CACHESIZE > 0
int
hash
=
cacheHash
(
word
);
if
(
!
strcmp
(
word
,
RIVKey
.
RIVCache
[
hash
].
name
)){
return
RIVKey
.
RIVCache
[
hash
];
}
#endif
/* CACHESIZE > 0 */
denseRIV
output
;
output
.
values
=
valuesOut
;
output
.
values
=
(
int
*
)
calloc
(
RIVKey
.
RIVsize
+
1
,
sizeof
(
int
));
output
.
frequency
=
output
.
values
+
RIVKey
.
RIVsize
;
char
pathString
[
200
];
FILE
*
lexWord
;
sprintf
(
pathString
,
"lexicon/%s"
,
word
);
lexWord
=
fopen
(
pathString
,
"r+
"
);
FILE
*
lexWord
=
fopen
(
pathString
,
"rb
"
);
strcpy
(
output
.
name
,
word
);
if
(
lexWord
){
fscanf
(
lexWord
,
"%d,%f"
,
&
output
.
frequency
,
&
output
.
magnitude
);
int
*
values_slider
=
valuesOut
;
int
*
values_stop
=
valuesOut
+
RIVKey
.
RIVsize
;
while
(
values_slider
<
values_stop
){
fscanf
(
lexWord
,
",%d"
,
values_slider
);
values_slider
++
;
}
fread
(
output
.
frequency
,
1
,
sizeof
(
int
),
lexWord
);
fread
(
&
(
output
.
magnitude
),
1
,
sizeof
(
int
),
lexWord
);
fread
(
output
.
values
,
RIVKey
.
RIVsize
,
sizeof
(
int
),
lexWord
);
fclose
(
lexWord
);
}
else
{
output
.
frequency
=
0
;
*
(
output
.
frequency
)
=
0
;
output
.
magnitude
=
0
;
memset
(
valuesOut
,
0
,
RIVKey
.
RIVsize
*
sizeof
(
int
));
}
return
output
;
}
int
lexPush
(
denseRIV
RIVout
){
char
pathString
[
1000
]
=
{
0
};
strcpy
(
pathString
,
"lexicon"
);
strcat
(
pathString
,
"/"
);
strcat
(
pathString
,
RIVout
.
name
);
//printf("%s\n", pathString);
FILE
*
lexWord
=
fopen
(
pathString
,
"w+"
);
if
(
!
lexWord
){
lexWord
=
fopen
(
pathString
,
"w+"
);
if
(
!
lexWord
){
printf
(
"fucked it up big time bro, %s
\n
"
,
pathString
);
printf
(
"%s
\n
"
,
pathString
);
return
1
;
}
}
//printf( "%f",RIVout.magnitude);
fprintf
(
lexWord
,
"%d,%f"
,
RIVout
.
frequency
,
RIVout
.
magnitude
);
int
*
values_slider
=
RIVout
.
values
;
int
*
values_stop
=
RIVout
.
values
+
RIVKey
.
RIVsize
;
while
(
values_slider
<
values_stop
){
fprintf
(
lexWord
,
",%d"
,
*
(
values_slider
));
values_slider
++
;
#if CACHESIZE == 0
fLexPush
(
RIVout
);
return
0
;
#else
/*CACHESIZE != 0 */
srand
(
wordtoSeed
((
unsigned
char
*
)
RIVout
.
name
));
int
hash
=
rand
()
%
RIVKey
.
cacheSize
;
if
(
!
strcmp
(
RIVout
.
name
,
RIVKey
.
RIVCache
[
hash
].
name
))
return
0
;
if
(
!
RIVKey
.
RIVCache
[
hash
].
frequency
){
RIVKey
.
RIVCache
[
hash
]
=
RIVout
;
return
0
;
}
else
if
(
*
RIVout
.
frequency
>*
RIVKey
.
RIVCache
[
hash
].
frequency
){
int
diag
=
fLexPush
(
RIVKey
.
RIVCache
[
hash
]);
RIVKey
.
RIVCache
[
hash
]
=
RIVout
;
return
diag
;
}
else
{
fLexPush
(
RIVout
);
}
fclose
(
lexWord
);
return
0
;
#endif
/*CACHESIZE == 0 */
}
/* cacheHash: map a word to its slot index in RIVKey.RIVCache.
 * The word is reduced to a seed by wordtoSeed, the generator is seeded with
 * it, and the first rand() value is taken modulo the cache size. This is the
 * same computation lexPush performs, so the slot a RIV is read from always
 * matches the slot it was stored in.
 * Note that this reseeds the C library generator as a side effect.
 * Returns 0 when the cache has no slots.
 */
int cacheHash(char* word){
    /* an empty cache would otherwise mean a modulo by zero */
    if (RIVKey.cacheSize <= 0) {
        return 0;
    }
    srand(wordtoSeed((unsigned char*)word));
    return rand() % RIVKey.cacheSize;
}
RIVtoolsCPU.h
→
RIVtoolsCPU
windows
.h
View file @
7c37cc43
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment