likorn / estonian-lstm
Commit c282d622, authored Jan 11, 2019 by Paktalin

sentences with ambiguous forms are removed

parent afdb23fe

Showing 2 changed files (main.py, preprocessing.py) with 121 additions and 24 deletions.

main.py (view file @ c282d622)
```diff
 from sklearn.model_selection import train_test_split
 from keras.models import Sequential, load_model
 from keras.layers import Dense, LSTM, Embedding
-from util import read_array
+from util import read_list, read_array
 from keras.utils import to_categorical
 import numpy as np
 
-VOCAB_SIZE = 85
+VOCAB_SIZE = 79
 
 def get_train_test_val():
-    sequences = read_array('sequences.csv')
+    sequences = read_array('sequences_na.csv')
     X, y = sequences[:,:-1], sequences[:,-1]
     y = to_categorical(y, num_classes=VOCAB_SIZE)
     x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
     x_test, x_validate, y_test, y_validate = train_test_split(x_test, y_test, test_size=0.2)
     return x_train, y_train, x_test, y_test, x_validate, y_validate
 
+def run_n_epochs(n, model, x_train, y_train, x_validate, y_validate, x_test, y_test):
+    for i in range(n):
+        print('%i iteration out of %i' % (i, n))
+        model.fit(x_train, y_train, batch_size=128, epochs=1, validation_data=(x_validate, y_validate))
+        print('Saving the model...')
+        name = 'lstm_' + str(i) + '.h5'
+        model.save(name)
+
 def train():
     x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
     seq_length = x_train.shape[1]
+    print(x_train.shape, x_validate.shape, x_test.shape)
+    print(y_train.shape, y_validate.shape, y_test.shape)
+    # define model
     model = Sequential()
     model.add(Embedding(VOCAB_SIZE, 50, input_length=seq_length))
-    model.add(LSTM(50, return_sequences=True))
     model.add(LSTM(50))
-    model.add(Dense(100, activation='relu'))
-    model.add(Dense(VOCAB_SIZE, activation='softmax'))
+    model.add(Dense(VOCAB_SIZE, activation='sigmoid'))
     print(model.summary())
     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
-    model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_validate, y_validate))
-    print('Saving the model...')
-    model.save('lstm_test_validation_10_epoch_50_lstms.h5')
-    print(model.evaluate(x_test, y_test))
+    run_n_epochs(100, model, x_train, y_train, x_validate, y_validate, x_test, y_test)
 
 def train_saved_model():
-    model = load_model('lstm_test_validation_34_epochs.h5')
+    model = load_model('lstm_test_validation_40_epochs.h5')
     x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
     model.fit(x_train, y_train, epochs=1, batch_size=128, validation_data=(x_validate, y_validate))
     print('Saving the model...')
-    model.save('lstm_test_validation_35_epochs.h5')
+    model.save('lstm_test_validation_41_epochs.h5')
     print(model.evaluate(x_test, y_test))
 
-train_saved_model()
\ No newline at end of file
+def model_name(epoch):
+    return 'lstm_test_validation_' + str(epoch) + '_epochs.h5'
+
+def train_the_last_word_model():
+    previous_loss = 1.5555
+    epoch = 43
+    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
+    while(True):
+        model = load_model(model_name(epoch))
+        train_history = model.fit(x_train, y_train, epochs=1, batch_size=2048, validation_data=(x_validate, y_validate))
+        val_loss = train_history.history['val_loss'][-1]
+        if val_loss < previous_loss:
+            print(model.evaluate(x_test, y_test))
+            print('Improved the loss')
+            epoch += 1
+            model.save(model_name(epoch))
+            previous_loss = val_loss
+        else:
+            print('No loss improvements')
+
+train()
\ No newline at end of file
```
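The new `train_the_last_word_model()` hand-rolls a keep-the-best-checkpoint loop: fit one epoch, compare `val_loss` against the previous best, and save under the next epoch number only on improvement. Keras callbacks cover the same ground; below is a minimal sketch of that alternative (the checkpoint file name, the `patience` value, and the `train_with_callbacks` helper are illustrative assumptions, not code from this repository; `get_train_test_val` is the function defined in `main.py` above).

```python
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model

def train_with_callbacks(start_file='lstm_test_validation_43_epochs.h5'):
    # same data split as main.py's get_train_test_val()
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    model = load_model(start_file)
    callbacks = [
        # write a checkpoint only when validation loss improves
        ModelCheckpoint('lstm_best.h5', monitor='val_loss', save_best_only=True),
        # stop after 3 epochs without improvement (patience is an assumption)
        EarlyStopping(monitor='val_loss', patience=3),
    ]
    model.fit(x_train, y_train, epochs=100, batch_size=2048,
              validation_data=(x_validate, y_validate), callbacks=callbacks)
    print(model.evaluate(x_test, y_test))
```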
preprocessing.py (view file @ c282d622)
```diff
@@ -2,7 +2,7 @@ from estnltk import Text
 import numpy as np
 from keras.preprocessing.text import text_to_word_sequence, Tokenizer
 from tqdm import tqdm
-import pickle
+import pickle, re
 from util import save_list, read_list, save_array, read_array
 from keras.preprocessing.sequence import pad_sequences
 from keras.utils import to_categorical
@@ -18,6 +18,72 @@ next_words_file = 'next_words'
 sequences_file = 'sequences'
 forms_file = 'forms'
 
+def preprocess():
+    articles = Text(open(articles_file, encoding='utf-8').read())
+    # transform to an array of sentences
+    sentences = articles.sentence_texts
+    # create an empty dict to store forms like {form: code}
+    dict_forms = {}
+    # create empty lists
+    sequences, next_words = [], []
+    for i in tqdm(range(len(sentences))):
+        # split the sentence into a list of lowercase words
+        sentences[i] = text_to_word_sequence(sentences[i])
+        num_words = len(sentences[i])
+        encoded_forms = np.zeros(num_words, dtype=int)
+        for j in range(num_words):
+            form = Text(sentences[i][j]).forms[0]
+            if form not in dict_forms:
+                dict_forms[form] = len(dict_forms) + 1
+            # set the form's code to the current form
+            encoded_forms[j] = dict_forms[form]
+        for j in range(0, len(sentences[i]) - SEQUENCE_LEN, STEP):
+            # split the sentences into sequences of SEQUENCE_LEN
+            sequences.append(encoded_forms[j:j + SEQUENCE_LEN])
+            # set next words for the current sequence
+            next_words.append(encoded_forms[j + SEQUENCE_LEN])
+    # save the lists
+    print('Saving sequences...')
+    save_list(sequences, sequences_file)
+    print('Saving next_words...')
+    save_list(next_words, next_words_file)
+
+def ambiguous(array_forms, dict_codes_to_forms):
+    if -1 in array_forms:
+        return True
+    return False
+
+def form_sequences():
+    articles = Text(open(articles_file, encoding='utf-8').read())
+    sentences = articles.sentence_texts
+    sequences = []
+    dict_forms_to_codes, dict_codes_to_forms = {'ambiguous': -1}, {-1: 'ambiguous'}
+    for i in tqdm(range(len(sentences))):
+        sentences[i] = text_to_word_sequence(sentences[i])
+        num_words = len(sentences[i])
+        encoded_forms = np.zeros(num_words, dtype=int)
+        if num_words >= SEQUENCE_LEN:
+            for j in range(num_words):
+                form = Text(sentences[i][j]).forms[0]
+                if '|' in form:
+                    form = 'ambiguous'
+                elif form not in dict_forms_to_codes:
+                    dict_forms_to_codes[form] = len(dict_forms_to_codes)
+                    dict_codes_to_forms[dict_forms_to_codes[form]] = form
+                encoded_forms[j] = dict_forms_to_codes[form]
+            for j in range(0, num_words - SEQUENCE_LEN):
+                if not ambiguous(encoded_forms[j:j + SEQUENCE_LEN + 1], dict_codes_to_forms):
+                    sequences.append(encoded_forms[j:j + SEQUENCE_LEN + 1])
+    print('Saving dictionaries...')
+    save_list(dict_forms_to_codes, 'dict_forms_to_codes')
+    save_list(dict_codes_to_forms, 'dict_codes_to_forms')
+    save_list(sequences, 'sequences4')
+    print(sequences)
+
 def save_forms_and_sequences():
     # load the input data
     articles = Text(open(articles_file, encoding='utf-8').read())
@@ -26,16 +92,20 @@ def save_forms_and_sequences():
     forms = []
     # loop over all the sentences
     for i in tqdm(range(len(sentences))):
-        forms.append('')
+        forms_string = ''
         # split the sentence into a list of lowercase words
         sentence = text_to_word_sequence(sentences[i])
         for word in sentence:
            form = Text(word).forms[0]
+            if '|' in form or '?' in form:
+                forms_string = 'ambiguous'
+                break
            if form == '':
                form = ' '
-            # append the a new form to the forms[i] string
-            forms[i] = forms[i] + '~' + form
+            forms_string = forms_string + '~' + form
+        if forms_string != 'ambiguous':
+            # save forms list
+            forms.append(forms_string)
     save_list(forms, forms_file)
     # tokenize the forms
@@ -45,4 +115,8 @@ def save_forms_and_sequences():
     # pad sequences, using the maxlen
     sequences = pad_sequences(sequences, 70)
     sequences = np.array(sequences)
-    save_array(sequences, 'sequences.csv')
\ No newline at end of file
+    save_array(sequences, 'sequences_na.csv') # not ambiguous
+
+save_forms_and_sequences()
+# print(read_list(forms_file))
\ No newline at end of file
```
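The heart of the commit is `form_sequences()`: when estnltk returns several candidate analyses, the form string contains a '|' separator, so the token is encoded with the reserved code -1, and every training window of `SEQUENCE_LEN + 1` codes that touches a -1 is discarded. `save_forms_and_sequences()` applies the same idea per sentence, skipping any sentence whose forms contain '|' or '?'. A small self-contained sketch of the window filter follows; the toy codes and the `SEQUENCE_LEN` value are invented for the example, and the `ambiguous` helper here is a simplified version of the repository's (the unused dictionary argument is dropped).

```python
import numpy as np

SEQUENCE_LEN = 3   # window length used for illustration only
AMBIGUOUS = -1     # reserved code, as in dict_forms_to_codes = {'ambiguous': -1}

def ambiguous(window):
    # mirrors the repository's check: a window is unusable if it contains -1
    return AMBIGUOUS in window

# toy sentence: the third token had more than one possible morphological form
encoded_forms = np.array([4, 7, AMBIGUOUS, 2, 9, 5, 3])

sequences = [encoded_forms[j:j + SEQUENCE_LEN + 1]
             for j in range(len(encoded_forms) - SEQUENCE_LEN)
             if not ambiguous(encoded_forms[j:j + SEQUENCE_LEN + 1])]

print(sequences)  # only [2 9 5 3] survives; every window touching -1 is dropped
```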