Commit 355e205f by Paktalin

Current version of the project

parent c282d622
Showing with 30 additions and 12 deletions
......@@ -5,10 +5,10 @@ from util import read_list, read_array
from keras.utils import to_categorical
import numpy as np
VOCAB_SIZE = 79
VOCAB_SIZE = 85
def get_train_test_val():
sequences = read_array('sequences_na.csv')
sequences = read_array('sequences_splitted.csv')
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=VOCAB_SIZE)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
......@@ -55,6 +55,7 @@ def train_the_last_word_model():
x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
while(True):
model = load_model(model_name(epoch))
print(model.summary())
train_history = model.fit(x_train, y_train, epochs=1, batch_size=2048, validation_data=(x_validate, y_validate))
val_loss = train_history.history['val_loss'][-1]
if val_loss < previous_loss:
......
......@@ -6,6 +6,7 @@ import pickle, re
from util import save_list, read_list, save_array, read_array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import matplotlib.pyplot as plt
# set sequence length and step for sentences splitting
SEQUENCE_LEN = 3
......@@ -97,15 +98,14 @@ def save_forms_and_sequences():
sentence = text_to_word_sequence(sentences[i])
for word in sentence:
form = Text(word).forms[0]
if '|' in form or '?' in form:
forms_string = 'ambiguous'
break
if form == '':
form = ' '
forms_string = forms_string + '~' + form
if forms_string != 'ambiguous':
forms.append(forms_string)
if '|' in form or '?' in form:
forms.append(forms_string)
forms_string = ''
else:
forms_string = forms_string + '~' + form
forms.append(forms_string)
save_list(forms, forms_file)
# tokenize the forms
......@@ -117,6 +117,23 @@ def save_forms_and_sequences():
sequences = np.array(sequences)
save_array(sequences, 'sequences_na.csv') # not ambiguous
save_forms_and_sequences()
# Re-tokenize the saved forms and inspect the distribution of sequence lengths.
forms = read_list(forms_file)
# Forms are joined with '~' when saved, so split on it; filters='' disables
# the tokenizer's default character filtering.
tokenizer = Tokenizer(split='~', filters='')
tokenizer.fit_on_texts(forms)
sequences = tokenizer.texts_to_sequences(forms)
# Shortest sequence (in tokens) that is kept.
minlen = 3
# Build a new list rather than calling sequences.remove() inside a loop over
# `sequences`: removing while iterating skips the element that follows each
# removal, so too-short sequences could slip through and skipped long ones
# were never counted in `lengths`.
sequences = [sequence for sequence in sequences if len(sequence) >= minlen]
lengths = [len(sequence) for sequence in sequences]
plt.hist(lengths, bins=100)
plt.show()
# print(read_list(forms_file))
\ No newline at end of file
sequences = pad_sequences(sequences, 40)
sequences = np.array(sequences)
save_array(sequences, 'sequences_splitted.csv')
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment