Commit c282d622 by Paktalin

sentences with ambiguous forms are removed

parent afdb23fe
Showing with 121 additions and 24 deletions
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding
from util import read_array
from util import read_list, read_array
from keras.utils import to_categorical
import numpy as np
# Number of distinct form codes. Used as the class count for the one-hot
# targets, as the embedding input dimension and as the output layer width.
VOCAB_SIZE = 79
def get_train_test_val():
    """Load the encoded sequences and split them into train/test/validation.

    Reads ``sequences_na.csv`` through ``read_array``. In every row the last
    column is the target and all preceding columns are the model input.
    Targets are one-hot encoded over ``VOCAB_SIZE`` classes.
    Assumes ``read_array`` returns a 2-D array -- TODO confirm.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, x_validate, y_validate)``.
        20% of the rows are held out from training; that held-out part is
        split again, with 20% of it becoming the validation set and the
        rest the test set.
    """
    sequences = read_array('sequences_na.csv')
    X, y = sequences[:, :-1], sequences[:, -1]
    y = to_categorical(y, num_classes=VOCAB_SIZE)

    x_train, x_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2)
    x_test, x_validate, y_test, y_validate = train_test_split(
        x_holdout, y_holdout, test_size=0.2)
    return x_train, y_train, x_test, y_test, x_validate, y_validate
def run_n_epochs(n, model, x_train, y_train, x_validate, y_validate, x_test, y_test):
    """Train ``model`` for ``n`` one-epoch rounds, saving it after each round.

    Args:
        n: Number of single-epoch training rounds to run.
        model: Object exposing ``fit`` and ``save`` (a Keras model here).
        x_train, y_train: Training inputs and targets.
        x_validate, y_validate: Data passed to ``fit`` as ``validation_data``.
        x_test, y_test: Accepted for signature compatibility; not used.

    Each round writes a checkpoint named ``lstm_<round index>.h5``, with the
    index starting at 0.
    """
    for i in range(n):
        print('%i iteration out of %i' % (i, n))
        model.fit(x_train, y_train, batch_size=128, epochs=1,
                  validation_data=(x_validate, y_validate))
        print('Saving the model...')
        # One checkpoint per round so any intermediate state can be reloaded.
        model.save('lstm_' + str(i) + '.h5')
def train():
    """Build the LSTM model, train it and save checkpoints.

    The network is an embedding layer, two stacked LSTM layers, a ReLU dense
    layer and a ``VOCAB_SIZE``-wide output layer. It is first fitted for 10
    epochs, saved and evaluated on the test split, and then trained further
    through ``run_n_epochs``.
    """
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    seq_length = x_train.shape[1]
    print(x_train.shape, x_validate.shape, x_test.shape)
    print(y_train.shape, y_validate.shape, y_test.shape)

    # define model
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 50, input_length=seq_length))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(100, activation='relu'))
    # Exactly one output layer. Softmax is used because the targets are
    # one-hot vectors and the loss is categorical cross-entropy, which expects
    # the outputs to form a probability distribution over the classes.
    model.add(Dense(VOCAB_SIZE, activation='softmax'))
    print(model.summary())
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # NOTE(review): this initial 10-epoch fit may predate run_n_epochs and be
    # redundant with the call below -- confirm before removing.
    model.fit(x_train, y_train, batch_size=128, epochs=10,
              validation_data=(x_validate, y_validate))
    print('Saving the model...')
    model.save('lstm_test_validation_10_epoch_50_lstms.h5')
    print(model.evaluate(x_test, y_test))

    run_n_epochs(100, model, x_train, y_train, x_validate, y_validate, x_test, y_test)
def train_saved_model(epoch=40):
    """Load a saved checkpoint, train it for one more epoch and save the result.

    Args:
        epoch: Number of the checkpoint to load. The default loads
            ``lstm_test_validation_40_epochs.h5``; the retrained model is
            saved under the name for ``epoch + 1``.

    The test-set evaluation of the retrained model is printed at the end.
    """
    model = load_model(model_name(epoch))
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    model.fit(x_train, y_train, epochs=1, batch_size=128,
              validation_data=(x_validate, y_validate))
    print('Saving the model...')
    model.save(model_name(epoch + 1))
    print(model.evaluate(x_test, y_test))
def model_name(epoch):
    """Return the checkpoint file name for the given epoch number.

    Args:
        epoch: Epoch number; converted with ``str`` before being embedded.

    Returns:
        A string of the form ``lstm_test_validation_<epoch>_epochs.h5``.
    """
    return 'lstm_test_validation_' + str(epoch) + '_epochs.h5'
def train_the_last_word_model(start_epoch=43, initial_loss=1.5555):
    """Keep training from the latest checkpoint, saving only on improvement.

    Args:
        start_epoch: Number of the checkpoint to start from.
        initial_loss: Validation loss a new epoch must beat to be saved.

    Each round reloads the latest saved checkpoint and trains it for one
    epoch. If the validation loss drops below the best seen so far, the model
    is evaluated on the test split and saved as the next checkpoint.
    Otherwise the round is discarded and the same checkpoint is retried.

    The loop has no exit condition: it runs until the process is interrupted.
    """
    previous_loss = initial_loss
    epoch = start_epoch
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    while True:
        # Reloading discards the weights of a round that did not improve.
        model = load_model(model_name(epoch))
        train_history = model.fit(x_train, y_train, epochs=1, batch_size=2048,
                                  validation_data=(x_validate, y_validate))
        val_loss = train_history.history['val_loss'][-1]
        if val_loss < previous_loss:
            print(model.evaluate(x_test, y_test))
            print('Improved the loss')
            epoch += 1
            model.save(model_name(epoch))
            previous_loss = val_loss
        else:
            print('No loss improvements')
# Script entry point: build and train a fresh model.
# NOTE(review): only the ``train()`` call is kept; a ``train_saved_model()``
# call preceded it and looked like the pre-change line of the diff -- confirm.
train()
......@@ -2,7 +2,7 @@ from estnltk import Text
import numpy as np
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tqdm import tqdm
import pickle
import pickle, re
from util import save_list, read_list, save_array, read_array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
......@@ -18,6 +18,72 @@ next_words_file = 'next_words'
# Name under which preprocess() stores the encoded sequences via save_list().
sequences_file = 'sequences'
# Name under which save_forms_and_sequences() stores the form strings via save_list().
forms_file = 'forms'
def preprocess():
    """Encode the articles as fixed-length form sequences with their next form.

    The articles file is split into sentences and each sentence into
    lowercase words. Every word is mapped to the integer code of its first
    grammatical form; codes are assigned on first sight, starting at 1. Each
    sentence is then cut into windows of ``SEQUENCE_LEN`` codes, advancing by
    ``STEP``. Each window goes into ``sequences`` and the code that follows
    it into ``next_words``.

    Both lists are saved with ``save_list``; nothing is returned.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(articles_file, encoding='utf-8') as source:
        articles = Text(source.read())

    # {form: code}
    dict_forms = {}
    sequences, next_words = [], []

    for sentence in tqdm(articles.sentence_texts):
        # split the sentence into a list of lowercase words
        words = text_to_word_sequence(sentence)
        encoded_forms = np.zeros(len(words), dtype=int)
        for j, word in enumerate(words):
            form = Text(word).forms[0]
            if form not in dict_forms:
                dict_forms[form] = len(dict_forms) + 1
            encoded_forms[j] = dict_forms[form]

        # The range stops early enough that the index j + SEQUENCE_LEN of the
        # "next word" is always inside the sentence.
        for j in range(0, len(words) - SEQUENCE_LEN, STEP):
            sequences.append(encoded_forms[j: j + SEQUENCE_LEN])
            next_words.append(encoded_forms[j + SEQUENCE_LEN])

    print('Saving sequences...')
    save_list(sequences, sequences_file)
    print('Saving next_words...')
    save_list(next_words, next_words_file)
def ambiguous(array_forms, dict_codes_to_forms):
    """Return True when the encoded window contains the ambiguous-form code.

    Args:
        array_forms: Sequence (list or NumPy array) of integer form codes.
        dict_codes_to_forms: Accepted for signature compatibility; not used.

    Returns:
        ``True`` if the code ``-1`` occurs in ``array_forms``, else ``False``.
    """
    return -1 in array_forms
def form_sequences():
    """Build form-code sequences, dropping windows with ambiguous forms.

    Every sentence with at least ``SEQUENCE_LEN`` words is encoded word by
    word. A form containing ``'|'`` is treated as ambiguous and gets the code
    ``-1``. Every other form gets the next free positive code on first sight.
    Windows of ``SEQUENCE_LEN + 1`` codes are kept only when they contain no
    ambiguous code.

    Both code dictionaries and the sequences are saved with ``save_list``;
    the sequences are also printed. Nothing is returned.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(articles_file, encoding='utf-8') as source:
        articles = Text(source.read())

    sequences = []
    dict_forms_to_codes, dict_codes_to_forms = {'ambiguous': -1}, {-1: 'ambiguous'}

    for sentence in tqdm(articles.sentence_texts):
        words = text_to_word_sequence(sentence)
        num_words = len(words)
        if num_words < SEQUENCE_LEN:
            # Too short to yield any window.
            continue

        encoded_forms = np.zeros(num_words, dtype=int)
        for j, word in enumerate(words):
            form = Text(word).forms[0]
            if '|' in form:
                form = 'ambiguous'
            elif form not in dict_forms_to_codes:
                # 'ambiguous' already occupies one slot, so the first real
                # form receives code 1.
                code = len(dict_forms_to_codes)
                dict_forms_to_codes[form] = code
                dict_codes_to_forms[code] = form
            encoded_forms[j] = dict_forms_to_codes[form]

        for j in range(0, num_words - SEQUENCE_LEN):
            window = encoded_forms[j:j + SEQUENCE_LEN + 1]
            if not ambiguous(window, dict_codes_to_forms):
                sequences.append(window)

    print('Saving dictionaries...')
    save_list(dict_forms_to_codes, 'dict_forms_to_codes')
    save_list(dict_codes_to_forms, 'dict_codes_to_forms')
    save_list(sequences, 'sequences4')
    print(sequences)
def save_forms_and_sequences():
# load the input data
articles = Text(open(articles_file, encoding='utf-8').read())
......@@ -26,16 +92,20 @@ def save_forms_and_sequences():
forms = []
# loop over all the sentences
for i in tqdm(range(len(sentences))):
forms.append('')
forms_string = ''
# split the sentence into a list of lowercase words
sentence = text_to_word_sequence(sentences[i])
for word in sentence:
form = Text(word).forms[0]
if '|' in form or '?' in form:
forms_string = 'ambiguous'
break
if form == '':
form = ' '
# append the new form to the current sentence's forms string
forms[i] = forms[i] + '~' + form
# save forms list
forms_string = forms_string + '~' + form
if forms_string != 'ambiguous':
forms.append(forms_string)
save_list(forms, forms_file)
# tokenize the forms
......@@ -45,4 +115,8 @@ def save_forms_and_sequences():
# pad sequences, using the maxlen
sequences = pad_sequences(sequences, 70)
sequences = np.array(sequences)
save_array(sequences, 'sequences.csv')
\ No newline at end of file
save_array(sequences, 'sequences_na.csv') # not ambiguous
save_forms_and_sequences()
# print(read_list(forms_file))
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment