Commit c282d622, authored Jan 11, 2019 by Paktalin
sentences with ambiguous forms are removed
Parent: afdb23fe
Showing 2 changed files (main.py, preprocessing.py) with 121 additions and 24 deletions.
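An inferred overview (a sketch, not text from the commit) of how the pieces fit together after this change:

# Rough order of operations implied by the diff (a sketch, not code from the repository):
# 1. save_forms_and_sequences()  -> build '~'-joined form strings per sentence, skip ambiguous ones,
#                                   tokenize, pad to length 70, write sequences_na.csv
# 2. get_train_test_val()        -> load sequences_na.csv and split into train / test / validation
# 3. train(), train_the_last_word_model()
#                                -> fit the Embedding + 2x LSTM model and checkpoint it by val_loss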
main.py
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding
-from util import read_array
+from util import read_list, read_array
from keras.utils import to_categorical
import numpy as np

-VOCAB_SIZE = 85
+VOCAB_SIZE = 79
def get_train_test_val():
-   sequences = read_array('sequences.csv')
+   sequences = read_array('sequences_na.csv')
    X, y = sequences[:, :-1], sequences[:, -1]
    y = to_categorical(y, num_classes=VOCAB_SIZE)
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    x_test, x_validate, y_test, y_validate = train_test_split(x_test, y_test, test_size=0.2)
    return x_train, y_train, x_test, y_test, x_validate, y_validate
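A side note (not part of the diff): because train_test_split is applied twice with test_size=0.2, the data ends up roughly 80% training, 16% test and 4% validation. A tiny sketch of the arithmetic:

# Hypothetical check of the split fractions produced by the two test_size=0.2 calls
total = 1.0
train = total * 0.8            # ~0.80 of the sequences go to training
test = (total - train) * 0.8   # ~0.16 stay in the test set
val = (total - train) * 0.2    # ~0.04 become the validation set
print(train, test, val)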
def run_n_epochs(n, model, x_train, y_train, x_validate, y_validate, x_test, y_test):
    for i in range(n):
        print('%i iteration out of %i' % (i, n))
        model.fit(x_train, y_train, batch_size=128, epochs=1, validation_data=(x_validate, y_validate))
        print('Saving the model...')
        name = 'lstm_' + str(i) + '.h5'
        model.save(name)
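An alternative to this manual save-per-epoch loop (a sketch, not part of the commit) is Keras' built-in ModelCheckpoint callback, which writes a checkpoint file after every epoch of a single fit() call:

from keras.callbacks import ModelCheckpoint

# Hypothetical equivalent of run_n_epochs using a callback:
# saves lstm_<epoch>.h5 after each epoch while fit() runs for n epochs.
checkpoint = ModelCheckpoint('lstm_{epoch}.h5', monitor='val_loss', save_best_only=False)
model.fit(x_train, y_train, batch_size=128, epochs=n,
          validation_data=(x_validate, y_validate), callbacks=[checkpoint])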
def train():
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    seq_length = x_train.shape[1]
    print(x_train.shape, x_validate.shape, x_test.shape)
    print(y_train.shape, y_validate.shape, y_test.shape)
    # define model
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 50, input_length=seq_length))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(100, activation='relu'))
-   model.add(Dense(VOCAB_SIZE, activation='softmax'))
+   model.add(Dense(VOCAB_SIZE, activation='sigmoid'))
    print(model.summary())
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_validate, y_validate))
    print('Saving the model...')
    model.save('lstm_test_validation_10_epoch_50_lstms.h5')
    print(model.evaluate(x_test, y_test))
    run_n_epochs(100, model, x_train, y_train, x_validate, y_validate, x_test, y_test)
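For reference, an inferred sketch of the tensor shapes this model sees, assuming the sequences were padded to length 70 in preprocessing.py and VOCAB_SIZE = 79 (actual sizes depend on the saved file):

# Inferred shapes (a sketch, not output from the repository):
# sequences: (num_sequences, 70)  padded rows of integer form codes
# x_train:   (num_sequences, 69)  all but the last column, so seq_length = 69
# y_train:   (num_sequences, 79)  one-hot next-form targets from to_categorical
# model output per sample: a 79-way distribution over form codes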
def train_saved_model():
-   model = load_model('lstm_test_validation_34_epochs.h5')
+   model = load_model('lstm_test_validation_40_epochs.h5')
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    model.fit(x_train, y_train, epochs=1, batch_size=128, validation_data=(x_validate, y_validate))
    print('Saving the model...')
-   model.save('lstm_test_validation_35_epochs.h5')
+   model.save('lstm_test_validation_41_epochs.h5')
    print(model.evaluate(x_test, y_test))

-train_saved_model()
+def model_name(epoch):
+    return 'lstm_test_validation_' + str(epoch) + '_epochs.h5'
+
+def train_the_last_word_model():
+    previous_loss = 1.5555
+    epoch = 43
+    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
+    while(True):
+        model = load_model(model_name(epoch))
+        train_history = model.fit(x_train, y_train, epochs=1, batch_size=2048, validation_data=(x_validate, y_validate))
+        val_loss = train_history.history['val_loss'][-1]
+        if val_loss < previous_loss:
+            print(model.evaluate(x_test, y_test))
+            print('Improved the loss')
+            epoch += 1
+            model.save(model_name(epoch))
+            previous_loss = val_loss
+        else:
+            print('No loss improvements')
+
+train()
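A hedged sketch (not part of the diff) of how one of these saved checkpoints could be used to predict the next form code for a single sequence; the checkpoint epoch number and file names are assumptions:

import numpy as np
from keras.models import load_model
from util import read_array

# Hypothetical inference sketch using the helpers defined above.
model = load_model(model_name(44))        # assumes a checkpoint written by the loop above
sequences = read_array('sequences_na.csv')
x = sequences[:1, :-1]                    # first sequence, without its target column
probs = model.predict(x)                  # shape (1, VOCAB_SIZE)
predicted_code = int(np.argmax(probs, axis=-1)[0])
print('predicted form code:', predicted_code)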
preprocessing.py
@@ -2,7 +2,7 @@ from estnltk import Text
import numpy as np
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tqdm import tqdm
-import pickle
+import pickle, re
from util import save_list, read_list, save_array, read_array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
@@ -18,6 +18,72 @@ next_words_file = 'next_words'
sequences_file = 'sequences'
forms_file = 'forms'

+def preprocess():
+    articles = Text(open(articles_file, encoding='utf-8').read())
+    # transform to an array of sentences
+    sentences = articles.sentence_texts
+    # create an empty dict to store forms like {form: code}
+    dict_forms = {}
+    # create empty lists
+    sequences, next_words = [], []
+    for i in tqdm(range(len(sentences))):
+        # split the sentence into a list of lowercase words
+        sentences[i] = text_to_word_sequence(sentences[i])
+        num_words = len(sentences[i])
+        encoded_forms = np.zeros(num_words, dtype=int)
+        for j in range(num_words):
+            form = Text(sentences[i][j]).forms[0]
+            if form not in dict_forms:
+                dict_forms[form] = len(dict_forms) + 1
+            # set the form's code to the current form
+            encoded_forms[j] = dict_forms[form]
+        for j in range(0, len(sentences[i]) - SEQUENCE_LEN, STEP):
+            # split the sentences into sequences of SEQUENCE_LEN
+            sequences.append(encoded_forms[j:j + SEQUENCE_LEN])
+            # set next words for the current sequence
+            next_words.append(encoded_forms[j + SEQUENCE_LEN])
+    # save the lists
+    print('Saving sequences...')
+    save_list(sequences, sequences_file)
+    print('Saving next_words...')
+    save_list(next_words, next_words_file)
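As an illustration of the encoding above (a sketch, not part of the commit; the concrete form strings are assumptions about estnltk's output), each word's morphological form string is mapped to a small integer, and previously seen forms reuse their code:

# Hypothetical walk-through of the form-encoding loop for one short sentence.
dict_forms = {}
forms_in_sentence = ['sg n', 'sg g', 'sg n']   # assumed estnltk form strings
encoded = []
for form in forms_in_sentence:
    if form not in dict_forms:
        dict_forms[form] = len(dict_forms) + 1
    encoded.append(dict_forms[form])
print(encoded)   # [1, 2, 1] -- repeated forms share a code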
+def ambiguous(array_forms, dict_codes_to_forms):
+    if -1 in array_forms:
+        return True
+    return False
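A quick usage sketch (hypothetical values): a sequence counts as ambiguous if any of its codes is the -1 sentinel reserved for ambiguous forms, regardless of the dictionary passed in:

codes_to_forms = {-1: 'ambiguous', 1: 'sg n', 2: 'sg g'}   # assumed contents
print(ambiguous([1, 2, 1], codes_to_forms))    # False
print(ambiguous([1, -1, 2], codes_to_forms))   # True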
+def form_sequences():
+    articles = Text(open(articles_file, encoding='utf-8').read())
+    sentences = articles.sentence_texts
+    sequences = []
+    dict_forms_to_codes, dict_codes_to_forms = {'ambiguous': -1}, {-1: 'ambiguous'}
+    for i in tqdm(range(len(sentences))):
+        sentences[i] = text_to_word_sequence(sentences[i])
+        num_words = len(sentences[i])
+        encoded_forms = np.zeros(num_words, dtype=int)
+        if num_words >= SEQUENCE_LEN:
+            for j in range(num_words):
+                form = Text(sentences[i][j]).forms[0]
+                if '|' in form:
+                    form = 'ambiguous'
+                elif form not in dict_forms_to_codes:
+                    dict_forms_to_codes[form] = len(dict_forms_to_codes)
+                    dict_codes_to_forms[dict_forms_to_codes[form]] = form
+                encoded_forms[j] = dict_forms_to_codes[form]
+            for j in range(0, num_words - SEQUENCE_LEN):
+                if not ambiguous(encoded_forms[j:j + SEQUENCE_LEN + 1], dict_codes_to_forms):
+                    sequences.append(encoded_forms[j:j + SEQUENCE_LEN + 1])
+    print('Saving dictionaries...')
+    save_list(dict_forms_to_codes, 'dict_forms_to_codes')
+    save_list(dict_codes_to_forms, 'dict_codes_to_forms')
+    save_list(sequences, 'sequences4')
+    print(sequences)
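Each saved row is therefore a window of SEQUENCE_LEN + 1 form codes from one sentence, where the first SEQUENCE_LEN entries are the model input and the final entry is the next-form target. A sketch with made-up codes and SEQUENCE_LEN = 3:

SEQUENCE_LEN = 3                   # assumed value, for illustration only
encoded_forms = [4, 7, 2, 9, 5]    # one sentence's form codes (no -1, so unambiguous)
windows = [encoded_forms[j:j + SEQUENCE_LEN + 1]
           for j in range(0, len(encoded_forms) - SEQUENCE_LEN)]
print(windows)                     # [[4, 7, 2, 9], [7, 2, 9, 5]]
# training pairs: input [4, 7, 2] -> target 9, input [7, 2, 9] -> target 5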
def save_forms_and_sequences():
    # load the input data
    articles = Text(open(articles_file, encoding='utf-8').read())
@@ -26,16 +92,20 @@ def save_forms_and_sequences():
    forms = []
    # loop over all the sentences
    for i in tqdm(range(len(sentences))):
-       forms.append('')
+       forms_string = ''
        # split the sentence into a list of lowercase words
        sentence = text_to_word_sequence(sentences[i])
        for word in sentence:
            form = Text(word).forms[0]
+           if '|' in form or '?' in form:
+               forms_string = 'ambiguous'
+               break
            if form == '':
                form = ' '
            # append the a new form to the forms[i] string
-           forms[i] = forms[i] + '~' + form
-   # save forms list
+           forms_string = forms_string + '~' + form
+       if forms_string != 'ambiguous':
+           forms.append(forms_string)
    save_list(forms, forms_file)
    # tokenize the forms
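To illustrate the per-sentence string built here (a sketch with assumed form strings; real values come from estnltk), each kept sentence becomes a single '~'-joined string of its word forms, and a sentence is dropped as soon as one form is ambiguous:

forms = []
for sentence_forms in (['sg n', 'sg g', 'sg p'], ['sg n', 'sg n|sg g']):   # assumed forms
    forms_string = ''
    for form in sentence_forms:
        if '|' in form or '?' in form:
            forms_string = 'ambiguous'
            break
        forms_string = forms_string + '~' + form
    if forms_string != 'ambiguous':
        forms.append(forms_string)
print(forms)   # ['~sg n~sg g~sg p'] -- the second sentence is skipped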
@@ -45,4 +115,8 @@ def save_forms_and_sequences():
    # pad sequences, using the maxlen
    sequences = pad_sequences(sequences, 70)
    sequences = np.array(sequences)
-   save_array(sequences, 'sequences.csv')
+   save_array(sequences, 'sequences_na.csv')  # not ambiguous
+
+save_forms_and_sequences()
+# print(read_list(forms_file))