Commit afdb23fe by Paktalin

Training results with sequences.csv

parent f5c0f0ab
File added
File added
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding
from util import read_array
from keras.utils import to_categorical

VOCAB_SIZE = 85

def get_train_test_val():
    # read the padded sequences; the last column is the label (next form)
    sequences = read_array('sequences.csv')
    X, y = sequences[:, :-1], sequences[:, -1]
    y = to_categorical(y, num_classes=VOCAB_SIZE)
    # 80% train; the remaining 20% is split again into 16% test and 4% validation
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    x_test, x_validate, y_test, y_validate = train_test_split(x_test, y_test, test_size=0.2)
    return x_train, y_train, x_test, y_test, x_validate, y_validate

def train():
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    seq_length = x_train.shape[1]
    print(x_train.shape, x_validate.shape, x_test.shape)
    print(y_train.shape, y_validate.shape, y_test.shape)
    # define the model
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 50, input_length=seq_length))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(VOCAB_SIZE, activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_validate, y_validate))
    print('Saving the model...')
    model.save('lstm_test_validation_10_epoch_50_lstms.h5')
    print(model.evaluate(x_test, y_test))

def train_saved_model():
    # continue training the previously saved checkpoint for one more epoch
    model = load_model('lstm_test_validation_34_epochs.h5')
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    model.fit(x_train, y_train, epochs=1, batch_size=128, validation_data=(x_validate, y_validate))
    print('Saving the model...')
    model.save('lstm_test_validation_35_epochs.h5')
    print(model.evaluate(x_test, y_test))

train_saved_model()
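A minimal usage sketch, not part of this commit: querying the saved checkpoint for a next-form prediction. The checkpoint name and the choice of the first row as the context window are assumptions for illustration.

import numpy as np
from keras.models import load_model
from util import read_array

# load the checkpoint saved by train_saved_model() (assumed name)
model = load_model('lstm_test_validation_35_epochs.h5')
sequences = read_array('sequences.csv')
context = sequences[:1, :-1]        # one 69-token context window, matching the training input
probs = model.predict(context)      # probability distribution of shape (1, VOCAB_SIZE)
print(np.argmax(probs, axis=-1))    # index of the most likely next form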
from estnltk import Text
import numpy as np
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tqdm import tqdm
import pickle
from util import save_list, read_list, save_array, read_array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# set sequence length and step for sentence splitting
SEQUENCE_LEN = 3
STEP = 1
VOCAB_SIZE = 85

articles_file = 'articles.txt'
encoded_forms_file = 'encoded_forms.csv'
next_words_file = 'next_words'
sequences_file = 'sequences'
forms_file = 'forms'

def save_forms_and_sequences():
    # load the input data
    articles = Text(open(articles_file, encoding='utf-8').read())
    # transform to an array of sentences
    sentences = articles.sentence_texts
    forms = []
    # loop over all the sentences
    for i in tqdm(range(len(sentences))):
        forms.append('')
        # split the sentence into a list of lowercase words
        sentence = text_to_word_sequence(sentences[i])
        for word in sentence:
            form = Text(word).forms[0]
            if form == '':
                form = ' '
            # append the new form to the forms[i] string
            forms[i] = forms[i] + '~' + form
    # save the forms list
    save_list(forms, forms_file)
    # tokenize the '~'-separated forms
    tokenizer = Tokenizer(split='~')
    tokenizer.fit_on_texts(forms)
    sequences = tokenizer.texts_to_sequences(forms)
    # pad the sequences to a fixed length of 70
    sequences = pad_sequences(sequences, 70)
    sequences = np.array(sequences)
    save_array(sequences, 'sequences.csv')
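To make the tokenization step concrete, here is a toy run of the Tokenizer/pad_sequences pipeline used above; the '~'-separated form strings are invented for demonstration.

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

forms = ['~sg n~sg g~sg n', '~sg n~pl n']   # invented form tags, '~'-separated as built above
tokenizer = Tokenizer(split='~')
tokenizer.fit_on_texts(forms)
print(tokenizer.word_index)                  # e.g. {'sg n': 1, 'sg g': 2, 'pl n': 3}
sequences = tokenizer.texts_to_sequences(forms)
print(pad_sequences(sequences, 5))           # [[0 0 1 2 1], [0 0 0 1 3]], left-padded with zeros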
import pickle, numpy

def save_list(lst, file_name):
    # pickle the list to a binary file
    with open(file_name, 'wb') as fp:
        pickle.dump(lst, fp)

def read_list(file_name):
    # load a pickled list back from disk
    with open(file_name, 'rb') as fp:
        return pickle.load(fp)

def save_array(array, file_name):
    # write the array as '~'-delimited integers, one row per line
    numpy.savetxt(file_name, array, fmt='%i', delimiter='~')

def read_array(file_name):
    # read a '~'-delimited array back from disk
    return numpy.genfromtxt(file_name, delimiter='~')
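A quick roundtrip check of the array helpers above (toy data, not from the repository). One caveat worth knowing: numpy.genfromtxt defaults to float, so integers written by save_array come back as float64.

import numpy as np
from util import save_array, read_array

a = np.array([[1, 2, 3], [4, 5, 6]])
save_array(a, 'demo.csv')       # each row written as '1~2~3'
b = read_array('demo.csv')      # values come back as float64
print(b.astype(int))            # cast back if integer codes are needed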