Commit f5c0f0ab by Paktalin

preprocessing with MAXLEN. Has to be rewritten

parent f334d8dd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Bidirectional, Dense, Activation, LSTM, Dropout
import pickle
from preprocessing import read_sequences, read_next_words, SEQUENCE_LEN
# number of distinct form codes (size of the model's output layer)
forms = 114
batch_size = 128
# read sequences and next words from files
sequences = read_sequences()
next_words = read_next_words()
# split training and test sets
print('Splitting test and training sets...')
x_train, x_test, y_train, y_test = train_test_split(sequences, next_words, test_size=0.33)
x_train, x_test = np.array(x_train), np.array(x_test)
print(x_train[0])
print('Defining the model...')
dropout = 0.2
model = Sequential()
model.add(Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, forms)))
# apply dropout between the recurrent layer and the output layer
if dropout > 0:
    model.add(Dropout(dropout))
model.add(Dense(forms))
model.add(Activation('softmax'))
print('Compiling the model...')
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
print('Fitting the data...')
model.fit(x_train, y_train, batch_size=batch_size, epochs=15, validation_data=(x_test, y_test))
print('Saving the model...')
model.save('lstm.h5')
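
As committed, the sequences and next words are plain integer form codes, while the Bidirectional LSTM is declared with input_shape=(SEQUENCE_LEN, forms) and ends in a softmax over forms classes, which implies one-hot input windows and one-hot targets. Below is a minimal sketch of that missing encoding step, assuming keras.utils.to_categorical is acceptable here and with the loss switched to categorical_crossentropy for the multi-class softmax; the *_oh names are placeholders, not part of the commit.

from keras.utils import to_categorical

# one-hot encode the integer form codes so the shapes match the model:
# x_* becomes (n_samples, SEQUENCE_LEN, forms), y_* becomes (n_samples, forms)
x_train_oh = to_categorical(x_train, num_classes=forms)
x_test_oh = to_categorical(x_test, num_classes=forms)
y_train_oh = to_categorical(y_train, num_classes=forms)
y_test_oh = to_categorical(y_test, num_classes=forms)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.fit(x_train_oh, y_train_oh, batch_size=batch_size, epochs=15,
          validation_data=(x_test_oh, y_test_oh))

This sketch assumes forms (114) is at least the largest form code plus one, since the preprocessing step reserves 0 for padding and assigns codes starting from 1.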
preprocessing.py
from estnltk import Text
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from tqdm import tqdm
import pickle
# the maximum length of a sentence
MAXLEN = 70
# set sequence length and step for sentences splitting
SEQUENCE_LEN = 3
STEP = 1
articles_file = 'articles.txt'
encoded_forms_file = 'encoded_forms.csv'
next_words_file = 'next_words'
sequences_file = 'sequences'
def encode_forms():
    # load data
    articles = Text(open(articles_file, encoding='utf-8').read())
    # transform to an array of sentences
    sentences = articles.sentence_texts
    # create an empty dict to store forms like {form: code}
    dict_forms = {}
    # initialize a numpy array prefilled with zeros
    encoded_forms = np.zeros((len(sentences), MAXLEN), dtype=int)
    # loop over all sentences showing a loading bar
    for i in tqdm(range(len(sentences))):
        # split the sentence into a list of lowercase words
        sentences[i] = text_to_word_sequence(sentences[i])
        # loop over the words in the current sentence, truncated to MAXLEN
        for j in range(len(sentences[i][:MAXLEN])):
            form = Text(sentences[i][j]).forms[0]
            # add an unseen form to the dictionary, assigning it the next free code
            if form not in dict_forms:
                dict_forms[form] = len(dict_forms) + 1
            # store the form's code at the word's position
            encoded_forms[i, j] = dict_forms[form]
    np.savetxt(encoded_forms_file, encoded_forms, delimiter="~", fmt='%i')

def set_sequences_and_new_words():
    # create empty lists
    sequences = []
    next_words = []
    # load the input array of encoded forms
    sentences = np.genfromtxt(encoded_forms_file, delimiter='~')
    for i in tqdm(range(len(sentences))):
        sentence = sentences[i]
        # loop over each sentence, splitting it into sequences
        for j in range(0, len(sentence) - SEQUENCE_LEN, STEP):
            # take a window of SEQUENCE_LEN form codes
            sequences.append(sentence[j: j + SEQUENCE_LEN])
            # and the form code that follows it as the target
            next_words.append(sentence[j + SEQUENCE_LEN])
    # save the lists
    print('Saving sequences...')
    with open(sequences_file, 'wb') as fp:
        pickle.dump(sequences, fp)
    print('Saving next_words...')
    with open(next_words_file, 'wb') as fp:
        pickle.dump(next_words, fp)

def read_sequences():
    with open(sequences_file, 'rb') as fp:
        sequences = pickle.load(fp)
    return sequences

def read_next_words():
    with open(next_words_file, 'rb') as fp:
        next_words = pickle.load(fp)
    return next_words
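
Only read_sequences() and read_next_words() are actually called from the training script above; encode_forms() and set_sequences_and_new_words() are defined but not invoked in the shown hunks. A minimal usage sketch of the full pipeline, assuming the steps are meant to run in this order:

from preprocessing import (encode_forms, set_sequences_and_new_words,
                           read_sequences, read_next_words)

encode_forms()                  # articles.txt -> encoded_forms.csv
set_sequences_and_new_words()   # encoded_forms.csv -> 'sequences' and 'next_words' pickles
sequences = read_sequences()    # length-SEQUENCE_LEN windows of form codes
next_words = read_next_words()  # the form code that follows each window
print(len(sequences), len(next_words))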