Finished with encoding and padding the input data

266b978c · Paktalin · 133c10a5 · 266b978c
Commit 266b978c authored Jan 05, 2019 by Paktalin
Showing with 15 additions and 20 deletions
preprocessing.py
--- a/preprocessing.py
+++ b/preprocessing.py
 from estnltk import Text
 import numpy as np
 from keras.preprocessing.text import text_to_word_sequence
-from keras.preprocessing import sequence
-from sklearn.preprocessing import LabelEncoder

 # the maximum length of a sentence
 maxlen = 70
@@ -11,23 +9,20 @@ articles = Text(open('articles.txt', encoding='utf-8').read())
 # transform to an array of sentences
 sentences = articles.sentence_texts

-# construct an empty list for forms lists
-forms = []
-# initialize a label encoder
-label_encoder = LabelEncoder()
-for i in range(10):
-	# insert an empty list for the current sentence
-	forms.append([])
+N = 10
+# create an empty dict to store forms like {form: code}
+dict_forms = {}
+# initialize a zero-filled numpy array of shape (N, maxlen)
+values = np.zeros((N, maxlen), dtype=int)
+for i in range(N):
 	# split the sentence into a list of lowercase words
 	sentences[i] = text_to_word_sequence(sentences[i])
 	# loop over the words in the current sentence
-	for word in sentences[i]:
-		# append the word form to the current sentence forms
-		forms[i].append(Text(word).forms[0])
-	# encode the forms of the current sentence
-	forms[i] = label_encoder.fit_transform(forms[i])
-
-# list of lists into array of lists
-forms = np.array(forms)
-# pad sequences, transforming forms to array
-forms = sequence.pad_sequences(forms, maxlen=maxlen, value=-1)
\ No newline at end of file
+	for j in range(len(sentences[i])):
+		form = Text(sentences[i][j]).forms[0]
+		# add the unseen form to the dictionary
+		if form not in dict_forms:
+			dict_forms[form] = len(dict_forms) + 1
+		# store the form's code at position j of row i in the values array
+		values[i,j] = dict_forms[form]
+print(values)
\ No newline at end of file