Initial commit

133c10a5 · Paktalin · 133c10a5 · 133c10a5
Commit 133c10a5 authored Jan 05, 2019 by Paktalin
Showing with 34 additions and 0 deletions
articles.txt
preprocessing.py
--- a/articles.txt
+++ b/articles.txt
--- a/preprocessing.py
+++ b/preprocessing.py
+from estnltk import Text
+import numpy as np
+from keras.preprocessing.text import text_to_word_sequence
+from keras.preprocessing import sequence
+from sklearn.preprocessing import LabelEncoder
+# the maximum length of a sentence (in words) after padding/truncation
+maxlen = 70
+# how many leading sentences of the text are processed
+num_sentences = 10
+# load data; the context manager closes the file as soon as it has been read
+with open('articles.txt', encoding='utf-8') as articles_file:
+	articles = Text(articles_file.read())
+# transform to an array of sentences
+sentences = articles.sentence_texts
+# construct an empty list for forms lists
+forms = []
+# initialize a label encoder
+label_encoder = LabelEncoder()
+# never index past the end when the text has fewer sentences than requested
+for i in range(min(num_sentences, len(sentences))):
+	# split the sentence into a list of lowercase words
+	sentences[i] = text_to_word_sequence(sentences[i])
+	# collect the word form of every word in the current sentence
+	forms.append([Text(word).forms[0] for word in sentences[i]])
+# flatten the per-sentence lists into one list of every form that was seen
+all_forms = [form for sentence_forms in forms for form in sentence_forms]
+# fit the encoder ONCE on all forms, so that a given form is mapped to the
+# same integer in every sentence (re-fitting per sentence would make the
+# integer ids incomparable between sentences)
+if all_forms:
+	label_encoder.fit(all_forms)
+# encode the forms of each sentence; a sentence without words stays empty
+forms = [
+	label_encoder.transform(sentence_forms) if sentence_forms else []
+	for sentence_forms in forms
+]
+# pad (or truncate) every sentence to maxlen, transforming forms to a 2D array;
+# the list of variable-length sequences is passed directly, no ragged numpy
+# array is needed; -1 marks padding and cannot collide with the encoder's
+# labels, which start at 0
+forms = sequence.pad_sequences(forms, maxlen=maxlen, value=-1)
\ No newline at end of file