Preprocessed forms

7a49bc01 · Paktalin · 266b978c · 7a49bc01 · 7a49bc01
Commit 7a49bc01 authored Jan 05, 2019 by Paktalin
Showing with 9 additions and 8 deletions
encoded_forms.csv
preprocessing.py
--- a/encoded_forms.csv
+++ b/encoded_forms.csv
--- a/preprocessing.py
+++ b/preprocessing.py
 from estnltk import Text
 import numpy as np
 from keras.preprocessing.text import text_to_word_sequence
+from tqdm import tqdm
 # the maximum length of a sentence
 maxlen = 70
@@ -9,20 +10,20 @@ articles = Text(open('articles.txt', encoding='utf-8').read())
 # transform to an array of sentences
 sentences = articles.sentence_texts
-N = 10
 # create an empty dict to store forms like {form: code}
 dict_forms = {}
 # initialize a numpy array prefilled with zeros
-values = np.zeros((N, maxlen), dtype=int)
+encoded_forms = np.zeros((len(sentences), maxlen), dtype=int)
-for i in range(N):
+# loop over all sentences, showing a progress bar
+for i in tqdm(range(len(sentences))):
 	# split the sentence into a list of lowercase words
 	sentences[i] = text_to_word_sequence(sentences[i])
 	# loop over the words in the current sentence
-	for j in range(len(sentences[i])):
+	for j in range(len(sentences[i][:maxlen])):
 		form = Text(sentences[i][j]).forms[0]
-		# add the unseen form to the dictionary
+		# add the unseen form to the dictionary, assigning it the next unused code (dictionary size + 1)
 		if form not in dict_forms:
 			dict_forms[form] = len(dict_forms) + 1
 		# store the form's code at position (i, j) of the output array
-		values[i,j] = dict_forms[form]
+		encoded_forms[i,j] = dict_forms[form]
-print(values)
+np.savetxt("encoded_forms.csv", encoded_forms, delimiter="~", fmt='%i')
\ No newline at end of file