Commit 266b978c by Paktalin

Finished with encoding and padding the input data

parent 133c10a5
Showing with 15 additions and 20 deletions
from estnltk import Text
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
# the maximum length of a sentence
maxlen = 70
......@@ -11,23 +9,20 @@ articles = Text(open('articles.txt', encoding='utf-8').read())
# transform to an array of sentences
sentences = articles.sentence_texts
# construct an empty list that will hold one list of forms per sentence
forms = []
# initialize a label encoder
label_encoder = LabelEncoder()
for i in range(10):
# insert an empty list for the current sentence
forms.append([])
N = 10
# create an empty dict to store forms like {form: code}
dict_forms = {}
# initialize a numpy array prefilled with zeros
values = np.zeros((N, maxlen), dtype=int)
for i in range(N):
# split the sentence into a list of lowercase words
sentences[i] = text_to_word_sequence(sentences[i])
# loop over the words in the current sentence
for word in sentences[i]:
# append the word form to the current sentence forms
forms[i].append(Text(word).forms[0])
# encode the forms of the current sentence
forms[i] = label_encoder.fit_transform(forms[i])
# convert the list of lists into an array of lists
forms = np.array(forms)
# pad sequences, transforming forms to array
forms = sequence.pad_sequences(forms, maxlen=maxlen, value=-1)
\ No newline at end of file
for j in range(len(sentences[i])):
form = Text(sentences[i][j]).forms[0]
# add the unseen form to the dictionary
if form not in dict_forms:
dict_forms[form] = len(dict_forms) + 1
# store the form's code at the current word's position in the array
values[i,j] = dict_forms[form]
print(values)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment