Commit 266b978c by Paktalin

Finished with encoding and padding the input data

parent 133c10a5
Showing with 15 additions and 20 deletions
from estnltk import Text from estnltk import Text
import numpy as np import numpy as np
from keras.preprocessing.text import text_to_word_sequence from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
# the maximum length of a sentence # the maximum length of a sentence
maxlen = 70 maxlen = 70
...@@ -11,23 +9,20 @@ articles = Text(open('articles.txt', encoding='utf-8').read()) ...@@ -11,23 +9,20 @@ articles = Text(open('articles.txt', encoding='utf-8').read())
# transform to an array of sentences # transform to an array of sentences
sentences = articles.sentence_texts sentences = articles.sentence_texts
# construct an empty list for forms lists N = 10
forms = [] # create an empty dict to store forms like {form: code}
# initialize a label encoder dict_forms = {}
label_encoder = LabelEncoder() # initialize a numpy array prefilled with zeros
for i in range(10): values = np.zeros((N, maxlen), dtype=int)
# insert an empty list for the current sentence for i in range(N):
forms.append([])
# split the sentence into a list of lowercase words # split the sentence into a list of lowercase words
sentences[i] = text_to_word_sequence(sentences[i]) sentences[i] = text_to_word_sequence(sentences[i])
# loop over the words in the current sentence # loop over the words in the current sentence
for word in sentences[i]: for j in range(len(sentences[i])):
# append the word form to the current sentence forms form = Text(sentences[i][j]).forms[0]
forms[i].append(Text(word).forms[0]) # add the unseen form to the dictionary
# encode the forms of the current sentence if form not in dict_forms:
forms[i] = label_encoder.fit_transform(forms[i]) dict_forms[form] = len(dict_forms) + 1
# store the form's code at the current position
# list of lists into array of lists values[i,j] = dict_forms[form]
forms = np.array(forms) print(values)
# pad sequences, transforming forms to array \ No newline at end of file
forms = sequence.pad_sequences(forms, maxlen=maxlen, value=-1)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment