Commit 133c10a5 by Paktalin

Initial commit

parents
Showing with 34 additions and 0 deletions
This source diff could not be displayed because it is too large. You can view the blob instead.
from estnltk import Text
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
# Preprocessing script: reads a corpus, tokenises its first sentences, looks up
# a word form for every token, encodes the forms as integers and pads the
# result so that every sentence has the same length.

# The maximum length of a sentence; longer sequences are truncated and shorter
# ones padded to this length by pad_sequences below.
maxlen = 70
# How many sentences from the start of the corpus are processed.
num_sentences = 10

# Load data. The context manager closes the file handle once it has been read.
with open('articles.txt', encoding='utf-8') as articles_file:
    articles = Text(articles_file.read())

# Transform to a list of sentences.
sentences = articles.sentence_texts

# Do not index past the end of a corpus shorter than the requested count.
num_sentences = min(num_sentences, len(sentences))

# One list of word forms per processed sentence.
forms = []
for i in range(num_sentences):
    # Split the sentence into a list of lowercase words. The tokenised list is
    # stored back into `sentences`, as the original script did.
    sentences[i] = text_to_word_sequence(sentences[i])
    # Keep the first entry of `Text(word).forms` for every word.
    forms.append([Text(word).forms[0] for word in sentences[i]])

# Fit the encoder ONCE on the forms of all processed sentences. Re-fitting it
# per sentence would give the same form a different integer in each sentence
# and leave the encoder knowing only the last sentence's vocabulary.
label_encoder = LabelEncoder()
label_encoder.fit([form for sentence_forms in forms for form in sentence_forms])

# Encode each sentence with the shared vocabulary.
forms = [label_encoder.transform(sentence_forms) for sentence_forms in forms]

# Pad sequences, transforming forms to an array. The ragged list is passed
# directly: wrapping it in np.array() first is unnecessary and fails on recent
# NumPy versions for sequences of unequal length. The padding value -1 cannot
# collide with a real label because LabelEncoder codes start at 0.
forms = sequence.pad_sequences(forms, maxlen=maxlen, value=-1)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment