Commit 7a49bc01 by Paktalin

Preprocessed forms

parent 266b978c
Showing with 9 additions and 8 deletions
This source diff could not be displayed because it is too large. You can view the blob instead.
from estnltk import Text from estnltk import Text
import numpy as np import numpy as np
from keras.preprocessing.text import text_to_word_sequence from keras.preprocessing.text import text_to_word_sequence
from tqdm import tqdm
# the maximum length of a sentence # the maximum length of a sentence
maxlen = 70 maxlen = 70
...@@ -9,20 +10,20 @@ articles = Text(open('articles.txt', encoding='utf-8').read()) ...@@ -9,20 +10,20 @@ articles = Text(open('articles.txt', encoding='utf-8').read())
# transform to an array of sentences # transform to an array of sentences
sentences = articles.sentence_texts sentences = articles.sentence_texts
N = 10
# create an empty dict to store forms like {form: code} # create an empty dict to store forms like {form: code}
dict_forms = {} dict_forms = {}
# initialize a numpy array prefilled with zeros # initialize a numpy array prefilled with zeros
values = np.zeros((N, maxlen), dtype=int) encoded_forms = np.zeros((len(sentences), maxlen), dtype=int)
for i in range(N): # loop over all sentences showing a loading bar
for i in tqdm(range(len(sentences))):
# split the sentence into a list of lowercase words # split the sentence into a list of lowercase words
sentences[i] = text_to_word_sequence(sentences[i]) sentences[i] = text_to_word_sequence(sentences[i])
# loop over the words in the current sentence # loop over the words in the current sentence
for j in range(len(sentences[i])): for j in range(len(sentences[i][:maxlen])):
form = Text(sentences[i][j]).forms[0] form = Text(sentences[i][j]).forms[0]
# add the unseen form to the dictionary # add the unseen form to the dictionary, giving it the next code value (number of stored forms + 1)
if form not in dict_forms: if form not in dict_forms:
dict_forms[form] = len(dict_forms) + 1 dict_forms[form] = len(dict_forms) + 1
# store the form's code at the current word's position # store the form's code at the current word's position
values[i,j] = dict_forms[form] encoded_forms[i,j] = dict_forms[form]
print(values) np.savetxt("encoded_forms.csv", encoded_forms, delimiter="~", fmt='%i')
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment