Commit f5c0f0ab by Paktalin

preprocessing with MAXLEN. Has to be rewritten

parent f334d8dd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Bidirectional, Dense, Activation, LSTM, Dropout
import pickle
from preprocessing import read_sequences, read_next_words, SEQUENCE_LEN
# number of distinct form codes (size of the model's output layer)
forms = 114
batch_size = 128
# read sequences and next words from files
sequences = read_sequences()
next_words = read_next_words()
# split training and test sets
print('Splitting test and training sets...')
x_train, x_test, y_train, y_test = train_test_split(sequences, next_words, test_size=0.33)
x_train, x_test = np.array(x_train), np.array(x_test)
print(x_train[0])
print('Defining the model...')
dropout = 0.2
model = Sequential()
model.add(Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, forms)))
# apply dropout between the recurrent layer and the output layer
if dropout > 0:
    model.add(Dropout(dropout))
model.add(Dense(forms))
model.add(Activation('softmax'))
print('Compiling the model...')
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
print('Fitting the data...')
model.fit(x_train, y_train, batch_size=batch_size, epochs=15, validation_data=(x_test, y_test))
print('Saving the model...')
model.save('lstm.h5')
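
As committed, the sequences and next words are plain integer form codes, while the Bidirectional LSTM is declared with input_shape=(SEQUENCE_LEN, forms) and ends in a softmax over forms classes, which implies one-hot input windows and one-hot targets. Below is a minimal sketch of that missing encoding step, assuming keras.utils.to_categorical is acceptable here and with the loss switched to categorical_crossentropy for the multi-class softmax; the *_oh names are placeholders, not part of the commit.

from keras.utils import to_categorical

# one-hot encode the integer form codes so the shapes match the model:
# x_* becomes (n_samples, SEQUENCE_LEN, forms), y_* becomes (n_samples, forms)
x_train_oh = to_categorical(x_train, num_classes=forms)
x_test_oh = to_categorical(x_test, num_classes=forms)
y_train_oh = to_categorical(y_train, num_classes=forms)
y_test_oh = to_categorical(y_test, num_classes=forms)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.fit(x_train_oh, y_train_oh, batch_size=batch_size, epochs=15,
          validation_data=(x_test_oh, y_test_oh))

This sketch assumes forms (114) is at least the largest form code plus one, since the preprocessing step reserves 0 for padding and assigns codes starting from 1.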
preprocessing.py
from estnltk import Text
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from tqdm import tqdm
import pickle
# the maximum length of a sentence
MAXLEN = 70
# set sequence length and step for sentences splitting
SEQUENCE_LEN = 3
STEP = 1
articles_file = 'articles.txt'
encoded_forms_file = 'encoded_forms.csv'
next_words_file = 'next_words'
sequences_file = 'sequences'
def encode_forms():
    # load data
    articles = Text(open(articles_file, encoding='utf-8').read())
    # transform to an array of sentences
    sentences = articles.sentence_texts
    # create an empty dict to store forms like {form: code}
    dict_forms = {}
    # initialize a numpy array prefilled with zeros
    encoded_forms = np.zeros((len(sentences), MAXLEN), dtype=int)
    # loop over all sentences showing a loading bar
    for i in tqdm(range(len(sentences))):
        # split the sentence into a list of lowercase words
        sentences[i] = text_to_word_sequence(sentences[i])
        # loop over the words in the current sentence, truncated to MAXLEN
        for j in range(len(sentences[i][:MAXLEN])):
            form = Text(sentences[i][j]).forms[0]
            # add an unseen form to the dictionary, assigning it the next free code
            if form not in dict_forms:
                dict_forms[form] = len(dict_forms) + 1
            # store the form's code at the word's position
            encoded_forms[i, j] = dict_forms[form]
    np.savetxt(encoded_forms_file, encoded_forms, delimiter="~", fmt='%i')

def set_sequences_and_new_words():
    # create empty lists
    sequences = []
    next_words = []
    # load the input array of encoded forms
    sentences = np.genfromtxt(encoded_forms_file, delimiter='~')
    for i in tqdm(range(len(sentences))):
        sentence = sentences[i]
        # loop over each sentence, splitting it into sequences
        for j in range(0, len(sentence) - SEQUENCE_LEN, STEP):
            # take a window of SEQUENCE_LEN form codes
            sequences.append(sentence[j: j + SEQUENCE_LEN])
            # and the form code that follows it as the target
            next_words.append(sentence[j + SEQUENCE_LEN])
    # save the lists
    print('Saving sequences...')
    with open(sequences_file, 'wb') as fp:
        pickle.dump(sequences, fp)
    print('Saving next_words...')
    with open(next_words_file, 'wb') as fp:
        pickle.dump(next_words, fp)

def read_sequences():
    with open(sequences_file, 'rb') as fp:
        sequences = pickle.load(fp)
    return sequences

def read_next_words():
    with open(next_words_file, 'rb') as fp:
        next_words = pickle.load(fp)
    return next_words
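
Only read_sequences() and read_next_words() are actually called from the training script above; encode_forms() and set_sequences_and_new_words() are defined but not invoked in the shown hunks. A minimal usage sketch of the full pipeline, assuming the steps are meant to run in this order:

from preprocessing import (encode_forms, set_sequences_and_new_words,
                           read_sequences, read_next_words)

encode_forms()                  # articles.txt -> encoded_forms.csv
set_sequences_and_new_words()   # encoded_forms.csv -> 'sequences' and 'next_words' pickles
sequences = read_sequences()    # length-SEQUENCE_LEN windows of form codes
next_words = read_next_words()  # the form code that follows each window
print(len(sequences), len(next_words))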