Commit afdb23fe by Paktalin

Training results with sequences.csv

parent f5c0f0ab
File added
File added
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Bidirectional, Dense, Activation, LSTM, Dropout
from preprocessing import read_sequences, read_next_words, SEQUENCE_LEN
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding
from util import read_array
from keras.utils import to_categorical
# size of the last input axis and of the Dense output layer in the Bidirectional-LSTM model below
forms = 114
# mini-batch size passed to the script-level model.fit call
batch_size = 128
# number of classes for to_categorical, and the Embedding input size / final Dense size in train()
VOCAB_SIZE = 85
# read sequences and next words from files
sequences = read_sequences()
next_words = read_next_words()
def get_train_test_val(random_state=42):
    """Load 'sequences.csv' and split it into train, test and validation sets.

    Every row is separated into inputs (all columns but the last) and a
    target (the last column); the target is one-hot encoded over VOCAB_SIZE
    classes. 20% of the rows are held out, and that hold-out is split again
    80/20 into a test set and a validation set.

    Args:
        random_state: Seed forwarded to both train_test_split calls. The
            fixed default makes repeated calls return the same partition, so
            code that reloads a saved model and calls this function again is
            evaluated on the same test rows instead of a fresh random draw.
            Pass None to get a different random split on every call.

    Returns:
        Tuple (x_train, y_train, x_test, y_test, x_validate, y_validate).
    """
    data = read_array('sequences.csv')
    X, y = data[:, :-1], data[:, -1]
    y = to_categorical(y, num_classes=VOCAB_SIZE)
    # first split: 80% training rows, 20% hold-out
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=random_state)
    # second split: the hold-out becomes 80% test rows and 20% validation rows
    x_test, x_validate, y_test, y_validate = train_test_split(
        x_holdout, y_holdout, test_size=0.2, random_state=random_state)
    return x_train, y_train, x_test, y_test, x_validate, y_validate
# split training and test sets (test_size=0.33 holds out a third of the samples)
print('Splitting test and training sets...')
x_train, x_test, y_train, y_test = train_test_split(sequences, next_words, test_size=0.33)
# convert the input partitions to numpy arrays
x_train, x_test = np.array(x_train), np.array(x_test)
# show the first training sample
print(x_train[0])
# Train a next-token model: load the split data, build the network, fit, save and evaluate.
# NOTE(review): the lines of this listing carry no indentation and seem to mix two
# revisions of the script; `model` is bound twice and compiled/fitted twice below.
# TODO confirm which statements belong to the current train() body.
def train():
# load the train / test / validation partitions
x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
# number of columns per input sample; passed to the Embedding layer as input_length
seq_length = x_train.shape[1]
# print the shapes of the inputs and of the targets for the three partitions
print(x_train.shape, x_validate.shape, x_test.shape)
print(y_train.shape, y_validate.shape, y_test.shape)
# define model: Embedding -> LSTM (returning sequences) -> LSTM -> Dense(relu) -> Dense(softmax)
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 50, input_length=seq_length))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(100, activation='relu'))
model.add(Dense(VOCAB_SIZE, activation='softmax'))
print(model.summary())
# NOTE(review): a second Sequential model is built here and rebinds `model`
# (Bidirectional LSTM -> optional Dropout -> Dense(forms) -> softmax);
# presumably these lines come from the earlier revision — TODO confirm
print('Defining the model...')
dropout = 0.2
model = Sequential()
model.add(Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, forms)))
if dropout > 0:
model.add(Dropout(dropout))
model.add(Dense(forms))
model.add(Activation('softmax'))
# compile with categorical cross-entropy and adam, then fit for 10 epochs with validation data
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_validate, y_validate))
print('Saving the model...')
model.save('lstm_test_validation_10_epoch_50_lstms.h5')
# print the result of evaluating the model on the test partition
print(model.evaluate(x_test, y_test))
# NOTE(review): second compile/fit/save sequence (rmsprop, binary_crossentropy,
# 15 epochs, no validation data); presumably from the earlier revision — TODO confirm
print('Compiling the model...')
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
print('Fitting the data...')
model.fit(x_train, y_train, batch_size=batch_size, epochs=15)
print('Saving the model...')
model.save('lstm.h5')
\ No newline at end of file
def train_saved_model(model_path='lstm_test_validation_34_epochs.h5',
                      output_path='lstm_test_validation_35_epochs.h5',
                      epochs=1, batch_size=128):
    """Load a saved model, train it further, save it and print its test score.

    The checkpoint names and training settings used to be hard-coded, which
    forced a source edit for every continuation run. They are now keyword
    parameters whose defaults reproduce the original behaviour.

    Args:
        model_path: File of the saved model to load.
        output_path: File the further-trained model is saved to.
        epochs: Number of additional epochs to fit.
        batch_size: Mini-batch size used while fitting.
    """
    model = load_model(model_path)
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(x_validate, y_validate))
    print('Saving the model...')
    model.save(output_path)
    # print the result of evaluating the updated model on the test partition
    print(model.evaluate(x_test, y_test))


train_saved_model()
\ No newline at end of file
No preview for this file type
from estnltk import Text
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tqdm import tqdm
import pickle
from util import save_list, read_list, save_array, read_array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# the maximum length of a sentence (number of columns in the encoded forms matrix)
MAXLEN = 70
# set sequence length and step for sentences splitting:
# SEQUENCE_LEN consecutive codes form one sequence, windows start every STEP positions
SEQUENCE_LEN = 3
STEP = 1
# NOTE(review): not used in the visible part of this module; presumably the number
# of distinct form codes, matching VOCAB_SIZE in the training script — TODO confirm
VOCAB_SIZE = 85
# UTF-8 text file the forms are extracted from
articles_file = 'articles.txt'
# '~'-delimited matrix of encoded forms, read by set_sequences_and_new_words()
encoded_forms_file = 'encoded_forms.csv'
# pickle files holding the next words and the sequences
next_words_file = 'next_words'
sequences_file = 'sequences'
# file the list of per-sentence form strings is saved to with save_list()
forms_file = 'forms'
# NOTE(review): this listing has no indentation and appears to interleave two
# revisions of the same routine: encode_forms(), which fills a numeric matrix and
# writes it to "encoded_forms.csv", and save_forms_and_sequences(), which builds one
# '~'-joined string of forms per sentence and saves the list. TODO confirm which
# statements belong to which function.
def encode_forms():
# load data
def save_forms_and_sequences():
# load the input data
articles = Text(open(articles_file, encoding='utf-8').read())
# transform to an array of sentences
sentences = articles.sentence_texts
# create an empty dict to store forms like {form: code}
dict_forms = {}
# initialize a numpy array prefilled with zeros: one row per sentence, MAXLEN columns
encoded_forms = np.zeros((len(sentences), MAXLEN), dtype=int)
# list of per-sentence strings of forms, filled in the loop below
forms = []
# loop over all the sentences, showing a progress bar
for i in tqdm(range(len(sentences))):
forms.append('')
# split the sentence into a list of lowercase words
sentences[i] = text_to_word_sequence(sentences[i])
# loop over the words in the current sentence (at most MAXLEN of them)
for j in range(len(sentences[i][:MAXLEN])):
form = Text(sentences[i][j]).forms[0]
# give an unseen form the next free code; codes start at 1
if form not in dict_forms:
dict_forms[form] = len(dict_forms) + 1
# store the form's code at position (i, j)
encoded_forms[i,j] = dict_forms[form]
# write the encoded matrix as '~'-delimited integers
np.savetxt("encoded_forms.csv", encoded_forms, delimiter="~", fmt='%i')
sentence = text_to_word_sequence(sentences[i])
for word in sentence:
form = Text(word).forms[0]
# replace an empty form with a single space
if form == '':
form = ' '
# append the new form to the forms[i] string, using '~' as the separator
forms[i] = forms[i] + '~' + form
# save forms list
save_list(forms, forms_file)
def set_sequences_and_new_words():
    """Cut every encoded sentence into fixed-length windows and pickle the result.

    Reads the '~'-delimited matrix from encoded_forms_file. For each row,
    every window of SEQUENCE_LEN consecutive values (window starts advance by
    STEP) is collected together with the value that directly follows it.
    The windows are pickled to sequences_file and the following values to
    next_words_file.
    """
    encoded = np.genfromtxt(encoded_forms_file, delimiter='~')
    windows, targets = [], []
    for row in tqdm(encoded):
        # the last usable start leaves room for the window plus its next value
        for start in range(0, len(row) - SEQUENCE_LEN, STEP):
            stop = start + SEQUENCE_LEN
            windows.append(row[start:stop])
            targets.append(row[stop])
    # save both lists, announcing each one before it is written
    outputs = (
        ('Saving sequences...', sequences_file, windows),
        ('Saving next_words...', next_words_file, targets),
    )
    for message, path, payload in outputs:
        print(message)
        with open(path, 'wb') as sink:
            pickle.dump(payload, sink)
def read_sequences():
    """Return the object unpickled from the file named by sequences_file."""
    with open(sequences_file, mode='rb') as source:
        return pickle.load(source)
def read_next_words():
    """Return the object unpickled from the file named by next_words_file."""
    with open(next_words_file, mode='rb') as source:
        return pickle.load(source)
\ No newline at end of file
# tokenize the forms: every entry of `forms` is one '~'-separated string
tokenizer = Tokenizer(split='~')
tokenizer.fit_on_texts(forms)
sequences = tokenizer.texts_to_sequences(forms)
# pad sequences to MAXLEN; the named constant replaces the literal 70 so the
# padded length cannot drift from the module-level sentence length
sequences = pad_sequences(sequences, MAXLEN)
sequences = np.array(sequences)
# write the padded matrix to 'sequences.csv'
save_array(sequences, 'sequences.csv')
\ No newline at end of file
No preview for this file type
This source diff could not be displayed because it is too large. You can view the blob instead.
import pickle, numpy
def save_list(list, file_name):
    """Pickle `list` into the binary file `file_name`."""
    with open(file_name, mode='wb') as sink:
        pickle.dump(list, file=sink)
def read_list(file_name):
    """Return the object unpickled from the binary file `file_name`."""
    with open(file_name, mode='rb') as source:
        return pickle.load(source)
def save_array(array, file_name):
    """Write `array` to the text file `file_name` as '~'-separated integers."""
    text_format = {'fmt': '%i', 'delimiter': '~'}
    numpy.savetxt(file_name, array, **text_format)
def read_array(file_name, dtype=float):
    """Load a '~'-delimited text file into a numpy array.

    save_array() writes integers, but reading with the default float dtype
    returns them as floats. Pass dtype=int to get integer values back, e.g.
    when the array holds token codes.

    Args:
        file_name: Path of the '~'-delimited text file to read.
        dtype: Data type of the returned array. The default, float,
            reproduces the original behaviour.

    Returns:
        The array parsed by numpy.genfromtxt.
    """
    return numpy.genfromtxt(file_name, delimiter='~', dtype=dtype)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment