Commit afdb23fe by Paktalin

Training results with sequences.csv

parent f5c0f0ab
File added
File added
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding
from util import read_array
from keras.utils import to_categorical

VOCAB_SIZE = 85

def get_train_test_val():
    # read the padded sequences; the last column is the label (next form)
    sequences = read_array('sequences.csv')
    X, y = sequences[:, :-1], sequences[:, -1]
    y = to_categorical(y, num_classes=VOCAB_SIZE)
    # 80% train; the remaining 20% is split again into 16% test and 4% validation
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    x_test, x_validate, y_test, y_validate = train_test_split(x_test, y_test, test_size=0.2)
    return x_train, y_train, x_test, y_test, x_validate, y_validate

def train():
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    seq_length = x_train.shape[1]
    print(x_train.shape, x_validate.shape, x_test.shape)
    print(y_train.shape, y_validate.shape, y_test.shape)
    # define the model
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, 50, input_length=seq_length))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(VOCAB_SIZE, activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_validate, y_validate))
    print('Saving the model...')
    model.save('lstm_test_validation_10_epoch_50_lstms.h5')
    print(model.evaluate(x_test, y_test))

def train_saved_model():
    # continue training the previously saved checkpoint for one more epoch
    model = load_model('lstm_test_validation_34_epochs.h5')
    x_train, y_train, x_test, y_test, x_validate, y_validate = get_train_test_val()
    model.fit(x_train, y_train, epochs=1, batch_size=128, validation_data=(x_validate, y_validate))
    print('Saving the model...')
    model.save('lstm_test_validation_35_epochs.h5')
    print(model.evaluate(x_test, y_test))

train_saved_model()
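A minimal usage sketch, not part of this commit: querying the saved checkpoint for a next-form prediction. The checkpoint name and the choice of the first row as the context window are assumptions for illustration.

import numpy as np
from keras.models import load_model
from util import read_array

# load the checkpoint saved by train_saved_model() (assumed name)
model = load_model('lstm_test_validation_35_epochs.h5')
sequences = read_array('sequences.csv')
context = sequences[:1, :-1]        # one 69-token context window, matching the training input
probs = model.predict(context)      # probability distribution of shape (1, VOCAB_SIZE)
print(np.argmax(probs, axis=-1))    # index of the most likely next form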
from estnltk import Text
import numpy as np
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tqdm import tqdm
import pickle
from util import save_list, read_list, save_array, read_array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# set sequence length and step for sentence splitting
SEQUENCE_LEN = 3
STEP = 1
VOCAB_SIZE = 85

articles_file = 'articles.txt'
encoded_forms_file = 'encoded_forms.csv'
next_words_file = 'next_words'
sequences_file = 'sequences'
forms_file = 'forms'

def save_forms_and_sequences():
    # load the input data
    articles = Text(open(articles_file, encoding='utf-8').read())
    # transform to an array of sentences
    sentences = articles.sentence_texts
    forms = []
    # loop over all the sentences
    for i in tqdm(range(len(sentences))):
        forms.append('')
        # split the sentence into a list of lowercase words
        sentence = text_to_word_sequence(sentences[i])
        for word in sentence:
            form = Text(word).forms[0]
            if form == '':
                form = ' '
            # append the new form to the forms[i] string
            forms[i] = forms[i] + '~' + form
    # save the forms list
    save_list(forms, forms_file)
    # tokenize the '~'-separated forms
    tokenizer = Tokenizer(split='~')
    tokenizer.fit_on_texts(forms)
    sequences = tokenizer.texts_to_sequences(forms)
    # pad the sequences to a fixed length of 70
    sequences = pad_sequences(sequences, 70)
    sequences = np.array(sequences)
    save_array(sequences, 'sequences.csv')
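To make the tokenization step concrete, here is a toy run of the Tokenizer/pad_sequences pipeline used above; the '~'-separated form strings are invented for demonstration.

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

forms = ['~sg n~sg g~sg n', '~sg n~pl n']   # invented form tags, '~'-separated as built above
tokenizer = Tokenizer(split='~')
tokenizer.fit_on_texts(forms)
print(tokenizer.word_index)                  # e.g. {'sg n': 1, 'sg g': 2, 'pl n': 3}
sequences = tokenizer.texts_to_sequences(forms)
print(pad_sequences(sequences, 5))           # [[0 0 1 2 1], [0 0 0 1 3]], left-padded with zeros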
import pickle, numpy

def save_list(lst, file_name):
    # pickle the list to a binary file
    with open(file_name, 'wb') as fp:
        pickle.dump(lst, fp)

def read_list(file_name):
    # load a pickled list back from disk
    with open(file_name, 'rb') as fp:
        return pickle.load(fp)

def save_array(array, file_name):
    # write the array as '~'-delimited integers, one row per line
    numpy.savetxt(file_name, array, fmt='%i', delimiter='~')

def read_array(file_name):
    # read a '~'-delimited array back from disk
    return numpy.genfromtxt(file_name, delimiter='~')
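A quick roundtrip check of the array helpers above (toy data, not from the repository). One caveat worth knowing: numpy.genfromtxt defaults to float, so integers written by save_array come back as float64.

import numpy as np
from util import save_array, read_array

a = np.array([[1, 2, 3], [4, 5, 6]])
save_array(a, 'demo.csv')       # each row written as '1~2~3'
b = read_array('demo.csv')      # values come back as float64
print(b.astype(int))            # cast back if integer codes are needed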