Commit 29f76cce by Paktalin

Created a dataframe of verbs with forms and POS tags used as features

parent 043c934f
...@@ -2,6 +2,7 @@ from estnltk import Text ...@@ -2,6 +2,7 @@ from estnltk import Text
from util import save_dict, load_dict, save_csv, read_csv from util import save_dict, load_dict, save_csv, read_csv
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np
def map_verbs_with_sentences(): def map_verbs_with_sentences():
verbs = {} verbs = {}
...@@ -41,18 +42,38 @@ def verbs_dict_to_df(): ...@@ -41,18 +42,38 @@ def verbs_dict_to_df():
i += 1 i += 1
return pd.DataFrame.from_dict(rows_list) return pd.DataFrame.from_dict(rows_list)
def clean_dataframe(df):
    """Drop unusable rows from the verb/noun-like samples and save the result.

    Rows are removed when ``noun_like_form`` is null, contains the ``|``
    separator (several forms), or equals the unknown marker ``?``, and when
    ``sentence`` contains the text ``A post shared by``.  The remaining rows
    are written to ``cleaned_dataframe.csv`` (``~``-separated), and the
    sentences whose ``distance`` is greater than 100 are printed.

    Args:
        df: DataFrame with at least the columns ``noun_like_form``,
            ``sentence`` and ``distance``.

    Returns:
        The filtered DataFrame; the frame passed in is not modified.
    """
    df = df[pd.notnull(df['noun_like_form'])]  # remove examples with null forms
    # Literal matching: the former '\|' pattern relied on an invalid string escape.
    df = df[~df['noun_like_form'].str.contains('|', regex=False)]  # remove examples with several forms
    df = df[~df['sentence'].str.contains('A post shared by', regex=False)]
    df = df[df['noun_like_form'] != '?']  # remove examples with unknown forms
    save_csv(df, 'cleaned_dataframe.csv', sep='~')
    print(df[df['distance'] > 100]['sentence'])
    return df
def add_value_to_dict(value, dictionary, distance):
    """Add a distance-weighted count for ``value`` to ``dictionary`` in place.

    Each call contributes ``1 / distance`` to the entry for ``value``, so
    occurrences at a smaller distance weigh more.

    Args:
        value: Key whose accumulated weight is increased.
        dictionary: Mapping from keys to accumulated weights; mutated in place.
        distance: Non-zero number used as the divisor.

    Raises:
        ZeroDivisionError: If ``distance`` is 0; ``dictionary`` is left
            unchanged in that case.
    """
    # Compute the weight first so a failing division cannot leave a stray 0 entry.
    weight = 1 / distance
    dictionary[value] = dictionary.get(value, 0) + weight
def construct_df_of_verbs(initial_df):
    """Build one feature row per verb from the per-sample frame and save it.

    For every verb loaded from ``verbs_dict`` a row is created holding the
    verb, its number of samples in ``initial_df``, and one distance-weighted
    count (see ``add_value_to_dict``) for each ``noun_like_form``,
    ``noun_like_pos`` and ``verbs_form`` value seen among its samples.
    Verbs without samples are dropped.  The table is written to ``verbs.csv``
    (``~``-separated, with a header row) and printed.

    Args:
        initial_df: DataFrame with the columns ``verb``, ``distance``,
            ``noun_like_form``, ``noun_like_pos`` and ``verbs_form``.
    """
    verbs = load_dict('verbs_dict')
    total_verbs = len(verbs)
    rows = []
    for i, verb in enumerate(verbs):
        print('%i/%i %s' % (i, total_verbs, verb))
        # Select this verb's samples once; reused for the count and the loop.
        use_cases = initial_df[initial_df['verb'] == verb]
        row = {'verb': verb, 'number_of_samples': len(use_cases)}
        for _, use_case in use_cases.iterrows():
            distance = use_case['distance']
            add_value_to_dict(use_case['noun_like_form'], row, distance)
            add_value_to_dict(use_case['noun_like_pos'], row, distance)
            add_value_to_dict(use_case['verbs_form'], row, distance)
        rows.append(row)
    verbs_df = pd.DataFrame.from_dict(rows)
    # With no rows there is no 'number_of_samples' column to filter on.
    if not verbs_df.empty:
        verbs_df = verbs_df[verbs_df['number_of_samples'] != 0]
    save_csv(verbs_df, 'verbs.csv', sep='~', header=True)
    print(verbs_df)
# Load the per-verb table from 'verbs.csv' ('~'-separated, first line used as the header) and print it.
# NOTE(review): indentation is lost in this view; these statements are assumed to be module-level — confirm.
df = read_csv('verbs.csv', sep='~', header=0)
print(df)
\ No newline at end of file
...@@ -20,8 +20,8 @@ def write_string_to_file(string, path, mode): ...@@ -20,8 +20,8 @@ def write_string_to_file(string, path, mode):
with io.open(path, mode, encoding='utf-8') as file: with io.open(path, mode, encoding='utf-8') as file:
file.write(string) file.write(string)
def save_csv(df, path, sep=',', header=False):
    """Write ``df`` to ``path`` as delimited text without the index column.

    Args:
        df: DataFrame to write.
        path: Destination file path.
        sep: Field delimiter.
        header: Whether to write the column names as the first line.
    """
    # Explicit UTF-8 keeps the written file in step with read_csv, which decodes as utf8.
    df.to_csv(path, index=False, header=header, sep=sep, encoding='utf-8')
def read_csv(path, sep=',', header=None): def read_csv(path, sep=',', header=None):
df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python') df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment