Commit 5c87a32c by Paktalin

common substring is now a feature

parent f3e44de1
import pandas as pd
from util import get_postimees_urls, get_text
from preprocessing import get_verbs_gf
from preprocessing import get_preprocessed_verbs
import progressbar
import numpy as np
from tqdm import tqdm
# Load the verb table and print it for inspection.
# NOTE(review): `verbs` is assigned twice in a row, so only the result of
# get_preprocessed_verbs() is kept -- confirm the get_verbs_gf() call is
# still wanted.
print("getting verbs...")
verbs = get_verbs_gf()
verbs = get_preprocessed_verbs()
print(verbs)
retrieve links to postimees articles
# retrieve links to postimees articles
print("getting postimees urls...")
postimees_urls = get_postimees_urls()
......
This source diff could not be displayed because it is too large. You can view the blob instead.
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
# Load the verb table: read it with read_csv(), pass the result through
# split_double(), and return the resulting dataframe.
# NOTE(review): read_csv() is called here without arguments, and only the
# `def` line of split_double is visible in this view -- confirm both still
# match this call before relying on this function.
def get_verbs_gf():
# read file as dataframe
df = read_csv()
df = split_double(df)
return df
def get_preprocessed_verbs():
    """Load the preprocessed verb table.

    Reads ``preprocessed_verbs.csv`` through ``read_csv`` using ``","`` as
    the separator and returns whatever ``read_csv`` returns.
    """
    path, separator = "preprocessed_verbs.csv", ","
    verbs = read_csv(path, separator)
    return verbs
def preprocess_verbs():
    """Build the preprocessed verb table and hand it to ``save_csv``.

    Pipeline, in order: ``read_csv("verbs_gf.csv", ", ")`` ->
    ``split_double_forms`` -> ``add_common_substring`` -> ``save_csv``.
    Returns nothing.
    """
    expanded = split_double_forms(read_csv("verbs_gf.csv", ", "))
    save_csv(add_common_substring(expanded))
def read_csv(path="verbs_gf.csv", sep=", "):
    """Read a header-less, UTF-8 encoded delimited text file into a dataframe.

    This single definition replaces two competing ones (a zero-argument
    version and a ``(path, sep)`` version).  The defaults reproduce the
    values the zero-argument version hard-coded, so both ``read_csv()`` and
    ``read_csv(path, sep)`` call styles work.

    Args:
        path: File to read.  Defaults to ``"verbs_gf.csv"``.
        sep: Field separator passed to ``pandas.read_csv``.  Defaults to
            ``", "``.

    Returns:
        The dataframe produced by ``pandas.read_csv``; because
        ``header=None`` is passed, the columns are labelled 0, 1, 2, ...
    """
    # engine='python' is kept from the original call; the default separator
    # is more than one character long.
    df = pd.read_csv(path, sep=sep, encoding='utf8', header=None, engine='python')
    return df
def split_double(df):
def save_csv(df):
    """Write ``df`` to ``preprocessed_verbs.csv`` without index or header row.

    The file is read back with ``header=None`` (see ``read_csv``), so a
    header line written here would come back as a bogus first data row.
    Column labels are therefore not written.

    Args:
        df: The dataframe to persist.
    """
    df.to_csv("preprocessed_verbs.csv", index=False, header=False)
def split_double_forms(df):
for i in range(len(df.index)):
row = df.iloc[i]
split_row = row.str.split('|', expand=True)
......@@ -21,4 +28,22 @@ def split_double(df):
df = df.append(second_form, ignore_index=True)
except Exception as e:
pass
return df
\ No newline at end of file
return df
def add_common_substring(df):
    """Add a ``common_substring`` column holding a substring shared by each row's forms.

    For every row, adjacent columns are compared pairwise with
    ``find_common_substring`` and the pairwise results are folded together
    with the same function.  ``df`` is modified in place, printed, and
    returned.

    Args:
        df: Dataframe whose columns are labelled 0..k-1 and whose rows are
            labelled 0..n-1.

    Returns:
        The same dataframe with the extra ``common_substring`` column.
    """
    df["common_substring"] = ''
    # The new column is the last one, so shape[1] - 2 is the number of
    # adjacent pairs among the original columns.
    pair_starts = range(df.shape[1] - 2)
    for row in range(df.shape[0]):
        # None means "no pair examined yet"; '' is a legitimate result and
        # must not be mistaken for it.
        common = None
        for column in pair_starts:
            verb1 = df.at[row, column]
            verb2 = df.at[row, column + 1]
            # Compare the two *different* forms, not a form with itself.
            current_common = find_common_substring(verb1, verb2)
            if common is None:
                common = current_common
            elif current_common != common:
                common = find_common_substring(current_common, common)
        if common is not None:
            # .at writes into df itself; chained df[col][row] = ... may
            # assign to a temporary copy instead.
            df.at[row, "common_substring"] = common
    print(df)
    return df
def find_common_substring(string1, string2):
    """Return the longest contiguous substring shared by the two strings.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        The longest block found by ``SequenceMatcher.find_longest_match``,
        taken from ``string1``; ``''`` when the strings share nothing.
    """
    # autojunk=False: the default heuristic can discard frequent characters
    # in long inputs, which would shorten the reported match.
    matcher = SequenceMatcher(None, string1, string2, autojunk=False)
    match = matcher.find_longest_match(0, len(string1), 0, len(string2))
    # match.size is a length, not an end index.
    return string1[match.a:match.a + match.size]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment