Commit 5c87a32c by Paktalin

common substring is now a feature

parent f3e44de1
import pandas as pd import pandas as pd
from util import get_postimees_urls, get_text from util import get_postimees_urls, get_text
from preprocessing import get_verbs_gf from preprocessing import get_preprocessed_verbs
import progressbar import progressbar
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
print("getting verbs...") print("getting verbs...")
verbs = get_verbs_gf()
verbs = get_preprocessed_verbs()
print(verbs) print(verbs)
retrieve links to postimees articles # retrieve links to postimees articles
print("getting postimees urls...") print("getting postimees urls...")
postimees_urls = get_postimees_urls() postimees_urls = get_postimees_urls()
......
This source diff could not be displayed because it is too large. You can view the blob instead.
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from difflib import SequenceMatcher
def get_verbs_gf(): def get_preprocessed_verbs():
# read file as dataframe return read_csv("preprocessed_verbs.csv", ",")
df = read_csv()
df = split_double(df) def preprocess_verbs():
return df df = read_csv("verbs_gf.csv", ", ")
df = split_double_forms(df)
df = add_common_substring(df)
save_csv(df)
def read_csv(): def read_csv(path, sep):
df = pd.read_csv("verbs_gf.csv", sep=", ", encoding='utf8', header=None, engine='python') df = pd.read_csv(path, sep=sep, encoding='utf8', header=None, engine='python')
return df return df
def split_double(df): def save_csv(df):
df.to_csv("preprocessed_verbs.csv", index=False)
def split_double_forms(df):
for i in range(len(df.index)): for i in range(len(df.index)):
row = df.iloc[i] row = df.iloc[i]
split_row = row.str.split('|', expand=True) split_row = row.str.split('|', expand=True)
...@@ -22,3 +29,21 @@ def split_double(df): ...@@ -22,3 +29,21 @@ def split_double(df):
except Exception as e: except Exception as e:
pass pass
return df return df
def add_common_substring(df):
    """Add a ``common_substring`` column with the substring shared by a row's verb forms.

    For every row, each pair of neighbouring verb-form columns is compared
    with ``find_common_substring`` and the pairwise results are intersected
    the same way, so the stored value is common to all forms of that row.

    Args:
        df: DataFrame of verb forms. Assumes integer column labels
            ``0..n-1`` and integer row labels ``0..len-1`` (what
            ``pd.read_csv(..., header=None)`` produces) -- the original
            ``df[column][row]`` lookups relied on the same layout.

    Returns:
        The same DataFrame, modified in place, with the extra
        ``common_substring`` column ('' when nothing is shared or the row
        has fewer than two verb columns).
    """
    df["common_substring"] = ''
    # Every column except the one appended above holds a verb form.
    verb_column_count = df.shape[1] - 1
    for row in range(df.shape[0]):
        # None means "no pair compared yet"; an empty string is a real
        # result (nothing in common) and must not be overwritten later.
        common = None
        for column in range(verb_column_count - 1):
            # Compare each form with its right-hand neighbour.
            pair_common = find_common_substring(df.at[row, column], df.at[row, column + 1])
            if common is None:
                common = pair_common
            elif pair_common != common:
                # Narrow the running result to what both share.
                common = find_common_substring(pair_common, common)
        # Single label-based write; chained df[col][row] = ... assignment
        # may silently update a temporary copy instead of df.
        df.at[row, "common_substring"] = '' if common is None else common
    # Kept from the original: dumps the resulting frame to stdout.
    print(df)
    return df
def find_common_substring(string1, string2):
    """Return the longest contiguous substring present in both strings.

    Args:
        string1: First string; the result is sliced out of this one.
        string2: Second string.

    Returns:
        The longest common block found by ``SequenceMatcher``, or '' when
        the strings share no characters.
    """
    match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
    # match.size is a length, not an end index: the block spans
    # string1[match.a : match.a + match.size].
    return string1[match.a:match.a + match.size]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment