Commit f3e44de1 by Paktalin

Second forms are now saved as separate verb rows appended to the end of the dataframe

parent 79266248
...@@ -9,19 +9,19 @@ from tqdm import tqdm ...@@ -9,19 +9,19 @@ from tqdm import tqdm
print("getting verbs...") print("getting verbs...")
verbs = get_verbs_gf() verbs = get_verbs_gf()
print(verbs) print(verbs)
# retrieve links to postimees articles retrieve links to postimees articles
# print("getting postimees urls...") print("getting postimees urls...")
# postimees_urls = get_postimees_urls() postimees_urls = get_postimees_urls()
# print("extracting text from the urls...") print("extracting text from the urls...")
# articles = [] articles = []
# for i in tqdm(range(len(postimees_urls))): for i in tqdm(range(len(postimees_urls))):
# url = postimees_urls[i] url = postimees_urls[i]
# articles.append(get_text(url)) articles.append(get_text(url))
# # try to find a verb in an article # try to find a verb in an article
# for column in verbs: for column in verbs:
# verb_form = verbs.iloc[2][column] verb_form = verbs.iloc[2][column]
# if type(verb_form) is str: if type(verb_form) is str:
# print(verb_form) print(verb_form)
# print(str(articles[0].find(verb_form))) print(str(articles[0].find(verb_form)))
\ No newline at end of file \ No newline at end of file
...@@ -11,18 +11,14 @@ def read_csv(): ...@@ -11,18 +11,14 @@ def read_csv():
df = pd.read_csv("verbs_gf.csv", sep=", ", encoding='utf8', header=None, engine='python') df = pd.read_csv("verbs_gf.csv", sep=", ", encoding='utf8', header=None, engine='python')
return df return df
def split_double(df):
def double(column): for i in range(len(df.index)):
new_column = column.str.split('|', expand=True) row = df.iloc[i]
split_row = row.str.split('|', expand=True)
try: try:
column = new_column[0] second_form = split_row[1]
new_column = new_column[1] second_form[second_form.isnull()] = split_row[0]
df = df.append(second_form, ignore_index=True)
except Exception as e: except Exception as e:
new_column = [None]*len(new_column) pass
return column, new_column
def split_double(df):
for column_name in df.columns:
second_column_name = str(column_name) + "double"
df[column_name], df[second_column_name] = double(df[column_name])
return df return df
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment