Commit 79266248 by Paktalin

created separate columns for double forms

parent e4c2aa75
import pandas as pd import pandas as pd
from util import get_postimees_urls, get_verbs_gf, get_text from util import get_postimees_urls, get_text
from preprocessing import get_verbs_gf
import progressbar import progressbar
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
print("getting verbs...") print("getting verbs...")
verbs = get_verbs_gf() verbs = get_verbs_gf()
print(verbs)
# retrieve links to postimees articles # retrieve links to postimees articles
print("getting postimees urls...") # print("getting postimees urls...")
postimees_urls = get_postimees_urls() # postimees_urls = get_postimees_urls()
print("extracting text from the urls...") # print("extracting text from the urls...")
articles = [] # articles = []
for i in tqdm(range(len(postimees_urls))): # for i in tqdm(range(len(postimees_urls))):
url = postimees_urls[i] # url = postimees_urls[i]
articles.append(get_text(url)) # articles.append(get_text(url))
# try to find a verb in an article # # try to find a verb in an article
for column in verbs: # for column in verbs:
verb_form = verbs.iloc[2][column] # verb_form = verbs.iloc[2][column]
if type(verb_form) is str: # if type(verb_form) is str:
print(verb_form) # print(verb_form)
print(str(articles[0].find(verb_form))) # print(str(articles[0].find(verb_form)))
\ No newline at end of file \ No newline at end of file
import pandas as pd import pandas as pd
import numpy as np
def get_verbs_gf():
    """Return the verb-forms table with double forms in separate columns.

    Loads the table with read_csv() and passes it through split_double(),
    which adds one "<label>double" column per original column.

    Returns:
        The DataFrame produced by split_double().
    """
    return split_double(read_csv())
def read_csv(path="verbs_gf.csv"):
    """Load the verb-forms table from a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to "verbs_gf.csv",
            resolved against the current working directory.

    Returns:
        A DataFrame whose columns carry default integer labels, because
        the file is read without a header row.
    """
    # A separator longer than one character is interpreted as a regular
    # expression, which requires the Python parsing engine.
    return pd.read_csv(
        path,
        sep=", ",
        encoding='utf8',
        header=None,
        engine='python',
    )
def double(column):
    """Split a column of "first|second" strings into two columns.

    Each value is split on the "|" character. Only the first two pieces
    are kept; any further pieces are dropped.

    Args:
        column: A pandas Series of strings.

    Returns:
        A ``(first, second)`` tuple of Series. ``first`` holds the text
        before the first "|" (or the whole value when there is none).
        ``second`` holds the text after it and is missing for rows without
        a "|". When no row contains a "|", ``second`` is an all-None
        Series sharing the index of ``column``, so the return type does
        not depend on the data.
    """
    pieces = column.str.split('|', expand=True)

    # An empty input yields a frame without a column 0; fall back to the
    # input itself in that case.
    first = pieces[0] if 0 in pieces.columns else column

    if 1 in pieces.columns:
        second = pieces[1]
    else:
        # No value contained a separator, so there is no second column to
        # pick; build an aligned placeholder instead.
        second = pd.Series([None] * len(column), index=column.index, dtype=object)
    return first, second
def split_double(df):
    """Split every column of ``df`` into a base column and a "double" column.

    For each existing column, double() is applied; the first result
    overwrites the column and the second is stored under the label
    ``str(label) + "double"``. The frame is modified in place.

    Args:
        df: DataFrame whose columns are to be split.

    Returns:
        The same DataFrame object, with the added columns.
    """
    # Snapshot the labels up front: the loop adds columns as it goes and
    # only the original ones must be processed.
    for label in list(df.columns):
        base_values, double_values = double(df[label])
        df[label] = base_values
        df[str(label) + "double"] = double_values
    return df
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment