Commit fbaa031a by Paktalin

fixed bug in verbs_preprocessing

parent 6f1d415f
This source diff could not be displayed because it is too large.
@@ -14,10 +14,11 @@ postimees_urls = get_postimees_urls()
 print("extracting text from the urls...")
 articles = []
-for i in tqdm(range(len(postimees_urls))): # loading bar
+for i in tqdm(range(3935, len(postimees_urls))): # loading bar
     url = postimees_urls[i]
-    articles.append(get_text(url))
-write_to_file(articles, 'articles.txt')
+    article = get_text(url)
+    write_to_file(article, 'articles.txt', 'a')
 # # try to find a verb in an article
 # for column in verbs:
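The loop no longer buffers every article in a list and writes once at the end; each article is appended to articles.txt as soon as it is fetched, and the hard-coded 3935 start index resumes a run that had already saved that many articles. A minimal sketch of the same resume-and-append pattern, with a stand-in get_text (the repo's real helper lives elsewhere) and the resume index kept as a named constant:

# Hedged sketch, not the repo's code: resume a long scrape and append each
# article as soon as it is fetched, so a crash loses at most one article
# instead of the whole buffered list.
import urllib.request

START = 3935  # index where the previous run stopped; hard-coded in the commit

def get_text(url):
    # Stand-in for the repo's scraper helper.
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8', errors='replace')

def scrape(urls, out_path='articles.txt'):
    for url in urls[START:]:
        text = get_text(url)
        with open(out_path, 'a', encoding='utf-8') as f:  # 'a' = append mode
            f.write(text + '\n')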
@@ -18,5 +18,5 @@ def save_postimees_urls():
            print("Extracted links from %i pages" % page_index)
            break
        page_index += 1
-    write_to_file(url_list, 'postimees_urls.txt')
+    write_to_file(url_list, 'postimees_urls.txt', 'w')
     return url_list
\ No newline at end of file
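Passing the mode explicitly lets one helper serve both call sites: the URL list is rebuilt from scratch on every run ('w' truncates the file), while scraped articles accumulate across runs ('a' appends). The two call styles as they now appear in the commit:

write_to_file(url_list, 'postimees_urls.txt', 'w')  # overwrite: regenerate the URL list
write_to_file(article, 'articles.txt', 'a')         # append: keep previously scraped articles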
from verbs_preprocessing import get_preprocessed_verbs, preprocess_verbs
import re

def extract_sentences(verbs, articles):
    for i in range(len(verbs)):
        verb = verbs["common_substring"][i]
        print(verb)
        occurences = [m.start() for m in re.finditer(verb, articles)]
        print(occurences)

with open('articles.txt', 'r', encoding='utf-8') as articles:
    articles = articles.read().replace('\n', '')

verbs = get_preprocessed_verbs()
extract_sentences(verbs, articles)
\ No newline at end of file
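This new script scans the concatenated article text for each verb's common_substring with re.finditer, which treats the verb as a regular expression; a verb containing regex metacharacters would need re.escape. A hedged sketch of where this seems headed, pulling out the sentence around each hit (the sentence splitter below is an assumption, not part of the commit):

import re

def sentences_with_verb(verb, text):
    # re.escape keeps any regex metacharacters in the verb from being
    # interpreted as pattern syntax.
    pattern = re.compile(re.escape(verb))
    # Naive sentence split on ., ! or ? followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s for s in sentences if pattern.search(s)]

print(sentences_with_verb('luge', 'Ma loen raamatut. Ta luges lehte.'))
# -> ['Ta luges lehte.']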
 import pandas as pd
-import urllib
+import urllib, io
 from bs4 import BeautifulSoup

 def get_verbs_cooljugator():
@@ -15,13 +15,20 @@ def get_verbs_gf():
     return df

 def get_soup(url):
-    page = urllib.request.urlopen(url)
+    page = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
+    try:
+        page = urllib.request.urlopen(page)
+    except Exception as e:
+        print(e)
+        page = urllib.request.Request(url, headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
+        page = urllib.request.urlopen(page)
     return BeautifulSoup(page, 'html.parser')
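Many sites answer urllib's default Python-urllib/3.x agent with HTTP 403, so the rewritten get_soup sends a browser User-Agent and, if the request still fails, retries once with a second agent string. A generalized sketch of the same idea (the agent strings are the ones from the hunk; the loop over a list is an assumption, not the repo's code):

import urllib.request

USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0',
]

def fetch(url):
    # Try each User-Agent in turn; re-raise the last error if all fail.
    last_error = None
    for ua in USER_AGENTS:
        request = urllib.request.Request(url, headers={'User-Agent': ua})
        try:
            return urllib.request.urlopen(request)
        except Exception as e:  # e.g. HTTP 403 from bot filtering
            last_error = e
    raise last_error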
-def write_to_file(list, path):
-    with open(path, 'w') as file:
+def write_to_file(list, path, mode):
+    with io.open(path, mode, encoding='utf-8') as file:
         for line in list:
-            file.write(line + "\n")
+            file.write(line)

 def get_text(article_url):
     article_text = ""
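Note the behavioural change in write_to_file: it now honours the caller's mode, writes UTF-8, and no longer appends "\n", so callers must supply their own newlines. Passing a plain string (as the articles loop now does) also works, because iterating a str yields its characters one at a time. A short usage illustration, assuming the definition above:

write_to_file(['one\n', 'two\n'], 'out.txt', 'w')     # caller supplies the newlines
write_to_file('whole article text\n', 'out.txt', 'a')  # a str is written character by character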
@@ -3,16 +3,16 @@ import numpy as np
 from difflib import SequenceMatcher

 def get_preprocessed_verbs():
-    return read_csv("preprocessed_verbs.csv", ",")
+    return read_csv("preprocessed_verbs.csv", ",", header=0)

 def preprocess_verbs():
-    df = read_csv("verbs_gf.csv", ", ")
+    df = read_csv("verbs_gf.csv", ", ", None)
     df = split_double_forms(df)
     df = add_common_substring(df)
     save_csv(df)

-def read_csv(path, sep):
-    df = pd.read_csv(path, sep=sep, encoding='utf8', header=None, engine='python')
+def read_csv(path, sep, header):
+    df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
     return df

 def save_csv(df):
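The header argument is forwarded straight to pandas: header=0 tells read_csv that the first row holds column names (preprocessed_verbs.csv is saved with a header), while header=None numbers the columns 0, 1, ... (the raw verbs_gf.csv has no header row). A self-contained demonstration on inline CSV data (the sample rows are invented):

import io
import pandas as pd

with_header = "infinitive,past\nlugema,luges\n"
without_header = "lugema, luges\n"

# header=0: the first row becomes the column names.
print(pd.read_csv(io.StringIO(with_header), sep=',', header=0))
# header=None: pandas numbers the columns 0, 1, ...
print(pd.read_csv(io.StringIO(without_header), sep=', ', header=None, engine='python'))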
@@ -25,6 +25,7 @@ def split_double_forms(df):
         try:
             second_form = split_row[1]
             second_form[second_form.isnull()] = split_row[0]
+            df.iloc[i] = split_row[0]
             df = df.append(second_form, ignore_index=True)
         except Exception as e:
             pass
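This looks like the bug the commit message refers to: previously the appended row got the second verb form, but the original row kept the combined double form; df.iloc[i] = split_row[0] now overwrites it with the first form. One plausible restatement using pd.concat, since DataFrame.append was removed in pandas 2.0 (the column name and sample verbs are invented; the commit itself fills missing second forms with the first form rather than dropping them):

import pandas as pd

df = pd.DataFrame({'form': ['lugema/lugeda', 'olema']})
split_row = df['form'].str.split('/', expand=True)  # col 0 = first form, col 1 = second or None
second = split_row[1].dropna().to_frame('form')     # only the rows that had a double form
df['form'] = split_row[0]                           # keep the first form in place
df = pd.concat([df, second], ignore_index=True)
print(df)  # lugema, olema, lugeda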
@@ -37,11 +38,10 @@ def add_common_substring(df):
             verb1 = df[column][row]
             verb2 = df[column+1][row]
             current_common = find_common_substring(verb1, verb1)
-            if df["common_substring"][row] == '':
+            if df["common_substring"][row] == '' and isinstance(current_common, str):
                 df["common_substring"][row] = current_common
             elif current_common != df["common_substring"][row]:
                 df["common_substring"][row] = find_common_substring(current_common, df["common_substring"][row])
-    print(df)
     return df

 def find_common_substring(string1, string2):
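Two things stand out in this hunk. The new isinstance guard skips non-string results, so find_common_substring can evidently yield NaN or another non-str when a form is missing. And the unchanged context line current_common = find_common_substring(verb1, verb1) compares a verb with itself, which always returns the verb unchanged; (verb1, verb2) looks intended. A sketch of what the helper presumably does, given the SequenceMatcher import at the top of the file (this body is an assumption, not the committed code):

from difflib import SequenceMatcher

def find_common_substring(string1, string2):
    # Longest common contiguous substring of the two forms.
    match = SequenceMatcher(None, string1, string2).find_longest_match(
        0, len(string1), 0, len(string2))
    return string1[match.a:match.a + match.size]

print(find_common_substring('lugema', 'lugeda'))  # -> 'luge'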