Commit fbaa031a by Paktalin

fixed bug in verbs_preprocessing

parent 6f1d415f
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -14,10 +14,11 @@ postimees_urls = get_postimees_urls() ...@@ -14,10 +14,11 @@ postimees_urls = get_postimees_urls()
print("extracting text from the urls...") print("extracting text from the urls...")
articles = [] articles = []
for i in tqdm(range(len(postimees_urls))): # loading bar for i in tqdm(range(3935, len(postimees_urls))): # loading bar
url = postimees_urls[i] url = postimees_urls[i]
articles.append(get_text(url)) article = get_text(url)
write_to_file(articles, 'articles.txt') write_to_file(article, 'articles.txt', 'a')
# # try to find a verb in an article # # try to find a verb in an article
# for column in verbs: # for column in verbs:
......
...@@ -18,5 +18,5 @@ def save_postimees_urls(): ...@@ -18,5 +18,5 @@ def save_postimees_urls():
print("Extracted links from %i pages" % page_index) print("Extracted links from %i pages" % page_index)
break break
page_index += 1 page_index += 1
write_to_file(url_list, 'postimees_urls.txt') write_to_file(url_list, 'postimees_urls.txt', 'w')
return url_list return url_list
\ No newline at end of file
from verbs_preprocessing import get_preprocessed_verbs, preprocess_verbs
import re
def extract_sentences(verbs, articles):
    """Locate every occurrence of each verb stem in the article corpus.

    Parameters:
        verbs: DataFrame with a "common_substring" column holding verb stems.
        articles: str — the concatenated article text to search.

    Returns:
        dict mapping each verb stem to a list of match start offsets.
        Also prints each verb and its offsets (preserved from the
        original script behavior).
    """
    occurrences_by_verb = {}
    for verb in verbs["common_substring"]:
        print(verb)
        # re.escape: stems are literal text, not regex patterns —
        # unescaped metacharacters (e.g. '+', '.') would corrupt the search.
        occurrences = [m.start() for m in re.finditer(re.escape(verb), articles)]
        print(occurrences)
        occurrences_by_verb[verb] = occurrences
    return occurrences_by_verb
# Load the scraped article corpus as one newline-free string, then scan
# it for every preprocessed verb stem.
with open('articles.txt', 'r', encoding='utf-8') as corpus_file:
    articles = corpus_file.read().replace('\n', '')

verbs = get_preprocessed_verbs()
extract_sentences(verbs, articles)
\ No newline at end of file
import pandas as pd import pandas as pd
import urllib import urllib, io
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
def get_verbs_cooljugator(): def get_verbs_cooljugator():
...@@ -15,13 +15,20 @@ def get_verbs_gf(): ...@@ -15,13 +15,20 @@ def get_verbs_gf():
return df return df
def get_soup(url):
    """Fetch *url* and return the parsed BeautifulSoup document.

    Requests the page with a desktop Chrome User-Agent first; if that
    request raises (e.g. the server rejects the client), logs the error
    and retries once with a Firefox User-Agent.  A failure of the retry
    propagates to the caller.

    NOTE(review): reconstructed from a garbled side-by-side diff —
    formatting inferred, runtime strings preserved byte-for-byte.
    """
    page = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    try:
        page = urllib.request.urlopen(page)
    except Exception as e:
        # Best-effort fallback: report the failure, then retry with a
        # different browser signature (the leading space in the UA string
        # is preserved from the original).
        print(e)
        page = urllib.request.Request(url, headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
        page = urllib.request.urlopen(page)
    return BeautifulSoup(page, 'html.parser')
def write_to_file(list, path, mode):
    """Write every element of *list* to *path* as UTF-8 text.

    Parameters:
        list: iterable of strings; elements are written verbatim with no
            separator added between them.
        path: destination file path.
        mode: file mode — 'w' to overwrite, 'a' to append.

    NOTE(review): the parameter name `list` shadows the builtin; kept
    unchanged for caller compatibility.
    """
    with io.open(path, mode, encoding='utf-8') as file:
        # writelines does the same per-element write in one C-level call.
        file.writelines(list)
def get_text(article_url): def get_text(article_url):
article_text = "" article_text = ""
......
...@@ -3,16 +3,16 @@ import numpy as np ...@@ -3,16 +3,16 @@ import numpy as np
from difflib import SequenceMatcher from difflib import SequenceMatcher
def get_preprocessed_verbs():
    """Load preprocessed_verbs.csv (comma-separated, first row used as
    the header) and return it as a DataFrame via the module's read_csv
    helper."""
    return read_csv("preprocessed_verbs.csv", ",", header=0)
def preprocess_verbs():
    """Build preprocessed_verbs.csv from the raw verbs_gf.csv table.

    Pipeline: read the raw verb table (no header row, ', ' separator),
    split rows holding two alternative verb forms, derive the common
    substring column, then persist the result via save_csv.
    """
    df = read_csv("verbs_gf.csv", ", ", None)
    df = split_double_forms(df)
    df = add_common_substring(df)
    save_csv(df)
def read_csv(path, sep, header):
    """Read a CSV with the project's shared settings.

    Parameters:
        path: file path (or buffer) to read.
        sep: field separator; multi-char separators work because the
            python engine is used.
        header: row number to use as column names, or None for no header
            (pandas semantics, passed straight through).

    Returns:
        pandas.DataFrame with the file contents.
    """
    df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
    return df
def save_csv(df): def save_csv(df):
...@@ -25,6 +25,7 @@ def split_double_forms(df): ...@@ -25,6 +25,7 @@ def split_double_forms(df):
try: try:
second_form = split_row[1] second_form = split_row[1]
second_form[second_form.isnull()] = split_row[0] second_form[second_form.isnull()] = split_row[0]
df.iloc[i] = split_row[0]
df = df.append(second_form, ignore_index=True) df = df.append(second_form, ignore_index=True)
except Exception as e: except Exception as e:
pass pass
...@@ -37,11 +38,10 @@ def add_common_substring(df): ...@@ -37,11 +38,10 @@ def add_common_substring(df):
verb1 = df[column][row] verb1 = df[column][row]
verb2 = df[column+1][row] verb2 = df[column+1][row]
current_common = find_common_substring(verb1, verb1) current_common = find_common_substring(verb1, verb1)
if df["common_substring"][row] == '': if df["common_substring"][row] == '' and isinstance(current_common, str):
df["common_substring"][row] = current_common df["common_substring"][row] = current_common
elif current_common != df["common_substring"][row]: elif current_common != df["common_substring"][row]:
df["common_substring"][row] = find_common_substring(current_common, df["common_substring"][row]) df["common_substring"][row] = find_common_substring(current_common, df["common_substring"][row])
print(df)
return df return df
def find_common_substring(string1, string2): def find_common_substring(string1, string2):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment