Commit 77313441 by Paktalin

improved the occurrences-extracting algorithm and more

parent e449480f
from util import read_csv, get_soup, save_csv
import pandas as pd
from verbs_preprocessing import preprocess_verbs
from tqdm import tqdm
import urllib.parse

def get_ma_form():
    # column 0 of the preprocessed verbs is the ma-infinitive
    return read_csv('preprocessed_verbs.csv')[0]

def get_n_from_cooljugator(verbs):
    # scrape the 1st-person singular present (the 'n' form) for every verb
    verbs['n'] = ''
    cooljugator_url = 'https://cooljugator.com/ee/'
    for i in tqdm(range(len(verbs))):
        url = cooljugator_url + urllib.parse.quote(verbs['ma'][i])
        try:
            soup = get_soup(url)
            result = soup.find('div', {'id': 'present1'}).find('div', {'class': 'meta-form'}).text
            verbs['n'][i] = result
        except Exception:
            # verb page missing or markup changed: leave the cell empty
            pass
    print(verbs)
    save_csv(verbs, 'with_n_form.csv')

def get_any_form_from_cooljugator(_verbs):
    # each conjugation cell on cooljugator has a div whose id matches the column name
    columns = ['ma', 'present1', 'present2', 'present3', 'present4', 'present5', 'present6', 'present_neg', 'presentPASS', 'presentPASS_neg',
               'conditional1', 'conditional2', 'conditional3', 'conditional4', 'conditional5', 'conditional6', 'conditional_neg', 'conditionalPASS', 'conditionalPASS_neg',
               'imperative2', 'imperative3', 'imperative4', 'imperative5', 'imperative6', 'imperativePASS',
               'imperative2_neg', 'imperative3_neg', 'imperative4_neg', 'imperative5_neg', 'imperative6_neg', 'imperativePASS_neg']
    _verbs = _verbs.reindex(columns=columns)
    cooljugator_url = 'https://cooljugator.com/ee/'
    for row in tqdm(range(3965, len(_verbs))):  # resume partway through, presumably after an interrupted run
        url = cooljugator_url + urllib.parse.quote(_verbs['ma'][row])
        try:
            soup = get_soup(url)
            for column in _verbs.columns:
                if column != 'ma':
                    result = soup.find('div', {'id': column}).find('div', {'class': 'meta-form'}).text
                    _verbs[column][row] = result
        except Exception as e:
            print(e)
        save_csv(_verbs, 'with_all_forms.csv')  # checkpoint each verb so a restart can resume
    return _verbs

verbs = get_ma_form().to_frame()
verbs.columns = ['ma']
verbs = get_any_form_from_cooljugator(verbs)
\ No newline at end of file
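A note on the pandas writes above: assignments like verbs['n'][i] = result use chained indexing, which pandas may apply to a temporary copy (it warns with SettingWithCopyWarning). A minimal sketch of the same update written with .loc, assuming the column layout used here:

    import pandas as pd

    verbs = pd.DataFrame({'ma': ['aasima', 'abielluma']})
    verbs['n'] = ''
    # .loc addresses row and column on the frame itself, so the write always sticks
    verbs.loc[0, 'n'] = 'aasin'
    print(verbs)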
@@ -2,13 +2,13 @@ from util import get_postimees_urls
 from tqdm import tqdm

 def save_postimees_urls():
-    urlpage = 'https://www.postimees.ee/search?sections=81&page='
+    postimees_url = 'https://www.postimees.ee/search?sections=81&page='
     url_list = []
     page_index = 0
     while True:
         print("Scraping page " + str(page_index))
         try:
-            soup = get_soup(urlpage + str(page_index))
+            soup = get_soup(postimees_url + str(page_index))
             results_list = soup.find_all("span", {'class': "search-result__headline flex--equal-width"})
             for result in results_list:
                 url_list.append(result.find("a", href=True)['href'])
...
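The hunk is cut off before the loop's exit, so the stop condition isn't visible; presumably the scraper breaks once a search page returns no results. A sketch of that pagination pattern, with get_soup and the empty-page stop condition as assumptions:

    def collect_search_urls(postimees_url, get_soup):
        url_list = []
        page_index = 0
        while True:
            soup = get_soup(postimees_url + str(page_index))
            results_list = soup.find_all("span", {'class': "search-result__headline flex--equal-width"})
            if not results_list:
                break  # assumed: an empty results page means we are past the last page
            for result in results_list:
                url_list.append(result.find("a", href=True)['href'])
            page_index += 1
        return url_list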
[Diff omitted: file too large to display.]
0,1,2,3,4,5,6,7,common_substring
aasima,aasida,aasib,aasitakse,aasige,aasis,aasinud,aasitud,aasi
abielluma,abielluda,abiellub,abiellutakse,abielluge,abiellus,abiellunud,abiellutud,abiellu
abistama,abistada,abistab,abistatakse,abistage,abistas,abistanud,abistatud,abista
...
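In this preview, columns 0-7 hold eight conjugated forms per verb (ma-infinitive, da-infinitive, 3rd-person singular present, impersonal present in -takse, plural imperative in -ge, past in -s, nud-participle, tud-participle), and common_substring is the stem they all share. verbs_preprocessing.py imports difflib.SequenceMatcher (see its diff below), so the shared stem is presumably computed along these lines; a sketch, not the repository's exact add_common_substring:

    from difflib import SequenceMatcher

    def common_substring(forms):
        # shrink the candidate to the longest block it shares with each next form
        common = forms[0]
        for form in forms[1:]:
            match = SequenceMatcher(None, common, form).find_longest_match(0, len(common), 0, len(form))
            common = common[match.a:match.a + match.size]
        return common

    print(common_substring(['aasima', 'aasida', 'aasib', 'aasitakse']))  # -> 'aasi'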
-from util import save_csv, get_preprocessed_verbs, get_articles
+from util import save_csv, get_preprocessed_verbs, get_articles, write_to_file
 from tqdm import tqdm
+import io, re

 def extract_verbs_occurences_from_articles(verbs, articles):
     verbs['occurences'] = ''
     print("finding approximate verbs occurences")
-    # trial with the first verb
-    verb = verbs["common_substring"][0]
-    spaced_verb = ' ' + verb
-    occurences = [sentence + '.' for sentence in articles.split('.') if spaced_verb in sentence]
-    verbs['occurences'][0] = filter_wrong_occurences(verbs.iloc[0], occurences)
-    # for i in tqdm(range(len(verbs))):
-    #     verb = verbs["common_substring"][i]
-    #     occurences = [sentence + '.' for sentence in articles.split('.') if verb in sentence]
-    #     verbs['occurences'][i] = occurences
-    # save_csv(verbs, "with_approximate_occurences.csv")
+    for i in tqdm(range(len(verbs))):
+        # finish the pattern: column 8 holds the verb's common substring
+        pattern = '.*\W' + verbs[8][i] + '.*'
+        occurences = list(set([sentence + '.' for sentence in articles.split('.') if re.match(pattern, sentence)]))
+        verbs['occurences'][i] = filter_wrong_occurences(verbs.iloc[i], occurences)
+    save_csv(verbs, "with_approximate_occurences.csv")

 def filter_wrong_occurences(verb, occurences):
-    print("filtering wrong occurences")
     all_forms = get_all_forms(verb)
+    verified_occurences = []
     for occurence in occurences:
         found = False
         for form in all_forms:
-            if form in occurence:
+            # accept the sentence only if the form appears as a whole word
+            pattern = '.*\W' + form + '\W.*'
+            if re.match(pattern, occurence):
+                verified_occurences.append(occurence)
                 found = True
                 break
         if not found:
-            occurences.remove(occurence)
-    occurences = list(set(occurences))
-    print(occurences)
+            # log misses for later inspection instead of mutating the list in place
+            not_found = ('%s was not found in "%s"\n' % (verb[0], occurence))
+            with io.open('not_found.txt', 'a', encoding='utf-8') as file:
+                file.write(not_found)
+    return verified_occurences

 def get_all_forms(verb):
     all_forms = []
     all_forms.extend(forms_from_ma(verb[0][:-2]))
     all_forms.extend(forms_from_da(verb[1][:-2]))
     all_forms.extend(forms_from_b(verb[2][:-1]))
+    all_forms.extend(forms_from_kse(verb[3][:-3]))
     all_forms.append(verb[6])
-    all_forms.append(verb[7])
+    all_forms.extend(forms_from_dud(verb[7][:-2]))
     return all_forms
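The switch from plain substring tests to re.match('.*\W' + form + '\W.*', ...) is what keeps a short form from matching inside a longer word. Two caveats worth noting: the leading '.*\W' demands at least one character before the form, so a form that opens the sentence never matches, and the form is interpolated into the pattern unescaped. A sketch of the same whole-word check with both handled, using re.search and word boundaries (occurs_as_word is my helper name, not the repository's):

    import re

    def occurs_as_word(form, sentence):
        # \b also matches at the start/end of the string, and re.escape guards metacharacters
        return re.search(r'\b' + re.escape(form) + r'\b', sentence) is not None

    print(occurs_as_word('aasib', 'ta aasib mind.'))   # True
    print(occurs_as_word('aasib', 'ta kaasib mind.'))  # False: inside a longer word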
 def forms(root, endings):
...
@@ -53,14 +54,18 @@ def forms_from_ma(root):
     return forms(root, endings)

 def forms_from_da(root):
-    endings = ['da', 'gu', 'gem', 'ge', 'nuksin', 'nuks', 'nuksid', 'nuksime', 'nuksite', 'di', 'nuvat', 'davat', 'des', 'dav']
+    endings = ['da', 'gu', 'gem', 'ge', 'nuksin', 'nuks', 'nuksid', 'nuksime', 'nuksite', 'nuvat', 'des']
     return forms(root, endings)

 def forms_from_kse(root):
     endings = ['kse', 'ks', 'gu', '', 'vat', 'v']
     return forms(root, endings)

+def forms_from_dud(root):
+    endings = ['ud', 'av', 'avat', 'agu', 'i', 'a']
+    return forms(root, endings)
+
 verbs = get_preprocessed_verbs()
-articles = get_articles()
+articles = get_articles().lower()
 extract_verbs_occurences_from_articles(verbs, articles)
\ No newline at end of file
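Taken together, get_all_forms expands one CSV row into every surface form the filter accepts. A worked illustration for 'aasima', assuming forms() simply concatenates root and ending (its real body is collapsed in the diff above):

    def forms(root, endings):
        # assumed implementation: the diff collapses the actual body
        return [root + ending for ending in endings]

    # da-forms from the root 'aasi' (= 'aasida' minus 'da')
    print(forms('aasi', ['da', 'gu', 'gem', 'ge', 'nuvat', 'des']))
    # ['aasida', 'aasigu', 'aasigem', 'aasige', 'aasinuvat', 'aasides']

    # the new dud-forms from 'aasit' (= 'aasitud' minus 'ud')
    print(forms('aasit', ['ud', 'av', 'avat', 'agu', 'i', 'a']))
    # ['aasitud', 'aasitav', 'aasitavat', 'aasitagu', 'aasiti', 'aasita']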
@@ -2,18 +2,11 @@ import pandas as pd
 import urllib, io
 from bs4 import BeautifulSoup

-def get_verbs_cooljugator():
-    # read file as dataframe
-    df = pd.read_csv("verbs_cooljugator.txt", sep="\n", names=["verb"], encoding='utf8')
-    # cut translation
-    return df['verb'].str.split(' *- *').str[0]
-
 def get_soup(url):
     page = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
     try:
         page = urllib.request.urlopen(page)
     except Exception as e:
-        print(e)
         page = urllib.request.Request(url, headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
         page = urllib.request.urlopen(page)
     return BeautifulSoup(page, 'html.parser')
@@ -24,9 +17,9 @@ def write_to_file(list, path, mode):
         file.write(line)

 def save_csv(df, path):
-    df.to_csv(path, index=False)
+    df.to_csv(path, index=False, header=False)

-def read_csv(path, sep, header):
+def read_csv(path, sep=',', header=None):
     df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
     return df
@@ -36,7 +29,7 @@ def get_articles():
     return articles_string

 def get_preprocessed_verbs():
-    return read_csv("preprocessed_verbs.csv", ",", header=0)
+    return read_csv("preprocessed_verbs.csv")

 def get_postimees_urls():
     return open("postimees_urls.txt", "r").read().split('\n')
\ No newline at end of file
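Changing read_csv's defaults to header=None ripples through the pipeline: get_preprocessed_verbs now returns a frame with integer column labels, which is why the extraction code above indexes verbs[8] rather than verbs['common_substring']. A minimal illustration:

    import pandas as pd
    from io import StringIO

    row = "aasima,aasida,aasib,aasitakse,aasige,aasis,aasinud,aasitud,aasi\n"
    df = pd.read_csv(StringIO(row), header=None)
    print(df[8][0])  # 'aasi' -- columns are labelled 0..8, not named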
@@ -4,7 +4,8 @@ from difflib import SequenceMatcher
 from util import save_csv, read_csv

 def preprocess_verbs():
-    df = read_csv("verbs_gf.csv", ", ", None)
+    print('preprocessing verbs...')
+    df = read_csv("verbs_gf.csv", ", ")
     df = split_double_forms(df)
     df = add_common_substring(df)
     save_csv(df, "preprocessed_verbs.csv")
...
[Diff omitted: file too large to display.]
[Diff omitted: collapsed.]