Commit 77313441 by Paktalin

improved the occurrences-extracting algorithm and more

parent e449480f
from util import read_csv, get_soup, save_csv
import pandas as pd
from verbs_preprocessing import preprocess_verbs
from tqdm import tqdm
import urllib.parse

def get_ma_form():
    # column 0 of the preprocessed verbs is the ma-infinitive
    return read_csv('preprocessed_verbs.csv')[0]

def get_n_from_cooljugator(verbs):
    # scrape the 1st-person singular present (the 'n' form) for every verb
    verbs['n'] = ''
    cooljugator_url = 'https://cooljugator.com/ee/'
    for i in tqdm(range(len(verbs))):
        url = cooljugator_url + urllib.parse.quote(verbs['ma'][i])
        try:
            soup = get_soup(url)
            result = soup.find('div', {'id': 'present1'}).find('div', {'class': 'meta-form'}).text
            verbs['n'][i] = result
        except Exception:
            # verb page missing or markup changed: leave the cell empty
            pass
    print(verbs)
    save_csv(verbs, 'with_n_form.csv')

def get_any_form_from_cooljugator(_verbs):
    # each conjugation cell on cooljugator has a div whose id matches the column name
    columns = ['ma', 'present1', 'present2', 'present3', 'present4', 'present5', 'present6', 'present_neg', 'presentPASS', 'presentPASS_neg',
               'conditional1', 'conditional2', 'conditional3', 'conditional4', 'conditional5', 'conditional6', 'conditional_neg', 'conditionalPASS', 'conditionalPASS_neg',
               'imperative2', 'imperative3', 'imperative4', 'imperative5', 'imperative6', 'imperativePASS',
               'imperative2_neg', 'imperative3_neg', 'imperative4_neg', 'imperative5_neg', 'imperative6_neg', 'imperativePASS_neg']
    _verbs = _verbs.reindex(columns=columns)
    cooljugator_url = 'https://cooljugator.com/ee/'
    for row in tqdm(range(3965, len(_verbs))):  # resume partway through, presumably after an interrupted run
        url = cooljugator_url + urllib.parse.quote(_verbs['ma'][row])
        try:
            soup = get_soup(url)
            for column in _verbs.columns:
                if column != 'ma':
                    result = soup.find('div', {'id': column}).find('div', {'class': 'meta-form'}).text
                    _verbs[column][row] = result
        except Exception as e:
            print(e)
        save_csv(_verbs, 'with_all_forms.csv')  # checkpoint each verb so a restart can resume
    return _verbs

verbs = get_ma_form().to_frame()
verbs.columns = ['ma']
verbs = get_any_form_from_cooljugator(verbs)
\ No newline at end of file
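A note on the pandas writes above: assignments like verbs['n'][i] = result use chained indexing, which pandas may apply to a temporary copy (it warns with SettingWithCopyWarning). A minimal sketch of the same update written with .loc, assuming the column layout used here:

    import pandas as pd

    verbs = pd.DataFrame({'ma': ['aasima', 'abielluma']})
    verbs['n'] = ''
    # .loc addresses row and column on the frame itself, so the write always sticks
    verbs.loc[0, 'n'] = 'aasin'
    print(verbs)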
@@ -2,13 +2,13 @@ from util import get_postimees_urls
 from tqdm import tqdm

 def save_postimees_urls():
-    urlpage = 'https://www.postimees.ee/search?sections=81&page='
+    postimees_url = 'https://www.postimees.ee/search?sections=81&page='
     url_list = []
     page_index = 0
     while True:
         print("Scraping page " + str(page_index))
         try:
-            soup = get_soup(urlpage + str(page_index))
+            soup = get_soup(postimees_url + str(page_index))
             results_list = soup.find_all("span", {'class': "search-result__headline flex--equal-width"})
             for result in results_list:
                 url_list.append(result.find("a", href=True)['href'])
...
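The hunk is cut off before the loop's exit, so the stop condition isn't visible; presumably the scraper breaks once a search page returns no results. A sketch of that pagination pattern, with get_soup and the empty-page stop condition as assumptions:

    def collect_search_urls(postimees_url, get_soup):
        url_list = []
        page_index = 0
        while True:
            soup = get_soup(postimees_url + str(page_index))
            results_list = soup.find_all("span", {'class': "search-result__headline flex--equal-width"})
            if not results_list:
                break  # assumed: an empty results page means we are past the last page
            for result in results_list:
                url_list.append(result.find("a", href=True)['href'])
            page_index += 1
        return url_list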
[Diff omitted: file too large to display.]
0,1,2,3,4,5,6,7,common_substring
aasima,aasida,aasib,aasitakse,aasige,aasis,aasinud,aasitud,aasi
abielluma,abielluda,abiellub,abiellutakse,abielluge,abiellus,abiellunud,abiellutud,abiellu
abistama,abistada,abistab,abistatakse,abistage,abistas,abistanud,abistatud,abista
...
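In this preview, columns 0-7 hold eight conjugated forms per verb (ma-infinitive, da-infinitive, 3rd-person singular present, impersonal present in -takse, plural imperative in -ge, past in -s, nud-participle, tud-participle), and common_substring is the stem they all share. verbs_preprocessing.py imports difflib.SequenceMatcher (see its diff below), so the shared stem is presumably computed along these lines; a sketch, not the repository's exact add_common_substring:

    from difflib import SequenceMatcher

    def common_substring(forms):
        # shrink the candidate to the longest block it shares with each next form
        common = forms[0]
        for form in forms[1:]:
            match = SequenceMatcher(None, common, form).find_longest_match(0, len(common), 0, len(form))
            common = common[match.a:match.a + match.size]
        return common

    print(common_substring(['aasima', 'aasida', 'aasib', 'aasitakse']))  # -> 'aasi'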
-from util import save_csv, get_preprocessed_verbs, get_articles
+from util import save_csv, get_preprocessed_verbs, get_articles, write_to_file
 from tqdm import tqdm
+import io, re

 def extract_verbs_occurences_from_articles(verbs, articles):
     verbs['occurences'] = ''
     print("finding approximate verbs occurences")
-    # trial with the first verb
-    verb = verbs["common_substring"][0]
-    spaced_verb = ' ' + verb
-    occurences = [sentence + '.' for sentence in articles.split('.') if spaced_verb in sentence]
-    verbs['occurences'][0] = filter_wrong_occurences(verbs.iloc[0], occurences)
-    # for i in tqdm(range(len(verbs))):
-    #     verb = verbs["common_substring"][i]
-    #     occurences = [sentence + '.' for sentence in articles.split('.') if verb in sentence]
-    #     verbs['occurences'][i] = occurences
-    # save_csv(verbs, "with_approximate_occurences.csv")
+    for i in tqdm(range(len(verbs))):
+        # finish the pattern: column 8 holds the verb's common substring
+        pattern = '.*\W' + verbs[8][i] + '.*'
+        occurences = list(set([sentence + '.' for sentence in articles.split('.') if re.match(pattern, sentence)]))
+        verbs['occurences'][i] = filter_wrong_occurences(verbs.iloc[i], occurences)
+    save_csv(verbs, "with_approximate_occurences.csv")

 def filter_wrong_occurences(verb, occurences):
-    print("filtering wrong occurences")
     all_forms = get_all_forms(verb)
+    verified_occurences = []
     for occurence in occurences:
         found = False
         for form in all_forms:
-            if form in occurence:
+            # accept the sentence only if the form appears as a whole word
+            pattern = '.*\W' + form + '\W.*'
+            if re.match(pattern, occurence):
+                verified_occurences.append(occurence)
                 found = True
                 break
         if not found:
-            occurences.remove(occurence)
-    occurences = list(set(occurences))
-    print(occurences)
+            # log misses for later inspection instead of mutating the list in place
+            not_found = ('%s was not found in "%s"\n' % (verb[0], occurence))
+            with io.open('not_found.txt', 'a', encoding='utf-8') as file:
+                file.write(not_found)
+    return verified_occurences

 def get_all_forms(verb):
     all_forms = []
     all_forms.extend(forms_from_ma(verb[0][:-2]))
     all_forms.extend(forms_from_da(verb[1][:-2]))
     all_forms.extend(forms_from_b(verb[2][:-1]))
+    all_forms.extend(forms_from_kse(verb[3][:-3]))
     all_forms.append(verb[6])
-    all_forms.append(verb[7])
+    all_forms.extend(forms_from_dud(verb[7][:-2]))
     return all_forms
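The switch from plain substring tests to re.match('.*\W' + form + '\W.*', ...) is what keeps a short form from matching inside a longer word. Two caveats worth noting: the leading '.*\W' demands at least one character before the form, so a form that opens the sentence never matches, and the form is interpolated into the pattern unescaped. A sketch of the same whole-word check with both handled, using re.search and word boundaries (occurs_as_word is my helper name, not the repository's):

    import re

    def occurs_as_word(form, sentence):
        # \b also matches at the start/end of the string, and re.escape guards metacharacters
        return re.search(r'\b' + re.escape(form) + r'\b', sentence) is not None

    print(occurs_as_word('aasib', 'ta aasib mind.'))   # True
    print(occurs_as_word('aasib', 'ta kaasib mind.'))  # False: inside a longer word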
 def forms(root, endings):
...
@@ -53,14 +54,18 @@ def forms_from_ma(root):
     return forms(root, endings)

 def forms_from_da(root):
-    endings = ['da', 'gu', 'gem', 'ge', 'nuksin', 'nuks', 'nuksid', 'nuksime', 'nuksite', 'di', 'nuvat', 'davat', 'des', 'dav']
+    endings = ['da', 'gu', 'gem', 'ge', 'nuksin', 'nuks', 'nuksid', 'nuksime', 'nuksite', 'nuvat', 'des']
     return forms(root, endings)

 def forms_from_kse(root):
     endings = ['kse', 'ks', 'gu', '', 'vat', 'v']
     return forms(root, endings)

+def forms_from_dud(root):
+    endings = ['ud', 'av', 'avat', 'agu', 'i', 'a']
+    return forms(root, endings)
+
 verbs = get_preprocessed_verbs()
-articles = get_articles()
+articles = get_articles().lower()
 extract_verbs_occurences_from_articles(verbs, articles)
\ No newline at end of file
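Taken together, get_all_forms expands one CSV row into every surface form the filter accepts. A worked illustration for 'aasima', assuming forms() simply concatenates root and ending (its real body is collapsed in the diff above):

    def forms(root, endings):
        # assumed implementation: the diff collapses the actual body
        return [root + ending for ending in endings]

    # da-forms from the root 'aasi' (= 'aasida' minus 'da')
    print(forms('aasi', ['da', 'gu', 'gem', 'ge', 'nuvat', 'des']))
    # ['aasida', 'aasigu', 'aasigem', 'aasige', 'aasinuvat', 'aasides']

    # the new dud-forms from 'aasit' (= 'aasitud' minus 'ud')
    print(forms('aasit', ['ud', 'av', 'avat', 'agu', 'i', 'a']))
    # ['aasitud', 'aasitav', 'aasitavat', 'aasitagu', 'aasiti', 'aasita']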
@@ -2,18 +2,11 @@ import pandas as pd
 import urllib, io
 from bs4 import BeautifulSoup

-def get_verbs_cooljugator():
-    # read file as dataframe
-    df = pd.read_csv("verbs_cooljugator.txt", sep="\n", names=["verb"], encoding='utf8')
-    # cut translation
-    return df['verb'].str.split(' *- *').str[0]
-
 def get_soup(url):
     page = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
     try:
         page = urllib.request.urlopen(page)
     except Exception as e:
-        print(e)
         page = urllib.request.Request(url, headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
         page = urllib.request.urlopen(page)
     return BeautifulSoup(page, 'html.parser')
@@ -24,9 +17,9 @@ def write_to_file(list, path, mode):
         file.write(line)

 def save_csv(df, path):
-    df.to_csv(path, index=False)
+    df.to_csv(path, index=False, header=False)

-def read_csv(path, sep, header):
+def read_csv(path, sep=',', header=None):
     df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
     return df
@@ -36,7 +29,7 @@ def get_articles():
     return articles_string

 def get_preprocessed_verbs():
-    return read_csv("preprocessed_verbs.csv", ",", header=0)
+    return read_csv("preprocessed_verbs.csv")

 def get_postimees_urls():
     return open("postimees_urls.txt", "r").read().split('\n')
\ No newline at end of file
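Changing read_csv's defaults to header=None ripples through the pipeline: get_preprocessed_verbs now returns a frame with integer column labels, which is why the extraction code above indexes verbs[8] rather than verbs['common_substring']. A minimal illustration:

    import pandas as pd
    from io import StringIO

    row = "aasima,aasida,aasib,aasitakse,aasige,aasis,aasinud,aasitud,aasi\n"
    df = pd.read_csv(StringIO(row), header=None)
    print(df[8][0])  # 'aasi' -- columns are labelled 0..8, not named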
@@ -4,7 +4,8 @@ from difflib import SequenceMatcher
 from util import save_csv, read_csv

 def preprocess_verbs():
-    df = read_csv("verbs_gf.csv", ", ", None)
+    print('preprocessing verbs...')
+    df = read_csv("verbs_gf.csv", ", ")
     df = split_double_forms(df)
     df = add_common_substring(df)
     save_csv(df, "preprocessed_verbs.csv")
...
[Diff omitted: file too large to display.]
[Diff omitted: collapsed.]