extracted occurrences

a433e0ec · Paktalin · 97fbfda2 · a433e0ec · a433e0ec · a433e0ec
Commit a433e0ec authored Nov 27, 2018 by Paktalin
Showing 10 changed files with 38 additions and 25 deletions
__pycache__/util.cpython-36.pyc
articles_preprocessing.py
cleaned_verbs_with_occurences.csv
preprocessing.py
util.py
verbs_cooljugator.txt
with_all_forms.csv
with_approximate_occurences.csv
with_approximate_occurences_all.csv
with_n_form.csv
--- a/__pycache__/util.cpython-36.pyc
+++ b/__pycache__/util.cpython-36.pyc
--- a/articles_preprocessing.py
+++ b/articles_preprocessing.py
-from util import get_postimees_urls
+from util import get_postimees_urls, write_list_to_file
 from tqdm import tqdm

 def save_postimees_urls():
@@ -16,7 +16,7 @@ def save_postimees_urls():
 			print("Extracted links from %i pages" % page_index)
 			break
 		page_index += 1
-	write_to_file(url_list, 'postimees_urls.txt', 'w')
+	write_list_to_file(url_list, 'postimees_urls.txt', 'w')
 	return url_list

 def extract_articles_from_urls():
@@ -24,7 +24,7 @@ def extract_articles_from_urls():
 	for i in tqdm(range(9551, len(postimees_urls))): # loading bar
 		url = postimees_urls[i]
 		article = get_text_from_articles(url)
-	write_to_file(article, 'articles.txt', 'a')
+	write_list_to_file(article, 'articles.txt', 'a')

 def get_text_from_articles(article_url):
 	article_text = ""

--- a/cleaned_verbs_with_occurences.csv
+++ b/cleaned_verbs_with_occurences.csv
--- a/preprocessing.py
+++ b/preprocessing.py
-from util import save_csv, get_preprocessed_verbs, get_articles, write_to_file
+from util import save_csv, get_preprocessed_verbs, get_articles, write_string_to_file, get_verbs_with_occurences
 from tqdm import tqdm
-import io, re
+import io, re, ast, csv
+
+def get_verbs_with_not_empty_occurences():
+	csv.field_size_limit(100000000)
+	verbs = get_verbs_with_occurences()
+	verbs = verbs.loc[verbs[9] != '[]'][[0, 9]]
+	verbs.index = range(len(verbs))
+	save_csv(verbs, 'cleaned_verbs_with_occurences.csv')
+	print(verbs)

 def extract_verbs_occurences_from_articles(verbs, articles):
 	verbs['occurences'] = ''
-	print("finding approximate verbs occurences")
-
-	for i in tqdm(range(len(verbs))):
-		# finish the pattern
-		pattern = '^(.*\W)*' + verbs[8][i] + '(?!(mi|ja)).*$'
-		occurences = list(set([sentence + '.' for sentence in articles.split('.') if re.match(pattern, sentence)]))
+	for i in tqdm(range(1473, len(verbs))):
+		spaced_verb = ' ' + verbs[8][i]
+		occurences = list(set([sentence + '.' for sentence in articles.split('.') if spaced_verb in sentence]))
 		verbs['occurences'][i] = filter_wrong_occurences(verbs.iloc[i], occurences)
-	save_csv(verbs, "with_approximate_occurences.csv")
+		save_csv(verbs, "with_approximate_occurences_1473.csv")

 def filter_wrong_occurences(verb, occurences):
 	all_forms = get_all_forms(verb)
 	verified_occurences = []
-	not_fond = []
 	for occurence in occurences:
 		found = False
 		for form in all_forms:
-			pattern = '^(.*\W)*'+form+'(\W.*)*$'
-			if re.match(pattern, occurence):
+			if form in occurence:
+				pattern = re.compile('.*'+form+'(\W.*)*$')
+				if pattern.match(occurence):
 					verified_occurences.append(occurence)
 					found = True
 					break
-		if not found:
-			not_found = ('%s was not found in \"%s\"\n' % (verb[0], occurence))
-			with io.open('not_found.txt', 'a', encoding='utf-8') as file:
-				file.write(not_found)
+		# if not found:
+		# 	not_found = ('%s was not found in \"%s\"\n' % (verb[0], occurence))
+		# 	write_string_to_file(not_found, 'not_found.txt', 'a')
 	return verified_occurences


@@ -43,7 +47,7 @@ def get_all_forms(verb):
 	return all_forms

 def forms(root, endings):
-	return [root+ending+' ' for ending in endings] + [root+ending+'.' for ending in endings] + [root+ending+'?' for ending in endings] + [root+ending+'!' for ending in endings] + [root+ending+',' for ending in endings]
+	return [root+ending for ending in endings]

 def forms_from_b(root):
 	endings = ['n', 'd', 'b', 'me', 'te', 'vad', '', 'ksin', 'ksid', 'ks', 'ksime', 'ksite']
@@ -65,7 +69,8 @@ def forms_from_dud(root):
 	endings = ['ud', 'av', 'avat', 'agu', 'i', 'a']
 	return forms(root, endings)

+# verbs = get_preprocessed_verbs()
+# articles = get_articles().lower()
+# extract_verbs_occurences_from_articles(verbs, articles)

-verbs = get_preprocessed_verbs()
-articles = get_articles().lower()
-extract_verbs_occurences_from_articles(verbs, articles)
\ No newline at end of file
+get_verbs_with_not_empty_occurences()
\ No newline at end of file
--- a/util.py
+++ b/util.py
@@ -11,11 +11,15 @@ def get_soup(url):
 		page = urllib.request.urlopen(page)
 	return BeautifulSoup(page, 'html.parser')

-def write_to_file(list, path, mode):
+def write_list_to_file(list, path, mode):
 	with io.open(path, mode, encoding='utf-8') as file:
 		for line in list:
 			file.write(line)

+def write_string_to_file(string, path, mode):
+	with io.open(path, mode, encoding='utf-8') as file:
+		file.write(string)
+
 def save_csv(df, path):
 	df.to_csv(path, index=False, header=False)

@@ -33,3 +37,6 @@ def get_preprocessed_verbs():

 def get_postimees_urls():
 	return open("postimees_urls.txt", "r").read().split('\n') 
+
+def get_verbs_with_occurences():
+	return read_csv("with_approximate_occurences_all.csv")
\ No newline at end of file
--- a/verbs_cooljugator.txt
+++ b/verbs_cooljugator.txt
--- a/with_all_forms.csv
+++ b/with_all_forms.csv
--- a/with_approximate_occurences.csv
+++ b/with_approximate_occurences.csv
--- a/with_approximate_occurences_all.csv
+++ b/with_approximate_occurences_all.csv
--- a/with_n_form.csv
+++ b/with_n_form.csv