Commit 80597f10 by Paktalin

finished scripts for articles and verbs preprocessing

parent fbaa031a
from util import get_soup, write_to_file

def get_postimees_urls():
    # Read the previously saved URL list, one URL per line.
    with open("postimees_urls.txt", "r") as f:
        return f.read().split('\n')
from util import get_postimees_urls
from tqdm import tqdm

def save_postimees_urls():
    urlpage = 'https://www.postimees.ee/search?sections=81&page='
@@ -19,4 +17,22 @@ def save_postimees_urls():
            break
        page_index += 1
    write_to_file(url_list, 'postimees_urls.txt', 'w')
    return url_list
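The hunk header shows most of save_postimees_urls was elided. A minimal sketch of the pagination loop it appears to implement, offered as an assumption: the start index and the `search-result__headline` link class are hypothetical, not taken from the repository.

```python
from util import get_soup, write_to_file

def save_postimees_urls():
    # Hypothetical reconstruction: walk the paginated search results,
    # collecting article links until an empty page comes back.
    urlpage = 'https://www.postimees.ee/search?sections=81&page='
    url_list = []
    page_index = 0  # start index is a guess
    while True:
        soup = get_soup(urlpage + str(page_index))
        # 'search-result__headline' is an assumed class name for illustration
        links = soup.find_all("a", {'class': 'search-result__headline'})
        if not links:
            break
        url_list.extend(link['href'] + '\n' for link in links)  # assumes absolute hrefs
        page_index += 1
    write_to_file(url_list, 'postimees_urls.txt', 'w')
    return url_list
```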
def extract_articles_from_urls():
    postimees_urls = get_postimees_urls()
    # Resume from index 9551: earlier URLs were already scraped.
    for i in tqdm(range(9551, len(postimees_urls))):  # progress bar
        url = postimees_urls[i]
        article = get_text_from_articles(url)
        write_to_file(article, 'articles.txt', 'a')

def get_text_from_articles(article_url):
    article_text = ""
    soup = get_soup(article_url)
    results_list = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
    for result in results_list:
        try:
            article_text += "\n" + result.find("p").text
        except AttributeError:
            # find("p") returned None: the div holds no paragraph (e.g. an embed)
            pass
    return article_text
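For reference, a self-contained demo of the selector logic in get_text_from_articles, run against inline HTML rather than a live Postimees page:

```python
from bs4 import BeautifulSoup

sample_html = """
<div class="article-body__item article-body__item--htmlElement"><p>First paragraph.</p></div>
<div class="article-body__item article-body__item--htmlElement"><img src="x.jpg"/></div>
<div class="article-body__item article-body__item--htmlElement"><p>Second paragraph.</p></div>
"""

soup = BeautifulSoup(sample_html, "html.parser")
text = ""
for div in soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"}):
    p = div.find("p")
    if p is not None:  # divs without a <p> (e.g. images) are skipped
        text += "\n" + p.text
print(text)  # "\nFirst paragraph.\nSecond paragraph."
```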
from util import get_text, write_to_file
from preprocessing import get_preprocessed_verbs
from postimees import get_postimees_urls
from tqdm import tqdm

print("getting verbs...")
verbs = get_preprocessed_verbs()
print("getting postimees urls...")
postimees_urls = get_postimees_urls()
print("extracting text from the urls...")
articles = []  # only referenced by the commented-out exploration below
# Resume from index 3935: earlier URLs were already scraped.
for i in tqdm(range(3935, len(postimees_urls))):  # progress bar
    url = postimees_urls[i]
    article = get_text(url)
    write_to_file(article, 'articles.txt', 'a')

# # try to find a verb in an article
# for column in verbs:
#     verb_form = verbs.iloc[2][column]
#     if type(verb_form) is str:
#         print(verb_form)
#         print(str(articles[0].find(verb_form)))
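A hedged, runnable version of that commented-out exploration, kept as a sketch: it takes one article's text as a parameter instead of indexing the empty `articles` list, and keeps the original's row 2 as the example verb.

```python
def find_verb_forms(verbs, article, row=2):
    # Look up each inflected form of one verb (one DataFrame row)
    # in a single article's text.
    positions = {}
    for column in verbs:
        verb_form = verbs.iloc[row][column]
        if isinstance(verb_form, str):  # skip NaN cells
            positions[verb_form] = article.find(verb_form)  # -1 if absent
    return positions
```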
import re
from verbs_preprocessing import get_preprocessed_verbs, preprocess_verbs

def extract_sentences(verbs, articles):
    for i in range(len(verbs)):
        verb = verbs["common_substring"][i]
        print(verb)
        # re.escape keeps regex metacharacters in the stem from misfiring
        occurences = [m.start() for m in re.finditer(re.escape(verb), articles)]
        print(occurences)

with open('articles.txt', 'r', encoding='utf-8') as f:
    articles = f.read().replace('\n', '')
verbs = get_preprocessed_verbs()
extract_sentences(verbs, articles)
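extract_sentences only prints match offsets so far. A sketch of how those offsets could be expanded into the enclosing sentences, assuming periods delimit sentences (as the next script does); the helper name is hypothetical.

```python
import re

def sentences_around(verb, articles):
    # Find each occurrence of the verb stem, then expand the match
    # to the enclosing period-delimited sentence.
    sentences = []
    for m in re.finditer(re.escape(verb), articles):
        start = articles.rfind('.', 0, m.start()) + 1
        end = articles.find('.', m.end())
        end = len(articles) if end == -1 else end + 1
        sentences.append(articles[start:end].strip())
    return sentences

print(sentences_around("ol", "Ma olen siin. Tema oli seal."))
# ['Ma olen siin.', 'Tema oli seal.']
```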
from util import save_csv, get_preprocessed_verbs, get_articles
from tqdm import tqdm

def extract_verbs_occurences_from_articles(verbs, articles):
    verbs['occurences'] = ''
    print("finding approximate verbs occurences")
    # Currently runs on the first verb only; the commented loop covers them all.
    verb = verbs["common_substring"][0]
    occurences = [sentence + '.' for sentence in articles.split('.') if verb in sentence]
    verbs.at[0, 'occurences'] = filter_wrong_occurences(verb, occurences)
    # for i in tqdm(range(len(verbs))):
    #     verb = verbs["common_substring"][i]
    #     occurences = [sentence + '.' for sentence in articles.split('.') if verb in sentence]
    #     verbs.at[i, 'occurences'] = occurences
    # save_csv(verbs, "with_approximate_occurences.csv")

def filter_wrong_occurences(verb, occurences):
    # Stub: inspect the candidate sentences for now.
    print(verb)
    print(occurences)
    return occurences

verbs = get_preprocessed_verbs()
articles = get_articles()
extract_verbs_occurences_from_articles(verbs, articles)
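One plausible filtering rule for the filter_wrong_occurences stub, offered as an assumption rather than the author's intent: require the stem to start a word, so matches buried inside unrelated words are dropped.

```python
import re

def filter_wrong_occurences(verb, occurences):
    # Keep only sentences where the stem begins a word; plain substring
    # search also matches the stem inside unrelated words.
    pattern = re.compile(r'\b' + re.escape(verb))
    return [sentence for sentence in occurences if pattern.search(sentence)]

print(filter_wrong_occurences("ole", ["Ma olen siin.", "Minu poolest."]))
# ['Ma olen siin.'] -- 'poolest' contains 'ole', but not at a word start
```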
@@ -8,14 +8,7 @@ def get_verbs_cooljugator():
    # cut the translation off, keeping only the verb itself
    return df['verb'].str.split(' *- *').str[0]

def get_verbs_gf():
    # read the file as a dataframe with 16 unnamed columns
    columns = list(range(16))
    df = pd.read_csv("verbs_gf.csv", sep=r",|\|", names=columns, encoding='utf8', engine='python')
    return df

def get_soup(url):
    # request the page with a browser User-Agent so the site serves full HTML
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    try:
        page = urllib.request.urlopen(request)
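The rest of get_soup is cut off by the hunk boundary. A minimal self-contained version, assuming it parses the response with BeautifulSoup; the error handling here is an assumption, not the repository's code.

```python
import urllib.error
import urllib.request
from bs4 import BeautifulSoup

def get_soup(url):
    # Fetch with a browser User-Agent, then parse into BeautifulSoup.
    request = urllib.request.Request(
        url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        page = urllib.request.urlopen(request)
    except urllib.error.HTTPError:
        return None  # assumed fallback; callers would need to handle it
    return BeautifulSoup(page.read(), "html.parser")
```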
@@ -30,13 +23,19 @@ def write_to_file(list, path, mode):
    for line in list:
        file.write(line)

def get_text(article_url):
    article_text = ""
    soup = get_soup(article_url)
    results_list = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
    for result in results_list:
        try:
            article_text += "\n" + result.find("p").text
        except AttributeError:
            # find("p") returned None: the div holds no paragraph (e.g. an embed)
            pass
    return article_text
def save_csv(df, path):
    df.to_csv(path, index=False)

def read_csv(path, sep, header):
    df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
    return df

def get_articles():
    with open('articles.txt', 'r', encoding='utf-8') as articles:
        return articles.read().replace('\n', '')

def get_preprocessed_verbs():
    return read_csv("preprocessed_verbs.csv", ",", header=0)

def get_postimees_urls():
    with open("postimees_urls.txt", "r") as f:
        return f.read().split('\n')
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from util import save_csv, read_csv

def get_preprocessed_verbs():
    return read_csv("preprocessed_verbs.csv", ",", header=0)

def preprocess_verbs():
    df = read_csv("verbs_gf.csv", ", ", None)
    df = split_double_forms(df)
    df = add_common_substring(df)
    save_csv(df, "preprocessed_verbs.csv")

def split_double_forms(df):
    for i in range(len(df.index)):
......
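split_double_forms is truncated above, and add_common_substring does not appear in the hunk at all. Given the SequenceMatcher import and the common_substring column used downstream, it plausibly reduces each verb's inflected forms to their longest shared substring. A sketch under that assumption, with a hypothetical helper name:

```python
from difflib import SequenceMatcher

def longest_common_substring(forms):
    # Fold the form list down to the longest substring shared by all,
    # pairwise, via SequenceMatcher.find_longest_match.
    common = forms[0]
    for form in forms[1:]:
        matcher = SequenceMatcher(None, common, form)
        match = matcher.find_longest_match(0, len(common), 0, len(form))
        common = common[match.a:match.a + match.size]
    return common

print(longest_common_substring(["olema", "olen", "olin"]))  # 'ol'
```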