Commit fbaa031a by Paktalin

fixed bug in verbs_preprocessing

parent 6f1d415f
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -14,10 +14,11 @@ postimees_urls = get_postimees_urls() ...@@ -14,10 +14,11 @@ postimees_urls = get_postimees_urls()
print("extracting text from the urls...") print("extracting text from the urls...")
articles = [] articles = []
for i in tqdm(range(len(postimees_urls))): # loading bar for i in tqdm(range(3935, len(postimees_urls))): # loading bar
url = postimees_urls[i] url = postimees_urls[i]
articles.append(get_text(url)) article = get_text(url)
write_to_file(articles, 'articles.txt') write_to_file(article, 'articles.txt', 'a')
# # try to find a verb in an article # # try to find a verb in an article
# for column in verbs: # for column in verbs:
......
...@@ -18,5 +18,5 @@ def save_postimees_urls(): ...@@ -18,5 +18,5 @@ def save_postimees_urls():
print("Extracted links from %i pages" % page_index) print("Extracted links from %i pages" % page_index)
break break
page_index += 1 page_index += 1
write_to_file(url_list, 'postimees_urls.txt') write_to_file(url_list, 'postimees_urls.txt', 'w')
return url_list return url_list
\ No newline at end of file
from verbs_preprocessing import get_preprocessed_verbs, preprocess_verbs
import re
def extract_sentences(verbs, articles):
    """Locate every occurrence of each verb stem in the article corpus.

    Parameters:
        verbs: DataFrame with a "common_substring" column holding verb stems.
        articles: str — the concatenated article text to search.

    Returns:
        dict mapping each verb stem to a list of match start offsets.
        Also prints each verb and its offsets (preserved from the
        original script behavior).
    """
    occurrences_by_verb = {}
    for verb in verbs["common_substring"]:
        print(verb)
        # re.escape: stems are literal text, not regex patterns —
        # unescaped metacharacters (e.g. '+', '.') would corrupt the search.
        occurrences = [m.start() for m in re.finditer(re.escape(verb), articles)]
        print(occurrences)
        occurrences_by_verb[verb] = occurrences
    return occurrences_by_verb
# Load the scraped article corpus as one newline-free string, then scan
# it for every preprocessed verb stem.
with open('articles.txt', 'r', encoding='utf-8') as corpus_file:
    articles = corpus_file.read().replace('\n', '')

verbs = get_preprocessed_verbs()
extract_sentences(verbs, articles)
\ No newline at end of file
import pandas as pd import pandas as pd
import urllib import urllib, io
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
def get_verbs_cooljugator(): def get_verbs_cooljugator():
...@@ -15,13 +15,20 @@ def get_verbs_gf(): ...@@ -15,13 +15,20 @@ def get_verbs_gf():
return df return df
def get_soup(url):
    """Fetch *url* and return the parsed BeautifulSoup document.

    Requests the page with a desktop Chrome User-Agent first; if that
    request raises (e.g. the server rejects the client), logs the error
    and retries once with a Firefox User-Agent.  A failure of the retry
    propagates to the caller.

    NOTE(review): reconstructed from a garbled side-by-side diff —
    formatting inferred, runtime strings preserved byte-for-byte.
    """
    page = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    try:
        page = urllib.request.urlopen(page)
    except Exception as e:
        # Best-effort fallback: report the failure, then retry with a
        # different browser signature (the leading space in the UA string
        # is preserved from the original).
        print(e)
        page = urllib.request.Request(url, headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
        page = urllib.request.urlopen(page)
    return BeautifulSoup(page, 'html.parser')
def write_to_file(list, path, mode):
    """Write every element of *list* to *path* as UTF-8 text.

    Parameters:
        list: iterable of strings; elements are written verbatim with no
            separator added between them.
        path: destination file path.
        mode: file mode — 'w' to overwrite, 'a' to append.

    NOTE(review): the parameter name `list` shadows the builtin; kept
    unchanged for caller compatibility.
    """
    with io.open(path, mode, encoding='utf-8') as file:
        # writelines does the same per-element write in one C-level call.
        file.writelines(list)
def get_text(article_url): def get_text(article_url):
article_text = "" article_text = ""
......
...@@ -3,16 +3,16 @@ import numpy as np ...@@ -3,16 +3,16 @@ import numpy as np
from difflib import SequenceMatcher from difflib import SequenceMatcher
def get_preprocessed_verbs():
    """Load preprocessed_verbs.csv (comma-separated, first row used as
    the header) and return it as a DataFrame via the module's read_csv
    helper."""
    return read_csv("preprocessed_verbs.csv", ",", header=0)
def preprocess_verbs():
    """Build preprocessed_verbs.csv from the raw verbs_gf.csv table.

    Pipeline: read the raw verb table (no header row, ', ' separator),
    split rows holding two alternative verb forms, derive the common
    substring column, then persist the result via save_csv.
    """
    df = read_csv("verbs_gf.csv", ", ", None)
    df = split_double_forms(df)
    df = add_common_substring(df)
    save_csv(df)
def read_csv(path, sep, header):
    """Read a CSV with the project's shared settings.

    Parameters:
        path: file path (or buffer) to read.
        sep: field separator; multi-char separators work because the
            python engine is used.
        header: row number to use as column names, or None for no header
            (pandas semantics, passed straight through).

    Returns:
        pandas.DataFrame with the file contents.
    """
    df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
    return df
def save_csv(df): def save_csv(df):
...@@ -25,6 +25,7 @@ def split_double_forms(df): ...@@ -25,6 +25,7 @@ def split_double_forms(df):
try: try:
second_form = split_row[1] second_form = split_row[1]
second_form[second_form.isnull()] = split_row[0] second_form[second_form.isnull()] = split_row[0]
df.iloc[i] = split_row[0]
df = df.append(second_form, ignore_index=True) df = df.append(second_form, ignore_index=True)
except Exception as e: except Exception as e:
pass pass
...@@ -37,11 +38,10 @@ def add_common_substring(df): ...@@ -37,11 +38,10 @@ def add_common_substring(df):
verb1 = df[column][row] verb1 = df[column][row]
verb2 = df[column+1][row] verb2 = df[column+1][row]
current_common = find_common_substring(verb1, verb1) current_common = find_common_substring(verb1, verb1)
if df["common_substring"][row] == '': if df["common_substring"][row] == '' and isinstance(current_common, str):
df["common_substring"][row] = current_common df["common_substring"][row] = current_common
elif current_common != df["common_substring"][row]: elif current_common != df["common_substring"][row]:
df["common_substring"][row] = find_common_substring(current_common, df["common_substring"][row]) df["common_substring"][row] = find_common_substring(current_common, df["common_substring"][row])
print(df)
return df return df
def find_common_substring(string1, string2): def find_common_substring(string1, string2):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment