Commit e4c2aa75 by Paktalin

Initial commit

import pandas as pd
from util import get_postimees_urls, get_verbs_gf, get_text
from tqdm import tqdm

print("getting verbs...")
verbs = get_verbs_gf()

# retrieve links to postimees articles
print("getting postimees urls...")
postimees_urls = get_postimees_urls()

print("extracting text from the urls...")
articles = []
for url in tqdm(postimees_urls):
    articles.append(get_text(url))

# try to find a verb in an article
for column in verbs:
    verb_form = verbs.iloc[2][column]
    if isinstance(verb_form, str):
        print(verb_form)
        print(str(articles[0].find(verb_form)))
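
Note that str.find matches raw substrings, so a short verb form such as "on" will also hit inside longer words. A minimal sketch of a stricter whole-word lookup; find_whole_word is illustrative and not part of this commit:

import re

def find_whole_word(text, word):
    # \b anchors the match at word boundaries, so a short form will
    # not match inside a longer word; returns an offset or -1, like str.find
    match = re.search(r'\b' + re.escape(word) + r'\b', text)
    return match.start() if match else -1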
import pandas as pd

def get_verbs_gf():
    # read the verbs file as a dataframe
    return read_csv()

def read_csv():
    # 16 numbered columns (0-15), as in util.get_verbs_gf
    columns = list(range(16))
    df = pd.read_csv("verbs_gf.csv", sep=",", names=columns, encoding='utf8')
    return df
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

def get_verbs_cooljugator():
    # read the file as a one-column dataframe, one verb entry per line
    df = pd.read_csv("verbs_cooljugator.txt", sep="\n", names=["verb"], encoding='utf8')
    # cut the translation: keep only the part before the " - " separator
    return df['verb'].str.split(' *- *').str[0]
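
For reference, the split pattern trims everything after the first dash separator. A quick check with invented lines (the sample verbs are illustrative, not taken from verbs_cooljugator.txt):

import pandas as pd

sample = pd.Series(["olema - to be", "minema - to go"])
# patterns longer than one character are treated as regular expressions
print(sample.str.split(' *- *').str[0].tolist())  # ['olema', 'minema']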
def get_verbs_gf():
    # read the file as a dataframe with 16 numbered columns,
    # splitting rows on either commas or pipes
    columns = list(range(16))
    df = pd.read_csv("verbs_gf.csv", sep=r",|\|", names=columns, encoding='utf8', engine='python')
    return df
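
The sep regex means a single row may mix commas and pipes as delimiters. A small sketch with made-up rows (engine='python' is required for regex separators):

import io
import pandas as pd

sample = io.StringIO("olema|olen,oled,on\nminema|lähen,lähed,läheb\n")
df = pd.read_csv(sample, sep=r",|\|", names=list(range(4)), engine='python')
print(df)  # both the pipe and the commas become column boundaries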
def get_soup(url):
    page = urllib.request.urlopen(url)
    return BeautifulSoup(page, 'html.parser')
def get_postimees_urls():
    urlpage = 'https://www.postimees.ee/search?sections=81&page='
    links_list = []
    page_index = 248  # starting search-results page
    # the loop ends when a page fails to load, e.g. past the last results page
    while True:
        print("Scraping page " + str(page_index))
        try:
            soup = get_soup(urlpage + str(page_index))
            results_list = soup.find_all("span", {'class': "search-result__headline flex--equal-width"})
            for result in results_list:
                links_list.append(result.find("a", href=True)['href'])
        except Exception:
            print("Extracted links up to page %i" % page_index)
            break
        page_index += 1
    return links_list
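
Since the scrape is slow, it may help to persist the collected links; a minimal sketch, assuming a plain-text output file (save_urls and the file name are not part of this commit):

def save_urls(links_list, path="postimees_urls.txt"):
    # one URL per line; utf8 for safety with Estonian characters
    with open(path, "w", encoding="utf8") as f:
        f.write("\n".join(links_list))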
def get_text(article_url):
    article_text = ""
    soup = get_soup(article_url)
    results_list = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
    for result in results_list:
        try:
            # keep the first paragraph of each body item
            article_text += "\n" + result.find("p").text
        except AttributeError:
            # the div holds no <p>, so find() returned None
            pass
    return article_text
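
Because result.find("p") keeps only the first paragraph of each body item, later paragraphs in the same div are dropped. A variant collecting every paragraph could look like the sketch below; get_text_all_paragraphs is illustrative, not in the commit:

def get_text_all_paragraphs(article_url):
    soup = get_soup(article_url)
    divs = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
    # join the text of every <p> across all body items
    return "\n".join(p.text for div in divs for p in div.find_all("p"))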
(Two further files in this commit are too large to display inline.)