Commit e4c2aa75 by Paktalin

Initial commit

import pandas as pd
from util import get_postimees_urls, get_verbs_gf, get_text
from tqdm import tqdm

print("getting verbs...")
verbs = get_verbs_gf()

# retrieve links to postimees articles
print("getting postimees urls...")
postimees_urls = get_postimees_urls()

print("extracting text from the urls...")
articles = []
for url in tqdm(postimees_urls):
    articles.append(get_text(url))

# try to find a verb in an article
for column in verbs:
    verb_form = verbs.iloc[2][column]
    if isinstance(verb_form, str):
        print(verb_form)
        print(str(articles[0].find(verb_form)))
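
Note that str.find matches raw substrings, so a short verb form such as "on" will also hit inside longer words. A minimal sketch of a stricter whole-word lookup; find_whole_word is illustrative and not part of this commit:

import re

def find_whole_word(text, word):
    # \b anchors the match at word boundaries, so a short form will
    # not match inside a longer word; returns an offset or -1, like str.find
    match = re.search(r'\b' + re.escape(word) + r'\b', text)
    return match.start() if match else -1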
import pandas as pd

def get_verbs_gf():
    # read the verbs file as a dataframe
    return read_csv()

def read_csv():
    # 16 numbered columns (0-15), as in util.get_verbs_gf
    columns = list(range(16))
    df = pd.read_csv("verbs_gf.csv", sep=",", names=columns, encoding='utf8')
    return df
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

def get_verbs_cooljugator():
    # read the file as a one-column dataframe, one verb entry per line
    df = pd.read_csv("verbs_cooljugator.txt", sep="\n", names=["verb"], encoding='utf8')
    # cut the translation: keep only the part before the " - " separator
    return df['verb'].str.split(' *- *').str[0]
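
For reference, the split pattern trims everything after the first dash separator. A quick check with invented lines (the sample verbs are illustrative, not taken from verbs_cooljugator.txt):

import pandas as pd

sample = pd.Series(["olema - to be", "minema - to go"])
# patterns longer than one character are treated as regular expressions
print(sample.str.split(' *- *').str[0].tolist())  # ['olema', 'minema']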
def get_verbs_gf():
    # read the file as a dataframe with 16 numbered columns,
    # splitting rows on either commas or pipes
    columns = list(range(16))
    df = pd.read_csv("verbs_gf.csv", sep=r",|\|", names=columns, encoding='utf8', engine='python')
    return df
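
The sep regex means a single row may mix commas and pipes as delimiters. A small sketch with made-up rows (engine='python' is required for regex separators):

import io
import pandas as pd

sample = io.StringIO("olema|olen,oled,on\nminema|lähen,lähed,läheb\n")
df = pd.read_csv(sample, sep=r",|\|", names=list(range(4)), engine='python')
print(df)  # both the pipe and the commas become column boundaries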
def get_soup(url):
    page = urllib.request.urlopen(url)
    return BeautifulSoup(page, 'html.parser')
def get_postimees_urls():
    urlpage = 'https://www.postimees.ee/search?sections=81&page='
    links_list = []
    page_index = 248  # starting search-results page
    # the loop ends when a page fails to load, e.g. past the last results page
    while True:
        print("Scraping page " + str(page_index))
        try:
            soup = get_soup(urlpage + str(page_index))
            results_list = soup.find_all("span", {'class': "search-result__headline flex--equal-width"})
            for result in results_list:
                links_list.append(result.find("a", href=True)['href'])
        except Exception:
            print("Extracted links up to page %i" % page_index)
            break
        page_index += 1
    return links_list
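
Since the scrape is slow, it may help to persist the collected links; a minimal sketch, assuming a plain-text output file (save_urls and the file name are not part of this commit):

def save_urls(links_list, path="postimees_urls.txt"):
    # one URL per line; utf8 for safety with Estonian characters
    with open(path, "w", encoding="utf8") as f:
        f.write("\n".join(links_list))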
def get_text(article_url):
    article_text = ""
    soup = get_soup(article_url)
    results_list = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
    for result in results_list:
        try:
            # keep the first paragraph of each body item
            article_text += "\n" + result.find("p").text
        except AttributeError:
            # the div holds no <p>, so find() returned None
            pass
    return article_text
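
Because result.find("p") keeps only the first paragraph of each body item, later paragraphs in the same div are dropped. A variant collecting every paragraph could look like the sketch below; get_text_all_paragraphs is illustrative, not in the commit:

def get_text_all_paragraphs(article_url):
    soup = get_soup(article_url)
    divs = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
    # join the text of every <p> across all body items
    return "\n".join(p.text for div in divs for p in div.find_all("p"))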
(Two further files in this commit are too large to display inline.)