Commit 80597f10 by Paktalin

finished scripts for articles and verbs preprocessing

parent fbaa031a
from util import get_soup, write_to_file

def get_postimees_urls():
    # Read the previously saved URL list, one URL per line.
    with open("postimees_urls.txt", "r") as f:
        return f.read().split('\n')
from util import get_postimees_urls
from tqdm import tqdm

def save_postimees_urls():
    urlpage = 'https://www.postimees.ee/search?sections=81&page='
@@ -19,4 +17,22 @@ def save_postimees_urls():
            break
        page_index += 1
    write_to_file(url_list, 'postimees_urls.txt', 'w')
    return url_list
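The hunk header shows most of save_postimees_urls was elided. A minimal sketch of the pagination loop it appears to implement, offered as an assumption: the start index and the `search-result__headline` link class are hypothetical, not taken from the repository.

```python
from util import get_soup, write_to_file

def save_postimees_urls():
    # Hypothetical reconstruction: walk the paginated search results,
    # collecting article links until an empty page comes back.
    urlpage = 'https://www.postimees.ee/search?sections=81&page='
    url_list = []
    page_index = 0  # start index is a guess
    while True:
        soup = get_soup(urlpage + str(page_index))
        # 'search-result__headline' is an assumed class name for illustration
        links = soup.find_all("a", {'class': 'search-result__headline'})
        if not links:
            break
        url_list.extend(link['href'] + '\n' for link in links)  # assumes absolute hrefs
        page_index += 1
    write_to_file(url_list, 'postimees_urls.txt', 'w')
    return url_list
```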
def extract_articles_from_urls():
    postimees_urls = get_postimees_urls()
    # Resume from index 9551: earlier URLs were already scraped.
    for i in tqdm(range(9551, len(postimees_urls))):  # progress bar
        url = postimees_urls[i]
        article = get_text_from_articles(url)
        write_to_file(article, 'articles.txt', 'a')

def get_text_from_articles(article_url):
    article_text = ""
    soup = get_soup(article_url)
    results_list = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
    for result in results_list:
        try:
            article_text += "\n" + result.find("p").text
        except AttributeError:
            # find("p") returned None: the div holds no paragraph (e.g. an embed)
            pass
    return article_text
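For reference, a self-contained demo of the selector logic in get_text_from_articles, run against inline HTML rather than a live Postimees page:

```python
from bs4 import BeautifulSoup

sample_html = """
<div class="article-body__item article-body__item--htmlElement"><p>First paragraph.</p></div>
<div class="article-body__item article-body__item--htmlElement"><img src="x.jpg"/></div>
<div class="article-body__item article-body__item--htmlElement"><p>Second paragraph.</p></div>
"""

soup = BeautifulSoup(sample_html, "html.parser")
text = ""
for div in soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"}):
    p = div.find("p")
    if p is not None:  # divs without a <p> (e.g. images) are skipped
        text += "\n" + p.text
print(text)  # "\nFirst paragraph.\nSecond paragraph."
```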
from util import get_text, write_to_file
from preprocessing import get_preprocessed_verbs
from postimees import get_postimees_urls
from tqdm import tqdm

print("getting verbs...")
verbs = get_preprocessed_verbs()
print("getting postimees urls...")
postimees_urls = get_postimees_urls()
print("extracting text from the urls...")
articles = []  # only referenced by the commented-out exploration below
# Resume from index 3935: earlier URLs were already scraped.
for i in tqdm(range(3935, len(postimees_urls))):  # progress bar
    url = postimees_urls[i]
    article = get_text(url)
    write_to_file(article, 'articles.txt', 'a')

# # try to find a verb in an article
# for column in verbs:
#     verb_form = verbs.iloc[2][column]
#     if type(verb_form) is str:
#         print(verb_form)
#         print(str(articles[0].find(verb_form)))
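A hedged, runnable version of that commented-out exploration, kept as a sketch: it takes one article's text as a parameter instead of indexing the empty `articles` list, and keeps the original's row 2 as the example verb.

```python
def find_verb_forms(verbs, article, row=2):
    # Look up each inflected form of one verb (one DataFrame row)
    # in a single article's text.
    positions = {}
    for column in verbs:
        verb_form = verbs.iloc[row][column]
        if isinstance(verb_form, str):  # skip NaN cells
            positions[verb_form] = article.find(verb_form)  # -1 if absent
    return positions
```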
import re
from verbs_preprocessing import get_preprocessed_verbs, preprocess_verbs

def extract_sentences(verbs, articles):
    for i in range(len(verbs)):
        verb = verbs["common_substring"][i]
        print(verb)
        # re.escape keeps regex metacharacters in the stem from misfiring
        occurences = [m.start() for m in re.finditer(re.escape(verb), articles)]
        print(occurences)

with open('articles.txt', 'r', encoding='utf-8') as f:
    articles = f.read().replace('\n', '')
verbs = get_preprocessed_verbs()
extract_sentences(verbs, articles)
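extract_sentences only prints match offsets so far. A sketch of how those offsets could be expanded into the enclosing sentences, assuming periods delimit sentences (as the next script does); the helper name is hypothetical.

```python
import re

def sentences_around(verb, articles):
    # Find each occurrence of the verb stem, then expand the match
    # to the enclosing period-delimited sentence.
    sentences = []
    for m in re.finditer(re.escape(verb), articles):
        start = articles.rfind('.', 0, m.start()) + 1
        end = articles.find('.', m.end())
        end = len(articles) if end == -1 else end + 1
        sentences.append(articles[start:end].strip())
    return sentences

print(sentences_around("ol", "Ma olen siin. Tema oli seal."))
# ['Ma olen siin.', 'Tema oli seal.']
```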
from util import save_csv, get_preprocessed_verbs, get_articles
from tqdm import tqdm

def extract_verbs_occurences_from_articles(verbs, articles):
    verbs['occurences'] = ''
    print("finding approximate verbs occurences")
    # Currently runs on the first verb only; the commented loop covers them all.
    verb = verbs["common_substring"][0]
    occurences = [sentence + '.' for sentence in articles.split('.') if verb in sentence]
    verbs.at[0, 'occurences'] = filter_wrong_occurences(verb, occurences)
    # for i in tqdm(range(len(verbs))):
    #     verb = verbs["common_substring"][i]
    #     occurences = [sentence + '.' for sentence in articles.split('.') if verb in sentence]
    #     verbs.at[i, 'occurences'] = occurences
    # save_csv(verbs, "with_approximate_occurences.csv")

def filter_wrong_occurences(verb, occurences):
    # Stub: inspect the candidate sentences for now.
    print(verb)
    print(occurences)
    return occurences

verbs = get_preprocessed_verbs()
articles = get_articles()
extract_verbs_occurences_from_articles(verbs, articles)
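One plausible filtering rule for the filter_wrong_occurences stub, offered as an assumption rather than the author's intent: require the stem to start a word, so matches buried inside unrelated words are dropped.

```python
import re

def filter_wrong_occurences(verb, occurences):
    # Keep only sentences where the stem begins a word; plain substring
    # search also matches the stem inside unrelated words.
    pattern = re.compile(r'\b' + re.escape(verb))
    return [sentence for sentence in occurences if pattern.search(sentence)]

print(filter_wrong_occurences("ole", ["Ma olen siin.", "Minu poolest."]))
# ['Ma olen siin.'] -- 'poolest' contains 'ole', but not at a word start
```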
@@ -8,14 +8,7 @@ def get_verbs_cooljugator():
    # cut the translation off, keeping only the verb itself
    return df['verb'].str.split(' *- *').str[0]

def get_verbs_gf():
    # read the file as a dataframe with 16 unnamed columns
    columns = list(range(16))
    df = pd.read_csv("verbs_gf.csv", sep=r",|\|", names=columns, encoding='utf8', engine='python')
    return df

def get_soup(url):
    # request the page with a browser User-Agent so the site serves full HTML
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    try:
        page = urllib.request.urlopen(request)
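The rest of get_soup is cut off by the hunk boundary. A minimal self-contained version, assuming it parses the response with BeautifulSoup; the error handling here is an assumption, not the repository's code.

```python
import urllib.error
import urllib.request
from bs4 import BeautifulSoup

def get_soup(url):
    # Fetch with a browser User-Agent, then parse into BeautifulSoup.
    request = urllib.request.Request(
        url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        page = urllib.request.urlopen(request)
    except urllib.error.HTTPError:
        return None  # assumed fallback; callers would need to handle it
    return BeautifulSoup(page.read(), "html.parser")
```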
@@ -30,13 +23,19 @@ def write_to_file(list, path, mode):
    for line in list:
        file.write(line)

def get_text(article_url):
    article_text = ""
    soup = get_soup(article_url)
    results_list = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
    for result in results_list:
        try:
            article_text += "\n" + result.find("p").text
        except AttributeError:
            # find("p") returned None: the div holds no paragraph (e.g. an embed)
            pass
    return article_text
def save_csv(df, path):
    df.to_csv(path, index=False)

def read_csv(path, sep, header):
    df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
    return df

def get_articles():
    with open('articles.txt', 'r', encoding='utf-8') as articles:
        return articles.read().replace('\n', '')

def get_preprocessed_verbs():
    return read_csv("preprocessed_verbs.csv", ",", header=0)

def get_postimees_urls():
    with open("postimees_urls.txt", "r") as f:
        return f.read().split('\n')
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from util import save_csv, read_csv

def get_preprocessed_verbs():
    return read_csv("preprocessed_verbs.csv", ",", header=0)

def preprocess_verbs():
    df = read_csv("verbs_gf.csv", ", ", None)
    df = split_double_forms(df)
    df = add_common_substring(df)
    save_csv(df, "preprocessed_verbs.csv")

def split_double_forms(df):
    for i in range(len(df.index)):
......
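split_double_forms is truncated above, and add_common_substring does not appear in the hunk at all. Given the SequenceMatcher import and the common_substring column used downstream, it plausibly reduces each verb's inflected forms to their longest shared substring. A sketch under that assumption, with a hypothetical helper name:

```python
from difflib import SequenceMatcher

def longest_common_substring(forms):
    # Fold the form list down to the longest substring shared by all,
    # pairwise, via SequenceMatcher.find_longest_match.
    common = forms[0]
    for form in forms[1:]:
        matcher = SequenceMatcher(None, common, form)
        match = matcher.find_longest_match(0, len(common), 0, len(form))
        common = common[match.a:match.a + match.size]
    return common

print(longest_common_substring(["olema", "olen", "olin"]))  # 'ol'
```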