Commit 80597f10 by Paktalin

finished scripts for articles and verbs preprocessing

parent fbaa031a
(This source diff could not be displayed because it is too large.)
postimees.py
-from util import get_soup, write_to_file
+from util import get_soup, write_to_file, get_postimees_urls
+from tqdm import tqdm
-def get_postimees_urls():
-    return open("postimees_urls.txt", "r").read().split('\n')
 def save_postimees_urls():
     urlpage = 'https://www.postimees.ee/search?sections=81&page='
@@ -20,3 +18,21 @@ def save_postimees_urls():
         page_index += 1
     write_to_file(url_list, 'postimees_urls.txt', 'w')
     return url_list
+def extract_articles_from_urls():
+    postimees_urls = get_postimees_urls()
+    # tqdm draws a progress bar; the start index 9551 resumes an earlier, interrupted run
+    for i in tqdm(range(9551, len(postimees_urls))):
+        url = postimees_urls[i]
+        article = get_text_from_articles(url)
+        write_to_file(article, 'articles.txt', 'a')
+def get_text_from_articles(article_url):
+    article_text = ""
+    soup = get_soup(article_url)
+    results_list = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
+    for result in results_list:
+        try:
+            article_text += "\n" + result.find("p").text
+        except Exception:
+            pass  # body block without a <p> tag; skip it
+    return article_text
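Taken together, these helpers would be driven roughly like this (a hypothetical usage sketch, assuming this file is postimees.py as the import in the extraction script below suggests; save_postimees_urls collects the search-result URLs once, then extract_articles_from_urls appends each article's text to articles.txt):

    # Hypothetical driver for the scraping helpers above
    from postimees import save_postimees_urls, extract_articles_from_urls

    save_postimees_urls()          # crawl the search pages and write postimees_urls.txt
    extract_articles_from_urls()   # fetch each article and append its text to articles.txt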
from util import get_text, write_to_file
from preprocessing import get_preprocessed_verbs
from postimees import get_postimees_urls
from tqdm import tqdm

print("getting verbs...")
verbs = get_preprocessed_verbs()
print("getting postimees urls...")
postimees_urls = get_postimees_urls()
print("extracting text from the urls...")
articles = []  # intended to collect article texts for the commented-out search below
# tqdm draws a progress bar; the start index 3935 resumes an earlier, interrupted run
for i in tqdm(range(3935, len(postimees_urls))):
    url = postimees_urls[i]
    article = get_text(url)
    write_to_file(article, 'articles.txt', 'a')
# try to find a verb in an article
# for column in verbs:
#     verb_form = verbs.iloc[2][column]
#     if type(verb_form) is str:
#         print(verb_form)
#         print(str(articles[0].find(verb_form)))
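The commented-out block sketches a verb search that was never finished. A runnable version might look like this (hypothetical; it assumes verbs is the preprocessed DataFrame and article_text is one article's text):

    def find_verb_forms(verbs, article_text):
        # Collect every form from row 2 of the verbs table that occurs in the article
        hits = []
        for column in verbs.columns:
            verb_form = verbs.iloc[2][column]
            if isinstance(verb_form, str) and verb_form in article_text:
                hits.append(verb_form)
        return hits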
from verbs_preprocessing import get_preprocessed_verbs
import re

def extract_sentences(verbs, articles):
    for i in range(len(verbs)):
        verb = verbs["common_substring"][i]
        print(verb)
        # re.escape guards against verb stems that contain regex metacharacters
        occurences = [m.start() for m in re.finditer(re.escape(verb), articles)]
        print(occurences)

with open('articles.txt', 'r', encoding='utf-8') as f:
    articles = f.read().replace('\n', '')
verbs = get_preprocessed_verbs()
extract_sentences(verbs, articles)
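extract_sentences currently prints only character offsets. Turning an offset into the surrounding sentence could look like this (a sketch, assuming sentences are delimited by '.', as the other scripts here assume):

    def sentence_at(articles, position):
        # Slice from the previous '.' to the next one around the match position
        start = articles.rfind('.', 0, position) + 1
        end = articles.find('.', position)
        if end == -1:
            end = len(articles)
        return articles[start:end].strip() + '.'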
from util import save_csv, get_preprocessed_verbs, get_articles
from tqdm import tqdm

def extract_verbs_occurences_from_articles(verbs, articles):
    verbs['occurences'] = ''
    print("finding approximate verb occurrences")
    verb = verbs["common_substring"][0]
    occurences = [sentence + '.' for sentence in articles.split('.') if verb in sentence]
    # .at avoids pandas' chained-assignment pitfall when writing a single cell
    verbs.at[0, 'occurences'] = filter_wrong_occurences(verb, occurences)
    # for i in tqdm(range(len(verbs))):
    #     verb = verbs["common_substring"][i]
    #     occurences = [sentence + '.' for sentence in articles.split('.') if verb in sentence]
    #     verbs['occurences'][i] = occurences
    # save_csv(verbs, "with_approximate_occurences.csv")

def filter_wrong_occurences(verb, occurences):
    print(verb)
    print(occurences)
    return occurences  # placeholder: no filtering implemented yet

verbs = get_preprocessed_verbs()
articles = get_articles()
extract_verbs_occurences_from_articles(verbs, articles)
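filter_wrong_occurences is still a stub. One plausible filter (an assumption about the intent, not the committed code) keeps only sentences where the stem starts a word, dropping matches buried inside unrelated words:

    import re

    def filter_word_start_occurences(verb, occurences):
        # Keep sentences where the stem appears at the beginning of a word
        pattern = re.compile(r'\b' + re.escape(verb))
        return [s for s in occurences if pattern.search(s)]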
util.py
@@ -8,14 +8,7 @@ def get_verbs_cooljugator():
     # cut translation
     return df['verb'].str.split(' *- *').str[0]
-def get_verbs_gf():
-    # read file as dataframe
-    columns = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
-    df = pd.read_csv("verbs_gf.csv", sep=",|\|", names=columns, encoding='utf8', engine='python')
-    return df
 def get_soup(url):
     page = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
     try:
         page = urllib.request.urlopen(page)
@@ -30,13 +23,19 @@ def write_to_file(list, path, mode):
     for line in list:
         file.write(line)
-def get_text(article_url):
-    article_text = ""
-    soup = get_soup(article_url)
-    results_list = soup.find_all("div", {'class': "article-body__item article-body__item--htmlElement"})
-    for result in results_list:
-        try:
-            article_text += "\n" + result.find("p").text
-        except Exception as e:
-            pass
-    return article_text
+def save_csv(df, path):
+    df.to_csv(path, index=False)
+
+def read_csv(path, sep, header):
+    df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
+    return df
+
+def get_articles():
+    # articles.txt is read as one long string with newlines stripped
+    with open('articles.txt', 'r', encoding='utf-8') as articles:
+        return articles.read().replace('\n', '')
+
+def get_preprocessed_verbs():
+    return read_csv("preprocessed_verbs.csv", ",", header=0)
+
+def get_postimees_urls():
+    return open("postimees_urls.txt", "r").read().split('\n')
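A quick smoke test of the reorganized util helpers (a hypothetical usage sketch; the file names are the ones the committed code reads and writes):

    from util import get_preprocessed_verbs, get_articles, get_postimees_urls

    verbs = get_preprocessed_verbs()   # DataFrame from preprocessed_verbs.csv
    articles = get_articles()          # articles.txt flattened to one string
    urls = get_postimees_urls()        # one URL per line from postimees_urls.txt
    print(len(verbs), "verbs,", len(urls), "urls,", len(articles), "characters")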
verbs_preprocessing.py
 import pandas as pd
 import numpy as np
 from difflib import SequenceMatcher
+from util import save_csv, read_csv
+
+def get_preprocessed_verbs():
+    return read_csv("preprocessed_verbs.csv", ",", header=0)
 def preprocess_verbs():
     df = read_csv("verbs_gf.csv", ", ", None)
     df = split_double_forms(df)
     df = add_common_substring(df)
-    save_csv(df)
+    save_csv(df, "preprocessed_verbs.csv")
-def read_csv(path, sep, header):
-    df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
-    return df
-def save_csv(df):
-    df.to_csv("preprocessed_verbs.csv", index=False)
 def split_double_forms(df):
     for i in range(len(df.index)):
...
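This file imports difflib.SequenceMatcher, presumably for the add_common_substring step truncated above. A minimal sketch of finding the longest common substring of two verb forms with SequenceMatcher (an assumption about how add_common_substring works, not the committed code):

    from difflib import SequenceMatcher

    def common_substring(form_a, form_b):
        # find_longest_match returns the longest block shared by both strings
        m = SequenceMatcher(None, form_a, form_b).find_longest_match(
            0, len(form_a), 0, len(form_b))
        return form_a[m.a:m.a + m.size]

    # e.g. common_substring("kirjutama", "kirjutan") -> "kirjuta"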
(This source diff could not be displayed because it is too large.)