Commit a433e0ec by Paktalin

extracted occurrences

parent 97fbfda2
-from util import get_postimees_urls
+from util import get_postimees_urls, write_list_to_file
 from tqdm import tqdm
 
 def save_postimees_urls():
@@ -16,7 +16,7 @@ def save_postimees_urls():
             print("Extracted links from %i pages" % page_index)
             break
         page_index += 1
-    write_to_file(url_list, 'postimees_urls.txt', 'w')
+    write_list_to_file(url_list, 'postimees_urls.txt', 'w')
     return url_list
 
 def extract_articles_from_urls():
@@ -24,7 +24,7 @@ def extract_articles_from_urls():
     for i in tqdm(range(9551, len(postimees_urls))): # loading bar
         url = postimees_urls[i]
         article = get_text_from_articles(url)
-        write_to_file(article, 'articles.txt', 'a')
+        write_list_to_file(article, 'articles.txt', 'a')
 
 def get_text_from_articles(article_url):
     article_text = ""
...
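The hard-coded 9551 in extract_articles_from_urls resumes a previously interrupted scrape partway through the URL list. A more self-contained version of the same resume pattern might derive the start index from what is already on disk (a sketch; resume_index is a hypothetical helper, and it assumes one article per line):

    import io, os

    def resume_index(path):
        # Count articles already saved so a restarted run skips them.
        if not os.path.exists(path):
            return 0
        with io.open(path, encoding='utf-8') as file:
            return sum(1 for _ in file)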
This source diff could not be displayed because it is too large.
-from util import save_csv, get_preprocessed_verbs, get_articles, write_to_file
+from util import save_csv, get_preprocessed_verbs, get_articles, write_string_to_file, get_verbs_with_occurences
 from tqdm import tqdm
-import io, re
+import io, re, ast, csv
 
+def get_verbs_with_not_empty_occurences():
+    csv.field_size_limit(100000000)
+    verbs = get_verbs_with_occurences()
+    verbs = verbs.loc[verbs[9] != '[]'][[0, 9]]
+    verbs.index = range(len(verbs))
+    save_csv(verbs, 'cleaned_verbs_with_occurences.csv')
+    print(verbs)
+
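get_verbs_with_not_empty_occurences raises the csv field size limit before reading because a single stringified occurrence list can far exceed the csv module's default cap of 131072 characters; 100000000 is just a generous ceiling. The standalone idiom:

    import csv

    # Default field limit is 131072 characters; occurrence cells can be much larger.
    csv.field_size_limit(100000000)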
 def extract_verbs_occurences_from_articles(verbs, articles):
     verbs['occurences'] = ''
-    print("finding approximate verbs occurences")
-    for i in tqdm(range(len(verbs))):
-        # finish the pattern
-        pattern = '^(.*\W)*' + verbs[8][i] + '(?!(mi|ja)).*$'
-        occurences = list(set([sentence + '.' for sentence in articles.split('.') if re.match(pattern, sentence)]))
+    for i in tqdm(range(1473, len(verbs))):
+        spaced_verb = ' ' + verbs[8][i]
+        occurences = list(set([sentence + '.' for sentence in articles.split('.') if spaced_verb in sentence]))
         verbs['occurences'][i] = filter_wrong_occurences(verbs.iloc[i], occurences)
-    save_csv(verbs, "with_approximate_occurences.csv")
+    save_csv(verbs, "with_approximate_occurences_1473.csv")
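The rewritten loop replaces a per-sentence re.match with a plain substring test, so the expensive regex work now happens only in filter_wrong_occurences, and only for sentences that already contain the verb stem. A minimal sketch of the same pre-filter idea (find_candidates is a hypothetical name):

    def find_candidates(stem, articles):
        # Cheap containment test first; regex verification comes later.
        spaced = ' ' + stem
        return list(set(s + '.' for s in articles.split('.') if spaced in s))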
 
 def filter_wrong_occurences(verb, occurences):
     all_forms = get_all_forms(verb)
     verified_occurences = []
-    not_fond = []
     for occurence in occurences:
         found = False
         for form in all_forms:
-            pattern = '^(.*\W)*'+form+'(\W.*)*$'
-            if re.match(pattern, occurence):
-                verified_occurences.append(occurence)
-                found = True
-                break
-        if not found:
-            not_found = ('%s was not found in \"%s\"\n' % (verb[0], occurence))
-            with io.open('not_found.txt', 'a', encoding='utf-8') as file:
-                file.write(not_found)
+            if form in occurence:
+                pattern = re.compile('.*'+form+'(\W.*)*$')
+                if pattern.match(occurence):
+                    verified_occurences.append(occurence)
+                    found = True
+                    break
+        # if not found:
+        #     not_found = ('%s was not found in \"%s\"\n' % (verb[0], occurence))
+        #     write_string_to_file(not_found, 'not_found.txt', 'a')
     return verified_occurences
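Each form is interpolated into the verification pattern verbatim, which is fine as long as forms never contain regex metacharacters; re.escape would make the check robust either way (a sketch, not the commit's code):

    import re

    def matches_form(form, sentence):
        # The form must be followed by a non-word character or the end of the sentence.
        pattern = re.compile('.*' + re.escape(form) + r'(\W.*)*$')
        return bool(pattern.match(sentence))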
@@ -43,7 +47,7 @@ def get_all_forms(verb):
     return all_forms
 
 def forms(root, endings):
-    return [root+ending+' ' for ending in endings] + [root+ending+'.' for ending in endings] + [root+ending+'?' for ending in endings] + [root+ending+'!' for ending in endings] + [root+ending+',' for ending in endings]
+    return [root+ending for ending in endings]
 
 def forms_from_b(root):
     endings = ['n', 'd', 'b', 'me', 'te', 'vad', '', 'ksin', 'ksid', 'ks', 'ksime', 'ksite']
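forms now returns bare root+ending strings, since trailing punctuation is handled by the regex in filter_wrong_occurences instead of being baked into five punctuated variants of every form. For a hypothetical root:

    >>> forms('ela', ['n', 'd', 'b', ''])
    ['elan', 'elad', 'elab', 'ela']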
@@ -65,7 +69,8 @@ def forms_from_dud(root):
     endings = ['ud', 'av', 'avat', 'agu', 'i', 'a']
     return forms(root, endings)
 
-verbs = get_preprocessed_verbs()
-articles = get_articles().lower()
-extract_verbs_occurences_from_articles(verbs, articles)
\ No newline at end of file
+# verbs = get_preprocessed_verbs()
+# articles = get_articles().lower()
+# extract_verbs_occurences_from_articles(verbs, articles)
+get_verbs_with_not_empty_occurences()
\ No newline at end of file
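The newly imported ast is not used in the code shown here; it is presumably intended for turning the stringified occurrence lists back into Python lists after the CSV round trip, which ast.literal_eval does safely (a sketch under that assumption; parse_occurences is a hypothetical name):

    import ast

    def parse_occurences(cell):
        # "['esimene lause.', 'teine lause.']" -> ['esimene lause.', 'teine lause.']
        return ast.literal_eval(cell)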
@@ -11,11 +11,15 @@ def get_soup(url):
     page = urllib.request.urlopen(page)
     return BeautifulSoup(page, 'html.parser')
 
-def write_to_file(list, path, mode):
+def write_list_to_file(list, path, mode):
     with io.open(path, mode, encoding='utf-8') as file:
         for line in list:
             file.write(line)
 
+def write_string_to_file(string, path, mode):
+    with io.open(path, mode, encoding='utf-8') as file:
+        file.write(string)
+
 def save_csv(df, path):
     df.to_csv(path, index=False, header=False)
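With the rename, list and string payloads get separate helpers rather than one write_to_file doing double duty. Usage (file names are illustrative):

    write_list_to_file(['rida 1\n', 'rida 2\n'], 'urls.txt', 'w')
    write_string_to_file('üks kiri\n', 'not_found.txt', 'a')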
@@ -32,4 +36,7 @@ def get_preprocessed_verbs():
     return read_csv("preprocessed_verbs.csv")
 
 def get_postimees_urls():
-    return open("postimees_urls.txt", "r").read().split('\n')
\ No newline at end of file
+    return open("postimees_urls.txt", "r").read().split('\n')
+
+def get_verbs_with_occurences():
+    return read_csv("with_approximate_occurences_all.csv")
\ No newline at end of file
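Since save_csv writes with header=False, the matching reader must not treat the first row as a header; that is why columns are addressed by integer position (verbs[0], verbs[8], verbs[9]) throughout. A sketch of a compatible reader (assuming the project's read_csv wraps pandas):

    import pandas as pd

    def read_csv(path):
        # No header row, so columns come back labeled 0, 1, ..., 9.
        return pd.read_csv(path, header=None)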
Four more source diffs could not be displayed because they are too large.