Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
likorn
/
estonian_verbs
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
e449480f
authored
Nov 24, 2018
by
Paktalin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
finished the first version of filtering wrong occurrences out
parent
80597f10
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
48 additions
and
7 deletions
__pycache__/util.cpython-36.pyc
preprocessing.py
util.py
__pycache__/util.cpython-36.pyc
View file @
e449480f
No preview for this file type
preprocessing.py
View file @
e449480f
from
util
import
save_csv
,
get_preprocessed_verbs
,
get_articles
from
tqdm
import
tqdm
def
extract_verbs_occurences_from_articles
(
verbs
,
articles
):
verbs
[
'occurences'
]
=
''
print
(
"finding approximate verbs occurences"
)
# trial with the first verb
verb
=
verbs
[
"common_substring"
][
0
]
occurences
=
[
sentence
+
'.'
for
sentence
in
articles
.
split
(
'.'
)
if
verb
in
sentence
]
verbs
[
'occurences'
][
0
]
=
filter_wrong_occurences
(
verb
,
occurences
)
spaced_verb
=
' '
+
verb
occurences
=
[
sentence
+
'.'
for
sentence
in
articles
.
split
(
'.'
)
if
spaced_verb
in
sentence
]
verbs
[
'occurences'
][
0
]
=
filter_wrong_occurences
(
verbs
.
iloc
[
0
],
occurences
)
# for i in tqdm(range(len(verbs))):
# verb = verbs["common_substring"][i]
...
...
@@ -17,10 +19,48 @@ def extract_verbs_occurences_from_articles(verbs, articles):
# save_csv(verbs, "with_approximate_occurences.csv")
def filter_wrong_occurences(verb, occurences):
    """Drop occurrences that contain no known inflected form of *verb*.

    Parameters:
        verb: an indexable verb record understood by get_all_forms().
        occurences: list of sentence strings to filter.

    Returns:
        The filtered occurrences, deduplicated. Deduplication goes through
        set(), so the original order is not preserved (matches the original
        intent).

    Fixes two defects in the original:
    - it called occurences.remove() while iterating the same list, which
      silently skips the element following each removal;
    - it never returned a value (the deduplicated list was bound to a local
      and discarded), even though callers assign its result.
    """
    print(verb)
    print("filtering wrong occurences")
    all_forms = get_all_forms(verb)
    # Build a new list instead of mutating the one being iterated.
    kept = [occ for occ in occurences
            if any(form in occ for form in all_forms)]
    kept = list(set(kept))
    print(kept)
    return kept
def get_all_forms(verb):
    """Collect every generated inflected form for one verb record.

    *verb* is indexable: positions 0, 1, 2 hold the ma-/da-/b-infinitive
    strings (suffixes of 2, 2 and 1 characters are stripped to obtain the
    roots); positions 6 and 7 hold two extra forms taken verbatim.
    """
    ma_root = verb[0][:-2]
    da_root = verb[1][:-2]
    b_root = verb[2][:-1]

    collected = []
    for generated in (forms_from_ma(ma_root),
                      forms_from_da(da_root),
                      forms_from_b(b_root)):
        collected.extend(generated)
    collected.append(verb[6])
    collected.append(verb[7])
    return collected
def forms(root, endings):
    """Return root+ending words, each suffixed with a terminating character.

    Every ending is combined with each of the delimiters that can follow a
    word in running text (space, '.', '?', '!', ','); the delimiter suffix
    anchors substring matching so e.g. 'elan ' does not match inside
    'elandik'.

    Replaces five copy-pasted comprehensions (one per delimiter) with a
    single nested comprehension; output order is identical: all
    space-suffixed forms first, then the '.', '?', '!' and ',' groups.
    """
    delimiters = (' ', '.', '?', '!', ',')
    return [root + ending + d for d in delimiters for ending in endings]
def forms_from_b(root):
    """Forms built on the b-infinitive root: present-tense personal endings
    plus conditional endings (Estonian -ks- forms)."""
    personal = ['n', 'd', 'b', 'me', 'te', 'vad', '']
    conditional = ['ksin', 'ksid', 'ks', 'ksime', 'ksite']
    return forms(root, personal + conditional)
def forms_from_ma(root):
    """Forms built on the ma-infinitive root: ma-derivations plus
    past-tense personal endings."""
    infinitive = ['ma', 'mas', 'mast', 'maks', 'mata', 'v', 'vat']
    past = ['sin', 'sid', 's', 'sime', 'site']
    return forms(root, infinitive + past)
def forms_from_da(root):
    """Forms built on the da-infinitive root: da-infinitive, imperative,
    conditional-perfect and impersonal endings."""
    suffix_list = [
        'da', 'gu', 'gem', 'ge',
        'nuksin', 'nuks', 'nuksid', 'nuksime', 'nuksite',
        'di', 'nuvat', 'davat', 'des', 'dav',
    ]
    return forms(root, suffix_list)
def forms_from_kse(root):
    """Forms built on the kse-root (impersonal stem).

    Currently unused by the other helpers in this file — presumably kept for
    the full loop still commented out above.
    """
    suffixes = ('kse', 'ks', 'gu', '', 'vat', 'v')
    return forms(root, list(suffixes))
# Script entry point: load the preprocessed verbs table and the article
# corpus, then annotate the verbs with their occurrences.
# The diff residue also showed a call to `extract_occurences(verbs)` — a name
# defined nowhere in the file (it sat on the deletion side of the diff); the
# commit replaced it with the call below, so only that call is kept.
verbs = get_preprocessed_verbs()
articles = get_articles()
extract_verbs_occurences_from_articles(verbs, articles)
util.py
View file @
e449480f
...
...
@@ -32,7 +32,8 @@ def read_csv(path, sep, header):
def get_articles():
    """Read the article corpus from articles.txt (UTF-8) and return it as a
    single string with all newlines removed.

    The visible span contained an unreachable duplicate of this logic after
    the return (diff residue: the old two-statement version alongside the new
    one-liner); only the reachable version is kept.
    """
    with open('articles.txt', 'r', encoding='utf-8') as articles:
        return articles.read().replace('\n', '')
def get_preprocessed_verbs():
    """Load the preprocessed verbs table from preprocessed_verbs.csv
    (comma-separated, first row is the header)."""
    csv_path = "preprocessed_verbs.csv"
    return read_csv(csv_path, ",", header=0)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment