Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
likorn
/
estonian_verbs
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
f3e44de1
authored
Nov 22, 2018
by
Paktalin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
second forms are saved as separate verbs at the end of the dataframe
parent
79266248
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
28 deletions
__pycache__/preprocessing.cpython-36.pyc
main.py
preprocessing.py
__pycache__/preprocessing.cpython-36.pyc
View file @
f3e44de1
No preview for this file type
main.py
View file @
f3e44de1
...
...
@@ -9,19 +9,19 @@ from tqdm import tqdm
# Driver script: load the verb table, download the article texts, then look
# for one verb's forms inside the first article.
print("getting verbs...")
verbs = get_verbs_gf()
print(verbs)

# retrieve links to postimees articles
print("getting postimees urls...")
postimees_urls = get_postimees_urls()

# Download every article; tqdm only wraps the iterable to show progress.
print("extracting text from the urls...")
articles = [get_text(url) for url in tqdm(postimees_urls)]

# try to find a verb in an article
# NOTE(review): row 2 and article 0 are hard-coded probes; `verbs` is presumably
# a pandas DataFrame (it is indexed with .iloc) -- confirm against get_verbs_gf.
verb_forms = verbs.iloc[2]
for column in verbs:
    verb_form = verb_forms[column]
    # Cells that are not text (e.g. missing values) cannot be searched for.
    if isinstance(verb_form, str):
        print(verb_form)
        # str.find returns the offset of the first match, or -1 if absent.
        print(articles[0].find(verb_form))
\ No newline at end of file
preprocessing.py
View file @
f3e44de1
...
...
@@ -11,18 +11,14 @@ def read_csv():
df
=
pd
.
read_csv
(
"verbs_gf.csv"
,
sep
=
", "
,
encoding
=
'utf8'
,
header
=
None
,
engine
=
'python'
)
return
df
def double(column):
    """Split a column of ``'first|second'`` strings into two columns.

    Args:
        column: pandas Series of strings; a value may hold alternative
            forms separated by ``'|'``.

    Returns:
        A ``(first, second)`` tuple. ``first`` is a Series with the text
        before the first ``'|'`` of every value. ``second`` is a Series with
        the text between the first and second ``'|'`` (missing where a value
        has no ``'|'``), or a plain list of ``None`` with one entry per row
        when no value in the column contains ``'|'`` at all.
    """
    parts = column.str.split('|', expand=True)
    first, second = column, None
    try:
        first = parts[0]
        second = parts[1]
    except KeyError:
        # The split produced no such column (no value contained '|'):
        # report an all-empty second column instead of failing.
        second = [None] * len(parts)
    return first, second
def split_double(df):
    """Append the second alternative of ``'first|second'`` cells as new rows.

    Every row of ``df`` is split on ``'|'``. If at least one cell of the row
    holds an alternative form, one extra row is produced: cells with an
    alternative contribute the text between the first and second ``'|'``,
    all other cells contribute their first (or only) form. The extra rows
    are placed after the existing rows, which are left as they were
    (including their ``'|'``-separated text). Alternatives beyond the second
    are not used.

    Args:
        df: pandas DataFrame whose cells are strings.

    Returns:
        ``df`` itself when no cell contains ``'|'``; otherwise a new
        DataFrame with the extra rows appended and the index renumbered
        from 0.
    """
    second_form_rows = []
    for _, row in df.iterrows():
        parts = row.str.split('|', expand=True)
        if 1 not in parts.columns:
            # No cell in this row carries an alternative form.
            continue
        # Where a cell had no second form, fall back to its first form.
        second_form_rows.append(parts[1].fillna(parts[0]))

    if not second_form_rows:
        return df
    # One concat at the end instead of re-allocating the frame per row
    # (DataFrame.append no longer exists in pandas >= 2.0).
    return pd.concat([df, pd.DataFrame(second_form_rows)], ignore_index=True)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment