Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
likorn
/
estonian_verbs
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
79266248
authored
Nov 22, 2018
by
Paktalin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
created separate columns for double forms
parent
e4c2aa75
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
40 additions
and
18 deletions
__pycache__/preprocessing.cpython-36.pyc
__pycache__/util.cpython-36.pyc
main.py
preprocessing.py
__pycache__/preprocessing.cpython-36.pyc
0 → 100644
View file @
79266248
File added
__pycache__/util.cpython-36.pyc
View file @
79266248
No preview for this file type
main.py
View file @
79266248
import
pandas
as
pd
from
util
import
get_postimees_urls
,
get_verbs_gf
,
get_text
from
util
import
get_postimees_urls
,
get_text
from
preprocessing
import
get_verbs_gf
import
progressbar
import
numpy
as
np
from
tqdm
import
tqdm
# Script flow: load the verb table, download the article texts, then look
# for the verb forms of one table row inside the first article.
print("getting verbs...")
verbs = get_verbs_gf()
print(verbs)

# retrieve links to postimees articles
print("getting postimees urls...")
postimees_urls = get_postimees_urls()

print("extracting text from the urls...")
# tqdm wraps the URL collection directly, so no index bookkeeping is needed.
articles = [get_text(url) for url in tqdm(postimees_urls)]

# try to find a verb in an article
# Only the first article and the third row of the verb table are probed;
# skip the probe when either is missing instead of raising IndexError.
if articles and len(verbs) > 2:
    first_article = articles[0]
    verb_row = verbs.iloc[2]
    for column in verbs:
        verb_form = verb_row[column]
        # Non-string cells (e.g. missing values) cannot be searched for.
        if isinstance(verb_form, str):
            print(verb_form)
            # str.find returns the match offset, or -1 when absent.
            print(first_article.find(verb_form))
preprocessing.py
View file @
79266248
import
pandas
as
pd
import
numpy
as
np
def get_verbs_gf():
    """Load the verbs table and split its double forms into extra columns.

    Returns:
        The DataFrame produced by ``read_csv()`` after it has been passed
        through ``split_double()``.
    """
    # read file as dataframe
    df = read_csv()
    # move the second alternative of every 'a|b' cell into its own column
    df = split_double(df)
    return df
def read_csv(path="verbs_gf.csv"):
    """Read the verbs file into a DataFrame.

    Args:
        path: File to read. Defaults to "verbs_gf.csv", the file this
            function read before the parameter existed.

    Returns:
        A DataFrame with pandas' default integer column labels; the file is
        read with ``header=None``, so its first line is treated as data.
    """
    # The separator is two characters (comma + space). pandas treats
    # separators longer than one character as regular expressions, which
    # requires the python parsing engine, so it is requested explicitly.
    return pd.read_csv(
        path,
        sep=", ",
        encoding='utf8',
        header=None,
        engine='python',
    )
def double(column):
    """Split a column of '|'-separated strings into two columns.

    Args:
        column: A pandas Series. String cells may hold two alternatives
            written as ``"first|second"``.

    Returns:
        A ``(first, second)`` tuple. ``first`` holds the text before the
        first '|' of every cell. ``second`` holds the text between the first
        and second '|' (missing where a cell has no '|'). When no cell in
        the column contains '|', ``second`` is a plain list of ``None`` with
        one entry per row. Parts after the second one are discarded.
    """
    try:
        parts = column.str.split('|', expand=True)
    except AttributeError:
        # The .str accessor only exists for string-valued columns; a
        # non-string column has nothing to split, so pass it through.
        return column, [None] * len(column)

    # An empty input may produce a frame without a 0 column; keep the
    # original column in that case.
    first = parts[0] if 0 in parts.columns else column

    # Column 1 only exists when at least one cell actually contained '|'.
    if 1 in parts.columns:
        second = parts[1]
    else:
        second = [None] * len(parts)
    return first, second
def split_double(df):
    """Give every column of *df* a companion column for its second forms.

    Each existing column is passed through ``double()``. The first part
    replaces the column in place, and the second part is stored in a new
    column named ``str(<original name>) + "double"``. The frame is modified
    in place and also returned.
    """
    for name in df.columns:
        primary, secondary = double(df[name])
        df[name] = primary
        df[str(name) + "double"] = secondary
    return df
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment