Commit f3e44de1 by Paktalin

second forms are saved as separate verbs at the end of the dataframe

parent 79266248
......@@ -9,19 +9,19 @@ from tqdm import tqdm
# Fetch the verbs via get_verbs_gf() and print them for a quick visual check.
print("getting verbs...")
verbs = get_verbs_gf()
print(verbs)
# retrieve links to postimees articles
# print("getting postimees urls...")
# postimees_urls = get_postimees_urls()
# Retrieve links to Postimees articles.
# (This line must stay a comment: without the leading '#' it is a SyntaxError.)
print("getting postimees urls...")
postimees_urls = get_postimees_urls()
# print("extracting text from the urls...")
# articles = []
# for i in tqdm(range(len(postimees_urls))):
# url = postimees_urls[i]
# articles.append(get_text(url))
# Download the text of every article URL, in order; tqdm reports progress
# while the URLs are being processed.
print("extracting text from the urls...")
articles = [get_text(url) for url in tqdm(postimees_urls)]
# # try to find a verb in an article
# for column in verbs:
# verb_form = verbs.iloc[2][column]
# if type(verb_form) is str:
# print(verb_form)
# print(str(articles[0].find(verb_form)))
\ No newline at end of file
# Try to find a verb in an article: look up every form stored in row 2 of
# the verbs table inside the first downloaded article.
sample_verb = verbs.iloc[2]  # row lookup hoisted out of the loop
first_article = articles[0]  # presumably a str returned by get_text -- confirm
for column in verbs:
    verb_form = sample_verb[column]
    # Only string cells are searched; anything else (e.g. a missing form)
    # is skipped.
    if isinstance(verb_form, str):
        print(verb_form)
        # Prints whatever ``find`` returns; for a str article that is the
        # offset of the first match, or -1 when the form does not occur.
        print(first_article.find(verb_form))
\ No newline at end of file
......@@ -11,18 +11,14 @@ def read_csv():
df = pd.read_csv("verbs_gf.csv", sep=", ", encoding='utf8', header=None, engine='python')
return df
def double(column):
    """Split a column of '|'-separated values into first and second parts.

    Args:
        column: pandas Series of strings; a cell may contain several
            pieces joined by '|' (presumably alternative verb forms --
            confirm against the CSV).

    Returns:
        A ``(first, second)`` pair. ``first`` holds the text before the
        first '|' of every cell. ``second`` holds the piece after the first
        '|' (missing for cells without one), or a plain list of ``None``
        with one entry per row when no cell in the column contains '|'.
    """
    parts = column.str.split('|', expand=True)
    # ``expand=True`` produces one integer-labelled column per piece, so the
    # presence of label 1 tells us whether any cell had a second piece.
    # Checking explicitly replaces the former catch-all ``except Exception``.
    first = parts[0] if 0 in parts.columns else column
    second = parts[1] if 1 in parts.columns else [None] * len(parts)
    return first, second
def split_double(df):
    """Append the second forms of '|'-separated cells as extra rows.

    For every row in which at least one cell contains '|', one new row is
    added at the end of the frame. In that new row each cell holds the piece
    after the first '|' when the original cell had one, and the piece
    before it (i.e. the whole cell, if it had no '|') otherwise. The
    original rows are kept unchanged, including their '|' text.

    Args:
        df: pandas DataFrame whose cells are strings.

    Returns:
        A DataFrame with the original rows followed by the second-form rows,
        re-indexed from 0. When no cell contains '|', ``df`` itself is
        returned.
    """
    second_form_rows = []
    for _, row in df.iterrows():
        pieces = row.str.split('|', expand=True)
        # Column 1 only exists when some cell in this row was actually split.
        if 1 not in pieces.columns:
            continue
        # Cells without a second form fall back to their first form.
        second_form_rows.append(pieces[1].fillna(pieces[0]))

    if not second_form_rows:
        return df
    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas 2.x, and appending row by row re-copies the frame every time.
    return pd.concat([df, pd.DataFrame(second_form_rows)], ignore_index=True)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment