created separate columns for double forms

79266248 · Paktalin · e4c2aa75 · 79266248 · 79266248 · 79266248
Commit 79266248 authored Nov 22, 2018 by Paktalin
Showing 4 changed files with 40 additions and 18 deletions
__pycache__/preprocessing.cpython-36.pyc
__pycache__/util.cpython-36.pyc
main.py
preprocessing.py
--- a/__pycache__/preprocessing.cpython-36.pyc
+++ b/__pycache__/preprocessing.cpython-36.pyc
--- a/__pycache__/util.cpython-36.pyc
+++ b/__pycache__/util.cpython-36.pyc
--- a/main.py
+++ b/main.py
 import pandas as pd
-from util import get_postimees_urls, get_verbs_gf, get_text
+from util import get_postimees_urls, get_text
+from preprocessing import get_verbs_gf
+
 import progressbar
 import numpy as np
 from tqdm import tqdm

 print("getting verbs...")
 verbs = get_verbs_gf()
+print(verbs)
 # retrieve links to postimees articles
-print("getting postimees urls...")
-postimees_urls = get_postimees_urls()
+# print("getting postimees urls...")
+# postimees_urls = get_postimees_urls()

-print("extracting text from the urls...")
-articles = []
-for i in tqdm(range(len(postimees_urls))):
-	url = postimees_urls[i]
-	articles.append(get_text(url))
+# print("extracting text from the urls...")
+# articles = []
+# for i in tqdm(range(len(postimees_urls))):
+# 	url = postimees_urls[i]
+# 	articles.append(get_text(url))

-# try to find a verb in an article
-for column in verbs:
-	verb_form = verbs.iloc[2][column]
-	if type(verb_form) is str:
-		print(verb_form)
-		print(str(articles[0].find(verb_form)))
\ No newline at end of file
+# # try to find a verb in an article
+# for column in verbs:
+# 	verb_form = verbs.iloc[2][column]
+# 	if type(verb_form) is str:
+# 		print(verb_form)
+# 		print(str(articles[0].find(verb_form)))
\ No newline at end of file
--- a/preprocessing.py
+++ b/preprocessing.py
 import pandas as pd
-
+import numpy as np

 def get_verbs_gf():
 	# read file as dataframe
-	df read_csv()
+	df = read_csv()
+	df = split_double(df)
+	return df

 def read_csv():
-	df = pd.read_csv("verbs_gf.csv", sep=",", names=columns, encoding='utf8')
+	df = pd.read_csv("verbs_gf.csv", sep=", ", encoding='utf8', header=None, engine='python')
+	return df
+
+
+def double(column):
+	new_column = column.str.split('|', expand=True)
+	try:
+		column = new_column[0]
+		new_column = new_column[1]
+	except Exception as e:
+		new_column = [None]*len(new_column)
+	return column, new_column
+
+def split_double(df):
+	for column_name in df.columns:
+		second_column_name = str(column_name) + "double"
+		df[column_name], df[second_column_name] = double(df[column_name])
 	return df
\ No newline at end of file