Commit 8890361f by Paktalin

Moved to a different approach with the estnltk library

parent a433e0ec
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
from estnltk import Text
from util import save_dict, load_dict, save_csv, read_csv
import pandas as pd
import matplotlib.pyplot as plt
def map_verbs_with_sentences():
    """Build a mapping {verb lemma: [sentences containing it]} from articles.txt
    and persist it with save_dict under the name 'verbs_dict'.

    Uses the estnltk 1.x chained-getter API
    (Text.sentence_texts, Text.get...as_dataframe).
    """
    verbs = {}
    # Normalize non-breaking spaces on the raw string BEFORE building the Text
    # object: estnltk's Text is not a str, so the original
    # Text(...).replace(...) call would fail at runtime. The with-block also
    # closes the file handle the original left open.
    with open('articles.txt', encoding='utf-8') as f:
        raw_text = f.read().replace('\xa0', ' ')
    articles = Text(raw_text)
    for sentence in articles.sentence_texts:
        words = Text(sentence).get.word_texts.lemmas.postags.as_dataframe
        for verb in words[words['postags'] == 'V']['lemmas']:
            # 'ei' is the Estonian negation particle; it is POS-tagged 'V'
            # but is not a content verb we want to collect.
            if verb != 'ei':
                verbs.setdefault(verb, []).append(sentence)
    save_dict(verbs, 'verbs_dict')
def verbs_dict_to_df():
    """Flatten the pickled verb -> sentences mapping into one row per
    (verb occurrence, noun-like word) pair found in the same sentence.

    Returns a pandas DataFrame with columns: verb, noun_like, verb_form,
    noun_like_form, noun_like_pos, sentence, distance (absolute token-index
    distance between the verb occurrence and the noun-like word).
    """
    print('Loading verbs...')
    verbs = load_dict('verbs_dict')
    print('Finished loading verbs')
    rows_list = []
    total_verbs = len(verbs)
    # POS tags treated as "noun-like": pronoun, noun, proper name, adjective,
    # ordinal, cardinal, and the two abbreviation/acronym tags.
    noun_like_tags = ('P', 'S', 'H', 'A', 'O', 'N', 'U', 'Y')
    for i, (verb, sentences) in enumerate(verbs.items()):
        print('%i/%i %s' % (i, total_verbs, verb))
        for sentence in sentences:
            # Strip newlines and '~' (later reused as the CSV separator).
            sentence_text = sentence.replace('\n', '').replace('~', '')
            # Build the analysis DataFrame once; the original wrapped an
            # already-constructed Text in Text() a second time.
            words = Text(sentence_text).get.word_texts.lemmas.postags.forms.as_dataframe
            noun_likes = words[words['postags'].isin(noun_like_tags)]
            for index, noun_like in noun_likes.iterrows():
                for verb_occurence_index in words.index[words['lemmas'] == verb]:
                    # Plain dict literal (the original shadowed the builtin
                    # `dict`); store the sentence as a string, not a Text
                    # object, so it serializes cleanly to CSV.
                    rows_list.append({
                        'verb': verb,
                        'noun_like': noun_like['lemmas'],
                        'verb_form': words.iloc[verb_occurence_index]['forms'],
                        'noun_like_form': noun_like['forms'],
                        'noun_like_pos': noun_like['postags'],
                        'sentence': sentence_text,
                        'distance': abs(verb_occurence_index - index),
                    })
    return pd.DataFrame.from_dict(rows_list)
def clean_dataframe():
    """Load the '~'-separated verb/noun-like pairs CSV, drop rows whose
    noun-like form is unusable, save the cleaned frame, and plot the
    verb-to-noun-like distance against the noun-like form."""
    df = read_csv('verbs_with_noun_likes.csv', sep='~')
    df.columns = ['distance', 'noun_like', 'noun_like_form', 'noun_like_pos',
                  'sentence', 'verb', 'verbs_form']
    # Remove examples with null forms.
    df = df[pd.notnull(df['noun_like_form'])]
    # Remove examples carrying several '|'-separated alternative forms.
    # Raw string: '\|' in a plain literal is an invalid escape sequence
    # (DeprecationWarning, and a SyntaxWarning on newer Pythons).
    df = df[~df['noun_like_form'].str.contains(r'\|')]
    # Remove examples with unknown forms.
    df = df[df['noun_like_form'] != '?']
    save_csv(df, 'cleaned_dataframe.csv', sep='~')
    # Spot-check: sentences producing suspiciously large distances.
    print(df[df['distance'] > 100]['sentence'])
    plt.scatter(df['distance'], df['noun_like_form'], alpha=0.2, c='k')
    plt.show()


clean_dataframe()
\ No newline at end of file
from estnltk import Text, TextCleaner, ESTONIAN
import random
from util import load_dict, save_dict
def save_random_verbs():
    """Sample up to 100 distinct verbs (with one random example sentence
    each) from the pickled verbs dict and persist them for manual labelling.
    """
    verbs = load_dict('verbs_dict')
    # random.sample draws WITHOUT replacement, so we get exactly
    # min(100, len(verbs)) distinct verbs; the original loop drew with
    # replacement and collisions could silently yield fewer than 100 entries.
    # (The original also built an unused TextCleaner and rebound ESTONIAN
    # locally -- both were dead code and are dropped here.)
    sampled = random.sample(list(verbs.items()), min(100, len(verbs)))
    verbs_to_label = {verb: random.choice(sentences)
                      for verb, sentences in sampled}
    print(verbs_to_label)
    save_dict(verbs_to_label, 'verbs_with_labels')
def load_random_verbs():
    """Print every sampled verb with its example sentence, followed by the
    sentence's morphological analysis as a DataFrame."""
    labelled = load_dict('verbs_with_labels')
    for verb, example in labelled.items():
        print('%s: \'%s\'' % (verb, example))
        analysis = Text(example).get.word_texts.lemmas.postags.as_dataframe
        print(analysis)


load_random_verbs()
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -2,37 +2,39 @@ from util import save_csv, get_preprocessed_verbs, get_articles, write_string_to ...@@ -2,37 +2,39 @@ from util import save_csv, get_preprocessed_verbs, get_articles, write_string_to
from tqdm import tqdm from tqdm import tqdm
import io, re, ast, csv import io, re, ast, csv
def get_verbs_with_not_empty_occurences(): def get_verbs_with_cleaned_occurences():
csv.field_size_limit(100000000) csv.field_size_limit(100000000)
verbs = get_verbs_with_occurences() verbs = get_verbs_with_occurences()
verbs = verbs.loc[verbs[9] != '[]'][[0, 9]] verbs = verbs.loc[verbs[9] != '[]'][[0, 9]]
verbs.index = range(len(verbs)) verbs.index = range(len(verbs))
verbs[9] = verbs[9].str.replace('\\xa0', ' ', regex=False)
print(verbs[9])
save_csv(verbs, 'cleaned_verbs_with_occurences.csv') save_csv(verbs, 'cleaned_verbs_with_occurences.csv')
print(verbs)
def extract_verbs_occurences_from_articles(verbs, articles): def extract_verbs_occurences_from_articles(verbs, articles):
verbs['occurences'] = '' verbs['occurences'] = ''
for i in tqdm(range(1473, len(verbs))): verbs['forms'] = ''
for i in tqdm(range(len(verbs))):
spaced_verb = ' ' + verbs[8][i] spaced_verb = ' ' + verbs[8][i]
occurences = list(set([sentence + '.' for sentence in articles.split('.') if spaced_verb in sentence])) occurences = list(set([sentence + '.' for sentence in articles.split('.') if spaced_verb in sentence]))
verbs['occurences'][i] = filter_wrong_occurences(verbs.iloc[i], occurences) cleaned_occurences = filter_wrong_occurences(verbs.iloc[i], occurences)
verbs['occurences'][i], verbs['forms'][i] = cleaned_occurences[0], cleaned_occurences[1]
save_csv(verbs, "with_approximate_occurences_1473.csv") save_csv(verbs, "with_approximate_occurences_1473.csv")
def filter_wrong_occurences(verb, occurences): def filter_wrong_occurences(verb, occurences):
all_forms = get_all_forms(verb) all_forms = get_all_forms(verb)
verified_occurences = [] verified_occurences = [[],[]]
for occurence in occurences: for occurence in occurences:
found = False found = False
for form in all_forms: for form in all_forms:
if form in occurence: if form in occurence:
pattern = re.compile('.*'+form+'(\W.*)*$') pattern = re.compile('.*'+form+'(\W.*)*$')
if pattern.match(occurence): if pattern.match(occurence):
verified_occurences.append(occurence) verified_occurences[0].append(occurence)
verified_occurences[1].append(form)
found = True found = True
break break
# if not found:
# not_found = ('%s was not found in \"%s\"\n' % (verb[0], occurence))
# write_string_to_file(not_found, 'not_found.txt', 'a')
return verified_occurences return verified_occurences
...@@ -69,8 +71,8 @@ def forms_from_dud(root): ...@@ -69,8 +71,8 @@ def forms_from_dud(root):
endings = ['ud', 'av', 'avat', 'agu', 'i', 'a'] endings = ['ud', 'av', 'avat', 'agu', 'i', 'a']
return forms(root, endings) return forms(root, endings)
# verbs = get_preprocessed_verbs() verbs = get_preprocessed_verbs()
# articles = get_articles().lower() articles = get_articles().lower()
# extract_verbs_occurences_from_articles(verbs, articles) extract_verbs_occurences_from_articles(verbs, articles)
get_verbs_with_not_empty_occurences() # get_verbs_with_cleaned_occurences()
\ No newline at end of file \ No newline at end of file
import pandas as pd import pandas as pd
import urllib, io import urllib, io, pickle
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
def get_soup(url): def get_soup(url):
...@@ -20,8 +20,8 @@ def write_string_to_file(string, path, mode): ...@@ -20,8 +20,8 @@ def write_string_to_file(string, path, mode):
with io.open(path, mode, encoding='utf-8') as file: with io.open(path, mode, encoding='utf-8') as file:
file.write(string) file.write(string)
def save_csv(df, path): def save_csv(df, path, sep=','):
df.to_csv(path, index=False, header=False) df.to_csv(path, index=False, header=False, sep=sep)
def read_csv(path, sep=',', header=None): def read_csv(path, sep=',', header=None):
df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python') df = pd.read_csv(path, sep=sep, encoding='utf8', header=header, engine='python')
...@@ -39,4 +39,12 @@ def get_postimees_urls(): ...@@ -39,4 +39,12 @@ def get_postimees_urls():
return open("postimees_urls.txt", "r").read().split('\n') return open("postimees_urls.txt", "r").read().split('\n')
def get_verbs_with_occurences(): def get_verbs_with_occurences():
return read_csv("with_approximate_occurences_all.csv") return read_csv("with_approximate_occurences_all.csv")
\ No newline at end of file
def load_dict(name):
with open(name + '.pkl', 'rb') as f:
return pickle.load(f)
def save_dict(obj, name):
with open(name + '.pkl', 'wb') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
\ No newline at end of file
File added
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment