Commit a433e0ec by Paktalin

extracted occurrences

parent 97fbfda2
-from util import get_postimees_urls
+from util import get_postimees_urls, write_list_to_file
 from tqdm import tqdm

 def save_postimees_urls():
@@ -16,7 +16,7 @@ def save_postimees_urls():
             print("Extracted links from %i pages" % page_index)
             break
         page_index += 1
-    write_to_file(url_list, 'postimees_urls.txt', 'w')
+    write_list_to_file(url_list, 'postimees_urls.txt', 'w')
     return url_list

 def extract_articles_from_urls():
@@ -24,7 +24,7 @@ def extract_articles_from_urls():
     for i in tqdm(range(9551, len(postimees_urls))): # loading bar
         url = postimees_urls[i]
         article = get_text_from_articles(url)
-        write_to_file(article, 'articles.txt', 'a')
+        write_list_to_file(article, 'articles.txt', 'a')

 def get_text_from_articles(article_url):
     article_text = ""
-from util import save_csv, get_preprocessed_verbs, get_articles, write_to_file
+from util import save_csv, get_preprocessed_verbs, get_articles, write_string_to_file, get_verbs_with_occurences
 from tqdm import tqdm
-import io, re
+import io, re, ast, csv

+def get_verbs_with_not_empty_occurences():
+    csv.field_size_limit(100000000)
+    verbs = get_verbs_with_occurences()
+    verbs = verbs.loc[verbs[9] != '[]'][[0, 9]]
+    verbs.index = range(len(verbs))
+    save_csv(verbs, 'cleaned_verbs_with_occurences.csv')
+    print(verbs)
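Note: the new import line adds both ast and csv, but only csv is used in this hunk (for field_size_limit). A minimal sketch of what ast is presumably for, since the occurrence column is stored in the CSV as a stringified Python list (hence the != '[]' check above); the cell value here is hypothetical, not from the repo:

import ast

cell = "['Ta kirjutab kirja.', 'Ma kirjutan homme.']"  # hypothetical value of a column-9 CSV cell
occurences = ast.literal_eval(cell)  # parse the stringified list back into a real Python list
print(len(occurences))  # -> 2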
 def extract_verbs_occurences_from_articles(verbs, articles):
     verbs['occurences'] = ''
     print("finding approximate verbs occurences")
-    for i in tqdm(range(len(verbs))):
-        # finish the pattern
-        pattern = '^(.*\W)*' + verbs[8][i] + '(?!(mi|ja)).*$'
-        occurences = list(set([sentence + '.' for sentence in articles.split('.') if re.match(pattern, sentence)]))
+    for i in tqdm(range(1473, len(verbs))):
+        spaced_verb = ' ' + verbs[8][i]
+        occurences = list(set([sentence + '.' for sentence in articles.split('.') if spaced_verb in sentence]))
         verbs['occurences'][i] = filter_wrong_occurences(verbs.iloc[i], occurences)
-    save_csv(verbs, "with_approximate_occurences.csv")
+    save_csv(verbs, "with_approximate_occurences_1473.csv")
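Note: this hunk swaps the anchored regex prefilter, with its (?!(mi|ja)) negative lookahead against derived forms, for a plain substring test, and restarts the scan at index 1473 instead of 0. A sketch of the trade-off, with hypothetical values standing in for a corpus sentence and for verbs[8][i]; the substring test is far cheaper per sentence, but the required leading space means a verb that starts a sentence is no longer caught:

import re

sentence = " ta kirjutab iga päev"  # hypothetical sentence from the articles corpus
root = "kirjuta"                    # hypothetical verbs[8][i] value

old_hit = re.match(r'^(.*\W)*' + root + r'(?!(mi|ja)).*$', sentence)  # old: one regex per sentence
new_hit = (' ' + root) in sentence                                    # new: cheap substring test

print(bool(old_hit), new_hit)  # -> True True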
 def filter_wrong_occurences(verb, occurences):
     all_forms = get_all_forms(verb)
     verified_occurences = []
-    not_found = []
     for occurence in occurences:
         found = False
         for form in all_forms:
-            pattern = '^(.*\W)*'+form+'(\W.*)*$'
-            if re.match(pattern, occurence):
-                verified_occurences.append(occurence)
-                found = True
-                break
-        if not found:
-            not_found = ('%s was not found in \"%s\"\n' % (verb[0], occurence))
-            with io.open('not_found.txt', 'a', encoding='utf-8') as file:
-                file.write(not_found)
+            if form in occurence:
+                pattern = re.compile('.*'+form+'(\W.*)*$')
+                if pattern.match(occurence):
+                    verified_occurences.append(occurence)
+                    found = True
+                    break
+        # if not found:
+        #     not_found = ('%s was not found in \"%s\"\n' % (verb[0], occurence))
+        #     write_string_to_file(not_found, 'not_found.txt', 'a')
     return verified_occurences
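Note: the reworked inner loop now runs a cheap in-test before compiling a regex, and the regex only accepts the form when it is followed by a non-word character or the end of the string. A self-contained sketch of that behaviour, with illustrative values rather than real rows from the verbs table:

import re

form = "kirjutan"
occurences = ["ma kirjutan kirja.",       # form followed by a space: verified
              "see on kirjutanud vorm."]  # form embedded in a longer word: rejected

for occurence in occurences:
    if form in occurence:  # cheap prefilter, as in the new code
        pattern = re.compile('.*' + form + r'(\W.*)*$')
        print(occurence, '->', bool(pattern.match(occurence)))
# -> True for the first sentence, False for the second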
@@ -43,7 +47,7 @@ def get_all_forms(verb):
     return all_forms

 def forms(root, endings):
-    return [root+ending+' ' for ending in endings] + [root+ending+'.' for ending in endings] + [root+ending+'?' for ending in endings] + [root+ending+'!' for ending in endings] + [root+ending+',' for ending in endings]
+    return [root+ending for ending in endings]

 def forms_from_b(root):
     endings = ['n', 'd', 'b', 'me', 'te', 'vad', '', 'ksin', 'ksid', 'ks', 'ksime', 'ksite']
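Note: forms() no longer multiplies every ending by the trailing space/period/question/exclamation/comma variants, because boundary handling moved into the verification regex above. An illustration with a hypothetical root and endings:

def forms(root, endings):
    return [root + ending for ending in endings]

print(forms('kirjuta', ['n', 'd', 'b']))
# -> ['kirjutan', 'kirjutad', 'kirjutab']  (bare forms only)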
@@ -65,7 +69,8 @@ def forms_from_dud(root):
     endings = ['ud', 'av', 'avat', 'agu', 'i', 'a']
     return forms(root, endings)

-verbs = get_preprocessed_verbs()
-articles = get_articles().lower()
-extract_verbs_occurences_from_articles(verbs, articles)
\ No newline at end of file
+# verbs = get_preprocessed_verbs()
+# articles = get_articles().lower()
+# extract_verbs_occurences_from_articles(verbs, articles)
+get_verbs_with_not_empty_occurences()
\ No newline at end of file
@@ -11,11 +11,15 @@ def get_soup(url):
     page = urllib.request.urlopen(page)
     return BeautifulSoup(page, 'html.parser')

-def write_to_file(list, path, mode):
+def write_list_to_file(list, path, mode):
     with io.open(path, mode, encoding='utf-8') as file:
         for line in list:
             file.write(line)

+def write_string_to_file(string, path, mode):
+    with io.open(path, mode, encoding='utf-8') as file:
+        file.write(string)
+
 def save_csv(df, path):
     df.to_csv(path, index=False, header=False)
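Note: a usage sketch for the renamed list writer and the new string writer; the file names and contents are illustrative, not from the commit. Passing a plain string to write_list_to_file would still run, since iterating a string yields single characters, so the separate string writer mainly makes the intent explicit:

from util import write_list_to_file, write_string_to_file

write_list_to_file(['esimene rida\n', 'teine rida\n'], 'demo_lines.txt', 'w')  # one write per item
write_string_to_file('üks terve tekst\n', 'demo_text.txt', 'w')                # one write for the whole string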
@@ -32,4 +36,7 @@ def get_preprocessed_verbs():
     return read_csv("preprocessed_verbs.csv")

 def get_postimees_urls():
-    return open("postimees_urls.txt", "r").read().split('\n')
\ No newline at end of file
+    return open("postimees_urls.txt", "r").read().split('\n')
+
+def get_verbs_with_occurences():
+    return read_csv("with_approximate_occurences_all.csv")
\ No newline at end of file