import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud

def train_testgit_split(X, Y, test_size):
	"""Split ``X`` and ``Y`` into a leading train part and a trailing test part.

	No shuffling is performed: the last ``int(test_size * X.shape[0])`` rows
	form the test set and every row before them forms the training set.

	Args:
		X: Row-sliceable data exposing ``shape[0]`` (the row count).
		Y: Labels sliceable along their first axis. Assumed to have the same
			length as ``X``; this is not checked.
		test_size: Fraction of rows to hold out, between 0 and 1 inclusive.

	Returns:
		The tuple ``(Xtrain, Xtest, Ytrain, Ytest)``.

	Raises:
		ValueError: If ``test_size`` is not within ``[0, 1]``.
	"""
	if not 0 <= test_size <= 1:
		raise ValueError(
			'test_size must be a fraction between 0 and 1, got %r' % (test_size,)
		)

	n_rows = X.shape[0]
	n_test = int(test_size * n_rows)
	# Slice from an explicit boundary rather than with ``-n_test``: when
	# n_test is 0, ``-0 == 0`` would make the train part empty and put every
	# row in the test part.
	boundary = n_rows - n_test

	Xtrain, Xtest = X[:boundary], X[boundary:]
	Ytrain, Ytest = Y[:boundary], Y[boundary:]
	return Xtrain, Xtest, Ytrain, Ytest

def visualize(label):
	"""Display a word cloud of all messages whose label equals ``label``.

	Reads the module-level ``df`` frame: rows whose ``'labels'`` column equals
	``label`` are selected, each ``'data'`` entry is lower-cased and followed
	by a space, and the concatenated text is rendered as a 600x400 word cloud
	shown with matplotlib (axes hidden).

	Args:
		label: Value to match against the ``'labels'`` column of ``df``.
	"""
	selected = df[df['labels'] == label]['data']
	# Build the corpus in one pass; every message keeps its trailing space.
	corpus = ''.join(text.lower() + ' ' for text in selected)

	cloud = WordCloud(width=600, height=400).generate(corpus)
	plt.imshow(cloud)
	plt.axis('off')
	plt.show()


# Load the SMS dataset, decoding the CSV as ISO-8859-1.
df = pd.read_csv('./files/sms_spam.csv', encoding='ISO-8859-1')
# Discard the three unnamed columns so only label and message text remain
# (presumably empty filler columns in the CSV -- verify against the file).
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.columns = ['labels', 'data']
# Binary target: ham -> 0, spam -> 1.
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
Y = df['b_labels'].values

# Bag-of-words features: one column per token, values are token counts.
count_vectorizer = CountVectorizer(decode_error='ignore')
X = count_vectorizer.fit_transform(df['data'])
# NOTE(review): the splitter in this file is defined as `train_testgit_split`
# (the name looks like a typo for `train_test_split`). Call it by its defined
# name; `train_test_split` is neither defined nor imported here.
Xtrain, Xtest, Ytrain, Ytest = train_testgit_split(X, Y, test_size=0.33)

model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print('Train score is', model.score(Xtrain, Ytrain))
print('Test score is', model.score(Xtest, Ytest))

visualize('spam')
visualize('ham')

# Predict over the full dataset (train and test rows alike) to inspect errors.
df['predictions'] = model.predict(X)

# Spam the model labelled as ham (false negatives).
sneaky_spam = df[(df['b_labels'] == 1) & (df['predictions'] == 0)]['data']
for msg in sneaky_spam:
	print(msg)

print('\n\n')
# Ham the model labelled as spam (false positives). Each comparison needs its
# own parentheses: `&` binds tighter than `==`, so without them the expression
# compares the result of `mask & predictions` against 1 instead of combining
# two boolean masks.
not_actually_spam = df[(df['b_labels'] == 0) & (df['predictions'] == 1)]['data']
for msg in not_actually_spam:
	print(msg)