"""Train a Multinomial Naive Bayes SMS spam classifier and inspect its errors.

The script loads ``./files/sms_spam.csv``, turns each message into a
bag-of-words count vector, fits the classifier on the leading part of the
data, prints train/test accuracy, shows a word cloud per class, and finally
prints the messages the model misclassifies.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud


def train_test_split(X, Y, test_size):
    """Split ``X`` and ``Y`` into leading train rows and trailing test rows.

    The split is positional (no shuffling): the last
    ``int(test_size * X.shape[0])`` rows become the test set.

    Args:
        X: Feature matrix supporting ``.shape`` and row slicing.
        Y: Label sequence aligned with the rows of ``X``.
        test_size: Fraction of rows, in ``[0, 1]``, to hold out for testing.

    Returns:
        Tuple ``(Xtrain, Xtest, Ytrain, Ytest)``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f'test_size must be within [0, 1], got {test_size!r}')

    n_rows = X.shape[0]
    n_train = n_rows - int(test_size * n_rows)

    # Slice on the positive train count rather than a negative test count:
    # ``X[:-0]`` is ``X[:0]``, which would silently produce an empty training
    # set whenever the test count rounds down to zero.
    return X[:n_train], X[n_train:], Y[:n_train], Y[n_train:]


# Original (misspelled) name, kept so any existing caller keeps working.
train_testgit_split = train_test_split


def visualize(label):
    """Show a word cloud of all messages whose ``labels`` value is ``label``.

    Reads the module-level ``df`` DataFrame; messages are lower-cased before
    being merged into the text passed to ``WordCloud``.

    Args:
        label: Value to match against the ``labels`` column.
    """
    messages = df.loc[df['labels'] == label, 'data']
    # A single join avoids rebuilding the string on every message.
    words = ' '.join(messages.str.lower())
    word_cloud = WordCloud(width=600, height=400).generate(words)
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()


df = pd.read_csv('./files/sms_spam.csv', encoding='ISO-8859-1')
# Remove the three unnamed columns, leaving the label and message columns.
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.columns = ['labels', 'data']

# Binary target: ham -> 0, spam -> 1.
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
Y = df['b_labels'].values

count_vectorizer = CountVectorizer(decode_error='ignore')
X = count_vectorizer.fit_transform(df['data'])

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33)

model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print('Train score is', model.score(Xtrain, Ytrain))
print('Test score is', model.score(Xtest, Ytest))

visualize('spam')
visualize('ham')

df['predictions'] = model.predict(X)

# Spam the model let through as ham (false negatives).
sneaky_spam = df[(df['b_labels'] == 1) & (df['predictions'] == 0)]['data']
for msg in sneaky_spam:
    print(msg)

print('\n\n')

# Ham the model flagged as spam (false positives). Both comparisons are
# parenthesised because ``&`` binds tighter than ``==``.
not_actually_spam = df[(df['b_labels'] == 0) & (df['predictions'] == 1)]['data']
for msg in not_actually_spam:
    print(msg)