Commit 675c29bb by Paktalin

Edited sms spam detector

parent 050c8495
Showing with 55 additions and 0 deletions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud
def train_test_split(X, Y, test_size):
    """Split X and Y into leading training rows and trailing test rows.

    No shuffling is performed: the last ``int(test_size * X.shape[0])`` rows
    become the test set and every row before them becomes the training set.

    Args:
        X: Row-sliceable features exposing ``shape[0]`` (e.g. a NumPy array
            or a SciPy sparse matrix).
        Y: Row-sliceable labels, cut at the same positions as ``X``.
        test_size: Fraction of the rows to hold out for testing.

    Returns:
        The tuple ``(Xtrain, Xtest, Ytrain, Ytest)``.
    """
    n_rows = X.shape[0]
    # Clamp so out-of-range fractions degrade to "no test rows" / "all test
    # rows" instead of producing surprising slices.
    n_test = min(max(int(test_size * n_rows), 0), n_rows)
    # Slice on an explicit boundary index rather than a negative offset:
    # with zero test rows, X[:-0] is empty and X[-0:] is the whole array,
    # which would silently swap the training and test sets.
    split = n_rows - n_test
    return X[:split], X[split:], Y[:split], Y[split:]
def visualize(label):
    """Show a word cloud of all messages whose ``labels`` value is ``label``.

    Reads the module-level ``df`` DataFrame, lower-cases every entry of its
    ``data`` column for the matching rows, and renders the combined text
    with WordCloud in a matplotlib window.

    Args:
        label: Value compared against the ``labels`` column of ``df``.
    """
    matching_rows = df[df['labels'] == label]
    # Each message is followed by a single space so words from adjacent
    # messages do not run together.
    text = ''.join(f'{message.lower()} ' for message in matching_rows['data'])

    cloud = WordCloud(width=600, height=400)
    image = cloud.generate(text)

    plt.imshow(image)
    plt.axis('off')
    plt.show()
# Load the SMS data set and keep only the label and message columns.
df = pd.read_csv('./files/sms_spam.csv', encoding='ISO-8859-1')
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.columns = ['labels', 'data']

# Binary target: 0 for 'ham', 1 for 'spam'.
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
Y = df['b_labels'].values

# Bag-of-words features: one token-count column per vocabulary word.
count_vectorizer = CountVectorizer(decode_error='ignore')
X = count_vectorizer.fit_transform(df['data'])

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33)

model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print('Train score is', model.score(Xtrain, Ytrain))
print('Test score is', model.score(Xtest, Ytest))

visualize('spam')
visualize('ham')

# Predict over the full data set (training rows included) so that the
# misclassified messages can be listed.
df['predictions'] = model.predict(X)

# Spam that the model labelled as ham.
sneaky_spam = df[(df['b_labels'] == 1) & (df['predictions'] == 0)]['data']
for msg in sneaky_spam:
    print(msg)

print('\n\n')

# Ham that the model labelled as spam. Both comparisons must be
# parenthesised: `&` binds tighter than `==`, so without the parentheses the
# mask is evaluated as `((b_labels == 0) & predictions) == 1`.
not_actually_spam = df[(df['b_labels'] == 0) & (df['predictions'] == 1)]['data']
for msg in not_actually_spam:
    print(msg)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment