PROJECT : Spam SMS Data Analytics#

WORK IN PROGRESS

[2]:

import pandas as pd
import numpy as np
import csv

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.multiclass import OneVsRestClassifier


from sklearn.metrics import classification_report

[3]:

# Tab-separated file; column names are supplied here, and QUOTE_NONE makes
# pandas treat quote characters inside messages as ordinary text.
df = pd.read_csv(
    "./SMSSpamCollection.csv",
    sep='\t',
    quoting=csv.QUOTE_NONE,
    names=["label", "message"],
)

df.head()

[3]:

	label	message
0	ham	Go until jurong point, crazy.. Available only ...
1	ham	Ok lar... Joking wif u oni...
2	spam	Free entry in 2 a wkly comp to win FA Cup fina...
3	ham	U dun say so early hor... U c already then say...
4	ham	Nah I don't think he goes to usf, he lives aro...

[6]:

# Class balance: number of messages per label.
df['label'].value_counts()

[6]:

ham     4827
spam     747
Name: label, dtype: int64

Loading NLTK (Natural Language Toolkit)#

[ ]:

import nltk

# Download only the resources this notebook uses rather than the complete
# 'all' collection (every corpus and model NLTK distributes):
#   - punkt / punkt_tab : sentence/word tokenizer data used by word_tokenize
#                         (which of the two is read depends on the NLTK version)
#   - wordnet / omw-1.4 : data behind WordNetLemmatizer
#   - stopwords         : the English stop-word list
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource, quiet=True)

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

Tokenization#

[8]:

def splitIntoTokens(text):
    """Lower-case ``text`` and return the tokens ``word_tokenize`` produces for it."""
    return word_tokenize(text.lower())

[ ]:

# Add a column holding the splitIntoTokens() result for each raw message.
df['tokenized_message'] = df['message'].apply(splitIntoTokens)

Lemmatization (convert a word into its base form)#

[9]:

def getLemmas(tokens):
    """Return a list with ``WordNetLemmatizer.lemmatize`` applied to each token.

    The output has the same length and order as ``tokens``.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word) for word in tokens]

[ ]:


# Add a column holding the getLemmas() result for each token list.
df['lemmatized_message'] = df['tokenized_message'].apply(getLemmas)

Removing Stop Words#

[10]:

# English stop words as a set, so membership checks are hash lookups.
stopWords = set(stopwords.words('english'))
def removeStopWords(lemmas):
    """Join the entries of ``lemmas`` that are not in ``stopWords``.

    Returns a single space-separated string (not a list), in the original order.
    """
    kept = (lemma for lemma in lemmas if lemma not in stopWords)
    return ' '.join(kept)

[ ]:


# Add a column holding the removeStopWords() result (one string per message).
df['filtered_message'] = df['lemmatized_message'].apply(removeStopWords)

TFIDF Matrix#

A Term Document Matrix (TDM) records how often each term occurs in each document of a collection.
A Term Frequency–Inverse Document Frequency (TF-IDF) matrix additionally weights each term by its Inverse Document Frequency (IDF).
IDF reduces the weight of terms that occur in many documents and increases the weight of rare terms.

[11]:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidfVectorizer = TfidfVectorizer(
    # Use single words and two-word sequences as features.
    ngram_range=(1, 2),
    # Integer min_df is an absolute document count: keep any term that occurs
    # in at least one document. This states the threshold directly instead of
    # deriving it as the proportion 1/len(df), which tied the vectorizer to
    # the size of `df`.
    min_df=1,
    # Float max_df is a proportion: ignore terms found in more than 70% of
    # the documents.
    max_df=0.7,
)

[12]:



# Learn the vocabulary and IDF weights from every row of df['filtered_message']
# (the whole corpus — this runs before any train/test split is made).
tfidfModel = tfidfVectorizer.fit(df['filtered_message'])

[13]:

# Vectorise every message, then expand the sparse TF-IDF matrix into a dense array.
tfidf_matrix = tfidfModel.transform(df['filtered_message'])
X = tfidf_matrix.toarray()
# Target labels as a NumPy array aligned row-for-row with X.
y = df['label'].values

print(X.shape, y.shape)

(5574, 40373) (5574,)

Stratified Shuffle Sampling#

[87]:

# A single stratified split with 30% of the rows held out for testing;
# the fixed random_state makes the split repeatable.
data_gen = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.3,
    random_state=7,
)

Models Pool#

[ ]:

# Seed passed to the randomised estimators below so that repeated runs of this
# cell produce the same scores.
RANDOM_STATE = 7

classifiers = [
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    SGDClassifier(loss='modified_huber', shuffle=True, random_state=RANDOM_STATE),
    SVC(kernel="linear", C=0.025),
    KNeighborsClassifier(),
    OneVsRestClassifier(LinearSVC(random_state=RANDOM_STATE)),
    RandomForestClassifier(max_depth=5, n_estimators=100, max_features=10,
                           random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
]

# The splitter yields train/test index arrays; compute them once and reuse
# them so every classifier is trained and scored on exactly the same rows.
splits = list(data_gen.split(X, y))

# NOTE(review): X comes from a TF-IDF vectorizer that was fitted on the whole
# corpus, so vocabulary/IDF statistics from the test rows are already present
# in X_train. The scores below are therefore likely optimistic; fitting the
# vectorizer on the training messages only would remove this — TODO confirm.

# One dict per (classifier, split): the spam-class precision/recall/f1/support
# plus the classifier's class name and the overall accuracy.
models_params = []
for model in classifiers:
    for train_index, test_index in splits:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Copy the spam-class entry instead of mutating the report in place.
        model_report = dict(
            report['spam'],
            model=type(model).__name__,
            accuracy=report['accuracy'],
        )

        models_params.append(model_report)

[96]:

# One row per entry of models_params (the per-classifier metric dicts).
report_df = pd.DataFrame(models_params)

report_df

[96]:

	precision	recall	f1-score	support	model	accuracy
0	0.880734	0.857143	0.868778	224	DecisionTreeClassifier	0.965332
1	0.658863	0.879464	0.753346	224	GaussianNB	0.922893
2	1.000000	0.816964	0.899263	224	SGDClassifier	0.975493
3	0.000000	0.000000	0.000000	224	SVC	0.866109
4	1.000000	0.160714	0.276923	224	KNeighborsClassifier	0.887627
5	0.989130	0.812500	0.892157	224	OneVsRestClassifier	0.973700
6	0.000000	0.000000	0.000000	224	RandomForestClassifier	0.866109
7	0.943590	0.821429	0.878282	224	AdaBoostClassifier	0.969516

[100]:

# Remove the 'support' column (a row count, not a score) before plotting.
# errors='ignore' makes re-running this cell a no-op instead of a KeyError.
report_df = report_df.drop(columns='support', errors='ignore')

[101]:

# Grouped bars: one group per classifier, one bar per numeric column of report_df.
ax = report_df.plot.bar(x='model')
# Title and axis labels let the figure stand on its own; the trailing
# semicolon keeps the notebook from echoing the return value.
ax.set(title='Spam-class metrics and overall accuracy by classifier',
       xlabel='Classifier', ylabel='Score');

[101]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f9f850c3b10>

../../_images/Concepts_SpamDetection_SpamDataAnalytics_25_1.png

PROJECT : Spam SMS Data Analytics

Contents

PROJECT : Spam SMS Data Analytics#

Loading NLTK (Natural Language Toolkit)#

Tokenization#

Lemmatization (convert a word into its base form)#

Removing Stop Words#

TFIDF Matrix#

Stratified Shuffle Sampling#

Models Pool#