Scrape a corpus of news articles from a set of web pages, pre-process the corpus, and evaluate the performance of automated classification of these articles in a supervised learning context.
from IPython.display import display
%matplotlib inline
from pandas.io.json import json_normalize
import urllib.request
import json
import time
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from wordcloud import WordCloud
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
The articles and their labels are collected in two steps:
class NewsArchive:
def __init__(self):
'''
To initialize News repository source URL
Create Required Data Folders to store articles and category information
'''
os.makedirs(os.getcwd() + "/data", exist_ok=True) # Create ./data for storing articles and categories csv
os.makedirs(os.getcwd() + "/data/articles", exist_ok=True) # Create ./data/articles for storing articles
self.__sourceUrl = "https://news-articles.herokuapp.com/archive/"
def request_html(self, page_name="index"):
'''
Parameters: page_name (url part of the page without ".html" and default as "index")
Return: HTML
--------
To request data from a URL through an HTTP request
'''
url = self.__sourceUrl + page_name+".html"
try:
response = urllib.request.urlopen(url) # Hit URL: url
raw_html = response.read().decode() # Decode response
return raw_html # return raw html
except Exception as h:
print("Cannot find data at given URL\nError:"+str(h))
return ""
from bs4 import BeautifulSoup
class ParseData:
def parse_index_page(self, html):
'''
Parameters: html content
Return: Dictionary mapping month names to their corresponding URLs
--------
Method to parse the index page HTML and extract the URLs of the monthly listing pages
'''
parsed_html = BeautifulSoup(html,"html.parser")
month_dict={}
div = parsed_html.body.find_all("div", {"class":"main"})
for ul in div:
for li in ul.find_all("li"):
for a in li.find_all('a'):
month_dict[a.string] = a['href'].replace(".html","")
return month_dict
def parse_month_pages(self, html, month):
'''
Parameters: html content, month name
Return: List of article dictionaries
--------
Method to parse a monthly listing page and extract each article's category, id, and name
'''
parsed_html = BeautifulSoup(html,"html.parser")
article_list=[]
div = parsed_html.body.find_all("div", {"class":"main"})
for table in div:
for tr in table.find_all("tr"):
article_dict={}
for td in tr.find_all('td', {"class":"category"}):
article_dict['category'] = td.string.replace(u'\xa0', u'')
for td in tr.find_all('td', {"class":"title"}):
for a in td.find_all('a'):
article_dict['id'] = a['href'].replace(".html", "")  # remove the ".html" suffix to get the article id
article_dict['name'] = a.string
article_list.append(article_dict)
return article_list
def parse_article(self,html):
'''
Parameters: html content
Return: Parsed article heading and paragraphs as a single string
--------
Method to parse an article page and extract its heading and body text
'''
parsed_html = BeautifulSoup(html,"html.parser")
article_data = " "
# Add heading to article data
for heading in parsed_html.body.find_all("h2"):
article_data = heading.getText() + article_data
# To get text from p tags
for data in parsed_html.body.find_all("p", attrs={'class': None}):
text = data.getText()
if len(text) != 0:
article_data = article_data + text
return article_data
Creating objects of the classes so that their functions for retrieving, storing, and parsing the data can be applied
# Creating objects of classes to use them further
newsArchive = NewsArchive()
parse = ParseData()
# First Level Parsing:- Index Page
indexHTML = newsArchive.request_html()
# Second Level Parsing:- Monthly Pages
month_data = parse.parse_index_page(indexHTML)
# Creating categories-articles dataframe
df = pd.DataFrame([])
for key, value in month_data.items():
html = newsArchive.request_html(value)
articles = parse.parse_month_pages(html, key)  # parsed article records for this month
df = pd.concat([df, pd.DataFrame(articles)])   # build rows directly from the list of dicts (DataFrame.append is deprecated)
df.reset_index(drop=True, inplace=True)
display(df[0:5])
Check how many records in category dataframe are null and remove those records
# Find if any Null values in the records
print(df.isnull().any())
# Check number of records that are null in the dataset
print("\nTotal Null Valued Records: " + str(df.isnull().T.any().T.sum()))
print("\nNumber of records with N/A or NaN Values:", len(df))
df = df.dropna()
print("\nNumber of records with clean values:", len(df))
df.reset_index(drop=True, inplace=True)
display(df[-5:])
Retrieve all web pages corresponding to these article URLs. From the web pages, extract the main body text containing the content of each news article. Save the body of each article as a plain-text file.
def store_articles(row):
'''
Parameters: A dataframe row with structure (category, id, name)
--------
Save the article in /data/articles/ folder (using the id from dataframe row)
'''
html = newsArchive.request_html(row['id'])
article = parse.parse_article(html)
# Write article to file
# exception handled if file locked by some other operation or could not write to file
try:
filename = os.getcwd() + "/data/articles/" + row['id'] + ".txt"
# open the file in write mode and write the article text
with open(filename, "w", encoding='utf-8') as f:
    f.write(article)
except Exception as h:
print("Cannot write to file\n" + row['id'] +" \nError:"+str(h))
_ = df.apply(store_articles, axis=1)
files = os.listdir(os.getcwd() + "/data/articles/")
print("Samples Files in the directory ./data/articles/")
display(files[:5])
print("Total Files saved:", len(files))
# Assign numerical values to categories and save them as the target corresponding to every article
category_labels = pd.Series(df.category)
target, labels = pd.factorize(category_labels)
df['target'] = target
display(df[0:5])
# Factored List will be used as target class values
print(target)
# Labels Indexes will be used later to decipher the target name
print(labels)
# Save Cleaned data for articles names and their Categories
# exception handled if file locked by some other operation or could not write to file
try:
filename = os.getcwd() + "/data/category.csv"
df.to_csv(filename, sep=',', encoding='utf-8')
print("Write to CSV:\n" + filename)
except Exception as h:
print("Cannot write to file \nError:" + str(h))
Note: The categories-articles dataframe was cleaned earlier; this step simply confirms that only valid records are used for processing the articles.
# Check number of records that are null in the dataset
print("\nTotal Null Valued Records: " + str(df.isnull().T.any().T.sum()))
article_ids = df.id.tolist()
article_ids[0:5]
documents = []
for article_id in article_ids:
path = os.getcwd() + "/data/articles/" + article_id + ".txt"
rawtext = open(path, "r", encoding = 'utf-8').read()
documents.append(rawtext)
print("Total text documents loaded: %d" % len(documents))
print("Article ID- %s " % article_ids[0])
print("Target Class- %s " % target[0])
print("Target Class Label- %s " % labels[target[0]])
print("Raw Document- %s " % documents[0])
The raw documents are split into tokens. These tokens are cleaned and then used for analysis. The cleaning follows standard pre-processing steps:
Tokenize and normalize the data using a custom function
def tokenize_normalize(documents):
'''
Parameters: documents (List of documents)
Return: normalized tokens list for every document
--------
To convert the words in the documents to lower case and tokenize them using scikit-learn
'''
tokenize = CountVectorizer().build_tokenizer()
tokens = []
for article in documents:
tokens.append(tokenize(article.lower())) # Normalization(lowercase conversion) and Tokenization
return tokens
tokens = tokenize_normalize(documents)
# Sample Normalized Tokens and length of tokens for an article
print("Sample Normalized Tokens for Article with ID: '%s' \n" % article_ids[0])
print(tokens[0])
print("\nLength of Tokens: %d\n" % len(tokens[0]))
Scikit-learn's built-in English stop-word list (a frozenset) from the feature_extraction module
# List of Stopwords that will be removed
stopwords = text.ENGLISH_STOP_WORDS
print("Stopwords List:")
print(stopwords)
Filter the stop words shown above out of the token lists for all documents using a custom function
def filter_stopwords(tokens, stopwords):
'''
Parameters: tokens (List of tokens for every document), stopwords (List of stopwords)
Return: stopwords filtered tokens list for every document
--------
To update the tokens list by filtering stopwords
'''
for i in range (0, len(tokens)):
filtered_token = []
for token in tokens[i]:
if token not in stopwords:
filtered_token.append(token)
tokens[i] = filtered_token
return tokens
# Filter out stopwords from the list of tokens for every document
tokens = filter_stopwords(tokens, stopwords)
print("Created %d filtered token lists" % len(tokens))
# Sample Filtered Tokens and length of tokens for an article
print("\nSample Filtered Tokens for Article with ID: '%s' \n" % article_ids[0])
print(tokens[0])
print("\nLength of Tokens: %d\n" % len(tokens[0]))
Reduce tokens to their canonical form.
Note: We use WordNet lemmatization instead of the standard English stemming algorithm (the Porter stemmer) because lemmatization maps each token to a valid dictionary word instead of simply trimming its last characters.
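To see the difference concretely, here is a small hedged comparison (assuming the NLTK WordNet data has already been downloaded, e.g. via nltk.download('wordnet')) of what the Porter stemmer and the WordNet lemmatizer produce for a few sample tokens:
# Illustrative comparison of Porter stemming vs WordNet lemmatization on sample tokens
sample_tokens = ["studies", "flying", "companies"]
porter = nltk.stem.PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer()
print([porter.stem(w) for w in sample_tokens])     # stems such as 'studi', 'fli', 'compani' (not real words)
print([wnl.lemmatize(w) for w in sample_tokens])   # lemmas such as 'study', 'flying', 'company' (valid words)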
def lemmatizer(tokens):
'''
Parameters: tokens (List of tokens for every document)
Return: lemmatized tokens list for every document
--------
Function for lemmatizing to reduce a term to its canonical form
WordNetLemmatizer from nltk used since scikit-learn does not support it
'''
lemmatizer = nltk.stem.WordNetLemmatizer()
for i in range (0, len(tokens)):
lemma_tokens = []
for token in tokens[i]:
lemma_tokens.append(lemmatizer.lemmatize(token))
tokens[i] = lemma_tokens
return tokens
# lemmatizing tokens for every document
tokens = lemmatizer(tokens)
print("Created %d Lemmatized token lists" % len(tokens))
# Sample Filtered Tokens and length of tokens for an article
print("\nSample Lemmatized Tokens for Article with ID: '%s' \n" % article_ids[0])
print(tokens[0])
print("\nLength of Tokens: %d\n" % len(tokens[0]))
corpus = []
for articles in tokens:
corpus.append(" ".join(articles))
# Sample Clean Article and number of articles in corpus
print("Sample clean acrticle for Article with ID: '%s' \n" % article_ids[0])
print(corpus[0])
print("\nNumber of Articles in Corpus: %d\n" % len(corpus))
vectorizer = CountVectorizer(min_df = 3)
countVector = vectorizer.fit_transform(corpus)
print(countVector.shape)
print("Number of terms: %d" % len(vectorizer.vocabulary_))
print(list(vectorizer.vocabulary_.keys())[:40])
"and" in vectorizer.get_feature_names()
Instead of performing each step separately, scikit-learn can carry out the whole pipeline in a single step, with adjustable parameters. Alternatively, the TF-IDF matrix can be derived from the count vector generated earlier. The TfidfVectorizer call below directly creates the weighted term-document matrix.
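As a sketch of the alternative route just mentioned (assuming the countVector produced by the CountVectorizer cell above is still in scope), scikit-learn's TfidfTransformer can re-weight those raw counts into a TF-IDF matrix without re-tokenizing the documents:
from sklearn.feature_extraction.text import TfidfTransformer
# Derive TF-IDF weights from the existing count matrix
tfidf_transformer = TfidfTransformer()
tfidf_from_counts = tfidf_transformer.fit_transform(countVector)
print(tfidf_from_counts.shape)   # same dimensions as countVector, now holding TF-IDF weights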
Parameters Used:
def lemma_tokenizer(text):
'''
Function for lemmatizing to reduce a term to its canonical form
'''
# use the standard scikit-learn tokenizer first
standard_tokenizer = CountVectorizer().build_tokenizer()
tokens = standard_tokenizer(text)
# then use NLTK to perform lemmatisation on each token
lemmatizer = nltk.stem.WordNetLemmatizer()
lemma_tokens = []
for token in tokens:
lemma_tokens.append( lemmatizer.lemmatize(token) )
return lemma_tokens
tfidf = TfidfVectorizer(stop_words="english",min_df = 3,ngram_range=(1, 2),tokenizer=lemma_tokenizer)
X = tfidf.fit_transform(documents)
print(X.shape)
"and" in tfidf.get_feature_names()
vocab = tfidf.vocabulary_
print("Number of Terms: %d" % len(vocab))
terms = tfidf.get_feature_names()
print("Distinct Vocabulary Terms: %d" % len(terms))
document_term_matrix = pd.DataFrame(X.toarray(), columns = tfidf.get_feature_names(), index = article_ids).T
print("Sample Bag of Words Model (with Term Weights):")
document_term_matrix[-10:]
df2=document_term_matrix.T.sum().sort_values(ascending=False)
df2[:10]
category_id_df = df[['category', 'target']].drop_duplicates().sort_values('target')
category_to_id = dict(category_id_df.values)
category_to_id
# Using the scikit-learn chi2 function to compute chi-square scores per category and find the most correlated terms
class_labels = df.target
N = 2
Chi_df = X
for category, category_id in sorted(category_to_id.items()):
features_chi2 = chi2(Chi_df.toarray(), class_labels == category_id)
indices = np.argsort(features_chi2[0])
feature_names = np.array(tfidf.get_feature_names())[indices]
unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
print("Category: '%s'" % category)
print()
print("1. Most correlated unigrams:\n . {}".format('\n . '.join(unigrams[-N:])))
print()
print("2. Most correlated bigrams:\n . {}".format('\n . '.join(bigrams[-N:])))
print("-----------------------\n")
The idea is to concatenate all the documents belonging to one category and then create a word cloud for that category. The word clouds are shown later in the visualization section.
category_concated_documents = []
for i in range(0,len(labels)):
indexes = list(df.loc[df['category'] == labels[i]].index)
concat_doc = ""
for j in indexes:
concat_doc = concat_doc + " " + corpus[j]
category_concated_documents.append(concat_doc)
print("Number of words in Category "+ labels[i] +" are: "+ str(len(category_concated_documents[i])))
The articles belonging to different categories differ from each other, and it is also possible that one or more categories are highly related. To find the similarity between categories we can use cosine similarity.
*Cosine similarity*: The most common approach to measuring the similarity between two documents in a bag-of-words representation is to look at the cosine of the angle between their corresponding term vectors. The motivation is that vectors for documents containing similar terms point in roughly the same direction in the m-dimensional vector space.
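The underlying formula is cos(θ) = (A · B) / (‖A‖ ‖B‖). A minimal sketch on two toy term-frequency vectors (the values are illustrative only, not taken from our corpus):
# Cosine similarity of two toy term vectors, computed directly and via scikit-learn
a = np.array([[1, 2, 0, 3]])
b = np.array([[2, 1, 1, 3]])
manual = np.dot(a[0], b[0]) / (np.linalg.norm(a[0]) * np.linalg.norm(b[0]))
print(manual)                           # ≈ 0.897
print(cosine_similarity(a, b)[0][0])    # same value using the scikit-learn helper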
category_vectorizer = TfidfVectorizer(stop_words="english",min_df = 3,ngram_range=(1, 2),tokenizer=lemma_tokenizer)
category_vector = category_vectorizer.fit_transform(category_concated_documents)
print(category_vector.shape)
cosine_matrix = pd.DataFrame([])
# Measure the cosine similarity between every pair of category vectors
print("Cosine Similarity Scores:")
for i in range(0,len(labels)):
temp_scores = []
for row in range(0,len(labels)):
cos = cosine_similarity( category_vector[i], category_vector[row] )
temp_scores.append(cos[0][0])
cosine_matrix[labels[i]] = temp_scores
cosine_matrix[""] = labels
cosine_matrix.set_index("", inplace=True)
display(cosine_matrix)
Classification Algorithms Used:
# Import Libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
X.shape
target
dataset_train, dataset_test, target_train, target_test = train_test_split(X, target, test_size=0.3)
print("Training set size is %d" % dataset_train.shape[0] )
print("Test set size is %d" % dataset_test.shape[0] )
Variables for Measuring Running Times of Algorithms:
model_names = ["MultinomialNB", "LinearSVC"]
models = []
train_times = []
tuned_models = []
tuned_train_times = []
startTime = time.time()
model = MultinomialNB()
model.fit(dataset_train, target_train)
# Elapsed time to train the Multinomial Naive Bayes model
elapsed_time = time.time() - startTime
train_times.append(elapsed_time)
models.append(model)
print(models[0])
print("Time Required to Train the model: " + str(train_times[0]))
startTime = time.time()
model = LinearSVC()
model.fit(dataset_train, target_train)
# Elapsed time to train the LinearSVC (SVM with linear kernel) model
elapsed_time = time.time() - startTime
train_times.append(elapsed_time)
models.append(model)
print(models[1])
print("Time Required to Train the model: " + str(train_times[1]))
The models above are at risk of overfitting because their hyperparameters were simply left at default values. A k-fold cross-validation strategy, combined with parameter selection during training, helps ensure the model is not biased towards one particular split of the data set. We use GridSearchCV with 5-fold cross-validation for the two algorithms to build tuned models.
The smoothing prior α ≥ 0 accounts for features not present in the learning samples and prevents zero probabilities in further computations. Setting α = 1 is called Laplace smoothing, while α < 1 is called Lidstone smoothing.[6]
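A toy calculation of a smoothed term probability (the counts below are made-up numbers, purely to illustrate the effect of α):
# Lidstone-smoothed estimate of P(term | class): (count + alpha) / (total + alpha * vocabulary_size)
alpha = 0.1
count_term_in_class = 0          # the term never occurs in this class's training articles
total_terms_in_class = 1000
vocabulary_size = 5000
p = (count_term_in_class + alpha) / (total_terms_in_class + alpha * vocabulary_size)
print(p)                         # 0.1 / 1500 ≈ 6.7e-05, non-zero despite the zero count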
startTime = time.time()
model = MultinomialNB()
param_grid = [{'alpha':[1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}]
grid = GridSearchCV(model, param_grid, cv=5)
grid.fit(dataset_train, target_train)
# Elapsed time to train the tuned Multinomial Naive Bayes model (including the grid search)
elapsed_time = time.time() - startTime
tuned_train_times.append(elapsed_time)
tuned_models.append(grid)
print(tuned_models[0])
print("Time Required to Train the model: " + str(tuned_train_times[0]))
The Pipeline module is used here to chain feature selection and classification. The SVM regularization is controlled by the C parameter, which decides how much to avoid misclassifying each training example. "For large values of C, the optimization will choose a smaller-margin hyperplane if that hyperplane does a better job of getting all the training points classified correctly. Conversely, a very small value of C will cause the optimizer to look for a larger-margin separating hyperplane, even if that hyperplane misclassifies more points. For very tiny values of C, you should get misclassified examples, often even if your training data is linearly separable."[11]
startTime = time.time()
model = Pipeline([
('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
tol=1e-3))),
('classification', LinearSVC(penalty="l2"))])
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
    {'classification__C': C_OPTIONS}   # grid of C values for the final LinearSVC classifier
]
grid = GridSearchCV(model, param_grid, cv=5)
grid.fit(dataset_train, target_train)
# Elapsed time to train the tuned LinearSVC pipeline (including the grid search)
elapsed_time = time.time() - startTime
tuned_train_times.append(elapsed_time)
tuned_models.append(grid)
print(tuned_models[1])
print("Time Required to Train the model: " + str(tuned_train_times[1]))
The following measures will be used to evaluate the performance of the two classification models:
Using the unseen test data to evaluate the models' accuracy.
# Predicting values using the two algorithms
test_times = []
predictions = []
for i in range(0,len(model_names)):
startTime = time.time()
predictions.append(models[i].predict(dataset_test))
# Elapsed time to make predictions on the test set with this model
elapsed_time = time.time() - startTime
test_times.append(elapsed_time)
print("Time Required to test "+ model_names[i] +" :" + str(test_times[i]))
print("\nSample Predicted Classes Vs Actual Classes:")
print("Target \t\t",target_test[:15])
print("Predictions\t",predictions[i][:15])
print()
The overall accuracy score for the predictions is the fraction of correct predictions. The accuracy_score function returns a value between 0 (completely wrong) and 1 (all predictions correct):
#Accuracies for different classifiers
accuracy_scores = []
for m in range(0,len(model_names)):
accuracy_scores.append(accuracy_score(target_test, predictions[m]))
print("Accuracy of "+model_names[m]+" Classifier =\t%.4f" % accuracy_scores[m])
print()
# Predicting values using the two algorithms (trained with cross-validation and parameter tuning)
tuned_test_times = []
tuned_predictions = []
for i in range(0,len(model_names)):
startTime = time.time()
tuned_predictions.append(tuned_models[i].predict(dataset_test))
# Elapsed time to make predictions on the test set with this tuned model
elapsed_time = time.time() - startTime
tuned_test_times.append(elapsed_time)
print("Time Required to test "+ model_names[i] +" :" + str(tuned_test_times[i]))
print("\nSample Predicted Classes Vs Actual Classes:")
print("Target \t\t",target_test[:15])
print("Predictions\t",tuned_predictions[i][:15])
print()
#Accuracies for different classifiers
tuned_accuracy_scores = []
for m in range(0,len(model_names)):
tuned_accuracy_scores.append(accuracy_score(target_test, tuned_predictions[m]))
print("Accuracy of "+model_names[m]+" Classifier =\t%.4f" % tuned_accuracy_scores[m])
print()
As we can see, the tuned models perform better than the models with default parameters, since cross-validation is used while selecting the grid parameters. We will use the tuned models for further evaluation.
Manually comparing the target labels for the test data with the predictions can be misleading.
To compare the algorithms we need to determine, for each of the N classes, the extent to which the classifier made correct and incorrect predictions:
- articles predicted as '0' which are actually '0', predicted as '1' which are actually '1', and so on (correct predictions)
- articles predicted as '1' which are actually '0', predicted as '1' which are actually '2', and so on (incorrect predictions)
# Predictions made by different Classifiers
for m in range(0,len(model_names)):
print("Predictions by "+ model_names[m] +" :")
for i in range(0,len(labels)):
n = (tuned_predictions[m] == i).sum()
print( "Number of articles predicted as "+labels[i]+" : " + str(n) )
print()
# Actual Class Labels
print("Actual Class Labels:")
for i in range(0,len(labels)):
n = (target_test == i).sum()
print( "Number of articles predicted as "+labels[i]+" : " + str(n) )
A confusion matrix can be created from the results to show correct and incorrect predictions. For an N-class model (example: N = 3 classes), the values correspond to:
[ [C I I]
[I C I]
[I I C] ]
where C marks correct predictions (on the diagonal) and I marks incorrect ones. A perfect classifier with 100% accuracy would produce a purely diagonal matrix, meaning every label in the test data was predicted correctly.
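A quick sanity check of this idea on toy labels (hypothetical values, not our test set): when the predictions match the targets exactly, confusion_matrix returns a purely diagonal matrix.
# Perfect agreement between actual and predicted labels yields a diagonal confusion matrix
toy_actual    = [0, 1, 2, 0, 1, 2]
toy_predicted = [0, 1, 2, 0, 1, 2]
print(confusion_matrix(toy_actual, toy_predicted))
# [[2 0 0]
#  [0 2 0]
#  [0 0 2]]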
# class targets
class_targets = []
for i in range(0,len(labels)):
class_targets.append(i)
# Confusion Matrix for different Classifier
cm = []
for m in range(0,len(model_names)):
print("Confusion Matrix Dataframe for "+model_names[m]+":")
df_cm = pd.DataFrame(confusion_matrix(target_test, tuned_predictions[m],labels=class_targets), columns=list(labels))
df_cm.columns.name = 'Predicted Labels'
df_cm['Actual Labels'] = labels
df_cm.set_index('Actual Labels', inplace=True)
display(df_cm)
cm.append(df_cm)
print()
Measures from information retrieval can be used for machine learning evaluation.
Note: every class label has its own set of values for these measures.
F-Score (F1-score with β=1): There is often a trade-off between precision and recall. We can combine them into a single score using the F1 measure, the harmonic mean of precision and recall. The F1 measure reaches its best value at 1 and its worst at 0.
F1 = 2 × (precision × recall) / (precision + recall) (i.e. Fβ with β = 1)
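A toy arithmetic check of the formula (the precision and recall values are illustrative, not taken from our results):
# F1 as the harmonic mean of precision and recall
precision, recall = 0.80, 0.60
f1 = 2 * (precision * recall) / (precision + recall)
print(round(f1, 3))   # 0.686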
Although scikit-learn can compute precision, recall, and F1 separately, we use the single-line classification_report function to obtain all the measures at once.
# Function to convert text report to a dataframe
import re
from io import StringIO
def report_to_df(report):
    # collapse repeated spaces and rename the "avg / total" row so the report text parses as a space-separated table
    report = re.sub(r" +", " ", report).replace("avg / total", "avg/total").replace("\n ", "\n")
    report_df = pd.read_csv(StringIO("Class" + report), sep=' ')
    return report_df
classification_reports = []
temp = list(labels) + ["avg/total"]
for m in range(0,len(model_names)):
print("\nClassification Report for "+model_names[m]+" Classifier:")
#txt report to df
report = classification_report(target_test, tuned_predictions[m])
classification_reports.append(report_to_df(report))
classification_reports[m]["Class"] = temp
classification_reports[m].set_index("Class", inplace=True)
display(classification_reports[m])
print()
A problem with simply splitting a dataset randomly into two sets is that each random split may give different results, and a portion of the dataset is never used for training.
k-fold cross-validation helps to evaluate a classifier by addressing this issue.
The steps for finding accuracy using k-fold cross-validation are:
cross_validation_scores = []
cross_validation_accuracy = []
for m in range(0,len(model_names)):
acc_scores = cross_val_score(tuned_models[m], X, target, cv=5)
print(acc_scores)
# Saving cross-validation values
cross_validation_scores.append(acc_scores)
# Saving cross-validation accuracy as the mean of the fold accuracies
cross_validation_accuracy.append(acc_scores.mean())
print("Mean 5-fold Cross-Validation Accuracy of "+model_names[m]+" Classifier = %.4f" % cross_validation_accuracy[m])
print()
Plotting the count of all available categories
fig_categories = df.groupby('category').name.count().plot.bar(figsize=(6,5),fontsize=14, color='purple')
fig_categories.set_xlabel("Categories",fontsize=14)
fig_categories.set_ylabel("Articles Count",fontsize=14)
plt.suptitle("Articles Count by Category",fontsize=20)
os.makedirs("graphs", exist_ok=True)  # make sure the graphs folder exists before saving the figure
plt.savefig('graphs/category_count.png',dpi=100, bbox_inches='tight')
This figure shows the count of articles per category. It also illustrates a class imbalance: we do not have an equal number of documents for the sports, business, and technology classes.
Arranging the categories by the number of articles present (most to fewest):
Sports > Business > Technology
Plotting word clouds to visualize significant terms based on their frequency of occurrence. Because the corpus was already cleaned, it is used directly to create the word clouds, so no stop-word removal parameters are required.
# Create ./graphs for storing plots
os.makedirs(os.getcwd() + "/graphs", exist_ok=True)
# Word Cloud Category Wise to get most-significant terms
fig = plt.figure()
for i in range(0,len(labels)):
wordcloud = WordCloud(background_color='white',
width=6000,
height=3000
).generate(category_concated_documents[i])
plt.imshow(wordcloud)
plt.axis('off')
plt.title("Word Cloud for Category: '"+labels[i]+"'\n", fontsize=22)
plt.savefig("graphs/WordCloud_"+ labels[i] +".png",dpi=100, bbox_inches='tight')
plt.show()
The images above show the word clouds for the categories technology, business, and sports. The most significant terms can be recognized immediately from each word cloud.
Plotting cosine similarity in a heat map for comparing the similarity of the categories.
fig_cosine, ax = plt.subplots(figsize=(9,6))
fig_cosine.suptitle("Cosine Similarity Matrix", fontsize=20, verticalalignment='bottom')
sns.heatmap(cosine_matrix, annot=True, fmt="g", cmap=matplotlib.cm.Wistia)
fig_cosine.savefig('graphs/cosine_similarity.png', dpi=100, bbox_inches='tight')
plt.show(fig_cosine)
The plot shows the similarity between each pair of categories. The heat map shows that each category has a similarity score of 1 with itself, while different categories have lower scores.
To compare the performance of the classification algorithms used
# Converting accuracies to a dataframe
df_accuracy = pd.DataFrame(columns=["5-fold Cross-Validation Accuracy",
"Accuracy(Default Parameters)",
"Accuracy(Tuned Parameters)"])
df_accuracy["5-fold Cross-Validation Accuracy"] = cross_validation_accuracy
df_accuracy["Accuracy(Default Parameters)"] = accuracy_scores
df_accuracy["Accuracy(Tuned Parameters)"] = tuned_accuracy_scores
df_accuracy["Model"] = model_names
df_accuracy.set_index("Model", inplace=True)
display(df_accuracy)
# Plotting Accuracies for two models
axes = df_accuracy.plot.barh(figsize=(10,4))
axes.set_xlabel("Accuaccy",fontsize=14)
axes.set_ylabel("Model Name",fontsize=14)
axes.set_title("Accuracy of Models",fontsize=20)
axes.legend(loc='upper center', bbox_to_anchor=(0.5, 1), ncol=3)
plt.savefig('graphs/accuracy.png',dpi=100, bbox_inches='tight')
The graph illustrates a comparison between the overall accuracy of the two models and their cross-validation accuracy. This helps to understand how the models behave when evaluated on a single random split compared with k-fold cross-validation.
Compare the time taken to train and test the models
indices = np.arange(len(models))
training_time = np.array(train_times) / np.max(train_times)
testing_time = np.array(test_times) / np.max(test_times)
tuned_training_time = np.array(tuned_train_times) / np.max(tuned_train_times)
tuned_testing_time = np.array(tuned_test_times) / np.max(tuned_test_times)
fig_times, ax = plt.subplots(1,2)
fig_times.suptitle("Normalized Execution Times", fontsize=20, verticalalignment='bottom')
ax[0].figure.set_size_inches(15, 5)
ax[0].set_title("Models with Default parameters", fontsize=20)
ax[0].bar(indices + .3, training_time, .2, label="Training time")
ax[0].bar(indices + .6, testing_time, .2, label="Testing time")
ax[0].set_xticks(())
ax[0].legend(loc='upper center')
ax[1].figure.set_size_inches(15, 5)
ax[1].set_title("Models with Tuned parameters", fontsize=20)
ax[1].bar(indices + .3, tuned_training_time, .2, label="Training time")
ax[1].bar(indices + .6, tuned_testing_time, .2, label="Testing time")
ax[1].set_xticks(())
ax[1].legend(loc='upper center')
for i, c in zip(indices, model_names):
ax[0].text(i+0.2,-.1, c)
ax[1].text(i+0.2,-.1, c)
fig_times.savefig('graphs/execution_times.png', dpi=100, bbox_inches='tight')
plt.show(fig_times)
A comparison of the execution times of the different algorithms, contrasting training time with test time.
Confusion Matrices comparison for the different algorithms used
fig_heatMap, ax = plt.subplots(1,len(model_names))
fig_heatMap.suptitle("Visual Representation of Confusion Matrices", fontsize=20, verticalalignment='bottom')
for m in range(0,len(model_names)):
sns.heatmap(cm[m], annot=True, fmt="g", cmap=matplotlib.cm.Wistia, ax=ax[m])
ax[m].figure.set_size_inches(15, 5)
ax[m].set_title(model_names[m], fontsize=15)
fig_heatMap.savefig('graphs/confusion_matrix.png', dpi=100, bbox_inches='tight')
plt.show(fig_heatMap)
The image illustrates the confusion matrices for the two classification algorithms. A high concentration on the diagonal indicates an accurate classifier, since the diagonal cells correspond to correctly predicted articles.
def get_measures(measure):
'''
Function to get a dataframe for the requested measure for comparative study between different models
'''
cr = pd.DataFrame()
for m in range(0,len(model_names)):
cr = cr.append(classification_reports[m][measure], ignore_index=True)
cr["Model"] = model_names
cr.set_index("Model", inplace=True)
return cr.T
fig_measures, ax6 = plt.subplots(1,3)
get_measures("precision").plot.bar(ax=ax6[0], legend=False,figsize=(15,3), color=['y','g'])
get_measures("recall").plot.bar(ax=ax6[1], legend=False, color=['y','g'])
get_measures("f1-score").plot.bar(ax=ax6[2], legend=False, color=['y','g'])
ax6[0].set_title("Precision ",fontsize=20)
ax6[1].set_title("Recall",fontsize=20)
ax6[2].set_title("F1-Score",fontsize=20)
ax6[0].set_ylabel("Accuaccy",fontsize=14)
ax6[2].legend(loc='upper center', bbox_to_anchor=(0.45, 1.30), ncol=2, fontsize=12)
fig_measures.text(0.5, -0.25, 'Class Labels', fontsize=14)
fig_measures.text(0.40, 1.05, 'Evaluation Measures', fontsize=24)
plt.savefig('graphs/evaluation_measures.png',dpi=100, bbox_inches='tight')
plt.show(fig_measures)
The graph shows the different evaluation measures side by side for an easier comparison between the two models.
Plot 1: Corpus Summary - Category wise articles count
Plot 2: Word Cloud
Plot 3: Cosine Similarity
Plot 4: Comparison of Accuracies of the models
Plot 5: Comparison of Execution Times of the models
1. Models with Default Parameters
Training time: Multinomial Naive Bayes < LinearSVC (clearly)
LinearSVC required more time because its training involved two steps: feature selection followed by building the classification model, chained together with scikit-learn's Pipeline module. The Naive Bayes classifier, being a probabilistic model, is also inherently faster to train.
Testing time: Multinomial Naive Bayes ≈ LinearSVC
Both algorithms took a similar amount of time; neither is a clear winner.
2. Models with Parameter Tuning
Training time: Multinomial Naive Bayes < LinearSVC (clearly)
Both classification algorithms took much longer to train than their default-parameter counterparts, because parameter tuning involves cross-validation, which accounts for most of the time. Again, Naive Bayes was faster to train, being the less complex model with only one parameter to tune.
Testing time: Multinomial Naive Bayes < LinearSVC (clearly)
LinearSVC took considerably more time to classify documents than Multinomial Naive Bayes, which shows that Naive Bayes can predict faster than LinearSVC after parameter tuning.
Plot 6: Comparison of Confusion Matrices for the models
Plot 7: Sub-Plots Comparison of Evaluation Measures
- If less time is available for training, Naive Bayes can be used, since its probabilistic model scales easily and trains quickly.
- If better accuracy is required, Multinomial Naive Bayes can be used. For our dataset the difference is small enough to ignore, since the articles do not contain critical information such as medical reports. In general, though, SVM can perform very well for text classification because it accepts a kernel function ('linear' in our case).
- If a faster implementation is required, Naive Bayes can be chosen: tuning LinearSVC is difficult, whereas tuning Naive Bayes is very easy.
Naive Bayes Algorithm for Text Classification:
Advantages[7]
- Very Simple and easy to implement
- Need less training data
- If the NB conditional independence assumption holds, then it will converge quickly
- Highly scalable. It scales linearly with the number of predictors and data points (as seen, it can be trained quickly)
- Multi-class classification problems can also be solved using Naive Bayes (as in our example)
- Not sensitive to irrelevant features.
Disadvantages[8][9]
- Naive Bayes is based on the independence assumptions. The conditional independence assumption states that features are independent of each other given the class.
- We need to estimate the likelihood value by a frequentist approach for any possible value of a feature. This can result in probabilities going towards 0 or 1, which in turn leads to numerical instabilities and worse results. However, we addressed this by using Lidstone smoothing.
SVM (kernel='linear') for Text Classification:[10]
- Works well even with unstructured and semi-structured data such as text, images, and trees.
- Allows a kernel function to be passed in; an appropriate kernel function can help solve complex problems.
- It scales relatively well to high-dimensional data (as in our case, with a high number of features).
- The risk of overfitting is lower in SVM when using the simple 'linear' kernel for small datasets, or the universal Radial Basis Function (RBF) kernel for complex problems.
- Tuning the parameters/ kernel function can be cumbersome.
- Long training time is required for larger datasets.