# install and import necessary libraries.
!pip install contractions
import re, string, unicodedata # Import Regex, string and unicodedata.
import contractions # Import contractions library.
from bs4 import BeautifulSoup # Import BeautifulSoup.
import numpy as np # Import numpy.
import pandas as pd # Import pandas.
import nltk # Import Natural Language Tool-Kit.
nltk.download('stopwords') # Download Stopwords.
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords # Import stopwords.
from nltk.tokenize import word_tokenize, sent_tokenize # Import Tokenizer.
from nltk.stem.wordnet import WordNetLemmatizer # Import Lemmatizer.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Loading data into pandas dataframe
# Reads the Tweets.csv dataset from the working directory; expected to contain
# at least the 'text' and 'airline_sentiment' columns used below.
data = pd.read_csv("Tweets.csv")
data.shape
data.head()
# Task:
# a. Drop all other columns except "text" and "airline_sentiment".
# b. Check the shape of the data.
# c. Print the first 5 rows of the data.
# Keep only the tweet text and its sentiment label.
data = data.loc[:, ['text','airline_sentiment']]
data.shape
# Show full column contents (tweets are long) instead of pandas' truncation.
pd.set_option('display.max_colwidth', None)
data.head(5)
# Class distribution: row counts per sentiment (negative / neutral / positive).
df = data.groupby('airline_sentiment').count()
print(df)
# Pre-processing steps:
# a. HTML tag removal.
# b. Tokenization.
# c. Remove the numbers.
# d. Removal of special characters and punctuation.
# e. Conversion to lowercase.
# f. Lemmatize or stem.
# g. Join the words in the list to convert back to a text string in the dataframe
#    (so that each row contains the data in text format).
# h. Print the first 5 rows of data after pre-processing.
#remove html tags
def strip_html(text):
    """Return *text* with any HTML markup removed, keeping only visible text."""
    return BeautifulSoup(text, "html.parser").get_text()
# Strip HTML markup from every tweet (the lambda wrapper is unnecessary).
data['text'] = data['text'].apply(strip_html)
data.head()
#remove contractions
def replace_contractions(text):
    """Expand contractions (e.g. "can't" -> "cannot") in a string of text."""
    expanded = contractions.fix(text)
    return expanded
# Expand contractions in every tweet.
data['text'] = data['text'].apply(replace_contractions)
data.head()
#remove numbers
def remove_numbers(text):
    """Delete every run of digits from the given string."""
    return re.sub(r'\d+', '', text)
# Drop digits from every tweet.
data['text'] = data['text'].apply(remove_numbers)
data.head()
#Tokenization
# Tokenize each tweet; applying word_tokenize to the Series directly gives
# the same result as the row-wise DataFrame.apply.
data['text'] = data['text'].apply(word_tokenize)
data.head()
#remove special characters and punctuations
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    # Strip every non-word, non-space character from each token, then drop
    # tokens that became empty (e.g. a token that was pure punctuation).
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']
# Remove punctuation tokens / characters from every tokenized tweet.
data['text'] = data['text'].apply(remove_punctuation)
data.head()
#Conversion to lowercase.
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    return [token.lower() for token in words]
# Lowercase every token of every tweet.
data['text'] = data['text'].apply(to_lowercase)
data.head()
# Build the English stop-word set once.
# FIX: the original rebound the name `stopwords`, shadowing the imported
# nltk.corpus.stopwords module; use a distinct name, and a set for O(1)
# membership tests instead of a list scan per token.
stop_words = set(stopwords.words('english'))
#removal of stopwords
def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Parameters
    ----------
    words : list of str
        Tokenized (already lowercased) words.

    Returns
    -------
    list of str
        Words with stop words removed, original order preserved.
    """
    return [word for word in words if word not in stop_words]
data['text'] = data['text'].apply(remove_stopwords)
data.head()
#Lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize_list(words):
    """Lemmatize each token as a verb (pos='v') using WordNet.

    Parameters
    ----------
    words : list of str
        Tokenized words.

    Returns
    -------
    list of str
        Lemmatized tokens, order preserved.
    """
    # Comprehension replaces the manual append loop (same output).
    return [lemmatizer.lemmatize(word, pos='v') for word in words]
#not used in this example
def stem_words(words):
    """Stem words in list of tokenized words.

    FIX: ``LancasterStemmer`` was never imported anywhere in the file, so
    calling this function raised NameError; import it locally since it is
    only needed here.
    """
    from nltk.stem import LancasterStemmer
    stemmer = LancasterStemmer()
    # Stem each token, preserving order.
    return [stemmer.stem(word) for word in words]
# Lemmatize every tokenized tweet.
data['text'] = data['text'].apply(lemmatize_list)
data.head()
#Join the words in the list to convert back to text string in the dataframe. (So that each row contains the data in text format.)
def join_words(words):
    """Concatenate a list of tokens into one space-separated string."""
    joined = ' '.join(words)
    return joined
# Convert each token list back to a plain text string.
data['text'] = data['text'].apply(join_words)
data.head()
# Vectorization:
# a. Use CountVectorizer.
# b. Use TfidfVectorizer.
#CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary over all tweets, then encode each tweet as a
# vector of raw token counts (bag of words).
count_vectorizer = CountVectorizer()
count_vectorizer.fit(data['text'])
count_vector = count_vectorizer.transform(data['text'])
print(count_vector.shape)
# Dense array used later for model training.
count_vector_array = count_vector.toarray()
#Tf-Idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF transform, inspect the learned inverse document
# frequencies, then encode every tweet.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(data['text'])
print(tfidf_vectorizer.idf_)
tfidf_vector = tfidf_vectorizer.transform(data['text'])
print(tfidf_vector.shape)
# Dense array used later for model training.
tfidf_vector_array = tfidf_vector.toarray()
# Split data into training and testing set.
from sklearn.model_selection import train_test_split
# 70/30 split of the bag-of-words features; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(count_vector_array,data.airline_sentiment, test_size=0.3, random_state=42)
# Using Random Forest to build model for the classification of reviews.
# Also calculating the cross validation score.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
forest = RandomForestClassifier(n_estimators=10, n_jobs=4)
forest = forest.fit(X_train, y_train)
print(forest)
# Mean of 10-fold cross-validation accuracy.
# NOTE(review): cross-validating on the FULL feature matrix (train + test)
# leaks held-out rows into this estimate — consider CV on X_train only.
print(np.mean(cross_val_score(forest, count_vector_array, data.airline_sentiment, cv=10)))
# Predict the result for test data using the model built above.
result = forest.predict(X_test)
conf_mat = confusion_matrix(y_test, result)
print(conf_mat)
# Confusion matrix as a labelled DataFrame (rows = actual, columns = predicted;
# sklearn orders labels alphabetically: negative, neutral, positive).
df_cm = pd.DataFrame(conf_mat, index = [i for i in ['negative','neutral', 'positive']],
columns = [i for i in ['negative', 'neutral', 'positive']])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, fmt='g')
# Compare DecisionTree vs RandomForest on the bag-of-words features.
cls = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=10)
]
cls_name = []
accuracy = []
for cl in cls:
    model = cl.fit(X_train, y_train)
    lbl_pred = model.predict(X_test)
    # accuracy_score is symmetric, but pass (y_true, y_pred) per sklearn convention.
    a = round(100 * accuracy_score(y_test, lbl_pred), 2)
    accuracy.append(a)
    name = cl.__class__.__name__
    cls_name.append(name)
    # enumerate-style manual counter removed; the name is available directly.
    print("{} Accuracy Score : {}%".format(name, a))
    # BUG FIX: classification_report expects (y_true, y_pred); the original
    # passed them swapped, which mislabels per-class precision and recall.
    print(classification_report(y_test, lbl_pred))
# Split data into training and testing set.
from sklearn.model_selection import train_test_split
# Same 70/30 split and seed as above, now on the TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(tfidf_vector_array,data.airline_sentiment, test_size=0.3, random_state=42)
# Using Random Forest to build model for the classification of reviews.
# Also calculating the cross validation score.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
forest = RandomForestClassifier(n_estimators=10, n_jobs=4)
forest = forest.fit(X_train, y_train)
print(forest)
# Mean of 10-fold cross-validation accuracy.
# NOTE(review): cross-validating on the FULL feature matrix (train + test)
# leaks held-out rows into this estimate — consider CV on X_train only.
print(np.mean(cross_val_score(forest, tfidf_vector_array, data.airline_sentiment, cv=10)))
# Predict the result for test data using the model built above.
result = forest.predict(X_test)
conf_mat = confusion_matrix(y_test, result)
print(conf_mat)
# Confusion matrix as a labelled DataFrame (rows = actual, columns = predicted;
# sklearn orders labels alphabetically: negative, neutral, positive).
df_cm = pd.DataFrame(conf_mat, index = [i for i in ['negative','neutral', 'positive']],
columns = [i for i in ['negative', 'neutral', 'positive']])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, fmt='g')
# Compare DecisionTree vs RandomForest on the TF-IDF features.
cls = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=10)
]
cls_name = []
accuracy = []
for cl in cls:
    model = cl.fit(X_train, y_train)
    lbl_pred = model.predict(X_test)
    # accuracy_score is symmetric, but pass (y_true, y_pred) per sklearn convention.
    a = round(100 * accuracy_score(y_test, lbl_pred), 2)
    accuracy.append(a)
    name = cl.__class__.__name__
    cls_name.append(name)
    # enumerate-style manual counter removed; the name is available directly.
    print("{} Accuracy Score : {}%".format(name, a))
    # BUG FIX: classification_report expects (y_true, y_pred); the original
    # passed them swapped, which mislabels per-class precision and recall.
    print(classification_report(y_test, lbl_pred))
Here the attributes we take for processing are the airline_sentiment and text columns, where text contains the tweets. The airline_sentiment column is used as the label and text is used to generate the document matrix. The tweets' sentiments are classified into three categories: negative, neutral and positive. We apply HTML tag removal first, followed by expansion of contractions, removal of numbers, tokenization, removal of punctuation and special characters, conversion to lowercase, and removal of stopwords. After all this text processing we perform lemmatization using the WordNetLemmatizer. We then join the words in the tokenized array before generating the tabular representation using CountVectorizer and TF-IDF vectorization.
The data is split into training and test sets, and a RandomForestClassifier model is built with 10 estimators. This is done for both term matrices, developed using the TF-IDF vectorizer and the CountVectorizer. Models built using RandomForest and DecisionTree are also compared. In all cases we get an accuracy between 70-80% in predicting the sentiment of a tweet. The corresponding heatmaps are also generated.