# install and import necessary libraries.
!pip install contractions
import re, string, unicodedata # Import Regex, string and unicodedata.
import contractions # Import contractions library.
from bs4 import BeautifulSoup # Import BeautifulSoup.
import numpy as np # Import numpy.
import pandas as pd # Import pandas.
import nltk # Import Natural Language Tool-Kit.
nltk.download('stopwords') # Download Stopwords.
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords # Import stopwords.
from nltk.tokenize import word_tokenize, sent_tokenize # Import Tokenizer.
from nltk.stem.wordnet import WordNetLemmatizer # Import Lemmatizer.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Loading data into pandas dataframe
# Reads the Tweets.csv dataset from the working directory; expected to contain
# at least the 'text' and 'airline_sentiment' columns used below.
data = pd.read_csv("Tweets.csv")
data.shape
data.head()
# Task:
# a. Drop all other columns except "text" and "airline_sentiment".
# b. Check the shape of the data.
# c. Print the first 5 rows of the data.
# Keep only the tweet text and its sentiment label.
data = data.loc[:, ['text','airline_sentiment']]
data.shape
# Show full column contents (tweets are long) instead of pandas' truncation.
pd.set_option('display.max_colwidth', None)
data.head(5)
# Class distribution: row counts per sentiment (negative / neutral / positive).
df = data.groupby('airline_sentiment').count()
print(df)
# Pre-processing steps:
# a. HTML tag removal.
# b. Tokenization.
# c. Remove the numbers.
# d. Removal of special characters and punctuation.
# e. Conversion to lowercase.
# f. Lemmatize or stem.
# g. Join the words in the list to convert back to a text string in the dataframe
#    (so that each row contains the data in text format).
# h. Print the first 5 rows of data after pre-processing.
#remove html tags
def strip_html(text):
    """Return *text* with any HTML markup removed, keeping only visible text."""
    return BeautifulSoup(text, "html.parser").get_text()
# Strip HTML markup from every tweet (the lambda wrapper is unnecessary).
data['text'] = data['text'].apply(strip_html)
data.head()
#remove contractions
def replace_contractions(text):
    """Expand contractions (e.g. "can't" -> "cannot") in a string of text."""
    expanded = contractions.fix(text)
    return expanded
# Expand contractions in every tweet.
data['text'] = data['text'].apply(replace_contractions)
data.head()
#remove numbers
def remove_numbers(text):
    """Delete every run of digits from the given string."""
    return re.sub(r'\d+', '', text)
# Drop digits from every tweet.
data['text'] = data['text'].apply(remove_numbers)
data.head()
#Tokenization
# Tokenize each tweet; applying word_tokenize to the Series directly gives
# the same result as the row-wise DataFrame.apply.
data['text'] = data['text'].apply(word_tokenize)
data.head()
#remove special characters and punctuations
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    # Strip every non-word, non-space character from each token, then drop
    # tokens that became empty (e.g. a token that was pure punctuation).
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']
# Remove punctuation tokens / characters from every tokenized tweet.
data['text'] = data['text'].apply(remove_punctuation)
data.head()
#Conversion to lowercase.
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    return [token.lower() for token in words]
# Lowercase every token of every tweet.
data['text'] = data['text'].apply(to_lowercase)
data.head()
# Build the English stop-word set once.
# FIX: the original rebound the name `stopwords`, shadowing the imported
# nltk.corpus.stopwords module; use a distinct name, and a set for O(1)
# membership tests instead of a list scan per token.
stop_words = set(stopwords.words('english'))
#removal of stopwords
def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Parameters
    ----------
    words : list of str
        Tokenized (already lowercased) words.

    Returns
    -------
    list of str
        Words with stop words removed, original order preserved.
    """
    return [word for word in words if word not in stop_words]
data['text'] = data['text'].apply(remove_stopwords)
data.head()
#Lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize_list(words):
    """Lemmatize each token as a verb (pos='v') using WordNet.

    Parameters
    ----------
    words : list of str
        Tokenized words.

    Returns
    -------
    list of str
        Lemmatized tokens, order preserved.
    """
    # Comprehension replaces the manual append loop (same output).
    return [lemmatizer.lemmatize(word, pos='v') for word in words]
#not used in this example
def stem_words(words):
    """Stem words in list of tokenized words.

    FIX: ``LancasterStemmer`` was never imported anywhere in the file, so
    calling this function raised NameError; import it locally since it is
    only needed here.
    """
    from nltk.stem import LancasterStemmer
    stemmer = LancasterStemmer()
    # Stem each token, preserving order.
    return [stemmer.stem(word) for word in words]
# Lemmatize every tokenized tweet.
data['text'] = data['text'].apply(lemmatize_list)
data.head()
#Join the words in the list to convert back to text string in the dataframe. (So that each row contains the data in text format.)
def join_words(words):
    """Concatenate a list of tokens into one space-separated string."""
    joined = ' '.join(words)
    return joined
# Convert each token list back to a plain text string.
data['text'] = data['text'].apply(join_words)
data.head()
# Vectorization:
# a. Use CountVectorizer.
# b. Use TfidfVectorizer.
#CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary over all tweets, then encode each tweet as a
# vector of raw token counts (bag of words).
count_vectorizer = CountVectorizer()
count_vectorizer.fit(data['text'])
count_vector = count_vectorizer.transform(data['text'])
print(count_vector.shape)
# Dense array used later for model training.
count_vector_array = count_vector.toarray()
#Tf-Idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF transform, inspect the learned inverse document
# frequencies, then encode every tweet.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(data['text'])
print(tfidf_vectorizer.idf_)
tfidf_vector = tfidf_vectorizer.transform(data['text'])
print(tfidf_vector.shape)
# Dense array used later for model training.
tfidf_vector_array = tfidf_vector.toarray()
# Split data into training and testing set.
from sklearn.model_selection import train_test_split
# 70/30 split of the bag-of-words features; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(count_vector_array,data.airline_sentiment, test_size=0.3, random_state=42)
# Using Random Forest to build model for the classification of reviews.
# Also calculating the cross validation score.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
forest = RandomForestClassifier(n_estimators=10, n_jobs=4)
forest = forest.fit(X_train, y_train)
print(forest)
# Mean of 10-fold cross-validation accuracy.
# NOTE(review): cross-validating on the FULL feature matrix (train + test)
# leaks held-out rows into this estimate — consider CV on X_train only.
print(np.mean(cross_val_score(forest, count_vector_array, data.airline_sentiment, cv=10)))
# Predict the result for test data using the model built above.
result = forest.predict(X_test)
conf_mat = confusion_matrix(y_test, result)
print(conf_mat)
# Confusion matrix as a labelled DataFrame (rows = actual, columns = predicted;
# sklearn orders labels alphabetically: negative, neutral, positive).
df_cm = pd.DataFrame(conf_mat, index = [i for i in ['negative','neutral', 'positive']],
columns = [i for i in ['negative', 'neutral', 'positive']])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, fmt='g')
# Compare DecisionTree vs RandomForest on the bag-of-words features.
cls = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=10)
]
cls_name = []
accuracy = []
for cl in cls:
    model = cl.fit(X_train, y_train)
    lbl_pred = model.predict(X_test)
    # accuracy_score is symmetric, but pass (y_true, y_pred) per sklearn convention.
    a = round(100 * accuracy_score(y_test, lbl_pred), 2)
    accuracy.append(a)
    name = cl.__class__.__name__
    cls_name.append(name)
    # enumerate-style manual counter removed; the name is available directly.
    print("{} Accuracy Score : {}%".format(name, a))
    # BUG FIX: classification_report expects (y_true, y_pred); the original
    # passed them swapped, which mislabels per-class precision and recall.
    print(classification_report(y_test, lbl_pred))
# Split data into training and testing set.
from sklearn.model_selection import train_test_split
# Same 70/30 split and seed as above, now on the TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(tfidf_vector_array,data.airline_sentiment, test_size=0.3, random_state=42)
# Using Random Forest to build model for the classification of reviews.
# Also calculating the cross validation score.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
forest = RandomForestClassifier(n_estimators=10, n_jobs=4)
forest = forest.fit(X_train, y_train)
print(forest)
# Mean of 10-fold cross-validation accuracy.
# NOTE(review): cross-validating on the FULL feature matrix (train + test)
# leaks held-out rows into this estimate — consider CV on X_train only.
print(np.mean(cross_val_score(forest, tfidf_vector_array, data.airline_sentiment, cv=10)))
# Predict the result for test data using the model built above.
result = forest.predict(X_test)
conf_mat = confusion_matrix(y_test, result)
print(conf_mat)
# Confusion matrix as a labelled DataFrame (rows = actual, columns = predicted;
# sklearn orders labels alphabetically: negative, neutral, positive).
df_cm = pd.DataFrame(conf_mat, index = [i for i in ['negative','neutral', 'positive']],
columns = [i for i in ['negative', 'neutral', 'positive']])
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True, fmt='g')
# Compare DecisionTree vs RandomForest on the TF-IDF features.
cls = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=10)
]
cls_name = []
accuracy = []
for cl in cls:
    model = cl.fit(X_train, y_train)
    lbl_pred = model.predict(X_test)
    # accuracy_score is symmetric, but pass (y_true, y_pred) per sklearn convention.
    a = round(100 * accuracy_score(y_test, lbl_pred), 2)
    accuracy.append(a)
    name = cl.__class__.__name__
    cls_name.append(name)
    # enumerate-style manual counter removed; the name is available directly.
    print("{} Accuracy Score : {}%".format(name, a))
    # BUG FIX: classification_report expects (y_true, y_pred); the original
    # passed them swapped, which mislabels per-class precision and recall.
    print(classification_report(y_test, lbl_pred))
Here the attributes we take for processing are the airline_sentiment and text columns, where text contains the tweets. The airline_sentiment column is used as the label and text is used to generate the document matrix. The tweets' sentiments are classified into three categories: negative, neutral and positive. We apply HTML tag removal first, followed by expansion of contractions, removal of numbers, tokenization, removal of punctuation and special characters, conversion to lowercase, and removal of stopwords. After all this text processing we perform lemmatization using the WordNetLemmatizer. We then join the words in the tokenized array before generating the tabular representation using CountVectorizer and TF-IDF vectorization.
The data is split into training and test sets, and a RandomForestClassifier model is built with 10 estimators. This is done for both term matrices, developed using the TF-IDF vectorizer and the CountVectorizer. Models built using RandomForest and DecisionTree are also compared. In all cases we get an accuracy between 70-80% in predicting the sentiment of a tweet. The corresponding heatmaps are also generated.