import pandas as pd # library for working with dataframes
import numpy as np # library for working with arrays
import matplotlib.pyplot as plt # low level visualization library
%matplotlib inline
import seaborn as sns # higher level visualization library compared to matplotlib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_auc_score
from IPython.display import Image
import pydotplus as pydot
from sklearn import tree
from os import system
from yellowbrick.classifier import ClassificationReport, ROCAUC
plt.style.use('ggplot')
pd.options.display.float_format = '{:,.2f}'.format
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
# read the csv file into a dataframe
# (bank-full.csv: UCI bank-marketing data; semicolon-delimited variants exist —
# presumably this copy is comma-delimited, verify against the file)
df=pd.read_csv("bank-full.csv")
# Bank client data:
# Preview the first five rows of the data.
df.head()
# 'Target' is our dependent variable; the rest are the independent variables.
df.shape
# there are 45211 rows and 17 columns in the dataframe.
df.info()
# There are continuous and non-continuous variables in the dataframe:
# age, balance, day, duration, campaign, pdays, previous are continuous.
df.describe().transpose()
# Observations
# - Average age is around 40-41.
# - Average balance in account is 1362 and there are members with negative balance.
# - The five-point summary (mean, median, quartiles, ...) covers all continuous
#   variables and none of their values are strings, so these columns hold
#   clean numeric data.
df.isnull().values.any()
# There are no null values in the dataframe.
# Count the distinct values in every column.
df_unique = df.nunique().to_frame().reset_index()
df_unique.columns = ['Variable','DistinctCount']
print(df_unique)
# Age Box plot
sns.boxplot(x=df['age'])
# Balance Box plot
sns.boxplot(x=df['balance'])
# There are outliers in the balance data.
# Day Box plot
sns.boxplot(x=df['day'])
# duration Box plot
sns.boxplot(x=df['duration'])
# There are outliers in duration.
# campaign Box plot
sns.boxplot(x=df['campaign'])
# There are outliers in campaign data.
# pdays Box plot
sns.boxplot(x=df['pdays'])
# pdays is highly skewed.
# previous Box plot (original comment wrongly said "pdays")
sns.boxplot(x=df['previous'])
# previous is highly skewed.
df['pdays'].value_counts(normalize= True)
# pdays is highly skewed: out of 45211 values, 36954 have the value -1
# (-1 means the person has not been contacted, or the contact was beyond 900 days ago).
df['previous'].value_counts(normalize=True)
# previous is highly skewed: out of 45211 values, 36954 have the value 0.
# Both columns are dominated by a single sentinel value, so drop them.
df.drop(['pdays','previous'], axis=1,inplace = True)
df.head()
# converting variables to categorical variables
# Cast every object-dtype (string) column to pandas' categorical dtype,
# then confirm the dtypes with info().
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype('category')
df.info()
# Examine each categorical variable and its level counts.
df['job'].value_counts()
df['marital'].value_counts()
df['education'].value_counts()
df['default'].value_counts()
df['housing'].value_counts()
df['loan'].value_counts()
df['contact'].value_counts()
df['month'].value_counts()
# Most of the people were contacted in the month of May;
# Dec, Mar, Sep, Oct were the least-contacted months.
df['poutcome'].value_counts()
df['Target'].value_counts()
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(10,8))
# numeric_only=True: pandas >= 2.0 raises a TypeError on the categorical
# columns without it; older pandas silently dropped them, so the displayed
# result is unchanged.
sns.heatmap(df.corr(numeric_only=True),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False)
plt.show()
# We don't want any highly correlated variables; we would keep only one column
# from each highly correlated group. No highly correlated pairs are seen here.
sns.scatterplot(x="age", y="balance", hue='Target', data=df)
# When balance had increased the person had not taken a term deposit.
sns.scatterplot(x="day", y="duration", hue='Target', data=df)
sns.catplot(x="Target", y="balance", hue="marital", kind="bar", data=df);
sns.catplot(x="Target", y="balance", hue="job", kind="bar", data=df);
sns.catplot(x="Target", y="balance", hue="housing", kind="bar", data=df);
# NOTE(review): sns.distplot is deprecated (removed in recent seaborn);
# sns.histplot / sns.displot are the modern replacements.
sns.distplot(df['age'])
sns.distplot(df['balance'])
# balance seems to be highly skewed.
# NOTE(review): this loop repeats the object->categorical conversion already
# performed above, so it is a no-op the second time through.
# (Indentation of the loop body was lost in the notebook export.)
for feature in df.columns:
if df[feature].dtype == 'object':
df[feature] = pd.Categorical(df[feature])
df.head()
# finding out unique values per column
df.nunique()
df.dtypes
df['job'].value_counts()
df['education'].value_counts()
df['poutcome'].value_counts()
df.skew()
# balance is highly skewed, due to outliers.
# Winsorize each skewed continuous column by clamping it to its own
# inter-quartile range, then re-check the skew.
# NOTE(review): clipping at Q1/Q3 themselves (rather than the usual
# Q1 - 1.5*IQR / Q3 + 1.5*IQR fences) flattens half the data; the original
# behaviour is preserved here, just expressed with Series.clip instead of
# two np.where passes.
Q1 = df["balance"].quantile(0.25)
Q3 = df["balance"].quantile(0.75)
df["balance"] = df["balance"].clip(lower=Q1, upper=Q3)
df['balance'].skew()
Q1 = df["duration"].quantile(0.25)
Q3 = df["duration"].quantile(0.75)
df["duration"] = df["duration"].clip(lower=Q1, upper=Q3)
df['duration'].skew()
Q1 = df["campaign"].quantile(0.25)
Q3 = df["campaign"].quantile(0.75)
df["campaign"] = df["campaign"].clip(lower=Q1, upper=Q3)
df['campaign'].skew()
df.skew()
# Scale the widest-range continuous columns to [0, 1].
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['balance', 'duration']] = scaler.fit_transform(df[['balance', 'duration']])
df
df.head()
# One-hot encode every categorical predictor.
oneHotCols=["job","marital","education","default", "housing", "loan", "contact", "month", "poutcome"]
df=pd.get_dummies(df, columns=oneHotCols)
# Separate predictors from the target.
# NOTE: df.pop also removes 'Target' from df itself (mutates df).
X = df.drop("Target" , axis=1)
y = df.pop("Target")
# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)
## function to get confusion matrix in a proper format
def draw_cm( actual, predicted ):
    """Render the confusion matrix of *actual* vs *predicted* as an annotated heatmap."""
    labels = [0, 1]
    ax = sns.heatmap(confusion_matrix(actual, predicted),
                     annot=True, fmt='.2f',
                     xticklabels=labels, yticklabels=labels)
    ax.set_ylabel('Observed')
    ax.set_xlabel('Predicted')
    plt.show()
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression baseline on the training split.
model = LogisticRegression(solver="liblinear")
model.fit(X_train, y_train)
# Predict on the held-out test split.
pred_logit = model.predict(X_test)
model_score = model.score(X_test, y_test)  # mean accuracy on the test set
print(model_score)
# Collect the four classification metrics; 'yes' (took the term deposit)
# is the positive class.
acc_logit = accuracy_score(y_test, pred_logit)
recall_logit = recall_score(y_test, pred_logit, pos_label="yes")
precision_logit = precision_score(y_test, pred_logit,pos_label="yes" )
f1_logit = f1_score(y_test, pred_logit, pos_label="yes")
# BUG FIX: the 'recall' column previously stored f1_logit; it now stores recall_logit.
resultsDf = pd.DataFrame({'Method':['Logistic Regression'], 'accuracy': acc_logit, 'recall': recall_logit, 'precision': precision_logit , 'f1_score' : f1_logit })
resultsDf.reset_index(drop=True)
# Confusion matrix
pd.crosstab(y_test, pred_logit, rownames=['Actual'], colnames=['Predicted'])
draw_cm(y_test, pred_logit)
# ROC curve / AUC for the logistic model (yellowbrick fits a fresh estimator).
roc = ROCAUC(LogisticRegression(solver="liblinear"))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()
# Visualize model performance (precision/recall/f1 per class) with yellowbrick.
viz = ClassificationReport(LogisticRegression(solver="liblinear"))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()
# NOTE(review): dTree and dTreeR share the same configuration apart from
# random_state; dTree is kept only for its printed train/test accuracies,
# the tabulated metrics below come from dTreeR.
dTree = DecisionTreeClassifier(criterion = 'entropy', random_state=22, max_depth=4)
dTree.fit(X_train, y_train)
print(dTree.score(X_train, y_train))
print(dTree.score(X_test, y_test))
dTreeR = DecisionTreeClassifier(criterion = 'entropy', max_depth = 4, random_state=1)
dTreeR.fit(X_train, y_train)
print(dTreeR.score(X_train, y_train))
print(dTreeR.score(X_test, y_test))
pred_dTreeR = dTreeR.predict(X_test)
acc_dTreeR = accuracy_score(y_test, pred_dTreeR)
recall_dTreeR = recall_score(y_test, pred_dTreeR, pos_label="yes")
precision_dTreeR = precision_score(y_test, pred_dTreeR, pos_label="yes")
f1_dTreeR = f1_score(y_test, pred_dTreeR, pos_label="yes")
# BUG FIX: the 'recall' column previously stored f1_dTreeR; it now stores recall_dTreeR.
tempResultsDf = pd.DataFrame({'Method':['DecisionTree'], 'accuracy': acc_dTreeR, 'recall': recall_dTreeR, 'precision': precision_dTreeR , 'f1_score' : f1_dTreeR })
# Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.concat([resultsDf, tempResultsDf])
tempResultsDf.reset_index(drop=True)
# Confusion matrix
pd.crosstab(y_test, pred_dTreeR, rownames=['Actual'], colnames=['Predicted'])
draw_cm(y_test, pred_dTreeR)
# FIX: visualize the same configuration whose metrics were tabulated above
# (dTreeR: entropy, depth 4, random_state=1); the original used criterion
# 'gini' here, so the plots described a different model.
roc = ROCAUC(DecisionTreeClassifier(criterion = "entropy", max_depth=4, random_state=1))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()
# Visualize model performance with yellowbrick library
viz = ClassificationReport(DecisionTreeClassifier(criterion = "entropy", max_depth=4, random_state=1))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()
from sklearn.ensemble import RandomForestClassifier
# FIX: added random_state=22 so the forest (and the tabulated metrics) are
# reproducible, consistent with the other ensemble models in this notebook.
rfcl = RandomForestClassifier(criterion = 'gini' , n_estimators = 50, random_state=22)
rfcl = rfcl.fit(X_train, y_train)
print(rfcl.score(X_train, y_train))
print(rfcl.score(X_test, y_test))
pred_rfcl = rfcl.predict(X_test)
# Classification metrics with 'yes' as the positive class.
acc_rfcl = accuracy_score(y_test, pred_rfcl)
recall_rfcl = recall_score(y_test, pred_rfcl, pos_label="yes")
precision_rfcl = precision_score(y_test, pred_rfcl, pos_label="yes")
f1_rfcl = f1_score(y_test, pred_rfcl, pos_label="yes")
tempResultsDf = pd.DataFrame({'Method':['RandomForest'], 'accuracy': acc_rfcl, 'recall':recall_rfcl, 'precision': precision_rfcl , 'f1_score' : f1_rfcl })
# Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.concat([resultsDf, tempResultsDf])
tempResultsDf.reset_index(drop=True)
# Confusion matrix
pd.crosstab(y_test, pred_rfcl, rownames=['Actual'], colnames=['Predicted'])
draw_cm(y_test, pred_rfcl)
# FIX: added random_state=22 so the visualized forest is reproducible and
# matches the configuration used elsewhere in this notebook.
roc = ROCAUC(RandomForestClassifier(criterion = 'gini' , n_estimators = 50, random_state=22))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()
# Visualize model performance with yellowbrick library
viz = ClassificationReport(RandomForestClassifier(criterion = 'gini' , n_estimators = 50, random_state=22))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()
from sklearn.ensemble import AdaBoostClassifier
# Fit an AdaBoost ensemble (50 stages, conservative learning rate, fixed seed).
abcl = AdaBoostClassifier( n_estimators = 50, learning_rate = 0.1, random_state=22)
abcl = abcl.fit(X_train, y_train)
print(abcl.score(X_train, y_train))
print(abcl.score(X_test, y_test))
pred_abcl = abcl.predict(X_test)
# Classification metrics with 'yes' as the positive class.
acc_abcl = accuracy_score(y_test, pred_abcl)
recall_abcl = recall_score(y_test, pred_abcl, pos_label="yes")
precision_abcl = precision_score(y_test, pred_abcl, pos_label="yes")
f1_abcl = f1_score(y_test, pred_abcl, pos_label="yes")
tempResultsDf = pd.DataFrame({'Method':['AdaBoost'], 'accuracy': acc_abcl, 'recall':recall_abcl, 'precision': precision_abcl , 'f1_score' : f1_abcl })
# Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.concat([resultsDf, tempResultsDf])
tempResultsDf.reset_index(drop=True)
# Confusion matrix
pd.crosstab(y_test, pred_abcl, rownames=['Actual'], colnames=['Predicted'])
draw_cm(y_test, pred_abcl)
# FIX: use the same hyperparameters as the tabulated abcl model
# (the original omitted learning_rate=0.1 and random_state=22 here,
# so the plots described a different, non-reproducible model).
roc = ROCAUC(AdaBoostClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()
# Visualize model performance with yellowbrick library
viz = ClassificationReport(AdaBoostClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble: 50 estimators on 70% bootstrap samples; oob_score=True
# lets the out-of-bag rows serve as an internal validation set.
bgcl = BaggingClassifier( n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22)
bgcl = bgcl.fit(X_train, y_train)
print(bgcl.score(X_train, y_train))
print(bgcl.score(X_test, y_test))
pred_bgcl = bgcl.predict(X_test)
# Classification metrics with 'yes' as the positive class.
acc_bgcl = accuracy_score(y_test, pred_bgcl)
recall_bgcl = recall_score(y_test, pred_bgcl, pos_label="yes")
precision_bgcl = precision_score(y_test, pred_bgcl, pos_label="yes")
f1_bgcl = f1_score(y_test, pred_bgcl, pos_label="yes")
tempResultsDf = pd.DataFrame({'Method':['Bagging'], 'accuracy': acc_bgcl, 'recall':recall_bgcl, 'precision': precision_bgcl , 'f1_score' : f1_bgcl })
# Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.concat([resultsDf, tempResultsDf])
tempResultsDf.reset_index(drop=True)
# Confusion matrix
pd.crosstab(y_test, pred_bgcl, rownames=['Actual'], colnames=['Predicted'])
draw_cm(y_test, pred_bgcl)
# BUG FIX: this section evaluates the *Bagging* model, but the original code
# constructed a GradientBoostingClassifier here — which is also not imported
# until later in the file, so running top-to-bottom raised a NameError.
# Both visualizers now use the same configuration as the tabulated bgcl model.
roc = ROCAUC(BaggingClassifier(n_estimators=50, max_samples=.7, bootstrap=True, oob_score=True, random_state=22))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()
# Visualize model performance with yellowbrick library
viz = ClassificationReport(BaggingClassifier(n_estimators=50, max_samples=.7, bootstrap=True, oob_score=True, random_state=22))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosting ensemble: 50 stages, conservative learning rate, fixed seed.
gbcl = GradientBoostingClassifier( n_estimators = 50, learning_rate = 0.1, random_state=22)
gbcl = gbcl.fit(X_train, y_train)
print(gbcl.score(X_train, y_train))
print(gbcl.score(X_test, y_test))
pred_gbcl = gbcl.predict(X_test)
# Classification metrics with 'yes' as the positive class.
acc_gbcl = accuracy_score(y_test, pred_gbcl)
recall_gbcl = recall_score(y_test, pred_gbcl, pos_label="yes")
precision_gbcl = precision_score(y_test, pred_gbcl, pos_label="yes")
f1_gbcl = f1_score(y_test, pred_gbcl, pos_label="yes")
tempResultsDf = pd.DataFrame({'Method':['GradientBoost'], 'accuracy': acc_gbcl, 'recall':recall_gbcl, 'precision': precision_gbcl , 'f1_score' : f1_gbcl })
# Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.concat([resultsDf, tempResultsDf])
tempResultsDf.reset_index(drop=True)
# Confusion matrix
pd.crosstab(y_test, pred_gbcl, rownames=['Actual'], colnames=['Predicted'])
draw_cm(y_test, pred_gbcl)
# ROC/AUC for the gradient-boosting model (same hyperparameters as gbcl).
roc = ROCAUC( GradientBoostingClassifier( n_estimators = 50, learning_rate = 0.1, random_state=22))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()
# Visualize model performance with yellowbrick library
viz = ClassificationReport( GradientBoostingClassifier( n_estimators = 50, learning_rate = 0.1, random_state=22))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()
# Final side-by-side comparison of all models.
resultsDf.reset_index(drop=True)
# Conclusion: most models reach roughly 0.90 accuracy. To maximise reach to
# potential term-deposit customers we must minimise false negatives — customers
# who would actually subscribe but are labelled as non-potential. On that
# criterion we select the Bagging classifier, which also has the most
# favourable f1 score.