import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
# Load the campaign data set (path is relative to the working directory).
customerData = pd.read_csv(filepath_or_buffer="Bank_Personal_Loan_Modelling.csv")
# Structural overview: dimensions, dtypes / non-null counts, summary statistics.
customerData.shape
customerData.info()
customerData.describe().T
# Relative frequency of each value in the "CD Account" column.
customerData["CD Account"].value_counts(normalize=True)
# ID: Customer ID
# Age: Customer's age in completed years
# Experience: Number of years of professional experience
# Income: Annual income of the customer ($000)
# ZIP Code: Home Address ZIP code.
# Family: Family size of the customer
# CCAvg: Avg. spending on credit cards per month ($000)
# Education: Education Level. 1: Undergrad; 2: Graduate; 3: Advanced/Professional
# Mortgage: Value of house mortgage if any. ($000)
# Personal Loan: Did this customer accept the personal loan offered in the last campaign?
# Securities Account: Does the customer have a securities account with the bank?
# CD Account: Does the customer have a certificate of deposit (CD) account with the bank?
# Online: Does the customer use internet banking facilities?
# CreditCard: Does the customer use a credit card issued by the bank?
customerData.head()

# Check whether there are any null values at all.
customerData.isnull().values.any()

# Number of unique values in each column (whole frame, not just "Mortgage").
customerData.nunique()

# Number of people with zero mortgage.
# Summing the boolean mask yields a single count; filtering the frame and
# calling .count() would instead return one non-null count per column.
(customerData["Mortgage"] == 0).sum()

# Number of people with zero credit card spending per month.
(customerData["CCAvg"] == 0).sum()

# Value counts of all categorical columns.
customerData["Family"].value_counts()
customerData["Education"].value_counts()

# "Personal Loan" is the target variable.
# Author's observation: 9.6% of the customers had availed personal loans,
# which confirms the figure provided in the question.
customerData["Personal Loan"].value_counts()
customerData["Securities Account"].value_counts()
customerData["CD Account"].value_counts()
customerData["Online"].value_counts()
customerData["CreditCard"].value_counts()
# Univariate analysis
# One histogram per column. Each plot gets its own figure so that consecutive
# plots do not pile up on the same axes, and sns.histplot replaces the
# deprecated sns.distplot(..., kde=0) (histplot draws no KDE by default).
univariate_columns = [
    "Age",
    "Experience",
    "Family",
    "Education",
    "Securities Account",
    "CD Account",
    "Online",
    "CreditCard",
]
for column in univariate_columns:
    plt.figure()
    sns.histplot(data=customerData, x=column, kde=False)
    plt.show()
# Bivariate analysis: pairwise correlation of all columns.
customerData.corr()
# Author's observation: Age and Experience are highly correlated.

# Drop the columns considered not significant, one at a time and in place.
for insignificant_column in ("ID", "ZIP Code"):
    customerData.drop(columns=insignificant_column, inplace=True)

# Scatter-matrix of every remaining column, then a focused one on the
# monetary columns together with the target.
sns.pairplot(customerData)
focus_columns = ['Income', 'CCAvg', 'Mortgage', 'Personal Loan']
sns.pairplot(customerData, vars=focus_columns)

# Skewness of each remaining column.
customerData.skew()
# drawing confusion matrix
def draw_confusion_matrix( actual, predicted ):
    """Plot the confusion matrix of ``actual`` vs ``predicted`` as a heatmap.

    Rows are labelled as the actual classes and columns as the predicted
    classes, both with tick labels 0 and 1. The figure is shown immediately
    via ``plt.show()``; the function returns ``None``.

    Parameters
    ----------
    actual : array-like
        Ground-truth class labels.
    predicted : array-like
        Predicted class labels, same length as ``actual``.
    """
    # Imported locally so the helper does not depend on a module-level import
    # that appears further down the file.
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(actual, predicted)
    # The cells are integer counts, so annotate them as integers ('d')
    # rather than as floats with two decimals.
    sns.heatmap(cm, cmap="YlGnBu", annot=True, fmt='d',
                xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
# Non-relevant columns (ID and ZIP Code) have already been dropped.
# Split the data set for training and testing with a 70:30 cut.
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
from sklearn.linear_model import LogisticRegression

# Features are every column except the target "Personal Loan".
x = customerData.drop(["Personal Loan"], axis=1)
y = customerData["Personal Loan"]

# Split the data into train and test sets (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=7)

# Baseline model: L2-regularised logistic regression.
logreg = LogisticRegression(random_state=30, penalty='l2', C=0.90, solver='liblinear')
logreg.fit(x_train, y_train)
y_predict = logreg.predict(x_test)
# Probability of the positive class (second column, i.e. classes_[1]);
# ROC AUC needs a continuous score, not the thresholded 0/1 predictions.
y_predict_proba = logreg.predict_proba(x_test)[:, 1]

print("Trainig accuracy",logreg.score(x_train,y_train))
print()
print("Testing accuracy",logreg.score(x_test, y_test))
print()
print('Confusion Matrix')
# draw_confusion_matrix shows the plot itself and returns None, so it is
# called directly instead of being wrapped in print().
draw_confusion_matrix(y_test, y_predict)
print()
print("Recall:",recall_score(y_test,y_predict))
print()
print("Precision:",precision_score(y_test,y_predict))
print()
print("F1 Score:",f1_score(y_test,y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict_proba))
# Change the Logistic Regression parameters to see whether the performance
# metrics improve.
# Only the settings that differ from the library defaults are passed: an L1
# penalty (with the liblinear solver, which supports it) and a different seed.
# Every other argument previously listed here was the default value, and
# spelling out `multi_class` triggers a deprecation warning on recent
# scikit-learn releases.
logreg_improve = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
logreg_improve.fit(x_train, y_train)
y_predict = logreg_improve.predict(x_test)
# Probability of the positive class (second column, i.e. classes_[1]);
# ROC AUC needs a continuous score, not the thresholded 0/1 predictions.
y_predict_proba_improve = logreg_improve.predict_proba(x_test)[:, 1]

print("Trainig accuracy",logreg_improve.score(x_train,y_train))
print()
print("Testing accuracy",logreg_improve.score(x_test, y_test))
print()
print('Confusion Matrix')
# draw_confusion_matrix shows the plot itself and returns None, so it is
# called directly instead of being wrapped in print().
draw_confusion_matrix(y_test, y_predict)
print()
print("Recall:",recall_score(y_test,y_predict))
print()
print("Precision:",precision_score(y_test,y_predict))
print()
print("F1 Score:",f1_score(y_test,y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict_proba_improve))
What the confusion matrix means:
True Positive (observed=1,predicted=1):
Predicted that customer was a good Personal loan target and the customer was indeed a target for Personal loan
False Positive (observed=0,predicted=1):
Predicted that the customer was a good target for a Personal loan, but observed that the customer was not a good target for a Personal loan
True Negative (observed=0,predicted=0):
Predicted that customer was not a good target for personal loan and the customer was not good target for Personal loan
False Negative (observed=1,predicted=0):
Predicted that customer was not a good target for personal loan and the customer was a good target for personal loan
Here the bank wants to offer Personal loans only to the people who are eligible for a Personal loan, i.e. it wants a low number of False Positives: if FP is high, the bank would lose money on people who are not eligible for the loan. Hence Precision is the important metric.
In the case of False Negatives the bank will lose a few customers, but that is okay, because the bank would rather retain its money than gain customers who are not eligible for a loan.
After achieving the desired accuracy we can deploy the model for practical use. The bank can then better target its marketing campaign by focusing on customers who have a better chance of converting to Personal loan customers.