In [159]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
In [160]:
customerData = pd.read_csv("Bank_Personal_Loan_Modelling.csv")  
customerData.shape
Out[160]:
(5000, 14)
In [161]:
customerData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB
In [162]:
customerData.describe().transpose()
Out[162]:
count mean std min 25% 50% 75% max
ID 5000.0 2500.500000 1443.520003 1.0 1250.75 2500.5 3750.25 5000.0
Age 5000.0 45.338400 11.463166 23.0 35.00 45.0 55.00 67.0
Experience 5000.0 20.104600 11.467954 -3.0 10.00 20.0 30.00 43.0
Income 5000.0 73.774200 46.033729 8.0 39.00 64.0 98.00 224.0
ZIP Code 5000.0 93152.503000 2121.852197 9307.0 91911.00 93437.0 94608.00 96651.0
Family 5000.0 2.396400 1.147663 1.0 1.00 2.0 3.00 4.0
CCAvg 5000.0 1.937938 1.747659 0.0 0.70 1.5 2.50 10.0
Education 5000.0 1.881000 0.839869 1.0 1.00 2.0 3.00 3.0
Mortgage 5000.0 56.498800 101.713802 0.0 0.00 0.0 101.00 635.0
Personal Loan 5000.0 0.096000 0.294621 0.0 0.00 0.0 0.00 1.0
Securities Account 5000.0 0.104400 0.305809 0.0 0.00 0.0 0.00 1.0
CD Account 5000.0 0.060400 0.238250 0.0 0.00 0.0 0.00 1.0
Online 5000.0 0.596800 0.490589 0.0 0.00 1.0 1.00 1.0
CreditCard 5000.0 0.294000 0.455637 0.0 0.00 0.0 1.00 1.0
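
Note that Experience has a minimum of -3 in the summary above, which cannot be a valid number of years. A quick sanity check counts the affected rows (a minimal sketch; clipping to zero is one common remedy, left commented out):

In [ ]:
# Experience shows a minimum of -3 in describe(), which is impossible;
# count the affected rows and optionally clip them to zero
print((customerData['Experience'] < 0).sum())
# customerData['Experience'] = customerData['Experience'].clip(lower=0)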
In [163]:
customerData['CD Account'].value_counts(normalize=True)
Out[163]:
0    0.9396
1    0.0604
Name: CD Account, dtype: float64
In [164]:
# ID: Customer ID
# Age: Customer's age in completed years
# Experience: #years of professional experience
# Income: Annual income of the customer ($000)
# ZIP Code: Home Address ZIP code.
# Family: Family size of the customer
# CCAvg: Avg. spending on credit cards per month ($000)
# Education: Education Level. 1: Undergrad; 2: Graduate; 3: Advanced/Professional
# Mortgage: Value of house mortgage if any. ($000)
# Personal Loan: Did this customer accept the personal loan offered in the last campaign?
# Securities Account: Does the customer have a securities account with the bank?
# CD Account: Does the customer have a certificate of deposit (CD) account with the bank?
# Online: Does the customer use internet banking facilities?
# CreditCard: Does the customer use a credit card issued by the bank?
customerData.head()
Out[164]:
ID Age Experience Income ZIP Code Family CCAvg Education Mortgage Personal Loan Securities Account CD Account Online CreditCard
0 1 25 1 49 91107 4 1.6 1 0 0 1 0 0 0
1 2 45 19 34 90089 3 1.5 1 0 0 1 0 0 0
2 3 39 15 11 94720 1 1.0 1 0 0 0 0 0 0
3 4 35 9 100 94112 1 2.7 2 0 0 0 0 0 0
4 5 35 8 45 91330 4 1.0 2 0 0 0 0 0 1
In [165]:
#checking for null values if any
customerData.isnull().values.any()
Out[165]:
False
In [166]:
# number of unique values in the Mortgage column
customerData['Mortgage'].nunique()
Out[166]:
347
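
For all columns at once, DataFrame.nunique() gives the same information in a single call (a small sketch):

In [ ]:
# number of unique values in every column
customerData.nunique()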
In [167]:
#Number of people with zero mortgage?
customerData[customerData['Mortgage'] == 0].count()
Out[167]:
ID                    3462
Age                   3462
Experience            3462
Income                3462
ZIP Code              3462
Family                3462
CCAvg                 3462
Education             3462
Mortgage              3462
Personal Loan         3462
Securities Account    3462
CD Account            3462
Online                3462
CreditCard            3462
dtype: int64
In [168]:
# Number of people with zero credit card spending per month?
customerData[customerData['CCAvg'] == 0].count()
Out[168]:
ID                    106
Age                   106
Experience            106
Income                106
ZIP Code              106
Family                106
CCAvg                 106
Education             106
Mortgage              106
Personal Loan         106
Securities Account    106
CD Account            106
Online                106
CreditCard            106
dtype: int64
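
A more direct idiom for these counts is to sum the boolean mask instead of calling .count() on every column (a sketch of the same computation):

In [ ]:
# True sums as 1, so summing a mask counts the matching rows
print("Zero mortgage:", (customerData['Mortgage'] == 0).sum())
print("Zero CC spending:", (customerData['CCAvg'] == 0).sum())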
In [169]:
#Value counts of all categorical columns

customerData["Family"].value_counts()
Out[169]:
1    1472
2    1296
4    1222
3    1010
Name: Family, dtype: int64
In [170]:
customerData["Education"].value_counts()
Out[170]:
1    2096
3    1501
2    1403
Name: Education, dtype: int64
In [171]:
# the target variable is Personal Loan
# 9.6% of the customers availed a personal loan,
# which confirms the figure given in the problem statement
customerData["Personal Loan"].value_counts()
Out[171]:
0    4520
1     480
Name: Personal Loan, dtype: int64
In [172]:
customerData["Securities Account"].value_counts()
Out[172]:
0    4478
1     522
Name: Securities Account, dtype: int64
In [173]:
customerData["CD Account"].value_counts()
Out[173]:
0    4698
1     302
Name: CD Account, dtype: int64
In [174]:
customerData["Online"].value_counts()
Out[174]:
1    2984
0    2016
Name: Online, dtype: int64
In [175]:
customerData["CreditCard"].value_counts()
Out[175]:
0    3530
1    1470
Name: CreditCard, dtype: int64
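
The value counts of all the categorical columns can also be produced in one cell with a loop (a compact sketch; the column list follows the data dictionary above):

In [ ]:
# value counts of all categorical columns in one pass
categorical_cols = ["Family", "Education", "Personal Loan",
                    "Securities Account", "CD Account", "Online", "CreditCard"]
for col in categorical_cols:
    print(customerData[col].value_counts(), "\n")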
In [176]:
#Univariate analysis
sns.distplot(customerData["Age"], kde=0)
Out[176]:
<matplotlib.axes._subplots.AxesSubplot at 0x280cc736348>
In [177]:
sns.distplot(customerData["Experience"], kde=0)
Out[177]:
<matplotlib.axes._subplots.AxesSubplot at 0x280cc7f3ac8>
In [178]:
sns.distplot(customerData["Family"], kde=0)
Out[178]:
<matplotlib.axes._subplots.AxesSubplot at 0x280cc7fbe88>
In [179]:
sns.distplot(customerData["Education"], kde=0)
Out[179]:
<matplotlib.axes._subplots.AxesSubplot at 0x280cc929f08>
In [180]:
sns.distplot(customerData["Securities Account"], kde=0)
Out[180]:
<matplotlib.axes._subplots.AxesSubplot at 0x280cc99cb88>
In [181]:
sns.distplot(customerData["CD Account"], kde=0)
Out[181]:
<matplotlib.axes._subplots.AxesSubplot at 0x280cc1c12c8>
In [182]:
sns.distplot(customerData["Online"], kde=0)
Out[182]:
<matplotlib.axes._subplots.AxesSubplot at 0x280ccb62188>
In [183]:
sns.distplot(customerData["CreditCard"], kde=0)
Out[183]:
<matplotlib.axes._subplots.AxesSubplot at 0x280ccbf3848>
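
Note that sns.distplot is deprecated in seaborn 0.11 and later; the same histograms can be drawn with sns.histplot (a sketch for one column):

In [ ]:
# sns.distplot is deprecated in seaborn >= 0.11; histplot is the replacement
sns.histplot(customerData["Age"], kde=False)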
In [184]:
# bivariate analysis
customerData.corr()
# there is a very high correlation (0.99) between Age and Experience
Out[184]:
ID Age Experience Income ZIP Code Family CCAvg Education Mortgage Personal Loan Securities Account CD Account Online CreditCard
ID 1.000000 -0.008473 -0.008326 -0.017695 0.013432 -0.016797 -0.024675 0.021463 -0.013920 -0.024801 -0.016972 -0.006909 -0.002528 0.017028
Age -0.008473 1.000000 0.994215 -0.055269 -0.029216 -0.046418 -0.052012 0.041334 -0.012539 -0.007726 -0.000436 0.008043 0.013702 0.007681
Experience -0.008326 0.994215 1.000000 -0.046574 -0.028626 -0.052563 -0.050077 0.013152 -0.010582 -0.007413 -0.001232 0.010353 0.013898 0.008967
Income -0.017695 -0.055269 -0.046574 1.000000 -0.016410 -0.157501 0.645984 -0.187524 0.206806 0.502462 -0.002616 0.169738 0.014206 -0.002385
ZIP Code 0.013432 -0.029216 -0.028626 -0.016410 1.000000 0.011778 -0.004061 -0.017377 0.007383 0.000107 0.004704 0.019972 0.016990 0.007691
Family -0.016797 -0.046418 -0.052563 -0.157501 0.011778 1.000000 -0.109275 0.064929 -0.020445 0.061367 0.019994 0.014110 0.010354 0.011588
CCAvg -0.024675 -0.052012 -0.050077 0.645984 -0.004061 -0.109275 1.000000 -0.136124 0.109905 0.366889 0.015086 0.136534 -0.003611 -0.006689
Education 0.021463 0.041334 0.013152 -0.187524 -0.017377 0.064929 -0.136124 1.000000 -0.033327 0.136722 -0.010812 0.013934 -0.015004 -0.011014
Mortgage -0.013920 -0.012539 -0.010582 0.206806 0.007383 -0.020445 0.109905 -0.033327 1.000000 0.142095 -0.005411 0.089311 -0.005995 -0.007231
Personal Loan -0.024801 -0.007726 -0.007413 0.502462 0.000107 0.061367 0.366889 0.136722 0.142095 1.000000 0.021954 0.316355 0.006278 0.002802
Securities Account -0.016972 -0.000436 -0.001232 -0.002616 0.004704 0.019994 0.015086 -0.010812 -0.005411 0.021954 1.000000 0.317034 0.012627 -0.015028
CD Account -0.006909 0.008043 0.010353 0.169738 0.019972 0.014110 0.136534 0.013934 0.089311 0.316355 0.317034 1.000000 0.175880 0.278644
Online -0.002528 0.013702 0.013898 0.014206 0.016990 0.010354 -0.003611 -0.015004 -0.005995 0.006278 0.012627 0.175880 1.000000 0.004210
CreditCard 0.017028 0.007681 0.008967 -0.002385 0.007691 0.011588 -0.006689 -0.011014 -0.007231 0.002802 -0.015028 0.278644 0.004210 1.000000
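
A heatmap makes the correlation matrix easier to scan than the raw table (a sketch):

In [ ]:
# visualize the correlation matrix; the Age/Experience pair stands out at 0.99
plt.figure(figsize=(12, 8))
sns.heatmap(customerData.corr(), annot=True, fmt='.2f', cmap="YlGnBu")
plt.show()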
In [185]:
# dropping ID and ZIP Code as they carry no predictive significance
customerData.drop("ID", axis=1, inplace=True)
customerData.drop("ZIP Code", axis=1 , inplace=True)
sns.pairplot(customerData)
Out[185]:
<seaborn.axisgrid.PairGrid at 0x280ccbf3d08>
In [186]:
sns.pairplot(customerData, vars=[ 'Income','CCAvg', 'Mortgage', 'Personal Loan'])
Out[186]:
<seaborn.axisgrid.PairGrid at 0x280d42c69c8>
In [187]:
customerData.skew()
Out[187]:
Age                  -0.029341
Experience           -0.026325
Income                0.841339
Family                0.155221
CCAvg                 1.598443
Education             0.227093
Mortgage              2.104002
Personal Loan         2.743607
Securities Account    2.588268
CD Account            3.691714
Online               -0.394785
CreditCard            0.904589
dtype: float64
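
Mortgage and CCAvg are strongly right-skewed; if a skew-sensitive model were used, a log1p transform would be one common option (shown for illustration only; the modelling below uses the raw columns):

In [ ]:
# log1p noticeably reduces the right skew of Mortgage and CCAvg
# (illustration only; not applied to the data used for modelling below)
np.log1p(customerData[['Mortgage', 'CCAvg']]).skew()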
In [188]:
# draws a confusion matrix heatmap for actual vs. predicted labels
from sklearn.metrics import confusion_matrix

def draw_confusion_matrix(actual, predicted):
    cm = confusion_matrix(actual, predicted)
    sns.heatmap(cm, cmap="YlGnBu", annot=True, fmt='d', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
In [189]:
# we have already dropped the non-relevant columns (ID and ZIP Code)
# we split the dataset into training and test sets with a 70:30 cut

x = customerData.drop(["Personal Loan"], axis=1)
y = customerData["Personal Loan"]

# split the data into train and test (train_test_split was imported above)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=7)

from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=30, penalty='l2', C=0.90, solver='liblinear')
logreg.fit(x_train, y_train)
y_predict = logreg.predict(x_test)

print("Training accuracy", logreg.score(x_train, y_train))
print()
print("Testing accuracy", logreg.score(x_test, y_test))
print()
print('Confusion Matrix')
draw_confusion_matrix(y_test, y_predict)
print()
print("Recall:", recall_score(y_test, y_predict))
print()
print("Precision:", precision_score(y_test, y_predict))
print()
print("F1 Score:", f1_score(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
Training accuracy 0.9488571428571428

Testing accuracy 0.9533333333333334

Confusion Matrix

Recall: 0.6376811594202898

Precision: 0.8148148148148148

F1 Score: 0.7154471544715448

Roc Auc Score: 0.8114984358041243
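
The features here are on very different scales (Income runs up to 224 while the binary flags are 0/1), and penalized logistic regression is scale-sensitive; standardizing the features in a pipeline is a variant worth trying (a hedged sketch, separate from the tuning actually applied below):

In [ ]:
# a scaled variant of the same model; standardization often helps
# penalized logistic regression, though results here may differ
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_logreg = make_pipeline(StandardScaler(),
                              LogisticRegression(C=0.90, solver='liblinear', random_state=30))
scaled_logreg.fit(x_train, y_train)
print("Testing accuracy (scaled):", scaled_logreg.score(x_test, y_test))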

Parameter tuning to improve results.

Below we improve the testing accuracy by changing some of the parameters of the logistic regression. We are aiming for a good overall testing accuracy with a balance of recall and precision, with precision given more importance.

In [190]:
# we change the logistic regression parameters to see if they improve the performance metrics


logreg_improve = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

logreg_improve.fit(x_train, y_train)
y_predict = logreg_improve.predict(x_test)

print("Trainig accuracy",logreg_improve.score(x_train,y_train))  
print()
print("Testing accuracy",logreg_improve.score(x_test, y_test))
print()
print('Confusion Matrix')
draw_confusion_matrix(y_test, y_predict)
print()
print("Recall:",recall_score(y_test,y_predict))
print()
print("Precision:",precision_score(y_test,y_predict))
print()
print("F1 Score:",f1_score(y_test,y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))
Training accuracy 0.9508571428571428

Testing accuracy 0.9553333333333334

Confusion Matrix

Recall: 0.644927536231884

Precision: 0.8317757009345794

F1 Score: 0.726530612244898

Roc Auc Score: 0.8158558386005235
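
Rather than hand-picking C and the penalty, a grid search over those parameters automates the tuning (a sketch; the grid values are illustrative assumptions, and precision is used as the scoring metric since it matters most here):

In [ ]:
# systematic alternative to the manual tuning above; grid values are illustrative
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 0.5, 1.0, 10.0], 'penalty': ['l1', 'l2']}
grid = GridSearchCV(LogisticRegression(solver='liblinear', random_state=42),
                    param_grid, scoring='precision', cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_, grid.best_score_)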

OBSERVATION

This model can identify customers who are likely targets for a personal loan. It has a testing accuracy of around 95.53%. Since the loss to the bank is greater on false positive cases, precision is the key metric; here precision is 0.83, which is good. Only 18 false positives are reported per the heatmap above.

What the confusion matrix means

True Positive (observed=1, predicted=1):

Predicted that the customer was a good personal loan target, and the customer was indeed a target for a personal loan.

False Positive (observed=0, predicted=1):

Predicted that the customer was a target for a personal loan, but the customer was not a good target.

True Negative (observed=0, predicted=0):

Predicted that the customer was not a good target for a personal loan, and the customer was indeed not a good target.

False Negative (observed=1, predicted=0):

Predicted that the customer was not a good target for a personal loan, but the customer actually was a good target.

Here the bank wants to offer personal loans only to people who are good candidates for them, i.e. it wants a low number of false positives: if FP is high, the bank loses money on people who are not good candidates for the loan. Hence precision is the important metric.

In the case of a false negative the bank loses a few potential customers, but that is acceptable, because the bank would rather protect its money than lend to customers who are not good candidates.
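
Since precision matters more than recall here, the classification threshold can also be raised above the default 0.5 to trade recall for precision (a sketch; the 0.7 threshold is an illustrative choice, not a tuned value):

In [ ]:
# raising the decision threshold trades recall for precision;
# 0.7 is an illustrative choice, not a tuned value
y_prob = logreg_improve.predict_proba(x_test)[:, 1]
y_predict_strict = (y_prob >= 0.7).astype(int)
print("Precision:", precision_score(y_test, y_predict_strict))
print("Recall:", recall_score(y_test, y_predict_strict))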

After achieving the desired accuracy we can deploy the model for practical use. The bank can better target its marketing campaign, focusing on customers who have a better chance of converting into personal loan customers.