In [232]:
import warnings
warnings.filterwarnings('ignore')
In [233]:
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

import seaborn as sns
from sklearn.model_selection  import train_test_split
from sklearn.cluster import KMeans, AgglomerativeClustering 
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.stats import zscore
from scipy.spatial.distance import cdist, pdist
import sklearn.metrics


df = pd.read_excel("Credit Card Customer Data.xlsx")
df.dtypes
Out[233]:
Sl_No                  int64
Customer Key           int64
Avg_Credit_Limit       int64
Total_Credit_Cards     int64
Total_visits_bank      int64
Total_visits_online    int64
Total_calls_made       int64
dtype: object
In [234]:
#examining first few rows
df.head()
Out[234]:
Sl_No Customer Key Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made
0 1 87073 100000 2 1 1 0
1 2 38414 50000 3 0 10 9
2 3 17341 50000 7 1 3 4
3 4 40496 30000 5 1 1 4
4 5 47437 100000 6 0 12 3
In [235]:
#dataframe has 660 rows and 7 columns
df.shape
Out[235]:
(660, 7)
In [236]:
# examining for null values
df.isna().sum()
Out[236]:
Sl_No                  0
Customer Key           0
Avg_Credit_Limit       0
Total_Credit_Cards     0
Total_visits_bank      0
Total_visits_online    0
Total_calls_made       0
dtype: int64
In [237]:
#five point summary
df.describe().transpose()
Out[237]:
count mean std min 25% 50% 75% max
Sl_No 660.0 330.500000 190.669872 1.0 165.75 330.5 495.25 660.0
Customer Key 660.0 55141.443939 25627.772200 11265.0 33825.25 53874.5 77202.50 99843.0
Avg_Credit_Limit 660.0 34574.242424 37625.487804 3000.0 10000.00 18000.0 48000.00 200000.0
Total_Credit_Cards 660.0 4.706061 2.167835 1.0 3.00 5.0 6.00 10.0
Total_visits_bank 660.0 2.403030 1.631813 0.0 1.00 2.0 4.00 5.0
Total_visits_online 660.0 2.606061 2.935724 0.0 1.00 2.0 4.00 15.0
Total_calls_made 660.0 3.583333 2.865317 0.0 1.00 3.0 5.00 10.0
In [238]:
#getting rid of first two columns which is not releavant for clustering
custData=df.iloc[:,2:]
custData.head()
Out[238]:
Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made
0 100000 2 1 1 0
1 50000 3 0 10 9
2 50000 7 1 3 4
3 30000 5 1 1 4
4 100000 6 0 12 3
In [239]:
#Scaling the columns to same standards
custDataScaled=custData.apply(zscore)
custDataScaled.head(10)
Out[239]:
Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made
0 1.740187 -1.249225 -0.860451 -0.547490 -1.251537
1 0.410293 -0.787585 -1.473731 2.520519 1.891859
2 0.410293 1.058973 -0.860451 0.134290 0.145528
3 -0.121665 0.135694 -0.860451 -0.547490 0.145528
4 1.740187 0.597334 -1.473731 3.202298 -0.203739
5 -0.387644 -0.787585 -1.473731 -0.547490 1.542593
6 1.740187 0.135694 -1.473731 2.861408 -0.553005
7 -0.520633 -0.787585 -1.473731 -0.547490 -0.902271
8 -0.786612 -1.249225 -1.473731 -0.206600 -0.553005
9 -0.839808 -0.325946 -1.473731 -0.547490 1.193326

1. Univariate analysis (10 )

In [240]:
for i in custDataScaled.columns:
    sns.distplot(custDataScaled[i],hist=False,)
    plt.show()
In [241]:
sns.pairplot(custDataScaled,diag_kind='kde')
Out[241]:
<seaborn.axisgrid.PairGrid at 0x1ea21bf8f48>
In [242]:
custDataScaled.corr()
Out[242]:
Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made
Avg_Credit_Limit 1.000000 0.608860 -0.100312 0.551385 -0.414352
Total_Credit_Cards 0.608860 1.000000 0.315796 0.167758 -0.651251
Total_visits_bank -0.100312 0.315796 1.000000 -0.551861 -0.506016
Total_visits_online 0.551385 0.167758 -0.551861 1.000000 0.127299
Total_calls_made -0.414352 -0.651251 -0.506016 0.127299 1.000000

2.Insights about the graphs (5 )

From the distribution plots we can infer that there are multiple peaks in some of the independent attributes. We had applied scaling to reduce the effects of magnitude and units of data by applying zscore value. From the pairplot we dont see a clear correlation between independent variables. So we plan to keep all the independent variables as such

3. KMeans clustering , elbow plot and box plot (10 )

In [243]:
#Finding optimal no. of clusters

clusters=range(1,10)
meanDistortions=[]

for k in clusters:
    kmeans=KMeans(n_clusters=k)
    kmeans.fit(custDataScaled)
    prediction=kmeans.predict(custDataScaled)
    meanDistortions.append(sum(np.min(cdist(custDataScaled, kmeans.cluster_centers_, 'euclidean'), axis=1)) / custDataScaled.shape[0])


plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
Out[243]:
Text(0.5, 1.0, 'Selecting k with the Elbow Method')
In [244]:
# Let us first start with K = 3

final_kmeans=KMeans(3)
final_kmeans.fit(custDataScaled)
prediction=final_kmeans.predict(custDataScaled)

#Append the prediction
custDataScaled_df = custDataScaled
custDataScaled_df["GROUP"] = prediction
df["GROUP"] = prediction
print("Groups Assigned : \n")
df.head()
Groups Assigned : 

Out[244]:
Sl_No Customer Key Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made GROUP
0 1 87073 100000 2 1 1 0 0
1 2 38414 50000 3 0 10 9 1
2 3 17341 50000 7 1 3 4 0
3 4 40496 30000 5 1 1 4 0
4 5 47437 100000 6 0 12 3 2
In [245]:
customerDataClust_k = df.groupby(['GROUP'])
customerDataClust_k.mean()
Out[245]:
Sl_No Customer Key Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made
GROUP
0 417.528497 54881.329016 33782.383420 5.515544 3.489637 0.981865 2.000000
1 117.857143 55239.830357 12174.107143 2.410714 0.933036 3.553571 6.870536
2 611.280000 56708.760000 141040.000000 8.740000 0.600000 10.900000 1.080000
In [246]:
customerDataClust_k.count()
Out[246]:
Sl_No Customer Key Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made
GROUP
0 386 386 386 386 386 386 386
1 224 224 224 224 224 224 224
2 50 50 50 50 50 50 50
In [247]:
custDataScaled_df.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))
Out[247]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2694DD08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26CA4F48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26CFF548>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26D368C8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26D6AF08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26DA49C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26DDC7C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26E15508>]],
      dtype=object)

Using the box plot a value of =3 is giving clear seperation of clusters as seen from the graphs above.

4. Hierarchical clustering , dendogram and cophentic correlation (15 )

In [248]:
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
Z_ward = linkage(custDataScaled, metric='euclidean',method = 'ward')
dendrogram(Z_ward,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()
In [249]:
max_d =20
hward_clusters = fcluster(Z_ward, max_d, criterion='distance')
In [250]:
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
Z_average = linkage(custDataScaled, metric='euclidean',method = 'average')
dendrogram(Z_average,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()
In [251]:
max_d = 4
haverage_clusters = fcluster(Z_average, max_d, criterion='distance')
In [252]:
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
Z_complete = linkage(custDataScaled, metric='euclidean',method = 'complete')
dendrogram(Z_complete,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()
In [253]:
max_d = 5
hcomplete_clusters = fcluster(Z_complete, max_d, criterion='distance')

Final Agglomerative clustering used with average linking

In [254]:
model = AgglomerativeClustering(n_clusters= 3 , affinity='euclidean',  linkage='average')
model.fit(custDataScaled)
Out[254]:
AgglomerativeClustering(linkage='average', n_clusters=3)
In [255]:
df['GROUP'] = model.labels_
df.head(10)
Out[255]:
Sl_No Customer Key Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made GROUP
0 1 87073 100000 2 1 1 0 0
1 2 38414 50000 3 0 10 9 2
2 3 17341 50000 7 1 3 4 0
3 4 40496 30000 5 1 1 4 0
4 5 47437 100000 6 0 12 3 1
5 6 58634 20000 3 0 1 8 2
6 7 48370 100000 5 0 11 2 1
7 8 37376 15000 3 0 1 1 2
8 9 82490 5000 2 0 2 2 2
9 10 44770 3000 4 0 1 7 2
In [256]:
customerDataClust_h = df.groupby(['GROUP'])
customerDataClust_h.mean()
Out[256]:
Sl_No Customer Key Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made
GROUP
0 417.528497 54881.329016 33782.383420 5.515544 3.489637 0.981865 2.000000
1 611.280000 56708.760000 141040.000000 8.740000 0.600000 10.900000 1.080000
2 117.857143 55239.830357 12174.107143 2.410714 0.933036 3.553571 6.870536
In [257]:
customerDataClust_h.count()
Out[257]:
Sl_No Customer Key Avg_Credit_Limit Total_Credit_Cards Total_visits_bank Total_visits_online Total_calls_made
GROUP
0 386 386 386 386 386 386 386
1 50 50 50 50 50 50 50
2 224 224 224 224 224 224 224
In [258]:
custDataScaled_df = custDataScaled
custDataScaled_df["GROUP"] = model.labels_
custDataScaled_df.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))
Out[258]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001EA27283288>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA1C855908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2727C588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA272644C8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001EA271A5D08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2700CFC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2748AF08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2746BF48>]],
      dtype=object)

Cophenetic correlation values

In [259]:
#ward linkage
c_ward, coph_dists = cophenet(Z_ward , pdist(custDataScaled))

c_ward
Out[259]:
0.8499528168988276
In [260]:
#average linkage
c_average, coph_dists = cophenet(Z_average , pdist(custDataScaled))

c_average
Out[260]:
0.9129668555270531
In [261]:
#complete linkage
c_complete, coph_dists = cophenet(Z_complete , pdist(custDataScaled))

c_complete
Out[261]:
0.9013598521297431

5. Calculate average silhouette score for both methods (5 )

In [262]:
# Calculate Avg Silhoutte Score for KMeans clustering
silhouette_score(custDataScaled,final_kmeans.labels_)
Out[262]:
0.5908634951401681
In [263]:
# Calculate Avg Silhoutte Score for hierarchical clustering (ward)
silhouette_score(custDataScaled,hward_clusters)
Out[263]:
0.5908634951401681
In [264]:
# Calculate Avg Silhoutte Score for hierarchical clustering (average linking)
silhouette_score(custDataScaled,haverage_clusters)
Out[264]:
0.5323027880688744
In [265]:
# Calculate Avg Silhoutte Score for hierarchical clustering (complete linking)
silhouette_score(custDataScaled,hcomplete_clusters)
Out[265]:
0.5894229959403595

Silhouette Score is better when closer 1 and worse when closer to -1 here, it is not so great for both the cases.

6. Compare K-means clusters with Hierarchical clusters. (5 )

Compared to K-Means have got better silhouette scores when we compare with hierarchical clustering. We are getting equal number of records with three clusters (386,224,50) in both cases. The mean values of independent variables is also very similar when comparing the kmeans and hierarchical clustering techniques. Comparing the box plot gives similar notion.

7. Analysis the clusters formed, tell us how is one cluster different from another and answer all the key questions (10 )

I tried to find the elbow point with a elbow graph with k values between 1 to 9 and selected 3 as the k value. I am getting decent number for silhouette score around .59 which is not ideal (which should be close to one). Looking at boxplot we could clearly see that the independent variable groups are clearly differentiated in this grouping.

For hierarchical clustering single linkage i didnt try because proximity between two clusters cannot be good as there are outliers. I had tried to experiment with complete linkage, average and ward and from the cophenetic correlation and silhouette values average linking is the one which is giving good values. Finaly Agglometric clustering is used with 3 clusters with average linkage , because even though we have better silhouette score for ward linkage, we have better value for cophenetic correlation with average linkage.