import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

import seaborn as sns
from sklearn.model_selection  import train_test_split
from sklearn.cluster import KMeans, AgglomerativeClustering 
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.stats import zscore
from scipy.spatial.distance import cdist, pdist
import sklearn.metrics


df = pd.read_excel("Credit Card Customer Data.xlsx")
df.dtypes

Sl_No                  int64
Customer Key           int64
Avg_Credit_Limit       int64
Total_Credit_Cards     int64
Total_visits_bank      int64
Total_visits_online    int64
Total_calls_made       int64
dtype: object

#examining first few rows
df.head()

#dataframe has 660 rows and 7 columns
df.shape

(660, 7)

# examining for null values
df.isna().sum()

Sl_No                  0
Customer Key           0
Avg_Credit_Limit       0
Total_Credit_Cards     0
Total_visits_bank      0
Total_visits_online    0
Total_calls_made       0
dtype: int64

#five point summary
df.describe().transpose()

#getting rid of first two columns which is not releavant for clustering
custData=df.iloc[:,2:]
custData.head()

#Scaling the columns to same standards
custDataScaled=custData.apply(zscore)
custDataScaled.head(10)

1. Univariate analysis (10 )¶

for i in custDataScaled.columns:
    sns.distplot(custDataScaled[i],hist=False,)
    plt.show()

sns.pairplot(custDataScaled,diag_kind='kde')

<seaborn.axisgrid.PairGrid at 0x1ea21bf8f48>

custDataScaled.corr()

2.Insights about the graphs (5 )¶

From the distribution plots we can infer that there are multiple peaks in some of the independent attributes. We had applied scaling to reduce the effects of magnitude and units of data by applying zscore value. From the pairplot we dont see a clear correlation between independent variables. So we plan to keep all the independent variables as such¶

3. KMeans clustering , elbow plot and box plot (10 )¶

#Finding optimal no. of clusters

clusters=range(1,10)
meanDistortions=[]

for k in clusters:
    kmeans=KMeans(n_clusters=k)
    kmeans.fit(custDataScaled)
    prediction=kmeans.predict(custDataScaled)
    meanDistortions.append(sum(np.min(cdist(custDataScaled, kmeans.cluster_centers_, 'euclidean'), axis=1)) / custDataScaled.shape[0])


plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')

Text(0.5, 1.0, 'Selecting k with the Elbow Method')

# Let us first start with K = 3

final_kmeans=KMeans(3)
final_kmeans.fit(custDataScaled)
prediction=final_kmeans.predict(custDataScaled)

#Append the prediction
custDataScaled_df = custDataScaled
custDataScaled_df["GROUP"] = prediction
df["GROUP"] = prediction
print("Groups Assigned : \n")
df.head()

Groups Assigned :

customerDataClust_k = df.groupby(['GROUP'])
customerDataClust_k.mean()

customerDataClust_k.count()

custDataScaled_df.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2694DD08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26CA4F48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26CFF548>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26D368C8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26D6AF08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26DA49C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26DDC7C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA26E15508>]],
      dtype=object)

Using the box plot a value of =3 is giving clear seperation of clusters as seen from the graphs above.¶

4. Hierarchical clustering , dendogram and cophentic correlation (15 )¶

plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
Z_ward = linkage(custDataScaled, metric='euclidean',method = 'ward')
dendrogram(Z_ward,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()

max_d =20
hward_clusters = fcluster(Z_ward, max_d, criterion='distance')

plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
Z_average = linkage(custDataScaled, metric='euclidean',method = 'average')
dendrogram(Z_average,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()

max_d = 4
haverage_clusters = fcluster(Z_average, max_d, criterion='distance')

plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
Z_complete = linkage(custDataScaled, metric='euclidean',method = 'complete')
dendrogram(Z_complete,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()

max_d = 5
hcomplete_clusters = fcluster(Z_complete, max_d, criterion='distance')

Final Agglomerative clustering used with average linking¶

model = AgglomerativeClustering(n_clusters= 3 , affinity='euclidean',  linkage='average')
model.fit(custDataScaled)

AgglomerativeClustering(linkage='average', n_clusters=3)

df['GROUP'] = model.labels_
df.head(10)

customerDataClust_h = df.groupby(['GROUP'])
customerDataClust_h.mean()

customerDataClust_h.count()

custDataScaled_df = custDataScaled
custDataScaled_df["GROUP"] = model.labels_
custDataScaled_df.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001EA27283288>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA1C855908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2727C588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA272644C8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001EA271A5D08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2700CFC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2748AF08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001EA2746BF48>]],
      dtype=object)

Cophenetic correlation values¶

#ward linkage
c_ward, coph_dists = cophenet(Z_ward , pdist(custDataScaled))

c_ward

0.8499528168988276

#average linkage
c_average, coph_dists = cophenet(Z_average , pdist(custDataScaled))

c_average

0.9129668555270531

#complete linkage
c_complete, coph_dists = cophenet(Z_complete , pdist(custDataScaled))

c_complete

0.9013598521297431

5. Calculate average silhouette score for both methods (5 )¶

# Calculate Avg Silhoutte Score for KMeans clustering
silhouette_score(custDataScaled,final_kmeans.labels_)

0.5908634951401681

# Calculate Avg Silhoutte Score for hierarchical clustering (ward)
silhouette_score(custDataScaled,hward_clusters)

0.5908634951401681

# Calculate Avg Silhoutte Score for hierarchical clustering (average linking)
silhouette_score(custDataScaled,haverage_clusters)

0.5323027880688744

# Calculate Avg Silhoutte Score for hierarchical clustering (complete linking)
silhouette_score(custDataScaled,hcomplete_clusters)

0.5894229959403595

Silhouette Score is better when closer 1 and worse when closer to -1 here, it is not so great for both the cases.¶

6. Compare K-means clusters with Hierarchical clusters. (5 )¶

Compared to K-Means have got better silhouette scores when we compare with hierarchical clustering. We are getting equal number of records with three clusters (386,224,50) in both cases. The mean values of independent variables is also very similar when comparing the kmeans and hierarchical clustering techniques. Comparing the box plot gives similar notion.

7. Analysis the clusters formed, tell us how is one cluster different from another and answer all the key questions (10 )¶

I tried to find the elbow point with a elbow graph with k values between 1 to 9 and selected 3 as the k value. I am getting decent number for silhouette score around .59 which is not ideal (which should be close to one). Looking at boxplot we could clearly see that the independent variable groups are clearly differentiated in this grouping.

For hierarchical clustering single linkage i didnt try because proximity between two clusters cannot be good as there are outliers. I had tried to experiment with complete linkage, average and ward and from the cophenetic correlation and silhouette values average linking is the one which is giving good values. Finaly Agglometric clustering is used with 3 clusters with average linkage , because even though we have better silhouette score for ward linkage, we have better value for cophenetic correlation with average linkage.

	Sl_No	Customer Key	Avg_Credit_Limit	Total_Credit_Cards	Total_visits_bank	Total_visits_online	Total_calls_made
0	1	87073	100000	2	1	1	0
1	2	38414	50000	3	0	10	9
2	3	17341	50000	7	1	3	4
3	4	40496	30000	5	1	1	4
4	5	47437	100000	6	0	12	3

	count	mean	std	min	25%	50%	75%	max
Sl_No	660.0	330.500000	190.669872	1.0	165.75	330.5	495.25	660.0
Customer Key	660.0	55141.443939	25627.772200	11265.0	33825.25	53874.5	77202.50	99843.0
Avg_Credit_Limit	660.0	34574.242424	37625.487804	3000.0	10000.00	18000.0	48000.00	200000.0
Total_Credit_Cards	660.0	4.706061	2.167835	1.0	3.00	5.0	6.00	10.0
Total_visits_bank	660.0	2.403030	1.631813	0.0	1.00	2.0	4.00	5.0
Total_visits_online	660.0	2.606061	2.935724	0.0	1.00	2.0	4.00	15.0
Total_calls_made	660.0	3.583333	2.865317	0.0	1.00	3.0	5.00	10.0

	Avg_Credit_Limit	Total_Credit_Cards	Total_visits_bank	Total_visits_online	Total_calls_made
0	1.740187	-1.249225	-0.860451	-0.547490	-1.251537
1	0.410293	-0.787585	-1.473731	2.520519	1.891859
2	0.410293	1.058973	-0.860451	0.134290	0.145528
3	-0.121665	0.135694	-0.860451	-0.547490	0.145528
4	1.740187	0.597334	-1.473731	3.202298	-0.203739
5	-0.387644	-0.787585	-1.473731	-0.547490	1.542593
6	1.740187	0.135694	-1.473731	2.861408	-0.553005
7	-0.520633	-0.787585	-1.473731	-0.547490	-0.902271
8	-0.786612	-1.249225	-1.473731	-0.206600	-0.553005
9	-0.839808	-0.325946	-1.473731	-0.547490	1.193326

	Avg_Credit_Limit	Total_Credit_Cards	Total_visits_bank	Total_visits_online	Total_calls_made
Avg_Credit_Limit	1.000000	0.608860	-0.100312	0.551385	-0.414352
Total_Credit_Cards	0.608860	1.000000	0.315796	0.167758	-0.651251
Total_visits_bank	-0.100312	0.315796	1.000000	-0.551861	-0.506016
Total_visits_online	0.551385	0.167758	-0.551861	1.000000	0.127299
Total_calls_made	-0.414352	-0.651251	-0.506016	0.127299	1.000000

	Sl_No	Customer Key	Avg_Credit_Limit	Total_Credit_Cards	Total_visits_bank	Total_visits_online	Total_calls_made
GROUP
0	417.528497	54881.329016	33782.383420	5.515544	3.489637	0.981865	2.000000
1	117.857143	55239.830357	12174.107143	2.410714	0.933036	3.553571	6.870536
2	611.280000	56708.760000	141040.000000	8.740000	0.600000	10.900000	1.080000

	Sl_No	Customer Key	Avg_Credit_Limit	Total_Credit_Cards	Total_visits_bank	Total_visits_online	Total_calls_made
GROUP
0	386	386	386	386	386	386	386
1	224	224	224	224	224	224	224
2	50	50	50	50	50	50	50

	Sl_No	Customer Key	Avg_Credit_Limit	Total_Credit_Cards	Total_visits_bank	Total_visits_online	Total_calls_made
GROUP
0	386	386	386	386	386	386	386
1	50	50	50	50	50	50	50
2	224	224	224	224	224	224	224