import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.stats import zscore
from scipy.spatial.distance import cdist, pdist
import sklearn.metrics
# Load the credit-card customer dataset from the Excel workbook.
df = pd.read_excel("Credit Card Customer Data.xlsx")
# Inspect the dtype of each column.
df.dtypes
#examining first few rows
df.head()
#dataframe has 660 rows and 7 columns
df.shape
# examining for null values
df.isna().sum()
#five point summary
df.describe().transpose()
#getting rid of first two columns which are not relevant for clustering
# (presumably identifier-style columns -- verify against the sheet header)
custData=df.iloc[:,2:]
custData.head()
#Scaling the columns to same standards
# z-score standardisation: each feature gets mean 0 and unit variance, so
# no single column dominates the euclidean distances used by clustering.
custDataScaled=custData.apply(zscore)
custDataScaled.head(10)
# Per-feature density curve of the scaled data.
# FIX: sns.distplot(..., hist=False) was deprecated in seaborn 0.11 and later
# removed; sns.kdeplot is the documented replacement and draws the same
# KDE-only curve.
for i in custDataScaled.columns:
    sns.kdeplot(custDataScaled[i])
    plt.show()
# Pairwise scatter plots, with a KDE on the diagonal.
sns.pairplot(custDataScaled,diag_kind='kde')
# Correlation matrix of the scaled features.
custDataScaled.corr()
#Finding optimal no. of clusters with the elbow method:
# fit KMeans for k = 1..9 and record the mean distance from each sample to
# its nearest centroid (average distortion).
clusters=range(1,10)
meanDistortions=[]
for k in clusters:
    # n_init pinned explicitly (the sklearn default changed in newer
    # releases) and random_state fixed so the elbow plot is reproducible.
    kmeans=KMeans(n_clusters=k, n_init=10, random_state=1)
    kmeans.fit(custDataScaled)
    # Average distance of every point to its closest centroid.
    # (equivalent to the original sum(...)/n, without the unused predict())
    meanDistortions.append(
        np.min(cdist(custDataScaled, kmeans.cluster_centers_, 'euclidean'), axis=1).mean()
    )
plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
# Let us first start with K = 3 (picked from the elbow plot above).
final_kmeans=KMeans(n_clusters=3, n_init=10, random_state=1)
final_kmeans.fit(custDataScaled)
prediction=final_kmeans.predict(custDataScaled)
#Append the prediction
# BUG FIX: the original did `custDataScaled_df = custDataScaled`, which is
# an alias, not a copy -- adding the GROUP column then mutated
# custDataScaled itself, so later hierarchical clustering and silhouette
# scores silently included the cluster label as a feature.
custDataScaled_df = custDataScaled.copy()
custDataScaled_df["GROUP"] = prediction
df["GROUP"] = prediction
print("Groups Assigned : \n")
df.head()
# Per-cluster means and sizes on the original (unscaled) data.
customerDataClust_k = df.groupby(['GROUP'])
customerDataClust_k.mean()
customerDataClust_k.count()
# Box plots of the scaled features split by assigned cluster.
custDataScaled_df.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))
# Hierarchical clustering with Ward linkage: build the linkage matrix,
# draw a truncated dendrogram, then cut the tree to get flat clusters.
Z_ward = linkage(custDataScaled, method='ward', metric='euclidean')
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
# Only the top 5 merge levels are shown (truncate_mode='level').
dendrogram(
    Z_ward,
    truncate_mode='level',
    p=5,
    color_threshold=52,
    leaf_rotation=90.0,
    leaf_font_size=10,
)
plt.tight_layout()
# Cut the Ward tree at height 20 to obtain flat cluster labels.
cut_height_ward = 20
hward_clusters = fcluster(Z_ward, cut_height_ward, criterion='distance')
# Hierarchical clustering with average linkage, visualised the same way.
Z_average = linkage(custDataScaled, method='average', metric='euclidean')
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
# Shared dendrogram options: truncate to the top 5 merge levels.
dendro_opts = dict(
    leaf_rotation=90.0,
    p=5,
    color_threshold=52,
    leaf_font_size=10,
    truncate_mode='level',
)
dendrogram(Z_average, **dendro_opts)
plt.tight_layout()
# Flat clusters: cut the average-linkage tree at distance 4.
haverage_clusters = fcluster(Z_average, 4, criterion='distance')
# Hierarchical clustering with complete linkage.
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
Z_complete = linkage(custDataScaled, metric='euclidean', method='complete')
# Truncated dendrogram: only the top 5 merge levels are drawn.
dendrogram(Z_complete, truncate_mode='level', p=5,
           leaf_rotation=90.0, leaf_font_size=10, color_threshold=52)
plt.tight_layout()
# Flat clusters from cutting the complete-linkage tree at height 5.
threshold_complete = 5
hcomplete_clusters = fcluster(Z_complete, threshold_complete, criterion='distance')
# Final hierarchical model: agglomerative clustering with 3 clusters and
# average linkage (justified below via cophenetic correlation).
# FIX: the `affinity='euclidean'` argument was deprecated and then removed
# in recent scikit-learn; euclidean is the default distance, so dropping
# the argument is compatible with every sklearn version.
model = AgglomerativeClustering(n_clusters=3, linkage='average')
model.fit(custDataScaled)
df['GROUP'] = model.labels_
df.head(10)
# Per-cluster means and sizes on the original data.
customerDataClust_h = df.groupby(['GROUP'])
customerDataClust_h.mean()
customerDataClust_h.count()
# BUG FIX: copy() instead of aliasing, so adding the GROUP column does not
# mutate custDataScaled (the alias previously leaked the label column into
# the scaled feature matrix used by other cells).
custDataScaled_df = custDataScaled.copy()
custDataScaled_df["GROUP"] = model.labels_
custDataScaled_df.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))
# Cophenetic correlation: how faithfully each linkage preserves the
# original pairwise distances (closer to 1 is better).
# PERF: the condensed distance matrix is O(n^2); compute it once and reuse
# it instead of calling pdist three times as the original did.
pairwise_dists = pdist(custDataScaled)
#ward linkage
c_ward, coph_dists = cophenet(Z_ward, pairwise_dists)
c_ward
#average linkage
c_average, coph_dists = cophenet(Z_average, pairwise_dists)
c_average
#complete linkage
c_complete, coph_dists = cophenet(Z_complete, pairwise_dists)
c_complete
# Compare clustering quality across methods using the mean silhouette
# coefficient (range -1..1; higher means tighter, better-separated clusters).
# NOTE(review): earlier cells assigned a "GROUP" column through an alias of
# custDataScaled; if that mutation happened, these scores include the label
# column as a feature -- verify before trusting the numbers.
# Calculate Avg Silhoutte Score for KMeans clustering
silhouette_score(custDataScaled,final_kmeans.labels_)
# Calculate Avg Silhoutte Score for hierarchical clustering (ward)
silhouette_score(custDataScaled,hward_clusters)
# Calculate Avg Silhoutte Score for hierarchical clustering (average linking)
silhouette_score(custDataScaled,haverage_clusters)
# Calculate Avg Silhoutte Score for hierarchical clustering (complete linking)
silhouette_score(custDataScaled,hcomplete_clusters)
K-Means gives better silhouette scores than hierarchical clustering. Both techniques produce clusters with the same record counts (386, 224 and 50), the cluster-wise means of the independent variables are very similar between the two approaches, and the box plots convey the same picture.
I searched for the elbow point using an elbow graph with k values from 1 to 9 and selected k = 3. The resulting silhouette score of about 0.59 is decent but not ideal (it should be close to 1). Looking at the box plots, we can clearly see that the independent variables are well differentiated across the groups.
For hierarchical clustering I did not try single linkage, because with outliers present the proximity between two clusters cannot be measured well. I experimented with complete, average and Ward linkage; judging by the cophenetic correlation and silhouette values, average linkage gives the best results. Finally, agglomerative clustering with 3 clusters and average linkage was chosen: even though Ward linkage has a better silhouette score, average linkage has a better cophenetic correlation.