# ==============================================
# 1. Import libraries
# ==============================================
import pandas as pd
import numpy as np
from kmodes.kmodes import KModes
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.preprocessing import LabelEncoder

# ==============================================
# 2. Generate Dummy Genetic Mutation Data
# ==============================================
# Seed NumPy's global RNG so the synthetic dataset is reproducible run to run.
np.random.seed(42)
n_samples = 200

# Category vocabularies each synthetic column is drawn from.
genes = ['BRCA1', 'BRCA2', 'TP53', 'EGFR', 'KRAS']
mutations = ['missense', 'nonsense', 'frameshift', 'silent']
chromosomes = ['chr1', 'chr7', 'chr12', 'chr17']
zygosity = ['heterozygous', 'homozygous']

# Every column is sampled independently with np.random.choice (uniform by
# default, with replacement). The draw order below determines the values
# produced for a given seed, so keep it stable.
data = {
    'Gene': np.random.choice(genes, n_samples),
    'Mutation_Type': np.random.choice(mutations, n_samples),
    'Chromosome': np.random.choice(chromosomes, n_samples),
    'Zygosity': np.random.choice(zygosity, n_samples),
    'Variant_Presence': np.random.choice(['present', 'absent'], n_samples),
}

df = pd.DataFrame(data)
print("Sample Data:")
print(df.head())

# ==============================================
# 3. Encode categorical data numerically
# ==============================================
# Integer-code each column on its own: the encoder is (re)fitted per column,
# so the codes are only meaningful within a column, not across columns.
df_encoded = df.apply(lambda column: LabelEncoder().fit_transform(column))

# ==============================================
# 4. Function for silhouette using Hamming distance
# ==============================================
def categorical_silhouette_score(X, labels):
    """Return the silhouette score of a clustering under Hamming distance.

    Args:
        X: Integer-encoded categorical observations, one row per sample
            (anything ``scipy.spatial.distance.pdist`` accepts).
        labels: Cluster label for each row of ``X``.

    Returns:
        The value of ``silhouette_score`` computed on the precomputed
        pairwise Hamming distance matrix.
    """
    # pdist returns the condensed (upper-triangle) form; silhouette_score with
    # metric='precomputed' needs the full square matrix, hence squareform.
    condensed = pdist(X, metric='hamming')
    return silhouette_score(squareform(condensed), labels, metric='precomputed')

# ==============================================
# 5. K-Modes Clustering
# ==============================================
# K-Modes with Huang initialisation, 3 clusters, 5 restarts, fixed seed.
km = KModes(n_clusters=3, init='Huang', n_init=5, verbose=0, random_state=42)
# Fit on the feature columns only. df_encoded was built before any
# cluster-label column was added to df, so its column names identify exactly
# the features; this keeps the fit unaffected if label columns already exist
# on df (e.g. when this section is re-run).
km_labels = km.fit_predict(df[df_encoded.columns])
km_silhouette = categorical_silhouette_score(df_encoded, km_labels)

# Add cluster membership to dataframe
df['KModes_Cluster'] = km_labels

print(f"\nK-Modes Silhouette Score: {km_silhouette:.3f}")

# ==============================================
# 6. Hierarchical Clustering (Hamming distance)
# ==============================================
# Condensed pairwise Hamming distances between the integer-encoded rows.
dist_matrix = pdist(df_encoded, metric='hamming')
# Average-linkage agglomerative clustering on those distances.
linkage_matrix = linkage(dist_matrix, method='average')
# Cut the tree with criterion='maxclust' and t=3 (at most 3 flat clusters).
hier_labels = fcluster(linkage_matrix, t=3, criterion='maxclust')
hier_silhouette = categorical_silhouette_score(df_encoded, hier_labels)

# Add cluster membership
df['Hierarchical_Cluster'] = hier_labels

print(f"Hierarchical Clustering Silhouette Score: {hier_silhouette:.3f}")

# ==============================================
# 7. Latent Class Analysis (proxy via K-Modes Cao init)
# ==============================================
# NOTE: this is K-Modes with Cao initialisation used as a stand-in for latent
# class analysis, not an actual LCA model.
# NOTE(review): Cao initialisation is described as deterministic, so
# n_init=10 probably brings no benefit -- confirm against the kmodes docs.
km_lca = KModes(n_clusters=3, init='Cao', n_init=10, verbose=0, random_state=42)
# Fit on the feature columns only. By this point df also carries the
# 'KModes_Cluster' and 'Hierarchical_Cluster' label columns; passing the whole
# frame would make this model cluster on the other methods' labels.
# df_encoded was built before those columns were added, so its column names
# identify exactly the original features.
lca_labels = km_lca.fit_predict(df[df_encoded.columns])
lca_silhouette = categorical_silhouette_score(df_encoded, lca_labels)

# Add cluster membership
df['LCA_Cluster'] = lca_labels

print(f"LCA (proxy via K-Modes Cao init) Silhouette Score: {lca_silhouette:.3f}")

# ==============================================
# 8. Compare and display results
# ==============================================
# One row per method, ordered from highest to lowest silhouette score.
results = pd.DataFrame({
    'Method': ['K-Modes (Huang)', 'Hierarchical (Hamming)', 'LCA (Cao init)'],
    'Silhouette_Score': [km_silhouette, hier_silhouette, lca_silhouette],
}).sort_values(by='Silhouette_Score', ascending=False)

print("\n=== Cluster Validation Results ===")
print(results)

# After the descending sort, the first row is the top-scoring method.
best_method = results.iloc[0]['Method']
print(f"\n🏆 Best clustering method: {best_method}")

# ==============================================
# 9. Display dataframe with cluster memberships
# ==============================================
# Show the first rows of df, including any cluster-label columns added above.
print("\nSample data with cluster memberships:")
print(df.head())

Print Friendly, PDF & Email
categorical clustering

Venugopal Manneni


I hold a doctorate in statistics from Osmania University. I have been working in the fields of analytics and research for the last 15 years. My expertise is in architecting solutions for data-driven problems using statistical methods, machine learning and deep learning algorithms for both structured and unstructured data. I have also published papers in these fields. I love to play cricket and badminton.


Post navigation