Handwritten Digit Recognition with k-Means Clustering

May 5, 2022

Intro

Dataset: data/optdigits.csv (source: UCI Machine Learning Repository)

Multiclass
Image

Import packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image

from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

Read dataset

# Load the digits data. The CSV has no header row, so column names are
# assigned explicitly below.
df = pd.read_csv("data/optdigits.csv", header=None)

# 64 feature columns p0..p63 (reshaped to 8x8 images below) plus the target.
df.columns = ['p'+str(i) for i in range(64)]+['label']

# plot one sample
# Use .iloc for positional access: the row is indexed by column *names*, so a
# bare integer key such as row[-1] relies on a deprecated positional fallback.
first_row = df.iloc[0]
sample = first_row.iloc[:-1].to_numpy().reshape(8,8)
print("This is",first_row.iloc[-1])
plt.imshow(sample)

This is 0

<matplotlib.image.AxesImage at 0x7f77ae97b850>

Train

# Split the rows 80/20 into training and held-out sets; the fixed seed makes
# the split repeatable.
train, test = train_test_split(df, test_size=0.2, train_size=0.8, random_state=6)

# Separate the target column from the pixel features.
y = train['label']
X = train.drop(columns=['label'])

# Fit the normalizer on the training features and keep it around so the same
# transformer can be applied to the test features later.
normalizer = Normalizer()
normalizer.fit(X)
X = normalizer.transform(X)

# Cluster the normalized training samples into 10 groups (labels are not used
# for fitting).
model = KMeans(random_state=43, n_clusters=10)
model.fit(X)

KMeans(n_clusters=10, random_state=43)

Plot result

fig = plt.figure(figsize=(8, 3))
fig.suptitle('Cluster Center Images', fontsize=14, fontweight='bold')

# Show every cluster center as an 8x8 image in a 2x5 grid.
# Subplot positions are 1-based, hence the "1 + i".
for i, center in enumerate(model.cluster_centers_):
    ax = fig.add_subplot(2, 5, 1 + i)
    ax.imshow(center.reshape((8, 8)), cmap=plt.cm.binary)

plt.show()

# model_label[c] is the digit assigned to cluster index c.
# NOTE(review): these values are hard-coded -- presumably chosen by looking at
# the center images above. Cluster numbering depends on the KMeans seed and
# initialisation, so re-check this mapping whenever the model is refit.
model_label=[2, 0, 5, 8, 4, 7, 3, 1, 6, 9]

Test result

# Work on an explicit copy: `test` comes out of train_test_split, and adding a
# column to such a derived frame can raise pandas' SettingWithCopyWarning.
test = test.copy()

# Features are every column except the target. 'pred' is also dropped (when
# present) so that re-running this cell does not feed old predictions back in
# as a 65th feature.
X = test.drop(columns=['label', 'pred'], errors='ignore')
y = test.label

# normalize with the transformer that was fitted on the training set
X =  normalizer.transform(X)

# get prediction: cluster index per sample, then translate each cluster index
# into its digit through the model_label lookup table (vectorized indexing).
y_pred = model.predict(X)
test['pred'] = np.asarray(model_label)[y_pred]


# Score
print('Accuracy:',accuracy_score(y, test.pred))

Accuracy: 0.7947712418300653

# elbow method: fit KMeans for k = 1..19 and record the inertia of each fit,
# then plot inertia against k to look for the "elbow".

# The training features do not depend on k, so prepare them once instead of
# rebuilding them on every iteration.
X = normalizer.transform(train.drop(columns=['label']))

k_clusters = list(range(1, 20))
inertia = []

for k in k_clusters:
    # Same seed for every k so the curve is reproducible.
    model_temp = KMeans(n_clusters=k, random_state=43)
    model_temp.fit(X)
    inertia.append(model_temp.inertia_)

# draw
plt.rcParams["figure.figsize"] = [8, 6]

plt.plot(k_clusters, inertia, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.show()

# Error analysis
plt.rcParams["figure.figsize"] = [16, 30]

# Pick the pixel columns by name rather than by position ([:-2]), so the
# reshape below does not depend on 'label' and 'pred' being the last columns.
pixel_cols = [c for c in test.columns if c not in ('label', 'pred')]

# One row of 5 plots per digit i:
#   cluster center | correct example | correct example | wrong example | wrong example
# Slots without a matching example are simply left empty.
for i in range(0,10):

    # Center of the cluster mapped to digit i (the last one if several match).
    clusters = [j for j, digit in enumerate(model_label) if digit == i]
    if clusters:
        data = model.cluster_centers_[clusters[-1]].reshape((8, 8))
        plt.subplot(10,5,i*5+1)
        plt.imshow(data)

    # Split the test samples of digit i into correctly / wrongly predicted.
    is_digit = test.label == i
    t = test[is_digit & (test.pred == i)]
    f = test[is_digit & (test.pred != i)]

    # Guard against a digit that never occurs in the test set.
    total = len(t) + len(f)
    acc = len(t)/total if total else float('nan')
    print(i,'digit accuracy:',acc)

    # Up to two correct examples go to slots 2-3, up to two wrong ones to
    # slots 4-5; min() avoids indexing past the rows that actually exist.
    for first_slot, frame in ((2, t), (4, f)):
        for offset in range(min(2, len(frame))):
            sample = frame[pixel_cols].iloc[offset].to_numpy().reshape(8,8)
            plt.subplot(10,5,i*5+first_slot+offset)
            plt.imshow(sample)

0 digit accuracy: 0.9759036144578314
1 digit accuracy: 0.2987012987012987
2 digit accuracy: 0.8714285714285714
3 digit accuracy: 0.9066666666666666
4 digit accuracy: 0.8051948051948052
5 digit accuracy: 0.7532467532467533
6 digit accuracy: 0.9863013698630136
7 digit accuracy: 0.9846153846153847
8 digit accuracy: 0.7469879518072289
9 digit accuracy: 0.6705882352941176

Share on

Twitter Facebook LinkedIn

Handwritten Digit Recognition with k-Means Clustering

Intro

Import packages

Read dataset

Train

Plot result

Test result

Share on

Leave a comment

You may also enjoy

Neural Style Transfer from BTS to BTS

Web crawling with Python and BeautifulSoup (and a little HTML)

Text preprocessing with Python NLTK package

Dancer’s Business