简单的KNN算法实现

本文主要用于记录KNN算法实现过程,大部分均为代码

import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

数据集下载与读取

mnist数据集地址为 http://yann.lecun.com/exdb/mnist/

下载完成后解压得到四个二进制文件,需要进行预处理后再进行预测

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def readLabel(label_dir: str):
# 读取标签数据
with open(label_dir, 'rb') as f:
return np.fromfile(f,offset = 8, dtype="int8")


def readImages(image_dir: str):
# 读取图片数据
with open(image_dir, 'rb') as f:
magic_num = int.from_bytes(f.read(4), byteorder="big")
num = int.from_bytes(f.read(4), byteorder="big")
rows = int.from_bytes(f.read(4), byteorder="big")
columns = int.from_bytes(f.read(4), byteorder="big")
return np.fromfile(f, dtype="uint8").reshape([num, rows, columns])


def make_df(images: np.ndarray, labels: np.ndarray, norm = True) ->pd.DataFrame:
"""
将图片的二维矩阵展平,保存至DataFrame中
"""
assert images.shape[0] == labels.shape[0]
images = images.reshape([images.shape[0], -1])

if norm:
x_mean = images.mean(axis = 1).reshape(-1, 1)
x_std = images.std(axis = 1).reshape(-1, 1)
images = (images - x_mean) / x_std

df = pd.DataFrame(images)
df["label"] = labels
return df


def dataSplit(df: pd.DataFrame, frac:float = 0.9) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
划分训练集和验证集
"""
train = df.sample(frac=frac)
return train, df.drop(train.index.tolist(), axis=0)

train_labels = readLabel("./Datasets/mnist/train-labels.idx1-ubyte")
test_labels = readLabel("./Datasets/mnist/t10k-labels.idx1-ubyte")

train_images = readImages("./Datasets/mnist/train-images.idx3-ubyte")
test_images = readImages("./Datasets/mnist/t10k-images.idx3-ubyte")

train_df = make_df(train_images, train_labels)
# 使用dataSplite(df)划分验证集和训练集
# train_df, dev_df = dataSplit(train_df)
test_df = make_df(test_images, test_labels)

查看部分图片

num = 25
images: pd.DataFrame = pd.concat([train_df, test_df], axis=0).sample(n = num)
plt.figure(figsize=(10,10), facecolor="#FFFFFF")
for i in range(25):
plt.subplot(5,5,i + 1)
img = images.iloc[i, :-1].to_numpy().reshape([28,28])
plt.imshow(img, cmap="gray")
plt.xticks([])
plt.yticks([])
plt.xlabel(str(images.iloc[i, -1]))

samples

PCA降维测试

特征值均为复数,因此无法使用PCA进行聚类分析

eigen_w, eigen_v = np.linalg.eig(np.cov(test_df.iloc[:, :-1].T))
order = np.argsort(eigen_w)[::-1]
order_w = eigen_w[order]
order_v = eigen_v[:, order]
print("特征值 1:", order_w[0])
X = test_df.iloc[:, :-1].dot(order_v[:, :2])
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], s = 5, c = test_df.iloc[:, -1].tolist(), cmap="coolwarm")
plt.colorbar()
plt.show()

PCA

KNN聚类算法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def KNN(data: np.ndarray, train_data: np.ndarray, train_label: np.ndarray, k: int, mode: str = "E", batch = 512, train_batch = 512) -> np.ndarray:
assert len(data.shape) == 2
# 4batch -> 4
# 3batch + 1 -> 4
# 3batch -> 3
batch_num = (data.shape[0] - 1) // batch + 1
train_batch_num = (train_data.shape[0] - 1) // train_batch + 1

data = data[:, np.newaxis, :]
train_data = train_data[np.newaxis, :, :]

# 定义距离公式
if mode == "E":
distance = lambda x, train_x: np.sqrt(np.sum((np.square(x - train_x)), axis = -1))
elif mode == "M":
distance = lambda x, train_x: np.sum(np.abs(x - train_x), axis = -1)
else:
distance = lambda x, train_x: x

# outputs保存每个batch输出的结果[(batch, k)]
outputs = []
for i in tqdm.tqdm(range(batch_num)):
# 保存各train_batch输出的距离
distances = []
for j in range(train_batch_num):
distances.append(distance(
data[i * batch: (i + 1) * batch, :, :],
train_data[:, j * train_batch: (j + 1) * train_batch, :]))

order = np.argsort(np.concatenate(distances, axis = -1), axis = -1)[:, :k]
outputs.append(train_label[order])

# 将输出结果拼接为矩阵,(data.shape[0], k)
result = np.concatenate(outputs, axis = 0)
return np.array([np.argmax(np.bincount(x)) for x in result])
test_num = 200
train_data = train_df.iloc[:, :-1].to_numpy()
train_label = train_df.iloc[:, -1].to_numpy()
data = test_df.sample(test_num)
label = data["label"].to_numpy()
data = data[list(range(28*28))].to_numpy()


%time pred = np.sum(label == KNN(data, train_data, train_label, k = 5, batch=16))
print("{:.2%}".format(pred / test_num))
100%|██████████| 13/13 [00:44<00:00,  3.39s/it]
Wall time: 44.1 s
97.00%

KNN分类的大致思路就是如此了,自己写的算法在速度上还是有很大的提升空间的,如果要测试完全部10000个样本大概耗时为40min。