本文主要用于记录KNN算法实现过程,大部分均为代码
import tqdm import numpy as np import pandas as pd import matplotlib.pyplot as plt
|
数据集下载与读取
mnist数据集地址为 http://yann.lecun.com/exdb/mnist/
下载完成后解压得到四个二进制文件,需要进行预处理后再进行预测
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
| def readLabel(label_dir: str): with open(label_dir, 'rb') as f: return np.fromfile(f,offset = 8, dtype="int8")
def readImages(image_dir: str): with open(image_dir, 'rb') as f: magic_num = int.from_bytes(f.read(4), byteorder="big") num = int.from_bytes(f.read(4), byteorder="big") rows = int.from_bytes(f.read(4), byteorder="big") columns = int.from_bytes(f.read(4), byteorder="big") return np.fromfile(f, dtype="uint8").reshape([num, rows, columns])
def make_df(images: np.ndarray, labels: np.ndarray, norm = True) ->pd.DataFrame: """ 将图片的二维矩阵展平,保存至DataFrame中 """ assert images.shape[0] == labels.shape[0] images = images.reshape([images.shape[0], -1])
if norm: x_mean = images.mean(axis = 1).reshape(-1, 1) x_std = images.std(axis = 1).reshape(-1, 1) images = (images - x_mean) / x_std
df = pd.DataFrame(images) df["label"] = labels return df
def dataSplit(df: pd.DataFrame, frac:float = 0.9) -> Tuple[pd.DataFrame, pd.DataFrame]: """ 划分训练集和验证集 """ train = df.sample(frac=frac) return train, df.drop(train.index.tolist(), axis=0)
train_labels = readLabel("./Datasets/mnist/train-labels.idx1-ubyte") test_labels = readLabel("./Datasets/mnist/t10k-labels.idx1-ubyte")
train_images = readImages("./Datasets/mnist/train-images.idx3-ubyte") test_images = readImages("./Datasets/mnist/t10k-images.idx3-ubyte")
train_df = make_df(train_images, train_labels)
test_df = make_df(test_images, test_labels)
|
查看部分图片
num = 25 images: pd.DataFrame = pd.concat([train_df, test_df], axis=0).sample(n = num) plt.figure(figsize=(10,10), facecolor="#FFFFFF") for i in range(25): plt.subplot(5,5,i + 1) img = images.iloc[i, :-1].to_numpy().reshape([28,28]) plt.imshow(img, cmap="gray") plt.xticks([]) plt.yticks([]) plt.xlabel(str(images.iloc[i, -1]))
|
PCA降维测试
特征值均为复数,因此无法使用PCA进行聚类分析
eigen_w, eigen_v = np.linalg.eig(np.cov(test_df.iloc[:, :-1].T)) order = np.argsort(eigen_w)[::-1] order_w = eigen_w[order] order_v = eigen_v[:, order] print("特征值 1:", order_w[0])
|
X = test_df.iloc[:, :-1].dot(order_v[:, :2]) plt.scatter(X.iloc[:, 0], X.iloc[:, 1], s = 5, c = test_df.iloc[:, -1].tolist(), cmap="coolwarm") plt.colorbar() plt.show()
|
KNN聚类算法
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
| def KNN(data: np.ndarray, train_data: np.ndarray, train_label: np.ndarray, k: int, mode: str = "E", batch = 512, train_batch = 512) -> np.ndarray: assert len(data.shape) == 2 batch_num = (data.shape[0] - 1) // batch + 1 train_batch_num = (train_data.shape[0] - 1) // train_batch + 1
data = data[:, np.newaxis, :] train_data = train_data[np.newaxis, :, :]
if mode == "E": distance = lambda x, train_x: np.sqrt(np.sum((np.square(x - train_x)), axis = -1)) elif mode == "M": distance = lambda x, train_x: np.sum(np.abs(x - train_x), axis = -1) else: distance = lambda x, train_x: x
outputs = [] for i in tqdm.tqdm(range(batch_num)): distances = [] for j in range(train_batch_num): distances.append(distance( data[i * batch: (i + 1) * batch, :, :], train_data[:, j * train_batch: (j + 1) * train_batch, :]))
order = np.argsort(np.concatenate(distances, axis = -1), axis = -1)[:, :k] outputs.append(train_label[order])
result = np.concatenate(outputs, axis = 0) return np.array([np.argmax(np.bincount(x)) for x in result])
|
test_num = 200 train_data = train_df.iloc[:, :-1].to_numpy() train_label = train_df.iloc[:, -1].to_numpy() data = test_df.sample(test_num) label = data["label"].to_numpy() data = data[list(range(28*28))].to_numpy()
%time pred = np.sum(label == KNN(data, train_data, train_label, k = 5, batch=16)) print("{:.2%}".format(pred / test_num))
|
100%|██████████| 13/13 [00:44<00:00, 3.39s/it]
Wall time: 44.1 s
97.00%
KNN分类的大致思路就是如此了,自己写的算法在速度上还是有很大的提升空间的,如果要测试完全部10000个样本大概耗时为40min。