Помогите написать метод множественной классификации k-nearest neighbours python (KNN)
У меня тренировочная задача реализовать классификатор на основе метода K-ближайших соседей. Мы применим его к задачам бинарной классификации (два класса) и многоклассовой классификации (несколько классов). Гиперпараметр (количество соседей) необходимо выбрать на основе кросс-валидации.
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as io
from pathlib import Path
import requests
from tqdm import tqdm
class SVHNLoader:
    """Downloads and loads the SVHN (Street View House Numbers) dataset.

    Images are 32x32 RGB digits; labels are digits 0-9 (SVHN stores digit
    0 as class 10, which is remapped to 0 on load).
    """

    def __init__(self, download=False):
        # download: if True, fetch both .mat files into the current directory.
        if download:
            train_url = "http://ufldl.stanford.edu/housenumbers/train_32x32.mat"
            test_url = "http://ufldl.stanford.edu/housenumbers/test_32x32.mat"
            for url in (train_url, test_url):
                self.download_data(url, verbose=True)

    def get_data(self, max_train=1000, max_test=100):
        """Return (train_X, train_y, test_X, test_y) subsampled to the given sizes."""
        train_X, train_y = self.load_data_mat("train_32x32.mat", max_train)
        test_X, test_y = self.load_data_mat("test_32x32.mat", max_test)
        return train_X, train_y, test_X, test_y

    @staticmethod
    def download_data(url, verbose=False):
        """Stream the file at `url` to the current directory.

        Returns:
            str: the local filename (last path component of the URL).
        """
        filename = url.split('/')[-1]
        if verbose:
            # Bug fix: original printed the literal "(unknown)" instead of the
            # filename, and ignored the `verbose` flag entirely.
            print(f"Downloading {filename}")
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in tqdm(r.iter_content(chunk_size=8192)):
                    f.write(chunk)
        return filename

    def load_data_mat(self, filename, max_samples, seed=42):
        '''
        Loads numpy arrays from .mat file

        Returns:
        X, np array (num_samples, 32, 32, 3) - images
        y, np array of int (num_samples) - labels
        '''
        raw = io.loadmat(filename)
        X = raw['X']  # Array of [32, 32, 3, n_samples]
        y = raw['y']  # Array of [n_samples, 1]
        X = np.moveaxis(X, [3], [0])  # -> (n_samples, 32, 32, 3)
        y = y.flatten()
        # SVHN encodes digit 0 as class 10; remap so labels are 0..9.
        y[y == 10] = 0
        # Fixed seed keeps the random subsample reproducible across runs.
        np.random.seed(seed)
        samples = np.random.choice(np.arange(X.shape[0]), max_samples, replace=False)
        return X[samples].astype(np.float32), y[samples]
Загрузка данных и визуализация:
svhn = SVHNLoader(download=True)
train_X, train_y, test_X, test_y = svhn.get_data()

# Show a samples_per_class x 10 grid with examples of every digit class.
samples_per_class = 5  # Number of samples per class to visualize
num_classes = 10
plot_index = 1
for example_index in range(samples_per_class):
    for class_index in range(num_classes):
        # Bug fix: grid dimensions were hard-coded (5, 10) and would break
        # if samples_per_class changed; derive them from the variables.
        plt.subplot(samples_per_class, num_classes, plot_index)
        # example_index-th training image belonging to this class.
        image = train_X[train_y == class_index][example_index]
        plt.imshow(image.astype(np.uint8))
        plt.axis('off')
        plot_index += 1
# Bug fix: without show() nothing is rendered when run as a plain script.
plt.show()
Подготовка меток и исходных данных:
# Restrict the task to classes 0 and 9 only; the boolean target is
# True for class 0 and False for class 9.
binary_train_mask = np.isin(train_y, (0, 9))
binary_train_X = train_X[binary_train_mask]
binary_train_y = train_y[binary_train_mask] == 0

binary_test_mask = np.isin(test_y, (0, 9))
binary_test_X = test_X[binary_test_mask]
binary_test_y = test_y[binary_test_mask] == 0

# Flatten each 32x32x3 image into a single feature vector [num_samples, 32*32*3].
binary_train_X = binary_train_X.reshape(len(binary_train_X), -1)
binary_test_X = binary_test_X.reshape(len(binary_test_X), -1)
И дальше самое интересное: написать класс для бинарной и многоклассовой классификации без применения sklearn.
Начало:
class KNN:
    """
    K-nearest-neighbor classifier with the L1 (Manhattan) norm.

    A lazy learner: fit() only memorizes the training data; all work
    happens in predict().
    """

    def __init__(self, k=1):
        # k: number of nearest neighbors that vote on the predicted class.
        self.k = k

    def fit(self, X, y):
        """Memorize training samples X and labels y."""
        self.train_X = X
        self.train_y = y

    def predict(self, X, num_loops=0):
        '''
        Uses the KNN model to predict classes for the data samples provided

        Arguments:
        X, np array (num_samples, num_features) - samples to run
        through the model
        num_loops, int - which implementation to use

        Returns:
        predictions, np array of ints (num_samples) - predicted class
        for each sample
        '''
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        else:
            dists = self.compute_distances_two_loops(X)

        # Boolean labels mean the binary 0-vs-9 task; otherwise multiclass.
        if self.train_y.dtype == bool:
            return self.predict_labels_binary(dists)
        return self.predict_labels_multiclass(dists)

    def compute_distances_two_loops(self, X):
        '''
        Computes L1 distance from every sample of X to every training sample
        Uses simplest implementation with 2 Python loops

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run

        Returns:
        dists, np array (num_test_samples, num_train_samples) - array
        with distances between each test and each train sample
        '''
        num_train = self.train_X.shape[0]
        num_test = X.shape[0]
        dists = np.zeros((num_test, num_train), np.float32)
        for i_test in range(num_test):
            for i_train in range(num_train):
                # Bug fix: original referenced self.X_train (fit stores
                # train_X) and computed the L2 norm although the class is
                # documented - and checked by the assert below - as L1.
                dists[i_test, i_train] = np.sum(
                    np.abs(self.train_X[i_train] - X[i_test]))
        return dists

    def compute_distances_one_loop(self, X):
        '''
        Computes L1 distance from every sample of X to every training sample
        Vectorizes some of the calculations, so only 1 loop is used

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run

        Returns:
        dists, np array (num_test_samples, num_train_samples) - array
        with distances between each test and each train sample
        '''
        num_train = self.train_X.shape[0]
        num_test = X.shape[0]
        dists = np.zeros((num_test, num_train), np.float32)
        for i_test in range(num_test):
            # Broadcasts X[i_test] against every training row at once.
            dists[i_test] = np.sum(np.abs(self.train_X - X[i_test]), axis=1)
        return dists

    def compute_distances_no_loops(self, X):
        '''
        Computes L1 distance from every sample of X to every training sample
        Fully vectorizes the calculations using numpy

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run

        Returns:
        dists, np array (num_test_samples, num_train_samples) - array
        with distances between each test and each train sample
        '''
        # Bug fix: the (a+b)^2 - 2ab expansion used here before only works
        # for squared-L2 distance.  For L1 we broadcast to a
        # (num_test, num_train, num_features) difference cube and reduce.
        # Note: this uses O(num_test * num_train * num_features) memory.
        dists = np.abs(X[:, np.newaxis, :] - self.train_X[np.newaxis, :, :]).sum(axis=2)
        # float32 to save memory - the default is float64.
        return dists.astype(np.float32)

    def predict_labels_binary(self, dists):
        '''
        Returns model predictions for binary classification case

        Arguments:
        dists, np array (num_test_samples, num_train_samples) - array
        with distances between each test and each train sample

        Returns:
        pred, np array of bool (num_test_samples) - binary predictions
        for every test sample
        '''
        num_test = dists.shape[0]
        pred = np.zeros(num_test, bool)
        for i in range(num_test):
            # Bug fix: original used the undefined name `k` (should be
            # self.k) and self.y_train (fit stores train_y).
            nearest = np.argsort(dists[i])[:self.k]
            closest_y = self.train_y[nearest]
            # Majority vote; a tie resolves to False, matching
            # np.argmax(np.bincount(...)) which picks the lower index.
            pred[i] = 2 * np.count_nonzero(closest_y) > len(closest_y)
        return pred
Не понимаю как описать метод для многоклассовой классификации. С ЭТИМ МЕТОДОМ ПРОСЬБА ПОМОЧЬ:
def predict_labels_multiclass(self, dists):
    '''
    Returns model predictions for multi-class classification case

    Arguments:
    dists, np array (num_test_samples, num_train_samples) - array
    with distances between each test and each train sample

    Returns:
    pred, np array of int (num_test_samples) - predicted class index
    for every test sample
    '''
    # Bug fixes: duplicated num_test line removed; np.int was deprecated
    # (removed in NumPy 1.24) - use the builtin int; the TODO body is
    # implemented as a k-nearest majority vote.
    num_test = dists.shape[0]
    pred = np.zeros(num_test, int)
    for i in range(num_test):
        # Indices of the k nearest training samples for test sample i.
        nearest = np.argsort(dists[i])[:self.k]
        closest_y = self.train_y[nearest].astype(int)
        # Majority vote; ties resolve to the smallest class index.
        pred[i] = np.argmax(np.bincount(closest_y))
    return pred
Дальше идет создание классификатора:
# Build a 1-nearest-neighbor classifier and memorize the binary training set.
knn_classifier = KNN(k=1)
knn_classifier.fit(binary_train_X, binary_train_y)
И проверка
# Sanity check: entry (0, 10) of the distance matrix must equal the L1
# distance computed directly between test sample 0 and train sample 10.
dists = knn_classifier.compute_distances_two_loops(binary_test_X)
assert np.isclose(dists[0, 10], np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))
Вообще есть сложности с пониманием того, как реализуется этот класс KNN. Просьба указать, есть ли ошибки; может, его вообще иначе надо описывать.
Основа для текущего решения выполнена с помощью источника https://zhuanlan.zhihu.com/p/34193653