Помогите написать метод множественной классификации k-nearest neighbours python (KNN)

У меня тренировочная задача реализовать классификатор на основе метода K-ближайших соседей. Мы применим его к задачам бинарной классификации (два класса) и многоклассовой классификации (несколько классов). Гиперпараметр (количество соседей) необходимо выбрать на основе кросс-валидации.

import numpy as np
import matplotlib.pyplot as plt

import scipy.io as io
from pathlib import Path
import requests
from tqdm import tqdm


class SVHNLoader():
    def __init__(self, download=False):
        if download:
            train_url = "http://ufldl.stanford.edu/housenumbers/train_32x32.mat"
            test_url = "http://ufldl.stanford.edu/housenumbers/test_32x32.mat"
            for url in (train_url, test_url):
                self.download_data(url, verbose=True)
        
    def get_data(self, max_train=1000, max_test=100):
        train_X, train_y = self.load_data_mat("train_32x32.mat", max_train)
        test_X, test_y = self.load_data_mat("test_32x32.mat", max_test)
        return train_X, train_y, test_X, test_y

    @staticmethod
    def download_data(url, verbose=False):
        filename = url.split('/')[-1]
        print(f"Downloading {filename}")
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in tqdm(r.iter_content(chunk_size=8192)):
                    f.write(chunk)
        return filename

    def load_data_mat(self, filename, max_samples, seed=42):
        '''
        Loads numpy arrays from .mat file

        Returns:
        X, np array (num_samples, 32, 32, 3) - images
        y, np array of int (num_samples) - labels
        '''
        raw = io.loadmat(filename)
        X = raw['X']  # Array of [32, 32, 3, n_samples]
        y = raw['y']  # Array of [n_samples, 1]
        X = np.moveaxis(X, [3], [0])
        y = y.flatten()

        # Fix up class 0 to be 0
        y[y == 10] = 0

        np.random.seed(seed)
        samples = np.random.choice(np.arange(X.shape[0]), max_samples, replace=False)
        return X[samples].astype(np.float32), y[samples]

Загрузка данных и визуализация:

svhn = SVHNLoader(download=True)
train_X, train_y, test_X, test_y = svhn.get_data()

samples_per_class = 5  # Number of samples per class to visualize
plot_index = 1
for example_index in range(samples_per_class):
    for class_index in range(10):
        plt.subplot(5, 10, plot_index)
        image = train_X[train_y == class_index][example_index]
        plt.imshow(image.astype(np.uint8))
        plt.axis('off')
        plot_index += 1

Подготовка меток и исходных данных:

# Выбираем только 0-ой и 9-ый классы
binary_train_mask = (train_y == 0) | (train_y == 9)
binary_train_X = train_X[binary_train_mask]
binary_train_y = train_y[binary_train_mask] == 0

binary_test_mask = (test_y == 0) | (test_y == 9)
binary_test_X = test_X[binary_test_mask]
binary_test_y = test_y[binary_test_mask] == 0

# Преобразуем в одномерный массив [num_samples, 32*32*3]
binary_train_X = binary_train_X.reshape(binary_train_X.shape[0], -1)
binary_test_X = binary_test_X.reshape(binary_test_X.shape[0], -1)

И дальше самое интересное: написать класс для бинарной многоклассовой классификации без применения Sklearn.

Начало:

class KNN:
    """
    K-nearest-neighbor классификатор с L1 нормой
    """
    def __init__(self, k=1):
        self.k = k

    def fit(self, X, y):
        self.train_X = X
        self.train_y = y

    def predict(self, X, num_loops=0):
        '''
        Uses the KNN model to predict clases for the data samples provided
        
        Arguments:
        X, np array (num_samples, num_features) - samples to run
           through the model
        num_loops, int - which implementation to use

        Returns:
        predictions, np array of ints (num_samples) - predicted class
           for each sample
        '''
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        else:
            dists = self.compute_distances_two_loops(X)

        if self.train_y.dtype == bool:
            return self.predict_labels_binary(dists)
        else:
            return self.predict_labels_multiclass(dists)

    def compute_distances_two_loops(self, X):
        '''
        Computes L1 distance from every sample of X to every training sample
        Uses simplest implementation with 2 Python loops

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run
        
        Returns:
        dists, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        '''
        num_train = self.train_X.shape[0]
        num_test = X.shape[0]
        dists = np.zeros((num_test, num_train), np.float32)
        for i_test in range(num_test):
            for i_train in range(num_train):
                # Fills dists[i_test][i_train]
                dists[i_test, i_train] = np.sqrt(np.sum(np.square(self.X_train[i_train,:] - X[i_test,:])))
        return dists

    def compute_distances_one_loop(self, X):
        '''
        Computes L1 distance from every sample of X to every training sample
        Vectorizes some of the calculations, so only 1 loop is used

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run
        
        Returns:
        dists, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        '''
        num_train = self.train_X.shape[0]
        num_test = X.shape[0]
        dists = np.zeros((num_test, num_train), np.float32)
        for i_test in range(num_test):
            # Fills the whole row of dists[i_test]
            # without additional loops or list comprehensions
            dists[i_test] = np.sqrt(np.sum(np.square(self.X_train - X[i_test,:]), axis = 1))
        return dists

    def compute_distances_no_loops(self, X):
        '''
        Computes L1 distance from every sample of X to every training sample
        Fully vectorizes the calculations using numpy

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run
        
        Returns:
        dists, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        '''
        num_train = self.train_X.shape[0]
        num_test = X.shape[0]
        # Using float32 to to save memory - the default is float64
        dists = np.zeros((num_test, num_train), np.float32)
        # Implement computing all distances with no loops!
        dists = np.sqrt((X**2).sum(axis=1)[:, np.newaxis]
            + (self.X_train**2).sum(axis=1)
            - 2 * X.dot(self.X_train.T))
        return dists

    def predict_labels_binary(self, dists):
        '''
        Returns model predictions for binary classification case
        
        Arguments:
        dists, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample

        Returns:
        pred, np array of bool (num_test_samples) - binary predictions 
           for every test sample
        '''
        num_test = dists.shape[0]
        pred = np.zeros(num_test, bool)
        for i in range(num_test):
            # Implement choosing best class based on k
            # nearest training samples
            closest_y = []
            distance = np.argsort(dists[i])[:k]
            closest_y = self.y_train[distance]
            pred[i] = np.argmax(np.bincount(closest_y))
        return pred

Не понимаю как описать метод для многоклассовой классификации. С ЭТИМ МЕТОДОМ ПРОСЬБА ПОМОЧЬ:

def predict_labels_multiclass(self, dists):
        '''
        Returns model predictions for multi-class classification case
        
        Arguments:
        dists, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample

        Returns:
        pred, np array of int (num_test_samples) - predicted class index 
           for every test sample
        '''
        num_test = dists.shape[0]
        num_test = dists.shape[0]
        pred = np.zeros(num_test, np.int)
        for i in range(num_test):
            # TODO: Implement choosing best class based on k
            # nearest training samples
            pass
        return pred

Дальше идет создание классификатора:

knn_classifier = KNN(k=1)
knn_classifier.fit(binary_train_X, binary_train_y)

И проверка

dists = knn_classifier.compute_distances_two_loops(binary_test_X)
assert np.isclose(dists[0, 10], np.sum(np.abs(binary_test_X[0] - binary_train_X[10])))

Вообще есть сложности с пониманием того как реализуется этот этот класс. KNN. Просьба указать есть ли ошибки, может его вообще иначе надо описывать.

Основа для текущего решения выполнена с помощью источника https://zhuanlan.zhihu.com/p/34193653


Ответы (0 шт):