本文共 10931 字,大约阅读时间需要 36 分钟。
KNN Algorithm implementation
reference:
from matplotlib.colors import ListedColormapfrom sklearn import neighbors, datasetsiris = datasets.load_iris()print('data shape is {}'.format(iris.data.shape))print('class shape is {}'.format(iris.target.shape))X = iris.data[:, :2] # use first two version for simplicityy = iris.targetimport numpy as npimport matplotlib.pyplot as pltfrom matplotlib.colors import ListedColormapfrom sklearn import neighbors, datasetsiris = datasets.load_iris()cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])K = 3x = X[-1]fig, ax = plt.subplots(figsize=(4,4))for i, iris_class in enumerate(['Iris Setosa', 'Iris Versicolour', 'Iris Virginica']): idx = y==i ax.scatter(X[idx,0], X[idx,1], c=cmap_bold.colors[i], edgecolor='k', s=20, label=iris_class);ax.set(xlabel='sepal length (cm)', ylabel='sepal width (cm)')ax.legend()# GRADED FUNCTION: DO NOT EDIT THIS LINEdef pairwise_distance_matrix(X, Y): """Compute the pairwise distance between rows of X and rows of Y Arguments ---------- X: ndarray of size (N, D) Y: ndarray of size (M, D) Returns -------- distance_matrix: matrix of shape (N, M), each entry distance_matrix[i,j] is the distance between ith row of X and the jth row of Y (we use the dot product to compute the distance). """ if(len(Y.shape) == 1): Y = Y.reshape(1, -1) N, D = X.shape M, _ = Y.shape #distance_matrix = np.zeros((N, M)) # <-- EDIT THIS to compute the correct distance matrix. xydiff = X[:, :, None] - Y[:, :, None].T distance_matrix = np.sqrt(np.sum(xydiff**2, axis = 1)) return distance_matrix# GRADED FUNCTION: DO NOT EDIT THIS LINEdef KNN(k, X, y, x): """K nearest neighbors k: number of nearest neighbors X: training input locations y: training labels x: test input """ N, D = X.shape num_classes = len(np.unique(y)) #dist = np.zeros(N) # <-- EDIT THIS to compute the pairwise distance matrix dist = pairwise_distance_matrix(X, x)# return Nx1 matrix and reshape to 1 dimension array # Next we make the predictions ypred = np.zeros(num_classes) classes = y[np.argsort(dist, axis=0)][:k] # find the labels of the k nearest neighbors for c in np.unique(classes): ypred[c] = len(classes[classes == c]) # <-- EDIT THIS to compute the correct prediction return np.argmax(ypred)x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1step = 0.1xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))ypred = []for data in np.array([xx.ravel(), yy.ravel()]).T: ypred.append(KNN(K, X, y, data.reshape(1,2)))fig, ax = plt.subplots(figsize=(4,4))ax.pcolormesh(xx, yy, np.array(ypred).reshape(xx.shape), cmap=cmap_light)ax.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold, edgecolor='k', s=20);
K-NN by sklearn
from sklearn import datasetsfrom sklearn.model_selection import train_test_splitfrom sklearn.neighbors import KNeighborsClassifierimport numpy as npiris = datasets.load_iris()X = iris.datay = iris.targetprint(X, y)X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2003) # split train, test datasetsclf = KNeighborsClassifier(n_neighbors=3)clf.fit(X_train, y_train)correct = np.count_nonzero((clf.predict(X_test)==y_test) == True)print("Accuracy is: %.3f" %(correct/len(X_test)))
Accuracy is: 0.921
K-NN Classifier by Numpy
from sklearn import datasetsfrom collections import Counter # 为了做投票from sklearn.model_selection import train_test_splitimport numpy as np# 导入iris数据iris = datasets.load_iris()X = iris.datay = iris.targetX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2003)def euc_dis(instance1, instance2): """ 计算两个样本instance1和instance2之间的欧式距离 instance1: 第一个样本, array型 instance2: 第二个样本, array型 """ if len(instance1.shape) == 1: instance1 = instance1.reshape(1, -1) diff = instance1 - instance2 dist = np.sqrt(np.sum(diff ** 2, axis = 1, keepdims = True)) return dist def knn_classify(X, y, testInstance, k): """ 给定一个测试数据testInstance, 通过KNN算法来预测它的标签。 X: 训练数据的特征 y: 训练数据的标签 testInstance: 测试数据,这里假定一个测试数据 array型 k: 选择多少个neighbors? """ # TODO 返回testInstance的预测标签 = {0,1,2} num_classes = len(np.unique(y)) distances = euc_dis(X, testInstance) classes = y[np.argsort(distances, axis = 0)][:k] # find the labels of the k nearest neighbors ypred = np.zeros(num_classes) for c in np.unique(classes): ypred[c] = len(classes[classes == c]) return np.argmax(ypred)# test euc_disdef test_euc_dis(): X = np.array([1, 2]).reshape(1, -1) Y = np.array([3, 5]).reshape(1, -1) L = euc_dis(X, Y) print(L)# 预测结果 # test_euc_dis()predictions = [knn_classify(X_train, y_train, data, 3) for data in X_test]correct = np.count_nonzero((predictions==y_test)==True)print ("Accuracy is: %.3f" %(correct/len(X_test)))
How to choose the k for KNN algorithms?
Decision Bounary (very important)
example. 60 grade for pass of the exam
Linear decision boundary(Linear model) and Non-linear decision boundary(Non-linear model)
find the decision boundary
set k=1 for KNN algorithm, the decision boundary is sharp
set k for other values, and calculate the decision boundary,the decision boundary become smoother
plot decision boundary for different k values:
import matplotlib.pyplot as pltimport numpy as npfrom itertools import productfrom sklearn.neighbors import KNeighborsClassifier# generate some randome data setn_points = 100X1 = np.random.multivariate_normal([1, 50], [[1, 0], [0, 10]], n_points)X2 = np.random.multivariate_normal([2, 50], [[1, 0], [0, 10]], n_points)X = np.concatenate([X1, X2])y = np.array([0] * n_points + [1] * n_points)print(X.shape, y.shape)# train the KNN modelclfs = []neighbors = [1, 3, 5, 9, 11, 13, 15, 17, 19] # different value for kfor i in range(len(neighbors)): clfs.append(KNeighborsClassifier(n_neighbors=neighbors[i]).fit(X, y))# visualize the datax_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 # choose the min, max value from the first column of Xy_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 # choose the min ,max value from the second column of Xxx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))f, axarr = plt.subplots(3, 3, sharex = 'col', sharey = 'row', figsize = (15, 12))for idx, clf, tt in zip(product([0, 1, 2], [0, 1, 2]), clfs, ['KNN (k=%d)'%k for k in neighbors]): Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha = 0.4) axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c = y, s = 20, edgecolor='k') axarr[idx[0], idx[1]].set_title(tt) plt.show()
result:
How to choose the best hyperparameter k from the plot above?
Cross validation
Core: try to verify different values of k and choose the best one!
Split the Traing data into Training data and Validation data . Choose the best hyperparameter k respect with the Validation dat
why do we split the training data into train data and validation data?
Test data can't not be used for adjusting the hyperparameter of the model. So we split the training data into two part for adjusting of our model.
K-fold Cross Validation:
ref:
When the data set is small, we choose the large K. K means the number of the fold.
The general procedure is as follows:
Example in Scikit-learn:
# scikit-learn k-fold cross-validationfrom numpy import arrayfrom sklearn.model_selection import KFold# data sampledata = array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])# prepare cross validationkfold = KFold(3, True, 1)# enumerate splitsfor train, test in kfold.split(data): print('train: %s, test: %s' % (data[train], data[test]))
result:
1 2 3 | train: [0.1 0.4 0.5 0.6], test: [0.2 0.3] train: [0.2 0.3 0.4 0.6], test: [0.1 0.5] train: [0.1 0.2 0.3 0.5], test: [0.4 0.6] |
Other : leave_one_out cross validation
choose only on as validation data form the training data. when the data set is very small.
set K = N (N is the size of the training data set)
K fold cross validation for K
import numpy as npfrom sklearn import datasetsfrom sklearn.neighbors import KNeighborsClassifierfrom sklearn.model_selection import KFold # K-fold-validation# import iris data setiris = datasets.load_iris()X = iris.datay = iris.targetprint (X.shape, y.shape)# defin the k for KNN algorithmks = [1, 3, 5, 7, 9, 11, 13, 15]# 5-fold-validation# K-fold return the index of the training data and validation data.kf = KFold(n_splits = 5, random_state = 2001, shuffle = True)# save the current accuracybest_k = ks[0]best_score = 0# iterator for each K valuefor k in ks: curr_score = 0 for train_index, valid_index in kf.split(X): # calculate accuracy for each fold validation clf = KNeighborsClassifier(n_neighbors = k) clf.fit(X[train_index], y[train_index]) curr_score = curr_score + clf.score(X[valid_index], y[valid_index]) # calculate the average accuracy for the 5-fold-cross-validation avg_score = curr_score / 5 if avg_score > best_score: best_k = k best_score = avg_score print("current best score is: %.2f"%best_score, "best k: %d"%best_k)
current best score is: 0.96 best k: 1
current best score is: 0.96 best k: 1 current best score is: 0.97 best k: 5 current best score is: 0.98 best k: 7 current best score is: 0.98 best k: 7 current best score is: 0.98 best k: 7 current best score is: 0.98 best k: 7 current best score is: 0.98 best k: 7
using the GridSearchCV package from sklearn
from sklearn import datasetsfrom sklearn.neighbors import KNeighborsClassifierfrom sklearn.model_selection import GridSearchCV# import iris data setiris = datasets.load_iris()X = iris.datay = iris.target# defin the k for KNN algorithmparameters = {'n_neighbors' : [1, 3, 5, 7, 9, 11, 13, 15]}knn = KNeighborsClassifier() # ! do not specify any parameters for the constructor# search the best value for K via GridSearchCVclf = GridSearchCV(knn, parameters, cv = 5)clf.fit(X, y)# print the best score of Kprint ("best score is: %.2f"%clf.best_score_, " best param: ", clf.best_params_)
Result:
best score is: 0.98 best param: {'n_neighbors': 7}
Important: DO NOT USE THE TEST DATA FOR CROSS VALIDATION!
Feature Normalize
Min-max Normalization
xnew = (x - min(X)) / (max(X) - min(X)) map the data into [0, 1]
Z-score Normalizaiton / Standarlization into N(0, 1) Gaussian Distribution
xnew = (x - mean(X)) / std(X)
转载地址:http://dwwci.baihongyu.com/