KNN 算法优缺点:
 优点:精度高,对异常值不敏感
 缺点:计算复杂度高,空间复杂度高
 适用数据范围:数值型和标称型
 
KNN 是一种有标签(监督)的分类算法:输入一个无标签的新样本,将它的各个特征与样本集中已有标签的数据逐一比较,提取特征最相似(距离最近)的 K 个样本的分类标签,最后选择这 K 个标签中出现次数最多的类别作为新样本的分类。
sklearn 实现KNN 算法
def sklearn_test():
    """Train and evaluate scikit-learn's k-nearest-neighbours classifier on iris.

    Loads the iris data set, holds out 10% of the samples as a test split,
    fits a ``KNeighborsClassifier`` with its default settings, and prints the
    predicted labels, the true labels, the accuracy on the held-out split and
    the per-class probabilities. Returns nothing.
    """
    # Imported locally so the rest of the file does not require scikit-learn.
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    iris = datasets.load_iris()
    iris_x, iris_y = iris.data, iris.target

    # A fixed random_state makes the split reproducible, so no global NumPy
    # seed is needed.
    iris_x_train, iris_x_test, iris_y_train, iris_y_test = train_test_split(
        iris_x, iris_y, test_size=0.1, random_state=42)

    knn = KNeighborsClassifier()
    knn.fit(iris_x_train, iris_y_train)
    iris_y_predict = knn.predict(iris_x_test)
    probability = knn.predict_proba(iris_x_test)
    score = knn.score(iris_x_test, iris_y_test)

    print('the predict result of iris is:', iris_y_predict,
          'and the real result of iris is:', iris_y_test)
    print('the accuracy is: %.2f' % score)
    print("the probability is:", probability)
KNN 算法的代码实现步骤:
from collections import Counter

import numpy as np


def create_data():
    """Return the toy training set as ``(x_train, y_train)``.

    ``x_train`` is a (9, 2) float array of points and ``y_train`` the matching
    array of class labels (``'a'`` or ``'b'``).
    """
    x_train = np.array([[1, 1.1], [1.3, 0.8], [1.4, 1.2], [1.1, 0.9], [0.8, 1.5],
                        [2.5, 2], [3.4, 2.5], [3.7, 2.5], [2, 3]])
    y_train = np.array(['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'])
    return x_train, y_train


# Query point to classify.
x_test = np.array([2, 2])


def calculate_dis(x_train, k=3, point=None):
    """Rank the training samples by Euclidean distance to the query point.

    Args:
        x_train: 2-D array with one training sample per row.
        k: number of nearest neighbours to keep.
        point: query point; defaults to the module-level ``x_test``.

    Returns:
        ``(dis, small_k)``. Despite its name, ``dis`` holds the row indices of
        ``x_train`` ordered from nearest to farthest, not the distances;
        ``small_k`` is the first ``k`` of those indices.
    """
    if point is None:
        point = x_test
    distances = np.sqrt(((np.asarray(x_train) - point) ** 2).sum(axis=1))
    # A stable sort makes the order of equidistant samples deterministic.
    dis = distances.argsort(kind='stable')
    small_k = dis[:k]
    return dis, small_k


def pre_result(small_k, y_train):
    """Return the most frequent label among the neighbours in ``small_k``.

    Ties go to the label met first, i.e. the one whose neighbour is nearest
    when ``small_k`` is ordered by distance.

    Raises:
        IndexError: if ``small_k`` is empty.
    """
    votes = Counter(y_train[i] for i in small_k)
    return votes.most_common(1)[0][0]


def to_array(cla):
    """Return the training points whose label equals ``cla`` as a 2-D array."""
    x_train, y_train = create_data()
    return x_train[y_train == cla]


def plot_(x_train, pre, small_k):
    """Plot both training classes, the query point and its nearest neighbours.

    Args:
        x_train: training samples, one per row.
        pre: predicted label of the query point; selects the marker colour.
        small_k: row indices of the nearest neighbours; each one is joined to
            the query point by a line and its coordinates are printed.
    """
    # Imported here so the numeric helpers above work without matplotlib.
    import matplotlib.pyplot as plt

    # Class 'a' is drawn in blue ('b') and class 'b' in red ('r').
    class_colors = {'a': 'b', 'b': 'r'}

    x_train_a = to_array('a')
    x_train_b = to_array('b')
    plt.scatter(x_train_a[:, 0], x_train_a[:, 1], c=class_colors['a'],
                marker='o', label='train_class_a')
    plt.scatter(x_train_b[:, 0], x_train_b[:, 1], c=class_colors['b'],
                marker='o', label='train_class_b')

    # Fall back to black for an unexpected label instead of failing.
    test_class = class_colors.get(str(pre), 'k')
    plt.scatter(x_test[0], x_test[1], c=test_class, marker='*',
                label='test_class')

    for i in small_k:
        xs = [x_test[0], x_train[i, 0]]
        ys = [x_test[1], x_train[i, 1]]
        print(xs, ys)
        plt.plot(xs, ys, c='c')

    plt.legend(loc='best')
    plt.show()


def main():
    """Classify ``x_test`` against the toy training set and plot the result."""
    x_train, y_train = create_data()
    dis, small_k = calculate_dis(x_train)
    pre = pre_result(small_k, y_train)
    plot_(x_train, pre, small_k)


if __name__ == '__main__':
    main()
画图结果展示:
以下是完整的代码实现过程
"""k-nearest-neighbours (KNN) classifier written from scratch with NumPy.

Pros: high accuracy, insensitive to outliers, no assumptions about the input
data.
Cons: high computational cost and high memory cost.
Works with: numeric and nominal values.

KNN is a supervised classifier: an unlabelled sample is compared with the
labelled samples of the data set, the labels of the K most similar samples are
collected, and the label that occurs most often among them is returned.
"""
import operator
import os

import numpy as np

# Locations of the example data on the author's machine.
CH02_DIR = r'C:\Users\Administrator\Desktop\ML\machinelearninginaction-master\machinelearninginaction-master\Ch02'
DIGITS_DIR = CH02_DIR + r'\digits'
TRAINING_DIGITS_DIR = DIGITS_DIR + r'\trainingDigits'
TEST_DIGITS_DIR = DIGITS_DIR + r'\testDigits'

# Textual class labels of the dating data set; the numbers index
# ``result_list`` in classify_preson() as ``label - 1``.
DATING_LABELS = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}


def create_dataSet():
    """Return a four-point toy data set as ``(group, labels)``."""
    group = np.array([[1, 1.1], [1, 1], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(in_x, data_set, labels, k):
    """Classify ``in_x`` by majority vote among its ``k`` nearest samples.

    Args:
        in_x: feature vector to classify.
        data_set: 2-D array of training samples, one per row.
        labels: sequence of labels, ``labels[i]`` belonging to row ``i``.
        k: number of neighbours that vote; clamped to the number of samples.

    Returns:
        The label with the most votes. Ties go to the label whose first
        voter is nearest to ``in_x``.

    Raises:
        ValueError: if there is no neighbour to vote (``k`` < 1 or an empty
            ``data_set``).
    """
    data_set = np.asarray(data_set)
    k = min(k, data_set.shape[0])
    if k < 1:
        raise ValueError("classify0 needs k >= 1 and a non-empty data_set")

    # Broadcasting subtracts in_x from every row; no np.tile copy is needed.
    distances = np.sqrt(((data_set - np.asarray(in_x)) ** 2).sum(axis=1))
    nearest = distances.argsort(kind='stable')[:k]

    class_count = {}
    for idx in nearest:
        vote_label = labels[idx]
        class_count[vote_label] = class_count.get(vote_label, 0) + 1
    # max() returns the first maximal item, so ties keep insertion order.
    return max(class_count.items(), key=operator.itemgetter(1))[0]


def _parse_dating_label(text):
    """Convert a label field (class name or integer text) to an int label."""
    if text in DATING_LABELS:
        return DATING_LABELS[text]
    return int(text)


def file2matrix(filename, base_dir=CH02_DIR):
    """Load a tab-separated dating data file.

    Each line holds three numeric features followed by a class label, either
    one of the names in ``DATING_LABELS`` or an integer.

    Args:
        filename: file name appended verbatim to ``base_dir`` (so it is
            expected to start with a path separator, e.g.
            ``'/datingTestSet.txt'``).
        base_dir: directory prefix; pass ``''`` to use ``filename`` as a
            complete path.

    Returns:
        ``(return_mat, class_label)``: an (n, 3) float array and a list of
        n integer labels.

    Raises:
        ValueError: if a feature is not numeric or a label is neither a known
            class name nor an integer.
    """
    with open(base_dir + filename) as fr:
        ar_lines = fr.readlines()

    return_mat = np.zeros((len(ar_lines), 3))
    class_label = []
    for index, raw_line in enumerate(ar_lines):
        fields = raw_line.strip().split('\t')
        return_mat[index, :] = [float(value) for value in fields[0:3]]
        class_label.append(_parse_dating_label(fields[-1]))
    return return_mat, class_label


def auto_norm(data_set):
    """Min-max normalise every column of ``data_set`` to the range [0, 1].

    Returns:
        ``(norm_data_set, ranges, min_val)`` where ``ranges`` is the
        per-column ``max - min`` and ``min_val`` the per-column minimum.
        A constant column (range 0) is normalised to 0 instead of NaN.
    """
    data_set = np.asarray(data_set, dtype=float)
    min_val = data_set.min(0)  # column-wise minimum
    max_val = data_set.max(0)
    ranges = max_val - min_val
    # Divide constant columns by 1 so they do not produce 0/0.
    divisor = np.where(ranges == 0, 1.0, ranges)
    norm_data_set = (data_set - min_val) / divisor
    return norm_data_set, ranges, min_val


def dating_class_test(h=0.1):
    """Print the hold-out error rate of the classifier on the dating data.

    The first ``h`` fraction of the rows is used as the test set and the
    remaining rows as the training set, both taken from the normalised data.
    """
    dating_data_mat, dating_labels = file2matrix('/datingTestSet.txt')
    norm_mat, ranges, min_val = auto_norm(dating_data_mat)
    m = norm_mat.shape[0]
    num_test_vect = int(m * h)
    if num_test_vect == 0:
        print("not enough samples to hold out a test set")
        return

    error_count = 0.0
    for i in range(num_test_vect):
        # Train on the normalised rows so both sides use the same scale.
        classify_result = classify0(norm_mat[i, :],
                                    norm_mat[num_test_vect:, :],
                                    dating_labels[num_test_vect:], 3)
        print("the classifier come back with: %d, the real answer is : %d"
              % (int(classify_result), int(dating_labels[i])))
        if classify_result != dating_labels[i]:
            error_count += 1
    print("the total error rate is: %.2f%%"
          % (error_count * 100 / float(num_test_vect)))


def classify_preson():
    """Ask for a person's three features and print the predicted liking."""
    result_list = ['not at all', 'in small doses', 'in large doses']
    percent_tats = float(input("percentage of time spent playing video games:"))
    ff_miles = float(input("frequent flier miles earned per year:"))
    ice_cream = float(input("liters of ice cream consumed per year:"))
    dating_mat, dating_labels = file2matrix('/datingTestSet.txt')
    norm_mat, ranges, min_val = auto_norm(dating_mat)
    # NOTE(review): assumes the file columns are (flier miles, gaming time,
    # ice cream), as in the "Machine Learning in Action" data set - confirm.
    in_x = np.array((ff_miles, percent_tats, ice_cream))
    # Scale the query exactly like the training data before comparing.
    divisor = np.where(ranges == 0, 1.0, ranges)
    classifier_result = classify0((in_x - min_val) / divisor, norm_mat,
                                  dating_labels, 3)
    print("you will probably like this person:",
          result_list[classifier_result - 1])


# Correctly spelled alias; the original name is kept for existing callers.
classify_person = classify_preson


def img2vector(filename, base_dir=TRAINING_DIGITS_DIR):
    """Read a text image made of digit characters into a flat integer vector.

    Args:
        filename: file name appended verbatim to ``base_dir`` (so it is
            expected to start with a path separator).
        base_dir: directory prefix; defaults to the training digits folder.

    Returns:
        1-D array with one integer per character, rows concatenated in file
        order (1024 values for a 32x32 image).
    """
    with open(base_dir + filename) as fr:
        return_vect = [int(char) for line in fr for char in line.strip()]
    return np.array(return_vect)


def handwriteing_classtest():
    """Print the error count and error rate on the handwritten-digit set.

    File names are expected to look like ``<digit>_<n>.txt``; the part before
    the underscore is the true class.
    """
    hw_labels = []
    training_filelist = os.listdir(TRAINING_DIGITS_DIR)
    training_mat = np.zeros((len(training_filelist), 1024))
    for i, filename_str in enumerate(training_filelist):
        class_num = int(filename_str.split('.')[0].split('_')[0])
        hw_labels.append(class_num)
        training_mat[i, :] = img2vector('/%s' % filename_str)

    test_filelist = os.listdir(TEST_DIGITS_DIR)
    m_t = len(test_filelist)
    error_count = 0.0
    for filename_str in test_filelist:
        class_num = int(filename_str.split('.')[0].split('_')[0])
        # Test images live in their own folder, not in trainingDigits.
        vector_undertest = img2vector('/%s' % filename_str,
                                      base_dir=TEST_DIGITS_DIR)
        classifierresult = classify0(vector_undertest, training_mat,
                                     hw_labels, 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierresult, class_num))
        if classifierresult != class_num:
            error_count += 1.0
    print("\nthe total number of errors is: %d" % error_count)
    if m_t:
        print("\nthe total error rate is: %f" % (error_count / float(m_t)))
    else:
        print("\nno test files found")
# Scratch code for exploring the data sets interactively.
def test():
    """Scatter-plot the toy data set and the dating data set.

    Reloads the ``KNN`` module first so that edits made during an interactive
    session take effect, then draws the toy points, the second and third
    dating features, and the same features coloured and sized by class label.
    """
    from importlib import reload  # `imp` was removed in Python 3.12
    from matplotlib import pyplot as plt

    # NOTE(review): assumes this file is importable as the module `KNN` and
    # that `group` was meant to be its toy data set - confirm.
    import KNN
    reload(KNN)

    group, _ = KNN.create_dataSet()
    plt.scatter(group[:, 0], group[:, 1])

    # file2matrix prepends the Ch02 data directory itself, so only the file
    # name is passed; a full path would be prefixed a second time.
    datingDataMat, datingLabels = KNN.file2matrix('/datingTestSet2.txt')
    plt.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
    plt.scatter(x=datingDataMat[:, 1], y=datingDataMat[:, 2],
                c=np.array(datingLabels), s=np.array(datingLabels))
    plt.show()