k近邻算法实例
一、问题:
海伦一直使用在线约会网站寻找适合自己的约会对象。她曾交往过三种类型的人:
不喜欢的人
一般喜欢的人
非常喜欢的人
这些人的样本数据包含以下三种特征:
每年获得的飞行常客里程数
玩视频游戏所耗时间百分比
每周消费的冰淇淋公升数
该网站希望尽可能向海伦推荐她喜欢的人,因此需要我们设计一个分类器,根据用户的以上三种特征,判断是否应该把该用户推荐给海伦。
二、代码部分:
1.包模块:
import operator
from numpy import *
from os import listdir
import matplotlib
import matplotlib.pylab as plt
2.k近邻分类函数(计算距离并投票):
# kNN classification function: Euclidean distance followed by a majority vote
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean and are computed between ``inX`` and every row
    of ``dataSet``.

    Args:
        inX: Feature vector to classify; it must broadcast against one row
            of ``dataSet``.
        dataSet: 2-D NumPy array holding one training sample per row.
        labels: Sequence of class labels where ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training samples is clamped to that number.

    Returns:
        The label that received the most votes. On a tie, the label whose
        first vote came from the nearer neighbour wins.

    Raises:
        IndexError: If no neighbour can vote (``k < 1`` or ``dataSet`` has
            no rows).
    """
    dataSetSize = dataSet.shape[0]

    # Broadcasting subtracts inX from every row, so no tiled copy is needed.
    diffMat = dataSet - inX
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    # Clamp k so that asking for more neighbours than there are samples does
    # not index past the end of the sorted index array.
    neighbourCount = k if k < dataSetSize else dataSetSize

    classCount = {}
    for i in range(neighbourCount):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so labels with equal counts keep insertion order
    # (nearest neighbour first).
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]
3.数据准备:
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is split on tab characters. The first three fields
    are parsed as floats and stored as one row of the matrix; the last field
    is kept verbatim as that row's class label (for example ``smallDoses``).

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: an ``(n, 3)`` float array
        of features and a list of ``n`` label strings, where ``n`` is the
        number of non-blank lines.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature field cannot be parsed as a float.
    """
    # Read everything first so the matrix can be sized before parsing; the
    # with-block also guarantees the file handle is closed.
    with open(filename) as fr:
        lines = [line.strip() for line in fr]
    # Blank lines (such as a trailing empty line) carry no sample.
    lines = [line for line in lines if line]

    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.split('\t')
        print("\t分割后的列表:", listFromLine)
        # Convert explicitly rather than relying on implicit str -> float
        # casting during array assignment.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        print("第", index, "行得到的列表", returnMat[index, :])
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector
4.分析数据:
def printData(data):
    """Show a scatter plot of column 1 of ``data`` against column 2.

    Args:
        data: 2-D array with at least three columns; column 1 is plotted on
            the x axis and column 2 on the y axis.

    The call opens a matplotlib window via ``plt.show()`` and returns
    nothing.
    """
    # Select the SimHei font so the Chinese axis labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['Simhei']
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Draw on the axes created above instead of relying on pyplot's implicit
    # "current axes" state.
    ax.scatter(data[:, 1], data[:, 2])
    ax.set_xlabel("玩游戏所占时间")
    ax.set_ylabel("每周消费冰激凌数")
    plt.show()
5.归一化处理:
def autoNormal(dataSet):
    """Min-max normalise every column of ``dataSet``.

    Each value is mapped to ``(value - column_min) / (column_max -
    column_min)``. A column whose values are all equal has a zero range; it
    is mapped to ``0`` rather than dividing by zero.

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature
            per column.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the normalised array,
        the per-column ``max - min`` (zero for constant columns), and the
        per-column minimum.
    """
    # Axis 0 reduces over rows, giving one minimum/maximum per feature.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # Divide constant columns by 1 instead of 0; their numerator is already
    # all zeros, so they normalise to 0 instead of NaN.
    safeRanges = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
6.分类器测试:
def datingClassTest(filename, hoRatio=0.1, k=3):
    """Run a hold-out evaluation of the kNN classifier on a data file.

    The file is loaded with ``file2matrix``, plotted with ``printData`` and
    normalised with ``autoNormal``. The first ``int(m * hoRatio)`` rows are
    used as test samples and the remaining rows as training data for
    ``classify0``. Each prediction, the error rate and the error count are
    printed.

    Args:
        filename: Path of the data file passed to ``file2matrix``.
        hoRatio: Fraction of the rows held out for testing.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ZeroDivisionError: If ``int(m * hoRatio)`` is zero, i.e. there are
            no test samples to compute an error rate from.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    printData(datingDataMat)
    # Only the normalised matrix is needed here; ranges and minima are not.
    normMat = autoNormal(datingDataMat)[0]

    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # The training portion is the same for every test sample, so slice once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %s, the real answer is: %s" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    print("the total error rate is: %d" % ((errorCount/float(numTestVecs))*100), "%")
    print(errorCount)
    # Sample output recorded by the author:
    # the total error rate is: 5 %
    # 5.0
if __name__ == '__main__':
    # Evaluate the classifier on the sample data set when run as a script.
    datingClassTest("datingTestSet.txt")
更多推荐
所有评论(0)