首页 技术 正文
技术 2022年11月18日
0 收藏 925 点赞 2,624 浏览 3855 个字
import heapq
import randomclass Classifier:
def __init__(self, bucketPrefix, testBucketNumber, dataFormat, k):

      ””” 一个分类器将建立与bucketprefix文件
      除textbucketnumber文件。数据格式是一个字符串,
      描述如何解释数据文件的每一行。

      ”””

        self.medianAndDeviation = []
self.k = k self.format = dataFormat.strip().split('\t')
self.data = [] for i in range(1, 11): if i != testBucketNumber:
filename = "%s-%02i" % (bucketPrefix, i)
f = open(filename)
lines = f.readlines()
f.close()
for line in lines[1:]:
fields = line.strip().split('\t')
ignore = []
vector = []
for i in range(len(fields)): if self.format[i] == 'num':
vector.append(float(fields[i]))
elif self.format[i] == 'comment':
ignore.append(fields[i])
elif self.format[i] == 'class':
classification = fields[i]
self.data.append((classification, vector, ignore))
self.rawData = list(self.data) self.vlen = len(self.data[0][1]) for i in range(self.vlen):
self.normalizeColumn(i) def getMedian(self, alist):
"""返回列表"""
if alist == []:
return []
blist = sorted(alist)
length = len(alist)
if length % 2 == 1: return blist[int(((length + 1) / 2) - 1)]
else: v1 = blist[int(length / 2)]
v2 =blist[(int(length / 2) - 1)]
return (v1 + v2) / 2.0 def getAbsoluteStandardDeviation(self, alist, median):
"""取绝对标准偏差"""
sum = 0
for item in alist:
sum += abs(item - median)
return sum / len(alist) def normalizeColumn(self, columnNumber):
"""给出一列数,规范self.data列"""
# 先提取值列表
col = [v[1][columnNumber] for v in self.data]
median = self.getMedian(col)
asd = self.getAbsoluteStandardDeviation(col, median)
#print("Median: %f ASD = %f" % (median, asd))
self.medianAndDeviation.append((median, asd))
for v in self.data:
v[1][columnNumber] = (v[1][columnNumber] - median) / asd def normalizeVector(self, v):
"""We have stored the median and asd for each column.
We now use them to normalize vector v"""
vector = list(v)
for i in range(len(vector)):
(median, asd) = self.medianAndDeviation[i]
vector[i] = (vector[i] - median) / asd
return vector def testBucket(self, bucketPrefix, bucketNumber):
"""评估分类bucketPrefix-bucketNumber""" filename = "%s-%02i" % (bucketPrefix, bucketNumber)
f = open(filename)
lines = f.readlines()
totals = {}
f.close()
for line in lines:
data = line.strip().split('\t')
vector = []
classInColumn = -1
for i in range(len(self.format)):
if self.format[i] == 'num':
vector.append(float(data[i]))
elif self.format[i] == 'class':
classInColumn = i
theRealClass = data[classInColumn] classifiedAs = self.classify(vector)
totals.setdefault(theRealClass, {})
totals[theRealClass].setdefault(classifiedAs, 0)
totals[theRealClass][classifiedAs] += 1
return totals def manhattan(self, vector1, vector2):
"""计算曼哈顿距离"""
return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) def knn(self, itemVector):
"""使用K近邻预测itemVector类""" neighbors = heapq.nsmallest(self.k,[(self.manhattan(itemVector, item[1]), item)
for item in self.data]) results = {}
for neighbor in neighbors:
theClass = neighbor[1][0]
results.setdefault(theClass, 0)
results[theClass] += 1
resultList = sorted([(i[1], i[0]) for i in results.items()], reverse=True) maxVotes = resultList[0][0]
possibleAnswers = [i[1] for i in resultList if i[0] == maxVotes] answer = random.choice(possibleAnswers)
return( answer) def classify(self, itemVector):
"""返回类""" return(self.knn(self.normalizeVector(itemVector))) def tenfold(bucketPrefix, dataFormat, k):
results = {}
for i in range(1, 11):
c = Classifier(bucketPrefix, i, dataFormat, k)
t = c.testBucket(bucketPrefix, i)
for (key, value) in t.items():
results.setdefault(key, {})
for (ckey, cvalue) in value.items():
results[key].setdefault(ckey, 0)
results[key][ckey] += cvalue categories = list(results.keys())
categories.sort()
print( "\n Classified as: ")
header = " "
subheader = " +"
for category in categories:
header += "% 2s " % category
subheader += "-----+"
print (header)
print (subheader)
total = 0.0
correct = 0.0
for category in categories:
row = " %s |" % category
for c2 in categories:
if c2 in results[category]:
count = results[category][c2]
else:
count = 0
row += " %3i |" % count
total += count
if c2 == category:
correct += count
print(row)
print(subheader)
print("\n%5.3f percent correct" %((correct * 100) / total))
print("total of %i instances" % total)print("SMALL DATA SET")
tenfold("pimaSmall/pimaSmall",
"numnumnumnumnumnumnumnumclass", 1)
print("\n\nLARGE DATA SET")tenfold("pima/pima",
"numnumnumnumnumnumnumnumclass", 1)

  knn的python代码

相关推荐
python开发_常用的python模块及安装方法
adodb:我们领导推荐的数据库连接组件bsddb3:BerkeleyDB的连接组件Cheetah-1.0:我比较喜欢这个版本的cheeta…
日期:2022-11-24 点赞:878 阅读:9,489
Educational Codeforces Round 11 C. Hard Process 二分
C. Hard Process题目连接:http://www.codeforces.com/contest/660/problem/CDes…
日期:2022-11-24 点赞:807 阅读:5,904
下载Ubuntn 17.04 内核源代码
zengkefu@server1:/usr/src$ uname -aLinux server1 4.10.0-19-generic #21…
日期:2022-11-24 点赞:569 阅读:6,737
可用Active Desktop Calendar V7.86 注册码序列号
可用Active Desktop Calendar V7.86 注册码序列号Name: www.greendown.cn Code: &nb…
日期:2022-11-24 点赞:733 阅读:6,490
Android调用系统相机、自定义相机、处理大图片
Android调用系统相机和自定义相机实例本博文主要是介绍了android上使用相机进行拍照并显示的两种方式,并且由于涉及到要把拍到的照片显…
日期:2022-11-24 点赞:512 阅读:8,128
Struts的使用
一、Struts2的获取  Struts的官方网站为:http://struts.apache.org/  下载完Struts2的jar包,…
日期:2022-11-24 点赞:671 阅读:5,291