朴素贝叶斯分类

yuhai.china · 发表于 2013-1-27 04:40:39

给定训练样本
('Nobody owns water.','good');

('the quick rabbit jumps fences','good');

('buy pharmaceuticals','bad');

('make quick money at the online casino','bad');

('the quick brown fox jumps','good');

如何判断一个新的样本'quick money'是good还是bad呢？最常用的办法就是朴素贝叶斯分类。

朴素贝叶斯分类的步骤大致如下：
1.根据样本集判断每个词属于各个分类的可能性。
也就是计算一个词的文档频度df
2.对待分类文本中的每一个词，计算相应的df，利用贝叶斯公式，把所有的df相乘，结果再乘以p(分类)的值，就算出了当前文本属于该分类的概率。
import re
import math

# Tokens are separated by runs of one or more non-word characters.
# The pattern must not be able to match the empty string: on Python 3.7+
# re.split() also splits on empty matches, which would break the text into
# single characters and leave no token long enough to pass the length filter.
_WORD_SPLITTER = re.compile(r'\W+')


def getwords(doc):
    """Extract the set of features (lower-cased words) from a document.

    Words are the pieces of ``doc`` between runs of non-word characters.
    Only words with more than 2 and fewer than 20 characters are kept.

    Args:
        doc: The text to tokenize.

    Returns:
        A dict mapping each distinct lower-cased word to 1.
    """
    words = [s.lower() for s in _WORD_SPLITTER.split(doc) if 2 < len(s) < 20]
    return {w: 1 for w in words}


def sampletrain(cl):
    """Train ``cl`` on the five-document good/bad sample set."""
    cl.train('Nobody owns water.', 'good')
    cl.train('the quick rabbit jumps fences', 'good')
    cl.train('buy pharmaceuticals', 'bad')
    cl.train('make quick money at the online casino', 'bad')
    cl.train('the quick brown fox jumps', 'good')


class classifier:
    """Base class that keeps feature/category counts for a text classifier.

    ``classify`` relies on a ``prob(item, cat)`` method that this class does
    not define; subclasses (such as ``nativebayes``) must provide it.
    """

    def __init__(self, getfeatures, filename=None):
        """Create an untrained classifier.

        Args:
            getfeatures: Callable taking an item and returning an iterable
                of its features.
            filename: Accepted for interface compatibility; not used.
        """
        # fc[feature][category] -> number of training items in that
        # category that contained the feature.
        self.fc = {}
        # cc[category] -> number of training items in that category.
        self.cc = {}
        self.getfeatures = getfeatures
        # thresholds[category] -> margin required to pick that category.
        self.thresholds = {}

    def setthreshold(self, cat, t):
        """Set the threshold ``t`` for category ``cat``."""
        self.thresholds[cat] = t

    def getthreshold(self, cat):
        """Return the threshold for ``cat``, or 1.0 if none was set."""
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]

    def classify(self, item, default=None):
        """Return the most probable category for ``item``.

        The winning category is returned only if its probability is at
        least ``getthreshold(best)`` times the probability of every other
        category; otherwise ``default`` is returned. ``default`` is also
        returned when no category has a probability above zero (for
        example, before any training).
        """
        probs = {}
        best = None
        best_prob = 0.0
        for cat in self.categories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > best_prob:
                best_prob = probs[cat]
                best = cat

        # No category scored above zero: there is no winner to compare.
        if best is None:
            return default

        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getthreshold(best) > probs[best]:
                return default
        return best

    def incf(self, f, cat):
        """Increment the count of feature ``f`` in category ``cat``."""
        self.fc.setdefault(f, {})
        self.fc[f].setdefault(cat, 0)
        self.fc[f][cat] += 1

    def incc(self, cat):
        """Increment the item count of category ``cat``."""
        self.cc.setdefault(cat, 0)
        self.cc[cat] += 1

    def fcount(self, f, cat):
        """Return, as a float, how often ``f`` was seen in ``cat``."""
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0

    def catcount(self, cat):
        """Return, as a float, the number of items trained into ``cat``."""
        if cat in self.cc:
            return float(self.cc[cat])
        return 0.0

    def totalcount(self):
        """Return the total number of trained items."""
        return sum(self.cc.values())

    def categories(self):
        """Return the known categories."""
        return self.cc.keys()

    def train(self, item, cat):
        """Record ``item`` as a training example of category ``cat``."""
        features = self.getfeatures(item)
        for f in features:
            self.incf(f, cat)
        self.incc(cat)

    def fprob(self, f, cat):
        """Return the fraction of items in ``cat`` that contain ``f``.

        Returns 0 when ``cat`` has no training items.
        """
        if self.catcount(cat) == 0:
            return 0
        return self.fcount(f, cat) / self.catcount(cat)

    def weightedprob(self, f, cat, prf, weight=1.0, ap=0.5):
        """Return ``prf(f, cat)`` blended with an assumed probability.

        The result is the weighted average of the assumed probability
        ``ap`` (counted ``weight`` times) and ``prf(f, cat)`` (counted once
        per occurrence of ``f`` across all categories).

        Args:
            f: The feature.
            cat: The category.
            prf: Callable ``prf(f, cat)`` returning the basic probability.
            weight: Weight given to the assumed probability.
            ap: The assumed probability.
        """
        basicprob = prf(f, cat)
        totals = sum([self.fcount(f, c) for c in self.categories()])
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp


class nativebayes(classifier):
    """Classifier that scores a category as docprob(item, cat) * P(cat)."""

    def docprob(self, item, cat):
        """Return the product of weighted feature probabilities for ``cat``."""
        features = self.getfeatures(item)
        p = 1
        for f in features:
            p *= self.weightedprob(f, cat, self.fprob)
        return p

    def prob(self, item, cat):
        """Return ``docprob(item, cat)`` times the share of items in ``cat``.

        Returns 0.0 when nothing has been trained yet, instead of dividing
        by zero.
        """
        total = self.totalcount()
        if total == 0:
            return 0.0
        catprob = self.catcount(cat) / total
        docprob = self.docprob(item, cat)
        return docprob * catprob


# Correctly spelled alias; the original name is kept for existing callers.
naivebayes = nativebayes

		自动登录	找回密码
密码			立即注册