
Machine Learning in Action Chapter 3

Source code and analysis for the decision-tree chapter (Chapter 3) of Machine Learning in Action.

Source Code

trees.py

# -*- coding: utf-8 -*-

from math import log
import operator

def calcShannonEnt(dataSet):
    """
    Calculate the Shannon entropy of the given data set.
    """
    # Count the total number of instances in the data set
    numEntries = len(dataSet)
    # Build a dictionary whose keys are the values in the last column
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        # Accumulate the number of occurrences of each class label
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        # Compute the Shannon entropy
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def createDataSet():
    """
    Create a test data set and its feature labels.
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

def splitDataSet(dataSet, axis, value):
    """
    Split the data set on the given feature. Parameters:
    dataSet: the data set to split,
    axis: the feature to split on,
    value: the feature value to keep
    """
    # Build a new list object
    retDataSet = []
    for featVec in dataSet:
        # Extract the instances that match the feature value
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):
    """
    Try splitting the data set on each feature and return the feature
    that yields the best split. The data passed in must satisfy:
    1. it is a list of lists, and all inner lists have the same length;
    2. the last column (the last element of each instance) is the
       class label of that instance.
    """
    numFeatures = len(dataSet[0]) - 1
    # Compute the baseline Shannon entropy of the whole data set
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Build the set of unique values taken by this feature
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):
    """
    Take a list of class names, build a dictionary keyed by the unique
    values in the list that stores the frequency of each class label,
    sort the dictionary by count, and return the most frequent class name.
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet, labels):
    """
    Build the decision tree.
    """
    # Collect all the class labels in the data set
    classList = [example[-1] for example in dataSet]
    # Stop splitting when every instance has the same class; return that label
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # When all features are used up and the data still cannot be split
    # into groups of a single class, return the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # Choose the best feature to split on
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    # myTree stores all the information of the tree
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    # Collect all the values taken by the chosen feature
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

def classify(inputTree, featLabels, testVec):
    """
    Classification function for an existing decision tree.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # Find the index of the feature that matches firstStr
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel

def storeTree(inputTree, filename):
    """
    Persist a decision tree with the pickle module.
    """
    import pickle
    # pickle writes binary data, so the file must be opened in 'wb' mode
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    """
    Load a pickled decision tree from disk.
    """
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)
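
To see the pieces working together, here is a minimal usage sketch (assuming the listing above is saved as trees.py in the working directory). It builds the tree from the sample data, then classifies a new instance; a copy of labels is passed to createTree because that function deletes entries from the list it receives.

import trees

myDat, labels = trees.createDataSet()
print(trees.calcShannonEnt(myDat))           # entropy of the full data set, ~0.971
myTree = trees.createTree(myDat, labels[:])  # pass a copy; createTree mutates labels
print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(trees.classify(myTree, labels, [1, 0]))  # flippers=0, so this prints 'no'
trees.storeTree(myTree, 'classifierStorage.txt')   # persist the tree to disk
print(trees.grabTree('classifierStorage.txt'))     # and load it back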

treePlotter.py

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt

# Node and arrow styles used throughout the plot
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    # Draw a single annotated node with an arrow from its parent
    createPlot.ax1.annotate(nodeTxt, xy=parentPt,
                            xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)

def createPlot():
    # Early demo version from the chapter; superseded by the
    # createPlot(inTree) defined at the bottom of this file.
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    createPlot.ax1 = plt.subplot(111, frameon=False)
    plotNode('decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
    plotNode('leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
    plt.show()

def getNumLeafs(myTree):
    # Count the leaf nodes by recursing into every dict-valued branch
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    # The depth is the longest chain of decision nodes from the root
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

def retrieveTree(i):
    # Two hand-built trees for testing the plotting functions
    listOfTrees = [{'no surfacing': {0: 'no', 1: {'flippers':
                       {0: 'no', 1: 'yes'}}}},
                   {'no surfacing': {0: 'no', 1: {'flippers':
                       {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}]
    return listOfTrees[i]

def plotMidText(cntrPt, parentPt, txtString):
    # Label the branch between parent and child with the feature value
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
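
A quick way to exercise the plotting functions is to render one of the hand-built test trees; a minimal sketch, assuming the listing above is saved as treePlotter.py:

import treePlotter

myTree = treePlotter.retrieveTree(0)
print(treePlotter.getNumLeafs(myTree))   # 3 leaf nodes
print(treePlotter.getTreeDepth(myTree))  # depth 2
treePlotter.createPlot(myTree)           # draws the tree in a matplotlib window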

Tree Plots

[Figure: tree plot with two branches]

[Figure: decision tree produced by the ID3 algorithm]

Summary

A decision tree classifier works like a flowchart with terminating blocks, where each terminating block represents a classification result. Processing starts by measuring the inconsistency of the data in a set, i.e., its entropy, computed as $$ H = -\sum_{i=1}^n p(x_i)\log_2 p(x_i) $$. The algorithm then searches for the best way to split the data set, and repeats until all the data in each subset belong to the same class.
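
As a concrete check, the sample data set built by createDataSet contains two 'yes' and three 'no' instances, so its entropy is

$$ H = -\frac{2}{5}\log_2\frac{2}{5} - \frac{3}{5}\log_2\frac{3}{5} \approx 0.971 $$

which matches the value calcShannonEnt returns for that data set.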