挖掘频繁模式主要有三种挖掘算法,分别是Apriori算法、Eclat算法、FP-growth算法。Apriori算法通过不断的构造候选集、筛选候选集挖掘出频繁项集,需要多次扫描原始数据,效率较低。Eclat算法使用垂直数据格式挖掘频繁项集,只用扫描一次数据集,效率较高。FP-growth算法,通过对数据集两次扫描,对数据进行压缩处理,效率高于Apriori。
FP-growth算法构建,我主要分为五步:
- 数据集的清洗
- 创建FP树和链表
- 条件模式基
- 条件FP树
- 构造频繁项集
FP-growth算法:
def loadDataSet(): dataSet = [['l1', 'l2', 'l5'], ['l2', 'l4'], ['l2', 'l3'], ['l1', 'l2', 'l4'], ['l1', 'l3'], ['l2', 'l3'], ['l1', 'l3'], ['l1', 'l2', 'l3', 'l5'], ['l1', 'l2', 'l3'] ] return dataSet def initDataset(dataSet): resDataSet = {} for items in dataSet: for item in items: if item in resDataSet: resDataSet[item] += 1 else: resDataSet[item] = 1 return resDataSet def filterDataSet(resDataSet, minSupp=2): for key in resDataSet: supp = resDataSet[key] if supp >= minSupp: continue else: del resDataSet[key] resDataSet = sorted(resDataSet.items(), key=lambda d: d[1], reverse=True) left = 0 right = 1 while right < len(resDataSet): x = resDataSet[left] y = resDataSet[right] if x[1] > y[1]: right += 1 left += 1 else: if x[0] > y[0]: temp = resDataSet[left] resDataSet[left] = resDataSet[right] resDataSet[right] = temp else: right += 1 left += 1 resDataSet = dict(resDataSet) return resDataSet # 创建PF树
#dataSet 数据集 def createFPtree(dataSet): resDataSet = initDataset(dataSet) resDataSet = filterDataSet(resDataSet) headerTable = resDataSet freqItemSet = set(headerTable.keys()) if len(freqItemSet) == 0: return None, None for i in headerTable: # 链表 headerTable[i] = [headerTable[i], None] # FP树,创建树根 # print(headerTable.items()) retTree = treeNode("Null Set", 1, None) y = {} for items in dataSet: x = initDataset([[frozenset(items)]]) # 例:frozenset({'l1', 'l5', 'l2'}): 1 for key, value in x.items(): if key in y: y[key] += value else: y[key] = value # dict(sorted(y.items(), key=lambda d: d[1])) # print(y) for itemSets, count in y.items(): sortPattern = {} for i in itemSets: if i in freqItemSet: sortPattern[i] = headerTable[i][0] # print("mcm", sortPattern) # 例:{'l1': 5, 'l5': 2, 'l2': 6} if len(sortPattern) > 0: itemSet = list(filterDataSet(sortPattern).keys()) # print('苟富贵', itemSet) updateTree(itemSet, retTree, headerTable, count) return retTree, headerTable # 更新链表
#testNode 是链表的指针,targetNode 是树节点 def updateNodeLink(testNode, targetNode): while testNode.nodeLink is not None: testNode = testNode.nodeLink testNode.nodeLink = targetNode # 更新树节点
# itemSet是每次扫描数据集的事务(经过处理的),headerTable 链表 ,count支持度 def updateTree(itemSet, FPTree, headerTable, count): if itemSet[0] in FPTree.children: FPTree.children[itemSet[0]].inc(count) else: FPTree.children[itemSet[0]] = treeNode(itemSet[0], count, FPTree) if headerTable[itemSet[0]][1] is None: headerTable[itemSet[0]][1] = FPTree.children[itemSet[0]] else: updateNodeLink(headerTable[itemSet[0]][1], FPTree.children[itemSet[0]]) if len(itemSet) > 1: updateTree(itemSet[1::], FPTree.children[itemSet[0]], headerTable, count) # 回溯 def PFTreeUpTrans(leafNode, prefixPath): if leafNode.parent is not None: prefixPath.append(leafNode.name) PFTreeUpTrans(leafNode.parent, prefixPath) # 找前缀路径 def findPrefixPath(TreeNode): conPattern = {} while TreeNode is not None: prefixPath = [] PFTreeUpTrans(TreeNode, prefixPath) if len(prefixPath) > 1: prefixPath = prefixPath[::-1] conPattern[tuple(prefixPath[:len(prefixPath) - 1])] = TreeNode.count TreeNode = TreeNode.nodeLink # print(conPattern) return conPattern def filterContact(resDataSet, minSupp=2): resDataSet1 = list(resDataSet.keys()) for key in resDataSet1: supp = resDataSet[key] # print(supp) if supp >= minSupp: continue else: del resDataSet[key] return resDataSet def initContact(dataSet, conPattern): resContact = {} for items in dataSet: for item in items: if item in resContact: resContact[item] += conPattern[items] else: resContact[item] = conPattern[items] return resContact # 构造条件FP树
# conPattern 是条件模式基 def contact(conPattern): retList = [] o = conPattern # print(o) conPattern = list(sorted(o.keys(), key=lambda d: d[0], reverse=True)) # print('fdf', conPattern) if len(conPattern) == 1: retList.append(filterContact(initContact(conPattern, o))) elif len(conPattern) > 1: left = 0 right = 1 conList1 = [conPattern[left]] conList2 = [] while right < len(conPattern): if conPattern[left][0] == conPattern[right][0]: conList1.append(conPattern[right]) right += 1 retList.append(filterContact(initContact(conList1, o))) if right < len(conPattern): if conPattern[left][0] != conPattern[right][0]: left = right conList1 = conList2 conList1.append(conPattern[left]) if right + 1 > len(conPattern): retList.append(filterContact(initContact(conList1, o))) else: retList.append(filterContact(initContact(conList1, o))) right += 1 # print('条件FP数', retList) return retList #构建一个集合的所有子集,利用二进制法 def PowerSetsBinary(items): N = len(items) b = [] for i in range(2 ** N): # 子集个数,每循环一次一个子集 combo = [] for j in range(N): # 用来判断二进制下标为j的位置数是否为1 if (i >> j) % 2: combo.append(items[j]) b.append(combo) # print(b) return b[1::] # 构造频繁项集
# item是 项 ,retList是条件FP树是一个元素为字典的列表
#通过条件PF树的非空子集+项,构建频繁项集 def frequentItems(item, retList): res = {} for iii in retList: key = [ii for ii in iii.keys()] value = [ii for ii in iii.values()] subSet = PowerSetsBinary(key) for subList in subSet: subList.append(item) # print(subSet, 'hf') # print('zf', res) for ii in range(len(subSet)): if frozenset(subSet[ii]) in res: res[frozenset(subSet[ii])] += res[frozenset(subSet[ii])] # print('f', res[frozenset(subSet[ii])]) else: if ii <= len(subSet) - 2: res[frozenset(subSet[ii])] = iii[key[ii]] else: res[frozenset(subSet[ii])] = min(value) # print('产生的频繁项集', res) return res if __name__ == '__main__': retTree, headerTable = createFPtree(loadDataSet()) items = list(headerTable.keys())[::-1] # print(items) for item in items: print("--------------------------{0}--------------------------- -".format(item)) yu = findPrefixPath(headerTable[item][1]) print("{0},".format(item), "条件模式基:", yu) yi = contact(yu) print("{0},".format(item), "条件FP树:", yi) res = frequentItems(item, yi) print("{0},".format(item), "产生的频繁项集:", res) print("-----------------------------------------------------------")
树节点:
class treeNode: def __init__(self, nameValue, num, parentNode): self.name = nameValue self.count = num self.nodeLink = None self.parent = parentNode self.children = {} def inc(self, num): self.count += num
标签:FP,right,--,items,itemSet,resDataSet,growth,conPattern,headerTable From: https://www.cnblogs.com/dreamspace/p/16939720.html