FP-Growth是最常见的关联分析算法之一,其基本步骤是:
(1)对事务数据采用一棵FP树进行压缩存储
(2)FP树被构造出来,再使用一种递归的分而治之的方法来挖掘频繁项集
import csv
from collections import defaultdict, namedtuple
from optparse import OptionParserfrom fp_node import FPNode
from fp_tree import FPTreedef conditional_tree_from_paths(paths, minimum_support):"""Builds a conditional FP-tree from the given prefix paths."""''' 使用一个项标号对应的所有追溯路径建立该项标号的一个条件FP树'''tree = FPTree() #初始化condition_item = None #要为其建立条件FP树的项标号items = set() #路径(树)中涉及到的所有项标号for path in paths: #将每个路径添加到FP树中if condition_item is None:condition_item = path[-1].item #将路径中的最后一个项标号作为要为其建立条件FP树的项标号point = #从根节点开始(其是起始父节点)for node in path:next_point = point.search(node.item) if not next_point: #若父节点中不存在该标号的子节点items.add(node.item) #记录新出现的项标号count = unt if node.item == condition_item else 0 #除了要为其建立条件FP数的项标号对应的节点(其Count来自原始FP树,其他节点Count都先设置为0,下面再计算)next_point = FPNode(tree, node.item, count) #建立一个新的FPNodepoint.add(next_point) #将其添加为父节点的子节点tree._update_route(next_point) #将其添加到对应项标号的链表中point = next_point #下移一层,继续处理路径的下一个节点assert condition_item is not None''' 计算除了condition_item标号以外节点的Count(通过路径数)'''for path in tree.prefix_paths(condition_item): #在刚建立的条件FP树上,找出以condition_item结尾的所有路径count = path[-1].count #对每条路径,取出其最后一个节点(其实就是路径最后的condition_item标号的节点)的countfor node in reversed(path[:-1]):node._count += count #将路径上所有节点的count用最后一个节点(condition_item标号的节点)的count修正''' 计算每个涉及的项标号的支持度,如果小于阈值,将其所有节点从树中剪掉(注意要处理被剪掉节点子树上的count)'''for item in items:support = unt for n des(item))if support < minimum_support:# Doesn't make the cut anymorefor node des(item):if node.parent is not None:ve(node)''' 删除每条路径的最后一个节点(也就是condition_item标号的节点),这样以其为最后节点的所有路径的条件子树就构建起来了'''for node des(condition_item):if node.parent is not None:ve(node)return treedef find_frequent_itemsets(transactions, minimum_support, include_support=False):'''FINDS FREQUENT ITEMSETS IN THE GIVEN TRANSACTIONS'''''' 计算频繁集'''items = defaultdict(lambda:0) #每个(经过预处理后)项标号对应的支持度字典(1-项集的支持度字典)processed_transactions = [] #保存预处理后获取的事务集''' 可以预处理下事务集,比如删除某些项编号,排除某些事务等,这里未做任何处理'''for transaction in transactions: transaction = transaction #transaction[0].split() processed = []for item in transaction:items[item] += 1processed.append(item)processed_transactions.append(processed)items = dict((item, support) for item, support in items.items()if support >= minimum_support) #用阈值筛选出频繁的1-项集def clean_transaction(transaction):'''STRIPS TRANSACTIONS OF INFREQUENT ITEMS AND SURVIVING ITEMS ARE SORTED IN DECREASING ORDER OF FREQUENCY'''''' 按照预处理的结果,自每个事务中删除已经排除的项标号,同时将事务中的每个项标号按照其支持度大小排序(这样构造FP树后,支持度低的都在树的底层,便于修剪'''transaction = list(filter(lambda v: v in items, transaction)) transaction.sort(key=lambda v: items[v], reverse=True) return transaction''' 构建FP树'''master = FPTree()for transaction in map(clean_transaction, processed_transactions): #对数据进行预处理master.add(transaction) #将事务(路径)逐个添加到FP树# master.inspect()def find_with_suffix(tree, suffix):''' 后缀法沿树逐层向上寻找频繁项集(注意,这里第一次用的是原始FP树,下层递归就都是用的重新构建的条件FP树了)'''for item, nodes in tree.items(): #对每个项标号处理(注意由于建立FP树时,事务中都是支持度高的项在前面,会优先处理支持度高的项标号)support = unt for n in nodes) #计算该项标号的支持度if support >= minimum_support: #如果满足支持度阈值,找到一个频繁集found_set = [item] + suffixyield (found_set, support) if include_support else found_set'''Build a conditional tree and recursively search for frequent itemsets within it.'''''' 从该标号对应的节点继续向上追溯,找出所有路径,构造条件FP树'''cond_tree = conditional_tree_from_paths(tree.prefix_paths(item),minimum_support)''' 在条件FP树上继续寻找频繁的前缀路径'''for found_suffix in find_with_suffix(cond_tree, found_set):yield found_suffix'''Search for frequent itemsets, and yield the results we find.'''''' 搜索频繁集并返回迭代器'''for itemset in find_with_suffix(master, []):yield itemsetif __name__ == '__main__':data = []with open('data/transaction.csv','r') as csvf:lines = ader(csvf)data = list(lines) minsup = 2ffi = find_frequent_itemsets(data, minsup, True) for itemset, support in ffi:print('{' + ', '.join(itemset) + '} ' + str(support))
class FPNode(object):'''A NODE IN AN FP TREE'''''' FP树的节点类''' def __init__(self, tree, item, count=1):self._tree = tree #节点所属的树self._item = item #节点对应的item的标号self._count = count #通过节点的路径的个数(对树的根节点,总为None)self._parent = None #节点的父节点(对树的根节点,总为None)self._children = {} # 包含各子节点的字典self._neighbour = None # 链接的下一个节点(用于每个item节点链表的跟踪指针)def __repr__(self)::return '<%s (root)>' % type(self).__name__return '<%s %r (%r)>' % (type(self).__name__, self.item, unt)def add(self, child):'''ADDS GIVEN NODE AS A CHILD OF THE CURRENT NODE'''''' 添加一个子节点 '''if not isinstance(child, FPNode):raise TypeError("ERROR: CHILD TO BE ADDED MUST BE FPNode")if not child.item in self._children: #如果尚不存在该标号的子节点才添加self._children[child.item] = child #创建新节点child.parent = self #设置新节点的父节点为本节点def search(self, item):'''CHECKS IF CURRENT NODE HAS A CHILD NODE FOR THE GIVEN ITEM'''''' 在子节点字典中搜索一个特定项标号的节点并返回'''try:return self._children[item]except KeyError:return Nonedef remove(self, child):'''REMOVES CHILD NODE FROM CHILDREN OF CURRENT NODE'''''' 删除指定的子节点 '''try:if self._children[child.item] is child:del self._children[child.item] #从字典中删除child.parent = None #设置被删除的子节点父节点为空self._tree._removed(child) #同时从item链中删除该节点for sub_child in child.children: #处理被删除子节点的各个子节点,将其转移到当前节点中try:self._children[sub_child.item]._count += unt #如果该节点标号在当前节点的子节点中存在,直接将其Count加到当前节点的子节点里sub_child.parent = None #节点父节点置空except KeyError:self.add(sub_child) #否则,将该节点添加为当前节点的子节点child._children = {} #清空被删除子节点的子节点字典else:raise ValueError('ERROR: CHILD TO BE REMOVED IS NOT THE CHILD OF THIS NODE')except:raise ValueError('ERROR: CHILD TO BE REMOVED IS NOT THE CHILD OF THIS NODE')def __contains__(self, item):''' 是否包含特定项标号的子节点'''return item in self._children @propertydef tree(self):'''RETURNS THE TREE TO WHICH CURRENT NODE BELONGS'''''' 返回所属的树'''return self._tree @propertydef item(self):'''RETURNS ITEM CONTAINED IN CURRENT NODE'''''' 返回对应的项标号'''return self._item @propertydef count(self):'''RETURNS THE COUNT OF CURRENT NODE'S ITEM'''''' 返回经过节点的路径条数'''return self._count def increment(self):'''INCREMENTS THE COUNT OF CURRENT NODE'S ITEM'''''' 路径条数+1'''if self._count is None:raise ValueError('ERROR: ROOT NODE HAS NO COUNT')self._count += 1 @propertydef root(self):'''CHECKS IF CURRENT NODE IS ROOT OF THE FP TREE'''''' 是否是一个根节点'''return self._item is None and self._count is None@propertydef leaf(self):'''CHECKS IF CURRENT NODE IS NODE OF THE FP TREE'''''' 是否是一个叶子节点'''return len(self._children) == 0 def parent():''' 父节点获取或设置'''def fget(self):return self._parentdef fset(self, value):if value is not None and not isinstance(value, FPNode):raise TypeError('ERROR: A NODE MUST HAVE AN FP NODE AS A PARENT')if value is :raise ValueError('ERROR: NODE OF ONE TREE CANNOT HAVE PARENT FROM ANOTHER TREE')self._parent = valuereturn locals()parent = property(**parent()) def neighbour():''' 链接的下一个节点的获取和设置'''def fget(self):return self._neighbourdef fset(self, value):if value is not None and not isinstance(value, FPNode):raise TypeError('ERROR: A NODE MUST HAVE AN FP NODE AS A NEIGHBOUR')if value is :raise ValueError('ERROR: NODE OF ONE TREE CANNOT HAVE NEIGHBOUR FROM ANOTHER TREE')self._neighbour = valuereturn locals()neighbour = property(**neighbour())@propertydef children(self):'''RETURNS CHILDREN OF CURRENT NODE'''''' 返回所有子节点构成的元组'''return tuple(self._children.values())def inspect(self, depth=0):''' 节点输出(控制台)'''print(' ' * depth + repr(self))for child in self.children:child.inspect(depth + 1)
from collections import namedtuple
from fp_node import FPNodeclass FPTree(object):'''FP TREE STRUCTURE'''''' FP树结构类'''Route = namedtuple('Route', 'head tail') #一个简化类(命名元组),用于描述每个项标号所有的FPNode的链表,其实同时包含头尾两个FPNode的引用,通过FPNode的neibour指针遍历获取所有FPNodedef __init__(self):''' 初始化根节点和各项标号链表使用的字典 '''self._root = FPNode(self, None, None) #根节点self._routes = {} #以项标号为key,包含每个项的FP节点列表(其实是上面的命名元组,仅包含头尾两个FPNode)@propertydef root(self):'''RETURNS ROOT OF THE FP TREE'''''' 返回根节点'''return self._rootdef add(self, transaction):'''ADDS A TRANSACTION TO THE TREE'''''' 向树中添加一个事务(或一条路径)'''point = self._root #从根节点开始for item in transaction: #按事务中的项排序沿树逐层向下查找next_point = point.search(item) if next_point: #父节点的子节点字典中以前已经建立了该项标号的子节点 next_point.increment() #直接给找到的子节点计数+1else:next_point = FPNode(self, item) #否则新建一个节点(count默认为1)point.add(next_point) #添加为父节点的一个子节点self._update_route(next_point) #添加到相应项标号的节点链表中point = next_point #向下移动到当前节点,继续处理下一层的节点def _update_route(self, point):'''ADD THE NODE POINT TO THE ROUTE THROUGH ALL NODES FOR ITS ITEM'''''' 将新建节点添加到对应项标号的节点列表中'''assert self y:route = self._routes[point.item] #如果项标号对应的列表已经存在(建立过)route[1].neighbour = point #将新节点链接到尾节点(作为尾节点的下一个)self._routes[point.item] = self.Route(route[0], point) #修正尾节点为刚添加的节点 except KeyError:self._routes[point.item] = self.Route(point, point) #如果没有建立过,新建项编号的链表,首尾节点引用都是当前添加的节点def items(self):'''GENERATE 2-TUPLES FOR EACH ITEM OF THE FORM (ITEM, GENERATOR)'''''' 返回一个迭代器,提供项标号和其所有FPNode的迭代器'''for item in self._routes:yield(item, des(item))def nodes(self, item):'''GENERATES THE SEQUENCE OF NODES THAT CONTAIN THE GIVEN ITEM'''''' 对于给定的项标号,返回一个枚举其所有FPNode的迭代器'''try:node = self._routes[item][0]except KeyError:returnwhile node:yield nodenode = ighbourdef prefix_paths(self, item):'''GENERATES PREFIX PATHS ENDING WITH CURRENT ITEM'''''' 给定一个类标号,返回所有以该标号结尾的路径(该操作用于获取构造针对特定项标号的条件FP树所需要的所有路径)'''def collect_path(node):''' 从给定的node向上追溯到树的根,返回追溯获取的路径'''path = []while node and :path.append(node)node = verse()return pathreturn (collect_path(node) for node des(item))def inspect(self):''' 输出树(控制台),包括树结构和各项标号及其所有FPNode'''print('nTREE:')inspect(1)print('nROUTES:')for item, nodes in self.items():print('%r' % item)for node in nodes:print('%r' % node)def _removed(self, node_to_remove):'''PERFORMS CLEANUP DURING REMOVAL OF A NODE'''''' 自项标号链表中删除一个节点'''head, tail = self._routes[node_to_remove.item] #获取链表的首节点和尾节点if node_to_remove is head: #如果要删除的节点是首节点if node_to_remove is tail or not node_ighbour: #如果只有一个节点# It was the sole node.del self._routes[node_to_remove.item] #直接从字典中删除该项标号及其链表else:self._routes[node_to_remove.item] = self.Route(node_ighbour, tail) #否则修正命名元组中的首节点else: #如果要删除的不是首节点for node des(node_to_remove.item):ighbour is node_to_remove: #找到其前一个节点ighbour = node_ighbour # skip over #修改前一个节点的neibour指针if node_to_remove is tail: self._routes[node_to_remove.item] = self.Route(head, node) #如果删除的是尾节点,修正命名元组中的尾节点break
本文发布于:2024-02-04 18:23:58,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170713581458270.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |