diff --git a/Readme.md b/Readme.md index b15cdaa..7aef84d 100644 --- a/Readme.md +++ b/Readme.md @@ -1,11 +1,11 @@ Python FP-Growth ================ -This module provides a pure Python implementation of the FP-growth algorithm for -finding frequent itemsets. FP-growth exploits an (often-valid) assumption that -many transactions will have items in common to build a prefix tree. If the -assumption holds true, this tree produces a compact representation of the actual -transactions and is used to generate itemsets much faster than *Apriori* can. +## concept +Association Rule (연관규칙)을 적용하기 위해서는 각 item들이 각itemset에서 어떤 빈도로 나타났고, 어떤 item과 함께 나왔는지 세는 것이 필수이다. 하지만 데이터셋이 큰 경우, 이를 모든 후보 itemset들에 대해서 하나하나 검사하는 것은 굉장히 비효율적이다. 이러한 문제를 해결하기 위해 제시된 것이 FP-growth algorithm이다. + +reference) +https://process-mining.tistory.com/92 Installation ------------ @@ -84,4 +84,4 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. [me]: http://github.com/enaeseth/ -[pypi]: http://pypi.python.org/ \ No newline at end of file +[pypi]: http://pypi.python.org/ diff --git a/fp_growth.py b/fp_growth.py index 4ada47d..9c900b0 100644 --- a/fp_growth.py +++ b/fp_growth.py @@ -10,11 +10,13 @@ """ from collections import defaultdict, namedtuple -from itertools import imap -__author__ = 'Eric Naeseth ' -__copyright__ = 'Copyright © 2009 Eric Naeseth' -__license__ = 'MIT License' +# from itertools import imap + +__author__ = "Eric Naeseth " +__copyright__ = "Copyright © 2009 Eric Naeseth" +__license__ = "MIT License" + def find_frequent_itemsets(transactions, minimum_support, include_support=False): """ @@ -31,7 +33,7 @@ def find_frequent_itemsets(transactions, minimum_support, include_support=False) If `include_support` is true, yield (itemset, support) pairs instead of just the itemsets. """ - items = defaultdict(lambda: 0) # mapping from items to their supports + items = defaultdict(lambda: 0) # mapping from items to their supports # Load the passed-in transactions and count the support that individual # items have. @@ -40,19 +42,25 @@ def find_frequent_itemsets(transactions, minimum_support, include_support=False) items[item] += 1 # Remove infrequent items from the item support dictionary. - items = dict((item, support) for item, support in items.iteritems() - if support >= minimum_support) + items = dict( + (item, support) + # for item, support in items.iteritems() + for item, support in items.items() + if support >= minimum_support + ) # Build our FP-tree. Before any transactions can be added to the tree, they # must be stripped of infrequent items and their surviving items must be # sorted in decreasing order of frequency. def clean_transaction(transaction): transaction = filter(lambda v: v in items, transaction) - transaction.sort(key=lambda v: items[v], reverse=True) + # transaction.sort(key=lambda v: items[v], reverse=True) + transaction = sorted(transaction, key=lambda v: items[v], reverse=True) return transaction master = FPTree() - for transaction in imap(clean_transaction, transactions): + # for transaction in imap(clean_transaction, transactions): + for transaction in list(map(clean_transaction, transactions)): master.add(transaction) def find_with_suffix(tree, suffix): @@ -67,12 +75,13 @@ def find_with_suffix(tree, suffix): # itemsets within it. cond_tree = conditional_tree_from_paths(tree.prefix_paths(item)) for s in find_with_suffix(cond_tree, found_set): - yield s # pass along the good news to our caller + yield s # pass along the good news to our caller # Search for frequent itemsets, and yield the results we find. for itemset in find_with_suffix(master, []): yield itemset + class FPTree(object): """ An FP tree. @@ -81,7 +90,7 @@ class FPTree(object): (i.e., all items must be valid as dictionary keys or set members). """ - Route = namedtuple('Route', 'head tail') + Route = namedtuple("Route", "head tail") def __init__(self): # The root node of the tree. @@ -124,7 +133,7 @@ def _update_route(self, point): try: route = self._routes[point.item] - route[1].neighbor = point # route[1] is the tail + route[1].neighbor = point # route[1] is the tail self._routes[point.item] = self.Route(route[0], point) except KeyError: # First node for this item; start a new route. @@ -136,7 +145,8 @@ def items(self): element of the tuple is the item itself, and the second element is a generator that will yield the nodes in the tree that belong to the item. """ - for item in self._routes.iterkeys(): + # for item in self._routes.iterkeys(): + for item in self._routes: yield (item, self.nodes(item)) def nodes(self, item): @@ -167,15 +177,16 @@ def collect_path(node): return (collect_path(node) for node in self.nodes(item)) def inspect(self): - print 'Tree:' + print("Tree:") self.root.inspect(1) - print - print 'Routes:' + print() + print("Routes:") for item, nodes in self.items(): - print ' %r' % item + print(" %r" % item) for node in nodes: - print ' %r' % node + print(" %r" % node) + def conditional_tree_from_paths(paths): """Build a conditional FP-tree from the given prefix paths.""" @@ -212,6 +223,7 @@ def conditional_tree_from_paths(paths): return tree + class FPNode(object): """A node in an FP tree.""" @@ -312,7 +324,7 @@ def children(self): return tuple(self._children.itervalues()) def inspect(self, depth=0): - print (' ' * depth) + repr(self) + print((" " * depth) + repr(self)) for child in self.children: child.inspect(depth + 1) @@ -322,21 +334,31 @@ def __repr__(self): return "<%s %r (%r)>" % (type(self).__name__, self.item, self.count) -if __name__ == '__main__': +if __name__ == "__main__": from optparse import OptionParser import csv - p = OptionParser(usage='%prog data_file') - p.add_option('-s', '--minimum-support', dest='minsup', type='int', - help='Minimum itemset support (default: 2)') - p.add_option('-n', '--numeric', dest='numeric', action='store_true', - help='Convert the values in datasets to numerals (default: false)') + p = OptionParser(usage="%prog data_file") + p.add_option( + "-s", + "--minimum-support", + dest="minsup", + type="int", + help="Minimum itemset support (default: 2)", + ) + p.add_option( + "-n", + "--numeric", + dest="numeric", + action="store_true", + help="Convert the values in datasets to numerals (default: false)", + ) p.set_defaults(minsup=2) p.set_defaults(numeric=False) options, args = p.parse_args() if len(args) < 1: - p.error('must provide the path to a CSV file to read') + p.error("must provide the path to a CSV file to read") transactions = [] with open(args[0]) as database: @@ -351,8 +373,8 @@ def __repr__(self): result = [] for itemset, support in find_frequent_itemsets(transactions, options.minsup, True): - result.append((itemset,support)) + result.append((itemset, support)) result = sorted(result, key=lambda i: i[0]) for itemset, support in result: - print str(itemset) + ' ' + str(support) + print(str(itemset) + " " + str(support))