Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Readme.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
Python FP-Growth
================

This module provides a pure Python implementation of the FP-growth algorithm for
finding frequent itemsets. FP-growth exploits an (often-valid) assumption that
many transactions will have items in common to build a prefix tree. If the
assumption holds true, this tree produces a compact representation of the actual
transactions and is used to generate itemsets much faster than *Apriori* can.
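## Concept
Association Rule (연관규칙)을 적용하기 위해서는 각 item들이 각 itemset에서 어떤 빈도로 나타났고, 어떤 item과 함께 나왔는지 세는 것이 필수이다. 하지만 데이터셋이 큰 경우, 이를 모든 후보 itemset들에 대해서 하나하나 검사하는 것은 굉장히 비효율적이다. 이러한 문제를 해결하기 위해 제시된 것이 FP-growth algorithm이다.

(English: To apply association rules, it is essential to count how often each item appears across the itemsets and which other items it appears with. When the dataset is large, however, checking every candidate itemset one by one is very inefficient. The FP-growth algorithm was proposed to solve this problem.)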
Copy link

@daavve daavve Oct 13, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recommend that we either retain the old English explanation or add an English section translated from the Korean text above.


Reference:
https://process-mining.tistory.com/92

Installation
------------
Expand Down Expand Up @@ -84,4 +84,4 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

[me]: http://github.com/enaeseth/
[pypi]: http://pypi.python.org/
[pypi]: http://pypi.python.org/
78 changes: 50 additions & 28 deletions fp_growth.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@
"""

from collections import defaultdict, namedtuple
from itertools import imap

__author__ = 'Eric Naeseth <[email protected]>'
__copyright__ = 'Copyright © 2009 Eric Naeseth'
__license__ = 'MIT License'
# from itertools import imap

__author__ = "Eric Naeseth <[email protected]>"
__copyright__ = "Copyright © 2009 Eric Naeseth"
__license__ = "MIT License"


def find_frequent_itemsets(transactions, minimum_support, include_support=False):
"""
Expand All @@ -31,7 +33,7 @@ def find_frequent_itemsets(transactions, minimum_support, include_support=False)
If `include_support` is true, yield (itemset, support) pairs instead of
just the itemsets.
"""
items = defaultdict(lambda: 0) # mapping from items to their supports
items = defaultdict(lambda: 0) # mapping from items to their supports

# Load the passed-in transactions and count the support that individual
# items have.
Expand All @@ -40,19 +42,25 @@ def find_frequent_itemsets(transactions, minimum_support, include_support=False)
items[item] += 1

# Remove infrequent items from the item support dictionary.
items = dict((item, support) for item, support in items.iteritems()
if support >= minimum_support)
items = dict(
(item, support)
# for item, support in items.iteritems()
for item, support in items.items()
if support >= minimum_support
)

# Build our FP-tree. Before any transactions can be added to the tree, they
# must be stripped of infrequent items and their surviving items must be
# sorted in decreasing order of frequency.
def clean_transaction(transaction):
transaction = filter(lambda v: v in items, transaction)
transaction.sort(key=lambda v: items[v], reverse=True)
# transaction.sort(key=lambda v: items[v], reverse=True)
transaction = sorted(transaction, key=lambda v: items[v], reverse=True)
return transaction

master = FPTree()
for transaction in imap(clean_transaction, transactions):
# for transaction in imap(clean_transaction, transactions):
for transaction in list(map(clean_transaction, transactions)):
master.add(transaction)

def find_with_suffix(tree, suffix):
Expand All @@ -67,12 +75,13 @@ def find_with_suffix(tree, suffix):
# itemsets within it.
cond_tree = conditional_tree_from_paths(tree.prefix_paths(item))
for s in find_with_suffix(cond_tree, found_set):
yield s # pass along the good news to our caller
yield s # pass along the good news to our caller

# Search for frequent itemsets, and yield the results we find.
for itemset in find_with_suffix(master, []):
yield itemset


class FPTree(object):
"""
An FP tree.
Expand All @@ -81,7 +90,7 @@ class FPTree(object):
(i.e., all items must be valid as dictionary keys or set members).
"""

Route = namedtuple('Route', 'head tail')
Route = namedtuple("Route", "head tail")

def __init__(self):
# The root node of the tree.
Expand Down Expand Up @@ -124,7 +133,7 @@ def _update_route(self, point):

try:
route = self._routes[point.item]
route[1].neighbor = point # route[1] is the tail
route[1].neighbor = point # route[1] is the tail
self._routes[point.item] = self.Route(route[0], point)
except KeyError:
# First node for this item; start a new route.
Expand All @@ -136,7 +145,8 @@ def items(self):
element of the tuple is the item itself, and the second element is a
generator that will yield the nodes in the tree that belong to the item.
"""
for item in self._routes.iterkeys():
# for item in self._routes.iterkeys():
for item in self._routes:
yield (item, self.nodes(item))

def nodes(self, item):
Expand Down Expand Up @@ -167,15 +177,16 @@ def collect_path(node):
return (collect_path(node) for node in self.nodes(item))

def inspect(self):
print 'Tree:'
print("Tree:")
self.root.inspect(1)

print
print 'Routes:'
print()
print("Routes:")
for item, nodes in self.items():
print ' %r' % item
print(" %r" % item)
for node in nodes:
print ' %r' % node
print(" %r" % node)


def conditional_tree_from_paths(paths):
"""Build a conditional FP-tree from the given prefix paths."""
Expand Down Expand Up @@ -212,6 +223,7 @@ def conditional_tree_from_paths(paths):

return tree


class FPNode(object):
"""A node in an FP tree."""

Expand Down Expand Up @@ -312,7 +324,7 @@ def children(self):
return tuple(self._children.itervalues())

def inspect(self, depth=0):
print (' ' * depth) + repr(self)
print((" " * depth) + repr(self))
for child in self.children:
child.inspect(depth + 1)

Expand All @@ -322,21 +334,31 @@ def __repr__(self):
return "<%s %r (%r)>" % (type(self).__name__, self.item, self.count)


if __name__ == '__main__':
if __name__ == "__main__":
from optparse import OptionParser
import csv

p = OptionParser(usage='%prog data_file')
p.add_option('-s', '--minimum-support', dest='minsup', type='int',
help='Minimum itemset support (default: 2)')
p.add_option('-n', '--numeric', dest='numeric', action='store_true',
help='Convert the values in datasets to numerals (default: false)')
p = OptionParser(usage="%prog data_file")
p.add_option(
"-s",
"--minimum-support",
dest="minsup",
type="int",
help="Minimum itemset support (default: 2)",
)
p.add_option(
"-n",
"--numeric",
dest="numeric",
action="store_true",
help="Convert the values in datasets to numerals (default: false)",
)
p.set_defaults(minsup=2)
p.set_defaults(numeric=False)

options, args = p.parse_args()
if len(args) < 1:
p.error('must provide the path to a CSV file to read')
p.error("must provide the path to a CSV file to read")

transactions = []
with open(args[0]) as database:
Expand All @@ -351,8 +373,8 @@ def __repr__(self):

result = []
for itemset, support in find_frequent_itemsets(transactions, options.minsup, True):
result.append((itemset,support))
result.append((itemset, support))

result = sorted(result, key=lambda i: i[0])
for itemset, support in result:
print str(itemset) + ' ' + str(support)
print(str(itemset) + " " + str(support))