Updated 2015-03-25 14:52:42 by AMG

NEM 2015-03-24: I've been playing with Bloom Filters at work recently, and the idea is quite fascinating. They can represent vast numbers of elements in a very compact form (a few bits per element) at the cost of some probability of false positives when testing for membership. As an exercise, I thought I'd port the core of Guava's BloomFilter implementation into Tcl. The result is not particularly optimised - e.g., SHA-1 is probably not the most performant choice of hash, but it serves to illustrate the approach. It's had limited testing, but appears to basically work.

The actual bits of the bloom filter are encoded into a Tcl bignum, so should be fairly compact and efficient.
# bloomfilter.tcl --
#
#       Simple Bloom Filter implementation based on the Guava BloomFilter implementation:
#       https://github.com/google/guava/blob/master/guava/src/com/google/common/hash/BloomFilterStrategies.java
#       Copyright (C) 2011 The Guava Authors
#
# Copyright (c) 2015 Neil Madden.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# 

package require Tcl         8.5
package require sha1        2.0
package provide bloomfilter 0.1

namespace eval ::bloomfilter {
    namespace export create add mightcontain stats
    namespace ensemble create

    # Creates an empty bloom filter with the given capacity and false positive probability (fpp).
    #
    #   capacity    the expected number of elements to be inserted into the bloom filter
    #   fpp         the desired probability of false positives from the mightcontain method
    #
    proc create {capacity fpp} {
        set bitSize [optimalNumBits $capacity $fpp]
        set numHashes [optimalNumHashes $capacity $bitSize]
        
        return [dict create capacity $capacity fpp $fpp size $bitSize hashes $numHashes bits 0 ones 0]
    }

    # Adds an element to the bloom filter, returning a new bloom filter with the added element.
    #
    #   bf          the bloom filter to add an element to.
    #   element     the element to add.
    proc add {bf element} {
        # Hashing algorithm is based on Guava's MURMUR128_MITZ_64 strategy, which is itself 
        # based on the paper "Less Hashing, Same Performance: Building a Better Bloom Filter" by
        # Adam Kirsch and Michael Mitzenmacher. The basic idea is to use two hashes (or, in
        # this cases, one large hash split into two) and then just combine them to generate
        # additional hashes as required.
        set hash [expr 0x[sha1::sha1 -hex $element]]
        set hash1 [expr {$hash         & 0xFFFFFFFFFFFFFFFF}]
        set hash2 [expr {($hash >> 64) & 0xFFFFFFFFFFFFFFFF}]

        set numHashes [dict get $bf hashes]
        set size [dict get $bf size]
        set bits [dict get $bf bits]
        
        set newbits 0
        set combinedHash $hash1
        for {set i 0} {$i < $numHashes} {incr i} {
            set index [expr {$combinedHash % $size}]
            incr newbits [expr {$bits & (1<<$index) ? 0 : 1}]
            set bits [expr {$bits | (1 << $index)}]
            incr combinedHash $hash2
        }

        dict set bf bits $bits
        dict incr bf ones $newbits

        return $bf
    }

    # Determines whether this bloom filter *might* contain the given element or not. If the answer
    # is no then the bloom filter definitely does not contain the element, but if the answer is yes,
    # then there is a small probability (bounded by the false positive probability specified when
    # creating the bloom filter) that the element is not actually in the set. If more elements have
    # been inserted into the set than the initial capacity estimate, then the probability of false
    # positives will be significantly higher.
    #
    #   bf          the bloom filter to test this element for membership.
    #   element     the element to check for possible containment in the bloom filter.
    #
    proc mightcontain {bf element} {
        set hash [expr 0x[sha1::sha1 -hex $element]]
        set hash1 [expr {$hash         & 0xFFFFFFFFFFFFFFFF}]
        set hash2 [expr {($hash >> 64) & 0xFFFFFFFFFFFFFFFF}]
        
        set numHashes [dict get $bf hashes]
        set size [dict get $bf size]
        set bits [dict get $bf bits]

        set combinedHash $hash1
        for {set i 0} {$i < $numHashes} {incr i} {
            set index [expr {$combinedHash % $size}]

            if {($bits & (1 << $index)) == 0} {
                return no
            }

            incr combinedHash $hash2
        }

        return yes
    }

    # Returns some stats on the current state of the bloom filter as a dictionary with the following
    # keys:
    #   capacity        the configured capacity of the bloom filter
    #   fpp             a dictionary with two sub-keys:
    #       configured  the original configured false positive probability
    #       expected    the expected false positive probability given the current state
    #   remaining       an estimate of the number of additional elements that can be inserted before
    #                   the bloom filter exceeds the false positive probability
    proc stats bf {
        return [dict create \
            capacity        [dict get $bf capacity] \
            fpp             [dict create configured [dict get $bf fpp] \
                                         expected   [expectedFpp $bf]] \
            remaining       [remainingEstimate $bf]]
    }

    proc expectedFpp bf {
        expr {pow(double([dict get $bf ones]) / double([dict get $bf size]), [dict get $bf hashes])}
    }

    proc cardinality bf {
        # see http://en.wikipedia.org/wiki/Bloom_filter#Approximating_the_number_of_items_in_a_Bloom_filter
        expr {entier(-(([dict get $bf size] * 
            log(1 - [dict get $bf ones]/double([dict get $bf size]))) / [dict get $bf hashes]))}
    }

    proc remainingEstimate bf {
        expr {[dict get $bf capacity] - [cardinality $bf]}
    }


    # Calculates the optimal number of bits needed to store $capacity elements at $fpp probability of false positives
    proc optimalNumBits {capacity fpp} {
        expr {entier(-$capacity * log($fpp) / (log(2) * log(2)))}
    }

    # Calculates the optimal number of hashes for storing $capacity elements in a bitvector of $bitSize size
    proc optimalNumHashes {capacity bitSize} {
        expr {max(1, round(double($bitSize) / $capacity * log(2)))}
    }
}