Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 21 additions & 9 deletions src/Interpreters/BloomFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <libdivide.h>


namespace DB
Expand Down Expand Up @@ -39,7 +40,8 @@ BloomFilter::BloomFilter(const BloomFilterParameters & params)
}

BloomFilter::BloomFilter(size_t size_, size_t hashes_, size_t seed_)
: size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), filter(words, 0)
: size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)),
modulus(8 * size_), divider(modulus), filter(words, 0)
{
chassert(size != 0);
chassert(hashes != 0);
Expand All @@ -49,6 +51,8 @@ void BloomFilter::resize(size_t size_)
{
size = size_;
words = ((size + sizeof(UnderType) - 1) / sizeof(UnderType));
modulus = 8 * size;
divider = libdivide::divider<size_t, libdivide::BRANCHFREE>(modulus);
filter.resize(words);
}

Expand All @@ -57,11 +61,15 @@ bool BloomFilter::find(const char * data, size_t len)
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);

size_t acc = hash1;
for (size_t i = 0; i < hashes; ++i)
{
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))))
/// acc is advanced by hash2 at the end of every iteration, so at this point acc == hash1 + hash2 * i
/// and therefore pos == (hash1 + hash2 * i + i * i) % (8 * size).
size_t pos = fastMod(acc + i * i);
if (!(filter[pos / word_bits] & (1ULL << (pos % word_bits))))
return false;
acc += hash2;
}
return true;
}
Expand All @@ -71,10 +79,14 @@ void BloomFilter::add(const char * data, size_t len)
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);

size_t acc = hash1;
for (size_t i = 0; i < hashes; ++i)
{
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
/// acc is advanced by hash2 at the end of every iteration, so at this point acc == hash1 + hash2 * i
/// and therefore pos == (hash1 + hash2 * i + i * i) % (8 * size).
size_t pos = fastMod(acc + i * i);
filter[pos / word_bits] |= (1ULL << (pos % word_bits));
acc += hash2;
}
}

Expand Down Expand Up @@ -111,14 +123,14 @@ bool operator== (const BloomFilter & a, const BloomFilter & b)

/// Sets one bit of `filter` for an already computed hash value.
/// `hash` and `hash_seed` are combined with CityHash's Hash128to64, the result is reduced
/// modulo 8 * size to get a bit position, and that bit is OR-ed into the word that holds it.
/// NOTE(review): this body appears to be a rendered diff hunk with both revisions interleaved:
/// the first two statements (explicit `% (8 * size)`) look like the removed lines, the last two
/// (fastMod / word_bits) like their replacements. `pos` is declared twice, so only one pair can
/// exist in the real file — confirm against the repository before relying on this text.
void BloomFilter::addHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
{
/// Variant using the explicit modulo and word-size expressions.
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
/// Variant using fastMod (modulo by the cached `modulus`) and word_bits (8 * sizeof(UnderType)).
size_t pos = fastMod(CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)));
filter[pos / word_bits] |= (1ULL << (pos % word_bits));
}

/// Tests the single bit of `filter` that corresponds to an already computed hash value.
/// The bit position is derived the same way as in addHashWithSeed: Hash128to64 over
/// (hash, hash_seed), reduced modulo 8 * size. Returns true if that bit is set.
/// NOTE(review): this body appears to be a rendered diff hunk with both revisions interleaved:
/// the first two statements (explicit `% (8 * size)`) look like the removed lines, the last two
/// (fastMod / word_bits) like their replacements. `pos` is declared twice and the second pair
/// follows a `return`, so only one pair can exist in the real file — confirm against the repository.
bool BloomFilter::findHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
{
/// Variant using the explicit modulo and word-size expressions.
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
return bool(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType)))));
/// Variant using fastMod (modulo by the cached `modulus`) and word_bits (8 * sizeof(UnderType)).
size_t pos = fastMod(CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)));
return bool(filter[pos / word_bits] & (1ULL << (pos % word_bits)));
}

DataTypePtr BloomFilter::getPrimitiveType(const DataTypePtr & data_type)
Expand Down
17 changes: 12 additions & 5 deletions src/Interpreters/BloomFilter.h
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
#pragma once

#include <vector>
#include <base/types.h>
#include <Core/Field.h>
#include <Common/PODArray.h>
#include <Common/Allocator.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/IDataType.h>
#include <libdivide.h>

//#include <vector>
//#include <Common/PODArray.h>
//#include <Common/Allocator.h>
//#include <Columns/ColumnVector.h>


namespace DB
Expand Down Expand Up @@ -58,12 +59,18 @@ class BloomFilter
friend bool operator== (const BloomFilter & a, const BloomFilter & b);
private:

static constexpr size_t word_bits = 8 * sizeof(UnderType);

size_t size;
size_t hashes;
size_t seed;
size_t words;
size_t modulus; /// 8 * size, cached for fast modulo.
libdivide::divider<size_t, libdivide::BRANCHFREE> divider; /// Divider for fast modulo by modulus.
Container filter;

/// Returns the remainder of `value` divided by `modulus`.
/// The quotient is obtained through the precomputed libdivide `divider`, and the remainder is
/// then recovered as value - quotient * modulus.
inline size_t fastMod(size_t value) const
{
    const size_t quotient = value / divider;
    return value - quotient * modulus;
}

public:
static ColumnPtr getPrimitiveColumn(const ColumnPtr & column);
static DataTypePtr getPrimitiveType(const DataTypePtr & data_type);
Expand Down
Loading