diff --git a/src/Interpreters/BloomFilter.cpp b/src/Interpreters/BloomFilter.cpp index 7bf50a0312be..4c99484edc94 100644 --- a/src/Interpreters/BloomFilter.cpp +++ b/src/Interpreters/BloomFilter.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -39,7 +40,8 @@ BloomFilter::BloomFilter(const BloomFilterParameters & params) } BloomFilter::BloomFilter(size_t size_, size_t hashes_, size_t seed_) - : size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), filter(words, 0) + : size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), + modulus(8 * size_), divider(modulus), filter(words, 0) { chassert(size != 0); chassert(hashes != 0); @@ -49,6 +51,8 @@ void BloomFilter::resize(size_t size_) { size = size_; words = ((size + sizeof(UnderType) - 1) / sizeof(UnderType)); + modulus = 8 * size; + divider = libdivide::divider(modulus); filter.resize(words); } @@ -57,11 +61,15 @@ bool BloomFilter::find(const char * data, size_t len) size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed); size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B); + size_t acc = hash1; for (size_t i = 0; i < hashes; ++i) { - size_t pos = (hash1 + i * hash2 + i * i) % (8 * size); - if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType)))))) + /// It accumulates in the loop as follows: + /// pos = (hash1 + hash2 * i + i * i) % (8 * size) + size_t pos = fastMod(acc + i * i); + if (!(filter[pos / word_bits] & (1ULL << (pos % word_bits)))) return false; + acc += hash2; } return true; } @@ -71,10 +79,14 @@ void BloomFilter::add(const char * data, size_t len) size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed); size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B); + size_t acc = hash1; for (size_t i = 0; i < hashes; ++i) { - size_t pos = (hash1 + i * hash2 + i * i) % (8 * size); - filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType)))); + /// It accumulates in the loop as follows: + /// pos = (hash1 + hash2 * i + i * i) % (8 * size) + size_t pos = fastMod(acc + i * i); + filter[pos / word_bits] |= (1ULL << (pos % word_bits)); + acc += hash2; } } @@ -111,14 +123,14 @@ bool operator== (const BloomFilter & a, const BloomFilter & b) void BloomFilter::addHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed) { - size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size); - filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType)))); + size_t pos = fastMod(CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed))); + filter[pos / word_bits] |= (1ULL << (pos % word_bits)); } bool BloomFilter::findHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed) { - size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size); - return bool(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))); + size_t pos = fastMod(CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed))); + return bool(filter[pos / word_bits] & (1ULL << (pos % word_bits))); } DataTypePtr BloomFilter::getPrimitiveType(const DataTypePtr & data_type) diff --git a/src/Interpreters/BloomFilter.h b/src/Interpreters/BloomFilter.h index 8ebdfd879e62..9c45c691de48 100644 --- a/src/Interpreters/BloomFilter.h +++ b/src/Interpreters/BloomFilter.h @@ -1,13 +1,14 @@ #pragma once -#include #include -#include -#include -#include #include -#include #include +#include + +//#include +//#include +//#include +//#include namespace DB @@ -58,12 +59,18 @@ class BloomFilter friend bool operator== (const BloomFilter & a, const BloomFilter & b); private: + static constexpr size_t word_bits = 8 * sizeof(UnderType); + size_t size; size_t hashes; size_t seed; size_t words; + size_t modulus; /// 8 * size, cached for fast modulo. + libdivide::divider divider; /// Divider for fast modulo by modulus. Container filter; + inline size_t fastMod(size_t value) const { return value - (value / divider) * modulus; } + public: static ColumnPtr getPrimitiveColumn(const ColumnPtr & column); static DataTypePtr getPrimitiveType(const DataTypePtr & data_type);