diff --git a/src/include/OpenImageIO/ustring.h b/src/include/OpenImageIO/ustring.h index fb45c075d5..82785cd999 100644 --- a/src/include/OpenImageIO/ustring.h +++ b/src/include/OpenImageIO/ustring.h @@ -32,6 +32,7 @@ OIIO_NAMESPACE_BEGIN #define OIIO_USTRING_HAS_CTR_FROM_USTRINGHASH 1 #define OIIO_USTRING_HAS_STDHASH 1 #define OIIO_HAS_USTRINGHASH_FORMATTER 1 +#define OIIO_USTRING_SAFE_HASH 1 class ustringhash; // forward declaration @@ -123,6 +124,16 @@ class ustringhash; // forward declaration /// - if you don't need to do a lot of string assignment or equality /// testing, but lots of more complex string manipulation. /// +/// The ustring implementation guarantees that no two ustrings return the same +/// value for hash() (despite the slim probability that two strings could +/// numerically hash to the same value). For the first ustring added with a +/// given hash, u.hash() will be the same value as ustring::strhash(chars), +/// and will deterministically be the same on every execution. In the very +/// improbable case of a hash collision, subsequent ustrings with the same +/// numeric hash will use an alternate hash based on the character address, +/// which is both not the same as ustring::strhash(chars), nor is it expected +/// to be the same constant on every program execution. + class OIIO_UTIL_API ustring { public: using rep_t = const char*; ///< The underlying representation type @@ -288,11 +299,7 @@ class OIIO_UTIL_API ustring { /// Return a C++ std::string representation of a ustring. const std::string& string() const noexcept { - if (m_chars) { - const TableRep* rep = (const TableRep*)m_chars - 1; - return rep->str; - } else - return empty_std_string; + return m_chars ? rep()->str : empty_std_string; } /// Reset to an empty string. @@ -303,17 +310,27 @@ class OIIO_UTIL_API ustring { { if (!m_chars) return 0; - const TableRep* rep = ((const TableRep*)m_chars) - 1; - return rep->length; + return rep()->length; } - /// Return a hashed version of the string + /// ustring::strhash() uses Strutil::strhash but clears the MSB. + static OIIO_HOSTDEVICE constexpr hash_t strhash(string_view str) + { + return Strutil::strhash(str) & hash_mask; + } + + /// Return a hashed version of the string. To guarantee unique hashes, + /// we check if the "duplicate bit" of the hash is set. If not, then + /// we just return the hash which we know is unique. But if that bit + /// is set, we utilize the unique character address. hash_t hash() const noexcept { if (!m_chars) return 0; - const TableRep* rep = ((const TableRep*)m_chars) - 1; - return rep->hashed; + hash_t h = rep()->hashed; + return OIIO_LIKELY((h & duplicate_bit) == 0) + ? h + : hash_t(m_chars) | duplicate_bit; } /// Return a hashed version of the string @@ -749,6 +766,8 @@ class OIIO_UTIL_API ustring { // if you know the rep, the chars are at (char *)(rep+1), and if you // know the chars, the rep is at ((TableRep *)chars - 1). struct TableRep { + // hashed has the MSB set if and only if this is the second or + // greater ustring to have the same hash. hash_t hashed; // precomputed Hash value std::string str; // String representation size_t length; // Length of the string; must be right before cap @@ -757,10 +776,29 @@ class OIIO_UTIL_API ustring { TableRep(string_view strref, hash_t hash); ~TableRep(); const char* c_str() const noexcept { return (const char*)(this + 1); } + constexpr bool unique_hash() const + { + return (hashed & duplicate_bit) == 0; + } }; + // duplicate_bit is a 1 in the MSB, which if set indicates a hash that + // is a duplicate. + static constexpr hash_t duplicate_bit = hash_t(1) << 63; + // hash_mask is what you `&` with hashed to get the real hash (clearing + // the duplicate bit). +#if 1 + static constexpr hash_t hash_mask = ~duplicate_bit; +#else + // Alternate to force lots of hash collisions for testing purposes: + static constexpr hash_t hash_mask = ~duplicate_bit & 0xffff; +#endif + bool has_unique_hash() const { return rep()->unique_hash(); } + private: static std::string empty_std_string; + + const TableRep* rep() const { return ((const TableRep*)m_chars) - 1; } }; @@ -811,7 +849,7 @@ class OIIO_UTIL_API ustringhash { OIIO_DEVICE_CONSTEXPR explicit ustringhash(const char* str) #ifdef __CUDA_ARCH__ // GPU: just compute the hash. This can be constexpr! - : m_hash(Strutil::strhash(str)) + : m_hash(ustring::strhash(str)) #else // CPU: make ustring, get its hash. Note that ustring ctr can't be // constexpr because it has to modify the internal ustring table. @@ -823,7 +861,7 @@ class OIIO_UTIL_API ustringhash { OIIO_DEVICE_CONSTEXPR explicit ustringhash(const char* str, size_t len) #ifdef __CUDA_ARCH__ // GPU: just compute the hash. This can be constexpr! - : m_hash(Strutil::strhash(len, str)) + : m_hash(ustring::strhash(len, str)) #else // CPU: make ustring, get its hash. Note that ustring ctr can't be // constexpr because it has to modify the internal ustring table. @@ -837,7 +875,7 @@ class OIIO_UTIL_API ustringhash { OIIO_DEVICE_CONSTEXPR explicit ustringhash(string_view str) #ifdef __CUDA_ARCH__ // GPU: just compute the hash. This can be constexpr! - : m_hash(Strutil::strhash(str)) + : m_hash(ustring::strhash(str)) #else // CPU: make ustring, get its hash. Note that ustring ctr can't be // constexpr because it has to modify the internal ustring table. @@ -931,13 +969,13 @@ class OIIO_UTIL_API ustringhash { /// Test for equality with a char*. constexpr bool operator==(const char* str) const noexcept { - return m_hash == Strutil::strhash(str); + return m_hash == ustring::strhash(str); } /// Test for inequality with a char*. constexpr bool operator!=(const char* str) const noexcept { - return m_hash != Strutil::strhash(str); + return m_hash != ustring::strhash(str); } #ifndef __CUDA_ARCH__ diff --git a/src/libutil/ustring.cpp b/src/libutil/ustring.cpp index 211c944e67..6a0e1d48c6 100644 --- a/src/libutil/ustring.cpp +++ b/src/libutil/ustring.cpp @@ -22,12 +22,46 @@ typedef spin_rw_write_lock ustring_write_lock_t; #define PREVENT_HASH_COLLISIONS 1 +// Explanation of ustring hash non-collision guarantees: +// +// The gist is that the ustring::strhash(str) function is modified to +// strip out the MSB from Strutil::strhash. The rep entry is filed in +// the ustring table based on this hash. So effectively, the computed +// hash is 63 bits, not 64. +// +// But rep->hashed field consists of the lower 63 bits being the computed +// hash, and the MSB indicates whether this is the 2nd (or more) entry in +// the table that had the same 63 bit hash. +// +// ustring::hash() then is modified as follows: If the MSB is 0, the +// computed hash is the hash. If the MSB is 1, though, we DON'T use that +// hash, and instead we use the pointer to the unique characters, but +// with the MSB set (that's an invalid address by itself). Note that the +// computed hashes never have MSB set, and the char*+MSB always have MSB +// set, so therefore ustring::hash() will never have the same value for +// two different ustrings. +// +// But -- please note! -- that ustring::strhash(str) and +// ustring(str).hash() will only match (and also be the same value on +// every execution) if the ustring is the first to receive that hash, +// which should be approximately always. Probably always, in practice. +// +// But in the very improbable case of a hash collision, one of them (the +// second to be turned into a ustring) will be using the alternate hash +// based on the character address, which is both not the same as +// ustring::strhash(chars), nor is it expected to be the same constant on +// every program execution. + template struct identity { constexpr T operator()(T val) const noexcept { return val; } }; +namespace { +atomic_ll total_ustring_hash_collisions(0); +} + // #define USTRING_TRACK_NUM_LOOKUPS @@ -71,6 +105,10 @@ template struct TableRepMap { const char* lookup(string_view str, uint64_t hash) { + if (OIIO_UNLIKELY(hash & ustring::duplicate_bit)) { + // duplicate bit is set -- the hash is related to the chars! + return OIIO::bitcast(hash & ustring::hash_mask); + } ustring_read_lock_t lock(mutex); #ifdef USTRING_TRACK_NUM_LOOKUPS // NOTE: this simple increment adds a substantial amount of overhead @@ -81,13 +119,12 @@ template struct TableRepMap { #endif size_t pos = hash & mask, dist = 0; for (;;) { - if (entries[pos] == 0) + ustring::TableRep* e = entries[pos]; + if (e == 0) return 0; - if (entries[pos]->hashed == hash - && entries[pos]->length == str.length() - && strncmp(entries[pos]->c_str(), str.data(), str.length()) - == 0) - return entries[pos]->c_str(); + if (e->hashed == hash && e->length == str.length() + && !strncmp(e->c_str(), str.data(), str.length())) + return e->c_str(); ++dist; pos = (pos + dist) & mask; // quadratic probing } @@ -98,6 +135,10 @@ template struct TableRepMap { // the hash. const char* lookup(uint64_t hash) { + if (OIIO_UNLIKELY(hash & ustring::duplicate_bit)) { + // duplicate bit is set -- the hash is related to the chars! + return OIIO::bitcast(hash & ustring::hash_mask); + } ustring_read_lock_t lock(mutex); #ifdef USTRING_TRACK_NUM_LOOKUPS // NOTE: this simple increment adds a substantial amount of overhead @@ -108,10 +149,11 @@ template struct TableRepMap { #endif size_t pos = hash & mask, dist = 0; for (;;) { - if (entries[pos] == 0) + ustring::TableRep* e = entries[pos]; + if (e == 0) return 0; - if (entries[pos]->hashed == hash) - return entries[pos]->c_str(); + if (e->hashed == hash) + return e->c_str(); ++dist; pos = (pos + dist) & mask; // quadratic probing } @@ -119,27 +161,52 @@ template struct TableRepMap { const char* insert(string_view str, uint64_t hash) { + OIIO_ASSERT((hash & ustring::duplicate_bit) == 0); // can't happen? ustring_write_lock_t lock(mutex); size_t pos = hash & mask, dist = 0; + bool duplicate_hash = false; for (;;) { - if (entries[pos] == 0) + ustring::TableRep* e = entries[pos]; + if (e == 0) break; // found insert pos - if (entries[pos]->hashed == hash - && entries[pos]->length == str.length() - && !strncmp(entries[pos]->c_str(), str.data(), str.length())) { - // same string is already inserted, return the one that is - // already in the table - return entries[pos]->c_str(); + if (e->hashed == hash) { + duplicate_hash = true; + if (e->length == str.length() + && !strncmp(e->c_str(), str.data(), str.length())) { + // same string is already inserted, return the one that is + // already in the table + return e->c_str(); + } } ++dist; pos = (pos + dist) & mask; // quadratic probing } ustring::TableRep* rep = make_rep(str, hash); - entries[pos] = rep; + + // If we encountered another ustring with the same hash (if one + // exists, it would have hashed to the same address so we would have + // seen it), set the duplicate bit in the rep's hashed field. + if (duplicate_hash) { + ++total_ustring_hash_collisions; +#if PREVENT_HASH_COLLISIONS + rep->hashed |= ustring::duplicate_bit; +#endif +#if !defined(NDEBUG) + print("DUPLICATE ustring '{}' hash {:x} c_str {:p} strhash {:x}\n", + rep->c_str(), rep->hashed, rep->c_str(), + ustring::strhash(str)); +#endif + } + + entries[pos] = rep; ++num_entries; if (2 * num_entries > mask) - grow(); // maintain 0.5 load factor + grow(); // maintain 0.5 load factor + // ensure low bit clear + OIIO_DASSERT((size_t(rep->c_str()) & 1) == 0); + // ensure low bit clear + OIIO_DASSERT((size_t(rep->c_str()) & ustring::duplicate_bit) == 0); return rep->c_str(); // rep is now in the table } @@ -283,11 +350,6 @@ struct UstringTable { // This string is here so that we can return sensible values of str when the ustring's pointer is NULL std::string ustring::empty_std_string; -// The reverse map that lets you look up a string by its initial hash. -using ReverseMap - = unordered_map_concurrent, - std::equal_to, 256 /*bins*/>; - namespace { // anonymous @@ -298,19 +360,6 @@ ustring_table() return table; } - -static ReverseMap& -reverse_map() -{ - static OIIO_CACHE_ALIGN ReverseMap rm; - return rm; -} - - -// Keep track of any collisions -static std::vector> all_hash_collisions; -OIIO_CACHE_ALIGN static std::mutex collision_mutex; - } // end anonymous namespace @@ -384,6 +433,7 @@ ustring::TableRep::TableRep(string_view strref, ustring::hash_t hash) && defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI // NEW gcc ABI // FIXME -- do something smart with this. + str = strref; #elif defined(__GNUC__) && !defined(_LIBCPP_VERSION) // OLD gcc ABI @@ -402,7 +452,6 @@ ustring::TableRep::TableRep(string_view strref, ustring::hash_t hash) dummy_refcount = 1; // so it never frees *(const char**)&str = c_str(); OIIO_DASSERT(str.c_str() == c_str() && str.size() == length); - return; #elif defined(_LIBCPP_VERSION) && !defined(__aarch64__) // FIXME -- we seem to do the wrong thing with libcpp on Mac M1. Disable @@ -426,15 +475,17 @@ ustring::TableRep::TableRep(string_view strref, ustring::hash_t hash) ((libcpp_string__long*)&str)->__size_ = length; ((libcpp_string__long*)&str)->__data_ = (char*)c_str(); OIIO_DASSERT(str.c_str() == c_str() && str.size() == length); - return; + } else { + // Short string -- just assign it, since there is no extra allocation. + str = strref; } -#endif - +#else // Remaining cases - just assign the internal string. This may result // in double allocation for the chars. If you care about that, do // something special for your platform, much like we did for gcc and // libc++ above. (Windows users, I'm talking to you.) str = strref; +#endif } @@ -459,11 +510,8 @@ ustring::make_unique(string_view strref) if (!strref.data()) strref = string_view("", 0); - hash_t hash = Strutil::strhash64(strref); - // This line, if uncommented, lets you force lots of hash collisions: - // hash &= ~hash_t(0xffffff); + hash_t hash = ustring::strhash(strref); -#if !PREVENT_HASH_COLLISIONS // Check the ustring table to see if this string already exists. If so, // construct from its canonical representation. // NOTE: all locking is performed internally to the table implementation @@ -477,95 +525,13 @@ ustring::make_unique(string_view strref) // OIIO_ASSERT(strref.find('\0') == string_view::npos && // "ustring::make_unique() does not support embedded nulls"); strref = strref.substr(0, nul); - hash = Strutil::strhash64(strref); + hash = ustring::strhash(strref); result = table.lookup(strref, hash); if (result) return result; } // Strutil::print("ADDED ustring \"{}\" {:08x}\n", strref, hash); return table.insert(strref, hash); - -#else - // Check the ustring table to see if this string already exists with the - // default hash. If so, we're done. This is by far the common case -- - // most lookups already exist in the table, and hash collisions are - // extremely rare. - const char* result = table.lookup(strref, hash); - if (result) - return result; - - // ustring doesn't allow strings with embedded nul characters. Before we - // go any further, trim beyond any nul and rehash. - auto nul = strref.find('\0'); - if (nul != string_view::npos) { - // Strutil::print("ustring::make_unique: string contains nulls @{}/{}: \"{}\"\n", - // strref.find('\0'), strref.size(), strref); - // OIIO_ASSERT(strref.find('\0') == string_view::npos && - // "ustring::make_unique() does not support embedded nulls"); - strref = strref.substr(0, nul); - hash = Strutil::strhash64(strref); - result = table.lookup(strref, hash); - if (result) - return result; - } - - // We did not find it. There are two possibilities: (1) the string is in - // the table but has a different hash because it collided; or (2) the - // string is not yet in the table. - - // Thread safety by locking reverse_map's bin corresponding to our - // original hash. This will prevent any potentially colliding ustring - // from being added to either table. But ustrings whose hashes go to - // different bins of the reverse map (which by definition cannot clash) - // are allowed to be added concurrently. - auto& rm(reverse_map()); - size_t bin = rm.lock_bin(hash); - - hash_t orighash = hash; - size_t binmask = orighash & (~rm.nobin_mask()); - size_t num_rehashes = 0; - - while (1) { - auto rev = rm.find(hash, false); - // rev now either holds an iterator into the reverse map for a - // record that has this hash, or else it's end(). - if (rev == rm.end()) { - // That hash is unused, insert the string with that hash into - // the ustring table, and insert the hash with the unique char - // pointer into the reverse_map. - result = table.insert(strref, hash); - bool ok = rm.insert(hash, result, false); - // Strutil::print("ADDED \"{}\" {:08x}\n", strref, hash); - OIIO_ASSERT(ok && "thread safety failure"); - break; - } - // Something uses this hash. Is it our string? - if (!strncmp(rev->second, strref.data(), strref.size())) { - // It is our string, already in this hash slot! - result = rev->second; - break; - } - // Rehash, but keep the bin bits identical so we always rehash into - // the same (locked) bin. - hash = (hash & binmask) - | (farmhash::Fingerprint(hash) & rm.nobin_mask()); - ++num_rehashes; - // Strutil::print("COLLISION \"{}\" {:08x} vs \"{}\"\n", - // strref, orighash, rev->second); - { - std::lock_guard lock(collision_mutex); - all_hash_collisions.emplace_back(rev->second, rev->first); - } - } - rm.unlock_bin(bin); - - if (num_rehashes) { - std::lock_guard lock(collision_mutex); - all_hash_collisions.emplace_back(result, orighash); - } - - return result; -#endif } @@ -607,27 +573,20 @@ ustring::getstats(bool verbose) size_t n_e = total_ustrings(); size_t mem = memory(); if (verbose) { - out << "ustring statistics:\n"; + print(out, "ustring statistics:\n"); #ifdef USTRING_TRACK_NUM_LOOKUPS - out << " ustring requests: " << ustring_table().get_num_lookups() - << "\n"; -#endif - out << " unique strings: " << n_e << "\n"; - out << " ustring memory: " << Strutil::memformat(mem) << "\n"; -#ifndef NDEBUG - std::vector collisions; - hash_collisions(&collisions); - if (collisions.size()) { - out << " Hash collisions: " << collisions.size() << "\n"; - for (auto c : collisions) - out << Strutil::fmt::format(" {} \"{}\"\n", c.hash(), c); - } + print(out, " ustring requests: {}\n", + ustring_table().get_num_lookups()); #endif + print(out, " unique strings: {}\n", n_e); + print(out, " ustring memory: {}\n", Strutil::memformat(mem)); + print(out, " total ustring hash collisions: {}\n", + (int)total_ustring_hash_collisions); } else { #ifdef USTRING_TRACK_NUM_LOOKUPS - out << "requests: " << ustring_table().get_num_lookups() << ", "; + print(out, "requests: {}, ", ustring_table().get_num_lookups()); #endif - out << "unique " << n_e << ", " << Strutil::memformat(mem); + print(out, "unique {}, {}\n", n_e, Strutil::memformat(mem)); } return out.str(); } @@ -637,11 +596,12 @@ ustring::getstats(bool verbose) size_t ustring::hash_collisions(std::vector* collisions) { - std::lock_guard lock(collision_mutex); - if (collisions) - for (const auto& c : all_hash_collisions) - collisions->emplace_back(ustring::from_unique(c.first)); - return all_hash_collisions.size(); + if (collisions) { + // Disabled for now + // for (const auto& c : all_hash_collisions) + // collisions->emplace_back(ustring::from_unique(c.first)); + } + return size_t(total_ustring_hash_collisions); } diff --git a/src/libutil/ustring_test.cpp b/src/libutil/ustring_test.cpp index 6e0a883401..8c54676828 100644 --- a/src/libutil/ustring_test.cpp +++ b/src/libutil/ustring_test.cpp @@ -265,14 +265,14 @@ create_lotso_ustrings(int iterations) { OIIO_DASSERT(size_t(iterations) <= strings.size()); if (verbose) - Strutil::print("thread {}\n", std::this_thread::get_id()); + print("thread {}\n", std::this_thread::get_id()); size_t h = 0; for (int i = 0; i < iterations; ++i) { ustring s(strings[i].data()); h += s.hash(); } if (verbose) - Strutil::printf("checksum %08x\n", unsigned(h)); + print("checksum {:08x}\n", h); } @@ -310,10 +310,17 @@ verify_no_collisions() size_t ncollisions = ustring::hash_collisions(&collisions); OIIO_CHECK_ASSERT(ncollisions == 0); if (ncollisions) { - Strutil::print(" Hash collisions: {}\n", ncollisions); + print(" Hash collisions: {}\n", ncollisions); for (auto c : collisions) - Strutil::print(" \"{}\" (orig {:08x} rehashed {:08x})\n", c, - Strutil::strhash(c), c.hash()); + print(" \"{}\" (orig {:016x} rehashed {:016x})\n", c, + Strutil::strhash(c), c.hash()); + } + + for (int i = 0; i < 200; ++i) { + ustring u = ustring::fmtformat("{}", i); + print("{}: {:016x} uh {:016x} sh {:016x} (addr {:p}){}\n", u, u.hash(), + ustring::strhash(u.c_str()), Strutil::strhash(u.c_str()), + u.c_str(), u.has_unique_hash() ? "" : " DUP"); } } @@ -330,7 +337,7 @@ main(int argc, char* argv[]) benchmark_threaded_ustring_creation(); verify_no_collisions(); - std::cout << "\n" << ustring::getstats(true) << "\n"; + print("\n{}\n", ustring::getstats(true)); return unit_test_failures; }