From 9ea7fc451dcf44393b35c0fdd8cbd03a409ba742 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 10 Sep 2024 10:27:08 -0400 Subject: [PATCH 1/4] src: improve utf8 string generation performance --- src/util.cc | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/util.cc b/src/util.cc index f9aad9ef5a6213..27b6b4ea06c2a3 100644 --- a/src/util.cc +++ b/src/util.cc @@ -48,6 +48,8 @@ #include #endif +#include + #include #include #include @@ -100,11 +102,26 @@ static void MakeUtf8String(Isolate* isolate, MaybeStackBuffer* target) { Local string; if (!value->ToString(isolate->GetCurrentContext()).ToLocal(&string)) return; + String::ValueView value_view(isolate, string); + + if (value_view.is_one_byte()) { + target->AllocateSufficientStorage(value_view.length() + 1); + target->SetLengthAndZeroTerminate(value_view.length()); + memcpy(target->out(), + reinterpret_cast(value_view.data8()), + value_view.length()); + return; + } - size_t storage; - if (!StringBytes::StorageSize(isolate, string, UTF8).To(&storage)) return; - storage += 1; + // Add +1 for null termination. + auto storage = simdutf::utf8_length_from_utf16( + reinterpret_cast(value_view.data16()), + value_view.length()) + + 1; target->AllocateSufficientStorage(storage); + + // TODO(@anonrig): Use simdutf to speed up non-one-byte strings once it's + // implemented const int flags = String::NO_NULL_TERMINATION | String::REPLACE_INVALID_UTF8; const int length = From 435fa0444eeb3e769c83bfa1f4bc8eb53b3f5365 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 10 Sep 2024 12:10:43 -0400 Subject: [PATCH 2/4] Update util.cc Co-authored-by: Daniel Lemire --- src/string_bytes.cc | 40 ++++++++++++++++++++++------------------ src/util.cc | 24 +++++++++++++++--------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/src/string_bytes.cc b/src/string_bytes.cc index 3e2b29005a2012..e274bc2db79954 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -419,21 +419,21 @@ Maybe StringBytes::StorageSize(Isolate* isolate, Local val, enum encoding encoding) { HandleScope scope(isolate); - size_t data_size = 0; - bool is_buffer = Buffer::HasInstance(val); - if (is_buffer && (encoding == BUFFER || encoding == LATIN1)) { + if (Buffer::HasInstance(val) && (encoding == BUFFER || encoding == LATIN1)) { return Just(Buffer::Length(val)); } Local str; if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str)) return Nothing(); + String::ValueView view(isolate, str); + size_t data_size = 0; switch (encoding) { case ASCII: case LATIN1: - data_size = str->Length(); + data_size = view.length(); break; case BUFFER: @@ -441,25 +441,25 @@ Maybe StringBytes::StorageSize(Isolate* isolate, // A single UCS2 codepoint never takes up more than 3 utf8 bytes. // It is an exercise for the caller to decide when a string is // long enough to justify calling Size() instead of StorageSize() - data_size = 3 * str->Length(); + data_size = 3 * view.length(); break; case UCS2: - data_size = str->Length() * sizeof(uint16_t); + data_size = view.length() * sizeof(uint16_t); break; case BASE64URL: - data_size = simdutf::base64_length_from_binary(str->Length(), + data_size = simdutf::base64_length_from_binary(view.length(), simdutf::base64_url); break; case BASE64: - data_size = simdutf::base64_length_from_binary(str->Length()); + data_size = simdutf::base64_length_from_binary(view.length()); break; case HEX: - CHECK(str->Length() % 2 == 0 && "invalid hex string length"); - data_size = str->Length() / 2; + CHECK(view.length() % 2 == 0 && "invalid hex string length"); + data_size = view.length() / 2; break; default: @@ -480,32 +480,36 @@ Maybe StringBytes::Size(Isolate* isolate, Local str; if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str)) return Nothing(); + String::ValueView view(isolate, str); switch (encoding) { case ASCII: case LATIN1: - return Just(str->Length()); + return Just(view.length()); case BUFFER: case UTF8: - return Just(str->Utf8Length(isolate)); + if (view.is_one_byte()) { + return Just(simdutf::utf8_length_from_latin1( + reinterpret_cast(view.data8()), view.length())); + } + return Just(simdutf::utf8_length_from_utf16( + reinterpret_cast(view.data16()), view.length())); case UCS2: - return Just(str->Length() * sizeof(uint16_t)); + return Just(view.length() * sizeof(uint16_t)); case BASE64URL: { - String::Value value(isolate, str); - return Just(simdutf::base64_length_from_binary(value.length(), + return Just(simdutf::base64_length_from_binary(view.length(), simdutf::base64_url)); } case BASE64: { - String::Value value(isolate, str); - return Just(simdutf::base64_length_from_binary(value.length())); + return Just(simdutf::base64_length_from_binary(view.length())); } case HEX: - return Just(str->Length() / 2); + return Just(view.length() / 2); } UNREACHABLE(); diff --git a/src/util.cc b/src/util.cc index 27b6b4ea06c2a3..51c3da79eb1ce9 100644 --- a/src/util.cc +++ b/src/util.cc @@ -104,20 +104,26 @@ static void MakeUtf8String(Isolate* isolate, if (!value->ToString(isolate->GetCurrentContext()).ToLocal(&string)) return; String::ValueView value_view(isolate, string); + auto value_length = value_view.length(); + if (value_view.is_one_byte()) { - target->AllocateSufficientStorage(value_view.length() + 1); - target->SetLengthAndZeroTerminate(value_view.length()); - memcpy(target->out(), - reinterpret_cast(value_view.data8()), - value_view.length()); + auto const_char = reinterpret_cast(value_view.data8()); + auto expected_length = + target->capacity() > (static_cast(value_length) * 2 + 1) + ? simdutf::utf8_length_from_latin1(const_char, value_length) + : value_length * 2; + + // Add +1 for null termination. + target->AllocateSufficientStorage(expected_length + 1); + target->SetLengthAndZeroTerminate(expected_length); + auto actual_length = simdutf::convert_latin1_to_utf8( + const_char, value_length, target->out()); + target->SetLength(actual_length); return; } // Add +1 for null termination. - auto storage = simdutf::utf8_length_from_utf16( - reinterpret_cast(value_view.data16()), - value_view.length()) + - 1; + size_t storage = (3 * value_length) + 1; target->AllocateSufficientStorage(storage); // TODO(@anonrig): Use simdutf to speed up non-one-byte strings once it's From cb36b86d92eb5fde2f4a1c07d5db8d8957c46a4e Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Thu, 12 Sep 2024 11:54:21 -0400 Subject: [PATCH 3/4] Update src/util.cc Co-authored-by: Robert Nagy --- src/util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.cc b/src/util.cc index 51c3da79eb1ce9..4b595508f00258 100644 --- a/src/util.cc +++ b/src/util.cc @@ -109,7 +109,7 @@ static void MakeUtf8String(Isolate* isolate, if (value_view.is_one_byte()) { auto const_char = reinterpret_cast(value_view.data8()); auto expected_length = - target->capacity() > (static_cast(value_length) * 2 + 1) + target->capacity() < (static_cast(value_length) * 2 + 1) ? simdutf::utf8_length_from_latin1(const_char, value_length) : value_length * 2; From 4b28cf1a74209e01adc024264c9e937b5bf63062 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Sat, 14 Sep 2024 19:40:52 -0400 Subject: [PATCH 4/4] Update src/util.cc Co-authored-by: Robert Nagy --- src/util.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/util.cc b/src/util.cc index 4b595508f00258..173f115a162562 100644 --- a/src/util.cc +++ b/src/util.cc @@ -115,10 +115,9 @@ static void MakeUtf8String(Isolate* isolate, // Add +1 for null termination. target->AllocateSufficientStorage(expected_length + 1); - target->SetLengthAndZeroTerminate(expected_length); - auto actual_length = simdutf::convert_latin1_to_utf8( + const auto actual_length = simdutf::convert_latin1_to_utf8( const_char, value_length, target->out()); - target->SetLength(actual_length); + target->SetLengthAndZeroTerminate(actual_length); return; }