1- /* auto-generated on 2023-03-30 17:00:48 -0400. Do not edit! */
1+ /* auto-generated on 2023-04-17 12:20:41 -0400. Do not edit! */
22/* begin file src/ada.cpp */
33#include "ada.h"
44/* begin file src/checkers.cpp */
@@ -2753,7 +2753,7 @@ bool ascii_has_upper_case(char* input, size_t length) {
27532753 auto broadcast = [](uint8_t v) -> uint64_t { return 0x101010101010101 * v; };
27542754 uint64_t broadcast_80 = broadcast(0x80);
27552755 uint64_t broadcast_Ap = broadcast(128 - 'A');
2756- uint64_t broadcast_Zp = broadcast(128 - 'Z');
2756+ uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1 );
27572757 size_t i = 0;
27582758
27592759 uint64_t runner{0};
@@ -2775,7 +2775,7 @@ void ascii_map(char* input, size_t length) {
27752775 auto broadcast = [](uint8_t v) -> uint64_t { return 0x101010101010101 * v; };
27762776 uint64_t broadcast_80 = broadcast(0x80);
27772777 uint64_t broadcast_Ap = broadcast(128 - 'A');
2778- uint64_t broadcast_Zp = broadcast(128 - 'Z');
2778+ uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1 );
27792779 size_t i = 0;
27802780
27812781 for (; i + 7 < length; i += 8) {
@@ -9845,7 +9845,7 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
98459845 auto broadcast = [](uint8_t v) -> uint64_t { return 0x101010101010101 * v; };
98469846 uint64_t broadcast_80 = broadcast(0x80);
98479847 uint64_t broadcast_Ap = broadcast(128 - 'A');
9848- uint64_t broadcast_Zp = broadcast(128 - 'Z');
9848+ uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1 );
98499849 uint64_t non_ascii = 0;
98509850 size_t i = 0;
98519851
@@ -9961,7 +9961,7 @@ ada_really_inline constexpr bool is_forbidden_domain_code_point(
99619961}
99629962
99639963ada_really_inline constexpr bool contains_forbidden_domain_code_point(
9964- char* input, size_t length) noexcept {
9964+ const char* input, size_t length) noexcept {
99659965 size_t i = 0;
99669966 uint8_t accumulator{};
99679967 for (; i + 4 <= length; i += 4) {
@@ -9976,6 +9976,44 @@ ada_really_inline constexpr bool contains_forbidden_domain_code_point(
99769976 return accumulator;
99779977}
99789978
// Per-byte classification table, indexed by the unsigned value of a byte.
//   0 : no flag set for this byte
//   1 : byte is flagged as forbidden (0x00-0x20, the punctuation marked in the
//       row comments below, 0x7f, and every byte >= 0x80)
//   2 : ASCII upper-case letter 'A'..'Z'
// The two non-zero values occupy different bits, so OR-ing entries together
// keeps track of which categories have been encountered.
static constexpr uint8_t is_forbidden_domain_code_point_table_or_upper[] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,  // ' ' # % /
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,  // : < > ?
    /* 0x40 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // @ A-O
    /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0,  // P-Z [ \ ] ^
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,  // | DEL
    /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

// Compile-time sanity checks: one entry per possible byte value, and both ends
// of the upper-case range carry the upper-case marker.
static_assert(sizeof(is_forbidden_domain_code_point_table_or_upper) == 256);
static_assert(
    is_forbidden_domain_code_point_table_or_upper[static_cast<uint8_t>('A')] ==
    2);
static_assert(
    is_forbidden_domain_code_point_table_or_upper[static_cast<uint8_t>('Z')] ==
    2);
9995+
9996+ ada_really_inline constexpr bool contains_forbidden_domain_code_point_or_upper(
9997+ const char* input, size_t length) noexcept {
9998+ size_t i = 0;
9999+ uint8_t accumulator{};
10000+ for (; i + 4 <= length; i += 4) {
10001+ accumulator |=
10002+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i])];
10003+ accumulator |=
10004+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i + 1])];
10005+ accumulator |=
10006+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i + 2])];
10007+ accumulator |=
10008+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i + 3])];
10009+ }
10010+ for (; i < length; i++) {
10011+ accumulator |=
10012+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i])];
10013+ }
10014+ return accumulator;
10015+ }
10016+
997910017static_assert(unicode::is_forbidden_domain_code_point('%'));
998010018static_assert(unicode::is_forbidden_domain_code_point('\x7f'));
998110019static_assert(unicode::is_forbidden_domain_code_point('\0'));
@@ -13473,23 +13511,50 @@ ada_really_inline bool url_aggregator::parse_host(std::string_view input) {
1347313511 // to ASCII with domain and false. The most common case is an ASCII input, in
1347413512 // which case we do not need to call the expensive 'to_ascii' if a few
1347513513 // conditions are met: no '%' and no 'xn-' subsequence.
13476- std::string _buffer = std::string(input);
13477- // This next function checks that the result is ascii, but we are going to
13478- // to check anyhow with is_forbidden.
13479- // bool is_ascii =
13480- unicode::to_lower_ascii(_buffer.data(), _buffer.size());
13481- bool is_forbidden = unicode::contains_forbidden_domain_code_point(
13482- _buffer.data(), _buffer.size());
13483- if (is_forbidden == 0 && _buffer.find("xn-") == std::string_view::npos) {
13514+
13515+ // Often, the input does not contain any forbidden code points, and no upper
13516+ // case ASCII letter, then we can just copy it to the buffer. We want to
13517+ // optimize for such a common case.
13518+ uint8_t is_forbidden_or_upper =
13519+ unicode::contains_forbidden_domain_code_point_or_upper(input.data(),
13520+ input.size());
13521+ // Minor optimization opportunity:
13522+ // contains_forbidden_domain_code_point_or_upper could be extended to check for
13523+ // the presence of characters that cannot appear in the ipv4 address and we
13524+ // could also check whether x and n and - are present, and so we could skip
13525+ // some of the checks below. However, the gains are likely to be small, and
13526+ // the code would be more complex.
13527+ if (is_forbidden_or_upper == 0 &&
13528+ input.find("xn-") == std::string_view::npos) {
1348413529 // fast path
13485- update_base_hostname(_buffer );
13530+ update_base_hostname(input );
1348613531 if (checkers::is_ipv4(get_hostname())) {
1348713532 ada_log("parse_host fast path ipv4");
1348813533 return parse_ipv4(get_hostname());
1348913534 }
1349013535 ada_log("parse_host fast path ", get_hostname());
1349113536 return true;
13537+ } else if (is_forbidden_or_upper == 2) {
13538+ // We have encountered at least one upper case ASCII letter, let us
13539+ // try to convert it to lower case. If there is no 'xn-' in the result,
13540+ // we can then use a secondary fast path.
13541+ std::string _buffer = std::string(input);
13542+ unicode::to_lower_ascii(_buffer.data(), _buffer.size());
13543+ if (input.find("xn-") == std::string_view::npos) {
13544+ // secondary fast path when input is not all lower case
13545+ update_base_hostname(input);
13546+ if (checkers::is_ipv4(get_hostname())) {
13547+ ada_log("parse_host fast path ipv4");
13548+ return parse_ipv4(get_hostname());
13549+ }
13550+ ada_log("parse_host fast path ", get_hostname());
13551+ return true;
13552+ }
1349213553 }
13554+ // We have encountered at least one forbidden code point or the input contains
13555+ // 'xn-' (case insensitive), so we need to call 'to_ascii' to perform the full
13556+ // conversion.
13557+
1349313558 ada_log("parse_host calling to_ascii");
1349413559 std::optional<std::string> host = std::string(get_hostname());
1349513560 is_valid = ada::unicode::to_ascii(host, input, input.find('%'));
0 commit comments