Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
2bdfc08
Re-implement the grapheme segmenter from Intl.
matouskozak Dec 1, 2023
b4186f6
refactor grapheme segmentation
matouskozak Dec 1, 2023
cc22cf2
polish grapheme-segmenter.ts 2
matouskozak Dec 4, 2023
0b54cbc
fix equals
matouskozak Dec 8, 2023
0ea5118
cache GraphemeSegmenter
matouskozak Dec 8, 2023
95c7ddf
re-use isSurrogate from change-case.ts
matouskozak Dec 8, 2023
5a7335d
move isSurrogate to helpers.ts
matouskozak Dec 11, 2023
61e06b2
change GraphemeSegmenter functions to cammelCase
matouskozak Dec 11, 2023
2240a29
Change collation.ts functions to cammelCase
matouskozak Dec 12, 2023
aed0811
Merge branch 'main' into wasm-hg-indexOf
matouskozak Dec 12, 2023
44276a4
JsGlobalization.IndexOf
matouskozak Jan 4, 2024
2b1034b
load segmentation rules as static json
matouskozak Jan 5, 2024
7b1413b
Merge branch 'main' into wasm-hg-indexOf
matouskozak Jan 5, 2024
b9a36c1
re-formulate ShouldFilterCandidate condition
matouskozak Jan 8, 2024
2b62eb2
segmentation-rules.json as ICULibNativeFiles
matouskozak Jan 8, 2024
9ac537f
use segmentation-rules instead of static-json
matouskozak Jan 8, 2024
806c566
use full-path for segmentation-rules assset
matouskozak Jan 8, 2024
6ec53cd
simplify segmentation-rules include
matouskozak Jan 10, 2024
0199ab6
reverse filter-out condition
matouskozak Jan 11, 2024
093cf63
fix code style
matouskozak Jan 11, 2024
914986f
refactor resource target condition
matouskozak Jan 12, 2024
8743bf4
add segmentation-rules.json to expected ICU assets
matouskozak Jan 12, 2024
e740ff3
add segmentation-rules.json to expected assets fix
matouskozak Jan 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions THIRD-PARTY-NOTICES.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -1331,3 +1331,17 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Aspects of base64 encoding / decoding are based on algorithm described in "Base64 encoding and decoding at almost the speed of a memory
copy", Wojciech Muła and Daniel Lemire. https://arxiv.org/pdf/1910.05109.pdf

License for FormatJS Intl.Segmenter grapheme segmentation algorithm
--------------------------------------------------------------------------
Available at https://github.com/formatjs/formatjs/blob/58d6a7b398d776ca3d2726d72ae1573b65cc3bef/packages/intl-segmenter/LICENSE.md

MIT License

Copyright (c) 2022 FormatJS

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3 changes: 2 additions & 1 deletion eng/liveBuilds.targets
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,8 @@
$(LibrariesNativeArtifactsPath)package.json;
$(LibrariesNativeArtifactsPath)dotnet.native.wasm;
$(LibrariesNativeArtifactsPath)dotnet.native.js.symbols;
$(LibrariesNativeArtifactsPath)*.dat;"
$(LibrariesNativeArtifactsPath)*.dat;
$(LibrariesNativeArtifactsPath)segmentation-rules.json;"
IsNative="true" />
<!-- for threaded wasm -->
<LibrariesRuntimeFiles Condition="'$(TargetOS)' == 'browser' and Exists('$(LibrariesNativeArtifactsPath)dotnet.native.worker.js')"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@
<PlatformManifestFileEntry Include="icudt_optimal.dat" IsNative="true" />
<PlatformManifestFileEntry Include="icudt_optimal_no_CJK.dat" IsNative="true" />
<PlatformManifestFileEntry Include="icudt_hybrid.dat" IsNative="true" />
<PlatformManifestFileEntry Include="segmentation-rules.json" IsNative="true" />
<PlatformManifestFileEntry Include="package.json" IsNative="true" />
<PlatformManifestFileEntry Include="dotnet.es6.pre.js" IsNative="true" />
<PlatformManifestFileEntry Include="dotnet.es6.lib.js" IsNative="true" />
Expand Down
3 changes: 2 additions & 1 deletion src/mono/browser/browser.proj
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,8 @@
<ItemGroup>
<ICULibNativeFiles Include="$(ICULibDir)/libicuuc.a;
$(ICULibDir)/libicui18n.a;
$(ICULibDir)/libicudata.a" />
$(ICULibDir)/libicudata.a;
$(BrowserProjectRoot)runtime/hybrid-globalization/segmentation-rules.json" />
<ICULibFiles Include="$(ICULibDir)/*.dat" />
</ItemGroup>
<PropertyGroup>
Expand Down
3 changes: 2 additions & 1 deletion src/mono/browser/build/BrowserWasmApp.targets
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,9 @@

<ItemGroup Condition="'$(InvariantGlobalization)' != 'true'">
<_HybridGlobalizationDataFiles Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt_hybrid.dat"/>
<_HybridGlobalizationDataFiles Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)segmentation-rules.json"/>
<_IcuAvailableDataFiles Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt_*" Exclude="@(_HybridGlobalizationDataFiles);$(_WasmIcuDataFileName)"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' == 'true'" Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt_hybrid.dat"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' == 'true'" Include="@(_HybridGlobalizationDataFiles)"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' != 'true' and '$(WasmIncludeFullIcuData)' == 'true'" Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt.dat"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' != 'true' and '$(WasmIncludeFullIcuData)' != 'true' and '$(_WasmIcuDataFileName)' == ''" Include="@(_IcuAvailableDataFiles)"/>
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' != 'true' and '$(WasmIncludeFullIcuData)' != 'true' and '$(_WasmIcuDataFileName)' != ''" Include="$(_WasmIcuDataFileName)"/>
Expand Down
12 changes: 12 additions & 0 deletions src/mono/browser/runtime/assets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { endMeasure, MeasuredBlock, startMeasure } from "./profiler";
import { AssetEntryInternal } from "./types/internal";
import { AssetEntry } from "./types";
import { VoidPtr } from "./types/emscripten";
import { setSegmentationRulesFromJson } from "./hybrid-globalization/grapheme-segmenter";

// this needs to be run only after the onRuntimeInitialized event, when the memory is ready
export function instantiate_asset(asset: AssetEntry, url: string, bytes: Uint8Array): void {
Expand All @@ -25,6 +26,7 @@ export function instantiate_asset(asset: AssetEntry, url: string, bytes: Uint8Ar
case "dotnetwasm":
case "js-module-threads":
case "symbols":
case "segmentation-rules":
// do nothing
break;
case "resource":
Expand Down Expand Up @@ -104,6 +106,16 @@ export async function instantiate_symbols_asset(pendingAsset: AssetEntryInternal
}
}

/**
 * Awaits the pending download of the given asset, parses the response body as JSON
 * and hands the parsed value to `setSegmentationRulesFromJson`.
 *
 * Loading is best-effort: any failure (missing pending download, rejected response,
 * invalid JSON, or an error thrown while applying the rules) is logged through
 * `mono_log_info` and is not rethrown.
 *
 * @param pendingAsset asset entry whose `pendingDownloadInternal.response` resolves to the fetched response.
 */
export async function instantiate_segmentation_rules_asset(pendingAsset: AssetEntryInternal): Promise<void> {
    try {
        const response = await pendingAsset.pendingDownloadInternal!.response;
        const json = await response.json();
        setSegmentationRulesFromJson(json);
    } catch (error: any) {
        // JSON.stringify() on an Error instance produces "{}" because `name`, `message` and `stack`
        // are not enumerable own properties, so format Error instances explicitly to keep the reason.
        const reason = error instanceof Error
            ? `${error.name}: ${error.message}`
            : JSON.stringify(error);
        mono_log_info(`Error loading static json asset ${pendingAsset.name}: ${reason}`);
    }
}

export async function wait_for_all_assets() {
// wait for all assets in memory
await runtimeHelpers.allAssetsInMemory.promise;
Expand Down
3 changes: 2 additions & 1 deletion src/mono/browser/runtime/exports.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import { mono_bind_static_method } from "./net6-legacy/method-calls";
import { export_binding_api, export_internal_api, export_mono_api } from "./net6-legacy/exports-legacy";
import { initializeLegacyExports } from "./net6-legacy/globals";
import { mono_log_warn, mono_wasm_stringify_as_error_with_stack } from "./logging";
import { instantiate_asset, instantiate_symbols_asset } from "./assets";
import { instantiate_asset, instantiate_symbols_asset, instantiate_segmentation_rules_asset } from "./assets";
import { jiterpreter_dump_stats } from "./jiterpreter";
import { forceDisposeProxies } from "./gc-handles";

Expand All @@ -46,6 +46,7 @@ function initializeExports(globalObjects: GlobalObjects): RuntimeAPI {
instantiate_asset,
jiterpreter_dump_stats,
forceDisposeProxies,
instantiate_segmentation_rules_asset,
});

const API = export_api();
Expand Down
15 changes: 1 addition & 14 deletions src/mono/browser/runtime/hybrid-globalization/change-case.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@ import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/i
import { Int32Ptr } from "../types/emscripten";
import { wrap_error_root, wrap_no_error_root } from "../invoke-js";
import { localHeapViewU16, setU16_local } from "../memory";

const SURROGATE_HIGHER_START = "\uD800";
const SURROGATE_HIGHER_END = "\uDBFF";
const SURROGATE_LOWER_START = "\uDC00";
const SURROGATE_LOWER_END = "\uDFFF";
import { isSurrogate } from "./helpers";

export function mono_wasm_change_case_invariant(src: number, srcLength: number, dst: number, dstLength: number, toUpper: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): void {
const exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
Expand Down Expand Up @@ -160,15 +156,6 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe
}
}

/**
 * Checks whether the code unit at `startIdx` lies in the high-surrogate range and is
 * immediately followed, within the string, by a code unit in the low-surrogate range.
 *
 * @param str string to inspect.
 * @param startIdx index of the code unit expected to be the high surrogate.
 * @returns `true` only when both code units are present and fall in their respective ranges.
 */
function isSurrogate(str: string, startIdx: number) : boolean
{
    const lead = str[startIdx];
    // written as a negated range check so that an out-of-range index (undefined) is rejected
    if (!(SURROGATE_HIGHER_START <= lead && lead <= SURROGATE_HIGHER_END))
        return false;

    const trailIdx = startIdx + 1;
    if (!(trailIdx < str.length))
        return false;

    const trail = str[trailIdx];
    return SURROGATE_LOWER_START <= trail && trail <= SURROGATE_LOWER_END;
}

function appendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number)
{
setU16_local(heapI16, dst + idx*2, surrogate.charCodeAt(0));
Expand Down
97 changes: 44 additions & 53 deletions src/mono/browser/runtime/hybrid-globalization/collations.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ import { monoStringToString, utf16ToString } from "../strings";
import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/internal";
import { Int32Ptr } from "../types/emscripten";
import { wrap_error_root, wrap_no_error_root } from "../invoke-js";
import { GraphemeSegmenter } from "./grapheme-segmenter";

const COMPARISON_ERROR = -2;
const INDEXING_ERROR = -1;
let graphemeSegmenterCached: GraphemeSegmenter | null;

export function mono_wasm_compare_string(culture: MonoStringRef, str1: number, str1Length: number, str2: number, str2Length: number, options: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): number {
const cultureRoot = mono_wasm_new_external_root<MonoString>(culture),
Expand All @@ -20,7 +22,7 @@ export function mono_wasm_compare_string(culture: MonoStringRef, str1: number, s
const casePicker = (options & 0x1f);
const locale = cultureName ? cultureName : undefined;
wrap_no_error_root(is_exception, exceptionRoot);
return compare_strings(string1, string2, locale, casePicker);
return compareStrings(string1, string2, locale, casePicker);
}
catch (ex: any) {
wrap_error_root(is_exception, ex, exceptionRoot);
Expand All @@ -37,19 +39,19 @@ export function mono_wasm_starts_with(culture: MonoStringRef, str1: number, str1
exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
try {
const cultureName = monoStringToString(cultureRoot);
const prefix = decode_to_clean_string(str2, str2Length);
const prefix = decodeToCleanString(str2, str2Length);
// no need to look for an empty string
if (prefix.length == 0)
return 1; // true

const source = decode_to_clean_string(str1, str1Length);
const source = decodeToCleanString(str1, str1Length);
if (source.length < prefix.length)
return 0; //false
const sourceOfPrefixLength = source.slice(0, prefix.length);

const casePicker = (options & 0x1f);
const locale = cultureName ? cultureName : undefined;
const result = compare_strings(sourceOfPrefixLength, prefix, locale, casePicker);
const result = compareStrings(sourceOfPrefixLength, prefix, locale, casePicker);
wrap_no_error_root(is_exception, exceptionRoot);
return result === 0 ? 1 : 0; // equals ? true : false
}
Expand All @@ -68,19 +70,19 @@ export function mono_wasm_ends_with(culture: MonoStringRef, str1: number, str1Le
exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
try {
const cultureName = monoStringToString(cultureRoot);
const suffix = decode_to_clean_string(str2, str2Length);
const suffix = decodeToCleanString(str2, str2Length);
if (suffix.length == 0)
return 1; // true

const source = decode_to_clean_string(str1, str1Length);
const source = decodeToCleanString(str1, str1Length);
const diff = source.length - suffix.length;
if (diff < 0)
return 0; //false
const sourceOfSuffixLength = source.slice(diff, source.length);

const casePicker = (options & 0x1f);
const locale = cultureName ? cultureName : undefined;
const result = compare_strings(sourceOfSuffixLength, suffix, locale, casePicker);
const result = compareStrings(sourceOfSuffixLength, suffix, locale, casePicker);
wrap_no_error_root(is_exception, exceptionRoot);
return result === 0 ? 1 : 0; // equals ? true : false
}
Expand All @@ -100,68 +102,57 @@ export function mono_wasm_index_of(culture: MonoStringRef, needlePtr: number, ne
try {
const needle = utf16ToString(<any>needlePtr, <any>(needlePtr + 2 * needleLength));
// no need to look for an empty string
if (clean_string(needle).length == 0) {
if (cleanString(needle).length == 0) {
wrap_no_error_root(is_exception, exceptionRoot);
return fromBeginning ? 0 : srcLength;
}

const source = utf16ToString(<any>srcPtr, <any>(srcPtr + 2 * srcLength));
// no need to look in an empty string
if (clean_string(source).length == 0) {
if (cleanString(source).length == 0) {
wrap_no_error_root(is_exception, exceptionRoot);
return fromBeginning ? 0 : srcLength;
}
const cultureName = monoStringToString(cultureRoot);
const locale = cultureName ? cultureName : undefined;
const casePicker = (options & 0x1f);

const segmenter = new Intl.Segmenter(locale, { granularity: "grapheme" });
const needleSegments = Array.from(segmenter.segment(needle)).map(s => s.segment);
let i = 0;
let stop = false;
let result = -1;
let segmentWidth = 0;
let index = 0;
let nextIndex = 0;
while (!stop) {
// we need to restart the iterator in this outer loop because we have shifted it in the inner loop
const iteratorSrc = segmenter.segment(source.slice(i, source.length))[Symbol.iterator]();
let srcNext = iteratorSrc.next();

if (srcNext.done)
break;
const graphemeSegmenter = graphemeSegmenterCached || (graphemeSegmenterCached = new GraphemeSegmenter());
const needleSegments = [];
let needleIdx = 0;

// Grapheme segmentation of needle string
while (needleIdx < needle.length) {
const needleGrapheme = graphemeSegmenter.nextGrapheme(needle, needleIdx);
needleSegments.push(needleGrapheme);
needleIdx += needleGrapheme.length;
}

let srcIdx = 0;
while (srcIdx < source.length) {
const srcGrapheme = graphemeSegmenter.nextGrapheme(source, srcIdx);
srcIdx += srcGrapheme.length;

let matchFound = check_match_found(srcNext.value.segment, needleSegments[0], locale, casePicker);
index = nextIndex;
srcNext = iteratorSrc.next();
if (srcNext.done) {
result = matchFound ? index : result;
break;
if (!checkMatchFound(srcGrapheme, needleSegments[0], locale, casePicker)) {
continue;
}
segmentWidth = srcNext.value.index;
nextIndex = index + segmentWidth;
if (matchFound) {
for (let j = 1; j < needleSegments.length; j++) {
if (srcNext.done) {
stop = true;
break;
}
matchFound = check_match_found(srcNext.value.segment, needleSegments[j], locale, casePicker);
if (!matchFound)
break;

srcNext = iteratorSrc.next();
}
if (stop)
let j;
let srcNextIdx = srcIdx;
for (j = 1; j < needleSegments.length; j++) {
const srcGrapheme = graphemeSegmenter.nextGrapheme(source, srcNextIdx);

if (!checkMatchFound(srcGrapheme, needleSegments[j], locale, casePicker)) {
break;
}
srcNextIdx += srcGrapheme.length;
}

if (matchFound) {
result = index;
if (j === needleSegments.length) {
result = srcIdx - srcGrapheme.length;
if (fromBeginning)
break;
}
i = nextIndex;
}
wrap_no_error_root(is_exception, exceptionRoot);
return result;
Expand All @@ -175,12 +166,12 @@ export function mono_wasm_index_of(culture: MonoStringRef, needlePtr: number, ne
exceptionRoot.release();
}

function check_match_found(str1: string, str2: string, locale: string | undefined, casePicker: number): boolean {
return compare_strings(str1, str2, locale, casePicker) === 0;
function checkMatchFound(str1: string, str2: string, locale: string | undefined, casePicker: number): boolean {
return compareStrings(str1, str2, locale, casePicker) === 0;
}
}

function compare_strings(string1: string, string2: string, locale: string | undefined, casePicker: number): number {
function compareStrings(string1: string, string2: string, locale: string | undefined, casePicker: number): number {
switch (casePicker) {
case 0:
// 0: None - default algorithm for the platform OR
Expand Down Expand Up @@ -272,12 +263,12 @@ function compare_strings(string1: string, string2: string, locale: string | unde
}
}

function decode_to_clean_string(strPtr: number, strLen: number) {
function decodeToCleanString(strPtr: number, strLen: number) {
const str = utf16ToString(<any>strPtr, <any>(strPtr + 2 * strLen));
return clean_string(str);
return cleanString(str);
}

function clean_string(str: string) {
function cleanString(str: string) {
const nStr = str.normalize();
return nStr.replace(/[\u200B-\u200D\uFEFF\0]/g, "");
}
Loading