Skip to content

Commit 8196809

Browse files
committed
Enable PCRE UTF-8 validity string checks
Strings are not guaranteed to contain valid UTF-8, and the PCRE documentation says that the behavior is undefined in that case.
1 parent 6569fd0 commit 8196809

File tree

2 files changed

+16
-2
lines changed

2 files changed

+16
-2
lines changed

base/regex.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44

55
include("pcre.jl")
66

7-
const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.NO_UTF_CHECK | PCRE.ALT_BSUX
8-
const DEFAULT_MATCH_OPTS = PCRE.NO_UTF_CHECK
7+
const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX
8+
const DEFAULT_MATCH_OPTS = zero(UInt32)
99

1010
mutable struct Regex
1111
pattern::String

test/regex.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,17 @@ end
4949

5050
# Proper unicode handling
5151
@test match(r"∀∀", "∀x∀∀∀").match == "∀∀"
52+
53+
@test_throws ErrorException match(r"a", "\xe2\x88") # 1 byte missing at end
54+
@test_throws ErrorException match(r"a", "\xe2\x08\x80") # byte 2 top bits not 0x80
55+
@test_throws ErrorException match(r"a", "\xf8\x89\x89\x80\x80") # 5-byte character is not allowed (RFC 3629)
56+
@test_throws ErrorException match(r"a", "\UDFFF") # code points 0xd800-0xdfff are not defined
57+
@test_throws ErrorException match(r"a", "\xc0\x80") # overlong 2-byte sequence
58+
@test_throws ErrorException match(r"a", "\xff") # illegal byte (0xfe or 0xff)
59+
60+
@test_throws ErrorException Regex("\xe2\x88") # 1 byte missing at end
61+
@test_throws ErrorException Regex("\xe2\x08\x80") # byte 2 top bits not 0x80
62+
@test_throws ErrorException Regex("\xf8\x89\x89\x80\x80") # 5-byte character is not allowed (RFC 3629)
63+
@test_throws ErrorException Regex("\UDFFF") # code points 0xd800-0xdfff are not defined
64+
@test_throws ErrorException Regex("\xc0\x80") # overlong 2-byte sequence
65+
@test_throws ErrorException Regex("\xff") # illegal byte (0xfe or 0xff)

0 commit comments

Comments
 (0)