Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 80 additions & 16 deletions src/pcre2.ml
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,30 @@ let config_match_limit = pcre2_config_match_limit ()
let config_depth_limit = pcre2_config_depth_limit ()
let config_stackrecurse = pcre2_config_stackrecurse ()

type newline_type =
[
| `NEWLINE_CR
| `NEWLINE_LF
| `NEWLINE_CRLF
| `NEWLINE_ANY
| `NEWLINE_ANYCRLF
| `NEWLINE_NUL
]

let newline_type_of_int : int -> newline_type = function
| 1 -> `NEWLINE_CR
| 2 -> `NEWLINE_LF
| 3 -> `NEWLINE_CRLF
| 4 -> `NEWLINE_ANY
| 5 -> `NEWLINE_ANYCRLF
| 6 -> `NEWLINE_NUL
| _ -> failwith "Pcre2.newline_type_of_int: unknown newline type"

let config_newline_is_crlf =
match newline_type_of_int (Char.code config_newline) with
| `NEWLINE_CRLF | `NEWLINE_ANY | `NEWLINE_ANYCRLF -> true
| _ -> false


(* Information on patterns *)

Expand Down Expand Up @@ -532,24 +556,64 @@ let rec copy_lst ar n = function
let exec_all ?(iflags = 0L) ?flags ?(rex = def_rex) ?pat ?pos ?callout subj =
let rex = match pat with Some str -> regexp str | _ -> rex in
let iflags = match flags with Some flags -> rflags flags | _ -> iflags in
let (_, ovector as sstrs) = exec ~iflags ~rex ?pos ?callout subj in
let null_flags = Int64.logor iflags 0x00000004L in (* `NOTEMPTY *)
let subj_len = String.length subj in
let rec loop pos (subj, ovector as sstrs) n lst =
let maybe_ovector =
let (_, ovector) = exec ~iflags ~rex ?pos ?callout subj in
let null_flags =
List.fold_left Int64.logor Int64.zero
[iflags; int_of_rflag `NOTEMPTY_ATSTART; int_of_rflag `ANCHORED]
in
let rec loop pos ovector idx lst =
let match_start = Array.unsafe_get ovector 0 in
if match_start = pos then
if match_start = subj_len then
(* We have reached the end of the subject string, so there are no more
matches to be found. *)
copy_lst (Array.make (succ idx) (subj, ovector)) (pred idx) lst
else match_anchored pos ovector idx lst
else match_normal pos ovector idx lst
and match_normal pos ovector idx lst =
let next =
try Some (pcre2_match ~iflags ~rex ~pos ?callout subj)
with Not_found -> None
in
match next with
| Some ovector' ->
loop (Array.unsafe_get ovector' 1) ovector' (succ idx)
((subj, ovector') :: lst)
| None -> copy_lst (Array.make idx (subj, ovector)) (pred idx) lst
and match_anchored pos ovector idx lst =
let pos', ovector', idx', lst' =
try
let first = Array.unsafe_get ovector 0 in
if first = pos && Array.unsafe_get ovector 1 = pos then
if pos = subj_len then None
else Some (pcre2_match ~iflags:null_flags ~rex ~pos ?callout subj)
else Some (pcre2_match ~iflags ~rex ~pos ?callout subj)
with Not_found -> None in
match maybe_ovector with
| Some ovector ->
let new_pos = Array.unsafe_get ovector 1 in
loop new_pos (subj, ovector) (n + 1) (sstrs :: lst)
| None -> copy_lst (Array.make (n + 1) sstrs) (n - 1) lst in
loop (Array.unsafe_get ovector 1) sstrs 0 []
(* Try to perform an anchored non-empty match, as the previous match was
empty. If this succeeds, we can safely advance past the match. *)
let ovector' = pcre2_match ~iflags:null_flags ~rex ~pos ?callout subj in
( Array.unsafe_get ovector' 1, ovector', succ idx
, (subj, ovector') :: lst )
with Not_found ->
(* A Not_found exception here does not necessarily mean that there are
no more matches remaining in this string: advance to the beginning of
the next UTF-8 character or past the beginning of the newline we
currently reference, then continue matching. *)
let next_offset =
if config_newline_is_crlf then
if pos < pred subj_len
&& Char.equal subj.[pos] '\r'
&& Char.equal subj.[succ pos] '\n'
then 2
else 1
else if config_unicode then
let first_byte = Char.code subj.[pos] in
if first_byte land 0b11111000 = 0b11110000 then 4
else if first_byte land 0b11110000 = 0b11100000 then 3
else if first_byte land 0b11100000 = 0b11000000 then 2
else 1
else 1
in
(pos + next_offset, ovector, idx, lst)
in
loop pos' ovector' idx' lst'
in
loop (Array.unsafe_get ovector 1) ovector 1 [(subj, ovector)]

let extract ?iflags ?flags ?rex ?pat ?pos ?full_match ?callout subj =
get_substrings ?full_match (exec ?iflags ?flags ?rex ?pat ?pos ?callout subj)
Expand Down
49 changes: 46 additions & 3 deletions test/pcre2_tests.ml
Original file line number Diff line number Diff line change
@@ -1,16 +1,59 @@
open OUnit2
open Pcre2

let simple_test ctxt =
let test_full_split ctxt =
assert_equal 0 0
; assert_equal [Text "ab"; Delim "x"; Group (1, "x"); NoGroup; Text "cd"]
(full_split ~pat:"(x)|(u)" "abxcd")
; assert_equal [Text "ab"; Delim "x"; Group (1, "x"); NoGroup; Text "cd"; Delim "u";
NoGroup; Group (2, "u"); Text "ef"]
(full_split ~pat:"(x)|(u)" "abxcduef")

let suite = "Test pcre" >::: [
"simple_test" >:: simple_test
let test_exec_all ctxt =
let assert_matches ~pat subj expected =
let rec assert_match_equal idx expected actual =
match expected with
| (exp_subj, exp_start, exp_stop) :: exp_tl ->
assert_equal exp_subj (get_substring actual idx);
let act_start, act_stop = get_substring_ofs actual idx in
assert_equal exp_start act_start;
assert_equal exp_stop act_stop;
assert_match_equal (succ idx) exp_tl actual
| [] -> ()
and assert_substrings_equal expected actual =
match expected, actual with
| exp_hd :: exp_tl, act_hd :: act_tl ->
assert_equal (List.length exp_hd) (num_of_subs act_hd);
assert_match_equal 0 exp_hd act_hd;
assert_substrings_equal exp_tl act_tl
| [], [] -> ()
| _, _ -> failwith "Unequal substring list lengths"
in
let actual = exec_all ~pat subj in
assert_equal (List.length expected) (Array.length actual);
assert_substrings_equal expected @@ Array.to_list actual
in
(* A pattern with no matches should raise Not_found. *)
assert_raises Not_found (fun () -> exec_all ~pat:"empty" "");
assert_raises Not_found (fun () -> exec_all ~pat:"empty" "empt");
(* Single matches of non-zero-length patterns. *)
assert_matches ~pat:"p" "p" [[("p", 0, 1)]];
assert_matches ~pat:"pattern" "pattern" [[("pattern", 0, 7)]];
(* Multiple matches of non-zero-length patterns. *)
assert_matches ~pat:"a" "aaa" [[("a", 0, 1)]; [("a", 1, 2)]; [("a", 2, 3)]];
assert_matches ~pat:"hello|ocaml" "hello ocaml"
[[("hello", 0, 5)]; [("ocaml", 6, 11)]];
assert_matches ~pat:"(hello|ocaml)" "hello ocaml"
[[("hello", 0, 5); ("hello", 0, 5)]; [("ocaml", 6, 11); ("ocaml", 6, 11)]];
(* Matches of zero-length patterns. *)
assert_matches ~pat:"(?=(hello|ocaml))" "hello ocaml"
[[("", 0, 0); ("hello", 0, 5)]; [("", 6, 6); ("ocaml", 6, 11)]];
assert_matches ~pat:"(?=(hello|ocaml))" "hellocaml"
[[("", 0, 0); ("hello", 0, 5)]; [("", 4, 4); ("ocaml", 4, 9)]]

let suite = "Test pcre2" >::: [
"test_full_split" >:: test_full_split;
"test_exec_all" >:: test_exec_all;
]

let _ =
Expand Down