|
24 | 24 |
|
25 | 25 | module P = Ext_pp
|
26 | 26 |
|
| 27 | +open Ext_utf8 |
| 28 | + |
27 | 29 | (** Avoid allocating single-character strings over and over *)
|
28 | 30 | let array_str1 = Array.init 256 (fun i -> String.make 1 (Char.chr i))
|
29 | 31 |
|
@@ -56,40 +58,86 @@ let ( +> ) = Ext_buffer.add_string
|
56 | 58 | let escape_to_buffer f (* ?(utf=false)*) s =
|
57 | 59 | let pp_raw_string f (* ?(utf=false)*) s =
|
58 | 60 | let l = String.length s in
|
59 |
| - for i = 0 to l - 1 do |
60 |
| - let c = String.unsafe_get s i in |
| 61 | + let i = ref 0 in |
| 62 | + while !i < l do |
| 63 | + let c = String.unsafe_get s !i in |
61 | 64 | match c with
|
62 |
| - | '\b' -> f +> "\\b" |
63 |
| - | '\012' -> f +> "\\f" |
64 |
| - | '\n' -> f +> "\\n" |
65 |
| - | '\r' -> f +> "\\r" |
66 |
| - | '\t' -> f +> "\\t" |
| 65 | + | '\b' -> |
| 66 | + f +> "\\b"; |
| 67 | + incr i |
| 68 | + | '\012' -> |
| 69 | + f +> "\\f"; |
| 70 | + incr i |
| 71 | + | '\n' -> |
| 72 | + f +> "\\n"; |
| 73 | + incr i |
| 74 | + | '\r' -> |
| 75 | + f +> "\\r"; |
| 76 | + incr i |
| 77 | + | '\t' -> |
| 78 | + f +> "\\t"; |
| 79 | + incr i |
67 | 80 | (* This escape sequence is not supported by IE < 9
|
68 | 81 | | '\011' -> "\\v"
|
69 |
| - IE < 9 treats '\v' as 'v' instead of a vertical tab ('\x0B'). |
70 |
| - If cross-browser compatibility is a concern, use \x0B instead of \v. |
| 82 | + IE < 9 treats '\v' as 'v' instead of a vertical tab ('\x0B'). |
| 83 | + If cross-browser compatibility is a concern, use \x0B instead of \v. |
71 | 84 |
|
72 |
| - Another thing to note is that the \v and \0 escapes are not allowed in JSON strings. |
73 |
| - *) |
| 85 | + Another thing to note is that the \v and \0 escapes are not allowed in JSON strings. |
| 86 | + *) |
74 | 87 | | '\000'
|
75 |
| - when i = l - 1 |
| 88 | + when !i = l - 1 |
76 | 89 | ||
|
77 |
| - let next = String.unsafe_get s (i + 1) in |
| 90 | + let next = String.unsafe_get s (!i + 1) in |
78 | 91 | next < '0' || next > '9' ->
|
79 |
| - f +> "\\0" |
80 |
| - | '\\' (* when not utf*) -> f +> "\\\\" |
| 92 | + f +> "\\0"; |
| 93 | + incr i |
| 94 | + | '\\' (* when not utf*) -> |
| 95 | + f +> "\\\\"; |
| 96 | + incr i |
81 | 97 | | '\000' .. '\031' | '\127' ->
|
82 | 98 | let c = Char.code c in
|
83 | 99 | f +> "\\x";
|
84 | 100 | f +> Array.unsafe_get array_conv (c lsr 4);
|
85 |
| - f +> Array.unsafe_get array_conv (c land 0xf) |
86 |
| - | '\128' .. '\255' (* when not utf*) -> |
87 |
| - let c = Char.code c in |
88 |
| - f +> "\\x"; |
89 |
| - f +> Array.unsafe_get array_conv (c lsr 4); |
90 |
| - f +> Array.unsafe_get array_conv (c land 0xf) |
91 |
| - | '\"' -> f +> "\\\"" (* quote*) |
92 |
| - | _ -> f +> Array.unsafe_get array_str1 (Char.code c) |
| 101 | + f +> Array.unsafe_get array_conv (c land 0xf); |
| 102 | + incr i |
| 103 | + | '\128' .. '\255' -> ( |
| 104 | + (* Check if this is part of a valid UTF-8 sequence *) |
| 105 | + let utf8_byte = classify c in |
| 106 | + match utf8_byte with |
| 107 | + | Single _ -> |
| 108 | + (* Single byte >= 128, escape it *) |
| 109 | + let c = Char.code c in |
| 110 | + f +> "\\x"; |
| 111 | + f +> Array.unsafe_get array_conv (c lsr 4); |
| 112 | + f +> Array.unsafe_get array_conv (c land 0xf); |
| 113 | + incr i |
| 114 | + | Leading (n, _) -> |
| 115 | + (* Start of a UTF-8 sequence: write this byte and up to n following bytes as-is (the following bytes are not checked to be continuation bytes) *) |
| 116 | + let rec output_utf8_sequence pos remaining = |
| 117 | + if remaining > 0 && pos < l then ( |
| 118 | + let byte = String.unsafe_get s pos in |
| 119 | + f +> Array.unsafe_get array_str1 (Char.code byte); |
| 120 | + output_utf8_sequence (pos + 1) (remaining - 1)) |
| 121 | + in |
| 122 | + output_utf8_sequence !i (n + 1); |
| 123 | + (* Skip the continuation bytes *) |
| 124 | + i := !i + n + 1 |
| 125 | + | Cont _ -> |
| 126 | + (* Stray continuation byte (normally consumed by the Leading case): it is skipped and nothing is written to [f] *) |
| 127 | + incr i |
| 128 | + | Invalid -> |
| 129 | + (* Invalid UTF-8 byte, escape it *) |
| 130 | + let c = Char.code c in |
| 131 | + f +> "\\x"; |
| 132 | + f +> Array.unsafe_get array_conv (c lsr 4); |
| 133 | + f +> Array.unsafe_get array_conv (c land 0xf); |
| 134 | + incr i) |
| 135 | + | '\"' -> |
| 136 | + f +> "\\\""; |
| 137 | + incr i (* quote*) |
| 138 | + | _ -> |
| 139 | + f +> Array.unsafe_get array_str1 (Char.code c); |
| 140 | + incr i |
93 | 141 | done
|
94 | 142 | in
|
95 | 143 | f +> "\"";
|
|
0 commit comments