1717mod cursor;
1818pub mod unescape;
1919
20+ #[ cfg( test) ]
21+ mod tests;
22+
2023use self :: LiteralKind :: * ;
2124use self :: TokenKind :: * ;
2225use crate :: cursor:: { Cursor , EOF_CHAR } ;
26+ use std:: convert:: TryInto ;
2327
2428/// Parsed token.
2529/// It doesn't contain information about data that has been parsed,
@@ -132,9 +136,80 @@ pub enum LiteralKind {
132136 /// "b"abc"", "b"abc"
133137 ByteStr { terminated : bool } ,
134138 /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
135- RawStr { n_hashes : usize , started : bool , terminated : bool } ,
139+ RawStr ( UnvalidatedRawStr ) ,
136140 /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
137- RawByteStr { n_hashes : usize , started : bool , terminated : bool } ,
141+ RawByteStr ( UnvalidatedRawStr ) ,
142+ }
143+
/// A raw string literal exactly as the lexer scanned it, before any checks
/// have been applied.
///
/// The literal may be malformed (bad prefix, too few closing `#`s, too many
/// `#`s). Call `.validate()` to turn it into a `ValidatedRawStr` or an error
/// describing what is wrong.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct UnvalidatedRawStr {
    /// Whether the prefix (`r###"`) is valid, i.e. the leading `#`s are
    /// immediately followed by `"`.
    valid_start: bool,
    /// The number of leading `#`.
    n_start_hashes: usize,
    /// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes`.
    n_end_hashes: usize,
    /// The offset, counted from the `r` or `br` prefix, where the user may
    /// have intended to end the string. Currently this is derived from the
    /// longest `"#+` run (a quote followed by hashes) that was still too
    /// short to close the literal; `None` if there is no such candidate.
    possible_terminator_offset: Option<usize>,
}
159+
/// Error produced validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - Too many `#`s (>65535): `LexRawStrError::TooManyDelimiters`
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LexRawStrError {
    /// Non `#` characters exist between `r` and `"`, e.g. `r#~"..`
    InvalidStarter,
    /// The string was never terminated.
    ///
    /// `expected` is the number of `#`s that opened the literal and `found`
    /// is the number of closing `#`s that were actually seen.
    /// `possible_terminator_offset` is the offset after `r` or `br` where the
    /// user may have intended to terminate the string, if there is a
    /// plausible candidate.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#`s exist; the count must fit in a `u16`.
    TooManyDelimiters,
}
174+
/// Raw string that contains a valid prefix (`#*"`) and postfix (`"#*`) where
/// there are a matching number of `#` characters in both. Note that this will
/// not consume extra trailing `#` characters: `r###"abcde"####` is lexed as a
/// `ValidatedRawStr { n_hashes: 3 }` followed by a `#` token.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub struct ValidatedRawStr {
    /// Number of `#`s on each side of the literal.
    n_hashes: u16,
}

impl ValidatedRawStr {
    /// Returns the number of `#` characters delimiting the raw string on
    /// each side (`0` for a plain `r"..."`).
    pub fn num_hashes(&self) -> u16 {
        self.n_hashes
    }
}
189+
190+ impl UnvalidatedRawStr {
191+ pub fn validate ( self ) -> Result < ValidatedRawStr , LexRawStrError > {
192+ if !self . valid_start {
193+ return Err ( LexRawStrError :: InvalidStarter ) ;
194+ }
195+
196+ // Only up to 65535 `#`s are allowed in raw strings
197+ let n_start_safe: u16 =
198+ self . n_start_hashes . try_into ( ) . map_err ( |_| LexRawStrError :: TooManyDelimiters ) ?;
199+
200+ if self . n_start_hashes > self . n_end_hashes {
201+ Err ( LexRawStrError :: NoTerminator {
202+ expected : self . n_start_hashes ,
203+ found : self . n_end_hashes ,
204+ possible_terminator_offset : self . possible_terminator_offset ,
205+ } )
206+ } else {
207+ // Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
208+ // they must be equal.
209+ debug_assert_eq ! ( self . n_start_hashes, self . n_end_hashes) ;
210+ Ok ( ValidatedRawStr { n_hashes : n_start_safe } )
211+ }
212+ }
138213}
139214
140215/// Base of numeric literal encoding according to its prefix.
@@ -209,7 +284,7 @@ pub fn is_whitespace(c: char) -> bool {
209284 // Dedicated whitespace characters from Unicode
210285 | '\u{2028}' // LINE SEPARATOR
211286 | '\u{2029}' // PARAGRAPH SEPARATOR
212- => true ,
287+ => true ,
213288 _ => false ,
214289 }
215290}
@@ -258,12 +333,12 @@ impl Cursor<'_> {
258333 'r' => match ( self . first ( ) , self . second ( ) ) {
259334 ( '#' , c1) if is_id_start ( c1) => self . raw_ident ( ) ,
260335 ( '#' , _) | ( '"' , _) => {
261- let ( n_hashes , started , terminated ) = self . raw_double_quoted_string ( ) ;
336+ let raw_str_i = self . raw_double_quoted_string ( 1 ) ;
262337 let suffix_start = self . len_consumed ( ) ;
263- if terminated {
338+ if raw_str_i . n_end_hashes == raw_str_i . n_start_hashes {
264339 self . eat_literal_suffix ( ) ;
265340 }
266- let kind = RawStr { n_hashes , started , terminated } ;
341+ let kind = RawStr ( raw_str_i ) ;
267342 Literal { kind, suffix_start }
268343 }
269344 _ => self . ident ( ) ,
@@ -293,12 +368,14 @@ impl Cursor<'_> {
293368 }
294369 ( 'r' , '"' ) | ( 'r' , '#' ) => {
295370 self . bump ( ) ;
296- let ( n_hashes , started , terminated ) = self . raw_double_quoted_string ( ) ;
371+ let raw_str_i = self . raw_double_quoted_string ( 2 ) ;
297372 let suffix_start = self . len_consumed ( ) ;
373+ let terminated = raw_str_i. n_start_hashes == raw_str_i. n_end_hashes ;
298374 if terminated {
299375 self . eat_literal_suffix ( ) ;
300376 }
301- let kind = RawByteStr { n_hashes, started, terminated } ;
377+
378+ let kind = RawByteStr ( raw_str_i) ;
302379 Literal { kind, suffix_start }
303380 }
304381 _ => self . ident ( ) ,
@@ -594,37 +671,49 @@ impl Cursor<'_> {
594671 false
595672 }
596673
597- /// Eats the double-quoted string and returns a tuple of
598- /// (amount of the '#' symbols, raw string started, raw string terminated)
599- fn raw_double_quoted_string ( & mut self ) -> ( usize , bool , bool ) {
674+ /// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
675+ fn raw_double_quoted_string ( & mut self , prefix_len : usize ) -> UnvalidatedRawStr {
600676 debug_assert ! ( self . prev( ) == 'r' ) ;
601- let mut started: bool = false ;
602- let mut finished: bool = false ;
677+ let mut valid_start: bool = false ;
678+ let start_pos = self . len_consumed ( ) ;
679+ let ( mut possible_terminator_offset, mut max_hashes) = ( None , 0 ) ;
603680
604681 // Count opening '#' symbols.
605- let n_hashes = self . eat_while ( |c| c == '#' ) ;
682+ let n_start_hashes = self . eat_while ( |c| c == '#' ) ;
606683
607684 // Check that string is started.
608685 match self . bump ( ) {
609- Some ( '"' ) => started = true ,
610- _ => return ( n_hashes, started, finished) ,
686+ Some ( '"' ) => valid_start = true ,
687+ _ => {
688+ return UnvalidatedRawStr {
689+ valid_start,
690+ n_start_hashes,
691+ n_end_hashes : 0 ,
692+ possible_terminator_offset,
693+ } ;
694+ }
611695 }
612696
613697 // Skip the string contents and, on each '"' character met, check whether the
614698 // '#' characters that follow it terminate the raw string.
615- while !finished {
699+ loop {
616700 self . eat_while ( |c| c != '"' ) ;
617701
618702 if self . is_eof ( ) {
619- return ( n_hashes, started, finished) ;
703+ return UnvalidatedRawStr {
704+ valid_start,
705+ n_start_hashes,
706+ n_end_hashes : max_hashes,
707+ possible_terminator_offset,
708+ } ;
620709 }
621710
622711 // Eat closing double quote.
623712 self . bump ( ) ;
624713
625714 // Check that amount of closing '#' symbols
626715 // is equal to the amount of opening ones.
627- let mut hashes_left = n_hashes ;
716+ let mut hashes_left = n_start_hashes ;
628717 let is_closing_hash = |c| {
629718 if c == '#' && hashes_left != 0 {
630719 hashes_left -= 1 ;
@@ -633,10 +722,23 @@ impl Cursor<'_> {
633722 false
634723 }
635724 } ;
636- finished = self . eat_while ( is_closing_hash) == n_hashes;
725+ let n_end_hashes = self . eat_while ( is_closing_hash) ;
726+
727+ if n_end_hashes == n_start_hashes {
728+ return UnvalidatedRawStr {
729+ valid_start,
730+ n_start_hashes,
731+ n_end_hashes,
732+ possible_terminator_offset : None ,
733+ } ;
734+ } else if n_end_hashes > max_hashes {
735+ // Keep track of possible terminators to give a hint about where there might be
736+ // a missing terminator
737+ possible_terminator_offset =
738+ Some ( self . len_consumed ( ) - start_pos - n_end_hashes + prefix_len) ;
739+ max_hashes = n_end_hashes;
740+ }
637741 }
638-
639- ( n_hashes, started, finished)
640742 }
641743
642744 fn eat_decimal_digits ( & mut self ) -> bool {
0 commit comments