@@ -80,10 +80,6 @@ def initialize
8080    @binary_input    =  nil 
8181    @current_token   =  nil 
8282    @debug           =  false 
83-     @input           =  nil 
84-     @input_encoding  =  nil 
85-     @line            =  0 
86-     @line_pos        =  0 
8783    @s               =  nil 
8884    @tokens          =  [ ] 
8985  end 
@@ -319,13 +315,6 @@ def build_verbatim margin
319315    verbatim 
320316  end 
321317
322-   ## 
323-   # The character offset for the input string at the given +byte_offset+ 
324- 
325-   def  char_pos  byte_offset 
326-     @input . byteslice ( 0 ,  byte_offset ) . length 
327-   end 
328- 
329318  ## 
330319  # Pulls the next token from the stream. 
331320
@@ -424,15 +413,54 @@ def peek_token
424413    token 
425414  end 
426415
416+   ## 
417+   # A simple wrapper of StringScanner that is aware of the current column and lineno 
418+ 
419+   class  MyStringScanner 
420+     def  initialize ( input ) 
421+       @line  =  @column  =  0 
422+       @s  =  StringScanner . new  input 
423+     end 
424+ 
425+     def  scan ( re ) 
426+       prev_pos  =  @s . pos 
427+       ret  =  @s . scan ( re ) 
428+       @column  += ret . length  if  ret 
429+       ret 
430+     end 
431+ 
432+     def  unscan ( s ) 
433+       @s . pos  -= s . bytesize 
434+       @column  -= s . length 
435+     end 
436+ 
437+     def  pos 
438+       [ @column ,  @line ] 
439+     end 
440+ 
441+     def  newline! 
442+       @column  =  0 
443+       @line  += 1 
444+     end 
445+ 
446+     def  eos? 
447+       @s . eos? 
448+     end 
449+ 
450+     def  matched 
451+       @s . matched 
452+     end 
453+ 
454+     def  []( i ) 
455+       @s [ i ] 
456+     end 
457+   end 
458+ 
427459  ## 
428460  # Creates the StringScanner 
429461
430462  def  setup_scanner  input 
431-     @line      =  0 
432-     @line_pos  =  0 
433-     @input     =  input . dup 
434- 
435-     @s  =  StringScanner . new  input 
463+     @s  =  MyStringScanner . new  input 
436464  end 
437465
438466  ## 
@@ -467,31 +495,30 @@ def tokenize input
467495      @tokens  << case 
468496                 # [CR]LF => :NEWLINE 
469497                 when  @s . scan ( /\r ?\n / )  then 
470-                    token  =  [ :NEWLINE ,  @s . matched ,  *token_pos ( pos ) ] 
471-                    @line_pos  =  char_pos  @s . pos 
472-                    @line  += 1 
498+                    token  =  [ :NEWLINE ,  @s . matched ,  *pos ] 
499+                    @s . newline! 
473500                   token 
474501                 # === text => :HEADER then :TEXT 
475502                 when  @s . scan ( /(=+)(\s *)/ )  then 
476503                   level  =  @s [ 1 ] . length 
477-                    header  =  [ :HEADER ,  level ,  *token_pos ( pos ) ] 
504+                    header  =  [ :HEADER ,  level ,  *pos ] 
478505
479506                   if  @s [ 2 ]  =~ /^\r ?\n /  then 
480-                      @s . pos  -=  @s [ 2 ] . length 
507+                      @s . unscan ( @s [ 2 ] ) 
481508                     header 
482509                   else 
483510                     pos  =  @s . pos 
484511                     @s . scan ( /.*/ ) 
485512                     @tokens  << header 
486-                      [ :TEXT ,  @s . matched . sub ( /\r $/ ,  '' ) ,  *token_pos ( pos ) ] 
513+                      [ :TEXT ,  @s . matched . sub ( /\r $/ ,  '' ) ,  *pos ] 
487514                   end 
488515                 # --- (at least 3) and nothing else on the line => :RULE 
489516                 when  @s . scan ( /(-{3,}) *\r ?$/ )  then 
490-                    [ :RULE ,  @s [ 1 ] . length  - 2 ,  *token_pos ( pos ) ] 
517+                    [ :RULE ,  @s [ 1 ] . length  - 2 ,  *pos ] 
491518                 # * or - followed by white space and text => :BULLET 
492519                 when  @s . scan ( /([*-]) +(\S )/ )  then 
493-                    @s . pos  -=  @s [ 2 ] . bytesize   # unget \S 
494-                    [ :BULLET ,  @s [ 1 ] ,  *token_pos ( pos ) ] 
520+                    @s . unscan ( @s [ 2 ] ) 
521+                    [ :BULLET ,  @s [ 1 ] ,  *pos ] 
495522                 # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER 
496523                 when  @s . scan ( /([a-z]|\d +)\.  +(\S )/i )  then 
497524                   # FIXME if tab(s), the column will be wrong 
@@ -500,7 +527,7 @@ def tokenize input
500527                   # before (and provide a check for that at least in debug 
501528                   # mode) 
502529                   list_label  =  @s [ 1 ] 
503-                    @s . pos  -=  @s [ 2 ] . bytesize   # unget \S 
530+                    @s . unscan ( @s [ 2 ] ) 
504531                   list_type  = 
505532                     case  list_label 
506533                     when  /[a-z]/  then  :LALPHA 
@@ -509,24 +536,24 @@ def tokenize input
509536                     else 
510537                       raise  ParseError ,  "BUG token #{ list_label }  
511538                     end 
512-                    [ list_type ,  list_label ,  *token_pos ( pos ) ] 
539+                    [ list_type ,  list_label ,  *pos ] 
513540                 # [text] followed by spaces or end of line => :LABEL 
514541                 when  @s . scan ( /\[ (.*?)\] ( +|\r ?$)/ )  then 
515-                    [ :LABEL ,  @s [ 1 ] ,  *token_pos ( pos ) ] 
542+                    [ :LABEL ,  @s [ 1 ] ,  *pos ] 
516543                 # text:: followed by spaces or end of line => :NOTE 
517544                 when  @s . scan ( /(.*?)::( +|\r ?$)/ )  then 
518-                    [ :NOTE ,  @s [ 1 ] ,  *token_pos ( pos ) ] 
545+                    [ :NOTE ,  @s [ 1 ] ,  *pos ] 
519546                 # >>> followed by end of line => :BLOCKQUOTE 
520547                 when  @s . scan ( />>> *(\w +)?$/ )  then 
521-                    [ :BLOCKQUOTE ,  @s [ 1 ] ,  *token_pos ( pos ) ] 
548+                    [ :BLOCKQUOTE ,  @s [ 1 ] ,  *pos ] 
522549                 # anything else: :TEXT 
523550                 else 
524551                   @s . scan ( /(.*?)(  )?\r ?$/ ) 
525-                    token  =  [ :TEXT ,  @s [ 1 ] ,  *token_pos ( pos ) ] 
552+                    token  =  [ :TEXT ,  @s [ 1 ] ,  *pos ] 
526553
527554                   if  @s [ 2 ]  then 
528555                     @tokens  << token 
529-                      [ :BREAK ,  @s [ 2 ] ,  * token_pos ( pos  + @s [ 1 ] . length ) ] 
556+                      [ :BREAK ,  @s [ 2 ] ,  pos [ 0 ]  + @s [ 1 ] . length ,   pos [ 1 ] ] 
530557                   else 
531558                     token 
532559                   end 
@@ -536,16 +563,6 @@ def tokenize input
536563    self 
537564  end 
538565
539-   ## 
540-   # Calculates the column (by character) and line of the current token based 
541-   # on +byte_offset+. 
542- 
543-   def  token_pos  byte_offset 
544-     offset  =  char_pos  byte_offset 
545- 
546-     [ offset  - @line_pos ,  @line ] 
547-   end 
548- 
549566  ## 
550567  # Returns the current token to the token stream 
551568
0 commit comments