1+ /*
2+ * Licensed to Elasticsearch B.V. under one or more contributor
3+ * license agreements. See the NOTICE file distributed with
4+ * this work for additional information regarding copyright
5+ * ownership. Elasticsearch B.V. licenses this file to you under
6+ * the Apache License, Version 2.0 (the "License"); you may
7+ * not use this file except in compliance with the License.
8+ * You may obtain a copy of the License at
9+ *
10+ * http://www.apache.org/licenses/LICENSE-2.0
11+ *
12+ * Unless required by applicable law or agreed to in writing,
13+ * software distributed under the License is distributed on an
14+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+ * KIND, either express or implied. See the License for the
16+ * specific language governing permissions and limitations
17+ * under the License.
18+ */
19+
20+ package org .logstash .common ;
21+
22+ import org .jruby .RubyArray ;
23+ import org .jruby .RubyEncoding ;
24+ import org .jruby .RubyString ;
25+ import org .jruby .runtime .ThreadContext ;
26+ import org .jruby .runtime .builtin .IRubyObject ;
27+ import org .junit .Before ;
28+ import org .junit .Test ;
29+ import org .logstash .RubyTestBase ;
30+ import org .logstash .RubyUtil ;
31+
32+ import java .util .List ;
33+
34+ import static org .junit .Assert .assertEquals ;
35+ import static org .junit .Assert .assertTrue ;
36+ import static org .logstash .RubyUtil .RUBY ;
37+
38+ @ SuppressWarnings ("unchecked" )
39+ public final class BufferedTokenizerExtTest extends RubyTestBase {
40+
41+ private BufferedTokenizerExt sut ;
42+ private ThreadContext context ;
43+
44+ @ Before
45+ public void setUp () {
46+ sut = new BufferedTokenizerExt (RubyUtil .RUBY , RubyUtil .BUFFERED_TOKENIZER );
47+ context = RUBY .getCurrentContext ();
48+ IRubyObject [] args = {};
49+ sut .init (context , args );
50+ }
51+
52+ @ Test
53+ public void shouldTokenizeASingleToken () {
54+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("foo\n " ));
55+
56+ assertEquals (List .of ("foo" ), tokens );
57+ }
58+
59+ @ Test
60+ public void shouldMergeMultipleToken () {
61+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("foo" ));
62+ assertTrue (tokens .isEmpty ());
63+
64+ tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("bar\n " ));
65+ assertEquals (List .of ("foobar" ), tokens );
66+ }
67+
68+ @ Test
69+ public void shouldTokenizeMultipleToken () {
70+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("foo\n bar\n " ));
71+
72+ assertEquals (List .of ("foo" , "bar" ), tokens );
73+ }
74+
75+ @ Test
76+ public void shouldIgnoreEmptyPayload () {
77+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("" ));
78+ assertTrue (tokens .isEmpty ());
79+
80+ tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("foo\n bar" ));
81+ assertEquals (List .of ("foo" ), tokens );
82+ }
83+
84+ @ Test
85+ public void shouldTokenizeEmptyPayloadWithNewline () {
86+ RubyArray <RubyString > tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("\n " ));
87+ assertEquals (List .of ("" ), tokens );
88+
89+ tokens = (RubyArray <RubyString >) sut .extract (context , RubyUtil .RUBY .newString ("\n \n \n " ));
90+ assertEquals (List .of ("" , "" , "" ), tokens );
91+ }
92+
93+ @ Test
94+ public void shouldNotChangeEncodingOfTokensAfterPartitioning () {
95+ RubyString rubyString = RubyString .newString (RUBY , new byte []{(byte ) 0xA3 , 0x0A , 0x41 }); // £ character, newline, A
96+ IRubyObject rubyInput = rubyString .force_encoding (context , RUBY .newString ("ISO8859-1" ));
97+ RubyArray <RubyString > tokens = (RubyArray <RubyString >)sut .extract (context , rubyInput );
98+
99+ // read the first token, the £ string
100+ IRubyObject firstToken = tokens .shift (context );
101+ assertEquals ("£" , firstToken .toString ());
102+
103+ // verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion
104+ RubyEncoding encoding = (RubyEncoding ) firstToken .callMethod (context , "encoding" );
105+ assertEquals ("ISO-8859-1" , encoding .toString ());
106+ }
107+
108+ @ Test
109+ public void shouldNotChangeEncodingOfTokensAfterPartitioningInCaseMultipleExtractionInInvoked () {
110+ RubyString rubyString = RubyString .newString (RUBY , new byte []{(byte ) 0xA3 }); // £ character
111+ IRubyObject rubyInput = rubyString .force_encoding (context , RUBY .newString ("ISO8859-1" ));
112+ sut .extract (context , rubyInput );
113+ IRubyObject capitalAInLatin1 = RubyString .newString (RUBY , new byte []{(byte ) 0x41 })
114+ .force_encoding (context , RUBY .newString ("ISO8859-1" ));
115+ RubyArray <RubyString > tokens = (RubyArray <RubyString >)sut .extract (context , capitalAInLatin1 );
116+ assertTrue (tokens .isEmpty ());
117+
118+ tokens = (RubyArray <RubyString >)sut .extract (context , RubyString .newString (RUBY , new byte []{(byte ) 0x0A }));
119+
120+ // read the first token, the £ string
121+ IRubyObject firstToken = tokens .shift (context );
122+ assertEquals ("£A" , firstToken .toString ());
123+
124+ // verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion
125+ RubyEncoding encoding = (RubyEncoding ) firstToken .callMethod (context , "encoding" );
126+ assertEquals ("ISO-8859-1" , encoding .toString ());
127+ }
128+
129+ @ Test
130+ public void shouldNotChangeEncodingOfTokensAfterPartitioningWhenRetrieveLastFlushedToken () {
131+ RubyString rubyString = RubyString .newString (RUBY , new byte []{(byte ) 0xA3 , 0x0A , 0x41 }); // £ character, newline, A
132+ IRubyObject rubyInput = rubyString .force_encoding (context , RUBY .newString ("ISO8859-1" ));
133+ RubyArray <RubyString > tokens = (RubyArray <RubyString >)sut .extract (context , rubyInput );
134+
135+ // read the first token, the £ string
136+ IRubyObject firstToken = tokens .shift (context );
137+ assertEquals ("£" , firstToken .toString ());
138+
139+ // flush and check that the remaining A is still encoded in ISO8859-1
140+ IRubyObject lastToken = sut .flush (context );
141+ assertEquals ("A" , lastToken .toString ());
142+
143+ // verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion
144+ RubyEncoding encoding = (RubyEncoding ) lastToken .callMethod (context , "encoding" );
145+ assertEquals ("ISO-8859-1" , encoding .toString ());
146+ }
147+
148+ @ Test
149+ public void givenDirectFlushInvocationUTF8EncodingIsApplied () {
150+ RubyString rubyString = RubyString .newString (RUBY , new byte []{(byte ) 0xA3 , 0x41 }); // £ character, A
151+ IRubyObject rubyInput = rubyString .force_encoding (context , RUBY .newString ("ISO8859-1" ));
152+
153+ // flush and check that the remaining A is still encoded in ISO8859-1
154+ IRubyObject lastToken = sut .flush (context );
155+ assertEquals ("" , lastToken .toString ());
156+
157+ // verify encoding "ISO8859-1" is preserved in the Java to Ruby String conversion
158+ RubyEncoding encoding = (RubyEncoding ) lastToken .callMethod (context , "encoding" );
159+ assertEquals ("UTF-8" , encoding .toString ());
160+ }
161+ }
0 commit comments