@@ -19,10 +19,6 @@ package org.apache.spark.sql.types
1919
2020import java .util .Arrays
2121
22- import org .apache .spark .unsafe .PlatformDependent .BYTE_ARRAY_OFFSET
23- import org .apache .spark .unsafe .array .ByteArrayMethods
24- import org .apache .spark .unsafe .string .UTF8StringMethods
25-
2622/**
2723 * A UTF-8 String, as internal representation of StringType in SparkSQL
2824 *
@@ -40,7 +36,8 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
4036 * Update the UTF8String with String.
4137 */
def set(str: String): UTF8String = {
  // Encode straight into the backing array. The charset name has to be exactly "utf-8"
  // (no surrounding whitespace); a padded name is rejected as an unsupported encoding.
  bytes = str.getBytes("utf-8")
  // Return this instance so calls can be chained, e.g. new UTF8String().set(...)
  this
}
4542
4643 /**
@@ -51,13 +48,29 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
5148 this
5249 }
5350
/**
 * Number of bytes occupied by the UTF-8 sequence that begins with byte `b`.
 *
 * Bytes whose unsigned value is below 192 (ASCII and continuation bytes) count as a
 * single byte; for anything else the width is looked up in
 * `UTF8String.bytesOfCodePointInUTF8`.
 *
 * @param b The first byte of a code point
 * @return how many bytes to advance to reach the next code point
 */
@inline
private[this] def numOfBytes(b: Byte): Int = {
  // Widen to the unsigned value first: Byte is signed, so 0xC0..0xFF would be negative.
  val lead = b & 0xFF
  if (lead < 192) 1 else UTF8String.bytesOfCodePointInUTF8(lead - 192)
}
60+
/**
 * Count the code points stored in this string.
 *
 * This is only used by Substring() when `start` is negative.
 *
 * @return the number of code points, found by hopping from lead byte to lead byte
 */
def length(): Int = {
  val total = bytes.length
  var pos = 0
  var count = 0
  while (pos < total) {
    count += 1
    // Skip over the whole encoded sequence that starts at `pos`.
    pos += numOfBytes(bytes(pos))
  }
  count
}
6275
6376 def getBytes : Array [Byte ] = {
@@ -77,12 +90,12 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
7790 var c = 0
7891 var i : Int = 0
7992 while (c < start && i < bytes.length) {
80- i += UTF8StringMethods . numOfBytes(bytes(i))
93+ i += numOfBytes(bytes(i))
8194 c += 1
8295 }
8396 var j = i
8497 while (c < until && j < bytes.length) {
85- j += UTF8StringMethods . numOfBytes(bytes(j))
98+ j += numOfBytes(bytes(j))
8699 c += 1
87100 }
88101 UTF8String (Arrays .copyOfRange(bytes, i, j))
@@ -105,27 +118,19 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
105118 }
106119
/**
 * Check whether the bytes of this string begin with the bytes of `prefix`.
 *
 * The comparison is done in place, byte by byte, so no temporary array is allocated.
 *
 * @param prefix the candidate prefix
 * @return true if every byte of `prefix` matches the leading bytes of this string
 *         (an empty prefix always matches); false if `prefix` is longer than this string
 */
def startsWith(prefix: UTF8String): Boolean = {
  val p = prefix.getBytes
  if (p.length > bytes.length) {
    false
  } else {
    var i = 0
    // Advance while the bytes agree; stopping early means a mismatch was found.
    while (i < p.length && bytes(i) == p(i)) {
      i += 1
    }
    i == p.length
  }
}
118127
/**
 * Check whether the bytes of this string end with the bytes of `suffix`.
 *
 * The comparison is done in place, byte by byte, so no temporary array is allocated.
 *
 * @param suffix the candidate suffix
 * @return true if every byte of `suffix` matches the trailing bytes of this string
 *         (an empty suffix always matches); false if `suffix` is longer than this string
 */
def endsWith(suffix: UTF8String): Boolean = {
  val s = suffix.getBytes
  // Index in `bytes` where the suffix would have to start; negative if it cannot fit.
  val offset = bytes.length - s.length
  if (offset < 0) {
    false
  } else {
    var i = 0
    // Advance while the bytes agree; stopping early means a mismatch was found.
    while (i < s.length && bytes(offset + i) == s(i)) {
      i += 1
    }
    i == s.length
  }
}
130135
131136 def toUpperCase (): UTF8String = {
@@ -145,15 +150,14 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
/**
 * Create a new UTF8String initialised from this instance's bytes.
 *
 * NOTE(review): whether the byte array is shared with or copied into the new instance
 * depends on `set(bytes)` -- confirm before mutating either side.
 */
override def clone(): UTF8String = {
  val copy = new UTF8String()
  copy.set(this.bytes)
}
146151
/**
 * Lexicographically compare this string with `other`, byte by byte.
 *
 * Bytes are compared as unsigned values (0..255). With UTF-8 this yields the same order
 * as comparing code points; a signed comparison would place every non-ASCII byte
 * (0x80 and above, negative as a `Byte`) before all ASCII characters.
 *
 * @param other the string to compare against
 * @return a negative value if this string sorts first, a positive value if `other` sorts
 *         first, and 0 if both hold identical bytes; when one is a prefix of the other,
 *         the shorter string sorts first
 */
override def compare(other: UTF8String): Int = {
  val b = other.getBytes
  val limit = math.min(bytes.length, b.length)
  var i = 0
  var res = 0
  // Stop at the first position where the two strings differ.
  while (res == 0 && i < limit) {
    res = (bytes(i) & 0xFF) - (b(i) & 0xFF)
    i += 1
  }
  // Common prefix is identical: fall back to comparing lengths.
  if (res != 0) res else bytes.length - b.length
}
158162
159163 override def compareTo (other : UTF8String ): Int = {
@@ -162,14 +166,7 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
162166
163167 override def equals (other : Any ): Boolean = other match {
164168 case s : UTF8String =>
165- val otherBytes = s.getBytes
166- otherBytes.length == bytes.length && ByteArrayMethods .arrayEquals(
167- bytes,
168- BYTE_ARRAY_OFFSET ,
169- otherBytes,
170- BYTE_ARRAY_OFFSET ,
171- otherBytes.length
172- )
169+ Arrays .equals(bytes, s.getBytes)
173170 case s : String =>
174171 // This is only used for Catalyst unit tests
175172 // fail fast
@@ -184,6 +181,14 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
184181}
185182
186183object UTF8String {
// Total number of bytes in a UTF-8 sequence, indexed by (unsigned first byte - 192).
// Covers first-byte values 192..255: 32 two-byte, 16 three-byte, 8 four-byte,
// 4 five-byte and 4 six-byte entries (64 in total).
// see http://en.wikipedia.org/wiki/UTF-8
private[types] val bytesOfCodePointInUTF8: Array[Int] =
  Array.fill(32)(2) ++
    Array.fill(16)(3) ++
    Array.fill(8)(4) ++
    Array.fill(4)(5) ++
    Array.fill(4)(6)
187192
188193 /**
189194 * Create a UTF-8 String from String
0 commit comments