Skip to content

Commit a95291e

Browse files
committed
Cleanups to string handling code
Changes to UTF8String will be deferred for a followup, since we should benchmark them first.
1 parent afe8dca commit a95291e

File tree

5 files changed

+63
-326
lines changed

5 files changed

+63
-326
lines changed

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -311,19 +311,26 @@ public double getDouble(int i) {
311311
}
312312

313313
public UTF8String getUTF8String(int i) {
314-
// TODO: this is inefficient; just doing this to make some tests pass for now; will fix later
315314
assertIndexIsValid(i);
316-
return UTF8String.apply(getString(i));
315+
final UTF8String str = new UTF8String();
316+
final long offsetToStringSize = getLong(i);
317+
final int stringSizeInBytes =
318+
(int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offsetToStringSize);
319+
final byte[] strBytes = new byte[stringSizeInBytes];
320+
PlatformDependent.copyMemory(
321+
baseObject,
322+
baseOffset + offsetToStringSize + 8, // The +8 is to skip past the size to get the data,
323+
strBytes,
324+
PlatformDependent.BYTE_ARRAY_OFFSET,
325+
stringSizeInBytes
326+
);
327+
str.set(strBytes);
328+
return str;
317329
}
318330

319331
@Override
320332
public String getString(int i) {
321-
assertIndexIsValid(i);
322-
final long offsetToStringSize = getLong(i);
323-
final long stringSizeInBytes =
324-
PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offsetToStringSize);
325-
// TODO: ugly cast; figure out whether we'll support mega long strings
326-
return UTF8StringMethods.toJavaString(baseObject, baseOffset + offsetToStringSize + 8, (int) stringSizeInBytes);
333+
return getUTF8String(i).toString();
327334
}
328335

329336
@Override

sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala

Lines changed: 48 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -19,10 +19,6 @@ package org.apache.spark.sql.types
1919

2020
import java.util.Arrays
2121

22-
import org.apache.spark.unsafe.PlatformDependent.BYTE_ARRAY_OFFSET
23-
import org.apache.spark.unsafe.array.ByteArrayMethods
24-
import org.apache.spark.unsafe.string.UTF8StringMethods
25-
2622
/**
2723
* A UTF-8 String, as internal representation of StringType in SparkSQL
2824
*
@@ -40,7 +36,8 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
4036
* Update the UTF8String with String.
4137
*/
4238
def set(str: String): UTF8String = {
43-
set(str.getBytes("utf-8"))
39+
bytes = str.getBytes("utf-8")
40+
this
4441
}
4542

4643
/**
@@ -51,13 +48,29 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
5148
this
5249
}
5350

51+
/**
52+
* Return the number of bytes for a code point with the first byte as `b`
53+
* @param b The first byte of a code point
54+
*/
55+
@inline
56+
private[this] def numOfBytes(b: Byte): Int = {
57+
val offset = (b & 0xFF) - 192
58+
if (offset >= 0) UTF8String.bytesOfCodePointInUTF8(offset) else 1
59+
}
60+
5461
/**
5562
* Return the number of code points in it.
5663
*
5764
* This is only used by Substring() when `start` is negative.
5865
*/
5966
def length(): Int = {
60-
UTF8StringMethods.getLengthInCodePoints(bytes, BYTE_ARRAY_OFFSET, bytes.length)
67+
var len = 0
68+
var i: Int = 0
69+
while (i < bytes.length) {
70+
i += numOfBytes(bytes(i))
71+
len += 1
72+
}
73+
len
6174
}
6275

6376
def getBytes: Array[Byte] = {
@@ -77,12 +90,12 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
7790
var c = 0
7891
var i: Int = 0
7992
while (c < start && i < bytes.length) {
80-
i += UTF8StringMethods.numOfBytes(bytes(i))
93+
i += numOfBytes(bytes(i))
8194
c += 1
8295
}
8396
var j = i
8497
while (c < until && j < bytes.length) {
85-
j += UTF8StringMethods.numOfBytes(bytes(j))
98+
j += numOfBytes(bytes(j))
8699
c += 1
87100
}
88101
UTF8String(Arrays.copyOfRange(bytes, i, j))
@@ -105,27 +118,19 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
105118
}
106119

107120
def startsWith(prefix: UTF8String): Boolean = {
108-
val prefixBytes = prefix.getBytes
109-
UTF8StringMethods.startsWith(
110-
bytes,
111-
BYTE_ARRAY_OFFSET,
112-
bytes.length,
113-
prefixBytes,
114-
BYTE_ARRAY_OFFSET,
115-
prefixBytes.length
116-
)
121+
val b = prefix.getBytes
122+
if (b.length > bytes.length) {
123+
return false
124+
}
125+
Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b)
117126
}
118127

119128
def endsWith(suffix: UTF8String): Boolean = {
120-
val suffixBytes = suffix.getBytes
121-
UTF8StringMethods.endsWith(
122-
bytes,
123-
BYTE_ARRAY_OFFSET,
124-
bytes.length,
125-
suffixBytes,
126-
BYTE_ARRAY_OFFSET,
127-
suffixBytes.length
128-
)
129+
val b = suffix.getBytes
130+
if (b.length > bytes.length) {
131+
return false
132+
}
133+
Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b)
129134
}
130135

131136
def toUpperCase(): UTF8String = {
@@ -145,15 +150,14 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
145150
override def clone(): UTF8String = new UTF8String().set(this.bytes)
146151

147152
override def compare(other: UTF8String): Int = {
148-
val otherBytes = other.getBytes
149-
UTF8StringMethods.compare(
150-
bytes,
151-
BYTE_ARRAY_OFFSET,
152-
bytes.length,
153-
otherBytes,
154-
BYTE_ARRAY_OFFSET,
155-
otherBytes.length
156-
)
153+
var i: Int = 0
154+
val b = other.getBytes
155+
while (i < bytes.length && i < b.length) {
156+
val res = bytes(i).compareTo(b(i))
157+
if (res != 0) return res
158+
i += 1
159+
}
160+
bytes.length - b.length
157161
}
158162

159163
override def compareTo(other: UTF8String): Int = {
@@ -162,14 +166,7 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
162166

163167
override def equals(other: Any): Boolean = other match {
164168
case s: UTF8String =>
165-
val otherBytes = s.getBytes
166-
otherBytes.length == bytes.length && ByteArrayMethods.arrayEquals(
167-
bytes,
168-
BYTE_ARRAY_OFFSET,
169-
otherBytes,
170-
BYTE_ARRAY_OFFSET,
171-
otherBytes.length
172-
)
169+
Arrays.equals(bytes, s.getBytes)
173170
case s: String =>
174171
// This is only used for Catalyst unit tests
175172
// fail fast
@@ -184,6 +181,14 @@ final class UTF8String extends Ordered[UTF8String] with Serializable {
184181
}
185182

186183
object UTF8String {
184+
// total number of bytes in a UTF8 sequence for a code point, indexed by (first byte - 192)
185+
// see http://en.wikipedia.org/wiki/UTF-8, 192-255 of Byte 1
186+
private[types] val bytesOfCodePointInUTF8: Array[Int] = Array(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
187+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
188+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
189+
4, 4, 4, 4, 4, 4, 4, 4,
190+
5, 5, 5, 5,
191+
6, 6, 6, 6)
187192

188193
/**
189194
* Create a UTF-8 String from String

unsafe/src/main/java/org/apache/spark/unsafe/string/UTF8StringMethods.java

Lines changed: 0 additions & 172 deletions
This file was deleted.

0 commit comments

Comments
 (0)