Skip to content

Commit 4c23e7a

Browse files
authored
Merge branch 'master' into fork-sparksession
2 parents 05abcf8 + e420fd4 commit 4c23e7a

File tree

117 files changed

+3394
-1053
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

117 files changed

+3394
-1053
lines changed

R/pkg/R/DataFrame.R

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2642,6 +2642,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
26422642
#'
26432643
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
26442644
#' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
2645+
#' Input SparkDataFrames can have different schemas (names and data types).
26452646
#'
26462647
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
26472648
#'
@@ -2685,7 +2686,8 @@ setMethod("unionAll",
26852686

26862687
#' Union two or more SparkDataFrames
26872688
#'
2688-
#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
2689+
#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
2690+
#' requires that the input SparkDataFrames have the same column names.
26892691
#'
26902692
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
26912693
#'
@@ -2709,6 +2711,10 @@ setMethod("unionAll",
27092711
setMethod("rbind",
27102712
signature(... = "SparkDataFrame"),
27112713
function(x, ..., deparse.level = 1) {
2714+
nm <- lapply(list(x, ...), names)
2715+
if (length(unique(nm)) != 1) {
2716+
stop("Names of input data frames are different.")
2717+
}
27122718
if (nargs() == 3) {
27132719
union(x, ...)
27142720
} else {

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1850,6 +1850,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
18501850
expect_equal(count(unioned2), 12)
18511851
expect_equal(first(unioned2)$name, "Michael")
18521852

1853+
df3 <- df2
1854+
names(df3)[1] <- "newName"
1855+
expect_error(rbind(df, df3),
1856+
"Names of input data frames are different.")
1857+
expect_error(rbind(df, df2, df3),
1858+
"Names of input data frames are different.")
1859+
18531860
excepted <- arrange(except(df, df2), desc(df$age))
18541861
expect_is(unioned, "SparkDataFrame")
18551862
expect_equal(count(excepted), 2)
@@ -2585,8 +2592,8 @@ test_that("coalesce, repartition, numPartitions", {
25852592

25862593
df2 <- repartition(df1, 10)
25872594
expect_equal(getNumPartitions(df2), 10)
2588-
expect_equal(getNumPartitions(coalesce(df2, 13)), 5)
2589-
expect_equal(getNumPartitions(coalesce(df2, 7)), 5)
2595+
expect_equal(getNumPartitions(coalesce(df2, 13)), 10)
2596+
expect_equal(getNumPartitions(coalesce(df2, 7)), 7)
25902597
expect_equal(getNumPartitions(coalesce(df2, 3)), 3)
25912598
})
25922599

common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java

Lines changed: 83 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -850,26 +850,42 @@ public UTF8String translate(Map<Character, Character> dict) {
850850
return fromString(sb.toString());
851851
}
852852

853-
private int getDigit(byte b) {
854-
if (b >= '0' && b <= '9') {
855-
return b - '0';
856-
}
857-
throw new NumberFormatException(toString());
853+
/**
854+
* Wrapper over `long` to allow result of parsing long from string to be accessed via reference.
855+
* This is done solely for better performance and is not expected to be used by end users.
856+
*/
857+
public static class LongWrapper {
858+
public long value = 0;
859+
}
860+
861+
/**
862+
* Wrapper over `int` to allow result of parsing integer from string to be accessed via reference.
863+
* This is done solely for better performance and is not expected to be used by end users.
864+
*
865+
* {@link LongWrapper} could have been used here but using `int` directly save the extra cost of
866+
* conversion from `long` -> `int`
867+
*/
868+
public static class IntWrapper {
869+
public int value = 0;
858870
}
859871

860872
/**
861873
* Parses this UTF8String to long.
862874
*
863875
* Note that, in this method we accumulate the result in negative format, and convert it to
864876
* positive format at the end, if this string is not started with '-'. This is because min value
865-
* is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and
866-
* Integer.MIN_VALUE is '-2147483648'.
877+
* is bigger than max value in digits, e.g. Long.MAX_VALUE is '9223372036854775807' and
878+
* Long.MIN_VALUE is '-9223372036854775808'.
867879
*
868880
* This code is mostly copied from LazyLong.parseLong in Hive.
881+
*
882+
* @param toLongResult If a valid `long` was parsed from this UTF8String, then its value would
883+
* be set in `toLongResult`
884+
* @return true if the parsing was successful else false
869885
*/
870-
public long toLong() {
886+
public boolean toLong(LongWrapper toLongResult) {
871887
if (numBytes == 0) {
872-
throw new NumberFormatException("Empty string");
888+
return false;
873889
}
874890

875891
byte b = getByte(0);
@@ -878,7 +894,7 @@ public long toLong() {
878894
if (negative || b == '+') {
879895
offset++;
880896
if (numBytes == 1) {
881-
throw new NumberFormatException(toString());
897+
return false;
882898
}
883899
}
884900

@@ -897,41 +913,48 @@ public long toLong() {
897913
break;
898914
}
899915

900-
int digit = getDigit(b);
916+
int digit;
917+
if (b >= '0' && b <= '9') {
918+
digit = b - '0';
919+
} else {
920+
return false;
921+
}
922+
901923
// We are going to process the new digit and accumulate the result. However, before doing
902924
// this, if the result is already smaller than the stopValue(Long.MIN_VALUE / radix), then
903-
// result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
925+
// result * 10 will definitely be smaller than minValue, and we can stop.
904926
if (result < stopValue) {
905-
throw new NumberFormatException(toString());
927+
return false;
906928
}
907929

908930
result = result * radix - digit;
909931
// Since the previous result is less than or equal to stopValue(Long.MIN_VALUE / radix), we
910-
// can just use `result > 0` to check overflow. If result overflows, we should stop and throw
911-
// exception.
932+
// can just use `result > 0` to check overflow. If result overflows, we should stop.
912933
if (result > 0) {
913-
throw new NumberFormatException(toString());
934+
return false;
914935
}
915936
}
916937

917938
// This is the case when we've encountered a decimal separator. The fractional
918939
// part will not change the number, but we will verify that the fractional part
919940
// is well formed.
920941
while (offset < numBytes) {
921-
if (getDigit(getByte(offset)) == -1) {
922-
throw new NumberFormatException(toString());
942+
byte currentByte = getByte(offset);
943+
if (currentByte < '0' || currentByte > '9') {
944+
return false;
923945
}
924946
offset++;
925947
}
926948

927949
if (!negative) {
928950
result = -result;
929951
if (result < 0) {
930-
throw new NumberFormatException(toString());
952+
return false;
931953
}
932954
}
933955

934-
return result;
956+
toLongResult.value = result;
957+
return true;
935958
}
936959

937960
/**
@@ -946,10 +969,14 @@ public long toLong() {
946969
*
947970
* Note that, this method is almost same as `toLong`, but we leave it duplicated for performance
948971
* reasons, like Hive does.
972+
*
973+
* @param intWrapper If a valid `int` was parsed from this UTF8String, then its value would
974+
* be set in `intWrapper`
975+
* @return true if the parsing was successful else false
949976
*/
950-
public int toInt() {
977+
public boolean toInt(IntWrapper intWrapper) {
951978
if (numBytes == 0) {
952-
throw new NumberFormatException("Empty string");
979+
return false;
953980
}
954981

955982
byte b = getByte(0);
@@ -958,7 +985,7 @@ public int toInt() {
958985
if (negative || b == '+') {
959986
offset++;
960987
if (numBytes == 1) {
961-
throw new NumberFormatException(toString());
988+
return false;
962989
}
963990
}
964991

@@ -977,61 +1004,69 @@ public int toInt() {
9771004
break;
9781005
}
9791006

980-
int digit = getDigit(b);
1007+
int digit;
1008+
if (b >= '0' && b <= '9') {
1009+
digit = b - '0';
1010+
} else {
1011+
return false;
1012+
}
1013+
9811014
// We are going to process the new digit and accumulate the result. However, before doing
9821015
// this, if the result is already smaller than the stopValue(Integer.MIN_VALUE / radix), then
983-
// result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
1016+
// result * 10 will definitely be smaller than minValue, and we can stop
9841017
if (result < stopValue) {
985-
throw new NumberFormatException(toString());
1018+
return false;
9861019
}
9871020

9881021
result = result * radix - digit;
9891022
// Since the previous result is less than or equal to stopValue(Integer.MIN_VALUE / radix),
990-
// we can just use `result > 0` to check overflow. If result overflows, we should stop and
991-
// throw exception.
1023+
// we can just use `result > 0` to check overflow. If result overflows, we should stop
9921024
if (result > 0) {
993-
throw new NumberFormatException(toString());
1025+
return false;
9941026
}
9951027
}
9961028

9971029
// This is the case when we've encountered a decimal separator. The fractional
9981030
// part will not change the number, but we will verify that the fractional part
9991031
// is well formed.
10001032
while (offset < numBytes) {
1001-
if (getDigit(getByte(offset)) == -1) {
1002-
throw new NumberFormatException(toString());
1033+
byte currentByte = getByte(offset);
1034+
if (currentByte < '0' || currentByte > '9') {
1035+
return false;
10031036
}
10041037
offset++;
10051038
}
10061039

10071040
if (!negative) {
10081041
result = -result;
10091042
if (result < 0) {
1010-
throw new NumberFormatException(toString());
1043+
return false;
10111044
}
10121045
}
1013-
1014-
return result;
1046+
intWrapper.value = result;
1047+
return true;
10151048
}
10161049

1017-
public short toShort() {
1018-
int intValue = toInt();
1019-
short result = (short) intValue;
1020-
if (result != intValue) {
1021-
throw new NumberFormatException(toString());
1050+
public boolean toShort(IntWrapper intWrapper) {
1051+
if (toInt(intWrapper)) {
1052+
int intValue = intWrapper.value;
1053+
short result = (short) intValue;
1054+
if (result == intValue) {
1055+
return true;
1056+
}
10221057
}
1023-
1024-
return result;
1058+
return false;
10251059
}
10261060

1027-
public byte toByte() {
1028-
int intValue = toInt();
1029-
byte result = (byte) intValue;
1030-
if (result != intValue) {
1031-
throw new NumberFormatException(toString());
1061+
public boolean toByte(IntWrapper intWrapper) {
1062+
if (toInt(intWrapper)) {
1063+
int intValue = intWrapper.value;
1064+
byte result = (byte) intValue;
1065+
if (result == intValue) {
1066+
return true;
1067+
}
10321068
}
1033-
1034-
return result;
1069+
return false;
10351070
}
10361071

10371072
@Override

0 commit comments

Comments
 (0)