Merged
Changes from all commits (68 commits)
369a148
[SPARK-19595][SQL] Support json array in from_json
HyukjinKwon Mar 5, 2017
70f9d7f
[SPARK-19535][ML] RecommendForAllUsers RecommendForAllItems for ALS o…
sueann Mar 6, 2017
224e0e7
[SPARK-19701][SQL][PYTHON] Throws a correct exception for 'in' operat…
HyukjinKwon Mar 6, 2017
207067e
[SPARK-19822][TEST] CheckpointSuite.testCheckpointedOperation: should…
uncleGen Mar 6, 2017
2a0bc86
[SPARK-17495][SQL] Support Decimal type in Hive-hash
tejasapatil Mar 6, 2017
339b53a
[SPARK-19737][SQL] New analysis rule for reporting unregistered funct…
liancheng Mar 6, 2017
46a64d1
[SPARK-19304][STREAMING][KINESIS] fix kinesis slow checkpoint recovery
Gauravshah Mar 6, 2017
096df6d
[SPARK-19257][SQL] location for table/partition/database should be ja…
windpiger Mar 6, 2017
12bf832
[SPARK-19796][CORE] Fix serialization of long property values in Task…
squito Mar 6, 2017
9991c2d
[SPARK-19211][SQL] Explicitly prevent Insert into View or Create View…
jiangxb1987 Mar 6, 2017
9265436
[SPARK-19382][ML] Test sparse vectors in LinearSVCSuite
wangmiao1981 Mar 6, 2017
f6471dc
[SPARK-19709][SQL] Read empty file with CSV data source
wojtek-szymanski Mar 6, 2017
b0a5cd8
[SPARK-19719][SS] Kafka writer for both structured streaming and batc…
Mar 7, 2017
9909f6d
[SPARK-19350][SQL] Cardinality estimation of Limit and Sample
Mar 7, 2017
1f6c090
[SPARK-19818][SPARKR] rbind should check for name consistency of inpu…
actuaryzhang Mar 7, 2017
e52499e
[SPARK-19832][SQL] DynamicPartitionWriteTask get partitionPath should…
windpiger Mar 7, 2017
932196d
[SPARK-17075][SQL][FOLLOWUP] fix filter estimation issues
Mar 7, 2017
030acdd
[SPARK-19637][SQL] Add to_json in FunctionRegistry
maropu Mar 7, 2017
c05baab
[SPARK-19765][SPARK-18549][SQL] UNCACHE TABLE should un-cache all cac…
cloud-fan Mar 7, 2017
4a9034b
[SPARK-17498][ML] StringIndexer enhancement for handling unseen labels
Mar 7, 2017
d69aeea
[SPARK-19516][DOC] update public doc to use SparkSession instead of S…
cloud-fan Mar 7, 2017
49570ed
[SPARK-19803][TEST] flaky BlockManagerReplicationSuite test failure
uncleGen Mar 7, 2017
6f46846
[SPARK-19561] [PYTHON] cast TimestampType.toInternal output to long
JasonMWhite Mar 7, 2017
2e30c0b
[SPARK-19702][MESOS] Increase default refuse_seconds timeout in the M…
Mar 7, 2017
8e41c2e
[SPARK-19857][YARN] Correctly calculate next credential update time.
Mar 8, 2017
47b2f68
Revert "[SPARK-19561] [PYTHON] cast TimestampType.toInternal output t…
cloud-fan Mar 8, 2017
c96d14a
[SPARK-19843][SQL] UTF8String => (int / long) conversion expensive fo…
tejasapatil Mar 8, 2017
b9783a9
[SPARK-18389][SQL] Disallow cyclic view reference
jiangxb1987 Mar 8, 2017
ca849ac
[SPARK-19841][SS] watermarkPredicate should filter based on keys
zsxwing Mar 8, 2017
d8830c5
[SPARK-19859][SS] The new watermark should override the old one
zsxwing Mar 8, 2017
56e1bd3
[SPARK-17629][ML] methods to return synonyms directly
Mar 8, 2017
314e48a
[SPARK-18055][SQL] Use correct mirror in ExpresionEncoder
marmbrus Mar 8, 2017
1fa5886
[ML][MINOR] Separate estimator and model params for read/write test.
yanboliang Mar 8, 2017
81303f7
[SPARK-19806][ML][PYSPARK] PySpark GeneralizedLinearRegression suppor…
yanboliang Mar 8, 2017
3f9f918
[SPARK-19693][SQL] Make the SET mapreduce.job.reduces automatically c…
wangyum Mar 8, 2017
9ea201c
[SPARK-16440][MLLIB] Ensure broadcasted variables are destroyed even …
Mar 8, 2017
e442748
[SPARK-17080][SQL] join reorder
Mar 8, 2017
5f7d835
[SPARK-19865][SQL] remove the view identifier in SubqueryAlias
jiangxb1987 Mar 8, 2017
9a6ac72
[SPARK-19601][SQL] Fix CollapseRepartition rule to preserve shuffle-e…
gatorsmile Mar 8, 2017
e420fd4
[SPARK-19843][SQL][FOLLOWUP] Classdoc for `IntWrapper` and `LongWrapper`
tejasapatil Mar 8, 2017
f3387d9
[SPARK-19864][SQL][TEST] provide a makeQualifiedPath functions to opt…
windpiger Mar 8, 2017
e9e2c61
[SPARK-19727][SQL] Fix for round function that modifies original column
wojtek-szymanski Mar 8, 2017
1bf9012
[SPARK-19858][SS] Add output mode to flatMapGroupsWithState and disal…
zsxwing Mar 8, 2017
6570cfd
[SPARK-19540][SQL] Add ability to clone SparkSession wherein cloned s…
kunalkhamar Mar 8, 2017
4551290
[SPARK-15463][SQL] Add an API to load DataFrame from Dataset[String] …
HyukjinKwon Mar 8, 2017
a3648b5
[SPARK-19813] maxFilesPerTrigger combo latestFirst may miss old files…
brkyvz Mar 8, 2017
d809cee
[MINOR][SQL] The analyzer rules are fired twice for cases when Analys…
dilipbiswal Mar 9, 2017
09829be
[SPARK-19235][SQL][TESTS] Enable Test Cases in DDLSuite with Hive Met…
gatorsmile Mar 9, 2017
029e40b
[SPARK-19874][BUILD] Hide API docs for org.apache.spark.sql.internal
zsxwing Mar 9, 2017
eeb1d6d
[SPARK-19859][SS][FOLLOW-UP] The new watermark should override the ol…
uncleGen Mar 9, 2017
274973d
[SPARK-19763][SQL] qualified external datasource table location store…
windpiger Mar 9, 2017
206030b
[SPARK-19561][SQL] add int case handling for TimestampType
JasonMWhite Mar 9, 2017
b60b9fc
[SPARK-19757][CORE] DriverEndpoint#makeOffers race against CoarseGrai…
jxiang Mar 9, 2017
3232e54
[SPARK-19793] Use clock.getTimeMillis when mark task as finished in T…
Mar 9, 2017
40da4d1
[SPARK-19715][STRUCTURED STREAMING] Option to Strip Paths in FileSource
lw-lin Mar 9, 2017
30b18e6
[SPARK-19861][SS] watermark should not be a negative time.
uncleGen Mar 9, 2017
cabe1df
[SPARK-12334][SQL][PYSPARK] Support read from multiple input paths fo…
zjffdu Mar 9, 2017
f79371a
[SPARK-19611][SQL] Introduce configurable table schema inference
Mar 9, 2017
82138e0
[SPARK-19886] Fix reportDataLoss if statement in SS KafkaSource
brkyvz Mar 10, 2017
5949e6c
[SPARK-19008][SQL] Improve performance of Dataset.map by eliminating …
kiszk Mar 10, 2017
501b711
[SPARK-19891][SS] Await Batch Lock notified on stream execution exit
Mar 10, 2017
fcb68e0
[SPARK-19786][SQL] Facilitate loop optimizations in a JIT compiler re…
kiszk Mar 10, 2017
dd9049e
[SPARK-19620][SQL] Fix incorrect exchange coordinator id in the physi…
carsonwang Mar 10, 2017
8f0490e
[SPARK-17979][SPARK-14453] Remove deprecated SPARK_YARN_USER_ENV and …
yongtang Mar 10, 2017
bc30351
[SPARK-19611][SQL] Preserve metastore field order when merging inferr…
Mar 10, 2017
ffee4f1
[SPARK-19905][SQL] Bring back Dataset.inputFiles for Hive SerDe tables
liancheng Mar 10, 2017
fb9beda
[SPARK-19893][SQL] should not run DataFrame set oprations with map type
cloud-fan Mar 11, 2017
f6fdf92
[SPARK-19723][SQL] create datasource table with an non-existent locat…
windpiger Mar 11, 2017
8 changes: 7 additions & 1 deletion R/pkg/R/DataFrame.R
@@ -2642,6 +2642,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
#' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
#' Input SparkDataFrames can have different schemas (names and data types).
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#'
@@ -2685,7 +2686,8 @@ setMethod("unionAll",

#' Union two or more SparkDataFrames
#'
#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
#' requires that the input SparkDataFrames have the same column names.
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#'
@@ -2709,6 +2711,10 @@ setMethod("unionAll",
setMethod("rbind",
signature(... = "SparkDataFrame"),
function(x, ..., deparse.level = 1) {
nm <- lapply(list(x, ...), names)
if (length(unique(nm)) != 1) {
stop("Names of input data frames are different.")
}
if (nargs() == 3) {
union(x, ...)
} else {
11 changes: 9 additions & 2 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1850,6 +1850,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
expect_equal(count(unioned2), 12)
expect_equal(first(unioned2)$name, "Michael")

df3 <- df2
names(df3)[1] <- "newName"
expect_error(rbind(df, df3),
"Names of input data frames are different.")
expect_error(rbind(df, df2, df3),
"Names of input data frames are different.")

excepted <- arrange(except(df, df2), desc(df$age))
expect_is(unioned, "SparkDataFrame")
expect_equal(count(excepted), 2)
@@ -2585,8 +2592,8 @@ test_that("coalesce, repartition, numPartitions", {

df2 <- repartition(df1, 10)
expect_equal(getNumPartitions(df2), 10)
expect_equal(getNumPartitions(coalesce(df2, 13)), 5)
expect_equal(getNumPartitions(coalesce(df2, 7)), 5)
expect_equal(getNumPartitions(coalesce(df2, 13)), 10)
expect_equal(getNumPartitions(coalesce(df2, 7)), 7)
expect_equal(getNumPartitions(coalesce(df2, 3)), 3)
})

common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -850,26 +850,42 @@ public UTF8String translate(Map<Character, Character> dict) {
return fromString(sb.toString());
}

private int getDigit(byte b) {
if (b >= '0' && b <= '9') {
return b - '0';
}
throw new NumberFormatException(toString());
/**
* Wrapper over `long` to allow result of parsing long from string to be accessed via reference.
* This is done solely for better performance and is not expected to be used by end users.
*/
public static class LongWrapper {
public long value = 0;
}

/**
* Wrapper over `int` to allow result of parsing integer from string to be accessed via reference.
* This is done solely for better performance and is not expected to be used by end users.
*
* {@link LongWrapper} could have been used here but using `int` directly save the extra cost of
* conversion from `long` -> `int`
*/
public static class IntWrapper {
public int value = 0;
}

/**
* Parses this UTF8String to long.
*
* Note that, in this method we accumulate the result in negative format, and convert it to
* positive format at the end, if this string is not started with '-'. This is because min value
* is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and
* Integer.MIN_VALUE is '-2147483648'.
* is bigger than max value in digits, e.g. Long.MAX_VALUE is '9223372036854775807' and
* Long.MIN_VALUE is '-9223372036854775808'.
*
* This code is mostly copied from LazyLong.parseLong in Hive.
*
* @param toLongResult If a valid `long` was parsed from this UTF8String, then its value would
* be set in `toLongResult`
* @return true if the parsing was successful else false
*/
public long toLong() {
public boolean toLong(LongWrapper toLongResult) {
if (numBytes == 0) {
throw new NumberFormatException("Empty string");
return false;
}

byte b = getByte(0);
@@ -878,7 +894,7 @@ public long toLong() {
if (negative || b == '+') {
offset++;
if (numBytes == 1) {
throw new NumberFormatException(toString());
return false;
}
}

@@ -897,41 +913,48 @@ public long toLong() {
break;
}

int digit = getDigit(b);
int digit;
if (b >= '0' && b <= '9') {
digit = b - '0';
} else {
return false;
}

// We are going to process the new digit and accumulate the result. However, before doing
// this, if the result is already smaller than the stopValue(Long.MIN_VALUE / radix), then
// result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
// result * 10 will definitely be smaller than minValue, and we can stop.
if (result < stopValue) {
throw new NumberFormatException(toString());
return false;
}

result = result * radix - digit;
// Since the previous result is less than or equal to stopValue(Long.MIN_VALUE / radix), we
// can just use `result > 0` to check overflow. If result overflows, we should stop and throw
// exception.
// can just use `result > 0` to check overflow. If result overflows, we should stop.
if (result > 0) {
throw new NumberFormatException(toString());
return false;
}
}

// This is the case when we've encountered a decimal separator. The fractional
// part will not change the number, but we will verify that the fractional part
// is well formed.
while (offset < numBytes) {
if (getDigit(getByte(offset)) == -1) {
throw new NumberFormatException(toString());
byte currentByte = getByte(offset);
if (currentByte < '0' || currentByte > '9') {
return false;
}
offset++;
}

if (!negative) {
result = -result;
if (result < 0) {
throw new NumberFormatException(toString());
return false;
}
}

return result;
toLongResult.value = result;
return true;
}

/**
@@ -946,10 +969,14 @@ public long toLong() {
*
* Note that, this method is almost same as `toLong`, but we leave it duplicated for performance
* reasons, like Hive does.
*
* @param intWrapper If a valid `int` was parsed from this UTF8String, then its value would
* be set in `intWrapper`
* @return true if the parsing was successful else false
*/
public int toInt() {
public boolean toInt(IntWrapper intWrapper) {
if (numBytes == 0) {
throw new NumberFormatException("Empty string");
return false;
}

byte b = getByte(0);
Expand All @@ -958,7 +985,7 @@ public int toInt() {
if (negative || b == '+') {
offset++;
if (numBytes == 1) {
throw new NumberFormatException(toString());
return false;
}
}

@@ -977,61 +1004,69 @@ public int toInt() {
break;
}

int digit = getDigit(b);
int digit;
if (b >= '0' && b <= '9') {
digit = b - '0';
} else {
return false;
}

// We are going to process the new digit and accumulate the result. However, before doing
// this, if the result is already smaller than the stopValue(Integer.MIN_VALUE / radix), then
// result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
// result * 10 will definitely be smaller than minValue, and we can stop
if (result < stopValue) {
throw new NumberFormatException(toString());
return false;
}

result = result * radix - digit;
// Since the previous result is less than or equal to stopValue(Integer.MIN_VALUE / radix),
// we can just use `result > 0` to check overflow. If result overflows, we should stop and
// throw exception.
// we can just use `result > 0` to check overflow. If result overflows, we should stop
if (result > 0) {
throw new NumberFormatException(toString());
return false;
}
}

// This is the case when we've encountered a decimal separator. The fractional
// part will not change the number, but we will verify that the fractional part
// is well formed.
while (offset < numBytes) {
if (getDigit(getByte(offset)) == -1) {
throw new NumberFormatException(toString());
byte currentByte = getByte(offset);
if (currentByte < '0' || currentByte > '9') {
return false;
}
offset++;
}

if (!negative) {
result = -result;
if (result < 0) {
throw new NumberFormatException(toString());
return false;
}
}

return result;
intWrapper.value = result;
return true;
}

public short toShort() {
int intValue = toInt();
short result = (short) intValue;
if (result != intValue) {
throw new NumberFormatException(toString());
public boolean toShort(IntWrapper intWrapper) {
if (toInt(intWrapper)) {
int intValue = intWrapper.value;
short result = (short) intValue;
if (result == intValue) {
return true;
}
}

return result;
return false;
}

public byte toByte() {
int intValue = toInt();
byte result = (byte) intValue;
if (result != intValue) {
throw new NumberFormatException(toString());
public boolean toByte(IntWrapper intWrapper) {
if (toInt(intWrapper)) {
int intValue = intWrapper.value;
byte result = (byte) intValue;
if (result == intValue) {
return true;
}
}

return result;
return false;
}

@Override
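
For context on how the reworked parsing API is meant to be consumed: the methods above no longer throw NumberFormatException on malformed input; they return false instead, and write the parsed value into the wrapper only on success. Below is a minimal caller sketch, assuming the UTF8String methods and wrapper classes introduced in this diff; the demo class name and the sample strings are illustrative only.

import org.apache.spark.unsafe.types.UTF8String;
import org.apache.spark.unsafe.types.UTF8String.IntWrapper;
import org.apache.spark.unsafe.types.UTF8String.LongWrapper;

public class WrapperParsingDemo {
  public static void main(String[] args) {
    // Wrappers can be allocated once and reused; the parsed value is read back by
    // reference, which is the performance motivation stated in the class-level comments.
    LongWrapper longResult = new LongWrapper();
    IntWrapper intResult = new IntWrapper();

    // Long.MIN_VALUE parses correctly because the implementation accumulates the
    // result in negative form and only flips the sign for non-negative inputs.
    if (UTF8String.fromString("-9223372036854775808").toLong(longResult)) {
      System.out.println("parsed long: " + longResult.value);
    }

    // Malformed input reports failure through the boolean return value instead of
    // the NumberFormatException thrown by the old toLong()/toInt().
    if (!UTF8String.fromString("12abc").toInt(intResult)) {
      System.out.println("not a valid int");
    }

    // toShort and toByte reuse IntWrapper and additionally reject values that do not
    // fit the narrower type.
    if (!UTF8String.fromString("70000").toShort(intResult)) {
      System.out.println("out of short range");
    }
  }
}

Per the Javadoc in the diff, the wrappers exist solely to avoid exception overhead on hot parsing paths; callers are expected to branch on the boolean result rather than catch exceptions.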