
Commit a17d07c

Merge pull request apache-spark-on-k8s#282 from palantir/os/resync-apache
[NOSQUASH] Resync Apache
2 parents: 02b2e34 + 4ba0c26

310 files changed: +8782 -3133 lines changed


LICENSE

Lines changed: 2 additions & 0 deletions
@@ -269,6 +269,8 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)
 (BSD 3 Clause) DPark (https://github.com/douban/dpark/blob/master/LICENSE)
 (BSD 3 Clause) CloudPickle (https://github.com/cloudpipe/cloudpickle/blob/master/LICENSE)
+(BSD 2 Clause) Zstd-jni (https://github.com/luben/zstd-jni/blob/master/LICENSE)
+(BSD license) Zstd (https://github.com/facebook/zstd/blob/v1.3.1/LICENSE)
 
 ========================================================================
 MIT licenses

R/pkg/R/DataFrame.R

Lines changed: 56 additions & 37 deletions
@@ -58,14 +58,23 @@ setMethod("initialize", "SparkDataFrame", function(.Object, sdf, isCached) {
 #' Set options/mode and then return the write object
 #' @noRd
 setWriteOptions <- function(write, path = NULL, mode = "error", ...) {
-  options <- varargsToStrEnv(...)
-  if (!is.null(path)) {
-    options[["path"]] <- path
-  }
-  jmode <- convertToJSaveMode(mode)
-  write <- callJMethod(write, "mode", jmode)
-  write <- callJMethod(write, "options", options)
-  write
+  options <- varargsToStrEnv(...)
+  if (!is.null(path)) {
+    options[["path"]] <- path
+  }
+  write <- setWriteMode(write, mode)
+  write <- callJMethod(write, "options", options)
+  write
+}
+
+#' Set mode and then return the write object
+#' @noRd
+setWriteMode <- function(write, mode) {
+  if (!is.character(mode)) {
+    stop("mode should be character or omitted. It is 'error' by default.")
+  }
+  write <- handledCallJMethod(write, "mode", mode)
+  write
 }
 
 #' @export
@@ -556,9 +565,8 @@ setMethod("registerTempTable",
 setMethod("insertInto",
           signature(x = "SparkDataFrame", tableName = "character"),
           function(x, tableName, overwrite = FALSE) {
-            jmode <- convertToJSaveMode(ifelse(overwrite, "overwrite", "append"))
             write <- callJMethod(x@sdf, "write")
-            write <- callJMethod(write, "mode", jmode)
+            write <- setWriteMode(write, ifelse(overwrite, "overwrite", "append"))
             invisible(callJMethod(write, "insertInto", tableName))
           })
 
@@ -810,7 +818,8 @@ setMethod("toJSON",
 #'
 #' @param x A SparkDataFrame
 #' @param path The directory where the file is saved
-#' @param mode one of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default)
+#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
+#'             save mode (it is 'error' by default)
 #' @param ... additional argument(s) passed to the method.
 #'
 #' @family SparkDataFrame functions
@@ -841,7 +850,8 @@ setMethod("write.json",
 #'
 #' @param x A SparkDataFrame
 #' @param path The directory where the file is saved
-#' @param mode one of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default)
+#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
+#'             save mode (it is 'error' by default)
 #' @param ... additional argument(s) passed to the method.
 #'
 #' @family SparkDataFrame functions
@@ -872,7 +882,8 @@ setMethod("write.orc",
 #'
 #' @param x A SparkDataFrame
 #' @param path The directory where the file is saved
-#' @param mode one of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default)
+#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
+#'             save mode (it is 'error' by default)
 #' @param ... additional argument(s) passed to the method.
 #'
 #' @family SparkDataFrame functions
@@ -917,7 +928,8 @@ setMethod("saveAsParquetFile",
 #'
 #' @param x A SparkDataFrame
 #' @param path The directory where the file is saved
-#' @param mode one of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default)
+#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
+#'             save mode (it is 'error' by default)
 #' @param ... additional argument(s) passed to the method.
 #'
 #' @family SparkDataFrame functions
@@ -1191,6 +1203,9 @@ setMethod("collect",
               vec <- do.call(c, col)
               stopifnot(class(vec) != "list")
               class(vec) <- PRIMITIVE_TYPES[[colType]]
+              if (is.character(vec) && stringsAsFactors) {
+                vec <- as.factor(vec)
+              }
               df[[colIndex]] <- vec
             } else {
               df[[colIndex]] <- col
@@ -2868,18 +2883,19 @@ setMethod("except",
 #' Additionally, mode is used to specify the behavior of the save operation when data already
 #' exists in the data source. There are four modes:
 #' \itemize{
-#'  \item append: Contents of this SparkDataFrame are expected to be appended to existing data.
-#'  \item overwrite: Existing data is expected to be overwritten by the contents of this
+#'  \item 'append': Contents of this SparkDataFrame are expected to be appended to existing data.
+#'  \item 'overwrite': Existing data is expected to be overwritten by the contents of this
 #'         SparkDataFrame.
-#'  \item error: An exception is expected to be thrown.
-#'  \item ignore: The save operation is expected to not save the contents of the SparkDataFrame
+#'  \item 'error' or 'errorifexists': An exception is expected to be thrown.
+#'  \item 'ignore': The save operation is expected to not save the contents of the SparkDataFrame
 #'         and to not change the existing data.
 #' }
 #'
 #' @param df a SparkDataFrame.
 #' @param path a name for the table.
 #' @param source a name for external data source.
-#' @param mode one of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default)
+#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
+#'             save mode (it is 'error' by default)
 #' @param ... additional argument(s) passed to the method.
 #'
 #' @family SparkDataFrame functions
@@ -2937,17 +2953,18 @@ setMethod("saveDF",
 #'
 #' Additionally, mode is used to specify the behavior of the save operation when
 #' data already exists in the data source. There are four modes: \cr
-#'  append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr
-#'  overwrite: Existing data is expected to be overwritten by the contents of this
+#'  'append': Contents of this SparkDataFrame are expected to be appended to existing data. \cr
+#'  'overwrite': Existing data is expected to be overwritten by the contents of this
 #'         SparkDataFrame. \cr
-#'  error: An exception is expected to be thrown. \cr
-#'  ignore: The save operation is expected to not save the contents of the SparkDataFrame
+#'  'error' or 'errorifexists': An exception is expected to be thrown. \cr
+#'  'ignore': The save operation is expected to not save the contents of the SparkDataFrame
 #'         and to not change the existing data. \cr
 #'
 #' @param df a SparkDataFrame.
 #' @param tableName a name for the table.
 #' @param source a name for external data source.
-#' @param mode one of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default).
+#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
+#'             save mode (it is 'error' by default)
 #' @param ... additional option(s) passed to the method.
 #'
 #' @family SparkDataFrame functions
@@ -2969,12 +2986,11 @@ setMethod("saveAsTable",
             if (is.null(source)) {
               source <- getDefaultSqlSource()
             }
-            jmode <- convertToJSaveMode(mode)
             options <- varargsToStrEnv(...)
 
             write <- callJMethod(df@sdf, "write")
             write <- callJMethod(write, "format", source)
-            write <- callJMethod(write, "mode", jmode)
+            write <- setWriteMode(write, mode)
             write <- callJMethod(write, "options", options)
             invisible(callJMethod(write, "saveAsTable", tableName))
           })
@@ -3233,7 +3249,7 @@ setMethod("as.data.frame",
 #'
 #' @family SparkDataFrame functions
 #' @rdname attach
-#' @aliases attach,SparkDataFrame-method
+#' @aliases attach attach,SparkDataFrame-method
 #' @param what (SparkDataFrame) The SparkDataFrame to attach
 #' @param pos (integer) Specify position in search() where to attach.
 #' @param name (character) Name to use for the attached SparkDataFrame. Names
@@ -3249,9 +3265,12 @@ setMethod("as.data.frame",
 #' @note attach since 1.6.0
 setMethod("attach",
           signature(what = "SparkDataFrame"),
-          function(what, pos = 2, name = deparse(substitute(what)), warn.conflicts = TRUE) {
-            newEnv <- assignNewEnv(what)
-            attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts)
+          function(what, pos = 2L, name = deparse(substitute(what), backtick = FALSE),
+                   warn.conflicts = TRUE) {
+            args <- as.list(environment()) # capture all parameters - this must be the first line
+            newEnv <- assignNewEnv(args$what)
+            args$what <- newEnv
+            do.call(attach, args)
           })
 
 #' Evaluate a R expression in an environment constructed from a SparkDataFrame
@@ -3538,18 +3557,19 @@ setMethod("histogram",
 #' Also, mode is used to specify the behavior of the save operation when
 #' data already exists in the data source. There are four modes:
 #' \itemize{
-#'  \item append: Contents of this SparkDataFrame are expected to be appended to existing data.
-#'  \item overwrite: Existing data is expected to be overwritten by the contents of this
+#'  \item 'append': Contents of this SparkDataFrame are expected to be appended to existing data.
+#'  \item 'overwrite': Existing data is expected to be overwritten by the contents of this
 #'         SparkDataFrame.
-#'  \item error: An exception is expected to be thrown.
-#'  \item ignore: The save operation is expected to not save the contents of the SparkDataFrame
+#'  \item 'error' or 'errorifexists': An exception is expected to be thrown.
+#'  \item 'ignore': The save operation is expected to not save the contents of the SparkDataFrame
 #'         and to not change the existing data.
 #' }
 #'
 #' @param x a SparkDataFrame.
 #' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}.
 #' @param tableName yhe name of the table in the external database.
-#' @param mode one of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default).
+#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
+#'             save mode (it is 'error' by default)
 #' @param ... additional JDBC database connection properties.
 #' @family SparkDataFrame functions
 #' @rdname write.jdbc
@@ -3566,10 +3586,9 @@ setMethod("histogram",
 setMethod("write.jdbc",
           signature(x = "SparkDataFrame", url = "character", tableName = "character"),
           function(x, url, tableName, mode = "error", ...) {
-            jmode <- convertToJSaveMode(mode)
             jprops <- varargsToJProperties(...)
             write <- callJMethod(x@sdf, "write")
-            write <- callJMethod(write, "mode", jmode)
+            write <- setWriteMode(write, mode)
             invisible(handledCallJMethod(write, "jdbc", url, tableName, jprops))
           })
 
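Taken together, these DataFrame.R changes replace the R-side save-mode whitelist with a thin setWriteMode helper: R only checks that mode is a character value, and the string (including the new 'errorifexists' alias) is validated by the JVM DataFrameWriter. A minimal sketch of the resulting behavior, assuming a local session and a scratch path (both illustrative; the error texts are the ones asserted in the tests later in this commit):

```r
library(SparkR)
sparkR.session(master = "local[1]", enableHiveSupport = FALSE)

df <- createDataFrame(mtcars)
path <- file.path(tempdir(), "mtcars_json")

# First write succeeds; repeating it with the new 'errorifexists' alias fails
# because the target path already exists.
write.df(df, path, source = "json", mode = "errorifexists")
tryCatch(write.df(df, path, source = "json", mode = "errorifexists"),
         error = function(e) message("as expected: ", conditionMessage(e)))

# A non-character mode is rejected on the R side before any JVM call:
#   write.df(df, path, source = "json", mode = 123)
#   Error: mode should be character or omitted. It is 'error' by default.

# An unknown mode string is now rejected by the JVM rather than by an R whitelist:
#   write.df(df, path, source = "json", mode = "abc")
#   Error: ... illegal argument - Unknown save mode: abc
```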

R/pkg/R/generics.R

Lines changed: 4 additions & 6 deletions
@@ -409,7 +409,8 @@ setGeneric("as.data.frame",
             standardGeneric("as.data.frame")
           })
 
-#' @rdname attach
+# Do not document the generic because of signature changes across R versions
+#' @noRd
 #' @export
 setGeneric("attach")
 
@@ -1569,12 +1570,9 @@ setGeneric("year", function(x) { standardGeneric("year") })
 #' @export
 setGeneric("fitted")
 
-#' @param x,y For \code{glm}: logical values indicating whether the response vector
-#'            and model matrix used in the fitting process should be returned as
-#'            components of the returned value.
-#' @inheritParams stats::glm
-#' @rdname glm
+# Do not carry stats::glm usage and param here, and do not document the generic
 #' @export
+#' @noRd
 setGeneric("glm")
 
 #' @param object a fitted ML model object.

R/pkg/R/mllib_regression.R

Lines changed: 1 addition & 0 deletions
@@ -210,6 +210,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'          1.0.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
+#' @aliases glm
 #' @export
 #' @examples
 #' \dontrun{

R/pkg/R/sparkR.R

Lines changed: 12 additions & 0 deletions
@@ -420,6 +420,18 @@ sparkR.session <- function(
                                 enableHiveSupport)
     assign(".sparkRsession", sparkSession, envir = .sparkREnv)
   }
+
+  # Check if version number of SparkSession matches version number of SparkR package
+  jvmVersion <- callJMethod(sparkSession, "version")
+  # Remove -SNAPSHOT from jvm versions
+  jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion)
+  rPackageVersion <- paste0(packageVersion("SparkR"))
+
+  if (jvmVersionStrip != rPackageVersion) {
+    warning(paste("Version mismatch between Spark JVM and SparkR package. JVM version was",
+                  jvmVersion, ", while R package version was", rPackageVersion))
+  }
+
   sparkSession
 }
 
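For reference, the comparison the new block performs can be reproduced outside sparkR.session; the version strings below are hypothetical stand-ins for callJMethod(sparkSession, "version") and packageVersion("SparkR"):

```r
jvmVersion <- "2.3.0-SNAPSHOT"  # hypothetical JVM-side Spark version
rPackageVersion <- "2.2.0"      # hypothetical installed SparkR package version

# Same logic as the added code: strip -SNAPSHOT, then warn on mismatch.
if (gsub("-SNAPSHOT", "", jvmVersion) != rPackageVersion) {
  warning(paste("Version mismatch between Spark JVM and SparkR package. JVM version was",
                jvmVersion, ", while R package version was", rPackageVersion))
}
```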

R/pkg/R/utils.R

Lines changed: 0 additions & 9 deletions
@@ -736,15 +736,6 @@ splitString <- function(input) {
   Filter(nzchar, unlist(strsplit(input, ",|\\s")))
 }
 
-convertToJSaveMode <- function(mode) {
-  allModes <- c("append", "overwrite", "error", "ignore")
-  if (!(mode %in% allModes)) {
-    stop('mode should be one of "append", "overwrite", "error", "ignore"') # nolint
-  }
-  jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
-  jmode
-}
-
 varargsToJProperties <- function(...) {
   pairs <- list(...)
   props <- newJObject("java.util.Properties")

R/pkg/inst/tests/testthat/test_basic.R

Lines changed: 4 additions & 2 deletions
@@ -18,7 +18,8 @@
 context("basic tests for CRAN")
 
 test_that("create DataFrame from list or data.frame", {
-  sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+  sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE,
+                 sparkConfig = sparkRTestConfig)
 
   i <- 4
   df <- createDataFrame(data.frame(dummy = 1:i))
@@ -49,7 +50,8 @@ test_that("create DataFrame from list or data.frame", {
 })
 
 test_that("spark.glm and predict", {
-  sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+  sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE,
+                 sparkConfig = sparkRTestConfig)
 
   training <- suppressWarnings(createDataFrame(iris))
   # gaussian family

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 14 additions & 0 deletions
@@ -499,6 +499,12 @@ test_that("create DataFrame with different data types", {
   expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE))
 })
 
+test_that("SPARK-17902: collect() with stringsAsFactors enabled", {
+  df <- suppressWarnings(collect(createDataFrame(iris), stringsAsFactors = TRUE))
+  expect_equal(class(iris$Species), class(df$Species))
+  expect_equal(iris$Species, df$Species)
+})
+
 test_that("SPARK-17811: can create DataFrame containing NA as date and time", {
   df <- data.frame(
     id = 1:2,
@@ -624,6 +630,10 @@ test_that("read/write json files", {
   jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".json")
   write.df(df, jsonPath2, "json", mode = "overwrite")
 
+  # Test errorifexists
+  expect_error(write.df(df, jsonPath2, "json", mode = "errorifexists"),
+               "analysis error - path file:.*already exists")
+
   # Test write.json
   jsonPath3 <- tempfile(pattern = "jsonPath3", fileext = ".json")
   write.json(df, jsonPath3)
@@ -1365,6 +1375,9 @@ test_that("test HiveContext", {
     expect_equal(count(df5), 3)
     unlink(parquetDataPath)
 
+    # Invalid mode
+    expect_error(saveAsTable(df, "parquetest", "parquet", mode = "abc", path = parquetDataPath),
+                 "illegal argument - Unknown save mode: abc")
     unsetHiveContext()
   }
 })
@@ -3297,6 +3310,7 @@ test_that("Call DataFrameWriter.save() API in Java without path and check argument types", {
                "Error in orc : analysis error - path file:.*already exists")
   expect_error(write.parquet(df, jsonPath),
                "Error in parquet : analysis error - path file:.*already exists")
+  expect_error(write.parquet(df, jsonPath, mode = 123), "mode should be character or omitted.")
 
   # Arguments checking in R side.
   expect_error(write.df(df, "data.tmp", source = c(1, 2)),
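The SPARK-17902 test exercises the stringsAsFactors branch added to collect() in DataFrame.R above. A standalone sketch (an active session is assumed; suppressWarnings only quiets the column-name warning createDataFrame emits for iris):

```r
df <- suppressWarnings(createDataFrame(iris))

# Default: string columns collect as character vectors.
class(collect(df)$Species)                            # "character"

# With the flag, character columns are converted via as.factor(),
# matching the original data.frame.
class(collect(df, stringsAsFactors = TRUE)$Species)   # "factor"
```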

R/pkg/tests/fulltests/test_utils.R

Lines changed: 0 additions & 8 deletions
@@ -158,14 +158,6 @@ test_that("varargsToJProperties", {
   expect_equal(callJMethod(jprops, "size"), 0L)
 })
 
-test_that("convertToJSaveMode", {
-  s <- convertToJSaveMode("error")
-  expect_true(class(s) == "jobj")
-  expect_match(capture.output(print.jobj(s)), "Java ref type org.apache.spark.sql.SaveMode id ")
-  expect_error(convertToJSaveMode("foo"),
-               'mode should be one of "append", "overwrite", "error", "ignore"') #nolint
-})
-
 test_that("captureJVMException", {
   method <- "createStructField"
   expect_error(tryCatch(callJStatic("org.apache.spark.sql.api.r.SQLUtils", method,

R/pkg/tests/run-all.R

Lines changed: 9 additions & 0 deletions
@@ -36,8 +36,17 @@ invisible(lapply(sparkRWhitelistSQLDirs,
 sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
 
 sparkRTestMaster <- "local[1]"
+sparkRTestConfig <- list()
 if (identical(Sys.getenv("NOT_CRAN"), "true")) {
   sparkRTestMaster <- ""
+} else {
+  # Disable hsperfdata on CRAN
+  old_java_opt <- Sys.getenv("_JAVA_OPTIONS")
+  Sys.setenv("_JAVA_OPTIONS" = paste("-XX:-UsePerfData", old_java_opt))
+  tmpDir <- tempdir()
+  tmpArg <- paste0("-Djava.io.tmpdir=", tmpDir)
+  sparkRTestConfig <- list(spark.driver.extraJavaOptions = tmpArg,
+                           spark.executor.extraJavaOptions = tmpArg)
 }
 
 test_package("SparkR")
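Together with the test_basic.R change above, every CRAN-run test session now picks up this configuration. Roughly (the master value is illustrative):

```r
# On CRAN (NOT_CRAN unset), run-all.R points the JVM temp dir at R's tempdir()
# and disables hsperfdata; each test then threads the list through sparkConfig.
tmpArg <- paste0("-Djava.io.tmpdir=", tempdir())
sparkRTestConfig <- list(spark.driver.extraJavaOptions = tmpArg,
                         spark.executor.extraJavaOptions = tmpArg)

sparkR.session(master = "local[1]", enableHiveSupport = FALSE,
               sparkConfig = sparkRTestConfig)
```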
