Commit d0fc050

Author: Andrew Or (committed)
Merge branch 'master' of github.com:apache/spark into fix-local-page-size

2 parents: 0e140c2 + 555b208

File tree: 446 files changed, +14406 −6090 lines


LICENSE

Lines changed: 1 addition & 1 deletion
@@ -265,7 +265,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
 (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
 (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-(The New BSD License) Py4J (net.sf.py4j:py4j:0.8.2.1 - http://py4j.sourceforge.net/)
+(The New BSD License) Py4J (net.sf.py4j:py4j:0.9 - http://py4j.sourceforge.net/)
 (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
 (BSD licence) sbt and sbt-launch-lib.bash
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

R/pkg/NAMESPACE

Lines changed: 7 additions & 2 deletions
@@ -23,6 +23,7 @@ export("setJobGroup",
 exportClasses("DataFrame")
 
 exportMethods("arrange",
+              "attach",
               "cache",
               "collect",
               "columns",
@@ -40,6 +41,7 @@ exportMethods("arrange",
               "fillna",
               "filter",
               "first",
+              "freqItems",
               "group_by",
               "groupBy",
               "head",
@@ -63,6 +65,7 @@ exportMethods("arrange",
               "repartition",
               "sample",
               "sample_frac",
+              "sampleBy",
               "saveAsParquetFile",
               "saveAsTable",
               "saveDF",
@@ -106,6 +109,7 @@ exportMethods("%in%",
               "cbrt",
               "ceil",
               "ceiling",
+              "column",
               "concat",
               "concat_ws",
               "contains",
@@ -226,7 +230,8 @@ exportMethods("agg")
 export("sparkRSQL.init",
        "sparkRHive.init")
 
-export("cacheTable",
+export("as.DataFrame",
+       "cacheTable",
        "clearCache",
        "createDataFrame",
        "createExternalTable",
@@ -250,4 +255,4 @@ export("structField",
        "structType.structField",
        "print.structType")
 
-export("as.data.frame")
+export("as.data.frame")

R/pkg/R/DataFrame.R

Lines changed: 45 additions & 12 deletions
@@ -1414,9 +1414,10 @@ setMethod("where",
 #' @param x A Spark DataFrame
 #' @param y A Spark DataFrame
 #' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a
-#' Column expression. If joinExpr is omitted, join() wil perform a Cartesian join
+#' Column expression. If joinExpr is omitted, join() will perform a Cartesian join
 #' @param joinType The type of join to perform. The following join types are available:
-#' 'inner', 'outer', 'left_outer', 'right_outer', 'semijoin'. The default joinType is "inner".
+#' 'inner', 'outer', 'full', 'fullouter', 'leftouter', 'left_outer', 'left',
+#' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner".
 #' @return A DataFrame containing the result of the join operation.
 #' @rdname join
 #' @name join
@@ -1441,11 +1442,15 @@ setMethod("join",
             if (is.null(joinType)) {
               sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc)
             } else {
-              if (joinType %in% c("inner", "outer", "left_outer", "right_outer", "semijoin")) {
+              if (joinType %in% c("inner", "outer", "full", "fullouter",
+                                  "leftouter", "left_outer", "left",
+                                  "rightouter", "right_outer", "right", "leftsemi")) {
+                joinType <- gsub("_", "", joinType)
                 sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType)
               } else {
                 stop("joinType must be one of the following types: ",
-                     "'inner', 'outer', 'left_outer', 'right_outer', 'semijoin'")
+                     "'inner', 'outer', 'full', 'fullouter', 'leftouter', 'left_outer', 'left',
+                     'rightouter', 'right_outer', 'right', 'leftsemi'")
               }
             }
           }
@@ -1826,17 +1831,15 @@ setMethod("fillna",
           if (length(colNames) == 0 || !all(colNames != "")) {
             stop("value should be an a named list with each name being a column name.")
           }
-
-          # Convert to the named list to an environment to be passed to JVM
-          valueMap <- new.env()
-          for (col in colNames) {
-            # Check each item in the named list is of valid type
-            v <- value[[col]]
+          # Check each item in the named list is of valid type
+          lapply(value, function(v) {
             if (!(class(v) %in% c("integer", "numeric", "character"))) {
               stop("Each item in value should be an integer, numeric or charactor.")
             }
-            valueMap[[col]] <- v
-          }
+          })
+
+          # Convert to the named list to an environment to be passed to JVM
+          valueMap <- convertNamedListToEnv(value)
 
           # When value is a named list, caller is expected not to pass in cols
           if (!is.null(cols)) {
@@ -1881,3 +1884,33 @@ setMethod("as.data.frame",
           }
           collect(x)
         })
+
+#' The specified DataFrame is attached to the R search path. This means that
+#' the DataFrame is searched by R when evaluating a variable, so columns in
+#' the DataFrame can be accessed by simply giving their names.
+#'
+#' @rdname attach
+#' @title Attach DataFrame to R search path
+#' @param what (DataFrame) The DataFrame to attach
+#' @param pos (integer) Specify position in search() where to attach.
+#' @param name (character) Name to use for the attached DataFrame. Names
+#'   starting with package: are reserved for library.
+#' @param warn.conflicts (logical) If TRUE, warnings are printed about conflicts
+#'   from attaching the database, unless that DataFrame contains an object
+#' @examples
+#' \dontrun{
+#' attach(irisDf)
+#' summary(Sepal_Width)
+#' }
+#' @seealso \link{detach}
+setMethod("attach",
+          signature(what = "DataFrame"),
+          function(what, pos = 2, name = deparse(substitute(what)), warn.conflicts = TRUE) {
+            cols <- columns(what)
+            stopifnot(length(cols) > 0)
+            newEnv <- new.env()
+            for (i in 1:length(cols)) {
+              assign(x = cols[i], value = what[, cols[i]], envir = newEnv)
+            }
+            attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts)
+          })
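
Usage sketch (illustrative, not part of the diff): the expanded join now accepts
both the underscore and no-underscore spellings -- "left_outer" is normalized to
"leftouter" via gsub() before being handed to the JVM -- and attach() places a
DataFrame's columns on the R search path. Assumes a live SparkR session with a
SQLContext named sqlContext; the toy data is hypothetical.

df1 <- createDataFrame(sqlContext, data.frame(id = 1:3, x = c("a", "b", "c")))
df2 <- createDataFrame(sqlContext, data.frame(id = 2:4, y = c(10, 20, 30)))
# Any of "left", "leftouter", or "left_outer" now selects a left outer join
joined <- join(df1, df2, df1$id == df2$id, "left_outer")

# attach() exposes each column as a free variable until detach() is called
attach(df1)
head(select(df1, x))   # x resolves through the attached environment
detach("df1")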

R/pkg/R/SQLContext.R

Lines changed: 25 additions & 14 deletions
@@ -32,6 +32,7 @@ infer_type <- function(x) {
          numeric = "double",
          raw = "binary",
          list = "array",
+         struct = "struct",
          environment = "map",
          Date = "date",
          POSIXlt = "timestamp",
@@ -44,39 +45,42 @@ infer_type <- function(x) {
     paste0("map<string,", infer_type(get(key, x)), ">")
   } else if (type == "array") {
     stopifnot(length(x) > 0)
+
+    paste0("array<", infer_type(x[[1]]), ">")
+  } else if (type == "struct") {
+    stopifnot(length(x) > 0)
     names <- names(x)
-    if (is.null(names)) {
-      paste0("array<", infer_type(x[[1]]), ">")
-    } else {
-      # StructType
-      types <- lapply(x, infer_type)
-      fields <- lapply(1:length(x), function(i) {
-        structField(names[[i]], types[[i]], TRUE)
-      })
-      do.call(structType, fields)
-    }
+    stopifnot(!is.null(names))
+
+    type <- lapply(seq_along(x), function(i) {
+      paste0(names[[i]], ":", infer_type(x[[i]]), ",")
+    })
+    type <- Reduce(paste0, type)
+    type <- paste0("struct<", substr(type, 1, nchar(type) - 1), ">")
   } else if (length(x) > 1) {
     paste0("array<", infer_type(x[[1]]), ">")
   } else {
     type
   }
 }
 
-#' Create a DataFrame from an RDD
+#' Create a DataFrame
 #'
-#' Converts an RDD to a DataFrame by infer the types.
+#' Converts R data.frame or list into DataFrame.
 #'
 #' @param sqlContext A SQLContext
 #' @param data An RDD or list or data.frame
 #' @param schema a list of column names or named list (StructType), optional
 #' @return an DataFrame
+#' @rdname createDataFrame
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
-#' rdd <- lapply(parallelize(sc, 1:10), function(x) list(a=x, b=as.character(x)))
-#' df <- createDataFrame(sqlContext, rdd)
+#' df1 <- as.DataFrame(sqlContext, iris)
+#' df2 <- as.DataFrame(sqlContext, list(3,4,5,6))
+#' df3 <- createDataFrame(sqlContext, iris)
 #' }
 
 # TODO(davies): support sampling and infer type from NA
@@ -149,6 +153,13 @@ createDataFrame <- function(sqlContext, data, schema = NULL, samplingRatio = 1.0
   dataFrame(sdf)
 }
 
+#' @rdname createDataFrame
+#' @aliases createDataFrame
+#' @export
+as.DataFrame <- function(sqlContext, data, schema = NULL, samplingRatio = 1.0) {
+  createDataFrame(sqlContext, data, schema, samplingRatio)
+}
+
 # toDF
 #
 # Converts an RDD to a DataFrame by infer the types.
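
Usage sketch (illustrative, not part of the diff): as.DataFrame is a thin alias
that forwards all four arguments to createDataFrame, so either name accepts a
data.frame, a list, or an RDD plus an optional schema.

df1 <- as.DataFrame(sqlContext, iris)              # from a data.frame
df2 <- as.DataFrame(sqlContext, list(3, 4, 5, 6))  # from a list
# Equivalent call through the original entry point:
df3 <- createDataFrame(sqlContext, iris)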

R/pkg/R/column.R

Lines changed: 5 additions & 7 deletions
@@ -36,13 +36,11 @@ setMethod("initialize", "Column", function(.Object, jc) {
   .Object
 })
 
-column <- function(jc) {
-  new("Column", jc)
-}
-
-col <- function(x) {
-  column(callJStatic("org.apache.spark.sql.functions", "col", x))
-}
+setMethod("column",
+          signature(x = "jobj"),
+          function(x) {
+            new("Column", x)
+          })
 
 #' @rdname show
 #' @name show
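
Design note (assumed from the code, not stated in the commit): column() becomes
an S4 generic, and this jobj method is the internal constructor that wraps a JVM
Column handle; the character method added in functions.R below is the
user-facing entry point. A hypothetical internal call path:

jc <- callJStatic("org.apache.spark.sql.functions", "col", "age")
column(jc)   # dispatches on jobj and wraps it in a new Column S4 object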

R/pkg/R/deserialize.R

Lines changed: 10 additions & 0 deletions
@@ -51,6 +51,7 @@ readTypedObject <- function(con, type) {
     "a" = readArray(con),
     "l" = readList(con),
     "e" = readEnv(con),
+    "s" = readStruct(con),
     "n" = NULL,
     "j" = getJobj(readString(con)),
     stop(paste("Unsupported type for deserialization", type)))
@@ -135,6 +136,15 @@ readEnv <- function(con) {
   env
 }
 
+# Read a field of StructType from DataFrame
+# into a named list in R whose class is "struct"
+readStruct <- function(con) {
+  names <- readObject(con)
+  fields <- readObject(con)
+  names(fields) <- names
+  listToStruct(fields)
+}
+
 readRaw <- function(con) {
   dataLen <- readInt(con)
   readBin(con, raw(), as.integer(dataLen), endian = "big")
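
Shape sketch (assumptions: listToStruct is a SparkR-internal helper defined in a
file not shown in this diff, and the wire ordering of names before values
follows the code above). A struct field deserializes to a named list tagged
with class "struct":

fields <- list(1L, "spark")
names(fields) <- c("id", "name")
s <- listToStruct(fields)
class(s)   # expected: "struct"
s$id       # expected: 1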

R/pkg/R/functions.R

Lines changed: 31 additions & 4 deletions
@@ -18,16 +18,21 @@
 #' @include generics.R column.R
 NULL
 
-#' Creates a \code{Column} of literal value.
+#' lit
 #'
-#' The passed in object is returned directly if it is already a \linkS4class{Column}.
-#' If the object is a Scala Symbol, it is converted into a \linkS4class{Column} also.
-#' Otherwise, a new \linkS4class{Column} is created to represent the literal value.
+#' A new \linkS4class{Column} is created to represent the literal value.
+#' If the parameter is a \linkS4class{Column}, it is returned unchanged.
 #'
 #' @family normal_funcs
 #' @rdname lit
 #' @name lit
 #' @export
+#' @examples
+#' \dontrun{
+#' lit(df$name)
+#' select(df, lit("x"))
+#' select(df, lit("2015-01-01"))
+#'}
 setMethod("lit", signature("ANY"),
           function(x) {
             jc <- callJStatic("org.apache.spark.sql.functions",
@@ -233,6 +238,28 @@ setMethod("ceil",
             column(jc)
           })
 
+#' Though scala functions has "col" function, we don't expose it in SparkR
+#' because we don't want to conflict with the "col" function in the R base
+#' package and we also have "column" function exported which is an alias of "col".
+col <- function(x) {
+  column(callJStatic("org.apache.spark.sql.functions", "col", x))
+}
+
+#' column
+#'
+#' Returns a Column based on the given column name.
+#'
+#' @rdname col
+#' @name column
+#' @family normal_funcs
+#' @export
+#' @examples \dontrun{column(df)}
+setMethod("column",
+          signature(x = "character"),
+          function(x) {
+            col(x)
+          })
+
 #' cos
 #'
 #' Computes the cosine of the given value.
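
Usage sketch (illustrative, not part of the diff): column() is now the exported
way to build a Column from a name, while col() stays private so it cannot mask
base R's col(). Assumes an existing DataFrame df with these columns.

c1 <- column("Sepal_Length")   # public: character -> Column
head(select(df, c1))

# lit() wraps an R literal as a Column (or passes a Column through unchanged)
head(select(df, lit("x"), lit(1L)))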

R/pkg/R/generics.R

Lines changed: 16 additions & 0 deletions
@@ -63,6 +63,10 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
 # @export
 setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
 
+# @rdname statfunctions
+# @export
+setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })
+
 # @rdname distinct
 # @export
 setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
@@ -505,6 +509,10 @@ setGeneric("sample",
 setGeneric("sample_frac",
            function(x, withReplacement, fraction, seed) { standardGeneric("sample_frac") })
 
+#' @rdname statfunctions
+#' @export
+setGeneric("sampleBy", function(x, col, fractions, seed) { standardGeneric("sampleBy") })
+
 #' @rdname saveAsParquetFile
 #' @export
 setGeneric("saveAsParquetFile", function(x, path) { standardGeneric("saveAsParquetFile") })
@@ -682,6 +690,10 @@ setGeneric("cbrt", function(x) { standardGeneric("cbrt") })
 #' @export
 setGeneric("ceil", function(x) { standardGeneric("ceil") })
 
+#' @rdname col
+#' @export
+setGeneric("column", function(x) { standardGeneric("column") })
+
 #' @rdname concat
 #' @export
 setGeneric("concat", function(x, ...) { standardGeneric("concat") })
@@ -995,3 +1007,7 @@ setGeneric("rbind", signature = "...")
 #' @rdname as.data.frame
 #' @export
 setGeneric("as.data.frame")
+
+#' @rdname attach
+#' @export
+setGeneric("attach")

R/pkg/R/mllib.R

Lines changed: 3 additions & 2 deletions
@@ -45,11 +45,12 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' summary(model)
 #'}
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"),
-          function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0) {
+          function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0,
+                   solver = "auto") {
             family <- match.arg(family)
             model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
                                  "fitRModelFormula", deparse(formula), data@sdf, family, lambda,
-                                 alpha)
+                                 alpha, solver)
             return(new("PipelineModel", model = model))
           })
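
Usage sketch (illustrative): glm() gains a solver argument that is forwarded to
SparkRWrappers. "auto" is the default; which other values are accepted depends
on the underlying ML estimator in this Spark version (assumption: e.g. "normal"
or "l-bfgs" for linear regression).

df <- createDataFrame(sqlContext, iris)
model <- glm(Sepal_Length ~ Sepal_Width, data = df, family = "gaussian",
             solver = "auto")
summary(model)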
