Skip to content

Commit 80e9cf1

Browse files
zero323Felix Cheung
authored andcommitted
[SPARK-20490][SPARKR] Add R wrappers for eqNullSafe and ! / not
## What changes were proposed in this pull request? - Add null-safe equality operator `%<=>%` (same as `o.a.s.sql.Column.eqNullSafe`, `o.a.s.sql.Column.<=>`) - Add boolean negation operator `!` and function `not`. ## How was this patch tested? Existing unit tests, additional unit tests, `check-cran.sh`. Author: zero323 <[email protected]> Closes #17783 from zero323/SPARK-20490.
1 parent 6613046 commit 80e9cf1

File tree

6 files changed

+117
-5
lines changed

6 files changed

+117
-5
lines changed

R/pkg/NAMESPACE

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,8 @@ exportMethods("arrange",
182182

183183
exportClasses("Column")
184184

185-
exportMethods("%in%",
185+
exportMethods("%<=>%",
186+
"%in%",
186187
"abs",
187188
"acos",
188189
"add_months",
@@ -291,6 +292,7 @@ exportMethods("%in%",
291292
"nanvl",
292293
"negate",
293294
"next_day",
295+
"not",
294296
"ntile",
295297
"otherwise",
296298
"over",

R/pkg/R/column.R

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,7 @@ operators <- list(
6767
"+" = "plus", "-" = "minus", "*" = "multiply", "/" = "divide", "%%" = "mod",
6868
"==" = "equalTo", ">" = "gt", "<" = "lt", "!=" = "notEqual", "<=" = "leq", ">=" = "geq",
6969
# we can not override `&&` and `||`, so use `&` and `|` instead
70-
"&" = "and", "|" = "or", #, "!" = "unary_$bang"
71-
"^" = "pow"
70+
"&" = "and", "|" = "or", "^" = "pow"
7271
)
7372
# First group of method names, stored as a character vector.
# NOTE(review): presumably Column methods that take no extra argument; the
# code consuming this vector is outside this excerpt -- confirm.
column_functions1 <- c("asc", "desc", "isNaN", "isNull", "isNotNull")
7473
# Second group of method names, stored as a character vector.
# NOTE(review): presumably Column methods that take one additional argument;
# the code consuming this vector is outside this excerpt -- confirm.
column_functions2 <- c("like", "rlike", "getField", "getItem", "contains")
@@ -302,3 +301,55 @@ setMethod("otherwise",
302301
jc <- callJMethod(x@jc, "otherwise", value)
303302
column(jc)
304303
})
304+
305+
#' \%<=>\%
306+
#'
307+
#' Equality test that is safe for null values.
308+
#'
309+
#' Can be used, unlike standard equality operator, to perform null-safe joins.
310+
#' Equivalent to Scala \code{Column.<=>} and \code{Column.eqNullSafe}.
311+
#'
312+
#' @param x a Column
313+
#' @param value a value to compare
314+
#' @rdname eq_null_safe
315+
#' @name %<=>%
316+
#' @aliases %<=>%,Column-method
317+
#' @export
318+
#' @examples
319+
#' \dontrun{
320+
#' df1 <- createDataFrame(data.frame(
321+
#' x = c(1, NA, 3, NA), y = c(2, 6, 3, NA)
322+
#' ))
323+
#'
324+
#' head(select(df1, df1$x == df1$y, df1$x %<=>% df1$y))
325+
#'
326+
#' df2 <- createDataFrame(data.frame(y = c(3, NA)))
327+
#' count(join(df1, df2, df1$y == df2$y))
328+
#'
329+
#' count(join(df1, df2, df1$y %<=>% df2$y))
330+
#' }
331+
#' @note \%<=>\% since 2.3.0
332+
setMethod("%<=>%",
          signature(x = "Column", value = "ANY"),
          function(x, value) {
            # Null-safe equality: calls the JVM method "eqNullSafe" on the
            # reference held by x and passes the result to column().
            #
            # A Column operand is unwrapped to its JVM reference; any other
            # value is handed to the JVM call unchanged. inherits() replaces
            # class(value) == "Column" because class() can return several
            # names (e.g. matrices or S3 objects), which turns the comparison
            # into a length > 1 condition for `if`, and because an exact
            # string comparison does not recognise subclasses of Column.
            rhs <- if (inherits(value, "Column")) value@jc else value
            column(callJMethod(x@jc, "eqNullSafe", rhs))
          })
339+
340+
#' !
341+
#'
342+
#' Inversion of boolean expression.
343+
#'
344+
#' @rdname not
345+
#' @name not
346+
#' @aliases !,Column-method
347+
#' @export
348+
#' @examples
349+
#' \dontrun{
350+
#' df <- createDataFrame(data.frame(x = c(-1, 0, 1)))
351+
#'
352+
#' head(select(df, !column("x") > 0))
353+
#' }
354+
#' @note ! since 2.3.0
355+
setMethod("!",
          signature(x = "Column"),
          function(x) {
            # The operator form simply forwards its operand to not().
            not(x)
          })

R/pkg/R/functions.R

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3859,3 +3859,34 @@ setMethod("posexplode_outer",
38593859
jc <- callJStatic("org.apache.spark.sql.functions", "posexplode_outer", x@jc)
38603860
column(jc)
38613861
})
3862+
3863+
#' not
3864+
#'
3865+
#' Inversion of boolean expression.
3866+
#'
3867+
#' \code{not} and \code{!} cannot be applied directly to a numerical column.
3868+
#' To achieve R-like truthiness, the column has to be cast to \code{BooleanType}.
3869+
#'
3870+
#' @param x Column to compute on
3871+
#' @rdname not
3872+
#' @name not
3873+
#' @aliases not,Column-method
3874+
#' @export
3875+
#' @examples \dontrun{
3876+
#' df <- createDataFrame(data.frame(
3877+
#' is_true = c(TRUE, FALSE, NA),
3878+
#' flag = c(1, 0, 1)
3879+
#' ))
3880+
#'
3881+
#' head(select(df, not(df$is_true)))
3882+
#'
3883+
#' # Explicit cast is required when working with numeric column
3884+
#' head(select(df, not(cast(df$flag, "boolean"))))
3885+
#' }
3886+
#' @note not since 2.3.0
3887+
setMethod("not", signature(x = "Column"), function(x) {
  # Invoke the static "not" function of org.apache.spark.sql.functions on the
  # JVM reference held by x, then pass the returned reference to column().
  column(callJStatic("org.apache.spark.sql.functions", "not", x@jc))
})

R/pkg/R/generics.R

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,10 @@ setGeneric("otherwise", function(x, value) { standardGeneric("otherwise") })
856856
#' @export
857857
setGeneric(
  name = "over",
  def = function(x, window) {
    # Dispatch to the method registered for the classes of the arguments.
    standardGeneric("over")
  }
)
858858

859+
#' @rdname eq_null_safe
860+
#' @export
861+
setGeneric(
  name = "%<=>%",
  def = function(x, value) {
    # Dispatch to the method registered for the classes of the arguments.
    standardGeneric("%<=>%")
  }
)
862+
859863
###################### WindowSpec Methods ##########################
860864

861865
#' @rdname partitionBy
@@ -1154,6 +1158,10 @@ setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") })
11541158
#' @export
11551159
setGeneric(
  name = "negate",
  def = function(x) {
    # Dispatch to the method registered for the class of x.
    standardGeneric("negate")
  }
)
11561160

1161+
#' @rdname not
1162+
#' @export
1163+
setGeneric(
  name = "not",
  def = function(x) {
    # Dispatch to the method registered for the class of x.
    standardGeneric("not")
  }
)
1164+
11571165
#' @rdname next_day
11581166
#' @export
11591167
setGeneric(
  name = "next_day",
  def = function(y, x) {
    # Dispatch to the method registered for the classes of the arguments.
    standardGeneric("next_day")
  }
)

R/pkg/inst/tests/testthat/test_context.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@ test_that("Check masked functions", {
2121
# Check that we are not masking any new function from base, stats, testthat unexpectedly
2222
# NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it
2323
# hard for users to use base R functions. Please check when in doubt.
24-
namesOfMaskedCompletely <- c("cov", "filter", "sample")
24+
namesOfMaskedCompletely <- c("cov", "filter", "sample", "not")
2525
namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
2626
"colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
27-
"summary", "transform", "drop", "window", "as.data.frame", "union")
27+
"summary", "transform", "drop", "window", "as.data.frame", "union", "not")
2828
if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
2929
namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
3030
}

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1323,6 +1323,8 @@ test_that("column operators", {
13231323
c3 <- (c + c2 - c2) * c2 %% c2
13241324
c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3)
13251325
c5 <- c2 ^ c3 ^ c4
1326+
c6 <- c2 %<=>% c3
1327+
c7 <- !c6
13261328
})
13271329

13281330
test_that("column functions", {
@@ -1348,6 +1350,7 @@ test_that("column functions", {
13481350
c19 <- spark_partition_id() + coalesce(c) + coalesce(c1, c2, c3)
13491351
c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy")
13501352
c21 <- posexplode_outer(c) + explode_outer(c)
1353+
c22 <- not(c)
13511354

13521355
# Test if base::is.nan() is exposed
13531356
expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
@@ -1488,6 +1491,13 @@ test_that("column functions", {
14881491
lapply(
14891492
list(list(x = 1, y = -1, z = -2), list(x = 2, y = 3, z = 5)),
14901493
as.environment))
1494+
1495+
df <- as.DataFrame(data.frame(is_true = c(TRUE, FALSE, NA)))
1496+
expect_equal(
1497+
collect(select(df, alias(not(df$is_true), "is_false"))),
1498+
data.frame(is_false = c(FALSE, TRUE, NA))
1499+
)
1500+
14911501
})
14921502

14931503
test_that("column binary mathfunctions", {
@@ -1973,6 +1983,16 @@ test_that("filter() on a DataFrame", {
19731983
filtered6 <- where(df, df$age %in% c(19, 30))
19741984
expect_equal(count(filtered6), 2)
19751985

1986+
# test suites for %<=>%
1987+
dfNa <- read.json(jsonPathNa)
1988+
expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1)
1989+
expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 - 1)
1990+
expect_equal(count(filter(dfNa, dfNa$age %<=>% NULL)), 3)
1991+
expect_equal(count(filter(dfNa, !(dfNa$age %<=>% NULL))), 5 - 3)
1992+
# match NA from two columns
1993+
expect_equal(count(filter(dfNa, dfNa$age %<=>% dfNa$height)), 2)
1994+
expect_equal(count(filter(dfNa, !(dfNa$age %<=>% dfNa$height))), 5 - 2)
1995+
19761996
# Test stats::filter is working
19771997
#expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint
19781998
})

0 commit comments

Comments
 (0)