-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-24537][R]Add array_remove / array_zip / map_from_arrays / array_distinct #21645
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -194,10 +194,12 @@ NULL | |
| #' \itemize{ | ||
| #' \item \code{array_contains}: a value to be checked if contained in the column. | ||
| #' \item \code{array_position}: a value to locate in the given array. | ||
| #' \item \code{array_remove}: a value to remove from the given array. | ||
| #' } | ||
| #' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains | ||
| #' additional named properties to control how it is converted, accepts the same | ||
| #' options as the JSON data source. | ||
| #' options as the JSON data source. In \code{arrays_zip}, this contains additional | ||
| #' Columns of arrays to be merged. | ||
| #' @name column_collection_functions | ||
| #' @rdname column_collection_functions | ||
| #' @family collection functions | ||
|
|
@@ -207,9 +209,9 @@ NULL | |
| #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) | ||
| #' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp)) | ||
| #' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1))) | ||
| #' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1))) | ||
| #' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1), array_distinct(tmp$v1))) | ||
| #' head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1))) | ||
| #' head(select(tmp, flatten(tmp$v1), reverse(tmp$v1))) | ||
| #' head(select(tmp, flatten(tmp$v1), reverse(tmp$v1), array_remove(tmp$v1, 21))) | ||
| #' tmp2 <- mutate(tmp, v2 = explode(tmp$v1)) | ||
| #' head(tmp2) | ||
| #' head(select(tmp, posexplode(tmp$v1))) | ||
|
|
@@ -221,6 +223,7 @@ NULL | |
| #' head(select(tmp3, element_at(tmp3$v3, "Valiant"))) | ||
| #' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp)) | ||
| #' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5))) | ||
| #' head(select(tmp4, arrays_zip(tmp4$v4, tmp4$v5), map_from_arrays(tmp4$v4, tmp4$v5))) | ||
| #' head(select(tmp, concat(df$mpg, df$cyl, df$hp))) | ||
| #' tmp5 <- mutate(df, v6 = create_array(df$model, df$model)) | ||
| #' head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL")))} | ||
|
|
@@ -1978,7 +1981,7 @@ setMethod("levenshtein", signature(y = "Column"), | |
| }) | ||
|
|
||
| #' @details | ||
| #' \code{months_between}: Returns number of months between dates \code{y} and \code{x}. | ||
| #' \code{months_between}: Returns number of months between dates \code{y} and \code{x}. | ||
| #' If \code{y} is later than \code{x}, then the result is positive. If \code{y} and \code{x} | ||
| #' are on the same day of month, or both are the last day of month, time of day will be ignored. | ||
| #' Otherwise, the difference is calculated based on 31 days per month, and rounded to 8 digits. | ||
|
|
@@ -3008,6 +3011,19 @@ setMethod("array_contains", | |
| column(jc) | ||
| }) | ||
|
|
||
| #' @details | ||
| #' \code{array_distinct}: Returns the array with its duplicate values removed. | ||
| #' | ||
| #' @rdname column_collection_functions | ||
| #' @aliases array_distinct array_distinct,Column-method | ||
| #' @note array_distinct since 2.4.0 | ||
| setMethod("array_distinct", | ||
| signature(x = "Column"), | ||
| function(x) { | ||
| # Hand the column's jc slot to the static array_distinct function and wrap the result. | ||
| column(callJStatic("org.apache.spark.sql.functions", "array_distinct", x@jc)) | ||
| }) | ||
|
|
||
| #' @details | ||
| #' \code{array_join}: Concatenates the elements of column using the delimiter. | ||
| #' Null values are replaced with nullReplacement if set, otherwise they are ignored. | ||
|
|
@@ -3071,6 +3087,19 @@ setMethod("array_position", | |
| column(jc) | ||
| }) | ||
|
|
||
| #' @details | ||
| #' \code{array_remove}: Removes all elements equal to \code{value} from the given array. | ||
| #' | ||
| #' @rdname column_collection_functions | ||
| #' @aliases array_remove array_remove,Column-method | ||
| #' @note array_remove since 2.4.0 | ||
| setMethod("array_remove", | ||
| signature(x = "Column", value = "ANY"), | ||
| function(x, value) { | ||
| # value is forwarded unchanged; only x is unwrapped to its jc slot. | ||
| column(callJStatic("org.apache.spark.sql.functions", "array_remove", x@jc, value)) | ||
| }) | ||
|
|
||
| #' @details | ||
| #' \code{array_repeat}: Creates an array containing \code{x} repeated the number of times | ||
| #' given by \code{count}. | ||
|
|
@@ -3120,6 +3149,24 @@ setMethod("arrays_overlap", | |
| column(jc) | ||
| }) | ||
|
|
||
| #' @details | ||
| #' \code{arrays_zip}: Returns a merged array of structs in which the N-th struct contains all N-th | ||
| #' values of input arrays. Every argument, including those supplied through \code{...}, must be | ||
| #' a Column; any other input raises an error. | ||
| #' | ||
| #' @rdname column_collection_functions | ||
| #' @aliases arrays_zip arrays_zip,Column-method | ||
| #' @note arrays_zip since 2.4.0 | ||
| setMethod("arrays_zip", | ||
| signature(x = "Column"), | ||
| function(x, ...) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should add in doc that |
||
| # Collect the jc slot of x and of every extra argument. inherits() is used instead of | ||
| # comparing class(arg) with "Column" so the check always yields a single TRUE/FALSE. | ||
| jcols <- lapply(list(x, ...), function(arg) { | ||
| stopifnot(inherits(arg, "Column")) | ||
| arg@jc | ||
| }) | ||
| jc <- callJStatic("org.apache.spark.sql.functions", "arrays_zip", jcols) | ||
| column(jc) | ||
| }) | ||
|
|
||
| #' @details | ||
| #' \code{flatten}: Creates a single array from an array of arrays. | ||
| #' If a structure of nested arrays is deeper than two levels, only one level of nesting is removed. | ||
|
|
@@ -3147,6 +3194,21 @@ setMethod("map_entries", | |
| column(jc) | ||
| }) | ||
|
|
||
| #' @details | ||
| #' \code{map_from_arrays}: Creates a new map column from two array columns. The array in | ||
| #' \code{x} is used for keys and the array in \code{y} is used for values. No element of the | ||
| #' key array should be null. | ||
| #' | ||
| #' @rdname column_collection_functions | ||
| #' @aliases map_from_arrays map_from_arrays,Column-method | ||
| #' @note map_from_arrays since 2.4.0 | ||
| setMethod("map_from_arrays", | ||
| signature(x = "Column", y = "Column"), | ||
| function(x, y) { | ||
| # Keys come from x and values from y, in that argument order. | ||
| column(callJStatic("org.apache.spark.sql.functions", "map_from_arrays", | ||
| x@jc, y@jc)) | ||
| }) | ||
|
|
||
| #' @details | ||
| #' \code{map_keys}: Returns an unordered array containing the keys of the map. | ||
| #' | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should add in doc that for this function,
value is ...? https://github.com/huaxingao/spark/blob/87100ec766acf647e91ab14450f1c33c0f929d52/R/pkg/R/functions.R#L193
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for your review. I will add in doc.