-
Notifications
You must be signed in to change notification settings - Fork 2.1k
allow duplicate rows in x to be updated #5588
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8281e64
145be8e
3244a2d
b1e2192
7cf7299
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,8 +11,7 @@ | |
| #' | ||
| #' * `rows_insert()` adds new rows (like `INSERT`); key values in `y` must | ||
| #' not occur in `x`. | ||
| #' * `rows_update()` modifies existing rows (like `UPDATE`); key values in | ||
| #' `y` must occur in `x`. | ||
| #' * `rows_update()` modifies existing rows (like `UPDATE`) | ||
| #' * `rows_patch()` works like `rows_update()` but only overwrites `NA` values. | ||
| #' * `rows_upsert()` inserts or updates depending on whether or not the | ||
| #' key value in `y` already exists in `x`. | ||
|
|
@@ -82,15 +81,16 @@ rows_insert <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | |
| #' @export | ||
| rows_insert.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | ||
| key <- rows_check_key(by, x, y) | ||
| rows_check_subset(x, y) | ||
| y <- auto_copy(x, y, copy = copy) | ||
| rows_df_in_place(in_place) | ||
|
|
||
| rows_check_key_df(x, key, df_name = "x") | ||
| rows_check_key_df(y, key, df_name = "y") | ||
| rows_check_key_names_df(x, key, df_name = "x") | ||
| rows_check_key_names_df(y, key, df_name = "y") | ||
|
|
||
| idx <- vctrs::vec_match(y[key], x[key]) | ||
| bad <- which(!is.na(idx)) | ||
| if (has_length(bad)) { | ||
| rows_check_key_unique_df(y, key, df_name = "y") | ||
|
|
||
| if (any(vctrs::vec_in(y[key], x[key]))) { | ||
| abort("Attempting to insert duplicate rows.") | ||
| } | ||
|
|
||
|
|
@@ -109,19 +109,20 @@ rows_update <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | |
| #' @export | ||
| rows_update.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | ||
| key <- rows_check_key(by, x, y) | ||
| rows_check_subset(x, y) | ||
| y <- auto_copy(x, y, copy = copy) | ||
| rows_df_in_place(in_place) | ||
|
|
||
| rows_check_key_df(x, key, df_name = "x") | ||
| rows_check_key_df(y, key, df_name = "y") | ||
| idx <- vctrs::vec_match(y[key], x[key]) | ||
| rows_check_key_names_df(x, key, df_name = "x") | ||
| rows_check_key_names_df(y, key, df_name = "y") | ||
|
|
||
| bad <- which(is.na(idx)) | ||
| if (has_length(bad)) { | ||
| abort("Attempting to update missing rows.") | ||
| } | ||
| rows_check_key_unique_df(y, key, df_name = "y") | ||
|
|
||
| x[idx, names(y)] <- y | ||
| idx <- vctrs::vec_match(x[key], y[key]) | ||
| pos <- which(!is.na(idx)) | ||
| idx <- idx[pos] | ||
|
|
||
| x[pos, names(y)] <- y[idx, ] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Something like these might be useful tests to add x <- data.frame(a = c(1, 2), b = 1)
y <- data.frame(a = 3, b = 2)
# `y` key that isn't in `x` = no changes
expect_identical(rows_update(x, y, "a"), x)
x <- data.frame(a = c(1, 2, 1), b = 1)
y <- data.frame(a = 1, b = 2)
expect <- data.frame(a = c(1, 2, 1), b = c(2, 1, 2))
# can update duplicate keys in `x`
expect_identical(rows_update(x, y, "a"), expect) |
||
| x | ||
| } | ||
|
|
||
|
|
@@ -137,21 +138,20 @@ rows_patch <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | |
| #' @export | ||
| rows_patch.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | ||
| key <- rows_check_key(by, x, y) | ||
| rows_check_subset(x, y) | ||
| y <- auto_copy(x, y, copy = copy) | ||
| rows_df_in_place(in_place) | ||
|
|
||
| rows_check_key_df(x, key, df_name = "x") | ||
| rows_check_key_df(y, key, df_name = "y") | ||
| idx <- vctrs::vec_match(y[key], x[key]) | ||
| rows_check_key_names_df(x, key, df_name = "x") | ||
| rows_check_key_names_df(y, key, df_name = "y") | ||
|
|
||
| bad <- which(is.na(idx)) | ||
| if (has_length(bad)) { | ||
| abort("Attempting to patch missing rows.") | ||
| } | ||
| rows_check_key_unique_df(y, key, df_name = "y") | ||
|
|
||
| new_data <- map2(x[idx, names(y)], y, coalesce) | ||
| idx <- vctrs::vec_match(x[key], y[key]) | ||
| pos <- which(!is.na(idx)) | ||
| idx <- idx[pos] | ||
|
|
||
| x[idx, names(y)] <- new_data | ||
| x[pos, names(y)] <- map2(x[pos, names(y)], y[idx, ], coalesce) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I imagine this should only
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We would avoid extra work. |
||
| x | ||
| } | ||
|
|
||
|
|
@@ -167,18 +167,30 @@ rows_upsert <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | |
| #' @export | ||
| rows_upsert.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | ||
| key <- rows_check_key(by, x, y) | ||
| rows_check_subset(x, y) | ||
| y <- auto_copy(x, y, copy = copy) | ||
| rows_df_in_place(in_place) | ||
|
|
||
| rows_check_key_df(x, key, df_name = "x") | ||
| rows_check_key_df(y, key, df_name = "y") | ||
| idx <- vctrs::vec_match(y[key], x[key]) | ||
| new <- is.na(idx) | ||
| idx_existing <- idx[!new] | ||
| idx_new <- idx[new] | ||
| rows_check_key_names_df(x, key, df_name = "x") | ||
| rows_check_key_names_df(y, key, df_name = "y") | ||
|
|
||
| rows_check_key_unique_df(y, key, df_name = "y") | ||
|
|
||
| # update | ||
| idx <- vctrs::vec_match(x[key], y[key]) | ||
| pos <- which(!is.na(idx)) | ||
| if (length(pos)) { | ||
| idx <- idx[pos] | ||
| x[pos, names(y)] <- y[idx, ] | ||
| } | ||
|
|
||
| # and insert | ||
| pos_insert <- which(!vctrs::vec_in(y[key], x[key])) | ||
| if (length(pos_insert)) { | ||
| x <- rows_bind(x, vec_slice(y, pos_insert)) | ||
| } | ||
| x | ||
|
|
||
| x[idx_existing, names(y)] <- vec_slice(y, !new) | ||
| rows_bind(x, vec_slice(y, new)) | ||
| } | ||
|
|
||
| #' @rdname rows | ||
|
|
@@ -193,11 +205,16 @@ rows_delete <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | |
| #' @export | ||
| rows_delete.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) { | ||
| key <- rows_check_key(by, x, y) | ||
| rows_check_subset(x, y) | ||
|
|
||
| y <- auto_copy(x, y, copy = copy) | ||
| rows_df_in_place(in_place) | ||
|
|
||
| rows_check_key_df(x, key, df_name = "x") | ||
| rows_check_key_df(y, key, df_name = "y") | ||
| rows_check_key_names_df(x, key, df_name = "x") | ||
| rows_check_key_unique_df(x, key, df_name = "x") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems useful to be able to delete multiple rows in x <- data.frame(a = c(1, 1, 1), b = c(1, 1, 2))
y <- data.frame(a = 1, b = 1)
# rows_delete(x, y, by = c("a", "b"))
x[3,]
#> a b
#> 3 1 2That just continues the theme of this PR which allows |
||
|
|
||
| rows_check_key_names_df(y, key, df_name = "y") | ||
| rows_check_key_unique_df(y, key, df_name = "y") | ||
|
|
||
| extra_cols <- setdiff(names(y), key) | ||
| if (has_length(extra_cols)) { | ||
|
|
@@ -206,14 +223,7 @@ rows_delete.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place | |
| ) | ||
| } | ||
|
|
||
| idx <- vctrs::vec_match(y[key], x[key]) | ||
|
|
||
| bad <- which(is.na(idx)) | ||
| if (has_length(bad)) { | ||
| abort("Attempting to delete missing rows.") | ||
| } | ||
|
|
||
| dplyr_row_slice(x, -idx) | ||
| x[which(!vctrs::vec_in(x[key], y[key])), ] | ||
| } | ||
|
|
||
| # helpers ----------------------------------------------------------------- | ||
|
|
@@ -234,19 +244,24 @@ rows_check_key <- function(by, x, y) { | |
| abort("`by` must be unnamed.") | ||
| } | ||
|
|
||
| by | ||
| } | ||
|
|
||
| rows_check_subset <- function(x, y) { | ||
| bad <- setdiff(colnames(y), colnames(x)) | ||
| if (has_length(bad)) { | ||
| abort("All columns in `y` must exist in `x`.") | ||
| } | ||
|
|
||
| by | ||
| } | ||
|
|
||
| rows_check_key_df <- function(df, by, df_name) { | ||
| rows_check_key_names_df <- function(df, by, df_name) { | ||
| y_miss <- setdiff(by, colnames(df)) | ||
| if (length(y_miss) > 0) { | ||
| abort(glue("All `by` columns must exist in `{df_name}`.")) | ||
| } | ||
| } | ||
|
|
||
| rows_check_key_unique_df <- function(df, by, df_name) { | ||
| if (vctrs::vec_duplicate_any(df[by])) { | ||
| abort(glue("`{df_name}` key values are not unique.")) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think being able to update multiple rows in
xwith the same key implies that we are okay with duplicate keys.It might make sense to remove this check, allowing you to also insert a row with a duplicate key.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure. For insertion we might really care for uniqueness. On the other hand, should
rows_insert()or the underlying storage be responsible for identifying duplicate key violations?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was also wondering if a
strict = TRUE/FALSEwould make sense for the data frame method. Ifstrict = TRUE, then we can't add a duplicate key tox. This argument might make sense for otherrows_*()functions too.