Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 60 additions & 45 deletions R/rows.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
#'
#' * `rows_insert()` adds new rows (like `INSERT`); key values in `y` must
#' not occur in `x`.
#' * `rows_update()` modifies existing rows (like `UPDATE`); key values in
#' `y` must occur in `x`.
#' * `rows_update()` modifies existing rows (like `UPDATE`)
#' * `rows_patch()` works like `rows_update()` but only overwrites `NA` values.
#' * `rows_upsert()` inserts or updates depending on whether or not the
#' key value in `y` already exists in `x`.
Expand Down Expand Up @@ -82,15 +81,16 @@ rows_insert <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
#' @export
rows_insert.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
key <- rows_check_key(by, x, y)
rows_check_subset(x, y)
y <- auto_copy(x, y, copy = copy)
rows_df_in_place(in_place)

rows_check_key_df(x, key, df_name = "x")
rows_check_key_df(y, key, df_name = "y")
rows_check_key_names_df(x, key, df_name = "x")
rows_check_key_names_df(y, key, df_name = "y")

idx <- vctrs::vec_match(y[key], x[key])
bad <- which(!is.na(idx))
if (has_length(bad)) {
rows_check_key_unique_df(y, key, df_name = "y")

if (any(vctrs::vec_in(y[key], x[key]))) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think being able to update multiple rows in x with the same key implies that we are okay with duplicate keys.

It might make sense to remove this check, allowing you to also insert a row with a duplicate key.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure. For insertion we might really care for uniqueness. On the other hand, should rows_insert() or the underlying storage be responsible for identifying duplicate key violations?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was also wondering if a strict = TRUE/FALSE would make sense for the data frame method. If strict = TRUE, then we can't add a duplicate key to x. This argument might make sense for other rows_*() functions too.

abort("Attempting to insert duplicate rows.")
}

Expand All @@ -109,19 +109,20 @@ rows_update <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
#' @export
rows_update.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
key <- rows_check_key(by, x, y)
rows_check_subset(x, y)
y <- auto_copy(x, y, copy = copy)
rows_df_in_place(in_place)

rows_check_key_df(x, key, df_name = "x")
rows_check_key_df(y, key, df_name = "y")
idx <- vctrs::vec_match(y[key], x[key])
rows_check_key_names_df(x, key, df_name = "x")
rows_check_key_names_df(y, key, df_name = "y")

bad <- which(is.na(idx))
if (has_length(bad)) {
abort("Attempting to update missing rows.")
}
rows_check_key_unique_df(y, key, df_name = "y")

x[idx, names(y)] <- y
idx <- vctrs::vec_match(x[key], y[key])
pos <- which(!is.na(idx))
idx <- idx[pos]

x[pos, names(y)] <- y[idx, ]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like these might be useful tests to add

x <- data.frame(a = c(1, 2), b = 1)
y <- data.frame(a = 3, b = 2)

# `y` key that isn't in `x` = no changes
expect_identical(rows_update(x, y, "a"), x)

x <- data.frame(a = c(1, 2, 1), b = 1)
y <- data.frame(a = 1, b = 2)
expect <- data.frame(a = c(1, 2, 1), b = c(2, 1, 2))

# can update duplicate keys in `x`
expect_identical(rows_update(x, y, "a"), expect)

x
}

Expand All @@ -137,21 +138,20 @@ rows_patch <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
#' @export
rows_patch.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
key <- rows_check_key(by, x, y)
rows_check_subset(x, y)
y <- auto_copy(x, y, copy = copy)
rows_df_in_place(in_place)

rows_check_key_df(x, key, df_name = "x")
rows_check_key_df(y, key, df_name = "y")
idx <- vctrs::vec_match(y[key], x[key])
rows_check_key_names_df(x, key, df_name = "x")
rows_check_key_names_df(y, key, df_name = "y")

bad <- which(is.na(idx))
if (has_length(bad)) {
abort("Attempting to patch missing rows.")
}
rows_check_key_unique_df(y, key, df_name = "y")

new_data <- map2(x[idx, names(y)], y, coalesce)
idx <- vctrs::vec_match(x[key], y[key])
pos <- which(!is.na(idx))
idx <- idx[pos]

x[idx, names(y)] <- new_data
x[pos, names(y)] <- map2(x[pos, names(y)], y[idx, ], coalesce)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I imagine this should only coalesce over columns in x and y that are not key columns - but I am not sure this would have any practical difference

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We would avoid extra work.

x
}

Expand All @@ -167,18 +167,30 @@ rows_upsert <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
#' @export
rows_upsert.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
key <- rows_check_key(by, x, y)
rows_check_subset(x, y)
y <- auto_copy(x, y, copy = copy)
rows_df_in_place(in_place)

rows_check_key_df(x, key, df_name = "x")
rows_check_key_df(y, key, df_name = "y")
idx <- vctrs::vec_match(y[key], x[key])
new <- is.na(idx)
idx_existing <- idx[!new]
idx_new <- idx[new]
rows_check_key_names_df(x, key, df_name = "x")
rows_check_key_names_df(y, key, df_name = "y")

rows_check_key_unique_df(y, key, df_name = "y")

# update
idx <- vctrs::vec_match(x[key], y[key])
pos <- which(!is.na(idx))
if (length(pos)) {
idx <- idx[pos]
x[pos, names(y)] <- y[idx, ]
}

# and insert
pos_insert <- which(!vctrs::vec_in(y[key], x[key]))
if (length(pos_insert)) {
x <- rows_bind(x, vec_slice(y, pos_insert))
}
x

x[idx_existing, names(y)] <- vec_slice(y, !new)
rows_bind(x, vec_slice(y, new))
}

#' @rdname rows
Expand All @@ -193,11 +205,16 @@ rows_delete <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
#' @export
rows_delete.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place = FALSE) {
key <- rows_check_key(by, x, y)
rows_check_subset(x, y)

y <- auto_copy(x, y, copy = copy)
rows_df_in_place(in_place)

rows_check_key_df(x, key, df_name = "x")
rows_check_key_df(y, key, df_name = "y")
rows_check_key_names_df(x, key, df_name = "x")
rows_check_key_unique_df(x, key, df_name = "x")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems useful to be able to delete multiple rows in x with the same key, i.e.

x <- data.frame(a = c(1, 1, 1), b = c(1, 1, 2))
y <- data.frame(a = 1, b = 1)

# rows_delete(x, y, by = c("a", "b"))
x[3,]
#>   a b
#> 3 1 2

That just continues the theme of this PR which allows x to have duplicate keys


rows_check_key_names_df(y, key, df_name = "y")
rows_check_key_unique_df(y, key, df_name = "y")

extra_cols <- setdiff(names(y), key)
if (has_length(extra_cols)) {
Expand All @@ -206,14 +223,7 @@ rows_delete.data.frame <- function(x, y, by = NULL, ..., copy = FALSE, in_place
)
}

idx <- vctrs::vec_match(y[key], x[key])

bad <- which(is.na(idx))
if (has_length(bad)) {
abort("Attempting to delete missing rows.")
}

dplyr_row_slice(x, -idx)
x[which(!vctrs::vec_in(x[key], y[key])), ]
}

# helpers -----------------------------------------------------------------
Expand All @@ -234,19 +244,24 @@ rows_check_key <- function(by, x, y) {
abort("`by` must be unnamed.")
}

by
}

rows_check_subset <- function(x, y) {
bad <- setdiff(colnames(y), colnames(x))
if (has_length(bad)) {
abort("All columns in `y` must exist in `x`.")
}

by
}

rows_check_key_df <- function(df, by, df_name) {
rows_check_key_names_df <- function(df, by, df_name) {
y_miss <- setdiff(by, colnames(df))
if (length(y_miss) > 0) {
abort(glue("All `by` columns must exist in `{df_name}`."))
}
}

rows_check_key_unique_df <- function(df, by, df_name) {
if (vctrs::vec_duplicate_any(df[by])) {
abort(glue("`{df_name}` key values are not unique."))
}
Expand Down
76 changes: 54 additions & 22 deletions tests/testthat/test-rows.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,29 @@ test_that("rows_update()", {
tibble(a = 1:3, b = c("a", "z", "z"), c = data$c)
)

expect_error(
rows_update(data, tibble(a = 2:3, b = "z"), by = c("a", "b")),
"update missing"
)

expect_silent(
expect_identical(
rows_update(data, tibble(b = "z", a = 2:3), by = "a"),
tibble(a = 1:3, b = c("a", "z", "z"), c = data$c)
)
)

expect_error(
rows_update(data, tibble(d = 1)),
"must exist"
)
expect_error(
rows_update(data, tibble(a = c(1, 1))),
"not unique"
)
expect_error(
rows_update(data, tibble(a = 1), by = "b"),
"must exist"
)
expect_error(
rows_update(data, tibble(a = 1), by = c(b = "a")),
"must be unnamed"
)
})

test_that("rows_patch()", {
Expand All @@ -41,17 +53,29 @@ test_that("rows_patch()", {
tibble(a = 1:3, b = c("a", "b", "z"), c = data$c)
)

expect_error(
rows_patch(data, tibble(a = 2:3, b = "z"), by = c("a", "b")),
"patch missing"
)

expect_silent(
expect_identical(
rows_patch(data, tibble(b = "z", a = 2:3), by = "a"),
tibble(a = 1:3, b = c("a", "b", "z"), c = data$c)
)
)

expect_error(
rows_patch(data, tibble(d = 1)),
"must exist"
)
expect_error(
rows_patch(data, tibble(a = c(1, 1))),
"not unique"
)
expect_error(
rows_patch(data, tibble(a = 1), by = "b"),
"must exist"
)
expect_error(
rows_patch(data, tibble(a = 1), by = c(b = "a")),
"must be unnamed"
)
})

test_that("rows_upsert()", {
Expand All @@ -61,6 +85,23 @@ test_that("rows_upsert()", {
rows_upsert(data, tibble(a = 2:4, b = "z")),
tibble(a = 1:4, b = c("a", "z", "z", "z"), c = c(data$c, NA))
)

expect_error(
rows_upsert(data, tibble(d = 1)),
"must exist"
)
expect_error(
rows_upsert(data, tibble(a = c(1, 1))),
"not unique"
)
expect_error(
rows_upsert(data, tibble(a = 1), by = "b"),
"must exist"
)
expect_error(
rows_upsert(data, tibble(a = 1), by = c(b = "a")),
"must be unnamed"
)
})

test_that("rows_delete()", {
Expand All @@ -71,19 +112,14 @@ test_that("rows_delete()", {
data[1, ]
)

expect_error(
rows_delete(data, tibble(a = 2:4)),
"delete missing"
)

expect_identical(
rows_delete(data, tibble(a = 2:3, b = "b")),
data[1, ]
)

expect_error(
rows_delete(data, tibble(a = 2:3, b = "b"), by = c("a", "b")),
"delete missing"
expect_message(
rows_delete(data, tibble(a = 2:3, b = "b")),
"Ignoring extra columns"
)
})

Expand All @@ -97,12 +133,10 @@ verify_output("test-rows.txt", {

"# Update"
rows_update(data, tibble(a = 2:3, b = "z"))
rows_update(data, tibble(a = 2:3, b = "z"), by = c("a", "b"))
rows_update(data, tibble(b = "z", a = 2:3), by = "a")

"# Variants: patch and upsert"
rows_patch(data, tibble(a = 2:3, b = "z"))
rows_patch(data, tibble(a = 2:3, b = "z"), by = c("a", "b"))
rows_upsert(data, tibble(a = 2:4, b = "z"))

"# Delete and truncate"
Expand All @@ -112,9 +146,7 @@ verify_output("test-rows.txt", {
rows_delete(data, tibble(a = 2:3, b = "b"), by = c("a", "b"))

"# Errors"
rows_insert(data[c(1, 1), ], tibble(a = 3))
rows_insert(data, tibble(a = c(4, 4)))

rows_insert(data, tibble(d = 4))
rows_insert(data, tibble(a = 4, b = "z"), by = "e")
})
22 changes: 9 additions & 13 deletions tests/testthat/test-rows.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,6 @@ Message: Matching, by = "a"
2 2 z 1.5
3 3 z 2.5

> rows_update(data, tibble(a = 2:3, b = "z"), by = c("a", "b"))
Error: Attempting to update missing rows.

> rows_update(data, tibble(b = "z", a = 2:3), by = "a")
# A tibble: 3 x 3
a b c
Expand All @@ -66,9 +63,6 @@ Message: Matching, by = "a"
2 2 b 1.5
3 3 z 2.5

> rows_patch(data, tibble(a = 2:3, b = "z"), by = c("a", "b"))
Error: Attempting to patch missing rows.

> rows_upsert(data, tibble(a = 2:4, b = "z"))
Message: Matching, by = "a"

Expand All @@ -95,7 +89,10 @@ Message: Matching, by = "a"
> rows_delete(data, tibble(a = 2:4))
Message: Matching, by = "a"

Error: Attempting to delete missing rows.
# A tibble: 1 x 3
a b c
<int> <chr> <dbl>
1 1 a 0.5

> rows_delete(data, tibble(a = 2:3, b = "b"))
Message: Matching, by = "a"
Expand All @@ -108,17 +105,16 @@ Message: Ignoring extra columns: b
1 1 a 0.5

> rows_delete(data, tibble(a = 2:3, b = "b"), by = c("a", "b"))
Error: Attempting to delete missing rows.
# A tibble: 2 x 3
a b c
<int> <chr> <dbl>
1 1 a 0.5
2 3 <NA> 2.5


Errors
======

> rows_insert(data[c(1, 1), ], tibble(a = 3))
Message: Matching, by = "a"

Error: `x` key values are not unique.

> rows_insert(data, tibble(a = c(4, 4)))
Message: Matching, by = "a"

Expand Down