cmu-delphi
diff --git a/‎archive.qmd‎
Lines changed: 21 additions & 17 deletions b/‎archive.qmd‎
Lines changed: 21 additions & 17 deletions
diff --git a/‎epidf.qmd‎
Lines changed: 56 additions & 51 deletions b/‎epidf.qmd‎
Lines changed: 56 additions & 51 deletions
@@ -16,8 +16,8 @@ claims, available through the [COVIDcast
 API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). This
 signal is subject to very heavy and regular revision; you can read more about it
 on its [API documentation
-page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). We'll use the offline version stored in `{epidatasets}`.
-
+page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html).
+We'll use the offline version stored in `{epidatasets}`.
 
 ```{r, include=FALSE}
 source("_common.R")
@@ -36,7 +36,7 @@ tibble, provided that it has (at least) the following columns:
   the data for January 14, 2022 that were available one day later.
 
 As we can see from the above, the data frame returned by
-`epidatr::covidcast()` has the columns required for the `epi_archive`
+`epidatr::pub_covidcast()` has the columns required for the `epi_archive`
 format, so we use
 `as_epi_archive()` to cast it into `epi_archive` format.[^1]
 
@@ -47,7 +47,7 @@ to the [compactify vignette](https://cmu-delphi.github.io/epiprocess/articles/co
 
 ```{r}
 x <- archive_cases_dv_subset_dt %>%
-  select(geo_value, time_value, version, percent_cli)  %>%
+  select(geo_value, time_value, version, percent_cli) %>%
   as_epi_archive(compactify = TRUE)
 
 class(x)
@@ -70,8 +70,8 @@ below). There can only be a single row per unique combination of key variables,
 and therefore the key variables are critical for figuring out how to generate a
 snapshot of data from the archive, as of a given version (also described below).
 
-```{r, error=TRUE}
-key(x$DT)
+```{r}
+data.table::key(x$DT)
 ```
 
 In general, the last version of each observation is carried forward (LOCF) to
@@ -100,7 +100,7 @@ the signal variables as of a given version. This can be accessed via
 `epix_as_of()`.
 
 ```{r}
-x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01"))
+x_snapshot <- epix_as_of(x, version = as.Date("2021-06-01"))
 class(x_snapshot)
 x_snapshot
 max(x_snapshot$time_value)
@@ -120,7 +120,7 @@ this case, since updates to the current version may still come in at a later
 point in time, due to various reasons, such as synchronization issues.
 
 ```{r}
-x_latest <- epix_as_of(x, max_version = max(x$DT$version))
+x_latest <- epix_as_of(x, version = max(x$DT$version))
 ```
 
 Below, we pull several snapshots from the archive, spaced one month apart. We
@@ -134,27 +134,32 @@ versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month")
 snapshots <- map(
   versions,
   function(v) {
-    epix_as_of(x, max_version = v) %>% mutate(version = v)
-  }) %>%
+    epix_as_of(x, version = v) %>% mutate(version = v)
+  }
+) %>%
   list_rbind() %>%
   bind_rows(x_latest %>% mutate(version = self_max)) %>%
   mutate(latest = version == self_max)
 ```
 
 ```{r, fig.height=7}
 #| code-fold: true
-ggplot(snapshots %>% filter(!latest),
-            aes(x = time_value, y = percent_cli)) +
+ggplot(
+  snapshots %>% filter(!latest),
+  aes(x = time_value, y = percent_cli)
+) +
   geom_line(aes(color = factor(version)), na.rm = TRUE) +
   geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +
-  facet_wrap(~ geo_value, scales = "free_y", ncol = 1) +
+  facet_wrap(~geo_value, scales = "free_y", ncol = 1) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
   scale_color_viridis_d(option = "A", end = .9) +
   labs(x = "Date", y = "% of doctor's visits with CLI") +
   theme(legend.position = "none") +
-  geom_line(data = snapshots %>% filter(latest),
-               aes(x = time_value, y = percent_cli),
-            inherit.aes = FALSE, color = "black", na.rm = TRUE)
+  geom_line(
+    data = snapshots %>% filter(latest),
+    aes(x = time_value, y = percent_cli),
+    inherit.aes = FALSE, color = "black", na.rm = TRUE
+  )
 ```
 
 We can see some interesting and highly nontrivial revision behavior: at some
@@ -164,7 +169,6 @@ they overestimate it (both states towards the beginning of 2021), though not
 quite as dramatically. Modeling the revision process, which is often called
 *backfill modeling*, is an important statistical problem in and of itself.
 
-
 ## Merging `epi_archive` objects
 
 Now we demonstrate how to merge two `epi_archive` objects together, e.g., so
 
@@ -5,7 +5,7 @@
 source("_common.R")
 ```
 
-We'll start by showing how to get data into 
+We'll start by showing how to get data into
 `epi_df`, which is just
 a tibble with a bit of special structure, and is the format assumed by all of
 the functions in the `epiprocess` package. An `epi_df` object has (at least) the
@@ -43,16 +43,13 @@ cases <- pub_covidcast(
 colnames(cases)
 ```
 
-As we can see, a data frame returned by `epidatr::covidcast()` has the
+As we can see, a data frame returned by `epidatr::pub_covidcast()` has the
 columns required for an `epi_df` object (along with many others). We can use
 `as_epi_df()`, with specification of some relevant metadata, to bring the data
 frame into `epi_df` format.
 
 ```{r, message = FALSE}
-x <- as_epi_df(cases, 
-               geo_type = "state",
-               time_type = "day",
-               as_of = max(cases$issue)) %>%
+x <- as_epi_df(cases, as_of = max(cases$issue)) %>%
   select(geo_value, time_value, total_cases = value)
 
 class(x)
@@ -64,7 +61,7 @@ attributes(x)$metadata
 ## Some details on metadata
 
 In general, an `epi_df` object has the following fields in its metadata:
- 
+
 * `geo_type`: the type for the geo values.
 * `time_type`: the type for the time values.
 * `as_of`: the time value at which the given data were available.
@@ -86,10 +83,10 @@ data set. See the [archive
 vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
 more.
 
-If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a 
+If any of the `geo_type`, `time_type`, or `as_of` arguments are missing in a
 call to `as_epi_df()`, then this function will try to infer them from the passed
 object. Usually, `geo_type` and `time_type` can be inferred from the `geo_value`
-and `time_value` columns, respectively, but inferring the `as_of` field is not 
+and `time_value` columns, respectively, but inferring the `as_of` field is not
 as easy. See the documentation for `as_epi_df()` for more details.
 
 ```{r}
@@ -109,25 +106,29 @@ In the following examples we will show how to create an `epi_df` with additional
 set.seed(12345)
 ex1 <- tibble(
   geo_value = rep(c("ca", "fl", "pa"), each = 3),
-  county_code = c("06059", "06061", "06067", "12111", "12113", "12117",
-                  "42101", "42103", "42105"),
+  county_code = c(
+    "06059", "06061", "06067", "12111", "12113", "12117",
+    "42101", "42103", "42105"
+  ),
   time_value = rep(
-    seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"), 
-    length.out = 9),
+    seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"),
+    length.out = 9
+  ),
   value = rpois(9, 5)
-) %>% 
+) %>%
   as_tsibble(index = time_value, key = c(geo_value, county_code))
 
-ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03")
+ex1 <- as_epi_df(x = ex1, as_of = "2020-06-03")
 ```
 
 The metadata now includes `county_code` as an extra key.
+
 ```{r}
 attr(ex1, "metadata")
 ```
 
 
-### Dealing with misspecified column names 
+### Dealing with misspecified column names
 
 `epi_df` requires there to be columns `geo_value` and `time_value`; if they do not exist, then `as_epi_df()` throws an error.
 
@@ -136,27 +137,27 @@ ex2 <- data.frame(
   state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
   pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
   reported_date = rep(
-    seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), 
-    length.out = 9), # misnamed
+    seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"),
+    length.out = 9
+  ), # misnamed
   value = rpois(9, 5)
-) 
-ex2 %>% as_epi_df() 
+)
+ex2 %>% as_epi_df()
 ```
 
-The columns should be renamed to match `epi_df` format. 
+The columns should be renamed to match `epi_df` format.
 
 ```{r}
-ex2 <- ex2 %>% 
+ex2 <- ex2 %>%
   rename(geo_value = state, time_value = reported_date) %>%
-  as_epi_df(geo_type = "state", 
-            as_of = "2020-06-03", 
-            additional_metadata = list(other_keys = "pol")
+  as_epi_df(
+    as_of = "2020-06-03",
+    other_keys = "pol"
   )
 
 attr(ex2, "metadata")
 ```
 
-
 ### Adding additional keys to an `epi_df` object
 
 In the above examples, all the keys are added to objects prior to conversion to
@@ -166,22 +167,23 @@ We'll look at an included dataset and filter to a single state for simplicity.
 ```{r}
 ex3 <- jhu_csse_county_level_subset %>%
   filter(time_value > "2021-12-01", state_name == "Massachusetts") %>%
-  slice_tail(n = 6) 
-  
+  slice_tail(n = 6)
+
 attr(ex3, "metadata") # geo_type is county currently
 ```
 
-Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The "state" `geo_type` anticipates lower-case abbreviations, so we'll match that. 
+Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. The "state" `geo_type` anticipates lower-case abbreviations, so we'll match that.
 
 ```{r}
-ex3 <- ex3 %>% 
+ex3 <- ex3 %>%
   as_tibble() %>% # drop the `epi_df` class before adding additional metadata
   mutate(
     state = rep(tolower("MA"), 6),
-    pol = rep(c("blue", "swing", "swing"), each = 2)) %>%
-  as_epi_df(additional_metadata = list(other_keys = c("state", "pol")))
+    pol = rep(c("blue", "swing", "swing"), each = 2)
+  ) %>%
+  as_epi_df(other_keys = c("state", "pol"))
 
-attr(ex3,"metadata")
+attr(ex3, "metadata")
 ```
 
 Note that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` argument of `as_epi_df()`. They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys.
@@ -199,38 +201,38 @@ package. Of course, we can also write custom code for other downstream uses,
 like plotting, which is pretty easy to do with `ggplot2`.
 
 ```{r, message = FALSE, warning = FALSE}
-ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) + 
+ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +
   geom_line() +
   scale_color_brewer(palette = "Set1") +
   scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
   labs(x = "Date", y = "Cumulative COVID-19 cases", color = "State")
 ```
 
-Finally, we'll examine some data from other packages just to show how 
-we might get them into `epi_df` format. 
-The first is data on daily new (not cumulative) SARS 
-cases in Canada in 2003, from the 
+Finally, we'll examine some data from other packages just to show how
+we might get them into `epi_df` format.
+The first is data on daily new (not cumulative) SARS
+cases in Canada in 2003, from the
 [outbreaks](https://github.com/reconverse/outbreaks) package. New cases are
 broken into a few categories by provenance.
 
 ```{r}
 x <- outbreaks::sars_canada_2003 %>%
   mutate(geo_value = "ca") %>%
   select(geo_value, time_value = date, starts_with("cases")) %>%
-  as_epi_df(geo_type = "nation")
+  as_epi_df()
 
 head(x)
 ```
 
 ```{r}
 #| code-fold: true
-x <- x %>% 
+x <- x %>%
   pivot_longer(starts_with("cases"), names_to = "type") %>%
   mutate(type = substring(type, 7))
 
 ggplot(x, aes(x = time_value, y = value)) +
   geom_col(aes(fill = type), just = 0.5) +
-  scale_y_continuous(breaks = 0:4*2, expand = expansion(c(0, 0.05))) +
+  scale_y_continuous(breaks = 0:4 * 2, expand = expansion(c(0, 0.05))) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
   labs(x = "Date", y = "SARS cases in Canada", fill = "Type")
 ```
@@ -243,27 +245,30 @@ x <- outbreaks::ebola_sierraleone_2014 %>%
     cases = ifelse(status == "confirmed", 1, 0),
     province = case_when(
       district %in% c("Kailahun", "Kenema", "Kono") ~ "Eastern",
-      district %in% c("Bombali", "Kambia", "Koinadugu",
-                      "Port Loko", "Tonkolili") ~ "Northern",
+      district %in% c(
+        "Bombali", "Kambia", "Koinadugu",
+        "Port Loko", "Tonkolili"
+      ) ~ "Northern",
       district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ "Sourthern",
-      district %in% c("Western Rural", "Western Urban") ~ "Western")
-  ) %>% 
+      district %in% c("Western Rural", "Western Urban") ~ "Western"
+    )
+  ) %>%
   select(geo_value = province, time_value = date_of_onset, cases) %>%
   filter(cases == 1) %>%
-  group_by(geo_value, time_value) %>% 
+  group_by(geo_value, time_value) %>%
   summarise(cases = sum(cases)) %>%
-  as_epi_df(geo_type = "province")
+  as_epi_df()
 ```
 
 ```{r}
 #| code-fold: true
 #| fig-width: 8
 #| fig-height: 6
-ggplot(x, aes(x = time_value, y = cases)) + 
-  geom_col(aes(fill = geo_value), show.legend = FALSE) + 
-  facet_wrap(~ geo_value, scales = "free_y") +
+ggplot(x, aes(x = time_value, y = cases)) +
+  geom_col(aes(fill = geo_value), show.legend = FALSE) +
+  facet_wrap(~geo_value, scales = "free_y") +
   scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
-  labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone") 
+  labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
 ```