55source("_common.R")
66```
77
8- We'll start by showing how to get data into
8+ We'll start by showing how to get data into
99` epi_df ` , which is just
1010a tibble with a bit of special structure, and is the format assumed by all of
1111the functions in the ` epiprocess ` package. An ` epi_df ` object has (at least) the
@@ -43,16 +43,13 @@ cases <- pub_covidcast(
4343colnames(cases)
4444```
4545
46- As we can see, a data frame returned by ` epidatr::covidcast () ` has the
46+ As we can see, a data frame returned by ` epidatr::pub_covidcast () ` has the
4747columns required for an ` epi_df ` object (along with many others). We can use
4848` as_epi_df() ` , with specification of some relevant metadata, to bring the data
4949frame into ` epi_df ` format.
5050
5151``` {r, message = FALSE}
52- x <- as_epi_df(cases,
53- geo_type = "state",
54- time_type = "day",
55- as_of = max(cases$issue)) %>%
52+ x <- as_epi_df(cases, as_of = max(cases$issue)) %>%
5653 select(geo_value, time_value, total_cases = value)
5754
5855class(x)
@@ -64,7 +61,7 @@ attributes(x)$metadata
6461## Some details on metadata
6562
6663In general, an ` epi_df ` object has the following fields in its metadata:
67-
64+
6865* ` geo_type ` : the type for the geo values.
6966* ` time_type ` : the type for the time values.
7067* ` as_of ` : the time value at which the given data were available.
@@ -86,10 +83,10 @@ data set. See the [archive
8683vignette] ( https://cmu-delphi.github.io/epiprocess/articles/archive.html ) for
8784more.
8885
89- If any of the ` geo_type ` , ` time_type ` , or ` as_of ` arguments are missing in a
86+ If any of the ` geo_type ` , ` time_type ` , or ` as_of ` arguments are missing in a
9087call to ` as_epi_df() ` , then this function will try to infer them from the passed
9188object. Usually, ` geo_type ` and ` time_type ` can be inferred from the ` geo_value `
92- and ` time_value ` columns, respectively, but inferring the ` as_of ` field is not
89+ and ` time_value ` columns, respectively, but inferring the ` as_of ` field is not
9390as easy. See the documentation for ` as_epi_df() ` for more details.
9491
9592``` {r}
@@ -109,25 +106,29 @@ In the following examples we will show how to create an `epi_df` with additional
109106set.seed(12345)
110107ex1 <- tibble(
111108 geo_value = rep(c("ca", "fl", "pa"), each = 3),
112- county_code = c("06059", "06061", "06067", "12111", "12113", "12117",
113- "42101", "42103", "42105"),
109+ county_code = c(
110+ "06059", "06061", "06067", "12111", "12113", "12117",
111+ "42101", "42103", "42105"
112+ ),
114113 time_value = rep(
115- seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"),
116- length.out = 9),
114+ seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "1 day"),
115+ length.out = 9
116+ ),
117117 value = rpois(9, 5)
118- ) %>%
118+ ) %>%
119119 as_tsibble(index = time_value, key = c(geo_value, county_code))
120120
121- ex1 <- as_epi_df(x = ex1, geo_type = "state", time_type = "day", as_of = "2020-06-03")
121+ ex1 <- as_epi_df(x = ex1, as_of = "2020-06-03")
122122```
123123
124124The metadata now includes ` county_code ` as an extra key.
125+
125126``` {r}
126127attr(ex1, "metadata")
127128```
128129
129130
130- ### Dealing with misspecified column names
131+ ### Dealing with misspecified column names
131132
132133` epi_df ` requires there to be columns ` geo_value ` and ` time_value ` ; if they do not exist, then ` as_epi_df() ` throws an error.
133134
@@ -136,27 +137,27 @@ ex2 <- data.frame(
136137 state = rep(c("ca", "fl", "pa"), each = 3), # misnamed
137138 pol = rep(c("blue", "swing", "swing"), each = 3), # extra key
138139 reported_date = rep(
139- seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"),
140- length.out = 9), # misnamed
140+ seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"),
141+ length.out = 9
142+ ), # misnamed
141143 value = rpois(9, 5)
142- )
143- ex2 %>% as_epi_df()
144+ )
145+ ex2 %>% as_epi_df()
144146```
145147
146- The columns should be renamed to match ` epi_df ` format.
148+ The columns should be renamed to match ` epi_df ` format.
147149
148150``` {r}
149- ex2 <- ex2 %>%
151+ ex2 <- ex2 %>%
150152 rename(geo_value = state, time_value = reported_date) %>%
151- as_epi_df(geo_type = "state",
152- as_of = "2020-06-03",
153- additional_metadata = list( other_keys = "pol")
153+ as_epi_df(
154+ as_of = "2020-06-03",
155+ other_keys = "pol"
154156 )
155157
156158attr(ex2, "metadata")
157159```
158160
159-
160161### Adding additional keys to an ` epi_df ` object
161162
162163In the above examples, all the keys are added to objects prior to conversion to
@@ -166,22 +167,23 @@ We'll look at an included dataset and filter to a single state for simplicity.
166167``` {r}
167168ex3 <- jhu_csse_county_level_subset %>%
168169 filter(time_value > "2021-12-01", state_name == "Massachusetts") %>%
169- slice_tail(n = 6)
170-
170+ slice_tail(n = 6)
171+
171172attr(ex3, "metadata") # geo_type is county currently
172173```
173174
174- Now we add ` state ` (MA) and ` pol ` as new columns to the data and as new keys to the metadata. The "state" ` geo_type ` anticipates lower-case abbreviations, so we'll match that.
175+ Now we add ` state ` (MA) and ` pol ` as new columns to the data and as new keys to the metadata. The "state" ` geo_type ` anticipates lower-case abbreviations, so we'll match that.
175176
176177``` {r}
177- ex3 <- ex3 %>%
178+ ex3 <- ex3 %>%
178179 as_tibble() %>% # drop the `epi_df` class before adding additional metadata
179180 mutate(
180181 state = rep(tolower("MA"), 6),
181- pol = rep(c("blue", "swing", "swing"), each = 2)) %>%
182- as_epi_df(additional_metadata = list(other_keys = c("state", "pol")))
182+ pol = rep(c("blue", "swing", "swing"), each = 2)
183+ ) %>%
184+ as_epi_df(other_keys = c("state", "pol"))
183185
184- attr(ex3,"metadata")
186+ attr(ex3, "metadata")
185187```
186188
187189Note that the two additional keys we added, ` state ` and ` pol ` , are specified as a character vector in the ` other_keys ` argument of ` as_epi_df() ` . They must be specified in this manner so that downstream actions on the ` epi_df ` , like model fitting and prediction, can recognize and use these keys.
@@ -199,38 +201,38 @@ package. Of course, we can also write custom code for other downstream uses,
199201like plotting, which is pretty easy to do with ` ggplot2 ` .
200202
201203``` {r, message = FALSE, warning = FALSE}
202- ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +
204+ ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +
203205 geom_line() +
204206 scale_color_brewer(palette = "Set1") +
205207 scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
206208 labs(x = "Date", y = "Cumulative COVID-19 cases", color = "State")
207209```
208210
209- Finally, we'll examine some data from other packages just to show how
210- we might get them into ` epi_df ` format.
211- The first is data on daily new (not cumulative) SARS
212- cases in Canada in 2003, from the
211+ Finally, we'll examine some data from other packages just to show how
212+ we might get them into ` epi_df ` format.
213+ The first is data on daily new (not cumulative) SARS
214+ cases in Canada in 2003, from the
213215[ outbreaks] ( https://github.com/reconverse/outbreaks ) package. New cases are
214216broken into a few categories by provenance.
215217
216218``` {r}
217219x <- outbreaks::sars_canada_2003 %>%
218220 mutate(geo_value = "ca") %>%
219221 select(geo_value, time_value = date, starts_with("cases")) %>%
220- as_epi_df(geo_type = "nation" )
222+ as_epi_df()
221223
222224head(x)
223225```
224226
225227``` {r}
226228#| code-fold: true
227- x <- x %>%
229+ x <- x %>%
228230 pivot_longer(starts_with("cases"), names_to = "type") %>%
229231 mutate(type = substring(type, 7))
230232
231233ggplot(x, aes(x = time_value, y = value)) +
232234 geom_col(aes(fill = type), just = 0.5) +
233- scale_y_continuous(breaks = 0:4* 2, expand = expansion(c(0, 0.05))) +
235+ scale_y_continuous(breaks = 0:4 * 2, expand = expansion(c(0, 0.05))) +
234236 scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
235237 labs(x = "Date", y = "SARS cases in Canada", fill = "Type")
236238```
@@ -243,27 +245,30 @@ x <- outbreaks::ebola_sierraleone_2014 %>%
243245 cases = ifelse(status == "confirmed", 1, 0),
244246 province = case_when(
245247 district %in% c("Kailahun", "Kenema", "Kono") ~ "Eastern",
246- district %in% c("Bombali", "Kambia", "Koinadugu",
247- "Port Loko", "Tonkolili") ~ "Northern",
248+ district %in% c(
249+ "Bombali", "Kambia", "Koinadugu",
250+ "Port Loko", "Tonkolili"
251+ ) ~ "Northern",
248252 district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ "Southern",
249- district %in% c("Western Rural", "Western Urban") ~ "Western")
250- ) %>%
253+ district %in% c("Western Rural", "Western Urban") ~ "Western"
254+ )
255+ ) %>%
251256 select(geo_value = province, time_value = date_of_onset, cases) %>%
252257 filter(cases == 1) %>%
253- group_by(geo_value, time_value) %>%
258+ group_by(geo_value, time_value) %>%
254259 summarise(cases = sum(cases)) %>%
255- as_epi_df(geo_type = "province" )
260+ as_epi_df()
256261```
257262
258263``` {r}
259264#| code-fold: true
260265#| fig-width: 8
261266#| fig-height: 6
262- ggplot(x, aes(x = time_value, y = cases)) +
263- geom_col(aes(fill = geo_value), show.legend = FALSE) +
264- facet_wrap(~ geo_value, scales = "free_y") +
267+ ggplot(x, aes(x = time_value, y = cases)) +
268+ geom_col(aes(fill = geo_value), show.legend = FALSE) +
269+ facet_wrap(~geo_value, scales = "free_y") +
265270 scale_x_date(minor_breaks = "month", date_labels = "%b %Y") +
266- labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
271+ labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone")
267272```
268273
269274
0 commit comments