diff --git a/.quarto/listing/listing-cache.json b/.quarto/listing/listing-cache.json index f26dc3d..7e7e30f 100644 --- a/.quarto/listing/listing-cache.json +++ b/.quarto/listing/listing-cache.json @@ -6,6 +6,9 @@ "posts/2020-09-21-data-visualization/index.qmd": [ "B3049D40" ], + "posts/2022-03-22-bayes-for-the-busy-ecologist/index.qmd": [ + "B3049D40" + ], "posts/2021-11-02-introduction-to-gams/index.qmd": [ "B3049D40" ], diff --git a/.quarto/xref/1893bfdc b/.quarto/xref/1893bfdc index 8cb276d..6abc991 100644 --- a/.quarto/xref/1893bfdc +++ b/.quarto/xref/1893bfdc @@ -1 +1 @@ -{"headings":["instructor","outline","get-course-materials","install-required-software","get-the-notes","useful-resources","references","license"],"entries":[]} \ No newline at end of file +{"entries":[],"headings":["instructor","outline","get-course-materials","install-required-software","get-the-notes","useful-resources","references","license"]} \ No newline at end of file diff --git a/.quarto/xref/3766ef87 b/.quarto/xref/3766ef87 index 4415144..3c35f2b 100644 --- a/.quarto/xref/3766ef87 +++ b/.quarto/xref/3766ef87 @@ -1 +1 @@ -{"headings":["introduction-to-edi-concepts-in-a-scientific-context","introduction-aux-concepts-edi-en-contexte-scientifique"],"entries":[]} \ No newline at end of file +{"entries":[],"headings":["introduction-to-edi-concepts-in-a-scientific-context","introduction-aux-concepts-edi-en-contexte-scientifique"]} \ No newline at end of file diff --git a/.quarto/xref/442bc2ed b/.quarto/xref/442bc2ed index c4b6de6..fafdf78 100644 --- a/.quarto/xref/442bc2ed +++ b/.quarto/xref/442bc2ed @@ -1 +1 @@ 
-{"entries":[],"headings":["spatial-statistics-in-ecology","course-outline","introduction","types-of-spatial-analyses","stationarity-and-isotropy","georeferenced-data","point-pattern","point-pattern-and-point-process","complete-spatial-randomness","exploratory-or-inferential-analysis-for-a-point-pattern","ripleys-k-function","edge-effects","example","exercise-1","effect-of-heterogeneity","exercise-2","relationship-between-two-point-patterns","questions","marked-point-patterns","references","solutions","exercise-1-1","exercise-2-1","spatial-correlation","intrinsic-or-induced-dependence","different-ways-to-model-spatial-effects","geostat-models","variogram","theoretical-models-for-the-variogram","empirical-variogram","regression-model-with-spatial-correlation","geostatistical-models-in-r","regression-with-spatial-correlation","exercise","kriging","solutions-1","areal-data","moran-i","spatial-autoreg","conditional-autoregressive-car-model","simultaneous-autoregressive-sar-model","analysis-areal","definition-of-the-neighbourhood-network","spatial-autoregression-models","exercise-3","reference","glmm-spatial-gaussian","data","non-spatial-glmm","spatial-glmm-with-spamm","gaussian-process-models-vs.-smoothing-splines","bayesian-methods-for-glmms-with-gaussian-processes","glmm-spatial-autoreg","reference-1","statistiques-spatiales-en-écologie","plan-du-cours","introduction-fr","types-danalyses-spatiales","stationnarité-et-isotropie","données-géoréférencées","point-pattern-fr","patron-de-points-et-processus-ponctuel","structure-spatiale-totalement-aléatoire","analyse-exploratoire-ou-inférentielle-pour-un-patron-de-points","indice-k-de-ripley","effets-de-bordure","exemple","exercice-1","effet-de-lhétérogénéité","exercice-2","relation-entre-deux-patrons-de-points","questions-1","patrons-de-points-marqués","références","solutions-2","exercice-1-1","exercice-2-1","spatial-correlation-fr","dépendance-intrinsèque-ou-induite","différentes-façons-de-modéliser-les-effets-spatiaux","g
eostat-models-fr","variogramme","modèles-théoriques-du-variogramme","variogramme-empirique","modèle-de-régression-avec-corrélation-spatiale","modèles-géostatistiques-dans-r","régression-avec-corrélation-spatiale","exercice","krigeage","solutions-3","areal-data-fr","moran-i-fr","spatial-autoreg-fr","autorégression-conditionnelle-car","autorégression-simultanée-sar","analysis-areal-fr","définition-du-réseau-de-voisinage","modèles-dautorégression-spatiale","exercice-3","référence","glmm-spatial-gaussian-fr","données","glmm-non-spatial","glmm-spatial-avec-spamm","processus-gaussiens-vs.-splines-de-lissage","méthodes-bayésiennes-pour-les-glmm-avec-processus-gaussiens","glmm-spatial-autoreg-fr","référence-1"]} \ No newline at end of file +{"headings":["spatial-statistics-in-ecology","course-outline","introduction","types-of-spatial-analyses","stationarity-and-isotropy","georeferenced-data","point-pattern","point-pattern-and-point-process","complete-spatial-randomness","exploratory-or-inferential-analysis-for-a-point-pattern","ripleys-k-function","edge-effects","example","exercise-1","effect-of-heterogeneity","exercise-2","relationship-between-two-point-patterns","questions","marked-point-patterns","references","solutions","exercise-1-1","exercise-2-1","spatial-correlation","intrinsic-or-induced-dependence","different-ways-to-model-spatial-effects","geostat-models","variogram","theoretical-models-for-the-variogram","empirical-variogram","regression-model-with-spatial-correlation","geostatistical-models-in-r","regression-with-spatial-correlation","exercise","kriging","solutions-1","areal-data","moran-i","spatial-autoreg","conditional-autoregressive-car-model","simultaneous-autoregressive-sar-model","analysis-areal","definition-of-the-neighbourhood-network","spatial-autoregression-models","exercise-3","reference","glmm-spatial-gaussian","data","non-spatial-glmm","spatial-glmm-with-spamm","gaussian-process-models-vs.-smoothing-splines","bayesian-methods-for-glmms-with-gaussia
n-processes","glmm-spatial-autoreg","reference-1","statistiques-spatiales-en-écologie","plan-du-cours","introduction-fr","types-danalyses-spatiales","stationnarité-et-isotropie","données-géoréférencées","point-pattern-fr","patron-de-points-et-processus-ponctuel","structure-spatiale-totalement-aléatoire","analyse-exploratoire-ou-inférentielle-pour-un-patron-de-points","indice-k-de-ripley","effets-de-bordure","exemple","exercice-1","effet-de-lhétérogénéité","exercice-2","relation-entre-deux-patrons-de-points","questions-1","patrons-de-points-marqués","références","solutions-2","exercice-1-1","exercice-2-1","spatial-correlation-fr","dépendance-intrinsèque-ou-induite","différentes-façons-de-modéliser-les-effets-spatiaux","geostat-models-fr","variogramme","modèles-théoriques-du-variogramme","variogramme-empirique","modèle-de-régression-avec-corrélation-spatiale","modèles-géostatistiques-dans-r","régression-avec-corrélation-spatiale","exercice","krigeage","solutions-3","areal-data-fr","moran-i-fr","spatial-autoreg-fr","autorégression-conditionnelle-car","autorégression-simultanée-sar","analysis-areal-fr","définition-du-réseau-de-voisinage","modèles-dautorégression-spatiale","exercice-3","référence","glmm-spatial-gaussian-fr","données","glmm-non-spatial","glmm-spatial-avec-spamm","processus-gaussiens-vs.-splines-de-lissage","méthodes-bayésiennes-pour-les-glmm-avec-processus-gaussiens","glmm-spatial-autoreg-fr","référence-1"],"entries":[]} \ No newline at end of file diff --git a/.quarto/xref/INDEX b/.quarto/xref/INDEX index 508eef2..1caf742 100644 --- a/.quarto/xref/INDEX +++ b/.quarto/xref/INDEX @@ -70,5 +70,8 @@ }, "summer-schools/BiodiversityModelling2022.qmd": { "BiodiversityModelling2022.html": "7416f5a6" + }, + "posts/2022-03-22-bayes-for-the-busy-ecologist/index.qmd": { + "index.html": "c178496b" } } \ No newline at end of file diff --git a/.quarto/xref/c738fe30 b/.quarto/xref/c738fe30 index 7df77d5..208374b 100644 --- a/.quarto/xref/c738fe30 +++ 
b/.quarto/xref/c738fe30 @@ -1 +1 @@ -{"headings":[],"entries":[]} \ No newline at end of file +{"entries":[],"headings":[]} \ No newline at end of file diff --git a/.quarto/xref/d5122045 b/.quarto/xref/d5122045 index 208374b..7df77d5 100644 --- a/.quarto/xref/d5122045 +++ b/.quarto/xref/d5122045 @@ -1 +1 @@ -{"entries":[],"headings":[]} \ No newline at end of file +{"headings":[],"entries":[]} \ No newline at end of file diff --git a/.quarto/xref/e80605ec b/.quarto/xref/e80605ec index 4a71287..6dfb720 100644 --- a/.quarto/xref/e80605ec +++ b/.quarto/xref/e80605ec @@ -1 +1 @@ -{"headings":["training-material","interactive-examples","streamgraph","interactive-plot","example-figures","annotated-resource-library","books-articles","design-principles","choosing-a-visualization","colour","tools","r","base-r","ggplot2","python","julia","customization","inspiration-pretty-things"],"entries":[]} \ No newline at end of file +{"entries":[],"headings":["training-material","interactive-examples","streamgraph","interactive-plot","example-figures","annotated-resource-library","books-articles","design-principles","choosing-a-visualization","colour","tools","r","base-r","ggplot2","python","julia","customization","inspiration-pretty-things"]} \ No newline at end of file diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/execute-results/html.json b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/execute-results/html.json new file mode 100644 index 0000000..8e9cda1 --- /dev/null +++ b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/execute-results/html.json @@ -0,0 +1,16 @@ +{ + "hash": "f07663379a35eeae870d2aa0e2428ede", + "result": { + "markdown": "---\ntitle: \"Bayes for the Busy Ecologist\"\ndescription: \"This workshop presents one idea of a complete workflow for applied Bayesian statistics with real-world models that are actually used by biodiversity scientists.\"\nauthor:\n - name: \"Andrew MacDonald\"\n affiliation: \"BIOS² and 
Université de Sherbrooke\"\ndate: \"22-03-2022\"\nimage: image.jpg\ncategories: [Technical, EN]\ntoc: true\nnumber-sections: true\nnumber-depth: 1\n---\n\n\n\n\n# Applied Bayesian models for working ecologists {-}\n\nThis training covers how to write down a model, how to translate that into computer code, how to fit it to data and finally, how to work with the resulting posterior distribution. We’ll use Stan, which is a language for writing Bayesian models, and practice with some of the tools that help us work with Stan: rstanarm, brms, tidybayes, shinystan.\n\nThis training is intended for experienced users of Bayesian tools and for complete beginners who want to use such tools.\n\nThis 6 to 8h online workshop was conducted in 4 sessions: March 22, 24, 29, 31, 2022, from 11AM – 1 PM Pacific Time / 12 – 2 PM Mountain Time / 2-4 PM Eastern Time. The training was built and presented by Dr. Andrew MacDonald in English with bilingual support throughout.\n\nAndrew MacDonald is the Training Coordinator of the BIOS² program. He is a quantitative ecologist who works mostly in R and a well-experienced trainer in teaching quantitative and computational methods. He is currently a research professional at Université de Sherbrooke.\n\n# Day 1\n\n# Day 2: Hierarchical and nonlinear models\n\nIn which we discuss many groups and curving lines.\n\n## Outline\n\n* Return to previous model: Poisson regression \n* Panel regression version of this model\n* Bayesian workflow\n* Brief foray into moment matching\n* Nonlinear model\n* Nonlinear model with random effects\n\n## Quick review\n\n### Bird masses\n\nThis example is based on work by Marie-Eve at UdeS! \n\nWe imagine a model like the following: \n\n$$\n\\begin{align}\n\\text{Nestlings}_i & \\sim \\text{Poisson}(\\lambda_i) \\\\\n\\text{log}(\\lambda_i) &= \\beta_0 + \\beta_1 \\times \\text{Mass}_i \\\\\n\\beta_0 & \\sim \\text{Normal}(??, ??) 
\\\\\n\\beta_1 & \\sim \\text{Normal}(??, ??)\n\\end{align}\n$$\n\n\n$i$ keeps track of which bird we are talking about. You can think of it as \"bird number i\"\n\nNote: We could also write the model like this:\n\n$$\n\\begin{align}\n\\text{Nestlings}_i & \\sim \\text{Poisson}(e^{\\beta_0} \\times e^{\\beta_1 \\times \\text{Mass}_i}) \\\\\n\\beta_0 & \\sim \\text{Normal}(??, ??) \\\\\n\\beta_1 & \\sim \\text{Normal}(??, ??)\n\\end{align}\n$$\n\n### Centering variables\n\nCentering variables is one of the most important things we can do to help our models be more interpretable. This also helps us to set good priors. \n\nCentering a variable means to subtract the mean from the variable:\n\n$$\n\\begin{align}\n\\text{Nestlings}_i & \\sim \\text{Poisson}(\\lambda_i) \\\\\n\\text{log}(\\lambda_i) &= \\beta_0 + \\beta_1 \\times (\\text{Mass}_i - \\overline{\\text{Mass}}) \\\\\n\\beta_0 & \\sim \\text{Normal}(??, ??) \\\\\n\\beta_1 & \\sim \\text{Normal}(??, ??)\n\\end{align}\n$$\n\n*Question* How does this change the meaning of $\\beta_0$ and/or $\\beta_1$, if at all? (Hint: what will be the equation for a bird who has exactly average mass?) 
\n\n\n::: {.cell}\n\n```{.r .cell-code}\nset.seed(1234)\n\nn_birds <- 15\navg_nestlings_at_avg_mass <- log(4.2)\neffect_of_one_gram <- .2\n\nmother_masses_g <- rnorm(n_birds, mean = 15, sd = 3)\navg_mother_mass <- mean(mother_masses_g)\n\nlog_average_nestlings <- avg_nestlings_at_avg_mass + \n effect_of_one_gram * (mother_masses_g - avg_mother_mass)\n\nnestlings <- rpois(n = n_birds, lambda = exp(log_average_nestlings))\n```\n:::\n\n\nPlot these to get an idea of it:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsuppressPackageStartupMessages(library(tidyverse))\nimaginary_birds <- tibble(mother_masses_g, nestlings)\n\nggplot(imaginary_birds, aes(x = mother_masses_g, y = nestlings)) + \n geom_point()\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-1-1.png){width=672}\n:::\n:::\n\n\n*NOTE* We can also fit this very same model by frequentist statistics, using `lm`\n\n\n::: {.cell}\n\n```{.r .cell-code}\ncoef(glm(nestlings ~ 1 + I(mother_masses_g - mean(mother_masses_g)), family = \"poisson\"))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n (Intercept) \n 1.4138103 \nI(mother_masses_g - mean(mother_masses_g)) \n 0.1727791 \n```\n:::\n\n```{.r .cell-code}\n# compare to known values\navg_nestlings_at_avg_mass\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1.435085\n```\n:::\n\n```{.r .cell-code}\neffect_of_one_gram\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 0.2\n```\n:::\n:::\n\n\n### Bayesian workflow: define a model and priors\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(brms)\n\nimaginary_birds_centered <- imaginary_birds |> \n mutate(mother_mass_g_cen = mother_masses_g - mean(mother_masses_g))\n\nbird_form <- bf(nestlings ~ 1 + mother_mass_g_cen, family = poisson(link = \"log\"))\n\nget_prior(bird_form, data = imaginary_birds_centered)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n prior class coef group resp dpar nlpar lb ub\n (flat) b \n (flat) b mother_mass_g_cen \n student_t(3, 1.4, 2.5) Intercept 
\n source\n default\n (vectorized)\n default\n```\n:::\n:::\n\n\nWe set a prior for each parameter. \n\n\n::: {.cell}\n\n```{.r .cell-code}\nbird_priors <- c(\n prior(normal(1, .5), class = \"Intercept\"),\n prior(normal(.1, .1), class = \"b\", coef = \"mother_mass_g_cen\")\n)\n```\n:::\n\n\n#### Prior predictive checks\n\n\n::: {.cell}\n\n```{.r .cell-code}\nprior_predictions <- brm(bird_form,\n data = imaginary_birds_centered,\n prior = bird_priors, \n sample_prior = \"only\", \n file = \"bird_model\",\n file_refit = \"on_change\",\n refresh = FALSE)\n```\n:::\n\n\nPlot a few of these:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tidybayes)\nimaginary_birds_centered |> \n add_predicted_draws(prior_predictions, ndraws = 6, seed = 4321) |> \n ggplot(aes(x = mother_masses_g, y = .prediction)) + geom_point() + facet_wrap(~.draw)\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-6-1.png){width=672}\n:::\n:::\n\n\n*Question* are we satisfied with these priors?\n\n#### Fit to the data\n\n\n::: {.cell}\n\n```{.r .cell-code}\nbird_posterior <- update(prior_predictions, sample_prior = \"yes\", \n file = \"bird_posterior\", \n file_refit = \"on_change\", refresh = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nThe desired updates require recompiling the model\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary(bird_posterior)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n Family: poisson \n Links: mu = log \nFormula: nestlings ~ 1 + mother_mass_g_cen \n Data: imaginary_birds_centered (Number of observations: 15) \n Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;\n total post-warmup draws = 4000\n\nPopulation-Level Effects: \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nIntercept 1.39 0.13 1.14 1.64 1.00 1979 2070\nmother_mass_g_cen 0.16 0.04 0.07 0.25 1.00 2376 2463\n\nDraws were sampled using sampling(NUTS). 
For each parameter, Bulk_ESS\nand Tail_ESS are effective sample size measures, and Rhat is the potential\nscale reduction factor on split chains (at convergence, Rhat = 1).\n```\n:::\n\n```{.r .cell-code}\nknitr::kable(head(tidybayes::tidy_draws(bird_posterior)))\n```\n\n::: {.cell-output-display}\n| .chain| .iteration| .draw| b_Intercept| b_mother_mass_g_cen| prior_Intercept| prior_b_mother_mass_g_cen| lprior| lp__| accept_stat__| stepsize__| treedepth__| n_leapfrog__| divergent__| energy__|\n|------:|----------:|-----:|-----------:|-------------------:|---------------:|-------------------------:|---------:|---------:|-------------:|----------:|-----------:|------------:|-----------:|--------:|\n| 1| 1| 1| 1.450523| 0.1357033| 0.7548442| 0.2027851| 0.6881775| -29.18266| 0.9686242| 0.8998242| 2| 7| 0| 29.84270|\n| 1| 2| 2| 1.415657| 0.1138594| 0.7622771| 0.0676872| 0.8027096| -29.58861| 0.9127170| 0.8998242| 2| 3| 0| 29.93907|\n| 1| 3| 3| 1.456546| 0.1777444| 0.7965764| 0.1959839| 0.4387761| -29.24064| 0.9937692| 0.8998242| 2| 3| 0| 29.87044|\n| 1| 4| 4| 1.440214| 0.1352149| 0.7316238| 0.1870379| 0.7082742| -29.17389| 0.9942078| 0.8998242| 2| 7| 0| 29.34977|\n| 1| 5| 5| 1.436886| 0.1975367| 1.2892340| 0.0588215| 0.3004459| -29.50809| 0.9286261| 0.8998242| 2| 7| 0| 29.73433|\n| 1| 6| 6| 1.450670| 0.0961187| 0.4208806| 0.1258187| 0.7508956| -30.07357| 0.9696699| 0.8998242| 2| 3| 0| 30.30790|\n:::\n:::\n\n\nHow do our priors and posteriors compare?\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(ggridges)\ntidybayes::tidy_draws(bird_posterior) |> \n select(.draw, b_Intercept:prior_b_mother_mass_g_cen) |> \n pivot_longer(-.draw) |> \n ggplot(aes(x = value, y = name)) + geom_density_ridges()\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-9-1.png){width=672}\n:::\n:::\n\n\nCan we draw the regression line? 
\n\n\n::: {.cell}\n\n```{.r .cell-code}\naverage_mom <- mean(mother_masses_g)\n\nrange(imaginary_birds_centered$mother_mass_g_cen)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] -6.025202 4.265214\n```\n:::\n\n```{.r .cell-code}\ntibble(mother_mass_g_cen = modelr::seq_range(imaginary_birds_centered$mother_mass_g_cen, \n n = 10)) |> \n tidybayes::add_epred_draws(bird_posterior) |> \n ungroup() |> \n ggplot(aes(x = average_mom + mother_mass_g_cen, y = .epred)) + \n stat_lineribbon() + \n scale_fill_brewer(palette = \"Greens\", direction = -1) + \n geom_point(aes(x = mother_masses_g, y = nestlings),\n data = imaginary_birds_centered, pch = 21,\n fill = \"orange\", size = 3)\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-10-1.png){width=672}\n:::\n:::\n\n\nLet's also try drawing the prediction intervals.\n\n\n::: {.cell}\n\n```{.r .cell-code}\naverage_mom <- mean(mother_masses_g)\n\nrange(imaginary_birds_centered$mother_mass_g_cen)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] -6.025202 4.265214\n```\n:::\n\n```{.r .cell-code}\ntibble(mother_mass_g_cen = modelr::seq_range(imaginary_birds_centered$mother_mass_g_cen, \n n = 10)) |> \n tidybayes::add_predicted_draws(bird_posterior) |> \n ungroup() |> \n ggplot(aes(x = average_mom + mother_mass_g_cen, y = .prediction)) + \n stat_lineribbon() + \n scale_fill_brewer(palette = \"Greens\", direction = -1) + \n geom_point(aes(x = mother_masses_g, y = nestlings),\n data = imaginary_birds_centered, pch = 21,\n fill = \"orange\", size = 3)\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-11-1.png){width=672}\n:::\n:::\n\n\nOther checks we can do:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nbird_posterior_onlyparam <- update(prior_predictions, sample_prior = \"no\", \n file = \"bird_posterior\", \n file_refit = \"on_change\", refresh = FALSE)\n\nshinystan::launch_shinystan(bird_posterior_onlyparam)\n```\n:::\n\n\n\n## Multilevel models\n\nBased on the 
awesome vignette for vignette for `tidybayes`\n\nWe begin by sampling some data from five different \"conditions\":\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(modelr)\nset.seed(5)\nn <- 10\nn_condition <- 5\nABC <-\n data_frame(\n condition = rep(c(\"A\", \"B\", \"C\", \"D\", \"E\"), n),\n response = rnorm(n * 5, c(0, 1, 2, 1, -1), 0.5)\n )\n\nABC %>%\n ggplot(aes(y = condition, x = response)) +\n geom_point(pch = 21, size = 4, stroke = 1.4, fill = \"#41b6c4\")\n```\n\n::: {.cell-output-display}\n![](index_files/figure-html/setup-vig-1.png){width=672}\n:::\n:::\n\n\nAnd by fitting a model to these data, with varying intercepts for each group:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nm <- brm(\n response ~ (1 | condition), data = ABC, \n control = list(adapt_delta = .99),\n prior = c(\n prior(normal(0, 1), class = Intercept),\n prior(student_t(3, 0, 1), class = sd),\n prior(student_t(3, 0, 1), class = sigma)\n )\n)\n```\n:::\n\n\nAn easy way to visualize these results is with a _ridgeline plot_ as above\n\n\n::: {.cell}\n\n```{.r .cell-code}\nABC %>%\n modelr::data_grid(condition) %>%\n tidybayes::add_predicted_draws(m) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + \n theme_minimal()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nPicking joint bandwidth of 0.102\n```\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/m_plot-1.png){width=672}\n:::\n:::\n\n\nAlright. This used the simple vanilla option, `add_predicted_samples(m)`. This uses the default options for making predictions, which recall is \"NULL (default), include all group-level effects\". If you set `add_predicted_samples(m, re_formula = NULL)`, you'll get exactly the same figure. \n\nSo we can see that to \"include\" an effect is to take the actual estimated intercepts for each _specific group we studied_ and use them to make new predictions for the same groups. 
This is **Case 1** from McElreath's list (though in this case, because we only have groups and nothing else, Case 1 and 2 are the same). \n\nWe can also say the **exact same thing** using a formula: \n\n\n::: {.cell}\n\n```{.r .cell-code}\nABC %>%\n data_grid(condition) %>%\n add_predicted_draws(m, re_formula = ~(1|condition)) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + \n theme_minimal()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nPicking joint bandwidth of 0.1\n```\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/m_re_condition-1.png){width=672}\n:::\n:::\n\n\nThat's right, there are three ways to say the exact same thing: say nothing, say `NULL`, or say the original \"random effects\" formula^[this impulse in R to \"help your users\" by making it possible to say a great deal by saying almost nothing is... actually pretty counterproductive, I'd argue? But that's a different post]. You go with what you feel in your heart is right, but I prefer the formula. \n\nIn all three cases, we are using the model to predict the means for the groups in our varying-intercepts model. This is what the documentation means by \"including\" these varying intercepts.\n\n### Squishing those random effects\n\nOK, so that was three separate ways to make predictions for the _same_ groups. What else can we do? Let's try that thing with the `NA` argument, which means \"include no group-level effects\":\n\n\n::: {.cell}\n\n```{.r .cell-code}\nABC %>%\n data_grid(condition) %>%\n add_predicted_draws(m, re_formula = NA,\n n = 2000) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + theme_minimal()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nPicking joint bandwidth of 0.142\n```\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/m_plot_NA_TRUE-and_zero-1.png){width=672}\n:::\n:::\n\n\nAh, so if you do this, all the groups come out the same! 
But if they're all the same, what do they represent? It seems reasonable that they represent the model's intercept, as if the varying intercepts were all 0. Let's calculate predicitons that ignore the varying effects -- that is, using only the model intercept and the standard deviation of the response -- using a bit of [handy `purrr` magic]^[no magic required! `rnorm` is already vectorized]:\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nm %>% \n spread_draws(b_Intercept, sigma) %>% \n mutate(prediction = rnorm(length(b_Intercept), b_Intercept, sigma),\n #map2_dbl(b_Intercept, sigma, ~ rnorm(1, mean = .x, sd = .y)),\n Prediction = \"prediction\") %>% #glimpse %>% \n ggplot(aes(x = prediction, y = Prediction)) +\n geom_density_ridges(fill = \"#41b6c4\") + \n theme_minimal()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nPicking joint bandwidth of 0.119\n```\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-13-1.png){width=672}\n:::\n:::\n\n\nAs you can see, this distribution has exactly the same shape as the five in the previous figure! It is as if we calculated the predictions for a group which was exactly at the average (in other words, it had a varying intercept of 0.) In the Rethinking book, readers are taught to do this in a much more explicit way: you actually generate all the 0 intercepts yourself, and give that to the model in place of the estimated intercepts! A very manual and concrete way to \"set something to 0\". \n\n`brms` does this too. As the documentation says\n>NA values within factors in newdata, are interpreted as if all dummy variables of this factor are zero.\n\nThe `brms` phrasing certainly takes less space, though it also requires you to remember that this is what NA gets you!\n\nWe can also remove random effects from our predictions by excluding them from the `re_formula`. 
In our model, we have only one varying effect -- yet an even simpler formula is possible, a model with no intercept at all:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nABC %>%\n data_grid(condition) %>%\n add_predicted_draws(m, re_formula = ~ 0,\n n = 2000) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + theme_minimal() \n```\n\n::: {.cell-output .cell-output-stderr}\n```\nPicking joint bandwidth of 0.14\n```\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-14-1.png){width=672}\n:::\n:::\n\n\nOnce again, the same distribution appears: it is as if all group effects had been set to zero. If we had two random effects and omitted one, this is what we would get for the omitted effect -- the expected value if all its effects were 0.\n\n### New levels\n\nI'm going to show how to create predictions for new levels, but first I'm going to show two mistakes that I made frequently while learning:\n\nFirst, asking for new levels without specifying `allow_new_levels = TRUE`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# this does not work at all!!\ndata_frame(condition = \"bugaboo\") %>%\n add_predicted_draws(m, re_formula = ~(1|condition),\n n = 2000)\n```\n\n::: {.cell-output .cell-output-error}\n```\nError: Levels 'bugaboo' of grouping factor 'condition' cannot be found in the fitted model. Consider setting argument 'allow_new_levels' to TRUE.\n```\n:::\n:::\n\n\nThat fails because I tried to pass in a level of my grouping variable that _wasn't_ in the original model! 
\n\nSecond, passing in new levels -- but telling the function to just ignore them:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndata_frame(condition = \"bugaboo\") %>%\n add_predicted_draws(m, re_formula = NA,#~(1|condition),\n n = 2000) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + \n theme_minimal()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nPicking joint bandwidth of 0.142\n```\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/m_data_NA-1.png){width=672}\n:::\n:::\n\n\nHere, I'm still passing in the unknown level -- but the function doesn't complain, because I'm not including random effects at all! This is the same result from above, when we used `NA` or `~0` to remove varying effects altogether. This is definitely something to watch for if you are passing in new data (I made this mistake, and it cost me an afternoon!)\n\nIf we avoid both of these errors, we get what we expect: our means for our original groups, and a new predicted mean for `\"bugaboo\"`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nABC %>%\n data_grid(condition) %>% \n add_row(condition = \"bugaboo\") %>%\n add_predicted_draws(m, re_formula = ~(1|condition),\n allow_new_levels = TRUE,\n n = 2000) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + theme_minimal()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nPicking joint bandwidth of 0.131\n```\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/new_level-1.png){width=672}\n:::\n:::\n\n\nHere you can see that the new level is much flatter than the other original five. It comes from the same population as the others, which is rather variable (the group means are sort of different to each other). As a result, this new distribution is quite wide, including all that uncertainty. 
\n\nAn ecologist might do something like this if we were had data on _some_ species in a community, but wanted to make predictions for new, as yet unobserved, species we might find next year.\n\n# Day 3: Offsets\n\nIn which we cover how to do a count analysis for different efforts or exposure.\n\n## Outline\n\n- Poisson count model\n- Bayesian Poisson count model\n\n## Today's data: Dandelions\n\nLet's imagine that we have counted dandelions. \n\nDandelions occur on average 6 per square meter\n\nHowever we have five kinds of quadrat: 1, 4, 9 and 25 square meters\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tidyverse)\n\nimaginary_dandelions <- tibble(quadrat_size = rep(c(1,4, 9, 25), each = 15),\n n_per_m2 = purrr::map(quadrat_size, rpois, lambda = 6),\n obs_dandelions = map_dbl(n_per_m2, sum))\n\nggplot(imaginary_dandelions, aes(x = obs_dandelions)) + geom_histogram() + \n facet_wrap(~quadrat_size)\n```\n\n::: {.cell-output .cell-output-stderr}\n```\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-15-1.png){width=672}\n:::\n:::\n\n\nHow can we get the correct number of dandelions? \n\n## Poisson count model\n\n$$\n\\begin{align}\ny &\\sim \\text{Poisson}(\\lambda) \\\\\n\\text{log}(\\lambda) &= \\beta\n\\end{align}\n$$\n$\\lambda$ is the average response. If we want to measure the average _per unit effort_, we can do that too:\n\n$$\n\\begin{align}\ny &\\sim \\text{Poisson}(\\lambda) \\\\\n\\text{log}(\\lambda/Q) &= \\beta\n\\end{align}\n$$\n\n\n\n\n$$\n\\begin{align}\ny &\\sim \\text{Poisson}(\\lambda) \\\\\n\\text{log}(\\lambda) - \\text{log}(Q) &= \\beta\n\\end{align}\n$$\n\n\n$$\n\\begin{align}\ny &\\sim \\text{Poisson}(\\lambda) \\\\\n\\text{log}(\\lambda) &= \\beta + \\text{log}(Q)\n\\end{align}\n$$\n\nIn other words, we need a way to add a log coefficient to a model and give it a slope of _exactly one_. 
Fortunately the function `offset()` is here to do exactly this:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndandelion_model <- glm(obs_dandelions ~ 1, family = poisson(link = \"log\"), data = imaginary_dandelions)\nsummary(dandelion_model) \n```\n\n::: {.cell-output .cell-output-stdout}\n```\n\nCall:\nglm(formula = obs_dandelions ~ 1, family = poisson(link = \"log\"), \n data = imaginary_dandelions)\n\nDeviance Residuals: \n Min 1Q Median 3Q Max \n-9.802 -6.227 -2.658 2.539 11.652 \n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) 4.03836 0.01714 235.6 <2e-16 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for poisson family taken to be 1)\n\n Null deviance: 2909.7 on 59 degrees of freedom\nResidual deviance: 2909.7 on 59 degrees of freedom\nAIC: 3230.4\n\nNumber of Fisher Scoring iterations: 5\n```\n:::\n:::\n\n\nThis gives the wrong answer! \n \n\n::: {.cell}\n\n```{.r .cell-code}\ndandelion_model <- glm(obs_dandelions ~ 1 + offset(log(quadrat_size)),\n family = poisson(link = \"log\"),\n data = imaginary_dandelions)\nsummary(dandelion_model) \n```\n\n::: {.cell-output .cell-output-stdout}\n```\n\nCall:\nglm(formula = obs_dandelions ~ 1 + offset(log(quadrat_size)), \n family = poisson(link = \"log\"), data = imaginary_dandelions)\n\nDeviance Residuals: \n Min 1Q Median 3Q Max \n-1.83462 -0.45999 0.07473 0.46032 2.07858 \n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) 1.76109 0.01714 102.7 <2e-16 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n(Dispersion parameter for poisson family taken to be 1)\n\n Null deviance: 43.464 on 59 degrees of freedom\nResidual deviance: 43.464 on 59 degrees of freedom\nAIC: 364.13\n\nNumber of Fisher Scoring iterations: 4\n```\n:::\n:::\n\n\nThe coefficient should be close to 6, after we reverse the link function:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nexp(coef(dandelion_model)[[1]])\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 5.818803\n```\n:::\n:::\n\n\n## Do it the Bayes Way\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(brms)\n\ndandelion_bf <- bf(obs_dandelions ~ 1 + offset(log(quadrat_size)), \n family = poisson(link = \"log\"))\n\nget_prior(dandelion_bf, data = imaginary_dandelions)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nIntercept ~ student_t(3, 1.89940130916892, 2.5)\n```\n:::\n\n```{.r .cell-code}\ndandelion_prior <- prior(normal(2, 1), class = \"Intercept\")\n\ndandelion_model <- brm(formula = dandelion_bf,\n data = imaginary_dandelions, \n prior = dandelion_prior)\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nCompiling Stan program...\n```\n:::\n\n::: {.cell-output .cell-output-stderr}\n```\nStart sampling\n```\n:::\n\n::: {.cell-output .cell-output-stdout}\n```\n\nSAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 1).\nChain 1: \nChain 1: Gradient evaluation took 2.3e-05 seconds\nChain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.23 seconds.\nChain 1: Adjust your expectations accordingly!\nChain 1: \nChain 1: \nChain 1: Iteration: 1 / 2000 [ 0%] (Warmup)\nChain 1: Iteration: 200 / 2000 [ 10%] (Warmup)\nChain 1: Iteration: 400 / 2000 [ 20%] (Warmup)\nChain 1: Iteration: 600 / 2000 [ 30%] (Warmup)\nChain 1: Iteration: 800 / 2000 [ 40%] (Warmup)\nChain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)\nChain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)\nChain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)\nChain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)\nChain 1: Iteration: 1600 / 2000 
[ 80%] (Sampling)\nChain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)\nChain 1: Iteration: 2000 / 2000 [100%] (Sampling)\nChain 1: \nChain 1: Elapsed Time: 0.028059 seconds (Warm-up)\nChain 1: 0.026169 seconds (Sampling)\nChain 1: 0.054228 seconds (Total)\nChain 1: \n\nSAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 2).\nChain 2: \nChain 2: Gradient evaluation took 9e-06 seconds\nChain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.\nChain 2: Adjust your expectations accordingly!\nChain 2: \nChain 2: \nChain 2: Iteration: 1 / 2000 [ 0%] (Warmup)\nChain 2: Iteration: 200 / 2000 [ 10%] (Warmup)\nChain 2: Iteration: 400 / 2000 [ 20%] (Warmup)\nChain 2: Iteration: 600 / 2000 [ 30%] (Warmup)\nChain 2: Iteration: 800 / 2000 [ 40%] (Warmup)\nChain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)\nChain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)\nChain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)\nChain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)\nChain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)\nChain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)\nChain 2: Iteration: 2000 / 2000 [100%] (Sampling)\nChain 2: \nChain 2: Elapsed Time: 0.02398 seconds (Warm-up)\nChain 2: 0.024673 seconds (Sampling)\nChain 2: 0.048653 seconds (Total)\nChain 2: \n\nSAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 3).\nChain 3: \nChain 3: Gradient evaluation took 1.2e-05 seconds\nChain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.12 seconds.\nChain 3: Adjust your expectations accordingly!\nChain 3: \nChain 3: \nChain 3: Iteration: 1 / 2000 [ 0%] (Warmup)\nChain 3: Iteration: 200 / 2000 [ 10%] (Warmup)\nChain 3: Iteration: 400 / 2000 [ 20%] (Warmup)\nChain 3: Iteration: 600 / 2000 [ 30%] (Warmup)\nChain 3: Iteration: 800 / 2000 [ 40%] (Warmup)\nChain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)\nChain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)\nChain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)\nChain 3: Iteration: 
1400 / 2000 [ 70%] (Sampling)\nChain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)\nChain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)\nChain 3: Iteration: 2000 / 2000 [100%] (Sampling)\nChain 3: \nChain 3: Elapsed Time: 0.029274 seconds (Warm-up)\nChain 3: 0.02456 seconds (Sampling)\nChain 3: 0.053834 seconds (Total)\nChain 3: \n\nSAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 4).\nChain 4: \nChain 4: Gradient evaluation took 9e-06 seconds\nChain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.\nChain 4: Adjust your expectations accordingly!\nChain 4: \nChain 4: \nChain 4: Iteration: 1 / 2000 [ 0%] (Warmup)\nChain 4: Iteration: 200 / 2000 [ 10%] (Warmup)\nChain 4: Iteration: 400 / 2000 [ 20%] (Warmup)\nChain 4: Iteration: 600 / 2000 [ 30%] (Warmup)\nChain 4: Iteration: 800 / 2000 [ 40%] (Warmup)\nChain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)\nChain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)\nChain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)\nChain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)\nChain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)\nChain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)\nChain 4: Iteration: 2000 / 2000 [100%] (Sampling)\nChain 4: \nChain 4: Elapsed Time: 0.023883 seconds (Warm-up)\nChain 4: 0.021166 seconds (Sampling)\nChain 4: 0.045049 seconds (Total)\nChain 4: \n```\n:::\n:::\n\n\nLook at the Stan code:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nstancode(dandelion_model)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n// generated with brms 2.18.0\nfunctions {\n}\ndata {\n int N; // total number of observations\n int Y[N]; // response variable\n vector[N] offsets;\n int prior_only; // should the likelihood be ignored?\n}\ntransformed data {\n}\nparameters {\n real Intercept; // temporary intercept for centered predictors\n}\ntransformed parameters {\n real lprior = 0; // prior contributions to the log posterior\n lprior += normal_lpdf(Intercept | 2, 1);\n}\nmodel {\n // likelihood 
including constants\n if (!prior_only) {\n // initialize linear predictor term\n vector[N] mu = rep_vector(0.0, N);\n mu += Intercept + offsets;\n target += poisson_log_lpmf(Y | mu);\n }\n // priors including constants\n target += lprior;\n}\ngenerated quantities {\n // actual population-level intercept\n real b_Intercept = Intercept;\n}\n```\n:::\n:::\n\n\nLook at posterior distribution of parameter:\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# as.matrix(dandelion_model) |> head()\n\nlibrary(tidybayes)\n\ntidy_draws(dandelion_model) |> \n ggplot(aes(x = exp(b_Intercept))) + \n geom_histogram() + \n geom_vline(xintercept = 6, col = \"red\", lwd = 3)\n```\n\n::: {.cell-output .cell-output-stderr}\n```\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-21-1.png){width=672}\n:::\n:::\n", + "supporting": [ + "index_files" + ], + "filters": [ + "rmarkdown/pagebreak.lua" + ], + "includes": {}, + "engineDependencies": {}, + "preserve": {}, + "postProcess": true + } +} \ No newline at end of file diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_data_NA-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_data_NA-1.png new file mode 100644 index 0000000..eee6cc9 Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_data_NA-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_plot-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_plot-1.png new file mode 100644 index 0000000..fb1d03b Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_plot-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_plot_NA_TRUE-and_zero-1.png 
b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_plot_NA_TRUE-and_zero-1.png new file mode 100644 index 0000000..6a87b26 Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_plot_NA_TRUE-and_zero-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_re_condition-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_re_condition-1.png new file mode 100644 index 0000000..aa501d7 Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/m_re_condition-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/new_level-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/new_level-1.png new file mode 100644 index 0000000..26f89b6 Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/new_level-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/setup-vig-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/setup-vig-1.png new file mode 100644 index 0000000..9ffba3e Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/setup-vig-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-1-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-1-1.png new file mode 100644 index 0000000..3cf80bb Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-1-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-10-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-10-1.png new file mode 100644 index 
0000000..faa2e4e Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-10-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-11-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-11-1.png new file mode 100644 index 0000000..12451ec Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-11-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-13-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-13-1.png new file mode 100644 index 0000000..cfbc2a8 Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-13-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-14-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-14-1.png new file mode 100644 index 0000000..3f0b227 Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-14-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-15-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-15-1.png new file mode 100644 index 0000000..3a92180 Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-15-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-21-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-21-1.png new file mode 100644 index 0000000..0b24b51 Binary files /dev/null and 
b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-21-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-6-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-6-1.png new file mode 100644 index 0000000..afd9ad8 Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-6-1.png differ diff --git a/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-9-1.png b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-9-1.png new file mode 100644 index 0000000..7d60b93 Binary files /dev/null and b/_freeze/posts/2022-03-22-bayes-for-the-busy-ecologist/index/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/about.html b/docs/about.html index 3284658..f89d8d0 100644 --- a/docs/about.html +++ b/docs/about.html @@ -191,7 +191,7 @@

About

-
+
-
+
+
+

+
+ + +
+
@@ -241,7 +268,7 @@

-
+
@@ -268,7 +295,7 @@

-
+
@@ -295,7 +322,7 @@

-
+
@@ -322,7 +349,7 @@

-
+
@@ -349,7 +376,7 @@

-
+
@@ -376,7 +403,7 @@

-
+
@@ -403,7 +430,7 @@

-
+
@@ -430,7 +457,7 @@

-
+
@@ -457,7 +484,7 @@

-
+
@@ -484,7 +511,7 @@

-
+
@@ -511,7 +538,7 @@

-
+
@@ -538,7 +565,7 @@

-
+
@@ -565,7 +592,7 @@

-
+
diff --git a/docs/index.html b/docs/index.html index 9f331dd..ce308ca 100644 --- a/docs/index.html +++ b/docs/index.html @@ -159,7 +159,7 @@ +
Categories
All (14)
Career (1)
EN (13)
FR (3)
Fellow contributed (3)
Technical (11)
Transversal competencies (3)
@@ -204,7 +204,45 @@

BIOS² Education resources

-
+
+
+

+
+ + +
+
@@ -242,7 +280,7 @@

-
+
@@ -280,7 +318,7 @@

-
+
@@ -321,7 +359,7 @@

-
+
@@ -359,7 +397,7 @@

-
+
@@ -397,7 +435,7 @@

-
+
@@ -435,7 +473,7 @@

-
+
@@ -476,7 +514,7 @@

-
+
@@ -517,7 +555,7 @@

-
+
@@ -558,7 +596,7 @@

-
+
@@ -599,7 +637,7 @@

-
+
@@ -640,7 +678,7 @@

-
+
@@ -678,7 +716,7 @@

-
+
diff --git a/docs/listings.json b/docs/listings.json index 1f7935f..91dc22e 100644 --- a/docs/listings.json +++ b/docs/listings.json @@ -7,6 +7,10 @@ "listing": "/posts/2020-09-21-data-visualization/index.html", "items": [] }, + { + "listing": "/posts/2022-03-22-bayes-for-the-busy-ecologist/index.html", + "items": [] + }, { "listing": "/posts/2021-11-02-introduction-to-gams/index.html", "items": [] @@ -58,6 +62,7 @@ { "listing": "/index.html", "items": [ + "/posts/2022-03-22-bayes-for-the-busy-ecologist/index.html", "/posts/2022-05-19-introduction-to-microbiome-analysis/index.html", "/posts/2021-11-02-introduction-to-gams/index.html", "/posts/2021-06-22-introduction-to-shiny-apps/index.html", @@ -77,6 +82,7 @@ "listing": "/about.html", "items": [ "/index.html", + "/posts/2022-03-22-bayes-for-the-busy-ecologist/index.html", "/summer-schools/BiodiversityModelling2022.html", "/posts/2021-05-04-building-r-packages/index.html", "/posts/2020-09-21-data-visualization/index.html", diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index.html b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index.html new file mode 100644 index 0000000..f180ef5 --- /dev/null +++ b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index.html @@ -0,0 +1,1361 @@ + + + + + + + + + + + + +BIOS² Education resources - Bayes for the Busy Ecologist + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+ +
+
+
+

Bayes for the Busy Ecologist

+
+
+ This workshop presents one idea of a complete workflow for applied Bayesian statistics with real-world models that are actually used by biodiversity scientists. +
+
+
+
Technical
+
EN
+
+
+
+ +
+
Author
+
+ +
+ Andrew MacDonald +
+
+

+ BIOS² and Université de Sherbrooke +

+
+
+ +
+ + +
+
Published
+
+

October 3, 2023

+
+
+ + +
+ + +
+ + + + +
+ + + + +
+

Applied Bayesian models for working ecologists

+

This training covers how to write down a model, how to translate that into computer code, how to fit it to data and finally, how to work with the resulting posterior distribution. We’ll use Stan, which is a language for writing Bayesian models, and practice with some of the tools that help us work with Stan: rstanarm, brms, tidybayes, shinystan.

+

This training is intended for experienced users of Bayesian tools and for complete beginners who want to use such tools.

+

This 6 to 8h online workshop was conducted in 4 sessions: March 22, 24, 29, 31, 2022, from 11AM – 1 PM Pacific Time / 12 – 2 PM Mountain Time / 2-4 PM Eastern Time. The training was built and presented by Dr. Andrew MacDonald in English with bilingual support throughout.

+

Andrew MacDonald is the Training Coordinator of the BIOS² program. He is a quantitative ecologist who works mostly in R and a well-experienced trainer in teaching quantitative and computational methods. He is currently a research professional at Université de Sherbrooke.

+
+
+

1 Day 1

+
+
+

2 Day 2: Hierarchical and nonlinear models

+

In which we discuss many groups and curving lines.

+
+

Outline

+
    +
  • Return to previous model: Poisson regression
  • +
  • Panel regression version of this model
  • +
  • Bayesian workflow
  • +
  • Brief foray into moment matching
  • +
  • Nonlinear model
  • +
  • Nonlinear model with random effects
  • +
+
+
+

Quick review

+
+

Bird masses

+

This example is based on work by Marie-Eve at UdeS!

+

We imagine a model like the following:

+

\[ +\begin{align} +\text{Nestlings}_i & \sim \text{Poisson}(\lambda_i) \\ +\text{log}(\lambda_i) &= \beta_0 + \beta_1 \times \text{Mass}_i \\ +\beta_0 & \sim \text{Normal}(??, ??) \\ +\beta_1 & \sim \text{Normal}(??, ??) +\end{align} +\]

+

\(i\) keeps track of which bird we are talking about. You can think of it as “bird number i”

+

Note: We could also write the model like this:

+

\[ +\begin{align} +\text{Nestlings}_i & \sim \text{Poisson}(e^{\beta_0} \times e^{\beta_1 \times \text{Mass}_i}) \\ +\beta_0 & \sim \text{Normal}(??, ??) \\ +\beta_1 & \sim \text{Normal}(??, ??) +\end{align} +\]

+
+
+

Centering variables

+

Centering variables is one of the most important things we can do to help our models be more interpretable. This also helps us to set good priors.

+

Centering a variable means to subtract the mean from the variable:

+

\[ +\begin{align} +\text{Nestlings}_i & \sim \text{Poisson}(\lambda_i) \\ +\text{log}(\lambda_i) &= \beta_0 + \beta_1 \times (\text{Mass}_i - \overline{\text{Mass}}) \\ +\beta_0 & \sim \text{Normal}(??, ??) \\ +\beta_1 & \sim \text{Normal}(??, ??) +\end{align} +\]

+

Question How does this change the meaning of \(\beta_0\) and/or \(\beta_1\), if at all? (Hint: what will be the equation for a bird who has exactly average mass?)

+
+
set.seed(1234)
+
+n_birds <- 15
+avg_nestlings_at_avg_mass <- log(4.2)
+effect_of_one_gram <- .2
+
+mother_masses_g <- rnorm(n_birds, mean = 15, sd = 3)
+avg_mother_mass <- mean(mother_masses_g)
+
+log_average_nestlings <- avg_nestlings_at_avg_mass + 
+  effect_of_one_gram * (mother_masses_g - avg_mother_mass)
+
+nestlings <- rpois(n = n_birds, lambda = exp(log_average_nestlings))
+
+

Plot these to get an idea of it:

+
+
suppressPackageStartupMessages(library(tidyverse))
+imaginary_birds <- tibble(mother_masses_g, nestlings)
+
+ggplot(imaginary_birds, aes(x = mother_masses_g, y = nestlings)) + 
+  geom_point()
+
+

+
+
+

NOTE We can also fit this very same model by frequentist statistics, using lm

+
+
coef(glm(nestlings ~ 1 + I(mother_masses_g - mean(mother_masses_g)), family = "poisson"))
+
+
                               (Intercept) 
+                                 1.4138103 
+I(mother_masses_g - mean(mother_masses_g)) 
+                                 0.1727791 
+
+
# compare to known values
+avg_nestlings_at_avg_mass
+
+
[1] 1.435085
+
+
effect_of_one_gram
+
+
[1] 0.2
+
+
+
+
+

Bayesian workflow: define a model and priors

+
+
library(brms)
+
+imaginary_birds_centered <- imaginary_birds |> 
+  mutate(mother_mass_g_cen = mother_masses_g - mean(mother_masses_g))
+
+bird_form <- bf(nestlings ~ 1 + mother_mass_g_cen, family = poisson(link = "log"))
+
+get_prior(bird_form, data = imaginary_birds_centered)
+
+
                  prior     class              coef group resp dpar nlpar lb ub
+                 (flat)         b                                              
+                 (flat)         b mother_mass_g_cen                            
+ student_t(3, 1.4, 2.5) Intercept                                              
+       source
+      default
+ (vectorized)
+      default
+
+
+

We set a prior for each parameter.

+
+
bird_priors <- c(
+  prior(normal(1, .5), class = "Intercept"),
+  prior(normal(.1, .1), class = "b", coef = "mother_mass_g_cen")
+)
+
+
+

Prior predictive checks

+
+
prior_predictions <- brm(bird_form,
+                         data = imaginary_birds_centered,
+                         prior = bird_priors, 
+                         sample_prior = "only", 
+                         file = "bird_model",
+                         file_refit = "on_change",
+                         refresh = FALSE)
+
+

Plot a few of these:

+
+
library(tidybayes)
+imaginary_birds_centered |> 
+  add_predicted_draws(prior_predictions, ndraws = 6, seed = 4321) |> 
+  ggplot(aes(x = mother_masses_g, y = .prediction)) + geom_point() + facet_wrap(~.draw)
+
+

+
+
+

Question are we satisfied with these priors?

+
+
+

Fit to the data

+
+
bird_posterior <- update(prior_predictions, sample_prior = "yes", 
+                         file = "bird_posterior", 
+                         file_refit = "on_change", refresh = FALSE)
+
+
The desired updates require recompiling the model
+
+
+
+
summary(bird_posterior)
+
+
 Family: poisson 
+  Links: mu = log 
+Formula: nestlings ~ 1 + mother_mass_g_cen 
+   Data: imaginary_birds_centered (Number of observations: 15) 
+  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
+         total post-warmup draws = 4000
+
+Population-Level Effects: 
+                  Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
+Intercept             1.39      0.13     1.14     1.64 1.00     1979     2070
+mother_mass_g_cen     0.16      0.04     0.07     0.25 1.00     2376     2463
+
+Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
+and Tail_ESS are effective sample size measures, and Rhat is the potential
+scale reduction factor on split chains (at convergence, Rhat = 1).
+
+
knitr::kable(head(tidybayes::tidy_draws(bird_posterior)))
+
+ +++++++++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
.chain.iteration.drawb_Interceptb_mother_mass_g_cenprior_Interceptprior_b_mother_mass_g_cenlpriorlp__accept_stat__stepsize__treedepth__n_leapfrog__divergent__energy__
1111.4505230.13570330.75484420.20278510.6881775-29.182660.96862420.899824227029.84270
1221.4156570.11385940.76227710.06768720.8027096-29.588610.91271700.899824223029.93907
1331.4565460.17774440.79657640.19598390.4387761-29.240640.99376920.899824223029.87044
1441.4402140.13521490.73162380.18703790.7082742-29.173890.99420780.899824227029.34977
1551.4368860.19753671.28923400.05882150.3004459-29.508090.92862610.899824227029.73433
1661.4506700.09611870.42088060.12581870.7508956-30.073570.96966990.899824223030.30790
+
+
+

How do our priors and posteriors compare?

+
+
library(ggridges)
+tidybayes::tidy_draws(bird_posterior) |> 
+  select(.draw, b_Intercept:prior_b_mother_mass_g_cen) |> 
+  pivot_longer(-.draw) |> 
+  ggplot(aes(x = value, y = name)) + geom_density_ridges()
+
+

+
+
+

Can we draw the regression line?

+
+
average_mom <- mean(mother_masses_g)
+
+range(imaginary_birds_centered$mother_mass_g_cen)
+
+
[1] -6.025202  4.265214
+
+
tibble(mother_mass_g_cen = modelr::seq_range(imaginary_birds_centered$mother_mass_g_cen, 
+                                             n = 10)) |> 
+  tidybayes::add_epred_draws(bird_posterior) |> 
+  ungroup() |> 
+  ggplot(aes(x = average_mom + mother_mass_g_cen, y = .epred)) + 
+  stat_lineribbon() + 
+  scale_fill_brewer(palette = "Greens", direction = -1) + 
+  geom_point(aes(x = mother_masses_g, y = nestlings),
+             data = imaginary_birds_centered, pch = 21,
+             fill = "orange", size = 3)
+
+

+
+
+

Let’s also try drawing the prediction intervals.

+
+
average_mom <- mean(mother_masses_g)
+
+range(imaginary_birds_centered$mother_mass_g_cen)
+
+
[1] -6.025202  4.265214
+
+
tibble(mother_mass_g_cen = modelr::seq_range(imaginary_birds_centered$mother_mass_g_cen, 
+                                             n = 10)) |> 
+  tidybayes::add_predicted_draws(bird_posterior) |> 
+  ungroup() |> 
+  ggplot(aes(x = average_mom + mother_mass_g_cen, y = .prediction)) + 
+  stat_lineribbon() + 
+  scale_fill_brewer(palette = "Greens", direction = -1) + 
+  geom_point(aes(x = mother_masses_g, y = nestlings),
+             data = imaginary_birds_centered, pch = 21,
+             fill = "orange", size = 3)
+
+

+
+
+

Other checks we can do:

+
+
bird_posterior_onlyparam <- update(prior_predictions, sample_prior = "no", 
+                         file = "bird_posterior", 
+                         file_refit = "on_change", refresh = FALSE)
+
+shinystan::launch_shinystan(bird_posterior_onlyparam)
+
+
+
+
+
+

Multilevel models

+

Based on the awesome vignette for vignette for tidybayes

+

We begin by sampling some data from five different “conditions”:

+
+
library(modelr)
+set.seed(5)
+n <- 10
+n_condition <- 5
+ABC <-
+  data_frame(
+    condition = rep(c("A", "B", "C", "D", "E"), n),
+    response = rnorm(n * 5, c(0, 1, 2, 1, -1), 0.5)
+  )
+
+ABC %>%
+  ggplot(aes(y = condition, x = response)) +
+  geom_point(pch = 21, size = 4, stroke = 1.4, fill = "#41b6c4")
+
+

+
+
+

And by fitting a model to these data, with varying intercepts for each group:

+
+
m <- brm(
+  response ~ (1 | condition), data = ABC, 
+  control = list(adapt_delta = .99),
+  prior = c(
+    prior(normal(0, 1), class = Intercept),
+    prior(student_t(3, 0, 1), class = sd),
+    prior(student_t(3, 0, 1), class = sigma)
+  )
+)
+
+

An easy way to visualize these results is with a ridgeline plot as above

+
+
ABC %>%
+  modelr::data_grid(condition) %>%
+  tidybayes::add_predicted_draws(m) %>%
+  ggplot(aes(x = .prediction, y = condition)) +
+  geom_density_ridges(fill = "#41b6c4") + 
+  theme_minimal()
+
+
Picking joint bandwidth of 0.102
+
+
+

+
+
+

Alright. This used the simple vanilla option, add_predicted_samples(m). This uses the default options for making predictions, which recall is “NULL (default), include all group-level effects”. If you set add_predicted_samples(m, re_formula = NULL), you’ll get exactly the same figure.

+

So we can see that to “include” an effect is to take the actual estimated intercepts for each specific group we studied and use them to make new predictions for the same groups. This is Case 1 from McElreath’s list (though in this case, because we only have groups and nothing else, Case 1 and 2 are the same).

+

We can also say the exact same thing using a formula:

+
+
ABC %>%
+  data_grid(condition) %>%
+  add_predicted_draws(m, re_formula = ~(1|condition)) %>%
+  ggplot(aes(x = .prediction, y = condition)) +
+  geom_density_ridges(fill = "#41b6c4") +  
+  theme_minimal()
+
+
Picking joint bandwidth of 0.1
+
+
+

+
+
+

That’s right, there are three ways to say the exact same thing: say nothing, say NULL, or say the original “random effects” formula1. You go with what you feel in your heart is right, but I prefer the formula.

+

In all three cases, we are using the model to predict the means for the groups in our varying-intercepts model. This is what the documentation means by “including” these varying intercepts.

+
+

Squishing those random effects

+

OK, so that was three separate ways to make predictions for the same groups. What else can we do? Let’s try that thing with the NA argument, which means “include no group-level effects”:

+
+
ABC %>%
+  data_grid(condition) %>%
+  add_predicted_draws(m, re_formula = NA,
+                        n = 2000) %>%
+  ggplot(aes(x = .prediction, y = condition)) +
+  geom_density_ridges(fill = "#41b6c4") +    theme_minimal()
+
+
Picking joint bandwidth of 0.142
+
+
+

+
+
+

Ah, so if you do this, all the groups come out the same! But if they’re all the same, what do they represent? It seems reasonable that they represent the model’s intercept, as if the varying intercepts were all 0. Let’s calculate predicitons that ignore the varying effects – that is, using only the model intercept and the standard deviation of the response – using a bit of [handy purrr magic]2:

+
+
m %>% 
+  spread_draws(b_Intercept, sigma) %>% 
+  mutate(prediction = rnorm(length(b_Intercept), b_Intercept, sigma),
+         #map2_dbl(b_Intercept, sigma, ~ rnorm(1, mean = .x, sd = .y)),
+         Prediction = "prediction") %>% #glimpse %>% 
+  ggplot(aes(x = prediction, y = Prediction)) +
+  geom_density_ridges(fill = "#41b6c4") +    
+  theme_minimal()
+
+
Picking joint bandwidth of 0.119
+
+
+

+
+
+

As you can see, this distribution has exactly the same shape as the five in the previous figure! It is as if we calculated the predictions for a group which was exactly at the average (in other words, it had a varying intercept of 0.) In the Rethinking book, readers are taught to do this in a much more explicit way: you actually generate all the 0 intercepts yourself, and give that to the model in place of the estimated intercepts! A very manual and concrete way to “set something to 0”.

+

brms does this too. As the documentation says >NA values within factors in newdata, are interpreted as if all dummy variables of this factor are zero.

+

The brms phrasing certainly takes less space, though it also requires you to remember that this is what NA gets you!

+

We can also remove random effects from our predictions by excluding them from the re_formula. In our model, we have only one varying effect – yet an even simpler formula is possible, a model with no intercept at all:

+
+
ABC %>%
+  data_grid(condition) %>%
+  add_predicted_draws(m, re_formula = ~ 0,
+                        n = 2000) %>%
+  ggplot(aes(x = .prediction, y = condition)) +
+  geom_density_ridges(fill = "#41b6c4") + theme_minimal() 
+
+
Picking joint bandwidth of 0.14
+
+
+

+
+
+

Once again, the same distribution appears: it is as if all group effects had been set to zero. If we had two random effects and omitted one, this is what we would get for the omitted effect – the expected value if all its effects were 0.

+
+
+

New levels

+

I’m going to show how to create predictions for new levels, but first I’m going to show two mistakes that I made frequently while learning:

+

First, asking for new levels without specifying allow_new_levels = TRUE:

+
+
# this does not work at all!!
+data_frame(condition = "bugaboo") %>%
+  add_predicted_draws(m, re_formula = ~(1|condition),
+                        n = 2000)
+
+
Error: Levels 'bugaboo' of grouping factor 'condition' cannot be found in the fitted model. Consider setting argument 'allow_new_levels' to TRUE.
+
+
+

That fails because I tried to pass in a level of my grouping variable that wasn’t in the original model!

+

Second, passing in new levels – but telling the function to just ignore them:

+
+
data_frame(condition = "bugaboo") %>%
+  add_predicted_draws(m, re_formula = NA,#~(1|condition),
+                        n = 2000) %>%
+  ggplot(aes(x = .prediction, y = condition)) +
+  geom_density_ridges(fill = "#41b6c4") + 
+  theme_minimal()
+
+
Picking joint bandwidth of 0.142
+
+
+

+
+
+

Here, I’m still passing in the unknown level – but the function doesn’t complain, because I’m not including random effects at all! This is the same result from above, when we used NA or ~0 to remove varying effects altogether. This is definitely something to watch for if you are passing in new data (I made this mistake, and it cost me an afternoon!)

+

If we avoid both of these errors, we get what we expect: our means for our original groups, and a new predicted mean for "bugaboo":

+
+
ABC %>%
+  data_grid(condition) %>% 
+  add_row(condition = "bugaboo") %>%
+  add_predicted_draws(m, re_formula = ~(1|condition),
+                        allow_new_levels = TRUE,
+                        n = 2000) %>%
+  ggplot(aes(x = .prediction, y = condition)) +
+  geom_density_ridges(fill = "#41b6c4") +    theme_minimal()
+
+
Picking joint bandwidth of 0.131
+
+
+

+
+
+

Here you can see that the new level is much flatter than the other original five. It comes from the same population as the others, which is rather variable (the group means are sort of different to each other). As a result, this new distribution is quite wide, including all that uncertainty.

+

An ecologist might do something like this if we had data on some species in a community, but wanted to make predictions for new, as yet unobserved, species we might find next year.

+
+
+
+
+

3 Day 3: Offsets

+

In which we cover how to do a count analysis for different efforts or exposure.

+
+

Outline

+
    +
  • Poisson count model
  • +
  • Bayesian Poisson count model
  • +
+
+
+

Today’s data: Dandelions

+

Let’s imagine that we have counted dandelions.

+

Dandelions occur on average 6 per square meter

+

However, we have four kinds of quadrat: 1, 4, 9 and 25 square meters

+
+
library(tidyverse)
+
+imaginary_dandelions <- tibble(quadrat_size = rep(c(1,4, 9, 25), each = 15),
+       n_per_m2 = purrr::map(quadrat_size, rpois, lambda = 6),
+       obs_dandelions = map_dbl(n_per_m2, sum))
+
+ggplot(imaginary_dandelions, aes(x = obs_dandelions)) + geom_histogram() + 
+  facet_wrap(~quadrat_size)
+
+
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+
+
+

+
+
+

How can we get the correct number of dandelions?

+
+
+

Poisson count model

+

\[
\begin{align}
y &\sim \text{Poisson}(\lambda) \\
\text{log}(\lambda) &= \beta
\end{align}
\] \(\lambda\) is the average response. If we want to measure the average per unit effort, we can do that too:

+

\[
\begin{align}
y &\sim \text{Poisson}(\lambda) \\
\text{log}(\lambda/Q) &= \beta
\end{align}
\]

+

\[
\begin{align}
y &\sim \text{Poisson}(\lambda) \\
\text{log}(\lambda) - \text{log}(Q) &= \beta
\end{align}
\]

+

\[
\begin{align}
y &\sim \text{Poisson}(\lambda) \\
\text{log}(\lambda) &= \beta + \text{log}(Q)
\end{align}
\]

+

In other words, we need a way to add a log coefficient to a model and give it a slope of exactly one. Fortunately the function offset() is here to do exactly this:

+
+
dandelion_model <- glm(obs_dandelions ~ 1, family = poisson(link = "log"), data = imaginary_dandelions)
+summary(dandelion_model) 
+
+

+Call:
+glm(formula = obs_dandelions ~ 1, family = poisson(link = "log"), 
+    data = imaginary_dandelions)
+
+Deviance Residuals: 
+   Min      1Q  Median      3Q     Max  
+-9.802  -6.227  -2.658   2.539  11.652  
+
+Coefficients:
+            Estimate Std. Error z value Pr(>|z|)    
+(Intercept)  4.03836    0.01714   235.6   <2e-16 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+(Dispersion parameter for poisson family taken to be 1)
+
+    Null deviance: 2909.7  on 59  degrees of freedom
+Residual deviance: 2909.7  on 59  degrees of freedom
+AIC: 3230.4
+
+Number of Fisher Scoring iterations: 5
+
+
+

This gives the wrong answer!

+
+
dandelion_model <- glm(obs_dandelions ~ 1 + offset(log(quadrat_size)),
+                       family = poisson(link = "log"),
+                       data = imaginary_dandelions)
+summary(dandelion_model) 
+
+

+Call:
+glm(formula = obs_dandelions ~ 1 + offset(log(quadrat_size)), 
+    family = poisson(link = "log"), data = imaginary_dandelions)
+
+Deviance Residuals: 
+     Min        1Q    Median        3Q       Max  
+-1.83462  -0.45999   0.07473   0.46032   2.07858  
+
+Coefficients:
+            Estimate Std. Error z value Pr(>|z|)    
+(Intercept)  1.76109    0.01714   102.7   <2e-16 ***
+---
+Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+(Dispersion parameter for poisson family taken to be 1)
+
+    Null deviance: 43.464  on 59  degrees of freedom
+Residual deviance: 43.464  on 59  degrees of freedom
+AIC: 364.13
+
+Number of Fisher Scoring iterations: 4
+
+
+

The coefficient should be close to 6, after we reverse the link function:

+
+
exp(coef(dandelion_model)[[1]])
+
+
[1] 5.818803
+
+
+
+
+

Do it the Bayes Way

+
+
library(brms)
+
+dandelion_bf <- bf(obs_dandelions ~ 1 + offset(log(quadrat_size)), 
+                   family = poisson(link = "log"))
+
+get_prior(dandelion_bf, data = imaginary_dandelions)
+
+
Intercept ~ student_t(3, 1.89940130916892, 2.5)
+
+
dandelion_prior <- prior(normal(2, 1), class = "Intercept")
+
+dandelion_model <- brm(formula = dandelion_bf,
+                       data = imaginary_dandelions, 
+                       prior = dandelion_prior)
+
+
Compiling Stan program...
+
+
+
Start sampling
+
+
+

+SAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 1).
+Chain 1: 
+Chain 1: Gradient evaluation took 2.3e-05 seconds
+Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.23 seconds.
+Chain 1: Adjust your expectations accordingly!
+Chain 1: 
+Chain 1: 
+Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
+Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
+Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
+Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
+Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
+Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
+Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
+Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
+Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
+Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
+Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
+Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
+Chain 1: 
+Chain 1:  Elapsed Time: 0.028059 seconds (Warm-up)
+Chain 1:                0.026169 seconds (Sampling)
+Chain 1:                0.054228 seconds (Total)
+Chain 1: 
+
+SAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 2).
+Chain 2: 
+Chain 2: Gradient evaluation took 9e-06 seconds
+Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
+Chain 2: Adjust your expectations accordingly!
+Chain 2: 
+Chain 2: 
+Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
+Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
+Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
+Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
+Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
+Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
+Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
+Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
+Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
+Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
+Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
+Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
+Chain 2: 
+Chain 2:  Elapsed Time: 0.02398 seconds (Warm-up)
+Chain 2:                0.024673 seconds (Sampling)
+Chain 2:                0.048653 seconds (Total)
+Chain 2: 
+
+SAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 3).
+Chain 3: 
+Chain 3: Gradient evaluation took 1.2e-05 seconds
+Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.12 seconds.
+Chain 3: Adjust your expectations accordingly!
+Chain 3: 
+Chain 3: 
+Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
+Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
+Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
+Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
+Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
+Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
+Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
+Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
+Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
+Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
+Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
+Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
+Chain 3: 
+Chain 3:  Elapsed Time: 0.029274 seconds (Warm-up)
+Chain 3:                0.02456 seconds (Sampling)
+Chain 3:                0.053834 seconds (Total)
+Chain 3: 
+
+SAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 4).
+Chain 4: 
+Chain 4: Gradient evaluation took 9e-06 seconds
+Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
+Chain 4: Adjust your expectations accordingly!
+Chain 4: 
+Chain 4: 
+Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
+Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
+Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
+Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
+Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
+Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
+Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
+Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
+Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
+Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
+Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
+Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
+Chain 4: 
+Chain 4:  Elapsed Time: 0.023883 seconds (Warm-up)
+Chain 4:                0.021166 seconds (Sampling)
+Chain 4:                0.045049 seconds (Total)
+Chain 4: 
+
+
+

Look at the Stan code:

+
+
stancode(dandelion_model)
+
+
// generated with brms 2.18.0
+functions {
+}
+data {
+  int<lower=1> N;  // total number of observations
+  int Y[N];  // response variable
+  vector[N] offsets;
+  int prior_only;  // should the likelihood be ignored?
+}
+transformed data {
+}
+parameters {
+  real Intercept;  // temporary intercept for centered predictors
+}
+transformed parameters {
+  real lprior = 0;  // prior contributions to the log posterior
+  lprior += normal_lpdf(Intercept | 2, 1);
+}
+model {
+  // likelihood including constants
+  if (!prior_only) {
+    // initialize linear predictor term
+    vector[N] mu = rep_vector(0.0, N);
+    mu += Intercept + offsets;
+    target += poisson_log_lpmf(Y | mu);
+  }
+  // priors including constants
+  target += lprior;
+}
+generated quantities {
+  // actual population-level intercept
+  real b_Intercept = Intercept;
+}
+
+
+

Look at posterior distribution of parameter:

+
+
# as.matrix(dandelion_model) |> head()
+
+library(tidybayes)
+
+tidy_draws(dandelion_model) |> 
+  ggplot(aes(x = exp(b_Intercept))) + 
+  geom_histogram() + 
+  geom_vline(xintercept = 6, col = "red", lwd = 3)
+
+
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+
+
+

+
+
+ + + +
+
+ + +
+
+
+
+ +
+
+No matching items +
+

Footnotes

+ +
    +
  1. this impulse in R to “help your users” by making it possible to say a great deal by saying almost nothing is… actually pretty counterproductive, I’d argue? But that’s a different post↩︎

  2. +
  3. no magic required! rnorm is already vectorized↩︎

  4. +
+

Citation

BibTeX citation:
@online{macdonald2023,
+  author = {Andrew MacDonald},
+  title = {Bayes for the {Busy} {Ecologist}},
+  date = {2023-10-03},
+  url = {https://bios2.github.io/posts/2022-03-22-bayes-for-the-busy-ecologist},
+  langid = {en}
+}
+
For attribution, please cite this work as:
+Andrew MacDonald. 2023. “Bayes for the Busy Ecologist.” +BIOS2 Education Resources. October 3, 2023. https://bios2.github.io/posts/2022-03-22-bayes-for-the-busy-ecologist. +
+ +
+ + + + \ No newline at end of file diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_data_NA-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_data_NA-1.png new file mode 100644 index 0000000..eee6cc9 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_data_NA-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_plot-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_plot-1.png new file mode 100644 index 0000000..fb1d03b Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_plot-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_plot_NA_TRUE-and_zero-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_plot_NA_TRUE-and_zero-1.png new file mode 100644 index 0000000..6a87b26 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_plot_NA_TRUE-and_zero-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_re_condition-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_re_condition-1.png new file mode 100644 index 0000000..aa501d7 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/m_re_condition-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/new_level-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/new_level-1.png new file mode 100644 index 0000000..26f89b6 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/new_level-1.png differ diff --git 
a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/setup-vig-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/setup-vig-1.png new file mode 100644 index 0000000..9ffba3e Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/setup-vig-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-1-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-1-1.png new file mode 100644 index 0000000..3cf80bb Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-1-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-10-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-10-1.png new file mode 100644 index 0000000..faa2e4e Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-10-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-11-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-11-1.png new file mode 100644 index 0000000..12451ec Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-11-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-13-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-13-1.png new file mode 100644 index 0000000..cfbc2a8 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-13-1.png differ diff --git 
a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-14-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-14-1.png new file mode 100644 index 0000000..3f0b227 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-14-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-15-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-15-1.png new file mode 100644 index 0000000..3a92180 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-15-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-21-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-21-1.png new file mode 100644 index 0000000..0b24b51 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-21-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-6-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-6-1.png new file mode 100644 index 0000000..afd9ad8 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-9-1.png b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-9-1.png new file mode 100644 index 0000000..7d60b93 Binary files /dev/null and b/docs/posts/2022-03-22-bayes-for-the-busy-ecologist/index_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/search.json b/docs/search.json index ab6f194..90b8adc 
100644 --- a/docs/search.json +++ b/docs/search.json @@ -41,6 +41,62 @@ "section": "Training material", "text": "Training material\nClick on “Show code” to learn how to do each plot!\n\nInteractive examples\n\n\n\n\nStreamgraph\n\n\nShow code\n# Script to make a streamgraph of the top 10 most popular dog breeds in \n# New York City from 1999 to 2015\n\n# load libraries\nlibrary(lubridate) # dealing with dates\nlibrary(dplyr) # data manipulation\nlibrary(streamgraph) #devtools::install_github(\"hrbrmstr/streamgraph\")\nlibrary(htmlwidgets) # to save the widget!\n\n# load the dataset\n# more information about this dataset can be found here:\n# https://www.kaggle.com/smithaachar/nyc-dog-licensing-clean\nnyc_dogs <- read.csv(\"data/nyc_dogs.csv\")\n\n# convert birth year to date format (and keep only the year)\nnyc_dogs$AnimalBirthYear <- mdy_hms(nyc_dogs$AnimalBirthMonth) %>% year()\n\n# identify 10 most common dogs\ntopdogs <- nyc_dogs %>% count(BreedName) \ntopdogs <- topdogs[order(topdogs$n, decreasing = TRUE),]\n# keep 10 most common breeds (and remove last year of data which is incomplete)\ndf <- filter(nyc_dogs, BreedName %in% topdogs$BreedName[2:11] & AnimalBirthYear < 2016) %>% \n group_by(AnimalBirthYear) %>% \n count(BreedName) %>% ungroup()\n\n# get some nice colours from viridis (magma)\ncols <- viridis::viridis_pal(option = \"magma\")(length(unique(df$BreedName)))\n\n# make streamgraph!\npp <- streamgraph(df, \n key = BreedName, value = n, date = AnimalBirthYear, \n height=\"600px\", width=\"1000px\") %>%\n sg_legend(show=TRUE, label=\"names: \") %>%\n sg_fill_manual(values = cols) \n# saveWidget(pp, file=paste0(getwd(), \"/figures/dogs_streamgraph.html\"))\n\n# plot\npp\n\n\n\n\n\n\n\n\n\n\nInteractive plot\n\n\nShow code\n# Script to generate plots to demonstrate how combinations of information dimensions\n# can become overwhelming and difficult to interpret.\n\n# set-up & data manipulation ---------------------------------------------------\n\n# load 
packages\nlibrary(ggplot2) # for plots, built layer by layer\nlibrary(dplyr) # for data manipulation\nlibrary(magrittr) # for piping\nlibrary(plotly) # interactive plots\n\n# set ggplot theme\ntheme_set(theme_classic() +\n theme(axis.title = element_text(size = 11, face = \"bold\"),\n axis.text = element_text(size = 11),\n plot.title = element_text(size = 13, face = \"bold\"),\n legend.title = element_text(size = 11, face = \"bold\"),\n legend.text = element_text(size = 10)))\n\n# import data\n# more info on this dataset: https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-07-28/readme.md\npenguins <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-28/penguins.csv') \n\n# get some nice colours from viridis (magma)\nsp_cols <- viridis::viridis_pal(option = \"magma\")(5)[2:4]\n\n\n#### Day 1 ####\n\n# 1. Similarity\n\nggplot(penguins) +\n geom_point(aes(y = bill_length_mm, x = bill_depth_mm, col = species), size = 2.5) +\n labs(x = \"Bill depth (mm)\", y = \"Bill length (mm)\", col = \"Species\") + # labels\n scale_color_manual(values = sp_cols) # sets the colour scale we created above \n\n\n\n\n\nShow code\nggsave(\"figures/penguins_similarity.png\", width = 6, height = 3, units = \"in\")\n\n# 2. Proximity\n\ndf <- penguins %>% group_by(sex, species) %>% \n summarise(mean_mass = mean(body_mass_g, na.rm = TRUE)) %>% na.omit() \nggplot(df) +\n geom_bar(aes(y = mean_mass, x = species, fill = sex), \n position = \"dodge\", stat = \"identity\") +\n labs(x = \"Species\", y = \"Mean body mass (g)\", col = \"Sex\") + # labels\n scale_fill_manual(values = sp_cols) # sets the colour scale we created above\n\n\n\n\n\nShow code\nggsave(\"figures/penguins_proximity.png\", width = 6, height = 3, units = \"in\")\n\n# 3. 
Enclosure (Ellipses over a fake PCA)\nggplot(data = penguins, \n aes(y = bill_length_mm, x = bill_depth_mm)) +\n geom_point(size = 2.1, col = \"grey30\") +\n stat_ellipse(aes(col = species), lwd = .7) +\n labs(x = \"PCA1\", y = \"PCA2\", col = \"Species\") + # labels\n scale_color_manual(values = sp_cols) + # sets the colour scale we created above\n theme(axis.text = element_blank(), axis.ticks = element_blank())\n\n\n\n\n\nShow code\nggsave(\"figures/penguins_enclosure.png\", width = 6, height = 3, units = \"in\")\n\n# 4. Mismatched combination of principles\ntemp_palette <- rev(c(sp_cols, \"#1f78b4\", \"#33a02c\"))\nggplot(data = penguins, \n aes(y = bill_length_mm, x = bill_depth_mm)) +\n geom_point(aes(col = sex), size = 2.1) +\n stat_ellipse(aes(col = species), lwd = .7) +\n labs(x = \"Bill depth (mm)\", y = \"Bill length (mm)\", col = \"?\") + # labels\n scale_color_manual(values = temp_palette) # sets the colour scale we created above\n\n\n\n\n\nShow code\nggsave(\"figures/penguins_mismatchedgestalt.png\", width = 6, height = 3, units = \"in\")\n\n\n\n#### Day 2 ####\n\n# 1. Ineffective combinations: Luminance & shading -----------------------------\n\n# create the plot\nggplot(penguins) +\n geom_point(aes(y = bill_length_mm, x = bill_depth_mm, \n col = species, # hue\n alpha = log(body_mass_g)), # luminance\n size = 2.5) +\n labs(x = \"Bill depth (mm)\", y = \"Bill length (mm)\", \n col = \"Species\", alpha = \"Body mass (g)\") +\n scale_color_manual(values = sp_cols)\n\n\n\n\n\nShow code\nggsave(\"figures/penguins_incompatible1.png\", width = 6, height = 3, units = \"in\")\n\n# 2. 
Ineffective combinations: Sizes and shapes --------------------------------\n\nggplot(penguins) +\n geom_point(aes(y = bill_length_mm, x = bill_depth_mm, \n shape = species, # shape\n size = log(body_mass_g)), alpha = .7) + # size\n scale_size(range = c(.1, 5)) + # make sure the sizes are scaled by area and not by radius\n labs(x = \"Bill depth (mm)\", y = \"Bill length (mm)\", \n shape = \"Species\", size = \"Body mass (g)\") \n\n\n\n\n\nShow code\nggsave(\"figures/penguins_incompatible2.png\", width = 6, height = 3, units = \"in\")\n\n# 3. Cognitive overload --------------------------------------------------------\n\n# get some nice colours from viridis (magma)\nsex_cols <- viridis::viridis_pal(option = \"magma\")(8)[c(3,6)]\n\nggplot(na.omit(penguins)) +\n geom_point(aes(y = bill_length_mm, # dimension 1: position along y scale\n x = bill_depth_mm, # dimension 2: position along x scale\n shape = species, # dimension 3: shape\n size = log(body_mass_g), # dimension 4: size\n col = sex), # dimension 5: hue\n alpha = .7) + # size\n scale_size(range = c(.1, 5)) + # make sure the sizes are scaled by area and not by radius\n labs(x = \"Bill depth (mm)\", y = \"Bill length (mm)\", \n shape = \"Species\", size = \"Body mass (g)\", col = \"Sex\") +\n scale_color_manual(values = sex_cols)\n\n\n\n\n\nShow code\nggsave(\"figures/penguins_5dimensions.png\", width = 7, height = 4, units = \"in\")\n\n\n# 4. Panels -------------------------------------------------------------------\n\nggplot(na.omit(penguins)) +\n geom_point(aes(y = bill_length_mm, # dimension 1: position along y scale\n x = bill_depth_mm, # dimension 2: position along x scale\n col = log(body_mass_g)), # dimension 3: hue\n alpha = .7, size = 2) + \n facet_wrap(~ species) + # dimension 4: species!\n # this will create a separate panel for each species\n # note: this also automatically uses the same axes for all panels! 
If you want \n # axes to vary between panels, use the argument scales = \"free\"\n labs(x = \"Bill depth (mm)\", y = \"Bill length (mm)\", col = \"Body mass (g)\") +\n scale_color_viridis_c(option = \"magma\", end = .9, direction = -1) +\n theme_linedraw() + theme(panel.grid = element_blank()) # making the panels prettier\n\n\n\n\n\nShow code\nggsave(\"figures/penguins_dimensions_facets.png\", width = 7, height = 4, units = \"in\")\n\n\n# 5. Interactive ---------------------------------------------------------------\n\np <- na.omit(penguins) %>%\n ggplot(aes(y = bill_length_mm, \n x = bill_depth_mm, \n col = log(body_mass_g))) +\n geom_point(size = 2, alpha = .7) + \n facet_wrap(~ species) +\n labs(x = \"Bill depth (mm)\", y = \"Bill length (mm)\", col = \"Body mass (g)\") +\n scale_color_viridis_c(option = \"magma\", end = .9, direction = -1) +\n theme_linedraw() + theme(panel.grid = element_blank()) # making the panels prettier\np <- ggplotly(p)\n#setwd(\"figures\")\nhtmlwidgets::saveWidget(as_widget(p), \"figures/penguins_interactive.html\")\np\n\n\n\n\n\n\n\n\n\nExample figures\n\n\nShow code\n# Script to make animated plot of volcano eruptions over time\n\n# Load libraries:\nlibrary(dplyr) # data manipulation\nlibrary(ggplot2) # plotting\nlibrary(gganimate) # animation\nlibrary(gifski) # creating gifs\n\n# set ggplot theme\ntheme_set(theme_classic() +\n theme(axis.title = element_text(size = 11, face = \"bold\"),\n axis.text = element_text(size = 11),\n plot.title = element_text(size = 13, face = \"bold\"),\n legend.title = element_text(size = 11, face = \"bold\"),\n legend.text = element_text(size = 10)))\n\n# function to floor a year to the decade\nfloor_decade = function(value){return(value - value %% 10)}\n\n# read data \neruptions <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-12/eruptions.csv')\n\n# select top 5 most frequently exploding volcanoes\ntemp <- group_by(eruptions, volcano_name) %>% 
tally() \ntemp <- temp[order(temp$n, decreasing = TRUE),]\n\n# make a time series dataset (number of eruptions per year)\neruptions$start_decade = floor_decade(eruptions$start_year)\n\n# filter dataset to subset we want to visualize\ndf <- eruptions %>% \n filter(between(start_decade, 1900, 2019)) %>%\n filter(volcano_name %in% temp$volcano_name[1:5]) %>%\n group_by(start_decade) %>%\n count(volcano_name) %>% ungroup()\n\n# plot!\np <- ggplot(df, aes(x = start_decade, y = n, fill = volcano_name)) +\n geom_area() +\n geom_vline(aes(xintercept = start_decade)) + # line that follows the current decade\n scale_fill_viridis_d(option = \"magma\", end = .8) +\n labs(x = \"\", y = \"Number of eruptions\", fill = \"Volcano\",\n title = 'Eruptions of the top 5 most frequently erupting volcanos worldwide') +\n # gganimate part: reveals each decade\n transition_reveal(start_decade) \nanimate(p, duration = 5, fps = 20, width = 800, height = 300, renderer = gifski_renderer())\n\n\n\n\n\nShow code\n#anim_save(\"figures/volcano_eruptions.gif\")\n\n\n\n\nShow code\n# Script to generate plots with various ways of representing uncertainty, based \n# Coffee & Code dataset from https://www.kaggle.com/devready/coffee-and-code/data\n\n# set-up & data manipulation ---------------------------------------------------\n\n# load packages\nlibrary(ggplot2) # for plots, built layer by layer\nlibrary(dplyr) # for data manipulation\nlibrary(magrittr) # for piping\nlibrary(ggridges) # for density ridge plots\nlibrary(patchwork) # great package for \"patching\" plots together!\n\n# set ggplot theme\ntheme_set(theme_classic() +\n theme(axis.title = element_text(size = 11, face = \"bold\"),\n axis.text = element_text(size = 11),\n plot.title = element_text(size = 13, face = \"bold\"),\n legend.title = element_text(size = 11, face = \"bold\"),\n legend.text = element_text(size = 10)))\n\n# import data\ndf <- read.csv(\"data/coffee_code.csv\")\n\n# set labels to be used in all plots\ncoffee_labels <- 
labs(title = \"Does coffee help programmers code?\",\n x = \"Coffee while coding\", \n y = \"Time spent coding \\n(hours/day)\") \n\n# the variable CodingWithoutCoffee is negative, which is harder to understand\n# (i.e. \"No\" means they drink coffee...). So, let's transform it into a more \n# intuitive variable!\ndf$CodingWithCoffee <- gsub(\"No\", \"Usually\", df$CodingWithoutCoffee)\ndf$CodingWithCoffee <- gsub(\"Yes\", \"Rarely\\n or never\", df$CodingWithCoffee)\n# convert to factor and set levels so they show up in a logical order\ndf$CodingWithCoffee <- factor(df$CodingWithCoffee,\n levels = c(\"Rarely\\n or never\", \n \"Sometimes\", \n \"Usually\"))\n\n# calculate summary statistics for the variable of choice\ndf_summary <- group_by(df, CodingWithCoffee) %>%\n summarise(\n # mean\n mean_codinghours = mean(CodingHours), \n # standard deviation\n sd_codinghours = sd(CodingHours), \n # standard error\n se_codinghours = sd(CodingHours)/sqrt(length(CodingHours)))\n\n\n# 1. Error bars (standard error) -----------------------------------------------\n\nggplot(df_summary) +\n geom_errorbar(aes(x = CodingWithCoffee, \n ymin = mean_codinghours - se_codinghours,\n ymax = mean_codinghours + se_codinghours), \n width = .2) +\n geom_point(aes(x = CodingWithCoffee, y = mean_codinghours), \n size = 3) +\n coffee_labels + ylim(0,10)\n\n\n\n\n\nShow code\nggsave(\"figures/coffee_errorbars.png\", width = 5, height = 3, units = \"in\")\n\n# 2. Boxplot -------------------------------------------------------------------\n\nggplot(df) +\n geom_boxplot(aes(x = CodingWithCoffee, y = CodingHours)) +\n coffee_labels\n\n\n\n\n\nShow code\nggsave(\"figures/coffee_boxplot.png\", width = 5, height = 3, units = \"in\")\n\n\n# 3. 
Error bar demonstration ---------------------------------------------------\n\n# get some nice colours from viridis (magma)\nerror_cols <- viridis::viridis_pal(option = \"magma\")(5)[2:4]\n# set labels to be used in the palette\nerror_labels = c(\"standard deviation\",\"95% confidence interval\",\"standard error\")\n\nggplot(df_summary) +\n # show the raw data\n geom_jitter(data = df, aes(x = CodingWithCoffee, \n y = CodingHours),\n alpha = .5, width = .05, col = \"grey\") +\n # standard deviation\n geom_errorbar(aes(x = CodingWithCoffee, \n ymin = mean_codinghours - sd_codinghours,\n ymax = mean_codinghours + sd_codinghours,\n col = \"SD\"), width = .2, lwd = 1) +\n # 95% confidence interval\n geom_errorbar(aes(x = CodingWithCoffee, \n ymin = mean_codinghours - 1.96*se_codinghours,\n ymax = mean_codinghours + 1.96*se_codinghours, \n col = \"CI\"), width = .2, lwd = 1) +\n # standard error\n geom_errorbar(aes(x = CodingWithCoffee, \n ymin = mean_codinghours - se_codinghours,\n ymax = mean_codinghours + se_codinghours, \n col = \"SE\"), width = .2, lwd = 1) +\n geom_point(aes(x = CodingWithCoffee, y = mean_codinghours), \n size = 3) +\n coffee_labels + ylim(c(0,11)) +\n # manual palette/legend set-up!\n scale_colour_manual(name = \"Uncertainty metric\", \n values = c(SD = error_cols[1], \n CI = error_cols[2], \n SE = error_cols[3]),\n labels = error_labels) +\n theme(legend.position = \"top\")\n\n\n\n\n\nShow code\nggsave(\"figures/coffee_bars_demo.png\", width = 7, height = 5, units = \"in\")\n\n\n# 4. 
Jitter plot with violin ---------------------------------------------------\n\nggplot() +\n geom_jitter(data = df, aes(x = CodingWithCoffee, \n y = CodingHours),\n alpha = .5, width = .05, col = \"grey\") +\n geom_violin(data = df, aes(x = CodingWithCoffee, \n y = CodingHours), alpha = 0) +\n geom_linerange(data = df_summary,\n aes(x = CodingWithCoffee, \n ymin = mean_codinghours - se_codinghours,\n ymax = mean_codinghours + se_codinghours)) +\n geom_point(data = df_summary, \n aes(x = CodingWithCoffee, \n y = mean_codinghours), size = 3) +\n coffee_labels\n\n\n\n\n\nShow code\nggsave(\"figures/coffee_violin_jitter.png\", width = 5, height = 3, units = \"in\")\n\n\n# 5. Density ridge plot --------------------------------------------------------\n\nggplot(df) + \n aes(y = CodingWithCoffee, x = CodingHours, fill = stat(x)) +\n geom_density_ridges_gradient(scale = 1.9, size = .2, rel_min_height = 0.005) +\n # colour palette (gradient according to CodingHours)\n scale_fill_viridis_c(option = \"magma\", direction = -1) +\n # remove legend - it's not necessary here!\n theme(legend.position = \"none\") +\n labs(title = coffee_labels$title, \n x = coffee_labels$y, \n y = \"Coffee \\nwhile coding\") + \n theme(axis.title.y = element_text(angle=0, hjust = 1, vjust = .9, \n margin = margin(t = 0, r = -50, b = 0, l = 0)))\n\n\n\n\n\nShow code\nggsave(\"figures/coffee_density_ridges.png\", width = 5, height = 3, units = \"in\")\n\n# 6. Jitter vs. 
Rug plot ------------------------------------------------------------------\n\njitterplot <- ggplot(df, aes(x = CoffeeCupsPerDay, y = CodingHours)) +\n geom_jitter(alpha = .2) +\n geom_smooth(fill = error_cols[1], col = \"black\", method = lm, lwd = .7) +\n coffee_labels + ylim(c(0,11)) + labs(x = \"Cups of coffee (per day)\")\n\nrugplot <- ggplot(df, aes(x = CoffeeCupsPerDay, y = CodingHours)) +\n geom_smooth(fill = error_cols[1], col = \"black\", method = lm, lwd = .7) +\n geom_rug(position=\"jitter\", alpha = .7) + ylim(c(0,11)) +\n coffee_labels + labs(x = \"Cups of coffee (per day)\")\n\n# patch the two plots together\njitterplot + rugplot\n\n\n\n\n\nShow code\n#ggsave(\"figures/coffee_jitter_vs_rugplot.png\", width = 10, height = 4, units = \"in\")\n\n\n\n\nShow code\n# Script to generate 95% confidence intervals of a generated random normal distribution\n# as an example in Day 2: Visualizing uncertainty.\n\n# load library\nlibrary(ggplot2)\nlibrary(magrittr)\nlibrary(dplyr)\n\n# set ggplot theme\ntheme_set(theme_classic() +\n theme(axis.title = element_text(size = 11, face = \"bold\"),\n axis.text = element_text(size = 11),\n plot.title = element_text(size = 13, face = \"bold\"),\n legend.title = element_text(size = 11, face = \"bold\"),\n legend.text = element_text(size = 10)))\n\n# set random seed\nset.seed(22)\n\n# generate population (random normal distribution)\ndf <- data.frame(\"value\" = rnorm(50, mean = 0, sd = 1))\n\n# descriptive stats for each distribution\ndesc_stats = df %>% \n summarise(mean_val = mean(value, na.rm = TRUE),\n se_val = sqrt(var(value)/length(value)))\n\n# build density plot!\np <- ggplot(data = df, aes(x = value, y = ..count..)) +\n geom_density(alpha = .2, lwd = .3) +\n xlim(c(min(df$value-1), max(df$value+1))) \n# extract plotted values\nbase_p <- ggplot_build(p)$data[[1]]\n# shade the 95% confidence interval\np + \n geom_area(data = subset(base_p, \n between(x, \n left = (desc_stats$mean_val - 1.96*desc_stats$se_val),\n 
right = (desc_stats$mean_val + 1.96*desc_stats$se_val))),\n aes(x = x, y = y), fill = \"cadetblue3\", alpha = .6) +\n # add vertical line to show population mean\n geom_vline(aes(xintercept = 0), lty = 2) +\n annotate(\"text\", x = 0.9, y = 19, label = \"True mean\", fontface = \"italic\") +\n # label axis!\n labs(x = \"Variable of interest\", y = \"\") \n\n\n\n\n\nShow code\n#ggsave(\"figures/confidenceinterval_example.png\", width = 5, height = 3.5, units = \"in\")\n\n\n\n\nAnnotated resource library\nThis is an annotated library of data visualization resources we used to build the BIOS² Data Visualization Training, as well as some bonus resources we didn’t have the time to include. Feel free to save this page as a reference for your data visualization adventures!\n\n\nBooks & articles\nFundamentals of Data Visualization A primer on making informative and compelling figures. This is the website for the book “Fundamentals of Data Visualization” by Claus O. Wilke, published by O’Reilly Media, Inc.\nData Visualization: A practical introduction An accessible primer on how to create effective graphics from data using R (mainly ggplot). This book provides a hands-on introduction to the principles and practice of data visualization, explaining what makes some graphs succeed while others fail, how to make high-quality figures from data using powerful and reproducible methods, and how to think about data visualization in an honest and effective way.\nData Science Design (Chapter 6: Visualizing Data) Covers the principles that make standard plot designs work, show how they can be misleading if not properly used, and develop a sense of when graphs might be lying, and how to construct better ones.\nGraphical Perception: Theory, Experimentation, and Application to the Development of Graphical Methods Cleveland, William S., and Robert McGill. 
“Graphical Perception: Theory, Experimentation, and Application to the Development of Graphical Methods.” Journal of the American Statistical Association, vol. 79, no. 387, 1984, pp. 531–554. JSTOR, www.jstor.org/stable/2288400. Accessed 9 Oct. 2020.\nGraphical Perception and Graphical Methods for Analyzing Scientific Data Cleveland, William S., and Robert McGill. “Graphical perception and graphical methods for analyzing scientific data.” Science 229.4716 (1985): 828-833.\nFrom Static to Interactive: Transforming Data Visualization to Improve Transparency Weissgerber TL, Garovic VD, Savic M, Winham SJ, Milic NM (2016) designed an interactive line graph that demonstrates how dynamic alternatives to static graphics for small sample size studies allow for additional exploration of empirical datasets. This simple, free, web-based tool demonstrates the overall concept and may promote widespread use of interactive graphics.\nData visualization: ambiguity as a fellow traveler Research that is being done about how to visualize uncertainty in data visualizations. Marx, V. Nat Methods 10, 613–615 (2013). https://doi.org/10.1038/nmeth.2530\nData visualization standards Collection of guidance and resources to help create better data visualizations with less effort.\n\n\n\nDesign principles\nGestalt Principles for Data Visualization: Similarity, Proximity & Enclosure Short visual guide to the Gestalt Principles.\nWhy scientists need to be better at data visualization A great overview of principles that could improve how we visualize scientific data and results.\nA collection of graphic pitfalls A collection of short articles about common issues with data visualizations that can mislead or obscure your message.\n\n\n\nChoosing a visualization\nData Viz Project This is a great place to get inspiration and guidance about how to choose an appropriate visualization. 
There are many visualizations we are not used to seeing in ecology!\nFrom data to Viz | Find the graphic you need Interactive tool to choose an appropriate visualization type for your data.\n\n\n\nColour\nWhat to consider when choosing colors for data visualization A short, visual guide on things to keep in mind when using colour, such as when and how to use colour gradients, the colour grey, etc.\nColorBrewer: Color Advice for Maps Tool to generate colour palettes for visualizations with colorblind-friendly options. You can also use these palettes in R using the RColorBrewer package, and the scale_*_brewer() (for discrete palettes) or scale_*_distiller() (for continuous palettes) functions in ggplot2.\nColor.review Tool to pick or verify colour palettes with high relative contrast between colours, to ensure your information is readable for everyone.\nCoblis — Color Blindness Simulator Tool to upload an image and view it as they would appear to a colorblind person, with the option to simulate several color-vision deficiencies.\n500+ Named Colours with rgb and hex values List of named colours along with their hex values.\nCartoDB/CartoColor CARTOColors are a set of custom color palettes built on top of well-known standards for color use on maps, with next generation enhancements for the web and CARTO basemaps. Choose from a selection of sequential, diverging, or qualitative schemes for your next CARTO powered visualization using their online module.\n\n\n\nTools\n\nR\nThe R Graph Gallery A collection of charts made with the R programming language. Hundreds of charts are displayed in several sections, always with their reproducible code available. The gallery makes a focus on the tidyverse and ggplot2.\n\nBase R\nCheatsheet: Margins in base R Edit your margins in base R to accommodate axis titles, legends, captions, etc.!\nCustomizing tick marks in base R Seems like a simple thing, but it can be so frustrating! 
This is a great post about customizing tick marks with base plot in R.\nAnimations in R (for time series) If you want to use animations but don’t want to use ggplot2, this demo might help you!\n\n\nggplot2\nCheatsheet: ggplot2 Cheatsheet for ggplot2 in R - anything you want to do is probably covered here!\nCoding Club tutorial: Data Viz Part 1 - Beautiful and informative data visualization Great tutorial demonstrating how to customize titles, subtitles, captions, labels, colour palettes, and themes in ggplot2.\nCoding Club tutorial: Data Viz Part 2 - Customizing your figures Great tutorial demonstrating how to customize titles, subtitles, captions, labels, colour palettes, and themes in ggplot2.\nggplot flipbook A flipbook-style demonstration that builds and customizes plots line by line using ggplot in R.\ngganimate: A Grammar of Animated Graphics Package to create animated graphics in R (with ggplot2).\n\n\n\nPython\nThe Python Graph Gallery This website displays hundreds of charts, always providing the reproducible python code.\nPython Tutorial: Intro to Matplotlib Introduction to basic functionalities of the Python’s library Matplotlib covering basic plots, plot attributes, subplots and plotting the iris dataset.\nThe Art of Effective Visualization of Multi-dimensional Data Covers both univariate (one-dimension) and multivariate (multi-dimensional) data visualization strategies using the Python machine learning ecosystem.\n\n\nJulia\nJulia Plots Gallery Display of various plots with reproducible code in Julia.\nPlots in Julia Documentation for the Plots package in Julia, including demonstrations for animated plots, and links to tutorials.\nAnimations in Julia How to start making animated plots in Julia.\n\n\n\n\nCustomization\nChart Studio Web editor to create interactive plots with plotly. You can download the image as .html, or static images, without coding the figure yourself.\nPhyloPic Vector images of living organisms. 
This is great for ecologists who want to add silhouettes of their organisms onto their plots - search anything, and you will likely find it!\nAdd icons on your R plot Add special icons to your plot as a great way to customize it, and save space with labels!\n\n\n\nInspiration (pretty things!)\nInformation is Beautiful Collection of beautiful original visualizations about a variety of topics!\nTidyTuesday A weekly data project aimed at the R ecosystem, where people wrangle and visualize data in loads of creative ways. Browse what people have created (#TidyTuesday on Twitter is great too!), and the visualizations that have inspired each week’s theme.\nWind currents on Earth Dynamic and interactive map of wind currents on Earth.\nA Day in the Life of Americans Dynamic visualisation of how Americans spend their time in an average day.\n2019: The Year in Visual Stories and Graphics Collection of the most popular visualizations by the New York Times in 2019." }, + { + "objectID": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html", + "href": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html", + "title": "Bayes for the Busy Ecologist", + "section": "", + "text": "This training covers how to write down a model, how to translate that into computer code, how to fit it to data and finally, how to work with the resulting posterior distribution. We’ll use Stan, which is a language for writing Bayesian models, and practice with some of the tools that help us work with Stan: rstanarm, brms, tidybayes, shinystan.\nThis training is intended for experienced users of Bayesian tools and for complete beginners who want to use such tools.\nThis 6 to 8h online workshop was conducted in 4 sessions: March 22, 24, 29, 31, 2022, from 11AM – 1 PM Pacific Time / 12 – 2 PM Mountain Time / 2-4 PM Eastern Time. The training was built and presented by Dr. 
Andrew MacDonald in English with bilingual support throughout.\nAndrew MacDonald is the Training Coordinator of the BIOS² program. He is a quantitative ecologist who works mostly in R and a well-experienced trainer in teaching quantitative and computational methods. He is currently a research professional at Université de Sherbrooke." + }, + { + "objectID": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#outline", + "href": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#outline", + "title": "Bayes for the Busy Ecologist", + "section": "Outline", + "text": "Outline\n\nReturn to previous model: Poisson regression\nPanel regression version of this model\nBayesian workflow\nBrief foray into moment matching\nNonlinear model\nNonlinear model with random effects" + }, + { + "objectID": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#quick-review", + "href": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#quick-review", + "title": "Bayes for the Busy Ecologist", + "section": "Quick review", + "text": "Quick review\n\nBird masses\nThis example is based on work by Marie-Eve at UdeS!\nWe imagine a model like the following:\n\\[\n\\begin{align}\n\\text{Nestlings}_i & \\sim \\text{Poisson}(\\lambda_i) \\\\\n\\text{log}(\\lambda_i) &= \\beta_0 + \\beta_1 \\times \\text{Mass}_i \\\\\n\\beta_0 & \\sim \\text{Normal}(??, ??) \\\\\n\\beta_1 & \\sim \\text{Normal}(??, ??)\n\\end{align}\n\\]\n\\(i\\) keeps track of which bird we are talking about. You can think of it as “bird number i”\nNote: We could also write the model like this:\n\\[\n\\begin{align}\n\\text{Nestlings}_i & \\sim \\text{Poisson}(e^{\\beta_0} \\times e^{\\beta_1 \\times \\text{Mass}_i}) \\\\\n\\beta_0 & \\sim \\text{Normal}(??, ??) \\\\\n\\beta_1 & \\sim \\text{Normal}(??, ??)\n\\end{align}\n\\]\n\n\nCentering variables\nCentering variables is one of the most important things we can do to help our models be more interpretable. 
This also helps us to set good priors.\nCentering a variable means to subtract the mean from the variable:\n\\[\n\\begin{align}\n\\text{Nestlings}_i & \\sim \\text{Poisson}(\\lambda_i) \\\\\n\\text{log}(\\lambda_i) &= \\beta_0 + \\beta_1 \\times (\\text{Mass}_i - \\overline{\\text{Mass}}) \\\\\n\\beta_0 & \\sim \\text{Normal}(??, ??) \\\\\n\\beta_1 & \\sim \\text{Normal}(??, ??)\n\\end{align}\n\\]\nQuestion How does this change the meaning of \\(\\beta_0\\) and/or \\(\\beta_1\\), if at all? (Hint: what will be the equation for a bird who has exactly average mass?)\n\nset.seed(1234)\n\nn_birds <- 15\navg_nestlings_at_avg_mass <- log(4.2)\neffect_of_one_gram <- .2\n\nmother_masses_g <- rnorm(n_birds, mean = 15, sd = 3)\navg_mother_mass <- mean(mother_masses_g)\n\nlog_average_nestlings <- avg_nestlings_at_avg_mass + \n effect_of_one_gram * (mother_masses_g - avg_mother_mass)\n\nnestlings <- rpois(n = n_birds, lambda = exp(log_average_nestlings))\n\nPlot these to get an idea of it:\n\nsuppressPackageStartupMessages(library(tidyverse))\nimaginary_birds <- tibble(mother_masses_g, nestlings)\n\nggplot(imaginary_birds, aes(x = mother_masses_g, y = nestlings)) + \n geom_point()\n\n\n\n\nNOTE We can also fit this very same model by frequentist statistics, using lm\n\ncoef(glm(nestlings ~ 1 + I(mother_masses_g - mean(mother_masses_g)), family = \"poisson\"))\n\n (Intercept) \n 1.4138103 \nI(mother_masses_g - mean(mother_masses_g)) \n 0.1727791 \n\n# compare to known values\navg_nestlings_at_avg_mass\n\n[1] 1.435085\n\neffect_of_one_gram\n\n[1] 0.2\n\n\n\n\nBayesian workflow: define a model and priors\n\nlibrary(brms)\n\nimaginary_birds_centered <- imaginary_birds |> \n mutate(mother_mass_g_cen = mother_masses_g - mean(mother_masses_g))\n\nbird_form <- bf(nestlings ~ 1 + mother_mass_g_cen, family = poisson(link = \"log\"))\n\nget_prior(bird_form, data = imaginary_birds_centered)\n\n prior class coef group resp dpar nlpar lb ub\n (flat) b \n (flat) b mother_mass_g_cen \n 
student_t(3, 1.4, 2.5) Intercept \n source\n default\n (vectorized)\n default\n\n\nWe set a prior for each parameter.\n\nbird_priors <- c(\n prior(normal(1, .5), class = \"Intercept\"),\n prior(normal(.1, .1), class = \"b\", coef = \"mother_mass_g_cen\")\n)\n\n\nPrior predictive checks\n\nprior_predictions <- brm(bird_form,\n data = imaginary_birds_centered,\n prior = bird_priors, \n sample_prior = \"only\", \n file = \"bird_model\",\n file_refit = \"on_change\",\n refresh = FALSE)\n\nPlot a few of these:\n\nlibrary(tidybayes)\nimaginary_birds_centered |> \n add_predicted_draws(prior_predictions, ndraws = 6, seed = 4321) |> \n ggplot(aes(x = mother_masses_g, y = .prediction)) + geom_point() + facet_wrap(~.draw)\n\n\n\n\nQuestion are we satisfied with these priors?\n\n\nFit to the data\n\nbird_posterior <- update(prior_predictions, sample_prior = \"yes\", \n file = \"bird_posterior\", \n file_refit = \"on_change\", refresh = FALSE)\n\nThe desired updates require recompiling the model\n\n\n\nsummary(bird_posterior)\n\n Family: poisson \n Links: mu = log \nFormula: nestlings ~ 1 + mother_mass_g_cen \n Data: imaginary_birds_centered (Number of observations: 15) \n Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;\n total post-warmup draws = 4000\n\nPopulation-Level Effects: \n Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS\nIntercept 1.39 0.13 1.14 1.64 1.00 1979 2070\nmother_mass_g_cen 0.16 0.04 0.07 0.25 1.00 2376 2463\n\nDraws were sampled using sampling(NUTS). 
For each parameter, Bulk_ESS\nand Tail_ESS are effective sample size measures, and Rhat is the potential\nscale reduction factor on split chains (at convergence, Rhat = 1).\n\nknitr::kable(head(tidybayes::tidy_draws(bird_posterior)))\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n.chain\n.iteration\n.draw\nb_Intercept\nb_mother_mass_g_cen\nprior_Intercept\nprior_b_mother_mass_g_cen\nlprior\nlp__\naccept_stat__\nstepsize__\ntreedepth__\nn_leapfrog__\ndivergent__\nenergy__\n\n\n\n\n1\n1\n1\n1.450523\n0.1357033\n0.7548442\n0.2027851\n0.6881775\n-29.18266\n0.9686242\n0.8998242\n2\n7\n0\n29.84270\n\n\n1\n2\n2\n1.415657\n0.1138594\n0.7622771\n0.0676872\n0.8027096\n-29.58861\n0.9127170\n0.8998242\n2\n3\n0\n29.93907\n\n\n1\n3\n3\n1.456546\n0.1777444\n0.7965764\n0.1959839\n0.4387761\n-29.24064\n0.9937692\n0.8998242\n2\n3\n0\n29.87044\n\n\n1\n4\n4\n1.440214\n0.1352149\n0.7316238\n0.1870379\n0.7082742\n-29.17389\n0.9942078\n0.8998242\n2\n7\n0\n29.34977\n\n\n1\n5\n5\n1.436886\n0.1975367\n1.2892340\n0.0588215\n0.3004459\n-29.50809\n0.9286261\n0.8998242\n2\n7\n0\n29.73433\n\n\n1\n6\n6\n1.450670\n0.0961187\n0.4208806\n0.1258187\n0.7508956\n-30.07357\n0.9696699\n0.8998242\n2\n3\n0\n30.30790\n\n\n\n\n\nHow do our priors and posteriors compare?\n\nlibrary(ggridges)\ntidybayes::tidy_draws(bird_posterior) |> \n select(.draw, b_Intercept:prior_b_mother_mass_g_cen) |> \n pivot_longer(-.draw) |> \n ggplot(aes(x = value, y = name)) + geom_density_ridges()\n\n\n\n\nCan we draw the regression line?\n\naverage_mom <- mean(mother_masses_g)\n\nrange(imaginary_birds_centered$mother_mass_g_cen)\n\n[1] -6.025202 4.265214\n\ntibble(mother_mass_g_cen = modelr::seq_range(imaginary_birds_centered$mother_mass_g_cen, \n n = 10)) |> \n tidybayes::add_epred_draws(bird_posterior) |> \n ungroup() |> \n ggplot(aes(x = average_mom + mother_mass_g_cen, y = .epred)) + \n stat_lineribbon() + \n scale_fill_brewer(palette = \"Greens\", direction = -1) + \n geom_point(aes(x = mother_masses_g, y = nestlings),\n data = 
imaginary_birds_centered, pch = 21,\n fill = \"orange\", size = 3)\n\n\n\n\nLet’s also try drawing the prediction intervals.\n\naverage_mom <- mean(mother_masses_g)\n\nrange(imaginary_birds_centered$mother_mass_g_cen)\n\n[1] -6.025202 4.265214\n\ntibble(mother_mass_g_cen = modelr::seq_range(imaginary_birds_centered$mother_mass_g_cen, \n n = 10)) |> \n tidybayes::add_predicted_draws(bird_posterior) |> \n ungroup() |> \n ggplot(aes(x = average_mom + mother_mass_g_cen, y = .prediction)) + \n stat_lineribbon() + \n scale_fill_brewer(palette = \"Greens\", direction = -1) + \n geom_point(aes(x = mother_masses_g, y = nestlings),\n data = imaginary_birds_centered, pch = 21,\n fill = \"orange\", size = 3)\n\n\n\n\nOther checks we can do:\n\nbird_posterior_onlyparam <- update(prior_predictions, sample_prior = \"no\", \n file = \"bird_posterior\", \n file_refit = \"on_change\", refresh = FALSE)\n\nshinystan::launch_shinystan(bird_posterior_onlyparam)" + }, + { + "objectID": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#multilevel-models", + "href": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#multilevel-models", + "title": "Bayes for the Busy Ecologist", + "section": "Multilevel models", + "text": "Multilevel models\nBased on the awesome vignette for vignette for tidybayes\nWe begin by sampling some data from five different “conditions”:\n\nlibrary(modelr)\nset.seed(5)\nn <- 10\nn_condition <- 5\nABC <-\n data_frame(\n condition = rep(c(\"A\", \"B\", \"C\", \"D\", \"E\"), n),\n response = rnorm(n * 5, c(0, 1, 2, 1, -1), 0.5)\n )\n\nABC %>%\n ggplot(aes(y = condition, x = response)) +\n geom_point(pch = 21, size = 4, stroke = 1.4, fill = \"#41b6c4\")\n\n\n\n\nAnd by fitting a model to these data, with varying intercepts for each group:\n\nm <- brm(\n response ~ (1 | condition), data = ABC, \n control = list(adapt_delta = .99),\n prior = c(\n prior(normal(0, 1), class = Intercept),\n prior(student_t(3, 0, 1), class = sd),\n prior(student_t(3, 0, 1), 
class = sigma)\n )\n)\n\nAn easy way to visualize these results is with a ridgeline plot as above\n\nABC %>%\n modelr::data_grid(condition) %>%\n tidybayes::add_predicted_draws(m) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + \n theme_minimal()\n\nPicking joint bandwidth of 0.102\n\n\n\n\n\nAlright. This used the simple vanilla option, add_predicted_samples(m). This uses the default options for making predictions, which recall is “NULL (default), include all group-level effects”. If you set add_predicted_samples(m, re_formula = NULL), you’ll get exactly the same figure.\nSo we can see that to “include” an effect is to take the actual estimated intercepts for each specific group we studied and use them to make new predictions for the same groups. This is Case 1 from McElreath’s list (though in this case, because we only have groups and nothing else, Case 1 and 2 are the same).\nWe can also say the exact same thing using a formula:\n\nABC %>%\n data_grid(condition) %>%\n add_predicted_draws(m, re_formula = ~(1|condition)) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + \n theme_minimal()\n\nPicking joint bandwidth of 0.1\n\n\n\n\n\nThat’s right, there are three ways to say the exact same thing: say nothing, say NULL, or say the original “random effects” formula1. You go with what you feel in your heart is right, but I prefer the formula.\nIn all three cases, we are using the model to predict the means for the groups in our varying-intercepts model. This is what the documentation means by “including” these varying intercepts.\n\nSquishing those random effects\nOK, so that was three separate ways to make predictions for the same groups. What else can we do? 
Let’s try that thing with the NA argument, which means “include no group-level effects”:\n\nABC %>%\n data_grid(condition) %>%\n add_predicted_draws(m, re_formula = NA,\n n = 2000) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + theme_minimal()\n\nPicking joint bandwidth of 0.142\n\n\n\n\n\nAh, so if you do this, all the groups come out the same! But if they’re all the same, what do they represent? It seems reasonable that they represent the model’s intercept, as if the varying intercepts were all 0. Let’s calculate predicitons that ignore the varying effects – that is, using only the model intercept and the standard deviation of the response – using a bit of [handy purrr magic]2:\n\nm %>% \n spread_draws(b_Intercept, sigma) %>% \n mutate(prediction = rnorm(length(b_Intercept), b_Intercept, sigma),\n #map2_dbl(b_Intercept, sigma, ~ rnorm(1, mean = .x, sd = .y)),\n Prediction = \"prediction\") %>% #glimpse %>% \n ggplot(aes(x = prediction, y = Prediction)) +\n geom_density_ridges(fill = \"#41b6c4\") + \n theme_minimal()\n\nPicking joint bandwidth of 0.119\n\n\n\n\n\nAs you can see, this distribution has exactly the same shape as the five in the previous figure! It is as if we calculated the predictions for a group which was exactly at the average (in other words, it had a varying intercept of 0.) In the Rethinking book, readers are taught to do this in a much more explicit way: you actually generate all the 0 intercepts yourself, and give that to the model in place of the estimated intercepts! A very manual and concrete way to “set something to 0”.\nbrms does this too. As the documentation says >NA values within factors in newdata, are interpreted as if all dummy variables of this factor are zero.\nThe brms phrasing certainly takes less space, though it also requires you to remember that this is what NA gets you!\nWe can also remove random effects from our predictions by excluding them from the re_formula. 
In our model, we have only one varying effect – yet an even simpler formula is possible, a model with no intercept at all:\n\nABC %>%\n data_grid(condition) %>%\n add_predicted_draws(m, re_formula = ~ 0,\n n = 2000) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + theme_minimal() \n\nPicking joint bandwidth of 0.14\n\n\n\n\n\nOnce again, the same distribution appears: it is as if all group effects had been set to zero. If we had two random effects and omitted one, this is what we would get for the omitted effect – the expected value if all its effects were 0.\n\n\nNew levels\nI’m going to show how to create predictions for new levels, but first I’m going to show two mistakes that I made frequently while learning:\nFirst, asking for new levels without specifying allow_new_levels = TRUE:\n\n# this does not work at all!!\ndata_frame(condition = \"bugaboo\") %>%\n add_predicted_draws(m, re_formula = ~(1|condition),\n n = 2000)\n\nError: Levels 'bugaboo' of grouping factor 'condition' cannot be found in the fitted model. Consider setting argument 'allow_new_levels' to TRUE.\n\n\nThat fails because I tried to pass in a level of my grouping variable that wasn’t in the original model!\nSecond, passing in new levels – but telling the function to just ignore them:\n\ndata_frame(condition = \"bugaboo\") %>%\n add_predicted_draws(m, re_formula = NA,#~(1|condition),\n n = 2000) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + \n theme_minimal()\n\nPicking joint bandwidth of 0.142\n\n\n\n\n\nHere, I’m still passing in the unknown level – but the function doesn’t complain, because I’m not including random effects at all! This is the same result from above, when we used NA or ~0 to remove varying effects altogether. 
This is definitely something to watch for if you are passing in new data (I made this mistake, and it cost me an afternoon!)\nIf we avoid both of these errors, we get what we expect: our means for our original groups, and a new predicted mean for \"bugaboo\":\n\nABC %>%\n data_grid(condition) %>% \n add_row(condition = \"bugaboo\") %>%\n add_predicted_draws(m, re_formula = ~(1|condition),\n allow_new_levels = TRUE,\n n = 2000) %>%\n ggplot(aes(x = .prediction, y = condition)) +\n geom_density_ridges(fill = \"#41b6c4\") + theme_minimal()\n\nPicking joint bandwidth of 0.131\n\n\n\n\n\nHere you can see that the new level is much flatter than the other original five. It comes from the same population as the others, which is rather variable (the group means are sort of different to each other). As a result, this new distribution is quite wide, including all that uncertainty.\nAn ecologist might do something like this if we were had data on some species in a community, but wanted to make predictions for new, as yet unobserved, species we might find next year." 
+ }, + { + "objectID": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#outline-1", + "href": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#outline-1", + "title": "Bayes for the Busy Ecologist", + "section": "Outline", + "text": "Outline\n\nPoisson count model\nBayesian Poisson count model" + }, + { + "objectID": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#todays-data-dandelions", + "href": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#todays-data-dandelions", + "title": "Bayes for the Busy Ecologist", + "section": "Today’s data: Dandelions", + "text": "Today’s data: Dandelions\nLet’s imagine that we have counted dandelions.\nDandelions occur on average 6 per square meter\nHowever we have five kinds of quadrat: 1, 4, 9 and 25 square meters\n\nlibrary(tidyverse)\n\nimaginary_dandelions <- tibble(quadrat_size = rep(c(1,4, 9, 25), each = 15),\n n_per_m2 = purrr::map(quadrat_size, rpois, lambda = 6),\n obs_dandelions = map_dbl(n_per_m2, sum))\n\nggplot(imaginary_dandelions, aes(x = obs_dandelions)) + geom_histogram() + \n facet_wrap(~quadrat_size)\n\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n\n\n\n\n\nHow can we get the correct number of dandelions?" + }, + { + "objectID": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#poisson-count-model", + "href": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#poisson-count-model", + "title": "Bayes for the Busy Ecologist", + "section": "Poisson count model", + "text": "Poisson count model\n\\[\n\\begin{align}\ny &\\sim \\text{Poisson}(\\lambda) \\\\\n\\text{log}(\\lambda) &= \\beta\n\\end{align}\n\\] \\(\\lambda\\) is the average response. 
If we want to measure the average per unit effort, we can do that too:\n\\[\n\\begin{align}\ny &\\sim \\text{Poisson}(\\lambda) \\\\\n\\text{log}(\\lambda/Q) &= \\beta\n\\end{align}\n\\]\n\\[\n\\begin{align}\ny &\\sim \\text{Poisson}(\\lambda) \\\\\n\\text{log}(\\lambda) - \\text{log}(Q) &= \\beta\n\\end{align}\n\\]\n\\[\n\\begin{align}\ny &\\sim \\text{Poisson}(\\lambda) \\\\\n\\text{log}(\\lambda) &= \\beta + \\text{log}(Q)\n\\end{align}\n\\]\nIn other words, we need a way to add a log coefficient to a model and give it a slope of exactly one. Fortunately the function offset() is here to do exactly this:\n\ndandelion_model <- glm(obs_dandelions ~ 1, family = poisson(link = \"log\"), data = imaginary_dandelions)\nsummary(dandelion_model) \n\n\nCall:\nglm(formula = obs_dandelions ~ 1, family = poisson(link = \"log\"), \n data = imaginary_dandelions)\n\nDeviance Residuals: \n Min 1Q Median 3Q Max \n-9.802 -6.227 -2.658 2.539 11.652 \n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) 4.03836 0.01714 235.6 <2e-16 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for poisson family taken to be 1)\n\n Null deviance: 2909.7 on 59 degrees of freedom\nResidual deviance: 2909.7 on 59 degrees of freedom\nAIC: 3230.4\n\nNumber of Fisher Scoring iterations: 5\n\n\nThis gives the wrong answer!\n\ndandelion_model <- glm(obs_dandelions ~ 1 + offset(log(quadrat_size)),\n family = poisson(link = \"log\"),\n data = imaginary_dandelions)\nsummary(dandelion_model) \n\n\nCall:\nglm(formula = obs_dandelions ~ 1 + offset(log(quadrat_size)), \n family = poisson(link = \"log\"), data = imaginary_dandelions)\n\nDeviance Residuals: \n Min 1Q Median 3Q Max \n-1.83462 -0.45999 0.07473 0.46032 2.07858 \n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) 1.76109 0.01714 102.7 <2e-16 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n(Dispersion parameter for poisson family taken to be 1)\n\n Null deviance: 43.464 on 59 degrees of freedom\nResidual deviance: 43.464 on 59 degrees of freedom\nAIC: 364.13\n\nNumber of Fisher Scoring iterations: 4\n\n\nThe coefficient should be close to 6, after we reverse the link function:\n\nexp(coef(dandelion_model)[[1]])\n\n[1] 5.818803" + }, + { + "objectID": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#do-it-the-bayes-way", + "href": "posts/2022-03-22-bayes-for-the-busy-ecologist/index.html#do-it-the-bayes-way", + "title": "Bayes for the Busy Ecologist", + "section": "Do it the Bayes Way", + "text": "Do it the Bayes Way\n\nlibrary(brms)\n\ndandelion_bf <- bf(obs_dandelions ~ 1 + offset(log(quadrat_size)), \n family = poisson(link = \"log\"))\n\nget_prior(dandelion_bf, data = imaginary_dandelions)\n\nIntercept ~ student_t(3, 1.89940130916892, 2.5)\n\ndandelion_prior <- prior(normal(2, 1), class = \"Intercept\")\n\ndandelion_model <- brm(formula = dandelion_bf,\n data = imaginary_dandelions, \n prior = dandelion_prior)\n\nCompiling Stan program...\n\n\nStart sampling\n\n\n\nSAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 1).\nChain 1: \nChain 1: Gradient evaluation took 2.3e-05 seconds\nChain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.23 seconds.\nChain 1: Adjust your expectations accordingly!\nChain 1: \nChain 1: \nChain 1: Iteration: 1 / 2000 [ 0%] (Warmup)\nChain 1: Iteration: 200 / 2000 [ 10%] (Warmup)\nChain 1: Iteration: 400 / 2000 [ 20%] (Warmup)\nChain 1: Iteration: 600 / 2000 [ 30%] (Warmup)\nChain 1: Iteration: 800 / 2000 [ 40%] (Warmup)\nChain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)\nChain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)\nChain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)\nChain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)\nChain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)\nChain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)\nChain 1: Iteration: 2000 / 2000 [100%] 
(Sampling)\nChain 1: \nChain 1: Elapsed Time: 0.028059 seconds (Warm-up)\nChain 1: 0.026169 seconds (Sampling)\nChain 1: 0.054228 seconds (Total)\nChain 1: \n\nSAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 2).\nChain 2: \nChain 2: Gradient evaluation took 9e-06 seconds\nChain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.\nChain 2: Adjust your expectations accordingly!\nChain 2: \nChain 2: \nChain 2: Iteration: 1 / 2000 [ 0%] (Warmup)\nChain 2: Iteration: 200 / 2000 [ 10%] (Warmup)\nChain 2: Iteration: 400 / 2000 [ 20%] (Warmup)\nChain 2: Iteration: 600 / 2000 [ 30%] (Warmup)\nChain 2: Iteration: 800 / 2000 [ 40%] (Warmup)\nChain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)\nChain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)\nChain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)\nChain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)\nChain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)\nChain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)\nChain 2: Iteration: 2000 / 2000 [100%] (Sampling)\nChain 2: \nChain 2: Elapsed Time: 0.02398 seconds (Warm-up)\nChain 2: 0.024673 seconds (Sampling)\nChain 2: 0.048653 seconds (Total)\nChain 2: \n\nSAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 3).\nChain 3: \nChain 3: Gradient evaluation took 1.2e-05 seconds\nChain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.12 seconds.\nChain 3: Adjust your expectations accordingly!\nChain 3: \nChain 3: \nChain 3: Iteration: 1 / 2000 [ 0%] (Warmup)\nChain 3: Iteration: 200 / 2000 [ 10%] (Warmup)\nChain 3: Iteration: 400 / 2000 [ 20%] (Warmup)\nChain 3: Iteration: 600 / 2000 [ 30%] (Warmup)\nChain 3: Iteration: 800 / 2000 [ 40%] (Warmup)\nChain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)\nChain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)\nChain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)\nChain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)\nChain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)\nChain 3: Iteration: 1800 / 
2000 [ 90%] (Sampling)\nChain 3: Iteration: 2000 / 2000 [100%] (Sampling)\nChain 3: \nChain 3: Elapsed Time: 0.029274 seconds (Warm-up)\nChain 3: 0.02456 seconds (Sampling)\nChain 3: 0.053834 seconds (Total)\nChain 3: \n\nSAMPLING FOR MODEL '712e52da308896603a6860da7d30e1d5' NOW (CHAIN 4).\nChain 4: \nChain 4: Gradient evaluation took 9e-06 seconds\nChain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.\nChain 4: Adjust your expectations accordingly!\nChain 4: \nChain 4: \nChain 4: Iteration: 1 / 2000 [ 0%] (Warmup)\nChain 4: Iteration: 200 / 2000 [ 10%] (Warmup)\nChain 4: Iteration: 400 / 2000 [ 20%] (Warmup)\nChain 4: Iteration: 600 / 2000 [ 30%] (Warmup)\nChain 4: Iteration: 800 / 2000 [ 40%] (Warmup)\nChain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)\nChain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)\nChain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)\nChain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)\nChain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)\nChain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)\nChain 4: Iteration: 2000 / 2000 [100%] (Sampling)\nChain 4: \nChain 4: Elapsed Time: 0.023883 seconds (Warm-up)\nChain 4: 0.021166 seconds (Sampling)\nChain 4: 0.045049 seconds (Total)\nChain 4: \n\n\nLook at the Stan code:\n\nstancode(dandelion_model)\n\n// generated with brms 2.18.0\nfunctions {\n}\ndata {\n int N; // total number of observations\n int Y[N]; // response variable\n vector[N] offsets;\n int prior_only; // should the likelihood be ignored?\n}\ntransformed data {\n}\nparameters {\n real Intercept; // temporary intercept for centered predictors\n}\ntransformed parameters {\n real lprior = 0; // prior contributions to the log posterior\n lprior += normal_lpdf(Intercept | 2, 1);\n}\nmodel {\n // likelihood including constants\n if (!prior_only) {\n // initialize linear predictor term\n vector[N] mu = rep_vector(0.0, N);\n mu += Intercept + offsets;\n target += poisson_log_lpmf(Y | mu);\n }\n // priors including 
constants\n target += lprior;\n}\ngenerated quantities {\n // actual population-level intercept\n real b_Intercept = Intercept;\n}\n\n\nLook at posterior distribution of parameter:\n\n# as.matrix(dandelion_model) |> head()\n\nlibrary(tidybayes)\n\ntidy_draws(dandelion_model) |> \n ggplot(aes(x = exp(b_Intercept))) + \n geom_histogram() + \n geom_vline(xintercept = 6, col = \"red\", lwd = 3)\n\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`." + }, { "objectID": "posts/2021-11-02-introduction-to-gams/index.html", "href": "posts/2021-11-02-introduction-to-gams/index.html", @@ -886,13 +942,13 @@ "href": "index.html", "title": "BIOS² Education resources", "section": "", - "text": "Order By\n Default\n \n Date - Oldest\n \n \n Date - Newest\n \n \n Author\n \n \n \n\n\n\n\n \n\n\n\n\nIntroduction to Microbiome Analysis\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nThis workshop will give an overview of the theory and practice of using metabarcoding approaches to study the diversity of microbial communities. The workshop will give participants an understanding of 1) the current methods for microbiome diversity quantification using metabarcoding/amplicon sequencing approaches and 2) the normalization and diversity analysis approaches that can be used to quantify the diversity of microbial communities.\n\n\n\n\n\n\nMay 19, 2022\n\n\nSteven Kembel, Zihui Wang, Salix Dubois\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Generalized Additive Models (GAMs)\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nTo address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. 
This workshop will focus on how a single tool, the R mgcv package, can be used to fit Generalized Additive Models (GAMs) to data from a wide range of sources.\n\n\n\n\n\n\nNov 2, 2021\n\n\nEric Pedersen\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Shiny Apps\n\n\n\n\n\n\n\nTechnical\n\n\nFellow contributed\n\n\nEN\n\n\n\n\nIntroduction to interactive app development with R Shiny.\n\n\n\n\n\n\nJun 22, 2021\n\n\nKatherine Hébert, Andrew MacDonald, Jake Lawlor, Vincent Bellavance\n\n\n\n\n\n\n \n\n\n\n\nGeneralized Linear Models for Community Ecology\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nIn this workshop we will explore, discuss, and apply generalized linear models to combine information on species distributions, traits, phylogenies, environmental and landscape variation. We will also discuss inference under spatial and phylogenetic autocorrelation under fixed and random effects implementations. We will discuss technical elements and cover implementations using R.\n\n\n\n\n\n\nMay 17, 2021\n\n\nPedro Peres-Neto\n\n\n\n\n\n\n \n\n\n\n\nBuilding R packages\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nThis practical training will cover the basics of modern package development in R with a focus on the following three aspects: (1) how to turn your code into functions, (2) how to write tests and documentation, and (3) how to share your R package on GitHub..\n\n\n\n\n\n\nMay 4, 2021\n\n\nAndrew MacDonald\n\n\n\n\n\n\n \n\n\n\n\nPoint-count Data Analysis\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nAnalysis of point-count data in the presence of variable survey methodologies and detection error offered by Péter Sólymos to BIOS2 Fellows in March 2021.\n\n\n\n\n\n\nMar 25, 2021\n\n\nPéter Sólymos\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to EDI Concepts in a Scientific Context\n\n\n\n\n\n\n\nTransversal competencies\n\n\nFR\n\n\nEN\n\n\n\n\nA short introduction to EDI concepts in a scientific context.\n\n\n\n\n\n\nJan 22, 2021\n\n\nAgathe Riallan, Marie-José Naud\n\n\n\n\n\n\n 
\n\n\n\n\nSpatial Statistics in Ecology\n\n\n\n\n\n\n\nFR\n\n\nEN\n\n\nTechnical\n\n\n\n\nTraining session about statistical analysis of spatial data in ecology, hosted by Philippe Marchand (UQAT). | Session de formation sur l’analyse statistique des données spatiales en écologie, animée par Pr. Philippe Marchand (UQAT).\n\n\n\n\n\n\nJan 12, 2021\n\n\nPhilippe Marchand\n\n\n\n\n\n\n \n\n\n\n\nMaking Websites with HUGO\n\n\n\n\n\n\n\nTechnical\n\n\nTransversal competencies\n\n\nEN\n\n\n\n\nThis workshop provides a general introduction to HUGO, a popular open source framework for building websites without requiring a knowledge of HTML/CSS or web programming.\n\n\n\n\n\n\nDec 7, 2020\n\n\nDominique Gravel, Guillaume Larocque\n\n\n\n\n\n\n \n\n\n\n\nData Visualization\n\n\n\n\n\n\n\nTechnical\n\n\nFellow contributed\n\n\nEN\n\n\n\n\nGeneral principles of visualization and graphic design, and techniques of tailored visualization. This training was developed and delivered by Alex Arkilanian and Katherine Hébert on September 21st and 22nd, 2020.\n\n\n\n\n\n\nSep 21, 2020\n\n\nAlex Arkilanian, Katherine Hébert\n\n\n\n\n\n\n \n\n\n\n\nScience Communication\n\n\n\n\n\n\n\nCareer\n\n\nFellow contributed\n\n\nEN\n\n\n\n\nRecordings, content and handouts from a 6-hour Science Communication workshop held over two days on 15 and 16 June 2020.\n\n\n\n\n\n\nJun 15, 2020\n\n\nGracielle Higino, Katherine Hébert\n\n\n\n\n\n\n \n\n\n\n\nSensibilisation aux réalités autochtones et recherche collaborative\n\n\n\n\n\n\n\nTransversal competencies\n\n\nFR\n\n\n\n\nSérie de deux webinaires sur la sensibilisation aux réalités autochtones et la recherche en collaboration avec les Autochtones, offert du 28 au 30 avril 2020 par Catherine-Alexandra Gagnon, PhD.\n\n\n\n\n\n\nApr 28, 2020\n\n\nCatherine-Alexandra Gagnon\n\n\n\n\n\n\n \n\n\n\n\nMathematical Modeling in Ecology and Evolution\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nThis workshop will introduce participants to the logic behind 
modeling in biology, focusing on developing equations, finding equilibria, analyzing stability, and running simulations.Techniques will be illustrated with the software tools, Mathematica and Maxima. This workshop was held in two parts: January 14 and January 16, 2020.\n\n\n\n\n\n\nJan 14, 2020\n\n\nSarah P. Otto\n\n\n\n\n\n\nNo matching items" + "text": "Order By\n Default\n \n Date - Oldest\n \n \n Date - Newest\n \n \n Author\n \n \n \n\n\n\n\n \n\n\n\n\nBayes for the Busy Ecologist\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nThis workshop presents one idea of a complete workflow for applied Bayesian statistics with real-world models that are actually used by biodiversity scientists.\n\n\n\n\n\n\nOct 3, 2023\n\n\nAndrew MacDonald\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Microbiome Analysis\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nThis workshop will give an overview of the theory and practice of using metabarcoding approaches to study the diversity of microbial communities. The workshop will give participants an understanding of 1) the current methods for microbiome diversity quantification using metabarcoding/amplicon sequencing approaches and 2) the normalization and diversity analysis approaches that can be used to quantify the diversity of microbial communities.\n\n\n\n\n\n\nMay 19, 2022\n\n\nSteven Kembel, Zihui Wang, Salix Dubois\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Generalized Additive Models (GAMs)\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nTo address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. 
This workshop will focus on how a single tool, the R mgcv package, can be used to fit Generalized Additive Models (GAMs) to data from a wide range of sources.\n\n\n\n\n\n\nNov 2, 2021\n\n\nEric Pedersen\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Shiny Apps\n\n\n\n\n\n\n\nTechnical\n\n\nFellow contributed\n\n\nEN\n\n\n\n\nIntroduction to interactive app development with R Shiny.\n\n\n\n\n\n\nJun 22, 2021\n\n\nKatherine Hébert, Andrew MacDonald, Jake Lawlor, Vincent Bellavance\n\n\n\n\n\n\n \n\n\n\n\nGeneralized Linear Models for Community Ecology\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nIn this workshop we will explore, discuss, and apply generalized linear models to combine information on species distributions, traits, phylogenies, environmental and landscape variation. We will also discuss inference under spatial and phylogenetic autocorrelation under fixed and random effects implementations. We will discuss technical elements and cover implementations using R.\n\n\n\n\n\n\nMay 17, 2021\n\n\nPedro Peres-Neto\n\n\n\n\n\n\n \n\n\n\n\nBuilding R packages\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nThis practical training will cover the basics of modern package development in R with a focus on the following three aspects: (1) how to turn your code into functions, (2) how to write tests and documentation, and (3) how to share your R package on GitHub..\n\n\n\n\n\n\nMay 4, 2021\n\n\nAndrew MacDonald\n\n\n\n\n\n\n \n\n\n\n\nPoint-count Data Analysis\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nAnalysis of point-count data in the presence of variable survey methodologies and detection error offered by Péter Sólymos to BIOS2 Fellows in March 2021.\n\n\n\n\n\n\nMar 25, 2021\n\n\nPéter Sólymos\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to EDI Concepts in a Scientific Context\n\n\n\n\n\n\n\nTransversal competencies\n\n\nFR\n\n\nEN\n\n\n\n\nA short introduction to EDI concepts in a scientific context.\n\n\n\n\n\n\nJan 22, 2021\n\n\nAgathe Riallan, Marie-José Naud\n\n\n\n\n\n\n 
\n\n\n\n\nSpatial Statistics in Ecology\n\n\n\n\n\n\n\nFR\n\n\nEN\n\n\nTechnical\n\n\n\n\nTraining session about statistical analysis of spatial data in ecology, hosted by Philippe Marchand (UQAT). | Session de formation sur l’analyse statistique des données spatiales en écologie, animée par Pr. Philippe Marchand (UQAT).\n\n\n\n\n\n\nJan 12, 2021\n\n\nPhilippe Marchand\n\n\n\n\n\n\n \n\n\n\n\nMaking Websites with HUGO\n\n\n\n\n\n\n\nTechnical\n\n\nTransversal competencies\n\n\nEN\n\n\n\n\nThis workshop provides a general introduction to HUGO, a popular open source framework for building websites without requiring a knowledge of HTML/CSS or web programming.\n\n\n\n\n\n\nDec 7, 2020\n\n\nDominique Gravel, Guillaume Larocque\n\n\n\n\n\n\n \n\n\n\n\nData Visualization\n\n\n\n\n\n\n\nTechnical\n\n\nFellow contributed\n\n\nEN\n\n\n\n\nGeneral principles of visualization and graphic design, and techniques of tailored visualization. This training was developed and delivered by Alex Arkilanian and Katherine Hébert on September 21st and 22nd, 2020.\n\n\n\n\n\n\nSep 21, 2020\n\n\nAlex Arkilanian, Katherine Hébert\n\n\n\n\n\n\n \n\n\n\n\nScience Communication\n\n\n\n\n\n\n\nCareer\n\n\nFellow contributed\n\n\nEN\n\n\n\n\nRecordings, content and handouts from a 6-hour Science Communication workshop held over two days on 15 and 16 June 2020.\n\n\n\n\n\n\nJun 15, 2020\n\n\nGracielle Higino, Katherine Hébert\n\n\n\n\n\n\n \n\n\n\n\nSensibilisation aux réalités autochtones et recherche collaborative\n\n\n\n\n\n\n\nTransversal competencies\n\n\nFR\n\n\n\n\nSérie de deux webinaires sur la sensibilisation aux réalités autochtones et la recherche en collaboration avec les Autochtones, offert du 28 au 30 avril 2020 par Catherine-Alexandra Gagnon, PhD.\n\n\n\n\n\n\nApr 28, 2020\n\n\nCatherine-Alexandra Gagnon\n\n\n\n\n\n\n \n\n\n\n\nMathematical Modeling in Ecology and Evolution\n\n\n\n\n\n\n\nTechnical\n\n\nEN\n\n\n\n\nThis workshop will introduce participants to the logic behind 
modeling in biology, focusing on developing equations, finding equilibria, analyzing stability, and running simulations.Techniques will be illustrated with the software tools, Mathematica and Maxima. This workshop was held in two parts: January 14 and January 16, 2020.\n\n\n\n\n\n\nJan 14, 2020\n\n\nSarah P. Otto\n\n\n\n\n\n\nNo matching items" }, { "objectID": "about.html", "href": "about.html", "title": "About", "section": "", - "text": "About this blog\n\n\n\n\n \n \n \n Order By\n Default\n \n Date - Oldest\n \n \n Date - Newest\n \n \n Author\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\nBIOS² Education resources\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n \n\n\n\n\nBiodiversity Modelling 2022\n\n\n\n\n The 2022 edition of the Biodiversity Modelling Summer School was on the theme: Biodiversity changes and data visualization. The course took the form of a workshop during which the students, in collaboration with local organizations involved in biodiversity monitoring, developed a web platform for visualizing biodiversity changes.\n\n\n\n\n\n\nAug 22, 2022\n\n\nDominique Gravel, Vincent Beauregard\n\n\n\n\n\n\n \n\n\n\n\nBuilding R packages\n\n\n\n\nThis practical training will cover the basics of modern package development in R with a focus on the following three aspects: (1) how to turn your code into functions, (2) how to write tests and documentation, and (3) how to share your R package on GitHub..\n\n\n\n\n\n\nMay 4, 2021\n\n\nAndrew MacDonald\n\n\n\n\n\n\n \n\n\n\n\nData Visualization\n\n\n\n\nGeneral principles of visualization and graphic design, and techniques of tailored visualization. 
This training was developed and delivered by Alex Arkilanian and Katherine Hébert on September 21st and 22nd, 2020.\n\n\n\n\n\n\nSep 21, 2020\n\n\nAlex Arkilanian, Katherine Hébert\n\n\n\n\n\n\n \n\n\n\n\nGeneralized Linear Models for Community Ecology\n\n\n\n\nIn this workshop we will explore, discuss, and apply generalized linear models to combine information on species distributions, traits, phylogenies, environmental and landscape variation. We will also discuss inference under spatial and phylogenetic autocorrelation under fixed and random effects implementations. We will discuss technical elements and cover implementations using R.\n\n\n\n\n\n\nMay 17, 2021\n\n\nPedro Peres-Neto\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to EDI Concepts in a Scientific Context\n\n\n\n\nA short introduction to EDI concepts in a scientific context.\n\n\n\n\n\n\nJan 22, 2021\n\n\nAgathe Riallan, Marie-José Naud\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Generalized Additive Models (GAMs)\n\n\n\n\nTo address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. This workshop will focus on how a single tool, the R mgcv package, can be used to fit Generalized Additive Models (GAMs) to data from a wide range of sources.\n\n\n\n\n\n\nNov 2, 2021\n\n\nEric Pedersen\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Microbiome Analysis\n\n\n\n\nThis workshop will give an overview of the theory and practice of using metabarcoding approaches to study the diversity of microbial communities. 
The workshop will give participants an understanding of 1) the current methods for microbiome diversity quantification using metabarcoding/amplicon sequencing approaches and 2) the normalization and diversity analysis approaches that can be used to quantify the diversity of microbial communities.\n\n\n\n\n\n\nMay 19, 2022\n\n\nSteven Kembel, Zihui Wang, Salix Dubois\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Shiny Apps\n\n\n\n\nIntroduction to interactive app development with R Shiny.\n\n\n\n\n\n\nJun 22, 2021\n\n\nKatherine Hébert, Andrew MacDonald, Jake Lawlor, Vincent Bellavance\n\n\n\n\n\n\n \n\n\n\n\nMaking Websites with HUGO\n\n\n\n\nThis workshop provides a general introduction to HUGO, a popular open source framework for building websites without requiring a knowledge of HTML/CSS or web programming.\n\n\n\n\n\n\nDec 7, 2020\n\n\nDominique Gravel, Guillaume Larocque\n\n\n\n\n\n\n \n\n\n\n\nMathematical Modeling in Ecology and Evolution\n\n\n\n\nThis workshop will introduce participants to the logic behind modeling in biology, focusing on developing equations, finding equilibria, analyzing stability, and running simulations.Techniques will be illustrated with the software tools, Mathematica and Maxima. This workshop was held in two parts: January 14 and January 16, 2020.\n\n\n\n\n\n\nJan 14, 2020\n\n\nSarah P. 
Otto\n\n\n\n\n\n\n \n\n\n\n\nPoint-count Data Analysis\n\n\n\n\nAnalysis of point-count data in the presence of variable survey methodologies and detection error offered by Péter Sólymos to BIOS2 Fellows in March 2021.\n\n\n\n\n\n\nMar 25, 2021\n\n\nPéter Sólymos\n\n\n\n\n\n\n \n\n\n\n\nScience Communication\n\n\n\n\nRecordings, content and handouts from a 6-hour Science Communication workshop held over two days on 15 and 16 June 2020.\n\n\n\n\n\n\nJun 15, 2020\n\n\nGracielle Higino, Katherine Hébert\n\n\n\n\n\n\n \n\n\n\n\nSensibilisation aux réalités autochtones et recherche collaborative\n\n\n\n\nSérie de deux webinaires sur la sensibilisation aux réalités autochtones et la recherche en collaboration avec les Autochtones, offert du 28 au 30 avril 2020 par Catherine-Alexandra Gagnon, PhD.\n\n\n\n\n\n\nApr 28, 2020\n\n\nCatherine-Alexandra Gagnon\n\n\n\n\n\n\n \n\n\n\n\nSpatial Statistics in Ecology\n\n\n\n\nTraining session about statistical analysis of spatial data in ecology, hosted by Philippe Marchand (UQAT). | Session de formation sur l’analyse statistique des données spatiales en écologie, animée par Pr. Philippe Marchand (UQAT).\n\n\n\n\n\n\nJan 12, 2021\n\n\nPhilippe Marchand\n\n\n\n\n\n\nNo matching items" + "text": "About this blog\n\n\n\n\n \n \n \n Order By\n Default\n \n Date - Oldest\n \n \n Date - Newest\n \n \n Author\n \n \n \n\n\n\n\n\n\n\n\n\n\n\n\nBIOS² Education resources\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n \n\n\n\n\nBayes for the Busy Ecologist\n\n\n\n\nThis workshop presents one idea of a complete workflow for applied Bayesian statistics with real-world models that are actually used by biodiversity scientists.\n\n\n\n\n\n\nOct 3, 2023\n\n\nAndrew MacDonald\n\n\n\n\n\n\n \n\n\n\n\nBiodiversity Modelling 2022\n\n\n\n\n The 2022 edition of the Biodiversity Modelling Summer School was on the theme: Biodiversity changes and data visualization. 
The course took the form of a workshop during which the students, in collaboration with local organizations involved in biodiversity monitoring, developed a web platform for visualizing biodiversity changes.\n\n\n\n\n\n\nAug 22, 2022\n\n\nDominique Gravel, Vincent Beauregard\n\n\n\n\n\n\n \n\n\n\n\nBuilding R packages\n\n\n\n\nThis practical training will cover the basics of modern package development in R with a focus on the following three aspects: (1) how to turn your code into functions, (2) how to write tests and documentation, and (3) how to share your R package on GitHub..\n\n\n\n\n\n\nMay 4, 2021\n\n\nAndrew MacDonald\n\n\n\n\n\n\n \n\n\n\n\nData Visualization\n\n\n\n\nGeneral principles of visualization and graphic design, and techniques of tailored visualization. This training was developed and delivered by Alex Arkilanian and Katherine Hébert on September 21st and 22nd, 2020.\n\n\n\n\n\n\nSep 21, 2020\n\n\nAlex Arkilanian, Katherine Hébert\n\n\n\n\n\n\n \n\n\n\n\nGeneralized Linear Models for Community Ecology\n\n\n\n\nIn this workshop we will explore, discuss, and apply generalized linear models to combine information on species distributions, traits, phylogenies, environmental and landscape variation. We will also discuss inference under spatial and phylogenetic autocorrelation under fixed and random effects implementations. We will discuss technical elements and cover implementations using R.\n\n\n\n\n\n\nMay 17, 2021\n\n\nPedro Peres-Neto\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to EDI Concepts in a Scientific Context\n\n\n\n\nA short introduction to EDI concepts in a scientific context.\n\n\n\n\n\n\nJan 22, 2021\n\n\nAgathe Riallan, Marie-José Naud\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Generalized Additive Models (GAMs)\n\n\n\n\nTo address the increase in both quantity and complexity of available data, ecologists require flexible, robust statistical models, as well as software to perform such analyses. 
This workshop will focus on how a single tool, the R mgcv package, can be used to fit Generalized Additive Models (GAMs) to data from a wide range of sources.\n\n\n\n\n\n\nNov 2, 2021\n\n\nEric Pedersen\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Microbiome Analysis\n\n\n\n\nThis workshop will give an overview of the theory and practice of using metabarcoding approaches to study the diversity of microbial communities. The workshop will give participants an understanding of 1) the current methods for microbiome diversity quantification using metabarcoding/amplicon sequencing approaches and 2) the normalization and diversity analysis approaches that can be used to quantify the diversity of microbial communities.\n\n\n\n\n\n\nMay 19, 2022\n\n\nSteven Kembel, Zihui Wang, Salix Dubois\n\n\n\n\n\n\n \n\n\n\n\nIntroduction to Shiny Apps\n\n\n\n\nIntroduction to interactive app development with R Shiny.\n\n\n\n\n\n\nJun 22, 2021\n\n\nKatherine Hébert, Andrew MacDonald, Jake Lawlor, Vincent Bellavance\n\n\n\n\n\n\n \n\n\n\n\nMaking Websites with HUGO\n\n\n\n\nThis workshop provides a general introduction to HUGO, a popular open source framework for building websites without requiring a knowledge of HTML/CSS or web programming.\n\n\n\n\n\n\nDec 7, 2020\n\n\nDominique Gravel, Guillaume Larocque\n\n\n\n\n\n\n \n\n\n\n\nMathematical Modeling in Ecology and Evolution\n\n\n\n\nThis workshop will introduce participants to the logic behind modeling in biology, focusing on developing equations, finding equilibria, analyzing stability, and running simulations.Techniques will be illustrated with the software tools, Mathematica and Maxima. This workshop was held in two parts: January 14 and January 16, 2020.\n\n\n\n\n\n\nJan 14, 2020\n\n\nSarah P. 
Otto\n\n\n\n\n\n\n \n\n\n\n\nPoint-count Data Analysis\n\n\n\n\nAnalysis of point-count data in the presence of variable survey methodologies and detection error offered by Péter Sólymos to BIOS2 Fellows in March 2021.\n\n\n\n\n\n\nMar 25, 2021\n\n\nPéter Sólymos\n\n\n\n\n\n\n \n\n\n\n\nScience Communication\n\n\n\n\nRecordings, content and handouts from a 6-hour Science Communication workshop held over two days on 15 and 16 June 2020.\n\n\n\n\n\n\nJun 15, 2020\n\n\nGracielle Higino, Katherine Hébert\n\n\n\n\n\n\n \n\n\n\n\nSensibilisation aux réalités autochtones et recherche collaborative\n\n\n\n\nSérie de deux webinaires sur la sensibilisation aux réalités autochtones et la recherche en collaboration avec les Autochtones, offert du 28 au 30 avril 2020 par Catherine-Alexandra Gagnon, PhD.\n\n\n\n\n\n\nApr 28, 2020\n\n\nCatherine-Alexandra Gagnon\n\n\n\n\n\n\n \n\n\n\n\nSpatial Statistics in Ecology\n\n\n\n\nTraining session about statistical analysis of spatial data in ecology, hosted by Philippe Marchand (UQAT). | Session de formation sur l’analyse statistique des données spatiales en écologie, animée par Pr. 
Philippe Marchand (UQAT).\n\n\n\n\n\n\nJan 12, 2021\n\n\nPhilippe Marchand\n\n\n\n\n\n\nNo matching items" } ] \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml index c3eb2e4..66b19dc 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -2,66 +2,70 @@ https://bios2.github.io/posts/2020-04-28-sensibilisation-aux-ralits-autochtones-et-recherche-collaborative/index.html - 2022-12-16T19:04:23.792Z + 2022-12-16T19:45:26.012Z https://bios2.github.io/posts/2020-09-21-data-visualization/index.html - 2022-12-16T19:04:25.126Z + 2022-12-16T19:45:27.768Z + + + https://bios2.github.io/posts/2022-03-22-bayes-for-the-busy-ecologist/index.html + 2022-12-16T19:45:29.374Z https://bios2.github.io/posts/2021-11-02-introduction-to-gams/index.html - 2022-12-16T19:04:25.993Z + 2022-12-16T19:45:30.415Z https://bios2.github.io/posts/2020-01-14-mathematical-modeling-in-ecology-and-evolution/index.html - 2022-12-16T19:04:26.595Z + 2022-12-16T19:45:31.017Z https://bios2.github.io/posts/2020-06-15-science-communication/index.html - 2022-12-16T19:04:27.129Z + 2022-12-16T19:45:31.578Z https://bios2.github.io/posts/2021-06-22-introduction-to-shiny-apps/index.html - 2022-12-16T19:04:28.237Z + 2022-12-16T19:45:32.992Z https://bios2.github.io/posts/2021-05-04-building-r-packages/index.html - 2022-12-16T19:04:29.050Z + 2022-12-16T19:45:33.957Z https://bios2.github.io/posts/2021-01-12-4-day-training-in-spatial-statistics-with-philippe-marchand/index.html - 2022-12-16T19:04:34.884Z + 2022-12-16T19:45:40.269Z https://bios2.github.io/posts/2022-05-19-introduction-to-microbiome-analysis/index.html - 2022-12-16T19:05:22.138Z + 2022-12-16T19:45:42.725Z https://bios2.github.io/posts/2020-12-07-making-websites-with-hugo/index.html - 2022-12-16T19:04:37.979Z + 2022-12-16T19:45:43.683Z https://bios2.github.io/posts/2021-01-22-introduction-aux-concepts-edi-en-contexte-scientifique/index.html - 2022-12-16T19:04:38.457Z + 2022-12-16T19:45:44.255Z 
https://bios2.github.io/posts/2021-03-25-point-count-data-analysis/index.html - 2022-12-16T19:04:39.244Z + 2022-12-16T19:45:45.429Z https://bios2.github.io/posts/2021-07-19-glm-community-ecology/index.html - 2022-12-16T19:04:42.308Z + 2022-12-16T19:45:49.277Z https://bios2.github.io/summer-schools/BiodiversityModelling2022.html - 2022-12-16T19:04:42.735Z + 2022-12-16T19:45:49.706Z https://bios2.github.io/index.html - 2022-12-16T19:05:07.213Z + 2022-12-16T19:45:50.506Z https://bios2.github.io/about.html - 2022-12-16T19:04:43.939Z + 2022-12-16T19:45:51.398Z diff --git a/posts/2022-03-22-bayes-for-the-busy-ecologist/bird_model.rds b/posts/2022-03-22-bayes-for-the-busy-ecologist/bird_model.rds new file mode 100644 index 0000000..133a21d Binary files /dev/null and b/posts/2022-03-22-bayes-for-the-busy-ecologist/bird_model.rds differ diff --git a/posts/2022-03-22-bayes-for-the-busy-ecologist/bird_posterior.rds b/posts/2022-03-22-bayes-for-the-busy-ecologist/bird_posterior.rds new file mode 100644 index 0000000..5f12ad2 Binary files /dev/null and b/posts/2022-03-22-bayes-for-the-busy-ecologist/bird_posterior.rds differ diff --git a/posts/2022-03-22-bayes-for-the-busy-ecologist/index.qmd b/posts/2022-03-22-bayes-for-the-busy-ecologist/index.qmd new file mode 100644 index 0000000..f4e6b16 --- /dev/null +++ b/posts/2022-03-22-bayes-for-the-busy-ecologist/index.qmd @@ -0,0 +1,828 @@ +--- +title: "Bayes for the Busy Ecologist" +description: "This workshop presents one idea of a complete workflow for applied Bayesian statistics with real-world models that are actually used by biodiversity scientists." 
+author: + - name: "Andrew MacDonald" + affiliation: "BIOS² and Université de Sherbrooke" +date: "22-03-2022" +image: image.jpg +categories: [Technical, EN] +toc: true +number-sections: true +number-depth: 1 +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, warning = FALSE) +# set a common theme for all ggplots +ggplot2::theme_set(ggplot2::theme_minimal()) +``` + +# Applied Bayesian models for working ecologists {-} + +This training covers how to write down a model, how to translate that into computer code, how to fit it to data and finally, how to work with the resulting posterior distribution. We’ll use Stan, which is a language for writing Bayesian models, and practice with some of the tools that help us work with Stan: rstanarm, brms, tidybayes, shinystan. + +This training is intended for experienced users of Bayesian tools and for complete beginners who want to use such tools. + +This 6 to 8h online workshop was conducted in 4 sessions: March 22, 24, 29, 31, 2022, from 11AM – 1 PM Pacific Time / 12 – 2 PM Mountain Time / 2-4 PM Eastern Time. The training was built and presented by Dr. Andrew MacDonald in English with bilingual support throughout. + +Andrew MacDonald is the Training Coordinator of the BIOS² program. He is a quantitative ecologist who works mostly in R and a well-experienced trainer in teaching quantitative and computational methods. He is currently a research professional at Université de Sherbrooke. + +# Day 1 + +# Day 2: Hierarchical and nonlinear models + +In which we discuss many groups and curving lines. + +## Outline + +* Return to previous model: Poisson regression +* Panel regression version of this model +* Bayesian workflow +* Brief foray into moment matching +* Nonlinear model +* Nonlinear model with random effects + +## Quick review + +### Bird masses + +This example is based on work by Marie-Eve at UdeS! 
+ +We imagine a model like the following: + +$$ +\begin{align} +\text{Nestlings}_i & \sim \text{Poisson}(\lambda_i) \\ +\text{log}(\lambda_i) &= \beta_0 + \beta_1 \times \text{Mass}_i \\ +\beta_0 & \sim \text{Normal}(??, ??) \\ +\beta_1 & \sim \text{Normal}(??, ??) +\end{align} +$$ + + +$i$ keeps track of which bird we are talking about. You can think of it as "bird number i" + +Note: We could also write the model like this: + +$$ +\begin{align} +\text{Nestlings}_i & \sim \text{Poisson}(e^{\beta_0} \times e^{\beta_1 \times \text{Mass}_i}) \\ +\beta_0 & \sim \text{Normal}(??, ??) \\ +\beta_1 & \sim \text{Normal}(??, ??) +\end{align} +$$ + +### Centering variables + +Centering variables is one of the most important things we can do to help our models be more interpretable. This also helps us to set good priors. + +Centering a variable means to subtract the mean from the variable: + +$$ +\begin{align} +\text{Nestlings}_i & \sim \text{Poisson}(\lambda_i) \\ +\text{log}(\lambda_i) &= \beta_0 + \beta_1 \times (\text{Mass}_i - \overline{\text{Mass}}) \\ +\beta_0 & \sim \text{Normal}(??, ??) \\ +\beta_1 & \sim \text{Normal}(??, ??) +\end{align} +$$ + +*Question* How does this change the meaning of $\beta_0$ and/or $\beta_1$, if at all? (Hint: what will be the equation for a bird who has exactly average mass?) 
+ +```{r bird-simulation} +set.seed(1234) + +n_birds <- 15 +avg_nestlings_at_avg_mass <- log(4.2) +effect_of_one_gram <- .2 + +mother_masses_g <- rnorm(n_birds, mean = 15, sd = 3) +avg_mother_mass <- mean(mother_masses_g) + +log_average_nestlings <- avg_nestlings_at_avg_mass + + effect_of_one_gram * (mother_masses_g - avg_mother_mass) + +nestlings <- rpois(n = n_birds, lambda = exp(log_average_nestlings)) +``` + +Plot these to get an idea of it: + +```{r, message=FALSE} +suppressPackageStartupMessages(library(tidyverse)) +imaginary_birds <- tibble(mother_masses_g, nestlings) + +ggplot(imaginary_birds, aes(x = mother_masses_g, y = nestlings)) + + geom_point() +``` + +*NOTE* We can also fit this very same model by frequentist statistics, using `lm` + +```{r} +coef(glm(nestlings ~ 1 + I(mother_masses_g - mean(mother_masses_g)), family = "poisson")) +# compare to known values +avg_nestlings_at_avg_mass +effect_of_one_gram +``` + +### Bayesian workflow: define a model and priors + +```{r, message=FALSE} +library(brms) + +imaginary_birds_centered <- imaginary_birds |> + mutate(mother_mass_g_cen = mother_masses_g - mean(mother_masses_g)) + +bird_form <- bf(nestlings ~ 1 + mother_mass_g_cen, family = poisson(link = "log")) + +get_prior(bird_form, data = imaginary_birds_centered) +``` + +We set a prior for each parameter. 
+ +```{r} +bird_priors <- c( + prior(normal(1, .5), class = "Intercept"), + prior(normal(.1, .1), class = "b", coef = "mother_mass_g_cen") +) +``` + +#### Prior predictive checks + +```{r} +prior_predictions <- brm(bird_form, + data = imaginary_birds_centered, + prior = bird_priors, + sample_prior = "only", + file = "bird_model", + file_refit = "on_change", + refresh = FALSE) +``` + +Plot a few of these: + +```{r, message=FALSE} +library(tidybayes) +imaginary_birds_centered |> + add_predicted_draws(prior_predictions, ndraws = 6, seed = 4321) |> + ggplot(aes(x = mother_masses_g, y = .prediction)) + geom_point() + facet_wrap(~.draw) +``` + +*Question* are we satisfied with these priors? + +#### Fit to the data + +```{r} +bird_posterior <- update(prior_predictions, sample_prior = "yes", + file = "bird_posterior", + file_refit = "on_change", refresh = FALSE) +``` + +```{r} +summary(bird_posterior) + +knitr::kable(head(tidybayes::tidy_draws(bird_posterior))) +``` + +How do our priors and posteriors compare? + +```{r, message=FALSE} +library(ggridges) +tidybayes::tidy_draws(bird_posterior) |> + select(.draw, b_Intercept:prior_b_mother_mass_g_cen) |> + pivot_longer(-.draw) |> + ggplot(aes(x = value, y = name)) + geom_density_ridges() +``` + +Can we draw the regression line? + +```{r, warning = FALSE} +average_mom <- mean(mother_masses_g) + +range(imaginary_birds_centered$mother_mass_g_cen) + +tibble(mother_mass_g_cen = modelr::seq_range(imaginary_birds_centered$mother_mass_g_cen, + n = 10)) |> + tidybayes::add_epred_draws(bird_posterior) |> + ungroup() |> + ggplot(aes(x = average_mom + mother_mass_g_cen, y = .epred)) + + stat_lineribbon() + + scale_fill_brewer(palette = "Greens", direction = -1) + + geom_point(aes(x = mother_masses_g, y = nestlings), + data = imaginary_birds_centered, pch = 21, + fill = "orange", size = 3) + + +``` + +Let's also try drawing the prediction intervals. 
+ +```{r} +average_mom <- mean(mother_masses_g) + +range(imaginary_birds_centered$mother_mass_g_cen) + +tibble(mother_mass_g_cen = modelr::seq_range(imaginary_birds_centered$mother_mass_g_cen, + n = 10)) |> + tidybayes::add_predicted_draws(bird_posterior) |> + ungroup() |> + ggplot(aes(x = average_mom + mother_mass_g_cen, y = .prediction)) + + stat_lineribbon() + + scale_fill_brewer(palette = "Greens", direction = -1) + + geom_point(aes(x = mother_masses_g, y = nestlings), + data = imaginary_birds_centered, pch = 21, + fill = "orange", size = 3) + + +``` + +Other checks we can do: + +```{r eval=FALSE} +bird_posterior_onlyparam <- update(prior_predictions, sample_prior = "no", + file = "bird_posterior", + file_refit = "on_change", refresh = FALSE) + +shinystan::launch_shinystan(bird_posterior_onlyparam) +``` + + +## Multilevel models + +Based on the awesome vignette for vignette for `tidybayes` + +We begin by sampling some data from five different "conditions": + +```{r setup-vig, message=FALSE} +library(modelr) +set.seed(5) +n <- 10 +n_condition <- 5 +ABC <- + data_frame( + condition = rep(c("A", "B", "C", "D", "E"), n), + response = rnorm(n * 5, c(0, 1, 2, 1, -1), 0.5) + ) + +ABC %>% + ggplot(aes(y = condition, x = response)) + + geom_point(pch = 21, size = 4, stroke = 1.4, fill = "#41b6c4") +``` + +And by fitting a model to these data, with varying intercepts for each group: + +```{r MODEL_m, message=FALSE, warning=FALSE, results='hide'} +m <- brm( + response ~ (1 | condition), data = ABC, + control = list(adapt_delta = .99), + prior = c( + prior(normal(0, 1), class = Intercept), + prior(student_t(3, 0, 1), class = sd), + prior(student_t(3, 0, 1), class = sigma) + ) +) +``` + +An easy way to visualize these results is with a _ridgeline plot_ as above + +```{r m_plot} +ABC %>% + modelr::data_grid(condition) %>% + tidybayes::add_predicted_draws(m) %>% + ggplot(aes(x = .prediction, y = condition)) + + geom_density_ridges(fill = "#41b6c4") + + theme_minimal() +``` + 
+Alright. This used the simple vanilla option, `add_predicted_samples(m)`. This uses the default options for making predictions, which recall is "NULL (default), include all group-level effects". If you set `add_predicted_samples(m, re_formula = NULL)`, you'll get exactly the same figure. + +So we can see that to "include" an effect is to take the actual estimated intercepts for each _specific group we studied_ and use them to make new predictions for the same groups. This is **Case 1** from McElreath's list (though in this case, because we only have groups and nothing else, Case 1 and 2 are the same). + +We can also say the **exact same thing** using a formula: + +```{r m_re_condition} +ABC %>% + data_grid(condition) %>% + add_predicted_draws(m, re_formula = ~(1|condition)) %>% + ggplot(aes(x = .prediction, y = condition)) + + geom_density_ridges(fill = "#41b6c4") + + theme_minimal() +``` + +That's right, there are three ways to say the exact same thing: say nothing, say `NULL`, or say the original "random effects" formula^[this impulse in R to "help your users" by making it possible to say a great deal by saying almost nothing is... actually pretty counterproductive, I'd argue? But that's a different post]. You go with what you feel in your heart is right, but I prefer the formula. + +In all three cases, we are using the model to predict the means for the groups in our varying-intercepts model. This is what the documentation means by "including" these varying intercepts. + +### Squishing those random effects + +OK, so that was three separate ways to make predictions for the _same_ groups. What else can we do? 
Let's try that thing with the `NA` argument, which means "include no group-level effects": + +```{r m_plot_NA_TRUE-and_zero} +ABC %>% + data_grid(condition) %>% + add_predicted_draws(m, re_formula = NA, + n = 2000) %>% + ggplot(aes(x = .prediction, y = condition)) + + geom_density_ridges(fill = "#41b6c4") + theme_minimal() +``` + +Ah, so if you do this, all the groups come out the same! But if they're all the same, what do they represent? It seems reasonable that they represent the model's intercept, as if the varying intercepts were all 0. Let's calculate predicitons that ignore the varying effects -- that is, using only the model intercept and the standard deviation of the response -- using a bit of [handy `purrr` magic]^[no magic required! `rnorm` is already vectorized]: + + +```{r} +m %>% + spread_draws(b_Intercept, sigma) %>% + mutate(prediction = rnorm(length(b_Intercept), b_Intercept, sigma), + #map2_dbl(b_Intercept, sigma, ~ rnorm(1, mean = .x, sd = .y)), + Prediction = "prediction") %>% #glimpse %>% + ggplot(aes(x = prediction, y = Prediction)) + + geom_density_ridges(fill = "#41b6c4") + + theme_minimal() +``` + +As you can see, this distribution has exactly the same shape as the five in the previous figure! It is as if we calculated the predictions for a group which was exactly at the average (in other words, it had a varying intercept of 0.) In the Rethinking book, readers are taught to do this in a much more explicit way: you actually generate all the 0 intercepts yourself, and give that to the model in place of the estimated intercepts! A very manual and concrete way to "set something to 0". + +`brms` does this too. As the documentation says +>NA values within factors in newdata, are interpreted as if all dummy variables of this factor are zero. + +The `brms` phrasing certainly takes less space, though it also requires you to remember that this is what NA gets you! 
+ +We can also remove random effects from our predictions by excluding them from the `re_formula`. In our model, we have only one varying effect -- yet an even simpler formula is possible, a model with no intercept at all: + +```{r} +ABC %>% + data_grid(condition) %>% + add_predicted_draws(m, re_formula = ~ 0, + n = 2000) %>% + ggplot(aes(x = .prediction, y = condition)) + + geom_density_ridges(fill = "#41b6c4") + theme_minimal() +``` + +Once again, the same distribution appears: it is as if all group effects had been set to zero. If we had two random effects and omitted one, this is what we would get for the omitted effect -- the expected value if all its effects were 0. + +### New levels + +I'm going to show how to create predictions for new levels, but first I'm going to show two mistakes that I made frequently while learning: + +First, asking for new levels without specifying `allow_new_levels = TRUE`: + +```{r m_failur_no_newlevel, error=TRUE} +# this does not work at all!! +data_frame(condition = "bugaboo") %>% + add_predicted_draws(m, re_formula = ~(1|condition), + n = 2000) +``` + +That fails because I tried to pass in a level of my grouping variable that _wasn't_ in the original model! + +Second, passing in new levels -- but telling the function to just ignore them: + +```{r m_data_NA} +data_frame(condition = "bugaboo") %>% + add_predicted_draws(m, re_formula = NA,#~(1|condition), + n = 2000) %>% + ggplot(aes(x = .prediction, y = condition)) + + geom_density_ridges(fill = "#41b6c4") + + theme_minimal() +``` + +Here, I'm still passing in the unknown level -- but the function doesn't complain, because I'm not including random effects at all! This is the same result from above, when we used `NA` or `~0` to remove varying effects altogether. This is definitely something to watch for if you are passing in new data (I made this mistake, and it cost me an afternoon!) 
+ +If we avoid both of these errors, we get what we expect: our means for our original groups, and a new predicted mean for `"bugaboo"`: + +```{r, new_level} +ABC %>% + data_grid(condition) %>% + add_row(condition = "bugaboo") %>% + add_predicted_draws(m, re_formula = ~(1|condition), + allow_new_levels = TRUE, + n = 2000) %>% + ggplot(aes(x = .prediction, y = condition)) + + geom_density_ridges(fill = "#41b6c4") + theme_minimal() +``` + +Here you can see that the new level is much flatter than the other original five. It comes from the same population as the others, which is rather variable (the group means are sort of different to each other). As a result, this new distribution is quite wide, including all that uncertainty. + +An ecologist might do something like this if we were had data on _some_ species in a community, but wanted to make predictions for new, as yet unobserved, species we might find next year. + +# Day 3: Offsets + +In which we cover how to do a count analysis for different efforts or exposure. + +## Outline + +- Poisson count model +- Bayesian Poisson count model + +## Data: Dandelion counts + +Let's imagine that we have counted dandelions. + +Dandelions occur on average 6 per square meter. + +However, we have five kinds of quadrat: 1, 4, 9 and 25 square meters. + +```{r} +library(tidyverse) + +imaginary_dandelions <- tibble(quadrat_size = rep(c(1,4, 9, 25), each = 15), + n_per_m2 = purrr::map(quadrat_size, rpois, lambda = 6), + obs_dandelions = map_dbl(n_per_m2, sum)) + +ggplot(imaginary_dandelions, aes(x = obs_dandelions)) + geom_histogram() + + facet_wrap(~quadrat_size) +``` + +How can we get the correct number of dandelions? + +## Poisson count model + +$$ +\begin{align} +y &\sim \text{Poisson}(\lambda) \\ +\text{log}(\lambda) &= \beta +\end{align} +$$ +$\lambda$ is the average response. 
If we want to measure the average _per unit effort_, we can do that too: + +$$ +\begin{align} +y &\sim \text{Poisson}(\lambda) \\ +\text{log}(\lambda/Q) &= \beta +\end{align} +$$ + + + + +$$ +\begin{align} +y &\sim \text{Poisson}(\lambda) \\ +\text{log}(\lambda) - \text{log}(Q) &= \beta +\end{align} +$$ + + +$$ +\begin{align} +y &\sim \text{Poisson}(\lambda) \\ +\text{log}(\lambda) &= \beta + \text{log}(Q) +\end{align} +$$ + +In other words, we need a way to add a log coefficient to a model and give it a slope of _exactly one_. Fortunately the function `offset()` is here to do exactly this: + +```{r} +dandelion_model <- glm(obs_dandelions ~ 1, family = poisson(link = "log"), data = imaginary_dandelions) +summary(dandelion_model) + +``` + +This gives the wrong answer! + +```{r} +dandelion_model <- glm(obs_dandelions ~ 1 + offset(log(quadrat_size)), + family = poisson(link = "log"), + data = imaginary_dandelions) +summary(dandelion_model) +``` + +The coefficient should be close to 6, after we reverse the link function: + +```{r} +exp(coef(dandelion_model)[[1]]) +``` + +## Do it the Bayes Way + +```{r} +library(brms) + +dandelion_bf <- bf(obs_dandelions ~ 1 + offset(log(quadrat_size)), + family = poisson(link = "log")) + +get_prior(dandelion_bf, data = imaginary_dandelions) + +dandelion_prior <- prior(normal(2, 1), class = "Intercept") + +dandelion_model <- brm(formula = dandelion_bf, + data = imaginary_dandelions, + prior = dandelion_prior) + +``` + +Look at the Stan code: + +```{r} +stancode(dandelion_model) +``` + +Look at posterior distribution of parameter: + +```{r} +# as.matrix(dandelion_model) |> head() + +library(tidybayes) + +tidy_draws(dandelion_model) |> + ggplot(aes(x = exp(b_Intercept))) + + geom_histogram() + + geom_vline(xintercept = 6, col = "red", lwd = 3) +``` + +# Day 3: Nonlinear models + +In which we consider that nature has no straight lines. + +Fitting nonlinear models to ecological data is interesting and powerful. 
+ +This is possible in base R using the function `nls()`. +In a Bayesian approach we can do the same thing, but we don't need to learn any new tools. + +## Data: Hemlock growth + +We will work with a dataset about hemlock growth. + +```{r} +library(tidyverse) +library(tidybayes) +library(brms) + +hemlock <- readr::read_delim( + "https://raw.githubusercontent.com/bios2/biodiversity_modelling_2021/master/data/hemlock.txt", + delim = " ", + col_names = c("x","light", "growth"), skip = 1) + +knitr::kable(head(hemlock, n = 3)) + +ggplot(hemlock, aes(x = light, y = growth)) + + geom_point() +``` + +## Define a model + +We need a function for the mean growth rate per species. +A very popular choice in ecology is the famous **Type 2 functional response**: + +$$ +y = \frac{a x}{b + x} +$$ +* $a$ is the asymptote -- the max value of $y$ when $x$ is large +* $b$ is the value of $x$ where $y = a/2$ + +We experiment using `curve` to understand how this works + +```{r} +a <- 195 +b <- 30 +curve(a * x / (b + x), xlim = c(0, 100)) +``` + + +## Define a distribution for observations around this average + +We can use the gamma distribution. The gamma distribution looks like this: + +```{r} +curve(dgamma(x, 3,5), xlim = c(0, 3)) +``` + +The gamma distribution has two parameters: + +$$ +\text{Gamma}(a, b) +$$ + +The mean and variance are both functions of both of these parameters: + +$$ +\mu = \frac{a}{b} +$$ +$$ +\sigma^2 = \frac{a}{b^2} +$$ + +We can demonstrate this easily in R: + +```{r} +xx <- rgamma(5000, 3, 5) +mean(xx) #about 3/5 = 0.6 +var(xx) #about 3/(5^2) = 0.12 +``` + +We can reverse this: write the parameters $a$ and $b$ in terms of the desired mean and standard deviation: + +$$ +\begin{align} +a &= \frac{\mu^2}{\sigma^2} \\ +b &= \frac{\mu}{\sigma^2} \\ +\end{align} +$$ + +_optional_ prove that to yourself with algebra! + +*Exercise*: Simulate 3000 from a Gamma distribution with a mean of 42 and a standard deviation of 10. 
+ +### Simulating observations: + +We exploit this technique to make up some fake data: + +```{r} +a <- 195 +b <- 20 +x_values <- runif(n = 70, min = 0, max = 100) +average_response <- a * x_values / (b + x_values) +plot(x_values, average_response) + +``` + +```{r} +sigma <- 31 + +observed_response <- rgamma(n = 70, shape = average_response^2/sigma^2, rate = average_response/ sigma^2) + +plot(x_values, observed_response) +``` + +## Defining a Bayesian model + +To fully build our Bayesian model we put all the above together: + +* our function that describes how light causes the _average_ growth +* a distribution for observations around that average +* priors for the three unobserved quantities: $a$, $b$ and $\sigma$ + +$$ +\begin{align} +\text{growth} &\sim \text{Gamma}(\mu^2/\sigma^2, \mu/\sigma^2) \\ +\mu &= \frac{aL}{b + L} \\ +\sigma & \sim \text{Exponential}(4)\\ +a & \sim \text{Normal}(200, 15)\\ +b & \sim \text{Normal}(25, 5)\\ +\end{align} +$$ + +## Prior predictive simulations + +```{r} +hemlock$x <- NULL + +light_curve_bf <- bf(growth ~ exp(loga) * light / (exp(logb) + light), + family = Gamma(link = "identity"), + loga ~ 1, + logb ~ 1, + nl = TRUE) + +get_prior(light_curve_bf, data = hemlock) + +light_curve_prior <- c( + # prior(exponential(.1), class = "shape"), + prior(gamma(6.25, .25), class = "shape"), + prior(gamma(250^2/70^2, 250/70^2), class = "b", nlpar = "a", lb = 0), + prior(normal(30, 20), class = "b", nlpar = "b") +) + +light_curve_model_prior <- brm(light_curve_bf, + prior = light_curve_prior, + data = hemlock, + refresh = FALSE, + sample_prior = "only", + file = here::here("light_curve_prior"), + file_refit = "on_change") +``` + + +```{r} +hemlock |> + add_predicted_draws(light_curve_model_prior, ndraws = 6) |> + ggplot(aes(x = light, y = .prediction)) + + geom_point() + + facet_wrap(~.draw) + + coord_cartesian(ylim = c(0, 300)) +``` + +Fit to real data: + +```{r} +light_curve_model_posterior <- brm(light_curve_bf, + prior = 
light_curve_prior, + data = hemlock, + sample_prior = "yes", + file = here::here("light_curve_posterior"), + file_refit = "on_change") +``` + +Predictions with the original data: + +```{r} + +hemlock_post <- hemlock |> + add_predicted_draws(light_curve_model_posterior) + + +hemlock_post |> + ggplot(aes(x = light, y = .prediction)) + + stat_lineribbon()+ + coord_cartesian(ylim = c(0, 300)) + + scale_fill_brewer(palette = "Oranges") + + geom_point(aes(x = light, y = growth), size = 3, pch = 21, + fill = "lightblue", data = hemlock) +``` + +### Multilevel nonlinear model + +```{r} +light_curve_bf <- bf(growth ~ exp(loga) * light / (exp(logb) + light), + family = Gamma(link = "identity"), + loga ~ 1 + (1|ben|spp), + logb ~ 1 + (1|ben|spp), + nl = TRUE) + +get_prior(light_curve_bf, data = hemlock |> mutate(spp = "a")) + +light_curve_prior <- c( + # prior(exponential(.1), class = "shape"), + prior(gamma(6.25, .25), class = "shape"), + prior(normal(5.5, 3), class = "b", nlpar = "loga"), + prior(normal(3.4, 2), class = "b", nlpar = "logb"), + prior(exponential(8), class = "sd", group = "spp", nlpar = "loga"), + prior(exponential(8), class = "sd", group = "spp", nlpar = "logb"), + prior(lkj(2), class = "cor") +) + +fake_hemlock <- expand.grid(spp = letters[1:3], + light = runif(120, min = 0, max = 100), + growth = 1) + +light_curve_model_prior <- brm(light_curve_bf, + prior = light_curve_prior, + data = fake_hemlock, + refresh = FALSE, + sample_prior = "only", + file = here::here("light_curve_prior"), + file_refit = "on_change") +``` + +```{r} +fake_spp_prior <- fake_hemlock |> + add_predicted_draws(light_curve_model_prior, ndraws = "4") + + +fake_spp_prior |> + ggplot(aes(x = light , y = .prediction)) + geom_point() + + facet_grid(spp~.draw) +``` + +```{r} +# mean of 5, sd of 2 +hist(rnorm(400, 5, 2)) + +hist(rnorm(400, 0, 1)) +hist(rnorm(400, 0, 1)*2 + 5) +``` + +# Day 3: Random effects (continued) + +```{r} +library(tidyverse) + +n <- 300 + +sigma_disper <- .7 
+average_nestling <- 5 +nestling_shrub <- .03 + +fake_nestlings <- tibble( + obs_label = 1:n, + shrub_cover = runif(n, min = 3, max = 70), + shrub_cover_c = shrub_cover - mean(shrub_cover), + log_avg_nestlings = log(average_nestling) + nestling_shrub*shrub_cover_c, + log_avg_nestlings_overdisp = log_avg_nestlings + rnorm(n, mean = 0, sd = sigma_disper), + nestlings_pois = rpois(n, exp(log_avg_nestlings)), + nestlings_overdisp = rpois(n, exp(log_avg_nestlings_overdisp))) +``` + +```{r} +fake_nestlings |> + ggplot(aes(x = shrub_cover, y = nestlings_pois)) + geom_point() +``` + +```{r} +fake_nestlings |> + ggplot(aes(x = shrub_cover, y = nestlings_overdisp)) + geom_point() + +``` + + +```{r} +library(brms) + +nest_poisson <- bf(nestlings_overdisp ~ shrub_cover_c, family = poisson()) +nest_intercept <- bf(nestlings_overdisp ~ shrub_cover_c + (1|obs_label), family = poisson()) +nest_negbin <- bf(nestlings_overdisp ~ shrub_cover_c, family = negbinomial()) +``` + +```{r} +brm(nest) +``` + diff --git a/posts/2022-03-22-bayes-for-the-busy-ecologist/light_curve_posterior.rds b/posts/2022-03-22-bayes-for-the-busy-ecologist/light_curve_posterior.rds new file mode 100644 index 0000000..116bf47 Binary files /dev/null and b/posts/2022-03-22-bayes-for-the-busy-ecologist/light_curve_posterior.rds differ diff --git a/posts/2022-03-22-bayes-for-the-busy-ecologist/light_curve_prior.rds b/posts/2022-03-22-bayes-for-the-busy-ecologist/light_curve_prior.rds new file mode 100644 index 0000000..8855f6c Binary files /dev/null and b/posts/2022-03-22-bayes-for-the-busy-ecologist/light_curve_prior.rds differ