Confidence Intervals via Bootstrap

Calculates confidence intervals (CI) using bootstrap methods. This enhanced version of DescTools::BootCI() returns a data frame.

ci_boot(.data, x, y = NULL, conf.level = 0.95, ...)

Arguments

.data

Data frame.

x, y

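Column names (unquoted). y is optional (default: NULL); supply it for statistics of two variables, such as a correlation coefficient.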

conf.level

Confidence level. Default: 0.95.

...

Additional parameters for DescTools::BootCI(), including:

FUN – function that computes the statistic for which the CI is calculated (e.g., median, sd, cor);
bci.method – interval method:
- "perc" – percentile method,
- "bca" – bias-corrected and accelerated (BCa) method (see note below),
- others;
R – number of replications, typically 1,000 to 10,000.

Value

A data frame with confidence intervals. Columns depend on arguments and grouping:

(if grouped) grouping variable names;
a column named after the statistic (the function passed as FUN, e.g., median), containing the point estimate;
lwr.ci, upr.ci – lower and upper CI bounds.

Details

Similar to DescTools::BootCI(), but:

First argument is a data frame;
Arguments x and y are unquoted column names;
Responds to dplyr::group_by() for subgroup calculations;
Returns a data frame for convenient plotting with ggplot2.

Note

Notes:

As a rule of thumb, each group should have at least 20 observations for bootstrap confidence intervals to be reliable.
Use set.seed() for reproducible results.
If bci.method = "bca" produces the warning "extreme order statistics used as endpoints", the BCa interval is unreliable for these data; use bci.method = "perc" instead (see https://rcompanion.org/handbook/E_04.html).

Examples

# Bootstrap is useful when:
# - Data is skewed (not normal)
# - You want CI for statistics other than the mean (e.g., median, SD)
# - You don't want to assume a specific distribution

data(iris, package = "datasets")
head(iris)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa

set.seed(123) # For reproducible results

# Example 1: CI for the median (resistant to outliers)
iris |>
  ci_boot(Petal.Length, FUN = median, R = 1000, bci.method = "perc")
#> # A tibble: 1 × 3
#>   median lwr.ci upr.ci
#>    <dbl>  <dbl>  <dbl>
#> 1   4.35      4   4.55
# Compare to mean CI - median is often more robust

# Example 2: CI for the median by group
iris |>
  dplyr::group_by(Species) |>
  ci_boot(Petal.Length, FUN = median, R = 1000, bci.method = "perc")
#> # A tibble: 3 × 4
#>   Species    median lwr.ci upr.ci
#>   <fct>       <dbl>  <dbl>  <dbl>
#> 1 setosa       1.5    1.4     1.5
#> 2 versicolor   4.35   4.1     4.5
#> 3 virginica    5.55   5.25    5.7
# Useful when groups have different distributions

# Example 3: CI for standard deviation
# How variable is petal length?
set.seed(456)
iris |>
  ci_boot(Petal.Length, FUN = sd, R = 1000, bci.method = "perc")
#> # A tibble: 1 × 3
#>      sd lwr.ci upr.ci
#>   <dbl>  <dbl>  <dbl>
#> 1  1.77   1.64   1.86

# Example 4: CI for interquartile range (IQR)
# IQR = 75th percentile - 25th percentile
set.seed(789)
iris |>
  ci_boot(Petal.Length, FUN = IQR, R = 1000, bci.method = "perc")
#> # A tibble: 1 × 3
#>     IQR lwr.ci upr.ci
#>   <dbl>  <dbl>  <dbl>
#> 1   3.5   3.03   3.87

# Example 5: CI for correlation coefficient (Pearson's r)
# How related are petal length and width?
set.seed(101)
iris |>
  dplyr::group_by(Species) |>
  ci_boot(
    Petal.Length, Petal.Width,
    FUN = cor, method = "pearson",
    R = 1000, bci.method = "perc"
  )
#> # A tibble: 3 × 4
#>   Species      cor lwr.ci upr.ci
#>   <fct>      <dbl>  <dbl>  <dbl>
#> 1 setosa     0.332 0.0743  0.522
#> 2 versicolor 0.787 0.670   0.874
#> 3 virginica  0.322 0.113   0.517
# A CI that does not include 0 suggests a non-zero correlation

# Example 6: Comparing BCa and percentile methods
set.seed(111)
# BCa method (often more accurate, but can be unstable; see the warning described in Note)
iris |> ci_boot(Petal.Length, FUN = median, R = 1000, bci.method = "bca")
#> # A tibble: 1 × 3
#>   median lwr.ci upr.ci
#>    <dbl>  <dbl>  <dbl>
#> 1   4.35      4    4.5

# Percentile method (simpler, more robust)
iris |> ci_boot(Petal.Length, FUN = median, R = 1000, bci.method = "perc")
#> # A tibble: 1 × 3
#>   median lwr.ci upr.ci
#>    <dbl>  <dbl>  <dbl>
#> 1   4.35      4    4.6

# Example 7: Effect of number of bootstrap replications
set.seed(222)
# Fewer replications (faster but less stable)
iris |> ci_boot(Petal.Length, FUN = median, R = 500, bci.method = "perc")
#> # A tibble: 1 × 3
#>   median lwr.ci upr.ci
#>    <dbl>  <dbl>  <dbl>
#> 1   4.35      4   4.57

# More replications (slower but more stable)
iris |> ci_boot(Petal.Length, FUN = median, R = 5000, bci.method = "perc")
#> # A tibble: 1 × 3
#>   median lwr.ci upr.ci
#>    <dbl>  <dbl>  <dbl>
#> 1   4.35      4   4.55
# For teaching: 1000 is usually enough; for research: 5000-10000

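# Example 8: Handling missing values (na.rm = TRUE is passed on to FUN;
# iris has no missing values, so this only demonstrates the syntax)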
set.seed(333)
iris |>
  ci_boot(
    Petal.Length,
    FUN = median, na.rm = TRUE,
    R = 1000, bci.method = "bca"
  )
#> # A tibble: 1 × 3
#>   median lwr.ci upr.ci
#>    <dbl>  <dbl>  <dbl>
#> 1   4.35      4   4.55

# Example 9: With mtcars dataset
set.seed(444)
data(mtcars, package = "datasets")
mtcars |>
  dplyr::group_by(cyl) |>
  ci_boot(mpg, FUN = median, R = 1000, bci.method = "perc")
#> # A tibble: 3 × 4
#>     cyl median lwr.ci upr.ci
#>   <dbl>  <dbl>  <dbl>  <dbl>
#> 1     6   19.7   18.1   21  
#> 2     4   26     22.8   30.4
#> 3     8   15.2   14.5   16.4
# Compare median MPG for different cylinder counts

# Example 10: Spearman correlation (rank-based, robust to outliers)
set.seed(555)
iris |>
  dplyr::group_by(Species) |>
  ci_boot(
    Petal.Length, Petal.Width,
    FUN = cor, method = "spearman",
    R = 1000, bci.method = "perc"
  )
#> # A tibble: 3 × 4
#>   Species      cor  lwr.ci upr.ci
#>   <fct>      <dbl>   <dbl>  <dbl>
#> 1 setosa     0.271 0.00686  0.501
#> 2 versicolor 0.787 0.638    0.885
#> 3 virginica  0.363 0.119    0.579