Using dplyr window functions to calculate percentiles
RDplyrTidyrR Problem Overview
I have a working solution but am looking for a cleaner, more readable solution that perhaps takes advantage of some of the newer dplyr window functions.
Using the mtcars dataset, if I want to look at the 25th, 50th, 75th percentiles and the mean and count of miles per gallon ("mpg") by the number of cylinders ("cyl"), I use the following code:
library(dplyr)
library(tidyr)
# load data
data("mtcars")
# Percentiles used in calculation
p <- c(.25,.5,.75)
# old dplyr solution
mtcars %>% group_by(cyl) %>%
do(data.frame(p=p, stats=quantile(.$mpg, probs=p),
n = length(.$mpg), avg = mean(.$mpg))) %>%
spread(p, stats) %>%
select(1, 4:6, 3, 2)
# note: the select and spread statements are just to get the data into
# the format in which I'd like to see it, but are not critical
Is there a way I can do this more cleanly with dplyr using some of the summary functions (n_tiles, percent_rank, etc.)? By cleanly, I mean without the "do" statement.
Thank you
R Solutions
Solution 1 - R
In dplyr 1.0
, summarise
can return multiple values, allowing the following:
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
summarise(quantile = scales::percent(c(0.25, 0.5, 0.75)),
mpg = quantile(mpg, c(0.25, 0.5, 0.75)))
Or, you can avoid a separate line to name the quantiles by going with enframe
:
mtcars %>%
group_by(cyl) %>%
summarise(enframe(quantile(mpg, c(0.25, 0.5, 0.75)), "quantile", "mpg"))
> cyl quantile mpg
>
> 3 4 75% 30.4
> 4 6 25% 18.6
> 5 6 50% 19.7
> 6 6 75% 21
> 7 8 25% 14.4
> 8 8 50% 15.2
> 9 8 75% 16.2
Answer for previous versions of dplyr
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
summarise(x=list(enframe(quantile(mpg, probs=c(0.25,0.5,0.75)), "quantiles", "mpg"))) %>%
unnest(x)
> cyl quantiles mpg > 1 4 25% 22.80 > 2 4 50% 26.00 > 3 4 75% 30.40 > 4 6 25% 18.65 > 5 6 50% 19.70 > 6 6 75% 21.00 > 7 8 25% 14.40 > 8 8 50% 15.20 > 9 8 75% 16.25
This can be turned into a more general function using tidyeval:
q_by_group = function(data, value.col, ..., probs=seq(0,1,0.25)) {
groups=enquos(...)
data %>%
group_by(!!!groups) %>%
summarise(x = list(enframe(quantile({{value.col}}, probs=probs), "quantiles", "mpg"))) %>%
unnest(x)
}
q_by_group(mtcars, mpg)
q_by_group(mtcars, mpg, cyl)
q_by_group(mtcars, mpg, cyl, vs, probs=c(0.5,0.75))
q_by_group(iris, Petal.Width, Species)
Solution 2 - R
If you're up for using purrr::map
, you can do it like this!
library(tidyverse)
mtcars %>%
tbl_df() %>%
nest(-cyl) %>%
mutate(Quantiles = map(data, ~ quantile(.$mpg)),
Quantiles = map(Quantiles, ~ bind_rows(.) %>% gather())) %>%
unnest(Quantiles)
#> # A tibble: 15 x 3
#> cyl key value
#> <dbl> <chr> <dbl>
#> 1 6 0% 17.8
#> 2 6 25% 18.6
#> 3 6 50% 19.7
#> 4 6 75% 21
#> 5 6 100% 21.4
#> 6 4 0% 21.4
#> 7 4 25% 22.8
#> 8 4 50% 26
#> 9 4 75% 30.4
#> 10 4 100% 33.9
#> 11 8 0% 10.4
#> 12 8 25% 14.4
#> 13 8 50% 15.2
#> 14 8 75% 16.2
#> 15 8 100% 19.2
Created on 2018-11-10 by the reprex package (v0.2.1)
One nice thing about this approach is the output is tidy, one observation per row.
Solution 3 - R
This is a dplyr
approach that uses the tidy()
function of the broom
package, unfortunately it still requires do()
, but it is a lot simpler.
library(dplyr)
library(broom)
mtcars %>%
group_by(cyl) %>%
do( tidy(t(quantile(.$mpg))) )
which gives:
cyl X0. X25. X50. X75. X100.
(dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
1 4 21.4 22.80 26.0 30.40 33.9
2 6 17.8 18.65 19.7 21.00 21.4
3 8 10.4 14.40 15.2 16.25 19.2
Note the use of t()
since the broom
package does not have a method for named numerics.
This is based on my earlier answer for summary() here.
Solution 4 - R
Not sure how to avoid do()
in dplyr
, but you can do this with c()
and as.list()
with data.table
in a pretty straightforward manner:
require(data.table)
as.data.table(mtcars)[, c(as.list(quantile(mpg, probs=p)),
avg=mean(mpg), n=.N), by=cyl]
# cyl 25% 50% 75% avg n
# 1: 6 18.65 19.7 21.00 19.74286 7
# 2: 4 22.80 26.0 30.40 26.66364 11
# 3: 8 14.40 15.2 16.25 15.10000 14
Replace by
with keyby
if you want them ordered by cyl
column.
Solution 5 - R
This solution uses dplyr
and tidyr
only, lets you specify your quantiles in the dplyr
chain, and takes advantage of tidyr::crossing()
to "stack" multiple copies of the dataset prior to grouping and summarising.
diamonds %>% # Initial data
tidyr::crossing(pctile = 0:4/4) %>% # Specify quantiles; crossing() is like expand.grid()
dplyr::group_by(cut, pctile) %>% # Indicate your grouping var, plus your quantile var
dplyr::summarise(quantile_value = quantile(price, unique(pctile))) %>% # unique() is needed
dplyr::mutate(pctile = sprintf("%1.0f%%", pctile*100)) # Optional prettification
Result:
# A tibble: 25 x 3
# Groups: cut [5]
cut pctile quantile_value
<ord> <chr> <dbl>
1 Fair 0% 337.00
2 Fair 25% 2050.25
3 Fair 50% 3282.00
4 Fair 75% 5205.50
5 Fair 100% 18574.00
6 Good 0% 327.00
7 Good 25% 1145.00
8 Good 50% 3050.50
9 Good 75% 5028.00
10 Good 100% 18788.00
11 Very Good 0% 336.00
12 Very Good 25% 912.00
13 Very Good 50% 2648.00
14 Very Good 75% 5372.75
15 Very Good 100% 18818.00
16 Premium 0% 326.00
17 Premium 25% 1046.00
18 Premium 50% 3185.00
19 Premium 75% 6296.00
20 Premium 100% 18823.00
21 Ideal 0% 326.00
22 Ideal 25% 878.00
23 Ideal 50% 1810.00
24 Ideal 75% 4678.50
25 Ideal 100% 18806.00
The unique()
is necessary to let dplyr::summarise()
know that you only want one value per group.
Solution 6 - R
Answered many diffrent ways. dplyr distinct made the difference for what I wanted to do..
mtcars %>%
select(cyl, mpg) %>%
group_by(cyl) %>%
mutate( qnt_0 = quantile(mpg, probs= 0),
qnt_25 = quantile(mpg, probs= 0.25),
qnt_50 = quantile(mpg, probs= 0.5),
qnt_75 = quantile(mpg, probs= 0.75),
qnt_100 = quantile(mpg, probs= 1),
mean = mean(mpg),
sd = sd(mpg)
) %>%
distinct(qnt_0 ,qnt_25 ,qnt_50 ,qnt_75 ,qnt_100 ,mean ,sd)
renders
# A tibble: 3 x 8
# Groups: cyl [3]
qnt_0 qnt_25 qnt_50 qnt_75 qnt_100 mean sd cyl
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 17.8 18.6 19.7 21 21.4 19.7 1.45 6
2 21.4 22.8 26 30.4 33.9 26.7 4.51 4
3 10.4 14.4 15.2 16.2 19.2 15.1 2.56 8
Solution 7 - R
Here is a solution using a combination of dplyr
, purrr
, and rlang
:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
library(purrr)
# load data
data("mtcars")
# Percentiles used in calculation
p <- c(.25,.5,.75)
p_names <- paste0(p*100, "%")
p_funs <- map(p, ~partial(quantile, probs = .x, na.rm = TRUE)) %>%
set_names(nm = p_names)
# dplyr/purrr/rlang solution
mtcars %>%
group_by(cyl) %>%
summarize_at(vars(mpg), funs(!!!p_funs))
#> # A tibble: 3 x 4
#> cyl `25%` `50%` `75%`
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 22.8 26 30.4
#> 2 6 18.6 19.7 21
#> 3 8 14.4 15.2 16.2
#Especially useful if you want to summarize more variables
mtcars %>%
group_by(cyl) %>%
summarize_at(vars(mpg, drat), funs(!!!p_funs))
#> # A tibble: 3 x 7
#> cyl `mpg_25%` `drat_25%` `mpg_50%` `drat_50%` `mpg_75%` `drat_75%`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 22.8 3.81 26 4.08 30.4 4.16
#> 2 6 18.6 3.35 19.7 3.9 21 3.91
#> 3 8 14.4 3.07 15.2 3.12 16.2 3.22
Created on 2018-10-01 by the reprex package (v0.2.0).
Edit (2019-04-17):
As of dplyr 0.8.0
, the funs
function has been deprecated in favor of using list
to pass the desired functions into scoped dplyr
functions. As a result of this, the implementation above gets slightly more straightfoward. We no longer need to worry about unquoting the functions with the !!!
. Please see the below reprex
:
library(dplyr)
#> Warning: package 'dplyr' was built under R version 3.5.2
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
library(purrr)
# load data
data("mtcars")
# Percentiles used in calculation
p <- c(.25,.5,.75)
p_names <- paste0(p*100, "%")
p_funs <- map(p, ~partial(quantile, probs = .x, na.rm = TRUE)) %>%
set_names(nm = p_names)
# dplyr/purrr/rlang solution
mtcars %>%
group_by(cyl) %>%
summarize_at(vars(mpg), p_funs)
#> # A tibble: 3 x 4
#> cyl `25%` `50%` `75%`
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 22.8 26 30.4
#> 2 6 18.6 19.7 21
#> 3 8 14.4 15.2 16.2
#Especially useful if you want to summarize more variables
mtcars %>%
group_by(cyl) %>%
summarize_at(vars(mpg, drat), p_funs)
#> # A tibble: 3 x 7
#> cyl `mpg_25%` `drat_25%` `mpg_50%` `drat_50%` `mpg_75%` `drat_75%`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 22.8 3.81 26 4.08 30.4 4.16
#> 2 6 18.6 3.35 19.7 3.9 21 3.91
#> 3 8 14.4 3.07 15.2 3.12 16.2 3.22
Created on 2019-04-17 by the reprex package (v0.2.0).
Solution 8 - R
Here's a fairly readable solution that uses dplyr
and purrr
to return quantiles in a tidy format:
Code
library(dplyr)
library(purrr)
mtcars %>%
group_by(cyl) %>%
do({x <- .$mpg
map_dfr(.x = c(.25, .5, .75),
.f = ~ data_frame(Quantile = .x,
Value = quantile(x, probs = .x)))
})
Result
# A tibble: 9 x 3
# Groups: cyl [3]
cyl Quantile Value
<dbl> <dbl> <dbl>
1 4 0.25 22.80
2 4 0.50 26.00
3 4 0.75 30.40
4 6 0.25 18.65
5 6 0.50 19.70
6 6 0.75 21.00
7 8 0.25 14.40
8 8 0.50 15.20
9 8 0.75 16.25
Solution 9 - R
Yet another way to accomplish this, with unnest_wider/longer
mtcars %>%
group_by(cyl) %>%
summarise(quants = list(quantile(mpg, probs = c(.01, .1, .25, .5, .75, .90,.99)))) %>%
unnest_wider(quants)
And if you wanted to do it for multiple variables, you could gather before the grouping:
mtcars %>%
gather(key = 'metric', value = 'value', -cyl) %>%
group_by(cyl, metric) %>%
summarise(quants = list(quantile(value, probs = c(.01, .1, .25, .5, .75, .90,.99)))) %>%
unnest_wider(quants)
Solution 10 - R
do()
is in fact the correct idiom, since it’s designed for group-wise transformations. Think of it as an lapply()
that maps over groups of a data frame. (For such a specialized function, a generic name like “do” is not ideal. But it’s probably too late to change it.)
Morally, within each cyl
group, you want to apply quantile()
to the mpg
column:
library(dplyr)
p <- c(.2, .5, .75)
mtcars %>%
group_by(cyl) %>%
do(quantile(.$mpg, p))
#> Error: Results 1, 2, 3 must be data frames, not numeric
Except that doesn’t work because quantile()
doesn’t return a data frame; you must convert its output, explicitly. Since this alteration amounts to wrapping quantile()
with a data frame, you can use the gestalt function composition operator %>>>%
:
library(gestalt)
library(tibble)
quantile_tbl <- quantile %>>>% enframe("quantile")
mtcars %>%
group_by(cyl) %>%
do(quantile_tbl(.$mpg, p))
#> # A tibble: 9 x 3
#> # Groups: cyl [3]
#> cyl quantile value
#> <dbl> <chr> <dbl>
#> 1 4 20% 22.8
#> 2 4 50% 26
#> 3 4 75% 30.4
#> 4 6 20% 18.3
#> 5 6 50% 19.7
#> 6 6 75% 21
#> 7 8 20% 13.9
#> 8 8 50% 15.2
#> 9 8 75% 16.2