class: center, middle, title-slide .title[ # Visualizing distributions ] .author[ ### Claus O. Wilke, remixed by Joseph Elsherbini ] .date[ ### 2023-01-10 ] --- ## Histograms and density plots --- ## Passengers on the Titanic .center.small-font[ <table> <thead> <tr> <th style="text-align:right;"> age </th> <th style="text-align:left;"> sex </th> <th style="text-align:left;"> class </th> <th style="text-align:left;"> survived </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 0.17 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 0.33 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> died </td> </tr> <tr> <td style="text-align:right;"> 0.80 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 0.83 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 0.83 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 0.92 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 1st </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 1.00 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 1.00 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 1.00 </td> <td style="text-align:left;"> male </td> <td 
style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 1.00 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> </tbody> </table> <table> <thead> <tr> <th style="text-align:right;"> age </th> <th style="text-align:left;"> sex </th> <th style="text-align:left;"> class </th> <th style="text-align:left;"> survived </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1.0 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 1.5 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> died </td> </tr> <tr> <td style="text-align:right;"> 1.5 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> died </td> </tr> <tr> <td style="text-align:right;"> 2.0 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 1st </td> <td style="text-align:left;"> died </td> </tr> <tr> <td style="text-align:right;"> 2.0 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 2.0 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> died </td> </tr> <tr> <td style="text-align:right;"> 2.0 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> died </td> </tr> <tr> <td style="text-align:right;"> 2.0 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 2.0 </td> <td style="text-align:left;"> male </td> 
<td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 2.0 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> </tbody> </table> <table> <thead> <tr> <th style="text-align:right;"> age </th> <th style="text-align:left;"> sex </th> <th style="text-align:left;"> class </th> <th style="text-align:left;"> survived </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> male </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 2nd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> female 
</td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> survived </td> </tr> <tr> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> female </td> <td style="text-align:left;"> 3rd </td> <td style="text-align:left;"> survived </td> </tr> </tbody> </table> ] --- ## Histogram: Define bins and count cases .pull-left.small-font[ <table> <thead> <tr> <th style="text-align:left;"> age range </th> <th style="text-align:right;"> count </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> 0–5 </td> <td style="text-align:right;"> 36 </td> </tr> <tr> <td style="text-align:left;"> 6–10 </td> <td style="text-align:right;"> 19 </td> </tr> <tr> <td style="text-align:left;"> 11–15 </td> <td style="text-align:right;"> 18 </td> </tr> <tr> <td style="text-align:left;"> 16–20 </td> <td style="text-align:right;"> 99 </td> </tr> <tr> <td style="text-align:left;"> 21–25 </td> <td style="text-align:right;"> 139 </td> </tr> <tr> <td style="text-align:left;"> 26–30 </td> <td style="text-align:right;"> 121 </td> </tr> <tr> <td style="text-align:left;"> 31–35 </td> <td style="text-align:right;"> 76 </td> </tr> <tr> <td style="text-align:left;"> 36–40 </td> <td style="text-align:right;"> 74 </td> </tr> </tbody> </table> <table> <thead> <tr> <th style="text-align:left;"> age range </th> <th style="text-align:right;"> count </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> 41–45 </td> <td style="text-align:right;"> 54 </td> </tr> <tr> <td style="text-align:left;"> 46–50 </td> <td style="text-align:right;"> 50 </td> </tr> <tr> <td style="text-align:left;"> 51–55 </td> <td style="text-align:right;"> 26 </td> </tr> <tr> <td style="text-align:left;"> 56–60 </td> <td style="text-align:right;"> 22 </td> </tr> <tr> <td style="text-align:left;"> 61–65 </td> <td style="text-align:right;"> 16 </td> </tr> <tr> <td style="text-align:left;"> 66–70 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:left;"> 71–75 </td> <td 
style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:left;"> 76–80 </td> <td style="text-align:right;"> 0 </td> </tr> </tbody> </table> ] -- .pull-right[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## Histograms depend on the chosen bin width .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## Alternative to histogram: Kernel density estimate (KDE) .pull-left[ <!-- --> ] -- .pull-right[ <!-- --> ] -- Histograms show raw counts; KDEs show proportions (total area = 1). ??? Figures redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## KDEs also depend on parameter settings .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## Careful: KDEs can show nonsensical data .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## Careful: Are bars stacked or overlapping? .pull-left[ <!-- --> ] -- .pull-right[ <!-- --> ] -- Stacked or overlapping histograms are rarely a good choice. ??? Figures redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## Alternatively: Age pyramid .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## Alternatively: KDEs showing proportions of total .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. 
O'Reilly, 2019.](https://clauswilke.com/dataviz) [//]: # "segment ends here" --- class: center middle ## Histograms and density plots in **ggplot2** --- ## Making histograms with ggplot: `geom_histogram()` .small-font[ ```r ggplot(titanic, aes(age)) + geom_histogram() ``` ] -- .center.small-font[ ``` `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. ``` <!-- --> ] --- ## Setting the bin width .small-font[ ```r ggplot(titanic, aes(age)) + geom_histogram(binwidth = 5) ``` ] .center[ <!-- --> ] -- Do you like the bin placement? --- ## Always set the center as well .small-font[ ```r ggplot(titanic, aes(age)) + geom_histogram( binwidth = 5, # width of the bins center = 2.5 # center of the bin containing that value ) ``` ] .center[ <!-- --> ] -- Setting `center = 2.5` makes the bins span 0–5, 5–10, etc. instead of 2.5–7.5, etc. Alternatively, use the argument `boundary = 5` to get the same result. --- ## Making density plots with ggplot: `geom_density()` .small-font[ ```r ggplot(titanic, aes(age)) + geom_density(fill = "skyblue") ``` ] -- .center[ <!-- --> ] --- ## Making density plots with ggplot: `geom_density()` .small-font[ ```r ggplot(titanic, aes(age)) + geom_density() # without fill ``` ] .center[ <!-- --> ] --- ## Modifying bandwidth (`bw`) and kernel parameters .tiny-font[ ```r ggplot(titanic, aes(age)) + geom_density( fill = "skyblue", bw = 0.5, # a small bandwidth kernel = "gaussian" # Gaussian kernel (the default) ) ``` ] .center[ <!-- --> ] --- ## Modifying bandwidth (`bw`) and kernel parameters .tiny-font[ ```r ggplot(titanic, aes(age)) + geom_density( fill = "skyblue", bw = 2, # a moderate bandwidth kernel = "rectangular" # rectangular kernel ) ``` ] .center[ <!-- --> ] [//]: # "segment ends here" --- class: center middle ## Setting stats explicitly in **ggplot2** --- ## Statistical transformations (stats) can be set explicitly .tiny-font[ ```r ggplot(titanic, aes(age)) + geom_density( stat = "density", # the default for 
geom_density() fill = "skyblue" ) ``` ] .center[ <!-- --> ] --- ## Statistical transformations (stats) can be set explicitly .tiny-font[ ```r ggplot(titanic, aes(age)) + geom_area( # geom_area() does not normally use stat = "density" stat = "density", fill = "skyblue" ) ``` ] .center[ <!-- --> ] --- ## Statistical transformations (stats) can be set explicitly .tiny-font[ ```r ggplot(titanic, aes(age)) + geom_line( # neither does geom_line() stat = "density" ) ``` ] .center[ <!-- --> ] --- ## Statistical transformations (stats) can be set explicitly .tiny-font[ ```r ggplot(titanic, aes(age)) + # we can use multiple geoms on top of each other geom_area(stat = "density", fill = "skyblue") + geom_line(stat = "density") ``` ] .center[ <!-- --> ] --- ## Parameters are handed through to the stat .pull-left.tiny-font[ ```r ggplot(titanic, aes(age)) + geom_line(stat = "density", bw = 3) ``` .center[ <!-- --> ]] .pull-right.tiny-font[ ```r ggplot(titanic, aes(age)) + geom_line(stat = "density", bw = 0.3) ``` .center[ <!-- --> ]] -- Here, `bw` is a parameter of `stat_density()`, not of `geom_line()`. 
--- ## Density estimates visualize distributions .pull-left.small-font[ Mean temperatures in Lincoln, NE, in January 2016: .center[ |date | mean temp| |:----------|---------:| |2016-01-01 | 24| |2016-01-02 | 23| |2016-01-03 | 23| |2016-01-04 | 17| |2016-01-05 | 29| |2016-01-06 | 33| |2016-01-07 | 30| |2016-01-08 | 25| |2016-01-09 | 9| |2016-01-10 | 11| |2016-01-11 | 28| |2016-01-12 | 24| |2016-01-13 | 33| |2016-01-14 | 40| |2016-01-15 | 29| |2016-01-16 | 19| |2016-01-17 | 5| |2016-01-18 | 11| |2016-01-19 | 22| |2016-01-20 | 28| |2016-01-21 | 25| |2016-01-22 | 22| |2016-01-23 | 28| |2016-01-24 | 30| |2016-01-25 | 26| |2016-01-26 | 29| |2016-01-27 | 33| |2016-01-28 | 41| |2016-01-29 | 41| |2016-01-30 | 39| |2016-01-31 | 35| ]] -- .pull-right[ <!-- --> ] --- ## Density estimates visualize distributions .pull-left.small-font[ Mean temperatures in Lincoln, NE, in January 2016: .center[ |date | mean temp| |:----------|---------:| |2016-01-01 | 24| |2016-01-02 | 23| |2016-01-03 | 23| |2016-01-04 | 17| |2016-01-05 | 29| |2016-01-06 | 33| |2016-01-07 | 30| |2016-01-08 | 25| |2016-01-09 | 9| |2016-01-10 | 11| |2016-01-11 | 28| |2016-01-12 | 24| |2016-01-13 | 33| |2016-01-14 | 40| |2016-01-15 | 29| |2016-01-16 | 19| |2016-01-17 | 5| |2016-01-18 | 11| |2016-01-19 | 22| |2016-01-20 | 28| |2016-01-21 | 25| |2016-01-22 | 22| |2016-01-23 | 28| |2016-01-24 | 30| |2016-01-25 | 26| |2016-01-26 | 29| |2016-01-27 | 33| |2016-01-28 | 41| |2016-01-29 | 41| |2016-01-30 | 39| |2016-01-31 | 35| ]] .pull-right[ <!-- --> How can we compare distributions across months? ] --- ## A bad idea: Many overlapping density plots .center[ <!-- --> ] --- ## Another bad idea: Stacked density plots .center[ <!-- --> ] --- ## Somewhat better: Small multiples .center[ <!-- --> ] --- ## Instead: Show values along y, conditions along x .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. 
O'Reilly, 2019.](https://clauswilke.com/dataviz) -- A boxplot is a crude way of visualizing a distribution. --- ## How to read a boxplot .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## If you like density plots, consider violins .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) -- A violin plot is a density plot rotated 90 degrees and then mirrored. --- ## How to read a violin plot .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## For small datasets, you can also use a strip chart Advantage: Can see raw data points instead of abstract representation. .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) -- Horizontal jittering may be necessary to avoid overlapping points. --- ## For small datasets, you can also use a strip chart Advantage: Can see raw data points instead of abstract representation. .center[ <!-- --> ] Horizontal jittering may be necessary to avoid overlapping points. ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## For small datasets, you can also use a strip chart Advantage: Can see raw data points instead of abstract representation. .center[ <!-- --> ] Horizontal jittering may be necessary to avoid overlapping points. ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## We can also jitter points into violins .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. 
O'Reilly, 2019.](https://clauswilke.com/dataviz) -- Such plots are called sina plots, to honor [Sina Hadi Sohi.](https://clauswilke.com/dataviz/boxplots-violins.html#fig:lincoln-temp-sina) --- ## Another option is a scatter-density plot Advantage: The best of both worlds of violin and jitter plots: you see the raw data and also the shape of the density. .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) --- ## But maybe there's hope for overlapping density plots? .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) -- How about we stagger the densities vertically? --- ## Vertically staggered density plots are called ridgelines .center[ <!-- --> ] ??? Figure redrawn from [Claus O. Wilke. Fundamentals of Data Visualization. O'Reilly, 2019.](https://clauswilke.com/dataviz) -- Notice the single fill color. More colors would be distracting. --- class: center middle ## Advice - Always show the finest granularity of data that is practical. If you don't have too many points, show them! Doing so makes the data much easier to interpret, especially when you are exploring new datasets. Favor showing distributions over just a mean with error bars. --- --- class: center middle ## Making boxplots, violins, etc. in **ggplot2** --- ## Making boxplots, violins, etc. 
in **ggplot2** .small-font.center[ Plot type | Geom | Notes :----------- | :----------------- | :------------------------- boxplot | `geom_boxplot()` | violin plot | `geom_violin()` | strip chart | `geom_point()` | Jittering requires `position_jitter()` sina plot | `geom_sina()` | From package **ggforce** scatter-density plot | `geom_quasirandom()` | From package **ggbeeswarm** ridgeline | `geom_density_ridges()` | From package **ggridges** ] --- ## Examples: Boxplot .tiny-font[ ```r ggplot(lincoln_temps, aes(x = month, y = mean_temp)) + geom_boxplot(fill = "skyblue") ``` ] .center[ <!-- --> ] --- ## Examples: Violins .tiny-font[ ```r ggplot(lincoln_temps, aes(x = month, y = mean_temp)) + geom_violin(fill = "skyblue") ``` ] .center[ <!-- --> ] --- ## Examples: Strip chart (no jitter) .tiny-font[ ```r ggplot(lincoln_temps, aes(x = month, y = mean_temp)) + geom_point(size = 0.75) # reduce point size to minimize overplotting ``` ] .center[ <!-- --> ] --- ## Examples: Strip chart (w/ jitter) .tiny-font[ ```r ggplot(lincoln_temps, aes(x = month, y = mean_temp)) + geom_point(size = 0.75, # reduce point size to minimize overplotting position = position_jitter( width = 0.15, # amount of jitter in horizontal direction height = 0 # amount of jitter in vertical direction (0 = none) ) ) ``` ] .center[ <!-- --> ] --- ## Examples: Sina plot .tiny-font[ ```r library(ggforce) # for geom_sina() ggplot(lincoln_temps, aes(x = month, y = mean_temp)) + geom_violin(fill = "skyblue", color = NA) + # violins in background geom_sina(size = 0.75) # sina jittered points in foreground ``` ] .center[ <!-- --> ] --- ## Examples: scatter-density plot .tiny-font[ ```r library(ggbeeswarm) # for geom_quasirandom() ggplot(lincoln_temps, aes(x = month, y = mean_temp)) + geom_quasirandom(color = "skyblue") ``` ] .center[ <!-- --> ] --- ## Examples: scatter-density plot .tiny-font[ ```r library(ggbeeswarm) # for geom_quasirandom() ggplot(lincoln_temps, aes(x = month, y = mean_temp)) + 
geom_quasirandom(color = "skyblue", width=0.2, alpha=0.5) # make the spread skinnier and # set transparency (alpha) to half. ``` ] .center[ <!-- --> ] --- ## Examples: Ridgeline plot .tiny-font[ ```r library(ggridges) # for geom_density_ridges ggplot(lincoln_temps, aes(x = mean_temp, y = month_long)) + geom_density_ridges() ``` ] .center[ <!-- --> ] [//]: # "segment ends here" --- ## Exercise Time to try it out yourself! Go to [https://elsherbini.github.io/AMNH_R_Workshop_2023/modules/data-wrangling-module/](https://elsherbini.github.io/AMNH_R_Workshop_2023/modules/data-wrangling-module/) and complete the Visualizing Distributions exercise.