# FC721 Introduction to R Programming
# Part 3: Graphics in R using ggplot2
#-----------------------------------
# Helpful resources:
## ggplot2 cheat sheet:
# https://github.com/rstudio/cheatsheets/blob/master/data-visualization-2.1.pdf
## R Graphics Cookbook:
# https://r-graphics.org/
## ggplot2: Elegant Graphics for Data Analysis
# https://ggplot2-book.org/
## R Graph Gallery:
# https://r-graph-gallery.com/
#-----------------------------------

# ggplot2 is part of the tidyverse.
# You can load ggplot2 on its own or attach the whole tidyverse at once:
library(tidyverse)

# Load the data
nhanes_tidy <- read_csv("nhanes_2018_df.csv")

# R graphics are useful both for exploring data and for sharing results.
# Several R packages can create graphics, but ggplot2 is the most popular one.
# Plotting with ggplot2 consists of two main steps:
# 1. create a plot object with the ggplot() function
# 2. add layers to the plot object with the + operator

#-----------------------------------
# Using ggplot2 for data exploration
#-----------------------------------

# A simple scatter plot of weight vs. height:
#   ggplot()     creates the plot object
#   aes()        maps variables (columns) to aesthetics
#   geom_point() adds a layer of points to the plot object
ggplot(nhanes_tidy, aes(x = height, y = weight)) +
  geom_point()

# To color the points by gender, add the color aesthetic:
ggplot(nhanes_tidy, aes(x = height, y = weight, color = gender)) +
  geom_point()

# Most popular geoms in ggplot2:
#   geom_point()     - scatter plot
#   geom_line()      - line plot
#   geom_bar()       - bar plot
#   geom_histogram() - histogram
#   geom_boxplot()   - box plot
#   geom_smooth()    - smoothed conditional mean
#   geom_violin()    - violin plot
#   geom_tile()      - heatmap

# Bar plot
ggplot(nhanes_tidy, aes(x = gender)) +
  geom_bar()

# Box plot
ggplot(nhanes_tidy, aes(x = bp_diastolic)) +
  geom_boxplot()

# Histogram: default binning, a fixed number of bins, and a fixed bin width
ggplot(nhanes_tidy, aes(x = age)) +
  geom_histogram()

ggplot(nhanes_tidy, aes(x = age)) +
  geom_histogram(bins = 9)

ggplot(nhanes_tidy, aes(x = age)) +
  geom_histogram(binwidth = 10)

# Violin plot
ggplot(nhanes_tidy, aes(x = race, y = age)) +
  geom_violin()

# Pie chart (it is a bad idea to use pie charts for data exploration!)
# Basic pie chart: one stacked bar of pre-computed counts, drawn in polar
# coordinates. geom_col() is shorthand for geom_bar(stat = "identity").
race_table <- nhanes_tidy |>
  count(race)

ggplot(race_table, aes(x = "", y = n, fill = race)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme_void() # remove background, grid, numeric labels

#-----------------------------------
# Using ggplot2 for sharing results
#-----------------------------------

# When we share results with others, we may want to customize our plots
# to make them more informative and visually appealing.
# To do so, we add more layers to the plot object.
# Generally, we want our graphics to have the following components:
# 1. Title
# 2. Axis labels
# 3. Legend
# 4. Grid lines
# 5. Appropriate colors
# 6. Appropriate font size
# 7. Most importantly, an appropriate type of plot for the data!

# Let's return to our initial scatter plot:
ggplot(data = nhanes_tidy, aes(x = height, y = weight, color = gender)) +
  geom_point()

# Let's add a title, modify the axis labels, and change the color palette:
ggplot(data = nhanes_tidy, aes(x = height, y = weight, color = gender)) +
  geom_point() +
  labs(
    title = "Weight vs. Height",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721",
    x = "Height (cm)",
    y = "Weight (kg)"
  ) +
  scale_color_manual(values = c("red", "blue"))

# or
ggplot(data = nhanes_tidy, aes(x = height, y = weight, color = gender)) +
  geom_point() +
  labs(
    title = "Weight vs. Height",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721",
    x = "Height (cm)",
    y = "Weight (kg)"
  ) +
  scale_colour_brewer(palette = "Set1")

#-----------------------------------------------------------------------
# Let's add a smoothed trend line to the scatter plot:
ggplot(data = nhanes_tidy, aes(x = height, y = weight, color = gender)) +
  geom_point() +
  labs(
    title = "Weight vs. Height",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721",
    x = "Height (cm)",
    y = "Weight (kg)"
  ) +
  scale_colour_brewer(palette = "Set1") +
  geom_smooth()

# There are 2 trend lines on the plot: one for each gender, because the
# color aesthetic set in ggplot() is inherited by every layer.
# If we want a single line, we can move the color aesthetic into the
# geom_point() layer so that only the points are grouped by gender:
ggplot(data = nhanes_tidy, aes(x = height, y = weight)) +
  geom_point(aes(color = gender)) +
  labs(
    title = "Weight vs. Height",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721",
    x = "Height (cm)",
    y = "Weight (kg)"
  ) +
  scale_colour_brewer(palette = "Set1") +
  geom_smooth()

# By default geom_smooth() uses method = "auto": loess when there are fewer
# than 1,000 observations and a generalized additive model (mgcv::gam)
# otherwise. ggplot2 prints a message saying which method it picked.
# We can specify the method and the confidence interval explicitly:
ggplot(data = nhanes_tidy, aes(x = height, y = weight)) +
  geom_point(aes(color = gender)) +
  labs(
    title = "Weight vs. Height",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721",
    x = "Height (cm)",
    y = "Weight (kg)"
  ) +
  scale_colour_brewer(palette = "Set1") +
  geom_smooth(method = "lm", se = TRUE)

# In this case linear regression is a poor choice for approximating the
# relationship between weight and height.

#-----------------------------------
# Bar plots
#-----------------------------------

# Let's create a bar plot of the number of observations
# for each value in column race:
ggplot(data = nhanes_tidy, aes(x = race)) +
  geom_bar() +
  labs(
    title = "Race count",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721"
  ) +
  theme_minimal()

# Let's add the number of observations to the bars.
# after_stat(count) refers to the count computed by the "count" statistic;
# it replaces the older ..count.. notation, deprecated in ggplot2 3.4.0.
ggplot(data = nhanes_tidy, aes(x = race)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(
    title = "Race count",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721"
  ) +
  theme_minimal()

# Sometimes it is useful to change the range of values on the y-axis
# (here it leaves room for the labels above the tallest bar):
ggplot(data = nhanes_tidy, aes(x = race)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(
    title = "Race count",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721"
  ) +
  theme_minimal() +
  ylim(0, 3500)

# For a stacked bar plot we need to add the fill aesthetic:
ggplot(data = nhanes_tidy, aes(x = race, fill = gender)) +
  geom_bar() +
  labs(
    title = "Race count",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721"
  ) +
  theme_minimal()

# If we want side-by-side bars, we need to use position = "dodge":
ggplot(data = nhanes_tidy, aes(x = race, fill = gender)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Race count",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721"
  ) +
  theme_minimal()

#-----------------------------------
# Bar plots when the data is already summarized
#-----------------------------------
race_table <- nhanes_tidy |>
  count(race, gender)

# The bar heights are already in column n, so we use the "identity"
# statistic. geom_col() is shorthand for geom_bar(stat = "identity").
# position_dodge(width = 0.9) places each label over its dodged bar.
ggplot(race_table, aes(x = race, y = n, fill = gender)) +
  geom_col(position = "dodge") +
  geom_text(
    aes(label = n),
    position = position_dodge(width = 0.9),
    vjust = -1
  ) +
  labs(
    title = "Race count",
    subtitle = "NHANES 2018",
    caption = "BUMC: FC721"
  ) +
  theme_minimal()

#-----------------------------------
# Improving our pie chart
#-----------------------------------

# Shorten the longest category label so that it fits inside its slice
race_table <- nhanes_tidy |>
  mutate(
    race = if_else(
      race == "Other Race - Including Multi-Racial",
      "Other",
      race
    )
  ) |>
  count(race)

# Compute the position of the labels:
#   prop - share of each race, in percent
#   ypos - midpoint of each slice along the stacked (cumulative) axis
# Rows are sorted in descending order of race before cumsum() so that the
# running totals follow the order in which the bars are stacked.
race_table1 <- race_table |>
  arrange(desc(race)) |>
  mutate(
    prop = n / sum(n) * 100,
    ypos = cumsum(prop) - 0.5 * prop
  )

ggplot(race_table1, aes(x = "", y = prop, fill = race)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme_void() + # remove background, grid, numeric labels
  theme(legend.position = "none") + # slices are labelled directly instead
  geom_text(aes(y = ypos, label = race), color = "white") +
  scale_fill_brewer(palette = "Set1")

#============================================================
#
# Facets: display multiple plots at once, one per category
#
#============================================================

# Let's create a scatter plot of weight vs. height for each gender
ggplot(data = nhanes_tidy, aes(x = height, y = weight)) +
  geom_point() +
  facet_wrap(~gender)

#============================================================
#
# Flip coordinates
#
#============================================================

# Let's create a bar plot of the race counts and flip the coordinates.
# After coord_flip() the counts run horizontally, so the labels are nudged
# with hjust; ylim() still applies to the count scale.
ggplot(data = nhanes_tidy, aes(x = race)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), hjust = -0.3) +
  coord_flip() +
  ylim(0, 3500)