# FC721 Introduction to R Programming # Part2: Working with Data in R #----------------------------------- library(tidyverse) # Load the data # There is a "regular" R function to read csv files: read.csv() nhanes_r <- read.csv("nhanes_2018_df.csv") nhanes_tidy <- read_csv("nhanes_2018_df.csv") # Check the structure of the data str(nhanes_r) str(nhanes_tidy) # nhanes_tidy is a tibble, which is a modern version of a data frame # tibbles are part of the tidyverse, and are more user-friendly than data frames # when you work with tibbles, you can use glimpse() instead of str() glimpse(nhanes_tidy) # When you use str() and glimpse() functions, you can see the data type of each column # Make sure that the data type of each column is correct # (num) stands for double, which is a numeric data type # (int) stands for integer # (chr) stands for character, which is a text data type # (factor) stands for factor, which is a categorical data type # (logi) stands for logical, which is a boolean data type # and some others #----------------------------------- # Next we need to explore our dataset to understand its structure # We can use the following functions to explore the data: # head(), tail(), summary(), colnames(), unique(), table() # head() function shows the first 6 rows of the data head(nhanes_tidy) # To see more or less rows, you can specify the number of rows you want to see: head(nhanes_tidy, 10) # To see the last 6 rows of the data, you can use tail() function tail(nhanes_tidy) # summary() function provides a quick summary of the data summary(nhanes_tidy) # Make sure to carefully examine the output of the summary() function # It provides information about the minimum, 1st quartile, median, mean, 3rd quartile, and maximum values for each column # It also provides the number of missing values in each column # Check if the minimum and maximum values make sense for each column # Check if the number of missing values is acceptable # If there are any issues, you may need to clean the data # For the columns that are character, use unique() function to see unique values: unique(nhanes_tidy$race) # For the columns that contain categorical values, use table() function to see the frequency of each category: table(nhanes_tidy$race) # Note: categorical variable may be stored as a character or numeric data type # You may need to convert it to a factor data type using as.factor() function: nhanes_tidy$race <- as.factor(nhanes_tidy$race) # Let's check the structure of the data again glimpse(nhanes_tidy) # Notice the difference in the output of the summary() function for the race column: # It now shows the frequency of each category: summary(nhanes_tidy) #----------------------------------- # |> and %>% "pipes" #----------------------------------- # Native R's pipe symbol |> was introduced in 4.1.0 # It allows to send the output of one function as an input object to the next function: nhanes_tidy |> head() # You may see another "pipe" symbol used very frequently. # %>% "pipe" symbol was introduced in magrittr package long time ago and # then it was adopted by other packages in tidyverse library and some other R packages nhanes_tidy %>% head() #----------------------------------- # Data Manipulation #----------------------------------- # 1. Select columns you want to keep # 2. Filter rows you want to keep # 3. Mutate columns to create new columns or modify existing ones # 4. Group data (if needed) # 5. Summarize data # 1. Select columns you want to keep # Use select() function. # You can specify the columns you want to keep by their names and use # comma to select specific columns or colon to select a range of columns: df1 <- nhanes_tidy |> select(id, gender, age, bp_systolic: weight) # This is equivalent to: df1 <- select(nhanes_tidy, id, gender, age, bp_systolic: weight) # 2. Filter rows you want to keep df2 <- df1 |> filter ( ! is.na(bp_systolic) & ! is.na(bp_diastolic) ) # Useful functions for filtering: # is.na() - checks if a value is missing # %in% - checks if a value is in a list of values # between() - checks if a value is between two values # == - checks if a value is equal to another value # != - checks if a value is not equal to another value # >, <, >=, <= - checks if a value is greater than, less than, greater than or equal to, less than or equal to another value # & - logical AND # | - logical OR nhanes_tidy |> filter ( between(age, 18, 65) ) |> head() nhanes_tidy |> filter ( race %in% c("Non-Hispanic Black", "Mexican American") ) |> head() # 3. Mutate columns to create new columns or modify existing ones df3 <- df2 |> mutate( height_ft = height / 30.48) head(df3) summary(df3$height_ft) # Useful functions for mutating: # if_else() - conditional statement # case_when() - multiple conditional statements df2 |> mutate(age_group = if_else(age < 18, "Child", "Adult")) |> head() # Instead of doing this in 3 separate steps, you can chain them together using pipe symbols: df <- nhanes_tidy |> select(id,gender, age, bp_systolic: weight) |> filter( ! is.na(bp_systolic) & ! is.na(bp_diastolic) ) |> mutate( height_ft = height / 30.48 ) # 4. - 5. Group (if needed) and summarize data: nhanes_tidy |> group_by(gender) |> summarize(mean_age = mean(age), mean_weight = mean(weight, na.rm = TRUE), mean_height = mean(height, na.rm = TRUE)) # You can also use a new ".by" argument in summarize() function: nhanes_tidy |> summarize(mean_age = mean(age), mean_weight = mean(weight, na.rm = TRUE), mean_height = mean(height, na.rm = TRUE), .by = c(gender, race)) # use n() function to count the number of rows in each group: nhanes_tidy |> group_by(race) |> summarize(N= n(), mean_age = mean(age)) #----------------------------------- # Exercise #----------------------------------- # 1. Using nhanes_tidy dataset select the columns id, gender and age # 2. Filter the rows where age is greater than 18 # 3. Mutate the dataset to create a new column bmi: bmi = weight / (height / 100)^2 # 4. Save the result to a new dataset called nhanes_adults # 5. Using write_csv() function save the resulting dataset to a new csv file called nhanes_adults.csv #----------------------------------- #----------------------------------- # A few more useful functions #----------------------------------- # 1. arrange() function is used to sort the data by one or more columns nhanes_tidy |> arrange(age) |> head() # 2. distinct() function is used to remove duplicate rows nhanes_tidy |> distinct() |> head() # 3. rename() function is used to rename columns nhanes_tidy |> rename(Height = height) |> head() # 4. count() function is used to count the number of rows for each group nhanes_tidy |> count(race) #----------------------------------- # You may find table1 package useful for creating table1 summaries #----------------------------------- install.packages("table1") library(table1) table1( ~ age + bp_systolic + bp_diastolic + height + weight | race, data = nhanes_tidy)