# Introductory dplyr walkthrough: import two CSV files (hospital visits and
# patients), explore them, demonstrate the core dplyr verbs, merge the two
# tables, and run a chi-square test of independence.

library(tidyverse)

### Useful links:
# R Markdown Cheatsheet: https://www.rstudio.com/wp-content/uploads/2015/02/rmarkdown-cheatsheet.pdf
# dplyr CheatSheet: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
# dplyr Tutorial: https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html

### Data Science Workflow:
# * Import
# * Clean
# * Understand
#   + Transform
#   + Visualize
#   + Model
# * Communicate

### Import the data
# There are several ways to read csv and other types of files in R.
# We will use the read_csv() function from the readr package:
visit <- read_csv("http://rcs.bu.edu/classes/FC764/VisitData.csv")
patient <- read_csv("http://rcs.bu.edu/classes/FC764/PatientData.csv")

### Explore the data
# Once the data are read, we can explore them:
head(visit)
str(visit)
summary(visit)

# Now we will use the dplyr package to manipulate the data:
glimpse(visit)

### Basic dplyr functions
# * select()     select columns
# * filter()     filter rows
# * arrange()    re-order or arrange rows
# * mutate()     create new columns
# * summarise()  summarise values
# * group_by()   allows for group operations in the "split-apply-combine" concept

### Some other dplyr functions include:
# count(), rename(), distinct(), case_when()

### Understand the Data
table(visit$drug.use)

### Selecting columns using select()
someData <- select(visit, ID, problem)
head(someData)

# To select all columns except a specific one, use "-" (subtraction):
someData <- select(visit, -problem)
head(someData)

# To select a range of columns, use the ":" (colon) operator:
someData <- select(visit, ID:drug.use)
head(someData)

# Some additional helpers select columns based on their names:
# starts_with() = Select columns that start with a character string
# ends_with()   = Select columns that end with a character string
# contains()    = Select columns that contain a character string
# matches()     = Select columns that match a regular expression
# one_of()      = Select columns whose names are from a group of names
#                 (superseded by any_of() / all_of() in current tidyselect)

### Selecting rows using filter()
someData <- filter(visit, problem == "Anemia")
head(someData)

# We might want to select a number of "problems":
someData <- filter(visit, problem %in% c("Asthma", "Bronchitis"))
head(someData)

# We can also specify multiple criteria (they are combined with AND):
someData <- filter(
  visit,
  problem == "Anemia",
  hosp_admsn_date >= as.Date("2017-01-01")
)
head(someData)

### Modifying the data using mutate()
someData <- mutate(visit, lengthOfStay = hosp_disch_date - hosp_admsn_date)
glimpse(someData)

### Pipe operators: |> and %>%
visit |>
  filter(problem %in% c("Asthma", "Bronchitis")) |>
  head()

### Sort using arrange():
visit |>
  select(ID:hosp_disch_date, problem) |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  arrange(lengthOfStay) |>
  head()

# Looks like we found a problem with our data set: the length of stay for a
# couple of observations is negative. We should fix it by marking these
# observations as missing.
# Note: ifelse() drops the difftime class, so from here on lengthOfStay is a
# plain number expressed in the units of the date difference.
visit |>
  select(ID:hosp_disch_date, problem) |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  mutate(lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay)) |>
  arrange(lengthOfStay) |>
  head()

# We can also sort in reverse order (missing values still go last):
visit |>
  select(ID:hosp_disch_date, problem) |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  mutate(lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay)) |>
  arrange(desc(lengthOfStay)) |>
  head()

### Create summaries using summarise() function
visit |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  mutate(lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay)) |>
  summarise(
    aveLengthOfStay = mean(lengthOfStay, na.rm = TRUE),
    maxLengthOfStay = max(lengthOfStay, na.rm = TRUE)
  )

### Summarise data for each category using group_by()
visit |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  mutate(lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay)) |>
  group_by(problem) |>
  summarise(
    aveLengthOfStay = mean(lengthOfStay, na.rm = TRUE),
    maxLengthOfStay = max(lengthOfStay, na.rm = TRUE)
  ) |>
  arrange(desc(aveLengthOfStay))

### Merging 2 datasets together
# Left join on ID: keep every visit, attach patient columns where available.
merged_data <- merge(visit, patient, by = "ID", all.x = TRUE, all.y = FALSE)

### Basic statistics
# Is smoking more prevalent among women (in this population)?
# We will use Chi-square test of independence
table(merged_data$smoke, merged_data$sex)

# First let's remove the "unknown" category
smoke.data <- merged_data |>
  filter(smoke %in% c("non-smoker", "smoker"))

# Perform the test
chisq.test(smoke.data$smoke, smoke.data$sex)

# More examples of statistical tests in R (and other languages)
## https://stats.idre.ucla.edu/other/mult-pkg/whatstat/
## https://stats.idre.ucla.edu/r/whatstat/what-statistical-analysis-should-i-usestatistical-analyses-using-r/