Data Science Workflow:

Import the data

There are a few ways csv and other types of files can be read in R. We will use read_csv() function from readr package:

library(readr)
visit <- read_csv("http://rcs.bu.edu/classes/FC764/VisitData.csv")
patient <- read_csv("http://rcs.bu.edu/classes/FC764/PatientData.csv")

Explore the data

Now, once we read the data, we can explore it:

head(visit)
## # A tibble: 6 × 9
##       ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
##    <dbl> <date>          <date>          <lgl>     <lgl>  <lgl>   <chr> <lgl>   
## 1 249930 2017-04-14      2017-04-18      FALSE     FALSE  FALSE   smok… FALSE   
## 2 294082 2017-06-16      2017-06-19      FALSE     FALSE  FALSE   non-… FALSE   
## 3 295729 2016-09-05      2016-09-07      FALSE     FALSE  FALSE   non-… FALSE   
## 4 304529 2016-10-03      2016-10-08      FALSE     FALSE  FALSE   non-… FALSE   
## 5 304529 2016-12-24      2016-12-25      FALSE     FALSE  FALSE   non-… FALSE   
## 6 305944 2016-10-26      2016-10-29      FALSE     FALSE  FALSE   non-… FALSE   
## # … with 1 more variable: problem <chr>
str(visit)
## spec_tbl_df [27,084 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID             : num [1:27084] 249930 294082 295729 304529 304529 ...
##  $ hosp_admsn_date: Date[1:27084], format: "2017-04-14" "2017-06-16" ...
##  $ hosp_disch_date: Date[1:27084], format: "2017-04-18" "2017-06-19" ...
##  $ Pneumonia      : logi [1:27084] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Stroke         : logi [1:27084] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Alcohol        : logi [1:27084] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ smoke          : chr [1:27084] "smoker" "non-smoker" "non-smoker" "non-smoker" ...
##  $ drug.use       : logi [1:27084] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ problem        : chr [1:27084] "Anemia" "Hypertension" "Anemia" "Asthma" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ID = col_double(),
##   ..   hosp_admsn_date = col_date(format = ""),
##   ..   hosp_disch_date = col_date(format = ""),
##   ..   Pneumonia = col_logical(),
##   ..   Stroke = col_logical(),
##   ..   Alcohol = col_logical(),
##   ..   smoke = col_character(),
##   ..   drug.use = col_logical(),
##   ..   problem = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(visit)
##        ID          hosp_admsn_date      hosp_disch_date      Pneumonia      
##  Min.   : 249930   Min.   :2016-07-01   Min.   :2016-07-01   Mode :logical  
##  1st Qu.:2531878   1st Qu.:2016-09-29   1st Qu.:2016-10-04   FALSE:24991    
##  Median :3132171   Median :2016-12-30   Median :2017-01-04   TRUE :2093     
##  Mean   :3080644   Mean   :2016-12-29   Mean   :2017-01-03                  
##  3rd Qu.:3881028   3rd Qu.:2017-03-30   3rd Qu.:2017-04-04                  
##  Max.   :4086138   Max.   :2017-09-26   Max.   :2017-09-27                  
##    Stroke         Alcohol           smoke            drug.use      
##  Mode :logical   Mode :logical   Length:27084       Mode :logical  
##  FALSE:25243     FALSE:23538     Class :character   FALSE:24585    
##  TRUE :1841      TRUE :3546      Mode  :character   TRUE :2499     
##                                                                    
##                                                                    
##                                                                    
##    problem         
##  Length:27084      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Now we will use dplyr package to manipulate the data:

library(dplyr)
glimpse(visit)
## Rows: 27,084
## Columns: 9
## $ ID              <dbl> 249930, 294082, 295729, 304529, 304529, 305944, 305944…
## $ hosp_admsn_date <date> 2017-04-14, 2017-06-16, 2016-09-05, 2016-10-03, 2016-…
## $ hosp_disch_date <date> 2017-04-18, 2017-06-19, 2016-09-07, 2016-10-08, 2016-…
## $ Pneumonia       <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Stroke          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Alcohol         <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ smoke           <chr> "smoker", "non-smoker", "non-smoker", "non-smoker", "n…
## $ drug.use        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ problem         <chr> "Anemia", "Hypertension", "Anemia", "Asthma", "Asthma"…

Basic dplyr functions

Understand the Data

table(visit$drug.use)
## 
## FALSE  TRUE 
## 24585  2499

Selecting columns using select()

someData <- select(visit, ID, problem)
head(someData)
## # A tibble: 6 × 2
##       ID problem       
##    <dbl> <chr>         
## 1 249930 Anemia        
## 2 294082 Hypertension  
## 3 295729 Anemia        
## 4 304529 Asthma        
## 5 304529 Asthma        
## 6 305944 Osteoarthritis

To select a columns except a specific column, we can use “-” (subtraction):

someData <- select(visit, -problem)
head(someData)
## # A tibble: 6 × 8
##       ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
##    <dbl> <date>          <date>          <lgl>     <lgl>  <lgl>   <chr> <lgl>   
## 1 249930 2017-04-14      2017-04-18      FALSE     FALSE  FALSE   smok… FALSE   
## 2 294082 2017-06-16      2017-06-19      FALSE     FALSE  FALSE   non-… FALSE   
## 3 295729 2016-09-05      2016-09-07      FALSE     FALSE  FALSE   non-… FALSE   
## 4 304529 2016-10-03      2016-10-08      FALSE     FALSE  FALSE   non-… FALSE   
## 5 304529 2016-12-24      2016-12-25      FALSE     FALSE  FALSE   non-… FALSE   
## 6 305944 2016-10-26      2016-10-29      FALSE     FALSE  FALSE   non-… FALSE

To select a range of columns, use “:” (colon) operator

someData <- select(visit, ID:drug.use)
head(someData)
## # A tibble: 6 × 8
##       ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
##    <dbl> <date>          <date>          <lgl>     <lgl>  <lgl>   <chr> <lgl>   
## 1 249930 2017-04-14      2017-04-18      FALSE     FALSE  FALSE   smok… FALSE   
## 2 294082 2017-06-16      2017-06-19      FALSE     FALSE  FALSE   non-… FALSE   
## 3 295729 2016-09-05      2016-09-07      FALSE     FALSE  FALSE   non-… FALSE   
## 4 304529 2016-10-03      2016-10-08      FALSE     FALSE  FALSE   non-… FALSE   
## 5 304529 2016-12-24      2016-12-25      FALSE     FALSE  FALSE   non-… FALSE   
## 6 305944 2016-10-26      2016-10-29      FALSE     FALSE  FALSE   non-… FALSE

To select columns that start or end with a specific string:

someData <- select(visit, ID, ends_with("date"))
head(someData)
## # A tibble: 6 × 3
##       ID hosp_admsn_date hosp_disch_date
##    <dbl> <date>          <date>         
## 1 249930 2017-04-14      2017-04-18     
## 2 294082 2017-06-16      2017-06-19     
## 3 295729 2016-09-05      2016-09-07     
## 4 304529 2016-10-03      2016-10-08     
## 5 304529 2016-12-24      2016-12-25     
## 6 305944 2016-10-26      2016-10-29

Some additional options to select columns based on a specific substring in their names:

Selecting columns using filter()

someData <- filter(visit, problem == "Anemia")
head(someData)
## # A tibble: 6 × 9
##       ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
##    <dbl> <date>          <date>          <lgl>     <lgl>  <lgl>   <chr> <lgl>   
## 1 249930 2017-04-14      2017-04-18      FALSE     FALSE  FALSE   smok… FALSE   
## 2 295729 2016-09-05      2016-09-07      FALSE     FALSE  FALSE   non-… FALSE   
## 3 341732 2016-07-06      2016-07-08      FALSE     FALSE  FALSE   non-… FALSE   
## 4 361103 2017-02-20      2017-02-26      FALSE     FALSE  FALSE   non-… FALSE   
## 5 361103 2017-03-02      2017-03-03      FALSE     FALSE  FALSE   non-… FALSE   
## 6 361103 2017-03-05      2017-03-08      FALSE     FALSE  FALSE   non-… FALSE   
## # … with 1 more variable: problem <chr>

We might want to select a number of “problems”:

someData <- filter(visit, problem %in% c("Asthma", "Bronchitis") )
head(someData)
## # A tibble: 6 × 9
##       ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
##    <dbl> <date>          <date>          <lgl>     <lgl>  <lgl>   <chr> <lgl>   
## 1 304529 2016-10-03      2016-10-08      FALSE     FALSE  FALSE   non-… FALSE   
## 2 304529 2016-12-24      2016-12-25      FALSE     FALSE  FALSE   non-… FALSE   
## 3 526261 2017-06-10      2017-06-12      FALSE     FALSE  FALSE   non-… FALSE   
## 4 530087 2017-02-13      2017-02-13      FALSE     FALSE  FALSE   non-… FALSE   
## 5 530087 2017-02-19      2017-02-20      FALSE     FALSE  FALSE   non-… FALSE   
## 6 530087 2017-03-09      2017-03-13      FALSE     FALSE  FALSE   non-… FALSE   
## # … with 1 more variable: problem <chr>

We can also specify multiple criterias

someData <- filter(visit, problem == "Anemia", hosp_admsn_date >=as.Date("2017-01-01"))
head(someData)
## # A tibble: 6 × 9
##       ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
##    <dbl> <date>          <date>          <lgl>     <lgl>  <lgl>   <chr> <lgl>   
## 1 249930 2017-04-14      2017-04-18      FALSE     FALSE  FALSE   smok… FALSE   
## 2 361103 2017-02-20      2017-02-26      FALSE     FALSE  FALSE   non-… FALSE   
## 3 361103 2017-03-02      2017-03-03      FALSE     FALSE  FALSE   non-… FALSE   
## 4 361103 2017-03-05      2017-03-08      FALSE     FALSE  FALSE   non-… FALSE   
## 5 361103 2017-04-07      2017-04-13      FALSE     FALSE  FALSE   non-… FALSE   
## 6 383238 2017-06-23      2017-06-27      FALSE     TRUE   FALSE   non-… FALSE   
## # … with 1 more variable: problem <chr>

Modifying the data using mutate()

someData <- mutate(visit, lengthOfStay = hosp_disch_date - hosp_admsn_date)
glimpse(someData)
## Rows: 27,084
## Columns: 10
## $ ID              <dbl> 249930, 294082, 295729, 304529, 304529, 305944, 305944…
## $ hosp_admsn_date <date> 2017-04-14, 2017-06-16, 2016-09-05, 2016-10-03, 2016-…
## $ hosp_disch_date <date> 2017-04-18, 2017-06-19, 2016-09-07, 2016-10-08, 2016-…
## $ Pneumonia       <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Stroke          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Alcohol         <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ smoke           <chr> "smoker", "non-smoker", "non-smoker", "non-smoker", "n…
## $ drug.use        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ problem         <chr> "Anemia", "Hypertension", "Anemia", "Asthma", "Asthma"…
## $ lengthOfStay    <drtn> 4 days, 3 days, 2 days, 5 days, 1 days, 3 days, 1 day…

Pipe operator: %>%

Dplyr package uses the pipe operator from another package (magrittr). It allows to pipe the output from the previous function to the input of the next function. So we can simply do:

visit %>%
  filter( problem %in% c("Asthma", "Bronchitis") ) %>%
  head()
## # A tibble: 6 × 9
##       ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
##    <dbl> <date>          <date>          <lgl>     <lgl>  <lgl>   <chr> <lgl>   
## 1 304529 2016-10-03      2016-10-08      FALSE     FALSE  FALSE   non-… FALSE   
## 2 304529 2016-12-24      2016-12-25      FALSE     FALSE  FALSE   non-… FALSE   
## 3 526261 2017-06-10      2017-06-12      FALSE     FALSE  FALSE   non-… FALSE   
## 4 530087 2017-02-13      2017-02-13      FALSE     FALSE  FALSE   non-… FALSE   
## 5 530087 2017-02-19      2017-02-20      FALSE     FALSE  FALSE   non-… FALSE   
## 6 530087 2017-03-09      2017-03-13      FALSE     FALSE  FALSE   non-… FALSE   
## # … with 1 more variable: problem <chr>

Sort using arrange():

visit %>%
  select( ID: hosp_disch_date, problem) %>%
  mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
  arrange( lengthOfStay  ) %>%
  head()
## # A tibble: 6 × 5
##        ID hosp_admsn_date hosp_disch_date problem        lengthOfStay
##     <dbl> <date>          <date>          <chr>          <drtn>      
## 1 2444137 2017-06-05      2017-05-01      Hypertension   -35 days    
## 2 2444137 2017-06-05      2017-05-01      Hypertension   -35 days    
## 3  380746 2016-07-13      2016-07-13      Osteoarthritis   0 days    
## 4  413834 2016-08-23      2016-08-23      Hypertension     0 days    
## 5  522412 2016-09-24      2016-09-24      Anemia           0 days    
## 6  524464 2017-05-02      2017-05-02      Anemia           0 days

Looks like we found a problem with our data set. The length of stay for a couple of observations is negative. We should fix it. We will mark these observations as missing:

visit %>%
  select( ID: hosp_disch_date, problem) %>%
  mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
  mutate( lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay) ) %>%
  arrange( lengthOfStay  ) %>%
  head()
## # A tibble: 6 × 5
##       ID hosp_admsn_date hosp_disch_date problem        lengthOfStay
##    <dbl> <date>          <date>          <chr>                 <dbl>
## 1 380746 2016-07-13      2016-07-13      Osteoarthritis            0
## 2 413834 2016-08-23      2016-08-23      Hypertension              0
## 3 522412 2016-09-24      2016-09-24      Anemia                    0
## 4 524464 2017-05-02      2017-05-02      Anemia                    0
## 5 529637 2016-09-16      2016-09-16      Anemia                    0
## 6 530087 2017-02-13      2017-02-13      Asthma                    0

We can also sort in reverse order:

visit %>%
  select( ID: hosp_disch_date, problem) %>%
  mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
  mutate( lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay) ) %>%
  arrange( -lengthOfStay  ) %>%
  head()
## # A tibble: 6 × 5
##        ID hosp_admsn_date hosp_disch_date problem        lengthOfStay
##     <dbl> <date>          <date>          <chr>                 <dbl>
## 1 1967757 2016-08-26      2017-05-19      Anemia                  266
## 2 2065107 2016-09-24      2017-05-11      Hypertension            229
## 3 2617879 2016-11-25      2017-06-02      Hyperlipidemia          189
## 4  622433 2017-03-04      2017-08-31      Other                   180
## 5 3511461 2017-03-24      2017-09-20      Anemia                  180
## 6 4061773 2017-02-28      2017-07-26      Anemia                  148

Create summaries using summarise() function

visit %>%
  mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
  mutate( lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay) ) %>%
  summarise( aveLengthOfStay = mean(lengthOfStay , na.rm=T),
             maxLengthOfStay = max(lengthOfStay , na.rm=T))  
## # A tibble: 1 × 2
##   aveLengthOfStay maxLengthOfStay
##             <dbl>           <dbl>
## 1            4.27             266

Summarise data for each category using group_by()

visit %>%
  mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
  mutate( lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay) ) %>%
  group_by(problem) %>%
  summarise( aveLengthOfStay = mean(lengthOfStay , na.rm=T),
             maxLengthOfStay = max(lengthOfStay , na.rm=T))   %>%
  arrange( -aveLengthOfStay )
## # A tibble: 36 × 3
##    problem                        aveLengthOfStay maxLengthOfStay
##    <chr>                                    <dbl>           <dbl>
##  1 Ischemic stroke                           6.53              49
##  2 heart failure                             5.88             136
##  3 Ischemic cardiomyopathy                   5.76              52
##  4 Hepatitis C                               5.57              78
##  5 Anemia                                    5.51             266
##  6 Chronic Bronchitis                        5.46              28
##  7 Hyponatremia                              5.28              43
##  8 Pneumonia                                 5.25              78
##  9 Insomnia                                  5.18              47
## 10 Chronic ischemic heart disease            5.09              88
## # … with 26 more rows