There are a few ways csv and other types of files can be read in R. We will use read_csv() function from readr package:
library(readr)
visit <- read_csv("http://rcs.bu.edu/classes/FC764/VisitData.csv")
patient <- read_csv("http://rcs.bu.edu/classes/FC764/PatientData.csv")
Now, once we read the data, we can explore it:
head(visit)
## # A tibble: 6 × 9
## ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
## <dbl> <date> <date> <lgl> <lgl> <lgl> <chr> <lgl>
## 1 249930 2017-04-14 2017-04-18 FALSE FALSE FALSE smok… FALSE
## 2 294082 2017-06-16 2017-06-19 FALSE FALSE FALSE non-… FALSE
## 3 295729 2016-09-05 2016-09-07 FALSE FALSE FALSE non-… FALSE
## 4 304529 2016-10-03 2016-10-08 FALSE FALSE FALSE non-… FALSE
## 5 304529 2016-12-24 2016-12-25 FALSE FALSE FALSE non-… FALSE
## 6 305944 2016-10-26 2016-10-29 FALSE FALSE FALSE non-… FALSE
## # … with 1 more variable: problem <chr>
str(visit)
## spec_tbl_df [27,084 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : num [1:27084] 249930 294082 295729 304529 304529 ...
## $ hosp_admsn_date: Date[1:27084], format: "2017-04-14" "2017-06-16" ...
## $ hosp_disch_date: Date[1:27084], format: "2017-04-18" "2017-06-19" ...
## $ Pneumonia : logi [1:27084] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Stroke : logi [1:27084] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Alcohol : logi [1:27084] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ smoke : chr [1:27084] "smoker" "non-smoker" "non-smoker" "non-smoker" ...
## $ drug.use : logi [1:27084] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ problem : chr [1:27084] "Anemia" "Hypertension" "Anemia" "Asthma" ...
## - attr(*, "spec")=
## .. cols(
## .. ID = col_double(),
## .. hosp_admsn_date = col_date(format = ""),
## .. hosp_disch_date = col_date(format = ""),
## .. Pneumonia = col_logical(),
## .. Stroke = col_logical(),
## .. Alcohol = col_logical(),
## .. smoke = col_character(),
## .. drug.use = col_logical(),
## .. problem = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
summary(visit)
## ID hosp_admsn_date hosp_disch_date Pneumonia
## Min. : 249930 Min. :2016-07-01 Min. :2016-07-01 Mode :logical
## 1st Qu.:2531878 1st Qu.:2016-09-29 1st Qu.:2016-10-04 FALSE:24991
## Median :3132171 Median :2016-12-30 Median :2017-01-04 TRUE :2093
## Mean :3080644 Mean :2016-12-29 Mean :2017-01-03
## 3rd Qu.:3881028 3rd Qu.:2017-03-30 3rd Qu.:2017-04-04
## Max. :4086138 Max. :2017-09-26 Max. :2017-09-27
## Stroke Alcohol smoke drug.use
## Mode :logical Mode :logical Length:27084 Mode :logical
## FALSE:25243 FALSE:23538 Class :character FALSE:24585
## TRUE :1841 TRUE :3546 Mode :character TRUE :2499
##
##
##
## problem
## Length:27084
## Class :character
## Mode :character
##
##
##
Now we will use dplyr package to manipulate the data:
library(dplyr)
glimpse(visit)
## Rows: 27,084
## Columns: 9
## $ ID <dbl> 249930, 294082, 295729, 304529, 304529, 305944, 305944…
## $ hosp_admsn_date <date> 2017-04-14, 2017-06-16, 2016-09-05, 2016-10-03, 2016-…
## $ hosp_disch_date <date> 2017-04-18, 2017-06-19, 2016-09-07, 2016-10-08, 2016-…
## $ Pneumonia <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Stroke <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Alcohol <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ smoke <chr> "smoker", "non-smoker", "non-smoker", "non-smoker", "n…
## $ drug.use <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ problem <chr> "Anemia", "Hypertension", "Anemia", "Asthma", "Asthma"…
table(visit$drug.use)
##
## FALSE TRUE
## 24585 2499
someData <- select(visit, ID, problem)
head(someData)
## # A tibble: 6 × 2
## ID problem
## <dbl> <chr>
## 1 249930 Anemia
## 2 294082 Hypertension
## 3 295729 Anemia
## 4 304529 Asthma
## 5 304529 Asthma
## 6 305944 Osteoarthritis
To select a columns except a specific column, we can use “-” (subtraction):
someData <- select(visit, -problem)
head(someData)
## # A tibble: 6 × 8
## ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
## <dbl> <date> <date> <lgl> <lgl> <lgl> <chr> <lgl>
## 1 249930 2017-04-14 2017-04-18 FALSE FALSE FALSE smok… FALSE
## 2 294082 2017-06-16 2017-06-19 FALSE FALSE FALSE non-… FALSE
## 3 295729 2016-09-05 2016-09-07 FALSE FALSE FALSE non-… FALSE
## 4 304529 2016-10-03 2016-10-08 FALSE FALSE FALSE non-… FALSE
## 5 304529 2016-12-24 2016-12-25 FALSE FALSE FALSE non-… FALSE
## 6 305944 2016-10-26 2016-10-29 FALSE FALSE FALSE non-… FALSE
To select a range of columns, use “:” (colon) operator
someData <- select(visit, ID:drug.use)
head(someData)
## # A tibble: 6 × 8
## ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
## <dbl> <date> <date> <lgl> <lgl> <lgl> <chr> <lgl>
## 1 249930 2017-04-14 2017-04-18 FALSE FALSE FALSE smok… FALSE
## 2 294082 2017-06-16 2017-06-19 FALSE FALSE FALSE non-… FALSE
## 3 295729 2016-09-05 2016-09-07 FALSE FALSE FALSE non-… FALSE
## 4 304529 2016-10-03 2016-10-08 FALSE FALSE FALSE non-… FALSE
## 5 304529 2016-12-24 2016-12-25 FALSE FALSE FALSE non-… FALSE
## 6 305944 2016-10-26 2016-10-29 FALSE FALSE FALSE non-… FALSE
To select columns that start or end with a specific string:
someData <- select(visit, ID, ends_with("date"))
head(someData)
## # A tibble: 6 × 3
## ID hosp_admsn_date hosp_disch_date
## <dbl> <date> <date>
## 1 249930 2017-04-14 2017-04-18
## 2 294082 2017-06-16 2017-06-19
## 3 295729 2016-09-05 2016-09-07
## 4 304529 2016-10-03 2016-10-08
## 5 304529 2016-12-24 2016-12-25
## 6 305944 2016-10-26 2016-10-29
Some additional options to select columns based on a specific substring in their names:
someData <- filter(visit, problem == "Anemia")
head(someData)
## # A tibble: 6 × 9
## ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
## <dbl> <date> <date> <lgl> <lgl> <lgl> <chr> <lgl>
## 1 249930 2017-04-14 2017-04-18 FALSE FALSE FALSE smok… FALSE
## 2 295729 2016-09-05 2016-09-07 FALSE FALSE FALSE non-… FALSE
## 3 341732 2016-07-06 2016-07-08 FALSE FALSE FALSE non-… FALSE
## 4 361103 2017-02-20 2017-02-26 FALSE FALSE FALSE non-… FALSE
## 5 361103 2017-03-02 2017-03-03 FALSE FALSE FALSE non-… FALSE
## 6 361103 2017-03-05 2017-03-08 FALSE FALSE FALSE non-… FALSE
## # … with 1 more variable: problem <chr>
We might want to select a number of “problems”:
someData <- filter(visit, problem %in% c("Asthma", "Bronchitis") )
head(someData)
## # A tibble: 6 × 9
## ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
## <dbl> <date> <date> <lgl> <lgl> <lgl> <chr> <lgl>
## 1 304529 2016-10-03 2016-10-08 FALSE FALSE FALSE non-… FALSE
## 2 304529 2016-12-24 2016-12-25 FALSE FALSE FALSE non-… FALSE
## 3 526261 2017-06-10 2017-06-12 FALSE FALSE FALSE non-… FALSE
## 4 530087 2017-02-13 2017-02-13 FALSE FALSE FALSE non-… FALSE
## 5 530087 2017-02-19 2017-02-20 FALSE FALSE FALSE non-… FALSE
## 6 530087 2017-03-09 2017-03-13 FALSE FALSE FALSE non-… FALSE
## # … with 1 more variable: problem <chr>
We can also specify multiple criterias
someData <- filter(visit, problem == "Anemia", hosp_admsn_date >=as.Date("2017-01-01"))
head(someData)
## # A tibble: 6 × 9
## ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
## <dbl> <date> <date> <lgl> <lgl> <lgl> <chr> <lgl>
## 1 249930 2017-04-14 2017-04-18 FALSE FALSE FALSE smok… FALSE
## 2 361103 2017-02-20 2017-02-26 FALSE FALSE FALSE non-… FALSE
## 3 361103 2017-03-02 2017-03-03 FALSE FALSE FALSE non-… FALSE
## 4 361103 2017-03-05 2017-03-08 FALSE FALSE FALSE non-… FALSE
## 5 361103 2017-04-07 2017-04-13 FALSE FALSE FALSE non-… FALSE
## 6 383238 2017-06-23 2017-06-27 FALSE TRUE FALSE non-… FALSE
## # … with 1 more variable: problem <chr>
someData <- mutate(visit, lengthOfStay = hosp_disch_date - hosp_admsn_date)
glimpse(someData)
## Rows: 27,084
## Columns: 10
## $ ID <dbl> 249930, 294082, 295729, 304529, 304529, 305944, 305944…
## $ hosp_admsn_date <date> 2017-04-14, 2017-06-16, 2016-09-05, 2016-10-03, 2016-…
## $ hosp_disch_date <date> 2017-04-18, 2017-06-19, 2016-09-07, 2016-10-08, 2016-…
## $ Pneumonia <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Stroke <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Alcohol <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ smoke <chr> "smoker", "non-smoker", "non-smoker", "non-smoker", "n…
## $ drug.use <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ problem <chr> "Anemia", "Hypertension", "Anemia", "Asthma", "Asthma"…
## $ lengthOfStay <drtn> 4 days, 3 days, 2 days, 5 days, 1 days, 3 days, 1 day…
Dplyr package uses the pipe operator from another package (magrittr). It allows to pipe the output from the previous function to the input of the next function. So we can simply do:
visit %>%
filter( problem %in% c("Asthma", "Bronchitis") ) %>%
head()
## # A tibble: 6 × 9
## ID hosp_admsn_date hosp_disch_date Pneumonia Stroke Alcohol smoke drug.use
## <dbl> <date> <date> <lgl> <lgl> <lgl> <chr> <lgl>
## 1 304529 2016-10-03 2016-10-08 FALSE FALSE FALSE non-… FALSE
## 2 304529 2016-12-24 2016-12-25 FALSE FALSE FALSE non-… FALSE
## 3 526261 2017-06-10 2017-06-12 FALSE FALSE FALSE non-… FALSE
## 4 530087 2017-02-13 2017-02-13 FALSE FALSE FALSE non-… FALSE
## 5 530087 2017-02-19 2017-02-20 FALSE FALSE FALSE non-… FALSE
## 6 530087 2017-03-09 2017-03-13 FALSE FALSE FALSE non-… FALSE
## # … with 1 more variable: problem <chr>
visit %>%
select( ID: hosp_disch_date, problem) %>%
mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
arrange( lengthOfStay ) %>%
head()
## # A tibble: 6 × 5
## ID hosp_admsn_date hosp_disch_date problem lengthOfStay
## <dbl> <date> <date> <chr> <drtn>
## 1 2444137 2017-06-05 2017-05-01 Hypertension -35 days
## 2 2444137 2017-06-05 2017-05-01 Hypertension -35 days
## 3 380746 2016-07-13 2016-07-13 Osteoarthritis 0 days
## 4 413834 2016-08-23 2016-08-23 Hypertension 0 days
## 5 522412 2016-09-24 2016-09-24 Anemia 0 days
## 6 524464 2017-05-02 2017-05-02 Anemia 0 days
Looks like we found a problem with our data set. The length of stay for a couple of observations is negative. We should fix it. We will mark these observations as missing:
visit %>%
select( ID: hosp_disch_date, problem) %>%
mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
mutate( lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay) ) %>%
arrange( lengthOfStay ) %>%
head()
## # A tibble: 6 × 5
## ID hosp_admsn_date hosp_disch_date problem lengthOfStay
## <dbl> <date> <date> <chr> <dbl>
## 1 380746 2016-07-13 2016-07-13 Osteoarthritis 0
## 2 413834 2016-08-23 2016-08-23 Hypertension 0
## 3 522412 2016-09-24 2016-09-24 Anemia 0
## 4 524464 2017-05-02 2017-05-02 Anemia 0
## 5 529637 2016-09-16 2016-09-16 Anemia 0
## 6 530087 2017-02-13 2017-02-13 Asthma 0
We can also sort in reverse order:
visit %>%
select( ID: hosp_disch_date, problem) %>%
mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
mutate( lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay) ) %>%
arrange( -lengthOfStay ) %>%
head()
## # A tibble: 6 × 5
## ID hosp_admsn_date hosp_disch_date problem lengthOfStay
## <dbl> <date> <date> <chr> <dbl>
## 1 1967757 2016-08-26 2017-05-19 Anemia 266
## 2 2065107 2016-09-24 2017-05-11 Hypertension 229
## 3 2617879 2016-11-25 2017-06-02 Hyperlipidemia 189
## 4 622433 2017-03-04 2017-08-31 Other 180
## 5 3511461 2017-03-24 2017-09-20 Anemia 180
## 6 4061773 2017-02-28 2017-07-26 Anemia 148
visit %>%
mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
mutate( lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay) ) %>%
summarise( aveLengthOfStay = mean(lengthOfStay , na.rm=T),
maxLengthOfStay = max(lengthOfStay , na.rm=T))
## # A tibble: 1 × 2
## aveLengthOfStay maxLengthOfStay
## <dbl> <dbl>
## 1 4.27 266
visit %>%
mutate( lengthOfStay = hosp_disch_date - hosp_admsn_date) %>%
mutate( lengthOfStay = ifelse(lengthOfStay < 0, NA, lengthOfStay) ) %>%
group_by(problem) %>%
summarise( aveLengthOfStay = mean(lengthOfStay , na.rm=T),
maxLengthOfStay = max(lengthOfStay , na.rm=T)) %>%
arrange( -aveLengthOfStay )
## # A tibble: 36 × 3
## problem aveLengthOfStay maxLengthOfStay
## <chr> <dbl> <dbl>
## 1 Ischemic stroke 6.53 49
## 2 heart failure 5.88 136
## 3 Ischemic cardiomyopathy 5.76 52
## 4 Hepatitis C 5.57 78
## 5 Anemia 5.51 266
## 6 Chronic Bronchitis 5.46 28
## 7 Hyponatremia 5.28 43
## 8 Pneumonia 5.25 78
## 9 Insomnia 5.18 47
## 10 Chronic ischemic heart disease 5.09 88
## # … with 26 more rows