library(tidyverse)

### Useful links:
# R Markdown Cheatsheet: https://www.rstudio.com/wp-content/uploads/2015/02/rmarkdown-cheatsheet.pdf
# dplyr CheatSheet:      https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
# dplyr Tutorial:        https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html

### Data Science Workflow:

# * Import
# * Clean
# * Understand
#    + Transform
#    + Visualize
#    + Model
# * Communicate


### Import the data

# There are several ways to read csv and other types of files in R.
# We will use the *read_csv()* function from the **readr** package:

# Read both data sets straight from their URLs with read_csv().
visit <- "http://rcs.bu.edu/classes/FC764/VisitData.csv" |>
  read_csv()
patient <- "http://rcs.bu.edu/classes/FC764/PatientData.csv" |>
  read_csv()


### Explore the data
# Now that we have read the data, we can explore it:
# First rows, column structure, and per-column summary of the visit data.
visit |> head()
visit |> str()
visit |> summary()

# From here on we use the **dplyr** package to manipulate the data.
# glimpse() prints a compact overview with one line per column:
visit |> glimpse()


### Basic dplyr functions
# * select()     select columns
# * filter()     filter rows
# * arrange()    re-order or arrange rows
# * mutate()     create new columns
# * summarise()  summarise values
# * group_by()   allows for group operations in the “split-apply-combine” concept


### Some other dplyr functions include:
# count(), rename(), distinct(), case_when()


### Understand the Data
# Count how often each drug.use value occurs across all visits.
visit |> pull(drug.use) |> table()


### Selecting columns using select()
# Keep only the ID and problem columns.
someData <- visit |> select(ID, problem)
someData |> head()


# To select all columns except a specific one, prefix its name with "-":
someData <- visit |> select(-problem)
someData |> head()

# To select a contiguous range of columns, use the ":" (colon) operator:
someData <- visit |> select(ID:drug.use)
someData |> head()

# Some additional helpers select columns based on their names:

# starts_with() = Select columns that start with a character string
# ends_with()   = Select columns that end with a character string
# contains()    = Select columns that contain a character string
# matches()     = Select columns that match a regular expression
# one_of()      = Select columns whose names are in a given set of names
#                 (superseded by any_of() / all_of() in current tidyselect)

### Selecting rows using filter()
# Keep only the visits whose problem is "Anemia".
someData <- visit |> filter(problem == "Anemia")
someData |> head()

# To match any of several problems, test membership with %in%:
someData <- visit |> filter(problem %in% c("Asthma", "Bronchitis"))
someData |> head()


# Several comma-separated conditions must all hold for a row to be kept:
someData <- visit |>
  filter(
    problem == "Anemia",
    hosp_admsn_date >= as.Date("2017-01-01")
  )
someData |> head()

### Modifying the data using mutate()
# Add a lengthOfStay column: discharge date minus admission date.
someData <- visit |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date)
someData |> glimpse()

### Pipe operators: |> and %>%

# A pipe passes the result on its left as the first argument of the call on
# its right. This chain uses the magrittr pipe %>%; the native pipe |> can be
# used in exactly the same way here.
visit %>%
  filter(problem %in% c("Asthma", "Bronchitis")) %>%
  head()


### Sort using arrange():

# Compute each visit's length of stay and list the shortest stays first.
visit |>
  select(ID:hosp_disch_date, problem) |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  arrange(lengthOfStay) |>
  head()

# Looks like we found a problem with our data set: the length of stay is
# negative for a couple of observations. We mark those values as missing.
# replace() is used instead of ifelse() because ifelse() drops the class and
# units of the date difference and leaves a bare number behind.
visit |>
  select(ID:hosp_disch_date, problem) |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  mutate(lengthOfStay = replace(lengthOfStay, lengthOfStay < 0, NA)) |>
  arrange(lengthOfStay) |>
  head()


# We can also sort in reverse order (longest stays first) using desc():
visit |>
  select(ID:hosp_disch_date, problem) |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  mutate(lengthOfStay = replace(lengthOfStay, lengthOfStay < 0, NA)) |>
  arrange(desc(lengthOfStay)) |>
  head()

### Create summaries using summarise() function
# Overall average and maximum length of stay. Negative (invalid) stays are set
# to NA first and then ignored through na.rm = TRUE. replace() keeps the
# class and units of the date difference, which ifelse() would strip.
visit |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  mutate(lengthOfStay = replace(lengthOfStay, lengthOfStay < 0, NA)) |>
  summarise(
    aveLengthOfStay = mean(lengthOfStay, na.rm = TRUE),
    maxLengthOfStay = max(lengthOfStay, na.rm = TRUE)
  )


### Summarise data for each category using group_by() 
# Average and maximum length of stay for each problem, with the longest
# average stay listed first. Negative (invalid) stays are set to NA first and
# then ignored through na.rm = TRUE. replace() keeps the class and units of
# the date difference, which ifelse() would strip.
visit |>
  mutate(lengthOfStay = hosp_disch_date - hosp_admsn_date) |>
  mutate(lengthOfStay = replace(lengthOfStay, lengthOfStay < 0, NA)) |>
  group_by(problem) |>
  summarise(
    aveLengthOfStay = mean(lengthOfStay, na.rm = TRUE),
    maxLengthOfStay = max(lengthOfStay, na.rm = TRUE)
  ) |>
  arrange(desc(aveLengthOfStay))
  


### Merging 2 datasets together
# Left join on ID: keep every row of visit (all.x = TRUE) and attach the
# matching patient columns; patients without a visit are dropped
# (all.y = FALSE). TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
merged_data <- merge(visit, patient, by = "ID", all.x = TRUE, all.y = FALSE)


### Basic statistics
# Is smoking more prevalent among women (in this population)?
# We will use Chi-square test of independence
# Cross-tabulate smoking status against sex.
table(merged_data$smoke, merged_data$sex)

# Keep only the "non-smoker" and "smoker" rows; every other value, such as
# "unknown", is dropped.
smoke.data <- filter(merged_data, smoke %in% c("non-smoker", "smoker"))
# Run the chi-square test of independence between smoking status and sex.
chisq.test(smoke.data$smoke, smoke.data$sex)

# More examples of statistical tests in R (and other languages)
## https://stats.idre.ucla.edu/other/mult-pkg/whatstat/
## https://stats.idre.ucla.edu/r/whatstat/what-statistical-analysis-should-i-usestatistical-analyses-using-r/