# Introduction to Data Science using R for NCBB


#-------------------------------------
# Introduction to R Basics
# Basic arithmetic and data structures
#-------------------------------------

2 + 3  # Arithmetic: the result is printed at the console
c(1, 2, 3, 4, 5)  # Vector: an ordered collection of values of one type
list(gene = "BRCA1", value = 42)  # List: named elements, types may differ


#-------------------------------------
# Loading and Exploring Data
#-------------------------------------

# Load the iris sample dataset that ships with R
data("iris")

# Show the top and the bottom rows of the dataset (six rows each)
head(iris, n = 6L)
tail(iris, n = 6L)

# Per-column summary statistics
summary(iris)

# Compact view of the object's structure (here, a data frame)
str(iris)

# Now load a dataset from a CSV file in the working directory
med_data <- read.csv(file = "dataset_sample.csv")


# Explore the med_data dataset: first rows, column summaries, structure
head(med_data)
summary(med_data)
str(med_data)

# Why does the BMI column have type "character"? How can we fix it?
# Sorting the distinct values in this column helps to identify the issue.
sort(unique(med_data$BMI))
# Some values are not recognized by R as numbers, such as "#NA".
# In other datasets the marker may be "Unknown" or some "impossible" value.
# The na.strings argument of read.csv() lists the strings to treat as missing.
# Supplying na.strings replaces its default ("NA"), so "NA" is listed
# explicitly to keep recognizing it alongside the dataset-specific markers.
med_data <- read.csv("dataset_sample.csv", na.strings = c("NA", "#NA", ""))
str(med_data)
summary(med_data)

# ------------------------------- 
# Introducing R pipe symbols
# -------------------------------

# There are two pipe operators: |> and %>%
# The native pipe |> passes the result of the expression on its left
# as the first argument of the function call on its right.

# The %>% operator predates the native |> pipe, so many online examples
# still use it; it requires loading an additional R package (magrittr).

# Example of usage:
read.csv("dataset_sample.csv") |>
  head()


# ------------------------------- 
# Installing and loading R libraries
# -------------------------------
# Install only the packages that are not available yet, so that re-running
# the script does not download and reinstall them every time.
required_packages <- c("tidyverse", "ggfortify")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
library(tidyverse)  # For data manipulation and visualization


# ------------------------------- 
# Data preparation and exploration
# -------------------------------


# Filtering rows: keep only the records whose Smoking_Status is "Smoker"
smoker_data <- med_data |>
  filter(Smoking_Status == "Smoker")

# Selecting columns: ID, Age, and every column from Heart_Rate through Gender
# NOTE(review): the grouped summary further down uses `Heart_Rate_bpm`;
# confirm which column name the dataset actually has.
subset <- med_data |>
  select(ID, Age, Heart_Rate:Gender)

# Creating columns with mutate(): here BMI is recomputed from weight and
# height (column names suggest kg and cm; the factor 10000 converts cm^2 to
# m^2). The result replaces the BMI column in the copy stored in med_data_bmi.
med_data_bmi <- med_data |>
  mutate(BMI = 10000 * Weight_kg / Height_cm^2)

# Grouping and summarizing data: one row per Smoking_Status / Gender
# combination with the mean of each measurement.
# na.rm = TRUE removes missing values before averaging; without it a single
# NA within a group makes that group's mean NA.
med_data |>
  group_by(Smoking_Status, Gender) |>
  summarize(
    mean_Heart_Rate = mean(Heart_Rate_bpm, na.rm = TRUE),
    mean_Cholesterol_mg_dL = mean(Cholesterol_mg_dL, na.rm = TRUE),
    mean_Blood_Pressure_Sys = mean(Blood_Pressure_Sys, na.rm = TRUE),
    .groups = "drop"
  )


#--------------------------
# Some online books and resources
#--------------------------
# R for Data Science (https://r4ds.had.co.nz/)

# R for applied epidemiology and public health
# https://epirhandbook.com/

# UCLA online Data Analysis Examples
# https://stats.idre.ucla.edu/other/dae/