# Introduction to Data Science using R for NCBB

#-------------------------------------
# Introduction to R Basics
# Basic arithmetic and data structures
#-------------------------------------
2 + 3
c(1, 2, 3, 4, 5) # Vector
list("gene" = "BRCA1", "value" = 42) # List

#-------------------------------------
# Loading and Exploring Data
#-------------------------------------

# Load a sample dataset that comes with R
data(iris)

# View the first few and the last few rows of the dataset
head(iris)
tail(iris)

# Summary statistics
summary(iris)

# View the structure of the R object (in this case a data frame)
str(iris)

# Let's now load a dataset that comes from a file:
med_data <- read.csv("dataset_sample.csv")

# Explore the med_data dataset
head(med_data)
summary(med_data)
str(med_data)

# Why does the BMI column have type "character"? How can we fix it?
# We can sort the unique values in this column to identify the issue
sort(unique(med_data$BMI))

# There are some values that R does not recognize as numbers, such as "#NA".
# In other datasets it could be "Unknown" or some "impossible" value.
# We can use the na.strings option of read.csv() to list every string that
# should be read as a missing value.
# Note: na.strings REPLACES the default ("NA") rather than adding to it, so
# "NA" is listed explicitly to keep ordinary NA cells recognized as missing.
med_data <- read.csv("dataset_sample.csv", na.strings = c("NA", "#NA", ""))
str(med_data)
summary(med_data)

# -------------------------------
# Introducing R pipe symbols
# -------------------------------
# |> and %>%
# The pipe operator |> is used to pass the output of one function
# as the input to another function.
# The %>% operator was used before R introduced the native |> pipe, so many
# online examples still use it; %>% requires loading an additional R library
# (magrittr).
# Example of usage:
read.csv("dataset_sample.csv") |>
  head()

# -------------------------------
# Installing and loading R libraries
# -------------------------------

# Install only the packages that are not available yet, so re-running this
# script does not download and reinstall everything each time.
required_pkgs <- c("tidyverse", "ggfortify")
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

library(tidyverse) # For data manipulation and visualization

# -------------------------------
# Data preparation and exploration
# -------------------------------

# Filter data (e.g., selecting only "smokers")
smoker_data <- med_data |>
  filter(Smoking_Status == "Smoker")

# Selecting columns
# NOTE(review): this uses `Heart_Rate`, while the summary below uses
# `Heart_Rate_bpm` -- confirm the actual column name in dataset_sample.csv.
# NOTE(review): the name `subset` shadows base::subset(); kept as-is because
# later code may refer to it.
subset <- med_data |>
  select(ID, Age, Heart_Rate:Gender)

# Creating new columns
# BMI = weight (kg) / height (m)^2; the factor 10000 converts cm^2 to m^2.
# This overwrites the BMI column in the copy stored in med_data_bmi.
med_data_bmi <- med_data |>
  mutate(BMI = 10000 * Weight_kg / (Height_cm^2))

# Grouping and summarizing data
# na.rm = TRUE: the dataset contains missing values, and mean() returns NA
# for any group with a missing value unless they are removed first.
med_data |>
  group_by(Smoking_Status, Gender) |>
  summarize(
    mean_Heart_Rate = mean(Heart_Rate_bpm, na.rm = TRUE),
    mean_Cholesterol_mg_dL = mean(Cholesterol_mg_dL, na.rm = TRUE),
    mean_Blood_Pressure_Sys = mean(Blood_Pressure_Sys, na.rm = TRUE),
    .groups = "drop"
  )

#--------------------------
# Some online books and resources
#--------------------------
# R for Data Science (https://r4ds.had.co.nz/)
# R for applied epidemiology and public health
# https://epirhandbook.com/
# UCLA online Data Analysis Examples
# https://stats.idre.ucla.edu/other/dae/