#-----------------------------------
# Introduction to R
# Summer Training as Research Scholar
# Summer 2024
#
# Katia Bulekova: ktrn@bu.edu
#-----------------------------------

## -----------------------------------
## R variables
## -----------------------------------

a <- 3       # Assign a value to a variable
b = -5       # In most cases the equals sign can also be used (though <- is recommended)
A <- 7       # R is case sensitive

# Avoid the names c, t, cat, F, T, D, as those are built-in functions/constants

# A variable name can contain letters, digits, underscores, and dots, and must
# start with a letter or a dot (a dot cannot be followed by a digit).
# A variable name cannot contain a dollar sign or other special characters.

str.var  <- "oxygen"   # character variable
num.var  <- 15.99      # numeric variable
bool.var <- TRUE       # Boolean (or logical) variable
bool.var.1 <- F        # logical values can be abbreviated

## -----------------------------------
## R help
## -----------------------------------

# Access the help file for an R function when the name of the function is known
?sd
help(sd)

# Search for help
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")

## -----------------------------------
## R vectors
## -----------------------------------

# A vector is an array of values of the same type:
# vectors can be numeric, character, or logical.

# The function c() creates a vector from a list of values
num.vector <- c(5, 7, 9, 11, 4, -1, 0)

# Numeric vectors can be defined in a number of ways:
vals1 <- c(2, -7, 5, 3, -1)           # concatenation
vals2 <- 25:75                        # range of values
vals3 <- seq(from=0, to=3, by=0.5)    # sequence definition
vals4 <- rep(1, times=7)              # repeated value
vals5 <- rnorm(5, mean=2, sd=1.5)     # normally distributed values

# View the values of a variable;
# you can also just type the name of the variable at the command prompt
print(vals2)

## -----------------------------------
## Installing R packages
## -----------------------------------

# There are more than 20,000 packages published officially on CRAN.
# You can search them by name or by topic.
# There are also many packages published on the Bioconductor site.

# To install an R package from CRAN,
# use the install.packages("package_name") command.
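# Note: install.packages() downloads and installs a package again even if it is
# already present. A minimal sketch of a common pattern - install a package only
# when it is missing (requireNamespace() is base R; "ggplot2" here is just an
# example name, replace it with any package you need):
if ( !requireNamespace("ggplot2", quietly = TRUE) ) {
  install.packages("ggplot2")
}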
# For example, you may want to install the very popular
# tidyverse collection of packages used by many data scientists:
install.packages("tidyverse")

# Another very handy package is table1:
install.packages("table1")

# Once a package is installed, it can be loaded and used:
library(table1)

## -----------------------------------
## Vector operations in R
## -----------------------------------

# Define a vector with values of body temperature in Fahrenheit
ftemp <- c(97.8, 99.5, 97.9, 102.1, 98.4, 97.7)

# Convert ftemp to a vector with values in Celsius
ctemp <- (ftemp - 32) / 9 * 5
print(ctemp)

## -----------------------------------
## Vector slicing (subsetting)
## -----------------------------------

x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5)  # define a numeric vector
x[2]         # returns the second element
x[2:4]       # returns the second through fourth elements inclusive
x[c(1,3,5)]  # returns the 1st, 3rd, and 5th elements

# Compare each element of the vector with a value
x < 37.0

# Return only those elements of the vector that satisfy a specific condition
x[ x < 37.0 ]

# There are a number of functions that are useful to locate values
# satisfying specific condition(s)
which.max(x)      # find the (first) maximum element and return its index
which.min(x)
which(x >= 37.0)  # find the locations of all elements that satisfy a condition

## -----------------------------------
## Vector functions
## -----------------------------------

# max(x), min(x), sum(x)
# mean(x), sd(x), median(x), range(x)
# var(x)                        # simple variance
# cor(x,y)                      # correlation between x and y
# sort(x), rank(x), order(x)    # sorting
# duplicated(x), unique(x)      # unique and repeating values
# summary(x)

## -----------------------------------
## Missing Values
## -----------------------------------

x <- c(734, 145, NA, 456, NA)   # define a numeric vector
anyNA(x)          # are there any missing data?

# To check which values are missing, use the is.na() function:
x == NA           # this does not work! - a missing value cannot be compared to anything
is.na(x)          # check whether each element of the vector is missing
which(is.na(x))   # which elements are missing

# By default, statistical functions will not compute if the data contain missing values:
mean(x)

# To view the arguments used to remove missing data, read the help topic for the function:
?mean

# Perform the computation removing missing data
mean(x, na.rm=TRUE)
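# Besides na.rm, you can also drop the missing values explicitly.
# A minimal sketch using base R subsetting (is.na() was introduced above):
x.clean <- x[ !is.na(x) ]   # keep only the non-missing elements
mean(x.clean)               # same result as mean(x, na.rm=TRUE)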
## -----------------------------------
## Reading input files
## -----------------------------------

# There are many R functions to process files with various formats.
# Some come from base R:
#    read.table()
#    read.csv()
#    read.delim()
#    read.fwf()
#    scan()
#    and many others

# Read a regular csv file:
salt <- read.csv("http://rcs.bu.edu/classes/STaRS/intersalt.csv")

# A few R packages provide additional functionality to read files.
# Load the R package "foreign"
install.packages("foreign")
library(foreign)

# Read data in Stata format
swissdata <- read.dta("http://rcs.bu.edu/classes/STaRS/swissfile.dta")

# Load the R package haven to read SAS-formatted files (make sure it is installed!)
library(haven)

# Read data in SAS format
fhsdata <- read_sas("http://rcs.bu.edu/classes/STaRS/fhs.sas7bdat")

## -----------------------------------
## Exploring R dataframes
## -----------------------------------

head(fhsdata)
summary(fhsdata)

# We can use the table1 package to get various summaries,
# stratifying by one or more variables:

# One level of stratification
table1(~ SEX + AGE + TOTCHOL | factor(DIABETES), data=fhsdata)

# Two levels of stratification (nesting)
table1(~ AGE + TOTCHOL + HDLC | factor(DIABETES)*factor(SEX), data=fhsdata)

# Let's take a look at the salt dataset
head(salt)

# This dataset contains median blood pressure as a function of salt intake:
#    b       - numeric vector
#    bp      - mean diastolic blood pressure (mm Hg)
#    sodium  - mean sodium excretion (mmol/24h)
#    country

# Get the structure of the data:
str(salt)

# Get the summary of the data:
summary(salt)

# Sometimes it is helpful to visualize the data
plot(x = salt$sodium, y = salt$bp,
     xlab = "mean sodium excretion (mmol/24h)",
     ylab = "mean diastolic blood pressure (mm Hg)",
     main = "Diastolic Blood Pressure vs. Sodium Excretion")

# As we can see in this plot, there is little correlation between these 2 variables,
# and there are a few "outlier" points that will affect the correlation calculation!

## -----------------------------------
## Correlation
## -----------------------------------

# This test is used to test the linear relationship of 2 continuous variables
cor.test(salt$bp, salt$sodium)

# The Null Hypothesis:
#    the true correlation between bp and sodium (blood pressure and salt intake)
#    is 0 (they are independent)
# The Alternative Hypothesis:
#    the true correlation between bp and sodium is not equal to 0

# The p-value is less than 0.05 and the 95% CI does not contain 0,
# so at the significance level of 0.05 we reject the null hypothesis and
# state that there is some (positive) correlation (0.359) between these 2 variables.

#    0 < |r| < .3  - weak correlation
#   .3 < |r| < .7  - moderate correlation
#        |r| > .7  - strong correlation

# Important:
#    The order of the variables in the test is not important.
#    Correlation provides evidence of association, not causation!
#    Correlation values are always between -1 and 1 and do not change
#       if the units of either or both variables change.
#    Correlation describes a linear relationship.
#    Correlation is strongly affected by outliers (extreme observations).

# In this case the correlation is weak, and
# there are a few points that significantly affect the result.
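# Because the (default, Pearson) correlation is sensitive to outliers, a
# rank-based alternative is often reported alongside it. A minimal sketch using
# the method argument of base cor.test() ("spearman" uses ranks instead of raw
# values; R may warn about ties, but the test still runs):
cor.test(salt$bp, salt$sodium, method = "spearman")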
## -----------------------------------
## Linear Model
## -----------------------------------

lm.res <- lm( bp ~ sodium, data = salt)
summary(lm.res)

# Here we estimated the relationship between the predictor (sodium)
# and the response (bp) variable.
# The summary output reports a number of things.
# The p-value tells us whether the model is statistically significant.
# In linear regression, the Null Hypothesis is that the coefficients
# associated with the variables are equal to zero.
# The Multiple R-squared value is equal to the square of the correlation
# value we calculated in the previous test.

# When the model fits the data:
#    R-squared   - the higher, the better ( > 0.7 )
#    F-statistic - the higher, the better
#    Std. Error  - the closer to 0, the better

# Check if a linear model is appropriate: view the diagnostic plots
par(mfrow=c(2,2))
plot(lm.res)
par(mfrow=c(1,1))

# First plot (Residuals vs Fitted):
#    Residuals should be spread equally around a horizontal line
#    without distinct patterns.
#    This is a good indication that you don't have non-linear relationships.

# Second plot (Normal Q-Q plot):
#    Residuals should line up well along the straight dashed line.

# Third plot (checks the assumption of equal variance, i.e. homoscedasticity):
#    You should see a horizontal line with equally (randomly) spread points.

# Fourth plot (Residuals vs Leverage):
#    This plot helps to find influential cases, if there are any.
#    When points are outside of the dashed lines
#    (meaning they have high "Cook's distance" scores),
#    those points are influential to the regression results.

## -----------------------------------
## Shapiro test
## -----------------------------------

# This test is used to check if a sample follows a normal distribution
shapiro.test(salt$sodium)

# The null hypothesis: the sample being tested is normally distributed.
# The p-value for this test is very small (8.758e-06), so we can reject the
# null hypothesis and state that this sample does not follow a normal distribution.

# Let's now take a look at the variable bp:
shapiro.test(salt$bp)

# Here the p-value is greater than 0.05, so at the significance level of 0.05
# we cannot reject the null hypothesis.

## -----------------------------------
## One-Sample t-Test
## -----------------------------------

# This test is used to test the mean of a sample from a normal distribution
t.test(salt$bp, mu=70)

# The null hypothesis: the true mean is equal to 70.
# The alternative hypothesis: the true mean is not equal to 70.
# Since the p-value is small (1.703e-05) - less than 0.05 - and the 95% CI
# does not contain the value 70, we can reject the null hypothesis.

## -----------------------------------
## Two-Sample t-Test
## -----------------------------------

# Let's load another dataset. It comes with R.
# This dataset shows the effect of two soporific drugs
# (increase in hours of sleep compared to control) on 10 patients.
# There are 3 variables:
#    extra: (numeric) increase in hours of sleep
#    group: (factor) categorical variable indicating which drug is given
#    ID:    (factor) patient ID
data(sleep)
head(sleep)
summary(sleep)

# To compare the means of the 2 samples:
boxplot(extra ~ group, data = sleep)
t.test(extra ~ group, data = sleep)

# Here the Null Hypothesis is that the true difference between the 2 groups
# is equal to 0,
# and the Alternative Hypothesis is that it is not equal to 0.
# In this test the p-value is above the significance level of 0.05 and the
# 95% CI contains 0, so we cannot reject the Null Hypothesis.

# Note that t.test has a number of options, including "alternative", which can be
# set to "two.sided", "less", or "greater", depending on which test you would
# like to perform.
# Using the option var.equal you can also specify whether the variances
# of the values in each group are equal or not.
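# Note: in the sleep data the same 10 patients received both drugs, so the two
# groups are not independent. A minimal sketch of a paired t-test for such data
# (paired = TRUE is an option of base t.test()):
with(sleep, t.test(extra[group == 1], extra[group == 2], paired = TRUE))
# The paired test compares within-patient differences and here yields a much
# smaller p-value than the two-sample version above.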
## -----------------------------------
## One-way ANOVA test
## -----------------------------------

# The one-way analysis of variance (ANOVA) test is an extension of the
# two-sample t-test for comparing means in datasets with more than 2 groups.

# Assumptions:
#    The observations are obtained independently and randomly from
#       the population defined by the categorical variable
#    The data in each category are normally distributed
#    The populations have a common variance

data("PlantGrowth")
head(PlantGrowth)
summary(PlantGrowth)

boxplot(weight ~ group, data=PlantGrowth)

# As visible from the side-by-side boxplots, there is some difference
# in the weights of the 3 groups,
# but we cannot determine from the plot whether this difference is significant.

aov.res <- aov(weight ~ group, data=PlantGrowth)
summary( aov.res )

# As we can see from the summary output, the p-value is 0.0159 < 0.05, which
# indicates that there is a statistically significant difference in weight
# between these groups.

# We can check the confidence intervals for the treatment parameters:
confint(aov.res)

# Important: in a one-way ANOVA test a small p-value indicates that
# some of the group means are different, but it does not say which ones!

# For multiple pairwise comparisons we use the Tukey test:
TukeyHSD(aov.res)

# This result indicates a significant difference between
# treatment 1 and treatment 2, with an adjusted p-value of 0.012.

## -----------------------------------
## Chi-Squared test of independence in R
## -----------------------------------

# The chi-squared test is used to analyze a frequency table (or contingency table)
# formed by two categorical variables.
# It evaluates whether there is a significant association between
# the categories of the two variables.

treat <- read.csv("http://rcs.bu.edu/classes/STaRS/treatment.csv")
head(treat)
summary(treat)

# In the above dataset there are 2 categorical variables:
#    treated  - 0 or 1
#    improved - 0 or 1

# We would like to test if there is an improvement after the treatment,
# in other words, if these 2 categorical variables are dependent.

# First let's take a look at the tables.
# Frequency table ("treated" will be rows, "improved" - columns):
table(treat$treated, treat$improved)

# Proportion table
prop.table(table(treat$treated, treat$improved))

# We can visualize this contingency table with a mosaic plot:
tbl <- table(treat$treated, treat$improved)
mosaicplot( tbl, color = 2:3, las = 2, main = "Improvement vs. Treatment" )

# Compute the chi-squared test
chisq.test(tbl)

# The Null Hypothesis is that the treated and improved variables are independent.
# In the above test the p-value is 0.03083, so at the significance level of
# 0.05 we can reject the null hypothesis.

## -----------------------------------
## Additional resources
## -----------------------------------

# 1. R for applied epidemiology and public health: https://epirhandbook.com/en/
# 2. R for data science: https://r4ds.had.co.nz/
# 3. The R Graph Gallery: https://r-graph-gallery.com/
# 4. FAQ in R and other R topics by UCLA: https://stats.oarc.ucla.edu/r/
# 5. Which Statistical Test to use: https://stats.oarc.ucla.edu/other/mult-pkg/whatstat/