2 + 3 # addition
[1] 5
2 ^ 3 # power
[1] 8
log(2) # built-in functions
[1] 0.6931472
a <- 3 # Assign a value to a variable
b = -5 # In most cases the equal sign can also be used (though <- is recommended)
A <- 7 # R is case sensitive
Avoid using the names c, t, cat, F, T, and D, as those are built-in functions/constants.
A variable name can contain letters, digits, underscores, and dots,
and must start with a letter or a dot.
The variable name cannot contain a dollar sign or other special characters.
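For illustration, a few made-up names, valid and invalid (the invalid ones are commented out, since they would raise an error):
my_var <- 1 # valid: letters, digits, underscores
.hidden <- 2 # valid: starts with a dot
x2.total <- 3 # valid: dots are allowed
# 2x <- 4 # invalid: starts with a digit
# my$var <- 5 # invalid: contains a special character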
str.var <- "oxygen" # character variable
num.var <- 15.99 # numerical variable
bool.var <- TRUE # Boolean (or logical) variable
bool.var.1 <- F # logical values can be abbreviated
# Access help file for the R function if the name of the function is known
?sd
help(sd)
# Search for help
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")
A vector is an array of values of the same type: vectors can be numeric, character, or logical.
# Function c() is used to create a vector from a list of values
num.vector <- c( 5, 7, 9, 11, 4, -1, 0)
# Numeric vectors can be defined in a number of ways:
vals1 <- c(2, -7, 5, 3, -1) # concatenation
vals2 <- 25:75 # range of values
vals3 <- seq(from=0, to=3, by=0.5) # sequence definition
vals4 <- rep(1, times=7) # repeat value
vals5 <- rnorm(5, mean=2, sd=1.5 ) # normally distributed values
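Note that rnorm() draws random values, so vals5 will differ between runs; to make random results reproducible, fix the seed first (a minimal sketch):
set.seed(123) # any fixed integer gives a reproducible sequence
vals5 <- rnorm(5, mean=2, sd=1.5) # now the same 5 values on every run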
# print values of a variable
print(vals2)
[1] 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
[26] 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
[51] 75
# Define 2 vectors (first with "distance" values and one with "time" values)
dist <- c(6, 10, 5, 8, 15, 9)
time <- c(0.5, 0.9, 0.4, 0.7, 1.2, 0.85)
# Compute velocity values for each dist-time pair:
velocity <- dist/time # each element of the first vector will be divided by the element from the second vector with the same index
print(velocity)
[1] 12.00000 11.11111 12.50000 11.42857 12.50000 10.58824
# Define a vector with values of body temperature in Fahrenheit
ftemp <- c(97.8, 99.5, 97.9, 102.1, 98.4, 97.7)
# Convert ftemp to a vector with values in Celsius
ctemp <- (ftemp - 32) / 9 * 5
print(ctemp)
[1] 36.55556 37.50000 36.61111 38.94444 36.88889 36.50000
x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5) # define a numeric vector
x[2] # returns second element
[1] 38.2
x[2:4] # returns 2nd through 4th elements inclusive
[1] 38.2 36.4 37.9
x[c(1,3,5)] # returns 1st, 3rd and 5th elements
[1] 36.6 36.4 41.0
x[-2] # returns all but 2nd element
[1] 36.6 36.4 37.9 41.0 39.9 36.8 37.5
x[c(TRUE, FALSE, TRUE, FALSE, FALSE,FALSE, TRUE, FALSE)] # returns 1st, 3rd, and 7th elements
[1] 36.6 36.4 36.8
#compare each element of the vector with a value
x < 37.0
[1] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
#return only those elements of the vector that satisfy a specific condition
x[ x < 37.0 ]
[1] 36.6 36.4 36.8
There are a number of functions that are useful to locate values satisfying specific conditions:
which.max(x) # find the (first) maximum element and return its index
[1] 5
which.min(x)
[1] 3
which(x >= 37.0) # find the location of all the elements that satisfy a specific condition
[1] 2 4 5 6 8
Other useful functions (a few are demonstrated below):
* max(x), min(x), sum(x), mean(x), sd(x), median(x), range(x)
* var(x) - sample variance
* cor(x, y) - correlation between x and y
* sort(x), rank(x), order(x) - sorting
* duplicated(x), unique(x) - repeated and unique values
* summary(x) - summary statistics
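A short sketch applying a few of these functions to the vector x defined above:
mean(x) # average of all elements
range(x) # smallest and largest values
sort(x) # values in increasing order
order(x) # indexes that would sort the vector
unique(x) # distinct values only
summary(x) # min, quartiles, mean, and max in one call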
x <- c(734, 145, NA, 456, NA) # define a numeric vector
anyNA(x) # are there any missing data
[1] TRUE
To check which values are missing, use the is.na() function:
x == NA # this does not work! - a missing value cannot be compared to anything
[1] NA NA NA NA NA
is.na(x) # check if the element in the vector is missing
[1] FALSE FALSE TRUE FALSE TRUE
which(is.na(x)) # which elements are missing
[1] 3 5
By default, many statistical functions will return NA if the data contain missing values:
mean(x)
[1] NA
To view the arguments that need to be used to remove missing data, read the help topic for the function:
?mean
#Perform computation removing missing data
mean(x, na.rm=TRUE)
[1] 445
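Besides na.rm, missing values can also be dropped explicitly; a minimal sketch using the same vector:
x.clean <- na.omit(x) # copy of x with the NA elements removed
sum(x.clean) # now computes normally
x[ !is.na(x) ] # equivalent: keep only the non-missing elements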
# To install a package:
install.packages("table1")
Once a package is installed, it can be loaded and used:
#Load R package
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Create a simulated dataset: 10 subjects for each sex/treatment combination
dat <- expand.grid(id=1:10, sex=c("Male", "Female"), treat=c("Treated", "Placebo"))
dat$age <- runif(nrow(dat), 10, 50) # uniform random ages between 10 and 50
dat$age[3] <- NA # Add a missing value
dat$wt <- exp(rnorm(nrow(dat), log(70), 0.2)) # log-normal weights around 70 kg
label(dat$sex) <- "Sex"
label(dat$age) <- "Age"
label(dat$treat) <- "Treatment Group"
label(dat$wt) <- "Weight"
units(dat$age) <- "years"
units(dat$wt) <- "kg"
# One level of stratification
table1(~ sex + age + wt | treat, data=dat)
|                   | Treated (N=20)    | Placebo (N=20)    | Overall (N=40)    |
|-------------------|-------------------|-------------------|-------------------|
| Sex               |                   |                   |                   |
| Male              | 10 (50.0%)        | 10 (50.0%)        | 20 (50.0%)        |
| Female            | 10 (50.0%)        | 10 (50.0%)        | 20 (50.0%)        |
| Age (years)       |                   |                   |                   |
| Mean (SD)         | 30.0 (12.9)       | 28.7 (10.8)       | 29.3 (11.7)       |
| Median [Min, Max] | 30.7 [10.7, 48.5] | 29.5 [10.0, 48.1] | 30.0 [10.0, 48.5] |
| Missing           | 1 (5.0%)          | 0 (0%)            | 1 (2.5%)          |
| Weight (kg)       |                   |                   |                   |
| Mean (SD)         | 71.1 (14.0)       | 71.6 (14.2)       | 71.3 (13.9)       |
| Median [Min, Max] | 67.6 [52.2, 119]  | 68.6 [47.4, 101]  | 68.3 [47.4, 119]  |
# Two levels of stratification (nesting)
table1(~ age + wt | treat*sex, data=dat)
|                   | Treated: Male (N=10) | Treated: Female (N=10) | Placebo: Male (N=10) | Placebo: Female (N=10) | Overall: Male (N=20) | Overall: Female (N=20) |
|-------------------|----------------------|------------------------|----------------------|------------------------|----------------------|------------------------|
| Age (years)       |                      |                        |                      |                        |                      |                        |
| Mean (SD)         | 24.9 (12.5)          | 34.5 (12.1)            | 27.1 (10.3)          | 30.4 (11.6)            | 26.1 (11.1)          | 32.4 (11.7)            |
| Median [Min, Max] | 21.9 [10.7, 45.6]    | 38.9 [14.9, 48.5]      | 26.4 [10.2, 46.0]    | 31.6 [10.0, 48.1]      | 24.0 [10.2, 46.0]    | 34.0 [10.0, 48.5]      |
| Missing           | 1 (10.0%)            | 0 (0%)                 | 0 (0%)               | 0 (0%)                 | 1 (5.0%)             | 0 (0%)                 |
| Weight (kg)       |                      |                        |                      |                        |                      |                        |
| Mean (SD)         | 72.0 (17.6)          | 70.2 (10.1)            | 68.1 (11.7)          | 75.0 (16.2)            | 70.0 (14.7)          | 72.6 (13.4)            |
| Median [Min, Max] | 66.8 [55.0, 119]     | 70.3 [52.2, 83.6]      | 65.4 [52.0, 91.7]    | 75.5 [47.4, 101]       | 66.7 [52.0, 119]     | 72.3 [47.4, 101]       |
For more examples of table1 package usage, see https://cran.r-project.org/web/packages/table1/vignettes/table1-examples.html
There are many R functions to process files with various formats. Some come from base R, such as read.table() and read.csv().
There are also a few R packages which provide additional functionality to read files.
# Install package first
# You can install packages from the RStudio Menu (Tools -> Install packages)
# or by executing the following R command:
#install.packages("foreign")
#Load R package "foreign"
library(foreign)
# Read data in Stata format
swissdata <- read.dta("http://rcs.bu.edu/classes/STaRS/swissfile.dta")
# Load R package haven to read SAS-formatted files (make sure it is installed!)
#install.packages("haven")
library(haven)
# Read data in SAS format
fhsdata <- read_sas("http://rcs.bu.edu/classes/STaRS/fhs.sas7bdat")
# Read regular csv file:
salt <- read.csv("http://rcs.bu.edu/classes/STaRS/intersalt.csv")
This dataset contains median blood pressure as a function of salt intake:
#Look at the first and last few records
head(salt)
b bp sodium country
1 0.512 72.0 149.3 Argentina
2 0.226 78.2 133.0 Belgium
3 0.316 73.9 142.6 Belgium
4 0.042 61.7 5.8 Brazil
5 0.086 61.4 0.2 Brazil
6 0.265 73.4 148.9 Canada
tail(salt)
b bp sodium country
47 0.434 73.2 130.3 Hawaii
48 0.295 81.4 136.6 USBlack
49 0.204 76.2 133.4 US
50 0.300 74.7 162.0 West Germany
51 0.303 73.1 172.0 West Germany
52 0.360 75.6 129.7 Zimbabwe
#Get the list of the columns
names(salt)
[1] "b" "bp" "sodium" "country"
#Number of rows:
nrow(salt)
[1] 52
#Number of columns:
ncol(salt)
[1] 4
#Get the structure of the data:
str(salt)
'data.frame': 52 obs. of 4 variables:
$ b : num 0.512 0.226 0.316 0.042 0.086 0.265 0.384 0.501 0.352 0.443 ...
$ bp : num 72 78.2 73.9 61.7 61.4 73.4 79.2 66.6 82.1 75 ...
$ sodium : num 149.3 133 142.6 5.8 0.2 ...
$ country: chr "Argentina" "Belgium" "Belgium" "Brazil" ...
#Get the summary of the data:
summary(salt)
b bp sodium country
Min. :0.0420 Min. :61.40 Min. : 0.2 Length:52
1st Qu.:0.2930 1st Qu.:70.58 1st Qu.:135.0 Class :character
Median :0.3460 Median :73.15 Median :152.5 Mode :character
Mean :0.3502 Mean :73.15 Mean :148.3
3rd Qu.:0.4422 3rd Qu.:76.45 3rd Qu.:172.7
Max. :0.6790 Max. :82.10 Max. :242.1
min(salt$bp)
[1] 61.4
max(salt$bp)
[1] 82.1
range(salt$bp)
[1] 61.4 82.1
summary(salt$bp)
Min. 1st Qu. Median Mean 3rd Qu. Max.
61.40 70.58 73.15 73.15 76.45 82.10
#Sometimes it is helpful to visualize the data
boxplot( salt$bp )
plot(x = salt$sodium,
y = salt$bp,
xlab = "mean sodium excretion (mmol/24h)",
ylab = "mean diastolic blood pressure (mm Hg)",
main = "Diastolic Blood Presure vs Sodium excretion")
This test is used to test for a linear relationship between 2 continuous variables:
cor.test(salt$bp, salt$sodium)
Pearson's product-moment correlation
data: salt$bp and salt$sodium
t = 2.7223, df = 50, p-value = 0.008901
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.09577287 0.57573339
sample estimates:
cor
0.3592828
The Null Hypothesis: the true correlation between bp and sodium (blood pressure and salt intake) is 0 (they are independent).
The Alternative Hypothesis: the true correlation between bp and sodium is not equal to 0.
The p-value is less than 0.05 and the 95% CI does not contain 0, so at the significance level of 0.05 we reject the null hypothesis and state that there is some (positive) correlation (0.359) between these 2 variables.
A common rule of thumb for the strength of a correlation:
* 0 < |r| < .3 - weak correlation
* .3 < |r| < .7 - moderate correlation
* |r| > .7 - strong correlation
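By this guide the correlation estimated above falls in the moderate range; we can compute it directly with cor():
cor(salt$bp, salt$sodium) # 0.359 - a moderate positive correlation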
Next, we can fit a simple linear regression model to quantify this relationship:
lm.res <- lm( bp ~ sodium, data = salt)
summary(lm.res)
Call:
lm(formula = bp ~ sodium, data = salt)
Residuals:
Min 1Q Median 3Q Max
-8.8625 -2.8906 0.0299 3.6470 9.4283
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 67.56245 2.14643 31.477 <2e-16 ***
sodium 0.03768 0.01384 2.722 0.0089 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.511 on 50 degrees of freedom
Multiple R-squared: 0.1291, Adjusted R-squared: 0.1117
F-statistic: 7.411 on 1 and 50 DF, p-value: 0.008901
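To visualize the fit, we can redraw the scatter plot and overlay the fitted regression line (a short sketch reusing the objects defined above):
plot(salt$sodium, salt$bp,
     xlab = "mean sodium excretion (mmol/24h)",
     ylab = "mean diastolic blood pressure (mm Hg)")
abline(lm.res, col = "red") # fitted line: bp = 67.56 + 0.038 * sodium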
Here we estimated the relationship between the predictor (sodium) and the response (bp) variables.
The summary reports a number of things. The p-value tells us if the model is statistically significant. In linear regression, the Null Hypothesis is that the coefficients associated with the variables are equal to zero.
The Multiple R-squared value is equal to the square of the correlation value we calculated in the previous test; the closer it is to 1, the better the model fits the data.
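We can verify that R-squared equals the squared correlation directly:
cor(salt$bp, salt$sodium)^2 # 0.1291 - matches the Multiple R-squared above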
This test (the Shapiro-Wilk test) is used to check whether a sample follows a normal distribution:
shapiro.test(salt$sodium)
Shapiro-Wilk normality test
data: salt$sodium
W = 0.84629, p-value = 8.758e-06
The null hypothesis: the sample being tested is normally distributed.
The p-value for this test is very small (8.758e-06), so we can reject the null hypothesis and state that this sample does not follow a normal distribution.
Let’s now take a look at the variable bp:
shapiro.test(salt$bp)
Shapiro-Wilk normality test
data: salt$bp
W = 0.97341, p-value = 0.2932
Here the p-value is greater than 0.05, so at the significance level of 0.05 we cannot reject the null hypothesis.
This test (the one-sample t-test) is used to test the mean of a sample from a normal distribution against a specified value:
t.test(salt$bp, mu=70)
One Sample t-test
data: salt$bp
t = 4.7485, df = 51, p-value = 1.703e-05
alternative hypothesis: true mean is not equal to 70
95 percent confidence interval:
71.81934 74.48451
sample estimates:
mean of x
73.15192
The null hypothesis: the true mean is equal to 70.
The alternative hypothesis: the true mean is not equal to 70.
Since the p-value (1.703e-05) is less than 0.05 and the 95% CI does not contain the value 70, we can reject the null hypothesis.
Let’s now load another dataset that comes with R. This dataset shows the effect of two soporific drugs (increase in hours of sleep compared to control) on 10 patients. There are 3 variables: extra (the increase in hours of sleep), group (the drug given), and ID (the patient ID).
data(sleep)
head(sleep)
extra group ID
1 0.7 1 1
2 -1.6 1 2
3 -0.2 1 3
4 -1.2 1 4
5 -0.1 1 5
6 3.4 1 6
summary(sleep)
extra group ID
Min. :-1.600 1:10 1 :2
1st Qu.:-0.025 2:10 2 :2
Median : 0.950 3 :2
Mean : 1.540 4 :2
3rd Qu.: 3.400 5 :2
Max. : 5.500 6 :2
(Other):8
To compare the means of 2 samples:
boxplot(extra ~ group, data = sleep)
t.test(extra ~ group, data = sleep)
Welch Two Sample t-test
data: extra by group
t = -1.8608, df = 17.776, p-value = 0.07939
alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
95 percent confidence interval:
-3.3654832 0.2054832
sample estimates:
mean in group 1 mean in group 2
0.75 2.33
Here the Null Hypothesis is that the true difference between the 2 groups is equal to 0, and the Alternative Hypothesis is that it is not equal to 0. In this test the p-value is above the significance level of 0.05 and the 95% CI contains 0, so we cannot reject the Null Hypothesis.
Note that t.test has a number of options, including alternative, which can be set to "two.sided", "less", or "greater", depending on which test you would like to perform. Using the option var.equal you can also specify whether the variances of the values in each group are equal.
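For example, a one-sided test that also assumes equal variances (a hypothetical variation on the call above):
# Test whether group 1 gains less extra sleep than group 2
t.test(extra ~ group, data = sleep,
       alternative = "less", # one-sided alternative
       var.equal = TRUE) # pooled-variance (classic) two-sample t-test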
The one-way analysis of variance (ANOVA) test is an extension of the two-sample t-test for comparing means in datasets with more than 2 groups.
Assumptions (the standard ones for one-way ANOVA; the last two are checked with code further below):
* The observations are independent
* The data in each group are approximately normally distributed
* The variances in the groups are approximately equal
data("PlantGrowth")
head(PlantGrowth)
weight group
1 4.17 ctrl
2 5.58 ctrl
3 5.18 ctrl
4 6.11 ctrl
5 4.50 ctrl
6 4.61 ctrl
summary(PlantGrowth)
weight group
Min. :3.590 ctrl:10
1st Qu.:4.550 trt1:10
Median :5.155 trt2:10
Mean :5.073
3rd Qu.:5.530
Max. :6.310
boxplot(weight~group, data=PlantGrowth)
As can be seen from the side-by-side boxplots, there is some difference in the weights of the 3 groups, but we cannot determine from the plot whether this difference is significant.
aov.res <- aov(weight~group, data=PlantGrowth)
summary( aov.res )
Df Sum Sq Mean Sq F value Pr(>F)
group 2 3.766 1.8832 4.846 0.0159 *
Residuals 27 10.492 0.3886
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As we can see from the summary output, the p-value is 0.0159 < 0.05, which indicates that there is a statistically significant difference in weight between these groups. We can check the confidence intervals for the treatment parameters:
confint(aov.res)
2.5 % 97.5 %
(Intercept) 4.62752600 5.4364740
grouptrt1 -0.94301261 0.2010126
grouptrt2 -0.07801261 1.0660126
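We can also check the normality and equal-variance assumptions listed earlier; a minimal sketch using standard base-R tests:
# Homogeneity of variance across groups (null: the variances are equal)
bartlett.test(weight ~ group, data = PlantGrowth)
# Normality of the residuals (null: the residuals are normally distributed)
shapiro.test(residuals(aov.res))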
Important: In one-way ANOVA tests a small p-value indicates that some of the group means are different, but it does not say which ones!
For multiple pairwise comparisons we use the Tukey HSD test:
TukeyHSD(aov.res)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = weight ~ group, data = PlantGrowth)
$group
diff lwr upr p adj
trt1-ctrl -0.371 -1.0622161 0.3202161 0.3908711
trt2-ctrl 0.494 -0.1972161 1.1852161 0.1979960
trt2-trt1 0.865 0.1737839 1.5562161 0.0120064
This result indicates a significant difference between treatment 1 and treatment 2, with an adjusted p-value of 0.012.
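The pairwise comparisons can also be plotted; any interval that does not cross zero corresponds to a significant difference:
plot( TukeyHSD(aov.res), las = 1 ) # horizontal axis labels for readability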
The chi-square test is used to analyze the frequency table (or contingency table) formed by two categorical variables. It evaluates whether there is a significant association between the categories of the two variables.
treat <- read.csv("http://rcs.bu.edu/classes/STaRS/treatment.csv")
head(treat)
id treated improved
1 1 1 1
2 2 1 1
3 3 0 1
4 4 1 1
5 5 1 0
6 6 1 0
summary(treat)
id treated improved
Min. : 1 Min. :0.0000 Min. :0.000
1st Qu.: 27 1st Qu.:0.0000 1st Qu.:0.000
Median : 53 Median :0.0000 Median :1.000
Mean : 53 Mean :0.4762 Mean :0.581
3rd Qu.: 79 3rd Qu.:1.0000 3rd Qu.:1.000
Max. :105 Max. :1.0000 Max. :1.000
In the above dataset there are 2 categorical variables: treated (0 or 1) and improved (0 or 1).
We would like to test whether there is an improvement after the treatment, i.e., whether these 2 categorical variables are dependent.
First let’s take a look at the tables:
# Frequency table: "treated" will be rows, "improved" - columns
table(treat$treated, treat$improved)
0 1
0 29 26
1 15 35
# Proportion table
prop.table(table(treat$treated, treat$improved))
0 1
0 0.2761905 0.2476190
1 0.1428571 0.3333333
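Proportions within each treatment group (row proportions) are often easier to interpret; prop.table() accepts a margin argument:
# margin = 1 computes proportions within each row,
# i.e. the improvement rate separately for untreated and treated
prop.table(table(treat$treated, treat$improved), margin = 1)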
We can visualize this contingency table with a mosaic plot:
tbl <- table(treat$treated, treat$improved)
mosaicplot( tbl, color = 2:3, las = 2, main = "Improvement vs. Treatment" )
Compute the chi-squared test:
chisq.test (tbl)
Pearson's Chi-squared test with Yates' continuity correction
data: tbl
X-squared = 4.6626, df = 1, p-value = 0.03083
The Null Hypothesis is that the treated and improved variables are independent.
In the above test the p-value is 0.03083, so at the significance level of 0.05 we can reject the null hypothesis and conclude that treatment and improvement are associated.