2+3 # addition
[1] 5
2^3 # power
[1] 8
log(2) # built-in functions
[1] 0.6931472
a <- 3
b = -5 # both assignment operators are equivalent, the first one is more traditional
A <- 7 # R is case sensitive
Avoid using names c, t, cat, F, T, D as those are built-in functions/constants
The variable name can contain letters, digits, underscores and dots and start with the letter or dot The variable name cannot contain dollar sign or other special characters.
str.var <- "oxygen" # character variable
num.var <- 15.99 # numerical variable
bool.var <- TRUE # boolean (or logical) variable
bool.var.1 <- F # logical values can be abbriviated
rna.str <- "TTAGAATT" # string variable
mode(rna.str) # get the type (or mode) of the object
[1] "character"
nchar(rna.str) # get the lenght of the string
[1] 8
# Access help file for the R function if the name of the function is known
?sd
help(sd)
# Search for help
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")
Vector is an array of R objects of the same type: Vectors can be numeric, character or logical.
# Function c() is used to create a vector from a list of values
num.vector <- c( 5,7, 9,11, 4, -1, 0)
# Numeric vectors can be defined in a number of ways:
vals1 <- c (2, -7, 5, 3, -1 ) # concatenation
vals2 <- 25:75 # range of values
vals3 <- seq(from=0, to=3, by=0.5) # sequence definition
vals4 <- rep(1, times=7) # repeat value
vals5 <- numeric(9) # initialize numeric vector with zeros
vals6 <- rnorm(5, mean=2, sd=1.5 ) # returns normally distributed values
# view the values of the vector
print(vals2)
[1] 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
[26] 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
[51] 75
R allows to operate on whole vectors:
# Define 2 vectors ( first with "distance" values and one with "time valuse")
dist <- c(6, 10, 5, 8, 15, 9)
time <- c(0.5, 0.9, 0.4,0.7,1.2,0.85)
# Compute velocity values for each dist-time pair:
velocity <- dist/time # each element of the first vector will be divided by the element from the second vector with the same index
print(velocity)
[1] 12.00000 11.11111 12.50000 11.42857 12.50000 10.58824
# Define a vector with values of body temperature in Fahrenheit
ftemp <- c(97.8, 99.5, 97.9, 102.1, 98.4, 97.7)
# Convert ftem to the vector with values in Celsius
ctemp <- (ftemp -32)/9*5
print(ctemp)
[1] 36.55556 37.50000 36.61111 38.94444 36.88889 36.50000
+, -, *, / - addition, subtraction, multiplication, division ^ or ** - exponentiation %% - modulus %/% - integer division %in% - membership : - sequence genration <, <=, ==, >=, >, != - boolean comparative |, || - OR vectorized/not vectorized &, && - AND
x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5) # define a numeric vector
x[2] # returns second element
[1] 38.2
x[2:4] # returns second through 4th elements inclusive
[1] 38.2 36.4 37.9
x[c(1,3,5)] # returns 1st, 3rd and 5th elements
[1] 36.6 36.4 41.0
x[-2] # returns all but 2nd element
[1] 36.6 36.4 37.9 41.0 39.9 36.8 37.5
x[c(TRUE, FALSE, TRUE, FALSE, FALSE,FALSE, TRUE, FALSE)] # returns 1st, 3rd, and 7th elements
[1] 36.6 36.4 36.8
#compare each element of the vector with a value
x < 37.0
[1] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
#return only those elements of the vector that satisfy a specific condition
x[ x < 37.0 ]
[1] 36.6 36.4 36.8
There are a number of functions that are useful to locate a value satidfying specific condition(s)
which.max(x) # find the (first)maximum element and return its index
[1] 5
which.min(x)
[1] 3
which(x >= 37.0) # find the location of all the elements that satisfy a specific condition
[1] 2 4 5 6 8
max(x), min(x), sum(x) mean(x), sd(), median(x), range(x) var(x) - simple variance cor(x,y) - correlation between x and y sort(x), rank(x), order(x) cumsum(), cumprod(x), cummin(x), cumprod(x) duplicated(x), unique(x) summary()
There are many R functions to process files with various formats: Some come from base R:
There are many R packages which provide additional functionality to read files.
# Read regular csv file:
salt <- read.csv("http://rcs.bu.edu/classes/FC764/intersalt.csv")
This data contains median blood pressure, as a fuction of salt intake * b - numeric vector * bp - mean diastolic blood pressure (mm Hg) * na - mean sodium excretion (mmol/24h) * country
#Look at the first and last few records
head(salt)
b bp na country
1 0.512 72.0 149.3 Argentina
2 0.226 78.2 133.0 Belgium
3 0.316 73.9 142.6 Belgium
4 0.042 61.7 5.8 Brazil
5 0.086 61.4 0.2 Brazil
6 0.265 73.4 148.9 Canada
tail(salt)
b bp na country
47 0.434 73.2 130.3 Hawaii
48 0.295 81.4 136.6 USBlack
49 0.204 76.2 133.4 US
50 0.300 74.7 162.0 West Germany
51 0.303 73.1 172.0 West Germany
52 0.360 75.6 129.7 Zimbabwe
#Get the list of the columns
names(salt)
[1] "b" "bp" "na" "country"
#Number of rows:
nrow(salt)
[1] 52
#Number of columns:
ncol(salt)
[1] 4
#Get the structure of the data:
str(salt)
'data.frame': 52 obs. of 4 variables:
$ b : num 0.512 0.226 0.316 0.042 0.086 0.265 0.384 0.501 0.352 0.443 ...
$ bp : num 72 78.2 73.9 61.7 61.4 73.4 79.2 66.6 82.1 75 ...
$ na : num 149.3 133 142.6 5.8 0.2 ...
$ country: chr "Argentina" "Belgium" "Belgium" "Brazil" ...
#Get the summary of the data:
summary(salt)
b bp na country
Min. :0.0420 Min. :61.40 Min. : 0.2 Length:52
1st Qu.:0.2930 1st Qu.:70.58 1st Qu.:135.0 Class :character
Median :0.3460 Median :73.15 Median :152.5 Mode :character
Mean :0.3502 Mean :73.15 Mean :148.3
3rd Qu.:0.4422 3rd Qu.:76.45 3rd Qu.:172.7
Max. :0.6790 Max. :82.10 Max. :242.1
# Get unique values of a vector
unique(salt$country)
[1] "Argentina" "Belgium" "Brazil" "Canada" "Colombia"
[6] "Denmark" "East Germany" "Finland" "Hungary" "Iceland"
[11] "India" "Italy" "Japan" "Kenya" "Malta"
[16] "Mexico" "Netherlands" "PNG" "PRC" "Poland"
[21] "Portugal" "South Korea" "SovietUnion" "Spain" "Taiwan"
[26] "Trinidad" "UK" "US" "USBlack" "Hawaii"
[31] "West Germany" "Zimbabwe"
#numeric data exploratory analysis
min(salt$bp)
[1] 61.4
max(salt$bp)
[1] 82.1
range(salt$bp)
[1] 61.4 82.1
summary(salt$bp)
Min. 1st Qu. Median Mean 3rd Qu. Max.
61.40 70.58 73.15 73.15 76.45 82.10
Sometimes it is helpful to visualize the data
```r
boxplot( salt$bp )
plot(salt$na, salt$bp)
#To install package
install.packages("ggplot2")
Once package is installed it can be loaded and used:
#Load R package
library(ggplot2)
# Use boxplot from ggplot2 package
ggplot(salt, aes( na, bp)) + geom_point()
# dataframe slicing (accessing elements)
salt[2,3] # element, from the 2nd row, 3rd column
## [1] 133
salt[2,] # all elements from the 2nd row
## b bp na country
## 2 0.226 78.2 133 Belgium
salt[ ,3] # all elements in the 3rd column
## [1] 149.3 133.0 142.6 5.8 0.2 148.9 184.3 194.1 135.6 138.7 164.1 144.4
## [13] 189.7 135.1 194.4 153.2 175.9 169.9 170.0 162.6 167.1 171.2 201.2 51.3
## [25] 165.6 134.6 146.4 26.8 196.4 158.1 242.1 191.8 174.6 175.4 201.4 161.8
## [37] 165.4 175.2 136.2 108.3 149.9 150.0 151.8 133.8 95.9 125.9 130.3 136.6
## [49] 133.4 162.0 172.0 129.7
salt$bp # accessing elemnts using variable name
## [1] 72.0 78.2 73.9 61.7 61.4 73.4 79.2 66.6 82.1 75.0 77.5 77.4 79.2 71.7 70.7
## [16] 73.9 79.6 69.9 76.0 72.9 68.4 67.2 73.5 67.9 77.2 72.6 79.7 62.9 66.1 67.4
## [31] 70.2 75.7 77.9 78.2 71.4 72.1 72.7 68.0 75.2 75.5 73.1 71.2 72.4 70.0 78.5
## [46] 72.4 73.2 81.4 76.2 74.7 73.1 75.6
salt[salt$bp > 75, ] # list all the rows in the dataset for which variablebp is greater than 75
## b bp na country
## 2 0.226 78.2 133.0 Belgium
## 7 0.384 79.2 184.3 Canada
## 9 0.352 82.1 135.6 Denmark
## 11 0.508 77.5 164.1 Finland
## 12 0.465 77.4 144.4 Finland
## 13 0.308 79.2 189.7 Hungary
## 17 0.436 79.6 175.9 Italy
## 19 0.370 76.0 170.0 Italy
## 25 0.309 77.2 165.6 Malta
## 27 0.301 79.7 146.4 Netherlands
## 32 0.369 75.7 191.8 Poland
## 33 0.302 77.9 174.6 Poland
## 34 0.679 78.2 175.4 Portugal
## 39 0.367 75.2 136.2 Taiwan
## 40 0.340 75.5 108.3 Trinidad
## 45 0.294 78.5 95.9 USBlack
## 48 0.295 81.4 136.6 USBlack
## 49 0.204 76.2 133.4 US
## 52 0.360 75.6 129.7 Zimbabwe
salt[salt$bp < 70, ] # list all the rows in the dataset for which variablebp is less than 70
## b bp na country
## 4 0.042 61.7 5.8 Brazil
## 5 0.086 61.4 0.2 Brazil
## 8 0.501 66.6 194.1 Colombia
## 18 0.439 69.9 169.9 Italy
## 21 0.446 68.4 167.1 Japan
## 22 0.254 67.2 171.2 Japan
## 24 0.134 67.9 51.3 Kenya
## 28 0.047 62.9 26.8 PNG
## 29 0.501 66.1 196.4 PRC
## 30 0.465 67.4 158.1 PRC
## 38 0.490 68.0 175.2 Spain
organism<-c("Human","Mouse","Fruit Fly", "Roundworm","Yeast")
genomeSizeBP<-c(3000000000,3000000000,135600000,97000000,12100000)
estGeneCount<-c(30000,30000,13061,19099,6034)
dt <- data.frame(organism=organism,
genomesz=genomeSizeBP,
genecount=estGeneCount)
dt
## organism genomesz genecount
## 1 Human 3.000e+09 30000
## 2 Mouse 3.000e+09 30000
## 3 Fruit Fly 1.356e+08 13061
## 4 Roundworm 9.700e+07 19099
## 5 Yeast 1.210e+07 6034
# Comute gene density (divide genome size by the genome count)
dt$genomesz/dt$genecount
## [1] 100000.000 100000.000 10382.053 5078.800 2005.303
# create a new column "base pairs" per gene
dt$bp <- dt$genomesz/dt$genecount
dt
## organism genomesz genecount bp
## 1 Human 3.000e+09 30000 100000.000
## 2 Mouse 3.000e+09 30000 100000.000
## 3 Fruit Fly 1.356e+08 13061 10382.053
## 4 Roundworm 9.700e+07 19099 5078.800
## 5 Yeast 1.210e+07 6034 2005.303
R contains many packages. Many of them can be found on the CRAN website :
https://cran.r-project.org/web/packages/available_packages_by_name.html
Bioconductor contains many packages used in bioinformatics: https://www.bioconductor.org/
#To install package
install.packages("dplyr")
After package is installed it has to be loaded before the functions it contains can be used:
#load package
library(dplyr)
dplyr package is a very useful package for data manipulation.
The following webpage contains easy to folow guide for this package:
https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html
x <- c(734, 145, NA, 456, NA) # define a numeric vector
anyNA(x) # are there any missing data
[1] TRUE
To check which values are missing use is.na() function:
x == NA # this does not work ! - missing value cannot be compared to anything
[1] NA NA NA NA NA
is.na(x) # check if the element in the vector is missing
[1] FALSE FALSE TRUE FALSE TRUE
which(is.na(x)) # which elements are missing
[1] 3 5
By default mamy statistical functions will not compute if the data contain missing values:
mean(x)
[1] NA
To view the arguments that need to be used to remove missing data, read help topic for the function:
?mean
#Perform computation removing the missing data
mean(x, na.rm=TRUE)
[1] 445
This dataset consists of gene expression measurements for 10 genes under control and reatment conditions with 4 replicates each
genht <- read.csv("/project/scv/classes/FC764/geHT.csv")
str(genht)
'data.frame': 10 obs. of 9 variables:
$ gene: int 1 2 3 4 5 6 7 8 9 10
$ c1 : int 2650 1200 1541 1545 1956 1599 2430 1902 1530 2008
$ t1 : int 3115 1101 1358 1910 2999 2710 2589 1910 2329 2485
$ c2 : int 2619 1200 1401 1652 2066 1754 2789 2028 1660 2104
$ t2 : int 2933 1309 1499 2028 2880 2765 2899 2100 2332 2871
$ c3 : int 2331 1888 1256 1449 1777 1434 2332 1888 1501 1987
$ t3 : int 2799 1901 1238 1901 2898 2689 2300 1901 2298 2650
$ c4 : int 2750 1315 1625 1399 1999 1702 2250 2000 1478 2100
$ t4 : int 3200 980 1421 2002 2798 2402 2741 1899 2287 2520
#All control values
control <- c(genht$c1, genht$c2, genht$c3, genht$c4 )
# Perform One sample t-test with a Null hypothesis that the true mean is 2000
t.test(control, mu=2000)
One Sample t-test
data: control
t = -2.174, df = 39, p-value = 0.03583
alternative hypothesis: true mean is not equal to 2000
95 percent confidence interval:
1715.028 1989.722
sample estimates:
mean of x
1852.375
The test result with a p-value of 0.03583 rejects the null hypothesis at a critical value (alpha level) of 0.05
# 2-sample t-test
treat <- c(genht$t1, genht$t2, genht$t3, genht$t4 )
t.test(control, treat)
Welch Two Sample t-test
data: control and treat
t = -3.6163, df = 70.732, p-value = 0.0005564
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-653.6098 -188.9902
sample estimates:
mean of x mean of y
1852.375 2273.675
The p-value for this test very strongly rejects the null hypothesis of no difference between the mean of the treatment group and control group
# paired t-test
#Suppose that control and treatment values were "before" and "after" values for each gene. We could then perform a paired t-test
t.test(control, treat, paired=T)
Paired t-test
data: control and treat
t = -6.3945, df = 39, p-value = 1.468e-07
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-554.5654 -288.0346
sample estimates:
mean of the differences
-421.3