Introduction to R (Part 1)

R as a scientific calculator

2+3    # addition

[1] 5

2^3    # power

[1] 8

log(2) # built-in functions

[1] 0.6931472

R variables

a <- 3
b = -5  # both assignment operators are equivalent, the first one is more traditional
A <- 7  # R is case sensitive

Avoid using names c, t, cat, F, T, D as those are built-in functions/constants

The variable name can contain letters, digits, underscores and dots and start with the letter or dot The variable name cannot contain dollar sign or other special characters.

str.var  <- "oxygen"   # character variable
num.var  <- 15.99                  # numerical variable
bool.var <- TRUE                   # boolean (or logical) variable
bool.var.1 <- F                    # logical values can be abbriviated


rna.str <- "TTAGAATT"    # string variable
mode(rna.str)            # get the type (or mode) of the object

[1] "character"

nchar(rna.str)           # get the lenght of the string

[1] 8

R help

# Access help file for the R function if the name of the function is known
?sd
help(sd)


# Search for help
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")

R vectors

Vector is an array of R objects of the same type: Vectors can be numeric, character or logical.

# Function c() is used to create a vector from a list of values
num.vector <- c( 5,7, 9,11, 4, -1, 0)

# Numeric vectors can be defined in a number of ways:
vals1 <- c (2, -7, 5, 3, -1 )             # concatenation
vals2 <- 25:75                            # range of values
vals3 <- seq(from=0, to=3, by=0.5)        # sequence definition
vals4 <- rep(1, times=7)                  # repeat value
vals5 <- numeric(9)                       # initialize numeric vector with zeros
vals6 <- rnorm(5, mean=2, sd=1.5 )        # returns normally distributed values

# view the values of the vector
print(vals2)

 [1] 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
[26] 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
[51] 75

R allows to operate on whole vectors:

# Define 2 vectors ( first with "distance" values and one with "time valuse")
dist <- c(6, 10, 5, 8, 15, 9)
time <- c(0.5, 0.9, 0.4,0.7,1.2,0.85)

# Compute velocity values for each dist-time pair:
velocity <- dist/time   # each element of the first vector will be divided by the element from the second vector with the same index
print(velocity)

[1] 12.00000 11.11111 12.50000 11.42857 12.50000 10.58824

# Define a vector with values of body temperature in Fahrenheit
ftemp <- c(97.8, 99.5, 97.9, 102.1, 98.4, 97.7)
  
# Convert ftem to the vector with values in Celsius
ctemp <- (ftemp -32)/9*5
print(ctemp)

[1] 36.55556 37.50000 36.61111 38.94444 36.88889 36.50000

R operators

              +, -, *, /  - addition, subtraction, multiplication, division
              ^ or **     - exponentiation
              %%          - modulus
              %/%         - integer division
              %in%        - membership
              :           - sequence genration
              <, <=, ==, >=, >, !=     - boolean comparative
              |, ||       - OR  vectorized/not vectorized
              &, &&       - AND

vector slicing (subsetting)

x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5)    # define a numeric vector
x[2]         # returns second element

[1] 38.2

x[2:4]       # returns second through 4th elements inclusive

[1] 38.2 36.4 37.9

x[c(1,3,5)]  # returns 1st, 3rd and 5th elements

[1] 36.6 36.4 41.0

x[-2]        # returns all but 2nd element

[1] 36.6 36.4 37.9 41.0 39.9 36.8 37.5

x[c(TRUE, FALSE, TRUE, FALSE, FALSE,FALSE, TRUE, FALSE)]   # returns 1st, 3rd, and 7th elements

[1] 36.6 36.4 36.8

#compare each element of the vector with a value
x < 37.0

[1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE

#return only those elements of the vector that satisfy a specific condition
x[ x < 37.0 ]

[1] 36.6 36.4 36.8

There are a number of functions that are useful to locate a value satidfying specific condition(s)

which.max(x)  # find the (first)maximum element and return its index

[1] 5

which.min(x)

[1] 3

which(x >= 37.0) # find the location of all the elements that satisfy a specific condition

[1] 2 4 5 6 8

vector functions:

               max(x),   min(x),     sum(x)
               mean(x),  sd(), median(x),  range(x)
               var(x)                            - simple variance
               cor(x,y)                          - correlation between x and y
               sort(x), rank(x), order(x)
               cumsum(), cumprod(x), cummin(x), cumprod(x)
               duplicated(x), unique(x)
               summary()

Reading input files

There are many R functions to process files with various formats: Some come from base R:

read.table()
read.csv()
read.delim()
read.fwf()
scan() and many others

There are many R packages which provide additional functionality to read files.

# Read regular csv file:
salt <- read.csv("http://rcs.bu.edu/classes/FC764/intersalt.csv")

This data contains median blood pressure, as a fuction of salt intake * b - numeric vector * bp - mean diastolic blood pressure (mm Hg) * na - mean sodium excretion (mmol/24h) * country

Exploring R dataframes

#Look at the first and last few records
head(salt)

      b   bp    na   country
1 0.512 72.0 149.3 Argentina
2 0.226 78.2 133.0   Belgium
3 0.316 73.9 142.6   Belgium
4 0.042 61.7   5.8    Brazil
5 0.086 61.4   0.2    Brazil
6 0.265 73.4 148.9    Canada

tail(salt)

       b   bp    na      country
47 0.434 73.2 130.3       Hawaii
48 0.295 81.4 136.6      USBlack
49 0.204 76.2 133.4           US
50 0.300 74.7 162.0 West Germany
51 0.303 73.1 172.0 West Germany
52 0.360 75.6 129.7     Zimbabwe

#Get the list of the columns
names(salt)

[1] "b"       "bp"      "na"      "country"

#Number of rows:
nrow(salt)

[1] 52

#Number of columns:
ncol(salt)

[1] 4

#Get the structure of the data:
str(salt)

'data.frame':   52 obs. of  4 variables:
 $ b      : num  0.512 0.226 0.316 0.042 0.086 0.265 0.384 0.501 0.352 0.443 ...
 $ bp     : num  72 78.2 73.9 61.7 61.4 73.4 79.2 66.6 82.1 75 ...
 $ na     : num  149.3 133 142.6 5.8 0.2 ...
 $ country: chr  "Argentina" "Belgium" "Belgium" "Brazil" ...

#Get the summary of the data:
summary(salt)

       b                bp              na          country         
 Min.   :0.0420   Min.   :61.40   Min.   :  0.2   Length:52         
 1st Qu.:0.2930   1st Qu.:70.58   1st Qu.:135.0   Class :character  
 Median :0.3460   Median :73.15   Median :152.5   Mode  :character  
 Mean   :0.3502   Mean   :73.15   Mean   :148.3                     
 3rd Qu.:0.4422   3rd Qu.:76.45   3rd Qu.:172.7                     
 Max.   :0.6790   Max.   :82.10   Max.   :242.1

# Get unique values of a vector
unique(salt$country)

 [1] "Argentina"    "Belgium"      "Brazil"       "Canada"       "Colombia"    
 [6] "Denmark"      "East Germany" "Finland"      "Hungary"      "Iceland"     
[11] "India"        "Italy"        "Japan"        "Kenya"        "Malta"       
[16] "Mexico"       "Netherlands"  "PNG"          "PRC"          "Poland"      
[21] "Portugal"     "South Korea"  "SovietUnion"  "Spain"        "Taiwan"      
[26] "Trinidad"     "UK"           "US"           "USBlack"      "Hawaii"      
[31] "West Germany" "Zimbabwe"

#numeric data exploratory analysis
min(salt$bp)

[1] 61.4

max(salt$bp)

[1] 82.1

range(salt$bp)

[1] 61.4 82.1

summary(salt$bp)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  61.40   70.58   73.15   73.15   76.45   82.10

Sometimes it is helpful to visualize the data


```r
boxplot( salt$bp )

plot(salt$na, salt$bp)

R packages

#To install package
install.packages("ggplot2")

Once package is installed it can be loaded and used:

#Load R package
library(ggplot2)

# Use boxplot from ggplot2 package
ggplot(salt, aes( na, bp)) + geom_point()

dataframes slicing

# dataframe slicing (accessing elements)
salt[2,3]    # element, from the 2nd row, 3rd column

## [1] 133

salt[2,]     # all elements from the 2nd row

##       b   bp  na country
## 2 0.226 78.2 133 Belgium

salt[ ,3]    # all elements in the 3rd column

##  [1] 149.3 133.0 142.6   5.8   0.2 148.9 184.3 194.1 135.6 138.7 164.1 144.4
## [13] 189.7 135.1 194.4 153.2 175.9 169.9 170.0 162.6 167.1 171.2 201.2  51.3
## [25] 165.6 134.6 146.4  26.8 196.4 158.1 242.1 191.8 174.6 175.4 201.4 161.8
## [37] 165.4 175.2 136.2 108.3 149.9 150.0 151.8 133.8  95.9 125.9 130.3 136.6
## [49] 133.4 162.0 172.0 129.7

salt$bp  # accessing elemnts using variable name

##  [1] 72.0 78.2 73.9 61.7 61.4 73.4 79.2 66.6 82.1 75.0 77.5 77.4 79.2 71.7 70.7
## [16] 73.9 79.6 69.9 76.0 72.9 68.4 67.2 73.5 67.9 77.2 72.6 79.7 62.9 66.1 67.4
## [31] 70.2 75.7 77.9 78.2 71.4 72.1 72.7 68.0 75.2 75.5 73.1 71.2 72.4 70.0 78.5
## [46] 72.4 73.2 81.4 76.2 74.7 73.1 75.6

salt[salt$bp > 75, ]  # list all the rows in the dataset for which variablebp is greater than 75

##        b   bp    na     country
## 2  0.226 78.2 133.0     Belgium
## 7  0.384 79.2 184.3      Canada
## 9  0.352 82.1 135.6     Denmark
## 11 0.508 77.5 164.1     Finland
## 12 0.465 77.4 144.4     Finland
## 13 0.308 79.2 189.7     Hungary
## 17 0.436 79.6 175.9       Italy
## 19 0.370 76.0 170.0       Italy
## 25 0.309 77.2 165.6       Malta
## 27 0.301 79.7 146.4 Netherlands
## 32 0.369 75.7 191.8      Poland
## 33 0.302 77.9 174.6      Poland
## 34 0.679 78.2 175.4    Portugal
## 39 0.367 75.2 136.2      Taiwan
## 40 0.340 75.5 108.3    Trinidad
## 45 0.294 78.5  95.9     USBlack
## 48 0.295 81.4 136.6     USBlack
## 49 0.204 76.2 133.4          US
## 52 0.360 75.6 129.7    Zimbabwe

salt[salt$bp < 70, ]  # list all the rows in the dataset for which variablebp is less than 70

##        b   bp    na  country
## 4  0.042 61.7   5.8   Brazil
## 5  0.086 61.4   0.2   Brazil
## 8  0.501 66.6 194.1 Colombia
## 18 0.439 69.9 169.9    Italy
## 21 0.446 68.4 167.1    Japan
## 22 0.254 67.2 171.2    Japan
## 24 0.134 67.9  51.3    Kenya
## 28 0.047 62.9  26.8      PNG
## 29 0.501 66.1 196.4      PRC
## 30 0.465 67.4 158.1      PRC
## 38 0.490 68.0 175.2    Spain

Create dataframe from vectors

organism<-c("Human","Mouse","Fruit Fly", "Roundworm","Yeast")
genomeSizeBP<-c(3000000000,3000000000,135600000,97000000,12100000)
estGeneCount<-c(30000,30000,13061,19099,6034)
dt <- data.frame(organism=organism,
                 genomesz=genomeSizeBP, 
                 genecount=estGeneCount)
dt

##    organism  genomesz genecount
## 1     Human 3.000e+09     30000
## 2     Mouse 3.000e+09     30000
## 3 Fruit Fly 1.356e+08     13061
## 4 Roundworm 9.700e+07     19099
## 5     Yeast 1.210e+07      6034

# Comute gene density (divide genome size by the genome count)
dt$genomesz/dt$genecount

## [1] 100000.000 100000.000  10382.053   5078.800   2005.303

# create a new column "base pairs" per gene
dt$bp <- dt$genomesz/dt$genecount
dt

##    organism  genomesz genecount         bp
## 1     Human 3.000e+09     30000 100000.000
## 2     Mouse 3.000e+09     30000 100000.000
## 3 Fruit Fly 1.356e+08     13061  10382.053
## 4 Roundworm 9.700e+07     19099   5078.800
## 5     Yeast 1.210e+07      6034   2005.303

R packages

R contains many packages. Many of them can be found on the CRAN website :
https://cran.r-project.org/web/packages/available_packages_by_name.html
Bioconductor contains many packages used in bioinformatics: https://www.bioconductor.org/

#To install package
install.packages("dplyr")

After package is installed it has to be loaded before the functions it contains can be used:

#load package
library(dplyr)

dplyr package is a very useful package for data manipulation.

filter() to select cases based on their values.
arrange() to reorder the cases.
select() and rename() to select variables based on their names.
mutate() and transmute() to add new variables that are functions of existing variables.
summarise() to condense multiple values to a single value.
sample_n() and sample_frac() to take random samples.

The following webpage contains easy to folow guide for this package:
https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html

Missing Values

x <- c(734, 145, NA, 456, NA)    # define a numeric vector
anyNA(x)              # are there any missing data

[1] TRUE

To check which values are missing use is.na() function:

x == NA   # this does not work ! - missing value cannot be compared to anything

[1] NA NA NA NA NA

is.na(x)              # check if the element in the vector is missing

[1] FALSE FALSE  TRUE FALSE  TRUE

which(is.na(x))       # which elements are missing

[1] 3 5

By default mamy statistical functions will not compute if the data contain missing values:

mean(x)

[1] NA

To view the arguments that need to be used to remove missing data, read help topic for the function:

?mean

#Perform computation removing the missing data
mean(x, na.rm=TRUE)

[1] 445

Hypothesis Testing in R

This dataset consists of gene expression measurements for 10 genes under control and reatment conditions with 4 replicates each

genht <- read.csv("/project/scv/classes/FC764/geHT.csv")

str(genht)

'data.frame':   10 obs. of  9 variables:
 $ gene: int  1 2 3 4 5 6 7 8 9 10
 $ c1  : int  2650 1200 1541 1545 1956 1599 2430 1902 1530 2008
 $ t1  : int  3115 1101 1358 1910 2999 2710 2589 1910 2329 2485
 $ c2  : int  2619 1200 1401 1652 2066 1754 2789 2028 1660 2104
 $ t2  : int  2933 1309 1499 2028 2880 2765 2899 2100 2332 2871
 $ c3  : int  2331 1888 1256 1449 1777 1434 2332 1888 1501 1987
 $ t3  : int  2799 1901 1238 1901 2898 2689 2300 1901 2298 2650
 $ c4  : int  2750 1315 1625 1399 1999 1702 2250 2000 1478 2100
 $ t4  : int  3200 980 1421 2002 2798 2402 2741 1899 2287 2520

#All control values
control <- c(genht$c1, genht$c2, genht$c3, genht$c4  )


# Perform One sample t-test with a Null hypothesis that the true mean is 2000
t.test(control, mu=2000)


    One Sample t-test

data:  control
t = -2.174, df = 39, p-value = 0.03583
alternative hypothesis: true mean is not equal to 2000
95 percent confidence interval:
 1715.028 1989.722
sample estimates:
mean of x 
 1852.375

The test result with a p-value of 0.03583 rejects the null hypothesis at a critical value (alpha level) of 0.05

# 2-sample t-test
treat <- c(genht$t1, genht$t2, genht$t3, genht$t4  )
t.test(control, treat)


    Welch Two Sample t-test

data:  control and treat
t = -3.6163, df = 70.732, p-value = 0.0005564
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -653.6098 -188.9902
sample estimates:
mean of x mean of y 
 1852.375  2273.675

The p-value for this test very strongly rejects the null hypothesis of no difference between the mean of the treatment group and control group

# paired t-test
#Suppose that control and treatment values were "before" and "after" values for each gene. We could then perform a paired t-test
t.test(control, treat, paired=T)


    Paired t-test

data:  control and treat
t = -6.3945, df = 39, p-value = 1.468e-07
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -554.5654 -288.0346
sample estimates:
mean of the differences 
                 -421.3