R intro (session 1)

R as a scientific calculator

2+3    # addition

## [1] 5

2^3    # power

## [1] 8

log(2) # built-in functions

## [1] 0.6931472

Variables

R variable name can contain letters, digits, underscores and dots. It has to start with the letter or dot. The variable name cannot contain dollar sign or other special characters. Avoid using names c, t, cat, F, T, D as those are built-in functions or constants

a <- 3
b = -5  # both assignment operators are equivalent, the first one is more traditional
A <- 7  # R is case sensitive

Character, numerical and logical vairables are most popular basic types in R:

str.var  <- "character variable"   # character variable
num.var  <- 21.17                  # numerical variable
bool.var <- TRUE                   # boolean (or logical) variable
bool.var.1 <- F                    # logical values can be abbriviated

Use mode() function to get information about the type of a variable:

rna.str <- "TTAGAATT"    # string variable
mode(rna.str)            # get the type (or mode) of the object

## [1] "character"

nchar(rna.str)           # get the lenght of the string

## [1] 8

R help

R provides a comprehensive help for every function with plenty examples and links for further information.

# Access help file for a known name of  R function
?sd
help(sd)

# Search for help
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")

R vectors

Vector is an array of R objects of the same type:

# An example of a numeric vector:
num.vector <- c( 5,7, 9,11, 4, -1, 0)

Numeric vectors can be defined in a number of ways:

vals <- c (2, -7, 5, 3, -1 )             # concatenation
vals <- 25:75                            # range of values
vals <- seq(from=0, to=3, by=0.5)        # sequence definition
vals <- rep(1, times=7)                  # repeat value
vals <- numeric(9)                       # initialize numeric vector with zeros
vals <- rnorm(5, mean=2, sd=1.5 )        # returns normally distributed values

Vector arithmetic:

Most R functions and operators can operate using whole vectors:

dist <- c(6, 10, 5, 8, 15, 9)
time <- c(0.5, 0.9, 0.4,0.7,1.2,0.85)
velocity <- dist/time   # each element of the first vector will be divided by the element from the second vector with the same index
print(velocity)

## [1] 12.00000 11.11111 12.50000 11.42857 12.50000 10.58824

R operators

              +, -, *, /  - addition, subtraction, multiplication, division
              ^ or **     - exponentiation
              %%          - modulus
              %/%         - integer division
              %in%        - membership
              :           - sequence genration
              <, <=, ==, >=, >, !=     - boolean comparative
              |, ||       - OR  vectorized/not vectorized
              &, &&       - AND

#Check if a particular value is present in a vector
5 %in% dist

## [1] TRUE

vector slicing (subsetting)

x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5)    # define a numeric vector
x[2]         # returns second element

## [1] 38.2

x[2:4]       # returns second through 4th elements inclusive

## [1] 38.2 36.4 37.9

x[c(1,3,5)]  # returns 1st, 3rd and 5th elements

## [1] 36.6 36.4 41.0

x[-2]        # returns all but 2nd element

## [1] 36.6 36.4 37.9 41.0 39.9 36.8 37.5

x[c(TRUE, FALSE, TRUE, FALSE, FALSE,FALSE, TRUE, FALSE)]   # returns 1st, 3rd, and 7th elements

## [1] 36.6 36.4 36.8

#compare each element of the vector with a value
x < 37.0

## [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE

#return only those elements of the vector that satisfy a specific condition
x[ x < 37.0 ]

## [1] 36.6 36.4 36.8

There are a few useful “which” functions:

which.max(x)  # find the (first)maximum element and return its index

## [1] 5

which.min(x)

## [1] 3

which(x >= 37.0) # find the location of all the elements that satisfy a specific condition

## [1] 2 4 5 6 8

vector functions:

               max(x),   min(x),     sum(x)
               mean(x),  sd(), median(x),  range(x)
               var(x)                            - simple variance
               cor(x,y)                          - correlation between x and y
               sort(x), rank(x), order(x)
               cumsum(), cumprod(x), cummin(x), cumprod(x)
               duplicated(x), unique(x)
               summary()

R packages

R contains many packages. Many of them can be found on the CRAN website Bioconductor contains many packages used in bioinformatics

# To install a package
install.packages("seqinr")

#load R package
library(seqinr)

Reading input files

There are many R functions to process files with various formats: Some come from base R:

read.table()
read.csv()
read.delim()
read.fwf()
scan() and many others

There are many R packages which provide additional functionality to read files.

# Read regular csv file:
salaries <- read.csv("/project/scv/classes/BI594/Salaries.csv")

# Read fast files (requires seqinr package)
fasta <- read.fasta("/project/scv/classes/BI594/contigs.fasta")

Exploring R dataframes

#Explore the regular dataset:
#Look at the first and last few records
head(salaries)

##   rank discipline yrs.since.phd yrs.service  sex salary
## 1 Prof          B            56          49 Male 186960
## 2 Prof          A            12           6 Male  93000
## 3 Prof          A            23          20 Male 110515
## 4 Prof          A            40          31 Male 131205
## 5 Prof          B            20          18 Male 104800
## 6 Prof          A            20          20 Male 122400

tail(salaries)

##         rank discipline yrs.since.phd yrs.service    sex salary
## 73      Prof          B            24          15 Female 161101
## 74      Prof          B            18          10 Female 105450
## 75 AssocProf          B            19           6 Female 104542
## 76      Prof          B            17          17 Female 124312
## 77      Prof          A            28          14 Female 109954
## 78      Prof          A            23          15 Female 109646

#Get the list of the columns
names(salaries)

## [1] "rank"          "discipline"    "yrs.since.phd" "yrs.service"  
## [5] "sex"           "salary"

#Number of rows:
nrow(salaries)

## [1] 78

#Number of columns:
ncol(salaries)

## [1] 6

#Get the structure of the data:
str(salaries)

## 'data.frame':    78 obs. of  6 variables:
##  $ rank         : Factor w/ 3 levels "AssocProf","AsstProf",..: 3 3 3 3 3 3 1 3 3 3 ...
##  $ discipline   : Factor w/ 2 levels "A","B": 2 1 1 1 2 1 1 1 1 1 ...
##  $ yrs.since.phd: int  56 12 23 40 20 20 20 18 29 51 ...
##  $ yrs.service  : int  49 6 20 31 18 20 17 18 19 51 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ salary       : int  186960 93000 110515 131205 104800 122400 81285 126300 94350 57800 ...

#Get the summary of the data:
summary(salaries)

##         rank    discipline yrs.since.phd    yrs.service        sex    
##  AssocProf:13   A:36       Min.   : 1.00   Min.   : 0.00   Female:39  
##  AsstProf :19   B:42       1st Qu.:10.25   1st Qu.: 5.25   Male  :39  
##  Prof     :46              Median :18.50   Median :14.50              
##                            Mean   :19.71   Mean   :15.05              
##                            3rd Qu.:27.75   3rd Qu.:20.75              
##                            Max.   :56.00   Max.   :51.00              
##      salary      
##  Min.   : 57800  
##  1st Qu.: 88612  
##  Median :104671  
##  Mean   :108024  
##  3rd Qu.:126775  
##  Max.   :186960

# Get the number of unique values for categorical variable:
table(salaries$rank)

## 
## AssocProf  AsstProf      Prof 
##        13        19        46

Exploring Fasta data

# Get the structure of the object
str(fasta)

## List of 1
##  $ NODE_1_length_1000_cov_140.621_ID_1:Class 'SeqFastadna'  atomic [1:1000] a a t c ...
##   .. ..- attr(*, "name")= chr "NODE_1_length_1000_cov_140.621_ID_1"
##   .. ..- attr(*, "Annot")= chr ">NODE_1_length_1000_cov_140.621_ID_1"

# Extract the sequence:
fseq <- fasta[[1]]  # get the first element from the fasta list

#Look at the first few elements
head(fseq)

## [1] "a" "a" "t" "c" "g" "g"

# The length of the vector:
length(fseq)

## [1] 1000

#Base composition of the sequence
table(fseq)

## fseq
##   a   c   g   t 
## 235 253 254 258

#Calculate GC Content  %(G+C)
GC(fseq)

## [1] 0.507

# Count the number of 2-letter words
count(fseq,2)

## 
## aa ac ag at ca cc cg ct ga gc gg gt ta tc tg tt 
## 72 53 47 63 76 63 69 45 41 85 64 64 45 52 74 86

Querying the NCBI Database in R

#List available repositories
choosebank()

##  [1] "genbank"         "embl"            "emblwgs"        
##  [4] "swissprot"       "ensembl"         "hogenom"        
##  [7] "hogenomdna"      "hovergendna"     "hovergen"       
## [10] "hogenom5"        "hogenom5dna"     "hogenom4"       
## [13] "hogenom4dna"     "homolens"        "homolensdna"    
## [16] "hobacnucl"       "hobacprot"       "phever2"        
## [19] "phever2dna"      "refseq"          "greviews"       
## [22] "bacterial"       "archaeal"        "protozoan"      
## [25] "ensprotists"     "ensfungi"        "ensmetazoa"     
## [28] "ensplants"       "ensemblbacteria" "mito"           
## [31] "polymorphix"     "emglib"          "refseqViruses"  
## [34] "ribodb"          "taxodb"

# Chose specific repository
choosebank("refseqViruses")

#Finding the sequence for the DEN-1 Dengue virus genome
den<-query("Dengue1", "AC=NC_001477")
#exploring
attributes(den)

## $names
## [1] "call"     "name"     "nelem"    "typelist" "req"      "socket"  
## 
## $class
## [1] "qaw"

str(den)

## List of 6
##  $ call    : language query(listname = "Dengue1", query = "AC=NC_001477")
##  $ name    : chr "Dengue1"
##  $ nelem   : int 1
##  $ typelist: chr "SQ"
##  $ req     :List of 1
##   ..$ :Class 'SeqAcnucWeb'  atomic [1:1] NC_001477
##   .. .. ..- attr(*, "length")= num 10735
##   .. .. ..- attr(*, "frame")= num 0
##   .. .. ..- attr(*, "ncbigc")= num 1
##  $ socket  :Classes 'sockconn', 'connection'  atomic [1:1] 5
##   .. ..- attr(*, "conn_id")=<externalptr> 
##  - attr(*, "class")= chr "qaw"

#Extract sequence
dseq <- getSequence(den$req[[1]])


#Once we have sequence in R we can retrieve the annotation:
dann <- getAnnot(den$req[[1]])
dann[1:30]   #print first 30 lines

##  [1] "LOCUS       NC_001477              10735 bp ss-RNA     linear   VRL 17-NOV-2011"
##  [2] "DEFINITION  Dengue virus 1, complete genome."                                   
##  [3] "ACCESSION   NC_001477"                                                          
##  [4] "VERSION     NC_001477.1  GI:9626685"                                            
##  [5] "DBLINK      Project: 15306"                                                     
##  [6] "KEYWORDS    ."                                                                  
##  [7] "SOURCE      Dengue virus 1"                                                     
##  [8] "  ORGANISM  Dengue virus 1"                                                     
##  [9] "            Viruses; ssRNA positive-strand viruses, no DNA stage; Flaviviridae;"
## [10] "            Flavivirus; Dengue virus group."                                    
## [11] "REFERENCE   1  (bases 1 to 10735)"                                              
## [12] "  AUTHORS   Puri,B., Nelson,W.M., Henchal,E.A., Hoke,C.H., Eckels,K.H.,"        
## [13] "            Dubois,D.R., Porter,K.R. and Hayes,C.G."                            
## [14] "  TITLE     Molecular analysis of dengue virus attenuation after serial passage"
## [15] "            in primary dog kidney cells"                                        
## [16] "  JOURNAL   J. Gen. Virol. 78 (PT 9), 2287-2291 (1997)"                         
## [17] "   PUBMED   9292016"                                                            
## [18] "REFERENCE   2  (bases 1 to 10735)"                                              
## [19] "  AUTHORS   McKee,K.T. Jr., Bancroft,W.H., Eckels,K.H., Redfield,R.R.,"         
## [20] "            Summers,P.L. and Russell,P.K."                                      
## [21] "  TITLE     Lack of attenuation of a candidate dengue 1 vaccine (45AZ5) in"     
## [22] "            human volunteers"                                                   
## [23] "  JOURNAL   Am. J. Trop. Med. Hyg. 36 (2), 435-442 (1987)"                      
## [24] "   PUBMED   3826504"                                                            
## [25] "REFERENCE   3  (bases 1 to 10735)"                                              
## [26] "  CONSRTM   NCBI Genome Project"                                                
## [27] "  TITLE     Direct Submission"                                                  
## [28] "  JOURNAL   Submitted (01-AUG-2000) National Center for Biotechnology"          
## [29] "            Information, NIH, Bethesda, MD 20894, USA"                          
## [30] "REFERENCE   4  (bases 1 to 10735)"

# Close connection
closebank()

Explore the regular R dataframes

#numeric data exploratory analysis
min(salaries$salary)

## [1] 57800

max(salaries$salary)

## [1] 186960

range(salaries$yrs.service)

## [1]  0 51

summary(salaries$salary)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   57800   88610  104700  108000  126800  187000

#view the data
hist(salaries$salary)

#another way to look at the data
boxplot( salaries$salary )

dataframes slicing

# dataframe slicing (accessing elements)
salaries[3,5]    # element, from the 3rd row, 5th column

## [1] Male
## Levels: Female Male

salaries[3,]     # all elements from the 3rd row

##   rank discipline yrs.since.phd yrs.service  sex salary
## 3 Prof          A            23          20 Male 110515

salaries[ ,5]    # all elements in the 5th column

##  [1] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [11] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [21] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [31] Male   Male   Male   Male   Male   Male   Male   Male   Male   Female
## [41] Female Female Female Female Female Female Female Female Female Female
## [51] Female Female Female Female Female Female Female Female Female Female
## [61] Female Female Female Female Female Female Female Female Female Female
## [71] Female Female Female Female Female Female Female Female
## Levels: Female Male

salaries[[5]]    # same: accessing 5th variable (column) in the dataframe

##  [1] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [11] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [21] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [31] Male   Male   Male   Male   Male   Male   Male   Male   Male   Female
## [41] Female Female Female Female Female Female Female Female Female Female
## [51] Female Female Female Female Female Female Female Female Female Female
## [61] Female Female Female Female Female Female Female Female Female Female
## [71] Female Female Female Female Female Female Female Female
## Levels: Female Male

salaries$sex  # accessing elemnts using variable name

##  [1] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [11] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [21] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [31] Male   Male   Male   Male   Male   Male   Male   Male   Male   Female
## [41] Female Female Female Female Female Female Female Female Female Female
## [51] Female Female Female Female Female Female Female Female Female Female
## [61] Female Female Female Female Female Female Female Female Female Female
## [71] Female Female Female Female Female Female Female Female
## Levels: Female Male

salaries[salaries$sex == "Male", ]  # list all the rows in the dataset for which variable ed is equal to specific value

##         rank discipline yrs.since.phd yrs.service  sex salary
## 1       Prof          B            56          49 Male 186960
## 2       Prof          A            12           6 Male  93000
## 3       Prof          A            23          20 Male 110515
## 4       Prof          A            40          31 Male 131205
## 5       Prof          B            20          18 Male 104800
## 6       Prof          A            20          20 Male 122400
## 7  AssocProf          A            20          17 Male  81285
## 8       Prof          A            18          18 Male 126300
## 9       Prof          A            29          19 Male  94350
## 10      Prof          A            51          51 Male  57800
## 11      Prof          B            39          33 Male 128250
## 12      Prof          B            23          23 Male 134778
## 13  AsstProf          B             1           0 Male  88000
## 14      Prof          B            35          33 Male 162200
## 15      Prof          B            25          19 Male 153750
## 16      Prof          B            17           3 Male 150480
## 17  AsstProf          B             8           3 Male  75044
## 18  AsstProf          B             4           0 Male  92000
## 19      Prof          A            19           7 Male 107300
## 20      Prof          A            29          27 Male 150500
## 21  AsstProf          B             4           4 Male  92000
## 22      Prof          A            33          30 Male 103106
## 23  AsstProf          A             4           2 Male  73000
## 24  AsstProf          A             2           0 Male  85000
## 25      Prof          A            30          23 Male  91100
## 26      Prof          B            35          31 Male  99418
## 27      Prof          A            38          19 Male 148750
## 28      Prof          A            45          43 Male 155865
## 29  AsstProf          B             7           2 Male  91300
## 30      Prof          B            21          20 Male 123683
## 31 AssocProf          B             9           7 Male 107008
## 32      Prof          B            22          21 Male 155750
## 33      Prof          A            27          19 Male 103275
## 34      Prof          B            18          18 Male 120000
## 35 AssocProf          B            12           8 Male 119800
## 36      Prof          B            28          23 Male 126933
## 37      Prof          B            45          45 Male 146856
## 38      Prof          A            20           8 Male 102000
## 39  AsstProf          B             4           3 Male  91000

Creating dataframe

#Define vectors
organism<-c("Human","Mouse","Fruit Fly", "Roundworm","Yeast")
genomeSizeBP<-c(3000000000,3000000000,135600000,97000000,12100000)
estGeneCount<-c(30000,30000,13061,19099,6034)

# Create a dataframe from vectors
dt <- data.frame(organism=organism,
                 genomesz=genomeSizeBP, 
                 genecount=estGeneCount)
dt

##    organism  genomesz genecount
## 1     Human 3.000e+09     30000
## 2     Mouse 3.000e+09     30000
## 3 Fruit Fly 1.356e+08     13061
## 4 Roundworm 9.700e+07     19099
## 5     Yeast 1.210e+07      6034

# Comute gene density (divide genome size by the genome count)
dt$genomesz/dt$genecount

## [1] 100000.000 100000.000  10382.053   5078.800   2005.303

# create a new column "base pairs" per gene
dt$bp <- dt$genomesz/dt$genecount
dt

##    organism  genomesz genecount         bp
## 1     Human 3.000e+09     30000 100000.000
## 2     Mouse 3.000e+09     30000 100000.000
## 3 Fruit Fly 1.356e+08     13061  10382.053
## 4 Roundworm 9.700e+07     19099   5078.800
## 5     Yeast 1.210e+07      6034   2005.303

Missing Values

x <- c(734, 145, NA, 456, NA)    # define a numeric vector
which(is.na(x))       # which elements are missing

## [1] 3 5

anyNA(x)              # are there any missing data

## [1] TRUE

# the following does not work ! - missing value cannot be compared to anything
x == NA

## [1] NA NA NA NA NA

# Use is.na() function to check for missing values
is.na(x)

## [1] FALSE FALSE  TRUE FALSE  TRUE

# By default mamy statistical functions will not compute if the data contain missing values
mean(x)

## [1] NA

# Read help topic for the function
?mean

#Perform computation removing the missing data
mean(x, na.rm=TRUE)

## [1] 445

Hypothesis Testing in R

# This dataset consists of gene expression measurements for 10 genes 
# under control and reatment conditions with 4 replicates each
genht <- read.csv("/project/scv/classes/BI594/geHT.csv")

str(genht)

## 'data.frame':    10 obs. of  9 variables:
##  $ gene: int  1 2 3 4 5 6 7 8 9 10
##  $ c1  : int  2650 1200 1541 1545 1956 1599 2430 1902 1530 2008
##  $ t1  : int  3115 1101 1358 1910 2999 2710 2589 1910 2329 2485
##  $ c2  : int  2619 1200 1401 1652 2066 1754 2789 2028 1660 2104
##  $ t2  : int  2933 1309 1499 2028 2880 2765 2899 2100 2332 2871
##  $ c3  : int  2331 1888 1256 1449 1777 1434 2332 1888 1501 1987
##  $ t3  : int  2799 1901 1238 1901 2898 2689 2300 1901 2298 2650
##  $ c4  : int  2750 1315 1625 1399 1999 1702 2250 2000 1478 2100
##  $ t4  : int  3200 980 1421 2002 2798 2402 2741 1899 2287 2520

#All control values
control <- c(genht$c1, genht$c2, genht$c3, genht$c4  )

# Perform One sample t-test with a Null hypothesis that the true mean is 2000
t.test(control, mu=2000)

## 
##  One Sample t-test
## 
## data:  control
## t = -2.174, df = 39, p-value = 0.03583
## alternative hypothesis: true mean is not equal to 2000
## 95 percent confidence interval:
##  1715.028 1989.722
## sample estimates:
## mean of x 
##  1852.375

#The test result with a p-value of 0.03583 rejects the null hypothesis at a critical
#value (alpha level) of 0.05

# 2-sample t-test
treat <- c(genht$t1, genht$t2, genht$t3, genht$t4  )
t.test(control, treat)

## 
##  Welch Two Sample t-test
## 
## data:  control and treat
## t = -3.6163, df = 70.732, p-value = 0.0005564
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -653.6098 -188.9902
## sample estimates:
## mean of x mean of y 
##  1852.375  2273.675

#The p-value for this test very strongly rejects the null hypothesis of no
#difference between the mean of the treatment group and control group

# paired t-test
#Suppose that control and treatment values were "before" and "after" values for each gene. We could then perform a paired t-test
t.test(control, treat, paired=T)

## 
##  Paired t-test
## 
## data:  control and treat
## t = -6.3945, df = 39, p-value = 1.468e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -554.5654 -288.0346
## sample estimates:
## mean of the differences 
##                  -421.3