2+3 # addition
## [1] 5
2^3 # power
## [1] 8
log(2) # built-in functions
## [1] 0.6931472
R variable name can contain letters, digits, underscores and dots. It has to start with the letter or dot. The variable name cannot contain dollar sign or other special characters. Avoid using names c, t, cat, F, T, D as those are built-in functions or constants
a <- 3
b = -5 # both assignment operators are equivalent, the first one is more traditional
A <- 7 # R is case sensitive
Character, numerical and logical vairables are most popular basic types in R:
str.var <- "character variable" # character variable
num.var <- 21.17 # numerical variable
bool.var <- TRUE # boolean (or logical) variable
bool.var.1 <- F # logical values can be abbriviated
Use mode()
function to get information about the type of a variable:
rna.str <- "TTAGAATT" # string variable
mode(rna.str) # get the type (or mode) of the object
## [1] "character"
nchar(rna.str) # get the lenght of the string
## [1] 8
R provides a comprehensive help for every function with plenty examples and links for further information.
# Access help file for a known name of R function
?sd
help(sd)
# Search for help
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")
Vector is an array of R objects of the same type:
# An example of a numeric vector:
num.vector <- c( 5,7, 9,11, 4, -1, 0)
Numeric vectors can be defined in a number of ways:
vals <- c (2, -7, 5, 3, -1 ) # concatenation
vals <- 25:75 # range of values
vals <- seq(from=0, to=3, by=0.5) # sequence definition
vals <- rep(1, times=7) # repeat value
vals <- numeric(9) # initialize numeric vector with zeros
vals <- rnorm(5, mean=2, sd=1.5 ) # returns normally distributed values
Most R functions and operators can operate using whole vectors:
dist <- c(6, 10, 5, 8, 15, 9)
time <- c(0.5, 0.9, 0.4,0.7,1.2,0.85)
velocity <- dist/time # each element of the first vector will be divided by the element from the second vector with the same index
print(velocity)
## [1] 12.00000 11.11111 12.50000 11.42857 12.50000 10.58824
+, -, *, / - addition, subtraction, multiplication, division ^ or ** - exponentiation %% - modulus %/% - integer division %in% - membership : - sequence genration <, <=, ==, >=, >, != - boolean comparative |, || - OR vectorized/not vectorized &, && - AND
#Check if a particular value is present in a vector
5 %in% dist
## [1] TRUE
x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5) # define a numeric vector
x[2] # returns second element
## [1] 38.2
x[2:4] # returns second through 4th elements inclusive
## [1] 38.2 36.4 37.9
x[c(1,3,5)] # returns 1st, 3rd and 5th elements
## [1] 36.6 36.4 41.0
x[-2] # returns all but 2nd element
## [1] 36.6 36.4 37.9 41.0 39.9 36.8 37.5
x[c(TRUE, FALSE, TRUE, FALSE, FALSE,FALSE, TRUE, FALSE)] # returns 1st, 3rd, and 7th elements
## [1] 36.6 36.4 36.8
#compare each element of the vector with a value
x < 37.0
## [1] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
#return only those elements of the vector that satisfy a specific condition
x[ x < 37.0 ]
## [1] 36.6 36.4 36.8
There are a few useful “which” functions:
which.max(x) # find the (first)maximum element and return its index
## [1] 5
which.min(x)
## [1] 3
which(x >= 37.0) # find the location of all the elements that satisfy a specific condition
## [1] 2 4 5 6 8
max(x), min(x), sum(x) mean(x), sd(), median(x), range(x) var(x) - simple variance cor(x,y) - correlation between x and y sort(x), rank(x), order(x) cumsum(), cumprod(x), cummin(x), cumprod(x) duplicated(x), unique(x) summary()
R contains many packages. Many of them can be found on the CRAN website Bioconductor contains many packages used in bioinformatics
# To install a package
install.packages("seqinr")
#load R package
library(seqinr)
There are many R functions to process files with various formats: Some come from base R:
There are many R packages which provide additional functionality to read files.
# Read regular csv file:
salaries <- read.csv("/project/scv/classes/BI594/Salaries.csv")
# Read fast files (requires seqinr package)
fasta <- read.fasta("/project/scv/classes/BI594/contigs.fasta")
#Explore the regular dataset:
#Look at the first and last few records
head(salaries)
## rank discipline yrs.since.phd yrs.service sex salary
## 1 Prof B 56 49 Male 186960
## 2 Prof A 12 6 Male 93000
## 3 Prof A 23 20 Male 110515
## 4 Prof A 40 31 Male 131205
## 5 Prof B 20 18 Male 104800
## 6 Prof A 20 20 Male 122400
tail(salaries)
## rank discipline yrs.since.phd yrs.service sex salary
## 73 Prof B 24 15 Female 161101
## 74 Prof B 18 10 Female 105450
## 75 AssocProf B 19 6 Female 104542
## 76 Prof B 17 17 Female 124312
## 77 Prof A 28 14 Female 109954
## 78 Prof A 23 15 Female 109646
#Get the list of the columns
names(salaries)
## [1] "rank" "discipline" "yrs.since.phd" "yrs.service"
## [5] "sex" "salary"
#Number of rows:
nrow(salaries)
## [1] 78
#Number of columns:
ncol(salaries)
## [1] 6
#Get the structure of the data:
str(salaries)
## 'data.frame': 78 obs. of 6 variables:
## $ rank : Factor w/ 3 levels "AssocProf","AsstProf",..: 3 3 3 3 3 3 1 3 3 3 ...
## $ discipline : Factor w/ 2 levels "A","B": 2 1 1 1 2 1 1 1 1 1 ...
## $ yrs.since.phd: int 56 12 23 40 20 20 20 18 29 51 ...
## $ yrs.service : int 49 6 20 31 18 20 17 18 19 51 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ salary : int 186960 93000 110515 131205 104800 122400 81285 126300 94350 57800 ...
#Get the summary of the data:
summary(salaries)
## rank discipline yrs.since.phd yrs.service sex
## AssocProf:13 A:36 Min. : 1.00 Min. : 0.00 Female:39
## AsstProf :19 B:42 1st Qu.:10.25 1st Qu.: 5.25 Male :39
## Prof :46 Median :18.50 Median :14.50
## Mean :19.71 Mean :15.05
## 3rd Qu.:27.75 3rd Qu.:20.75
## Max. :56.00 Max. :51.00
## salary
## Min. : 57800
## 1st Qu.: 88612
## Median :104671
## Mean :108024
## 3rd Qu.:126775
## Max. :186960
# Get the number of unique values for categorical variable:
table(salaries$rank)
##
## AssocProf AsstProf Prof
## 13 19 46
# Get the structure of the object
str(fasta)
## List of 1
## $ NODE_1_length_1000_cov_140.621_ID_1:Class 'SeqFastadna' atomic [1:1000] a a t c ...
## .. ..- attr(*, "name")= chr "NODE_1_length_1000_cov_140.621_ID_1"
## .. ..- attr(*, "Annot")= chr ">NODE_1_length_1000_cov_140.621_ID_1"
# Extract the sequence:
fseq <- fasta[[1]] # get the first element from the fasta list
#Look at the first few elements
head(fseq)
## [1] "a" "a" "t" "c" "g" "g"
# The length of the vector:
length(fseq)
## [1] 1000
#Base composition of the sequence
table(fseq)
## fseq
## a c g t
## 235 253 254 258
#Calculate GC Content %(G+C)
GC(fseq)
## [1] 0.507
# Count the number of 2-letter words
count(fseq,2)
##
## aa ac ag at ca cc cg ct ga gc gg gt ta tc tg tt
## 72 53 47 63 76 63 69 45 41 85 64 64 45 52 74 86
#List available repositories
choosebank()
## [1] "genbank" "embl" "emblwgs"
## [4] "swissprot" "ensembl" "hogenom"
## [7] "hogenomdna" "hovergendna" "hovergen"
## [10] "hogenom5" "hogenom5dna" "hogenom4"
## [13] "hogenom4dna" "homolens" "homolensdna"
## [16] "hobacnucl" "hobacprot" "phever2"
## [19] "phever2dna" "refseq" "greviews"
## [22] "bacterial" "archaeal" "protozoan"
## [25] "ensprotists" "ensfungi" "ensmetazoa"
## [28] "ensplants" "ensemblbacteria" "mito"
## [31] "polymorphix" "emglib" "refseqViruses"
## [34] "ribodb" "taxodb"
# Chose specific repository
choosebank("refseqViruses")
#Finding the sequence for the DEN-1 Dengue virus genome
den<-query("Dengue1", "AC=NC_001477")
#exploring
attributes(den)
## $names
## [1] "call" "name" "nelem" "typelist" "req" "socket"
##
## $class
## [1] "qaw"
str(den)
## List of 6
## $ call : language query(listname = "Dengue1", query = "AC=NC_001477")
## $ name : chr "Dengue1"
## $ nelem : int 1
## $ typelist: chr "SQ"
## $ req :List of 1
## ..$ :Class 'SeqAcnucWeb' atomic [1:1] NC_001477
## .. .. ..- attr(*, "length")= num 10735
## .. .. ..- attr(*, "frame")= num 0
## .. .. ..- attr(*, "ncbigc")= num 1
## $ socket :Classes 'sockconn', 'connection' atomic [1:1] 5
## .. ..- attr(*, "conn_id")=<externalptr>
## - attr(*, "class")= chr "qaw"
#Extract sequence
dseq <- getSequence(den$req[[1]])
#Once we have sequence in R we can retrieve the annotation:
dann <- getAnnot(den$req[[1]])
dann[1:30] #print first 30 lines
## [1] "LOCUS NC_001477 10735 bp ss-RNA linear VRL 17-NOV-2011"
## [2] "DEFINITION Dengue virus 1, complete genome."
## [3] "ACCESSION NC_001477"
## [4] "VERSION NC_001477.1 GI:9626685"
## [5] "DBLINK Project: 15306"
## [6] "KEYWORDS ."
## [7] "SOURCE Dengue virus 1"
## [8] " ORGANISM Dengue virus 1"
## [9] " Viruses; ssRNA positive-strand viruses, no DNA stage; Flaviviridae;"
## [10] " Flavivirus; Dengue virus group."
## [11] "REFERENCE 1 (bases 1 to 10735)"
## [12] " AUTHORS Puri,B., Nelson,W.M., Henchal,E.A., Hoke,C.H., Eckels,K.H.,"
## [13] " Dubois,D.R., Porter,K.R. and Hayes,C.G."
## [14] " TITLE Molecular analysis of dengue virus attenuation after serial passage"
## [15] " in primary dog kidney cells"
## [16] " JOURNAL J. Gen. Virol. 78 (PT 9), 2287-2291 (1997)"
## [17] " PUBMED 9292016"
## [18] "REFERENCE 2 (bases 1 to 10735)"
## [19] " AUTHORS McKee,K.T. Jr., Bancroft,W.H., Eckels,K.H., Redfield,R.R.,"
## [20] " Summers,P.L. and Russell,P.K."
## [21] " TITLE Lack of attenuation of a candidate dengue 1 vaccine (45AZ5) in"
## [22] " human volunteers"
## [23] " JOURNAL Am. J. Trop. Med. Hyg. 36 (2), 435-442 (1987)"
## [24] " PUBMED 3826504"
## [25] "REFERENCE 3 (bases 1 to 10735)"
## [26] " CONSRTM NCBI Genome Project"
## [27] " TITLE Direct Submission"
## [28] " JOURNAL Submitted (01-AUG-2000) National Center for Biotechnology"
## [29] " Information, NIH, Bethesda, MD 20894, USA"
## [30] "REFERENCE 4 (bases 1 to 10735)"
# Close connection
closebank()
#numeric data exploratory analysis
min(salaries$salary)
## [1] 57800
max(salaries$salary)
## [1] 186960
range(salaries$yrs.service)
## [1] 0 51
summary(salaries$salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 57800 88610 104700 108000 126800 187000
#view the data
hist(salaries$salary)
#another way to look at the data
boxplot( salaries$salary )
# dataframe slicing (accessing elements)
salaries[3,5] # element, from the 3rd row, 5th column
## [1] Male
## Levels: Female Male
salaries[3,] # all elements from the 3rd row
## rank discipline yrs.since.phd yrs.service sex salary
## 3 Prof A 23 20 Male 110515
salaries[ ,5] # all elements in the 5th column
## [1] Male Male Male Male Male Male Male Male Male Male
## [11] Male Male Male Male Male Male Male Male Male Male
## [21] Male Male Male Male Male Male Male Male Male Male
## [31] Male Male Male Male Male Male Male Male Male Female
## [41] Female Female Female Female Female Female Female Female Female Female
## [51] Female Female Female Female Female Female Female Female Female Female
## [61] Female Female Female Female Female Female Female Female Female Female
## [71] Female Female Female Female Female Female Female Female
## Levels: Female Male
salaries[[5]] # same: accessing 5th variable (column) in the dataframe
## [1] Male Male Male Male Male Male Male Male Male Male
## [11] Male Male Male Male Male Male Male Male Male Male
## [21] Male Male Male Male Male Male Male Male Male Male
## [31] Male Male Male Male Male Male Male Male Male Female
## [41] Female Female Female Female Female Female Female Female Female Female
## [51] Female Female Female Female Female Female Female Female Female Female
## [61] Female Female Female Female Female Female Female Female Female Female
## [71] Female Female Female Female Female Female Female Female
## Levels: Female Male
salaries$sex # accessing elemnts using variable name
## [1] Male Male Male Male Male Male Male Male Male Male
## [11] Male Male Male Male Male Male Male Male Male Male
## [21] Male Male Male Male Male Male Male Male Male Male
## [31] Male Male Male Male Male Male Male Male Male Female
## [41] Female Female Female Female Female Female Female Female Female Female
## [51] Female Female Female Female Female Female Female Female Female Female
## [61] Female Female Female Female Female Female Female Female Female Female
## [71] Female Female Female Female Female Female Female Female
## Levels: Female Male
salaries[salaries$sex == "Male", ] # list all the rows in the dataset for which variable ed is equal to specific value
## rank discipline yrs.since.phd yrs.service sex salary
## 1 Prof B 56 49 Male 186960
## 2 Prof A 12 6 Male 93000
## 3 Prof A 23 20 Male 110515
## 4 Prof A 40 31 Male 131205
## 5 Prof B 20 18 Male 104800
## 6 Prof A 20 20 Male 122400
## 7 AssocProf A 20 17 Male 81285
## 8 Prof A 18 18 Male 126300
## 9 Prof A 29 19 Male 94350
## 10 Prof A 51 51 Male 57800
## 11 Prof B 39 33 Male 128250
## 12 Prof B 23 23 Male 134778
## 13 AsstProf B 1 0 Male 88000
## 14 Prof B 35 33 Male 162200
## 15 Prof B 25 19 Male 153750
## 16 Prof B 17 3 Male 150480
## 17 AsstProf B 8 3 Male 75044
## 18 AsstProf B 4 0 Male 92000
## 19 Prof A 19 7 Male 107300
## 20 Prof A 29 27 Male 150500
## 21 AsstProf B 4 4 Male 92000
## 22 Prof A 33 30 Male 103106
## 23 AsstProf A 4 2 Male 73000
## 24 AsstProf A 2 0 Male 85000
## 25 Prof A 30 23 Male 91100
## 26 Prof B 35 31 Male 99418
## 27 Prof A 38 19 Male 148750
## 28 Prof A 45 43 Male 155865
## 29 AsstProf B 7 2 Male 91300
## 30 Prof B 21 20 Male 123683
## 31 AssocProf B 9 7 Male 107008
## 32 Prof B 22 21 Male 155750
## 33 Prof A 27 19 Male 103275
## 34 Prof B 18 18 Male 120000
## 35 AssocProf B 12 8 Male 119800
## 36 Prof B 28 23 Male 126933
## 37 Prof B 45 45 Male 146856
## 38 Prof A 20 8 Male 102000
## 39 AsstProf B 4 3 Male 91000
#Define vectors
organism<-c("Human","Mouse","Fruit Fly", "Roundworm","Yeast")
genomeSizeBP<-c(3000000000,3000000000,135600000,97000000,12100000)
estGeneCount<-c(30000,30000,13061,19099,6034)
# Create a dataframe from vectors
dt <- data.frame(organism=organism,
genomesz=genomeSizeBP,
genecount=estGeneCount)
dt
## organism genomesz genecount
## 1 Human 3.000e+09 30000
## 2 Mouse 3.000e+09 30000
## 3 Fruit Fly 1.356e+08 13061
## 4 Roundworm 9.700e+07 19099
## 5 Yeast 1.210e+07 6034
# Comute gene density (divide genome size by the genome count)
dt$genomesz/dt$genecount
## [1] 100000.000 100000.000 10382.053 5078.800 2005.303
# create a new column "base pairs" per gene
dt$bp <- dt$genomesz/dt$genecount
dt
## organism genomesz genecount bp
## 1 Human 3.000e+09 30000 100000.000
## 2 Mouse 3.000e+09 30000 100000.000
## 3 Fruit Fly 1.356e+08 13061 10382.053
## 4 Roundworm 9.700e+07 19099 5078.800
## 5 Yeast 1.210e+07 6034 2005.303
x <- c(734, 145, NA, 456, NA) # define a numeric vector
which(is.na(x)) # which elements are missing
## [1] 3 5
anyNA(x) # are there any missing data
## [1] TRUE
# the following does not work ! - missing value cannot be compared to anything
x == NA
## [1] NA NA NA NA NA
# Use is.na() function to check for missing values
is.na(x)
## [1] FALSE FALSE TRUE FALSE TRUE
# By default mamy statistical functions will not compute if the data contain missing values
mean(x)
## [1] NA
# Read help topic for the function
?mean
#Perform computation removing the missing data
mean(x, na.rm=TRUE)
## [1] 445
# This dataset consists of gene expression measurements for 10 genes
# under control and reatment conditions with 4 replicates each
genht <- read.csv("/project/scv/classes/BI594/geHT.csv")
str(genht)
## 'data.frame': 10 obs. of 9 variables:
## $ gene: int 1 2 3 4 5 6 7 8 9 10
## $ c1 : int 2650 1200 1541 1545 1956 1599 2430 1902 1530 2008
## $ t1 : int 3115 1101 1358 1910 2999 2710 2589 1910 2329 2485
## $ c2 : int 2619 1200 1401 1652 2066 1754 2789 2028 1660 2104
## $ t2 : int 2933 1309 1499 2028 2880 2765 2899 2100 2332 2871
## $ c3 : int 2331 1888 1256 1449 1777 1434 2332 1888 1501 1987
## $ t3 : int 2799 1901 1238 1901 2898 2689 2300 1901 2298 2650
## $ c4 : int 2750 1315 1625 1399 1999 1702 2250 2000 1478 2100
## $ t4 : int 3200 980 1421 2002 2798 2402 2741 1899 2287 2520
#All control values
control <- c(genht$c1, genht$c2, genht$c3, genht$c4 )
# Perform One sample t-test with a Null hypothesis that the true mean is 2000
t.test(control, mu=2000)
##
## One Sample t-test
##
## data: control
## t = -2.174, df = 39, p-value = 0.03583
## alternative hypothesis: true mean is not equal to 2000
## 95 percent confidence interval:
## 1715.028 1989.722
## sample estimates:
## mean of x
## 1852.375
#The test result with a p-value of 0.03583 rejects the null hypothesis at a critical
#value (alpha level) of 0.05
# 2-sample t-test
treat <- c(genht$t1, genht$t2, genht$t3, genht$t4 )
t.test(control, treat)
##
## Welch Two Sample t-test
##
## data: control and treat
## t = -3.6163, df = 70.732, p-value = 0.0005564
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -653.6098 -188.9902
## sample estimates:
## mean of x mean of y
## 1852.375 2273.675
#The p-value for this test very strongly rejects the null hypothesis of no
#difference between the mean of the treatment group and control group
# paired t-test
#Suppose that control and treatment values were "before" and "after" values for each gene. We could then perform a paired t-test
t.test(control, treat, paired=T)
##
## Paired t-test
##
## data: control and treat
## t = -6.3945, df = 39, p-value = 1.468e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -554.5654 -288.0346
## sample estimates:
## mean of the differences
## -421.3