# R as a scientific calculator 2+3 # addition 2^3 # power log(2) # built-in functions #------------------ # # variables # #------------------ # a <- 3 b = -5 # both assignment operators are equivalent, the first one is more traditional A <- 7 # R is case sensitive # Avoid using names c, t, cat, F, T, D as those are built-in functions/constants # The variable name can contain letters, digits, underscores and dots and # start with the letter or dot # The variable name cannot contain dollar sign str.var <- "character variable" # character variable num.var <- 21.17 # numerical variable bool.var <- TRUE # boolean (or logical) variable bool.var.1 <- F # logical values can be abbriviated rna.str <- "TTAGAATT" # string variable mode(rna.str) # get the type (or mode) of the object nchar(rna.str) # get the lenght of the string #------------------ # # R help # #------------------ # # Access help file for the R function ?sd help(sd) # Search for help ??"standard deviation" help.search("standard deviation") help.search("analysis of variance") #------------------ # # R vectors # #------------------ # # Vector is an array of R objects of the same type: num.vector <- c( 5,7, 9,11, 4, -1, 0) # Numeric vectors can be defined in a number of ways: vals <- c (2, -7, 5, 3, -1 ) # concatenation vals <- 25:75 # range of values vals <- seq(from=0, to=3, by=0.5) # sequence definition vals <- rep(1, times=7) # repeat value vals <- numeric(9) # initialize numeric vector with zeros vals <- rnorm(5, mean=2, sd=1.5 ) # returns normally distributed values # Vector arithmetic: dist <- c(6, 10, 5, 8, 15, 9) time <- c(0.5, 0.9, 0.4,0.7,1.2,0.85) velocity <- dist/time # each element of the first vector will be divided by the element from the second vector with the same index print(velocity) # R operators # +, -, *, / - addition, subtraction, multiplication, division # ^ or ** - exponentiation # %% - modulus # %/% - integer division # %in% - membership # : - sequence genration # <, <=, ==, >=, >, != - boolean comparative # |, || - OR vectorized/not vectorized # &, && - AND # # vector slicing (subsetting) x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5) # define a numeric vector x[2] # returns second element x[2:4] # returns second through 4th elements inclusive x[c(1,3,5)] # returns 1st, 3rd and 5th elements x[-2] # returns all but 2nd element x[c(TRUE, FALSE, TRUE, FALSE, FALSE,FALSE, TRUE, FALSE)] # returns 1st, 3rd, and 7th elements #compare each element of the vector with a value x < 37.0 #return only those elements of the vector that satisfy a specific condition x[ x < 37.0 ] which.max(x) # find the (first)maximum element and return its index which.min(x) which(x >= 37.0) # find the location of all the elements that satisfy a specific condition # vector functions: # max(x), min(x), sum(x) # mean(x), sd(), median(x), range(x) # var(x) - simple variance # cor(x,y) - correlation between x and y # sort(x), rank(x), order(x) # cumsum(), cumprod(x), cummin(x), cumprod(x) # duplicated(x), unique(x) # summary() #------------------ # # R packages # #------------------ # # R contains many packages. Many of them can be found on the CRAN website # Bioconductor contains many packages used in bioinformatics #load package library(seqinr) #---------------------------- # # Reading input files # #---------------------------- # # Read regular csv file: salaries <- read.csv("http://rcs.bu.edu/classes/BI594/Salaries.csv") # Read fast files (requires seqinr package) fasta <- read.fasta("http://rcs.bu.edu/classes/BI594/contigs.fasta") #------------------------------ # # Exploring R dataframes # #------------------------------ # #Explore the regular dataset: #Look at the first and last few records head(salaries) tail(salaries) #Get the list of the columns names(salaries) #Number of rows: nrow(salaries) #Number of columns: ncol(salaries) #Get the structure of the data: str(salaries) #Get the summary of the data: summary(salaries) # Get the number of unique values for categorical variable: table(salaries$rank) #------------------------------ # # Exploring Fasta data # #------------------------------ # # Get the structure of the object str(fasta) # Extract the sequence: fseq <- fasta[[1]] # get the first element from the fasta list #Look at the first few elements head(fseq) # The length of the vector: length(fseq) #Base composition of the sequence table(fseq) #Calculate GC Content %(G+C) GC(fseq) # Count the number of 2-letter words count(fseq,2) #---------------------------------------- # # Querying the NCBI Database in R # #---------------------------------------- # choosebank() choosebank("refseqViruses") #Finding the sequence for the DEN-1 Dengue virus genome den<-query("Dengue1", "AC=NC_001477") #exploring attributes(den) str(den) #Extract sequence dseq <- getSequence(den$req[[1]]) #Once we have sequence in R we can retrieve the annotation: dann <- getAnnot(den$req[[1]]) dann[1:30] #print first 30 lines # Close connection closebank() #-----------------------------------# # Explore the regular R dataframes # #-----------------------------------# salaries <- read.csv("http://rcs.bu.edu/classes/BI594/Salaries.csv") #numeric data exploratory analysis min(salaries$salary) max(salaries$salary) range(salaries$yrs.service) summary(salaries$salary) #view the data hist(salaries$salary) #another way to look at the data boxplot( salaries$salary ) #------------------------ # # dataframes slicing # #------------------------ # # dataframe slicing (accessing elements) salaries[3,5] # element, from the 3rd row, 5th column salaries[3,] # all elements from the 3rd row salaries[ ,5] # all elements in the 5th column salaries[[5]] # same: accessing 5th variable (column) in the dataframe salaries$sex # accessing elemnts using variable name salaries[salaries$sex == "Male", ] # list all the rows in the dataset for which variable ed is equal to specific value # Create dataframe: organism<-c("Human","Mouse","Fruit Fly", "Roundworm","Yeast") genomeSizeBP<-c(3000000000,3000000000,135600000,97000000,12100000) estGeneCount<-c(30000,30000,13061,19099,6034) dt <- data.frame(organism=organism, genomesz=genomeSizeBP, genecount=estGeneCount) dt # Comute gene density (divide genome size by the genome count) dt$genomesz/dt$genecount # create a new column "base pairs" per gene dt$bp <- dt$genomesz/dt$genecount dt #------------------ # # Missing Values # #------------------ # x <- c(734, 145, NA, 456, NA) # define a numeric vector is.na(x) # check if the element in the vector is missing which(is.na(x)) # which elements are missing anyNA(x) # are there any missing data x == NA # this does not work ! - missing value cannot be compared to anything mean(x) # By default mamy statistical functions will not compute if the data contain missing values # Read help topic for the function ?mean #Perform computation removing the missing data mean(x, na.rm=TRUE) #--------------------------- # # Hypothesis Testing in R # #--------------------------- # # This dataset consists of gene expression measurements for 10 genes # under control and reatment conditions with 4 replicates each genht <- read.csv("http://rcs.bu.edu/classes/BI594/geHT.csv") str(genht) #All control values control <- c(genht$c1, genht$c2, genht$c3, genht$c4 ) # Perform One sample t-test with a Null hypothesis that the true mean is 2000 t.test(control, mu=2000) #The test result with a p-value of 0.03583 rejects the null hypothesis at a critical #value (alpha level) of 0.05 # 2-sample t-test treat <- c(genht$t1, genht$t2, genht$t3, genht$t4 ) t.test(control, treat) #The p-value for this test very strongly rejects the null hypothesis of no #difference between the mean of the treatment group and control group # paired t-test #Suppose that control and treatment values were "before" and "after" values for each gene. We could then perform a paired t-test t.test(control, treat, paired=T)