--- title: "R intro (session 1)" author: Katia Oleinik (koleinik@bu.edu) date: "10/17/2017" output: html_document --- ## R as a scientific calculator ```{r} 2+3 # addition 2^3 # power log(2) # built-in functions ``` ## Variables R variable name can contain letters, digits, underscores and dots. It has to start with the letter or dot. The variable name cannot contain dollar sign or other special characters. Avoid using names c, t, cat, F, T, D as those are built-in functions or constants ```{r} a <- 3 b = -5 # both assignment operators are equivalent, the first one is more traditional A <- 7 # R is case sensitive ``` Character, numerical and logical vairables are most popular basic types in R: ```{r} str.var <- "character variable" # character variable num.var <- 21.17 # numerical variable bool.var <- TRUE # boolean (or logical) variable bool.var.1 <- F # logical values can be abbriviated ``` Use mode() function to get information about the type of a variable: ```{r} rna.str <- "TTAGAATT" # string variable mode(rna.str) # get the type (or mode) of the object nchar(rna.str) # get the lenght of the string ``` ## R help R provides a comprehensive help for every function with plenty examples and links for further information. ```{r eval=FALSE} # Access help file for a known name of R function ?sd help(sd) ``` ```{r eval=FALSE} # Search for help ??"standard deviation" help.search("standard deviation") help.search("analysis of variance") ``` ## R vectors Vector is an array of R objects of the same type: ```{r} # An example of a numeric vector: num.vector <- c( 5,7, 9,11, 4, -1, 0) ``` Numeric vectors can be defined in a number of ways: ```{r} vals <- c (2, -7, 5, 3, -1 ) # concatenation vals <- 25:75 # range of values vals <- seq(from=0, to=3, by=0.5) # sequence definition vals <- rep(1, times=7) # repeat value vals <- numeric(9) # initialize numeric vector with zeros vals <- rnorm(5, mean=2, sd=1.5 ) # returns normally distributed values ``` ## Vector arithmetic: Most R functions and operators can operate using whole vectors: ```{r} dist <- c(6, 10, 5, 8, 15, 9) time <- c(0.5, 0.9, 0.4,0.7,1.2,0.85) velocity <- dist/time # each element of the first vector will be divided by the element from the second vector with the same index print(velocity) ``` ## R operators

              +, -, *, /  - addition, subtraction, multiplication, division
              ^ or **     - exponentiation
              %%          - modulus
              %/%         - integer division
              %in%        - membership
              :           - sequence genration
              <, <=, ==, >=, >, !=     - boolean comparative
              |, ||       - OR  vectorized/not vectorized
              &, &&       - AND

```{r} #Check if a particular value is present in a vector 5 %in% dist ``` ## vector slicing (subsetting) ```{r} x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5) # define a numeric vector x[2] # returns second element x[2:4] # returns second through 4th elements inclusive x[c(1,3,5)] # returns 1st, 3rd and 5th elements x[-2] # returns all but 2nd element x[c(TRUE, FALSE, TRUE, FALSE, FALSE,FALSE, TRUE, FALSE)] # returns 1st, 3rd, and 7th elements #compare each element of the vector with a value x < 37.0 #return only those elements of the vector that satisfy a specific condition x[ x < 37.0 ] ``` There are a few useful "which" functions: ```{r} which.max(x) # find the (first)maximum element and return its index which.min(x) which(x >= 37.0) # find the location of all the elements that satisfy a specific condition ``` ## vector functions:

               max(x),   min(x),     sum(x)
               mean(x),  sd(), median(x),  range(x)
               var(x)                            - simple variance
               cor(x,y)                          - correlation between x and y
               sort(x), rank(x), order(x)
               cumsum(), cumprod(x), cummin(x), cumprod(x)
               duplicated(x), unique(x)
               summary()

## R packages R contains many packages. Many of them can be found on the CRAN website Bioconductor contains many packages used in bioinformatics ```{r eval=FALSE} # To install a package install.packages("seqinr") ``` ```{r} #load R package library(seqinr) ``` ## Reading input files There are many R functions to process files with various formats: Some come from base R: * read.table() * read.csv() * read.delim() * read.fwf() * scan() and many others There are many R packages which provide additional functionality to read files. ```{r} # Read regular csv file: salaries <- read.csv("/project/scv/classes/BI594/Salaries.csv") # Read fast files (requires seqinr package) fasta <- read.fasta("/project/scv/classes/BI594/contigs.fasta") ``` ## Exploring R dataframes ```{r} #Explore the regular dataset: #Look at the first and last few records head(salaries) tail(salaries) #Get the list of the columns names(salaries) #Number of rows: nrow(salaries) #Number of columns: ncol(salaries) #Get the structure of the data: str(salaries) #Get the summary of the data: summary(salaries) # Get the number of unique values for categorical variable: table(salaries$rank) ``` ## Exploring Fasta data ```{r} # Get the structure of the object str(fasta) # Extract the sequence: fseq <- fasta[[1]] # get the first element from the fasta list #Look at the first few elements head(fseq) # The length of the vector: length(fseq) #Base composition of the sequence table(fseq) #Calculate GC Content %(G+C) GC(fseq) # Count the number of 2-letter words count(fseq,2) ``` ## Querying the NCBI Database in R ```{r} #List available repositories choosebank() # Chose specific repository choosebank("refseqViruses") #Finding the sequence for the DEN-1 Dengue virus genome den<-query("Dengue1", "AC=NC_001477") #exploring attributes(den) str(den) #Extract sequence dseq <- getSequence(den$req[[1]]) #Once we have sequence in R we can retrieve the annotation: dann <- getAnnot(den$req[[1]]) dann[1:30] #print first 30 lines # Close connection closebank() ``` ## Explore the regular R dataframes ```{r} #numeric data exploratory analysis min(salaries$salary) max(salaries$salary) range(salaries$yrs.service) summary(salaries$salary) #view the data hist(salaries$salary) #another way to look at the data boxplot( salaries$salary ) ``` ## dataframes slicing ```{r} # dataframe slicing (accessing elements) salaries[3,5] # element, from the 3rd row, 5th column salaries[3,] # all elements from the 3rd row salaries[ ,5] # all elements in the 5th column salaries[[5]] # same: accessing 5th variable (column) in the dataframe salaries$sex # accessing elemnts using variable name salaries[salaries$sex == "Male", ] # list all the rows in the dataset for which variable ed is equal to specific value ``` ## Creating dataframe ```{r} #Define vectors organism<-c("Human","Mouse","Fruit Fly", "Roundworm","Yeast") genomeSizeBP<-c(3000000000,3000000000,135600000,97000000,12100000) estGeneCount<-c(30000,30000,13061,19099,6034) # Create a dataframe from vectors dt <- data.frame(organism=organism, genomesz=genomeSizeBP, genecount=estGeneCount) dt ``` ```{r} # Comute gene density (divide genome size by the genome count) dt$genomesz/dt$genecount # create a new column "base pairs" per gene dt$bp <- dt$genomesz/dt$genecount dt ``` ## Missing Values ```{r} x <- c(734, 145, NA, 456, NA) # define a numeric vector which(is.na(x)) # which elements are missing anyNA(x) # are there any missing data # the following does not work ! - missing value cannot be compared to anything x == NA # Use is.na() function to check for missing values is.na(x) # By default mamy statistical functions will not compute if the data contain missing values mean(x) ``` ```{r eval=FALSE} # Read help topic for the function ?mean ``` ```{r} #Perform computation removing the missing data mean(x, na.rm=TRUE) ``` ## Hypothesis Testing in R ```{r} # This dataset consists of gene expression measurements for 10 genes # under control and reatment conditions with 4 replicates each genht <- read.csv("/project/scv/classes/BI594/geHT.csv") str(genht) #All control values control <- c(genht$c1, genht$c2, genht$c3, genht$c4 ) # Perform One sample t-test with a Null hypothesis that the true mean is 2000 t.test(control, mu=2000) #The test result with a p-value of 0.03583 rejects the null hypothesis at a critical #value (alpha level) of 0.05 # 2-sample t-test treat <- c(genht$t1, genht$t2, genht$t3, genht$t4 ) t.test(control, treat) #The p-value for this test very strongly rejects the null hypothesis of no #difference between the mean of the treatment group and control group # paired t-test #Suppose that control and treatment values were "before" and "after" values for each gene. We could then perform a paired t-test t.test(control, treat, paired=T) ```