# R as a scientific calculator
2+3    # addition
2^3    # power
log(2) # built-in functions


#------------------ #
#     variables     #
#------------------ #

a <- 3
b = -5  # both assignment operators are equivalent, the first one is more traditional
A <- 7  # R is case sensitive

# Avoid using names c, t, cat, F, T, D as those are built-in functions/constants

# The variable name can contain letters, digits, underscores and dots and 
# start with the letter or dot
# The variable name cannot contain dollar sign
str.var  <- "character variable"   # character variable
num.var  <- 21.17                  # numerical variable
bool.var <- TRUE                   # boolean (or logical) variable
bool.var.1 <- F                    # logical values can be abbriviated


rna.str <- "TTAGAATT"    # string variable
mode(rna.str)            # get the type (or mode) of the object
nchar(rna.str)           # get the lenght of the string

#------------------ #
#   R help          #
#------------------ #

# Access help file for the R function
?sd
help(sd)

# Search for help
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")


#------------------ #
#   R vectors       #
#------------------ #

# Vector is an array of R objects of the same type:
num.vector <- c( 5,7, 9,11, 4, -1, 0)

# Numeric vectors can be defined in a number of ways:
vals <- c (2, -7, 5, 3, -1 )             # concatenation
vals <- 25:75                            # range of values
vals <- seq(from=0, to=3, by=0.5)        # sequence definition
vals <- rep(1, times=7)                  # repeat value
vals <- numeric(9)                       # initialize numeric vector with zeros
vals <- rnorm(5, mean=2, sd=1.5 )        # returns normally distributed values

# Vector arithmetic:
dist <- c(6, 10, 5, 8, 15, 9)
time <- c(0.5, 0.9, 0.4,0.7,1.2,0.85)
velocity <- dist/time   # each element of the first vector will be divided by the element from the second vector with the same index
print(velocity)

# R operators
#              +, -, *, /  - addition, subtraction, multiplication, division
#              ^ or **     - exponentiation
#              %%          - modulus
#              %/%         - integer division
#              %in%        - membership
#              :           - sequence genration
#              <, <=, ==, >=, >, !=     - boolean comparative
#              |, ||       - OR  vectorized/not vectorized
#              &, &&       - AND
#              


# vector slicing (subsetting)
x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5)    # define a numeric vector
x[2]         # returns second element 
x[2:4]       # returns second through 4th elements inclusive
x[c(1,3,5)]  # returns 1st, 3rd and 5th elements
x[-2]        # returns all but 2nd element
x[c(TRUE, FALSE, TRUE, FALSE, FALSE,FALSE, TRUE, FALSE)]   # returns 1st, 3rd, and 7th elements

#compare each element of the vector with a value
x < 37.0

#return only those elements of the vector that satisfy a specific condition
x[ x < 37.0 ]    


which.max(x)  # find the (first)maximum element and return its index
which.min(x)
which(x >= 37.0) # find the location of all the elements that satisfy a specific condition


# vector functions:
#               max(x),   min(x),     sum(x)
#               mean(x),  sd(), median(x),  range(x)
#               var(x)                            - simple variance
#               cor(x,y)                          - correlation between x and y
#               sort(x), rank(x), order(x)
#               cumsum(), cumprod(x), cummin(x), cumprod(x)
#               duplicated(x), unique(x)
#               summary()


#------------------ #
#   R packages     #
#------------------ #
# R contains many packages. Many of them can be found on the CRAN website 
# Bioconductor contains many packages used in bioinformatics

#load package
library(seqinr)

#---------------------------- #
#   Reading input files       #
#---------------------------- #

# Read regular csv file:
salaries <- read.csv("http://rcs.bu.edu/classes/BI594/Salaries.csv")

# Read fast files (requires seqinr package)
fasta <- read.fasta("http://rcs.bu.edu/classes/BI594/contigs.fasta")


#------------------------------ #
#   Exploring R dataframes      #
#------------------------------ #
#Explore the regular dataset:

#Look at the first and last few records
head(salaries)
tail(salaries)

#Get the list of the columns
names(salaries)

#Number of rows:
nrow(salaries)

#Number of columns:
ncol(salaries)

#Get the structure of the data:
str(salaries)

#Get the summary of the data:
summary(salaries)

# Get the number of unique values for categorical variable:
table(salaries$rank)

#------------------------------ #
#   Exploring Fasta data        #
#------------------------------ #
# Get the structure of the object
str(fasta)

# Extract the sequence:
fseq <- fasta[[1]]  # get the first element from the fasta list

#Look at the first few elements
head(fseq)

# The length of the vector:
length(fseq)

#Base composition of the sequence
table(fseq)

#Calculate GC Content  %(G+C)
GC(fseq)

# Count the number of 2-letter words
count(fseq,2)


#---------------------------------------- #
#   Querying the NCBI Database in R       #
#---------------------------------------- #
choosebank()

choosebank("refseqViruses")

#Finding the sequence for the DEN-1 Dengue virus genome
den<-query("Dengue1", "AC=NC_001477")
#exploring
attributes(den)
str(den)

#Extract sequence
dseq <- getSequence(den$req[[1]])


#Once we have sequence in R we can retrieve the annotation:
dann <- getAnnot(den$req[[1]])
dann[1:30]   #print first 30 lines

# Close connection
closebank()


#-----------------------------------#
# Explore the regular R dataframes  #
#-----------------------------------#
salaries <- read.csv("http://rcs.bu.edu/classes/BI594/Salaries.csv")


#numeric data exploratory analysis
min(salaries$salary)
max(salaries$salary)
range(salaries$yrs.service)
summary(salaries$salary)

#view the data
hist(salaries$salary)

#another way to look at the data
boxplot( salaries$salary )


#------------------------ #
#   dataframes slicing    #
#------------------------ #
# dataframe slicing (accessing elements)
salaries[3,5]    # element, from the 3rd row, 5th column
salaries[3,]     # all elements from the 3rd row
salaries[ ,5]    # all elements in the 5th column 
salaries[[5]]    # same: accessing 5th variable (column) in the dataframe

salaries$sex  # accessing elemnts using variable name
salaries[salaries$sex == "Male", ]  # list all the rows in the dataset for which variable ed is equal to specific value


# Create dataframe:
organism<-c("Human","Mouse","Fruit Fly", "Roundworm","Yeast")
genomeSizeBP<-c(3000000000,3000000000,135600000,97000000,12100000)
estGeneCount<-c(30000,30000,13061,19099,6034)
dt <- data.frame(organism=organism,
                 genomesz=genomeSizeBP, 
                 genecount=estGeneCount)
dt

# Comute gene density (divide genome size by the genome count)
dt$genomesz/dt$genecount

# create a new column "base pairs" per gene
dt$bp <- dt$genomesz/dt$genecount
dt

#------------------ #
#   Missing Values  #
#------------------ #
x <- c(734, 145, NA, 456, NA)    # define a numeric vector
is.na(x)              # check if the element in the vector is missing
which(is.na(x))       # which elements are missing
anyNA(x)              # are there any missing data

x == NA   # this does not work ! - missing value cannot be compared to anything

mean(x)
# By default mamy statistical functions will not compute if the data contain missing values


# Read help topic for the function
?mean

#Perform computation removing the missing data
mean(x, na.rm=TRUE)

#--------------------------- #
#   Hypothesis Testing in R  #
#--------------------------- #

# This dataset consists of gene expression measurements for 10 genes 
# under control and reatment conditions with 4 replicates each
genht <- read.csv("http://rcs.bu.edu/classes/BI594/geHT.csv")

str(genht)


#All control values
control <- c(genht$c1, genht$c2, genht$c3, genht$c4  )


# Perform One sample t-test with a Null hypothesis that the true mean is 2000
t.test(control, mu=2000)
#The test result with a p-value of 0.03583 rejects the null hypothesis at a critical
#value (alpha level) of 0.05

# 2-sample t-test
treat <- c(genht$t1, genht$t2, genht$t3, genht$t4  )
t.test(control, treat)
#The p-value for this test very strongly rejects the null hypothesis of no
#difference between the mean of the treatment group and control group

# paired t-test
#Suppose that control and treatment values were "before" and "after" values for each gene. We could then perform a paired t-test
t.test(control, treat, paired=T)