# ---------------------------------------#
#                                        #
#  Introduction to R
#  Graphics                              #
# ---------------------------------------#

#To install a package:
#install.packages("ggplot2")
#install.packages("RColorBrewer")

#To load R package
library(ggplot2)
library(RColorBrewer)


#Read Salary dataset
salaries <- read.csv( "http://scv.bu.edu/classes/BI594/Salaries.csv")
ecoli <- read.csv( "http://scv.bu.edu/classes/BI594/Ecoli_metadata.csv")

#Explore the dataset
head(salaries)
str(salaries)
summary(salaries)

#Excersize: explore ecoli dataset

# Factors


#dplyr - R package that provide easy tools for data manipulation
#install.packages(dplyr)
library(dplyr)

#To chose specific columns from a dataframe
select(ecoli, sample, clade, cit, genome_size)

#To chose rows:
filter(ecoli, cit == "plus")

#Pipes
ecoli %>%
  filter(cit == "plus") %>%
  select(sample, generation, clade)

#To save the resulting object"
ecoli_citplus <- ecoli %>%
  filter(cit == "plus") %>%
  select(sample, generation, clade)

# Split - apply - combine ( n() is a function within dplyr package to find count for each group )
ecoli %>%
  group_by(cit) %>%
  summarize( n() )

#To calculate mean for each subgroup
ecoli %>%
  group_by(cit, clade) %>%
  summarize( mean_size = mean(genome_size, na.rm = TRUE) )


# There are 3 main environments in R to create graphics:
# - R base utilities
# - lattice package
# - ggplot2 package
# There is also powerful "grid" package and a couple in-development interactive packages


# ------ Base R Graphics -------

# Generic(high level) plot functions:
# -plot: generic x-y plots, scatterplots
# -matplot
# -barplot
# -boxplot
# -hist: histogram
# -pie: pie charts
# -dotchart: cleveland dot plots
# -image, heatmap, contour, persp: image-like plots
# -qqnorm, qqline, qqplot: distribution analysis plots
# -pairs, coplot: display of multivariant data
# -dotchart
# -mosaicplot
# -stripchart
# -contour
# -hclust: hierarchial clustering

# Low level - functions that add to the existing plot created with a high-level plot functions:
# -points
# -lines
# -abline
# -segments
# -text
# -mtext
# -title
# -axis
# -grid
# -legend
# -rug
# -polygon
# -box

#par() function contains the list of various graphics parameters that can be modified globaly or for each graph


# ------ Example of using graphics parameters to change the original plot into informative one:

# create points for sqrt(x) and log(x) functions
x <- 1:10
y1 <- log(x)
y2 <- sqrt(x)

# First attempt to draw the points:
plot(x,y1)
lines(x,y2)

# Using R graphics parameters we can convert this plot into high quality one:
plot(x, y1,                                          # specify the coordinates
     type = "b",                                     # type of the plot
     ylim = c(min(c(y1,y2)), max(c(y1,y2))),         # set the axis limits
     main = "Logarithm and Square root functions",   # title
     ylab = "y",     #                               # y label
     col = "darkgreen",                              # line color
     lwd = 2,                                        # line tickness
     pch=15,                                         # symbol to display the point
     cex.lab=1.5,                                    # size of the font used for labels
     font.lab = 3,                                   # font type
     cex.main = 2)                                   # size of the font used for the title

#Additional line
lines(x,y2,                        # coordinates
      col = "steelblue",           # color
      lwd =2,                      # line width
      type="b",                    # line type (b - both, lines and points) 
      pch=19,                      # symbol used to display points
      cex.lab=1.5)                 # font size for labels

#Add grid to the plot
abline(h = seq(0,3,by=0.5), v = c(0:10), col='lightgray',lty="dotted")

#Add legend
legend("topleft",                             # position
       col=c("darkgreen","steelblue"),        # colors
       lty=1,                                 # draw line
       pch=c(15,19),                          # draw points
       cex=1.5,                               # text size
       legend=c("logarithm","square root"))   # text

# Add text to the graph
text(x = 6, y = sqrt(6), pos = 2, labels=expression(paste("y = ",sqrt(x))), cex=2, offset=0.75)
text(x = 6, y = log(6),  pos = 2, labels=expression(paste("y = ",ln(x))),   cex=2, offset=0.75)


# For our ecoli dataset:
plot(ecoli$genome_size, pch=8, main="Scatter plot of genome sizes")

# -------- Bar plot --------

barplot(ranks)


b<-barplot(ranks,
           col = mycol,
           main = "Teaching Faculty",
           sub = "Source: University Administration",
           names.arg = names(ranks), 
           ylim = c(0, max(ranks) * 1.1),
           axes = FALSE)

text(b,ranks,
     labels = ranks, 
     pos = 3)


# -------- Colors in R ---------
# There are many ways to specify a color in R:
# - specify an index within a colorscheme
# - by name
# - using rgb, hsv, hex values

# 1. Using index within colorscheme (palettes)
barplot(c(1:10), col=c(1:10))

      #list colors in the current palette:
      palette()

      pie(rep(1,length(palette())),
          col=palette())

      #some available palettes: 
      #   rainbow(), heat.colors(), terrain.colors(), topo.colors(), cm.colors(), gray.colors()
num.col=15
barplot(c(1:num.col), col=rainbow(num.col))
barplot(c(1:num.col), col=heat.colors(num.col))
barplot(c(1:num.col), col=terrain.colors(num.col))
barplot(c(1:num.col), col=topo.colors(num.col))
barplot(c(1:num.col), col=cm.colors(num.col))
barplot(c(1:num.col), col=gray.colors(num.col))


#A popular palettes could be found in package RColorBrewer
#install.packages("RColorBrewer")
library(RColorBrewer)
display.brewer.all()
barplot(c(1:8), col=brewer.pal(8,"Dark2") )
barplot(c(1:9), col=brewer.pal(9,"Set1") )


# 2. Specify colors by name
colors() # list all colornames
barplot(c(21,19,16,14,12,10,8,5,4,3,2,1), col=c("steelblue4",
                                "darkred",
                                "dodgerblue",
                                "forestgreen",
                                "gold",
                                "steelblue",
                                "firebrick",
                                "darkorange",
                                "darkgrey",
                                "tomato",
                                "olivedrab",
                                "dimgray"))


# A lits of some of my favorite colors:
#steelblue, steelblue4
#firebrick
#tomato
#skyblue
#olivedrab
#forestgreen
#gold
#dodgerblue
#dimgray
#darkred
#darkorange
#darkgray


#springgreen
#violetred
#slateblue
#sienna
#seagreen
#sandybrown
#salmon
#saddlebrown
#maroon
#limegreen

barplot(c(21,19,17,14,12,8,5,4,3,2,1), col=c("springgreen",
                                          "skyblue",
                                 "violetred",
                                 "slateblue",
                                 "sienna",
                                 "seagreen",
                                 "sandybrown",
                                 "salmon",
                                 "saddlebrown",
                                 "maroon",
                                 "limegreen"))


# 3. Specify rgb values
col1 <- rgb(0.1, 0.5, 0.1)
col2 <- rgb(0.9, 0.7, 0.0)
barplot(c(7,11), col=c(col1, col2))


#Conversion one color format to another
# col2rgb(), rgb2hsv()

# Additional help related to colors
help(package=colorspace)


# -------- Boxplot--------

boxplot(salaries$salary)

#let's display 2 boxplots: one for male and one for female names
boxplot( formula = salary ~ discipline,
         data = salaries,
         names = c("A","B"),
         col = c("gold","forestgreen"),
         horizontal = TRUE,
         main = "Salary range for Disciplines A and B")

legend("topright", 
       legend=c("A","B"), 
       pch=15,
       col = c("gold","forestgreen"))


# Side by side boxplot for ecoli dataset
boxplot(genome_size ~ cit, ecoli)   #simple


#Add colors, title, labels
boxplot(genome_size ~ cit, ecoli,  col=c("pink","purple", "darkgrey"),
        main="Average expression differences between celltypes", ylab="Expression")


# ------- Plot -------
plot(salaries$salary~salaries$yrs.service)
# or 
plot(salaries$yrs.service, salaries$salary)

#Function plot could be used to plot points or lines

#Explore various symbols 
?points

#Draw a table with all symbols
plot(c(-1, 26), 0:1, type = "n", axes = FALSE, xlab="", ylab="")
text(0:25, 0.6, 0:25, cex=0.75)
points(0:25, rep(0.5, 26), pch = 0:25, bg = "grey")


plot(x = salaries$yrs.service,
     y = salaries$salary, 
     pch = 19)

# Some often used graphics parameters:
# - pch: plotting symbol (default - open circle)
# - lty: line type (default - solid line)
# - lwd: line width 
# - las: orientation of the axis labels on the plot
# - bg: background color
# - fg: foreground color
# - mar: margin size
# - oma: outer margin size

levels(salaries$rank)
col3 <- c("steelblue4","firebrick","forestgreen")
cols <- col3 [ as.numeric(salaries$rank) ]
plot(x = salaries$yrs.service,
     y = salaries$salary, 
     pch = 19, 
     main = "Analysis of dependency of 2 variables",
     xlab = "Years of Service",
     ylab = "Salary", 
     col = cols,
     cex=1.5)

lm.fit <- lm(salary ~ yrs.service, data = salaries)
abline(lm.fit, lty = 4, lwd = 2, col="darkgray")
legend("topleft",levels(salaries$rank),fill=col3 )


eq <- substitute(italic(salary) == a + b %.% italic(yrs.service)*","~~italic(r)^2~"="~r2, 
                 list(a = format(coef(lm.fit)[1], digits = 2), 
                      b = format(coef(lm.fit)[2], digits = 2), 
                      r2 = format(summary(lm.fit)$r.squared, digits = 3)))
#R-squared: what proportion of the variation in the outcome can be explained by the covariates/predictors
# if R-squared is low, than the outcome is poorly predicted by the covariates/


text(15,170000,labels=as.expression(eq), pos=4, cex=.8 )


# -------- Histogram --------


#plot simple histogram
hist(salaries$salary)


hist(salaries$salary, 
     main="Histogram of Salaries of professors in Disciplines A and B",
     col="steelblue3",
     xlab="Salary, dollars")

# Same plot with probability along the y axes
hist(salaries$salary, 
     main="Histogram of Salaries of professors in Disciplines A and B",
     col="steelblue3",
     xlab="Salary, dollars", 
     prob=TRUE)


#Add rugplot
rug(salaries$salary)

#Add density line
lines(density(salaries$salary), col="darkblue", lwd=2)


# ----- Pairs -----


# Delays per airport
pairs( salaries[ ,c("yrs.since.phd", "yrs.service","salary") ] ) 

#An alternative way using formula
pairs( ~ yrs.since.phd + yrs.service + salary, 
       data = salaries) 


panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)
}

pairs( ~ yrs.since.phd + yrs.service + salary, 
       data = salaries,
       lower.panel = panel.smooth, 
       upper.panel = panel.cor) 


# ----- Multiple plots in one graph -----
par.save <- par()  # save current graphics parameters
print( par() )     # print current graphics parameters

par( mfrow = c(1,2) )  # 1 row and 2 columns

    # first graph
    pie( ranks, labels = labels, col = mycol , init.angle = 90, main = "Teaching Faculty")

    # second graph
    b<-barplot(table(salaries$rank),
           col=c("steelblue4","firebrick","forestgreen"),
           main = "Professor Ranks",
           names.arg = c("Associate Professor", "Assistant Professor", "Professor"),
           ylim = c(0, max(table(salaries$rank)) * 1.25), 
           axes=FALSE)
    text(b,table(salaries$rank),table(salaries$rank), pos=3)
    mtext("Disciplines A and B")


par ( mfrow = c(1,1) )


# A few useful links with Many examples:
# http://shinyapps.org/apps/RGraphCompendium/index.php
# http://manuals.bioinformatics.ucr.edu/home/R_BioCondManual   #TOC-Some-Great-R-Functions
# http://www.cookbook-r.com/Graphs/


#----------------------------------------------------
#
#  ggplot2 tutorial with many useful examples
# http://r-statistics.co/ggplot2-Tutorial-With-R.html

#
#----------------------------------------------------

# Very basic ggplot2 introduction:
library(ggplot2)


#basic plot with points
ggplot(salaries, aes( yrs.service, salary)) + geom_point() 

ggplot(salaries, aes( yrs.service, salary)) + 
  geom_point() +    # type of geom objects
  theme_bw() +      # theme
  ggtitle(" Salary analysis")   # plot title

# color the points according to the faculty type
ggplot(salaries, aes( yrs.service, salary)) + 
  geom_point( aes(color = factor(rank)  ), size=5) +    # type of geom objects
  ggtitle(" Salary analysis") +  # plot title
  theme_bw( ) +      # theme
  theme ( plot.title=element_text(size=14,face="bold" ) )


#Add a regression line to the plot
ggplot(salaries, aes( yrs.service, salary)) + 
  geom_point( aes(color = factor(rank)  ), size=5) +    # type of geom objects
  ggtitle(" Salary analysis") +  # plot title
  geom_smooth() + 
  theme_bw( ) +      # theme
  theme ( plot.title=element_text(size=14,face="bold" ) )

# Change the line to a simple lm line
ggplot(salaries, aes( yrs.service, salary)) + 
  geom_point( aes(color = factor(rank)  ), size=5) +    # type of geom objects
  ggtitle(" Salary analysis") +  # plot title
  geom_smooth( method='lm' , col = "black") + 
  theme_bw( ) +      # theme
  theme ( plot.title=element_text(size=14,face="bold" ) )


ggplot(ecoli) +
  geom_point(aes(x = sample, y= genome_size, color = generation, shape = cit), size = rel(3.0)) +
  theme(axis.text.x = element_text(angle=45, hjust=1))

#boxplot
ggplot(salaries, aes( x=discipline, y=salary ) ) + geom_boxplot() 

#adding addional features to the boxplot
ggplot(salaries, aes( x=discipline, y=salary ) ) + 
  geom_boxplot() +
  geom_jitter(width = 0.2)

ggplot(salaries, aes( x=rank, y=salary) ) + 
  geom_boxplot( varwidth = TRUE ) + #width is proportional to the square roots of the number of observations
  geom_jitter(width = 0.2) + 
  ggtitle(" Salary analysis") + 
  coord_flip() + 
  theme ( plot.title=element_text(size=14,face="bold" ) )

ggplot(ecoli) +
  geom_boxplot(aes(x = cit, y = genome_size, fill = cit)) +
  ggtitle('Boxplot of genome size by citrate mutant type') +
  xlab('citrate mutant') +
  ylab('genome size') +
  theme(panel.grid.major = element_line(size = .5, color = "grey"),
        axis.text.x = element_text(angle=45, hjust=1),
        axis.title = element_text(size = rel(1.5)),
        axis.text = element_text(size = rel(1.25)))

#histogram
ggplot(salaries, aes(x=salary)) + geom_histogram()
ggplot(salaries, aes(x=salary)) + geom_histogram(binwidth=10000)

#improving histogram
ggplot(salaries, aes(x=salary)) + 
  geom_histogram(binwidth=10000, 
                 color="black", fill="white")

ggplot(salaries, aes(x=salary)) + 
  geom_histogram(binwidth=10000, 
                 color="black", fill="white", aes(y=..density..)) + 
  geom_density(alpha=.2, fill="darkorange")


#-------------------------------------------------------------
#To build Bioconductor package:
#source("http://bioconductor.org/biocLite.R")
#biocLite("hgu95av2.db")

library(annotate)
library(geneplotter)
library(hgu95av2.db)
newChrom <- buildChromLocation("hgu95av2")

#Explore the structure of newChrom object
str(newChrom)

#Print the content
print(newChrom)

#geneplotter package is used to plot all or selected chromosomes 
cPlot(newChrom, c("1","5","10","X","Y"),scale="max", bg="darkgray")
cPlot(newChrom, c("1","5","10","X","Y"),scale="relative", bg="darkgray")


#ggbio is a Bioconductor package built on ggplot2
library(ggbio)
library(Homo.sapiens)

#Make gene model from OrganismDb object
class(Homo.sapiens)
##
data(genesymbol, package = "biovizBase")
wh <- genesymbol[c("BRCA1", "NBR1")]
wh <- range(wh, ignore.strand = TRUE)
p.txdb <- autoplot(Homo.sapiens, which = wh)
p.txdb
autoplot(Homo.sapiens, which = wh, label.color = "black", color = "brown",
         fill = "brown")


#ggbio supports visualization of alignments file stored in bam files:
fl.bam <- system.file("extdata", "wg-brca1.sorted.bam", package = "biovizBase")
wh <- keepSeqlevels(wh, "chr17")
autoplot(fl.bam, which = wh)