# ---------------------------------------# # # # Introduction to R # Graphics # # ---------------------------------------# #To install a package: #install.packages("ggplot2") #install.packages("RColorBrewer") #To load R package library(ggplot2) library(RColorBrewer) #Read Salary dataset salaries <- read.csv( "http://scv.bu.edu/classes/BI594/Salaries.csv") ecoli <- read.csv( "http://scv.bu.edu/classes/BI594/Ecoli_metadata.csv") #Explore the dataset head(salaries) str(salaries) summary(salaries) #Excersize: explore ecoli dataset # Factors #dplyr - R package that provide easy tools for data manipulation #install.packages(dplyr) library(dplyr) #To chose specific columns from a dataframe select(ecoli, sample, clade, cit, genome_size) #To chose rows: filter(ecoli, cit == "plus") #Pipes ecoli %>% filter(cit == "plus") %>% select(sample, generation, clade) #To save the resulting object" ecoli_citplus <- ecoli %>% filter(cit == "plus") %>% select(sample, generation, clade) # Split - apply - combine ( n() is a function within dplyr package to find count for each group ) ecoli %>% group_by(cit) %>% summarize( n() ) #To calculate mean for each subgroup ecoli %>% group_by(cit, clade) %>% summarize( mean_size = mean(genome_size, na.rm = TRUE) ) # There are 3 main environments in R to create graphics: # - R base utilities # - lattice package # - ggplot2 package # There is also powerful "grid" package and a couple in-development interactive packages # ------ Base R Graphics ------- # Generic(high level) plot functions: # -plot: generic x-y plots, scatterplots # -matplot # -barplot # -boxplot # -hist: histogram # -pie: pie charts # -dotchart: cleveland dot plots # -image, heatmap, contour, persp: image-like plots # -qqnorm, qqline, qqplot: distribution analysis plots # -pairs, coplot: display of multivariant data # -dotchart # -mosaicplot # -stripchart # -contour # -hclust: hierarchial clustering # Low level - functions that add to the existing plot created with a high-level plot functions: # -points # -lines # -abline # -segments # -text # -mtext # -title # -axis # -grid # -legend # -rug # -polygon # -box #par() function contains the list of various graphics parameters that can be modified globaly or for each graph # ------ Example of using graphics parameters to change the original plot into informative one: # create points for sqrt(x) and log(x) functions x <- 1:10 y1 <- log(x) y2 <- sqrt(x) # First attempt to draw the points: plot(x,y1) lines(x,y2) # Using R graphics parameters we can convert this plot into high quality one: plot(x, y1, # specify the coordinates type = "b", # type of the plot ylim = c(min(c(y1,y2)), max(c(y1,y2))), # set the axis limits main = "Logarithm and Square root functions", # title ylab = "y", # # y label col = "darkgreen", # line color lwd = 2, # line tickness pch=15, # symbol to display the point cex.lab=1.5, # size of the font used for labels font.lab = 3, # font type cex.main = 2) # size of the font used for the title #Additional line lines(x,y2, # coordinates col = "steelblue", # color lwd =2, # line width type="b", # line type (b - both, lines and points) pch=19, # symbol used to display points cex.lab=1.5) # font size for labels #Add grid to the plot abline(h = seq(0,3,by=0.5), v = c(0:10), col='lightgray',lty="dotted") #Add legend legend("topleft", # position col=c("darkgreen","steelblue"), # colors lty=1, # draw line pch=c(15,19), # draw points cex=1.5, # text size legend=c("logarithm","square root")) # text # Add text to the graph text(x = 6, y = sqrt(6), pos = 2, labels=expression(paste("y = ",sqrt(x))), cex=2, offset=0.75) text(x = 6, y = log(6), pos = 2, labels=expression(paste("y = ",ln(x))), cex=2, offset=0.75) # For our ecoli dataset: plot(ecoli$genome_size, pch=8, main="Scatter plot of genome sizes") # -------- Bar plot -------- barplot(ranks) b<-barplot(ranks, col = mycol, main = "Teaching Faculty", sub = "Source: University Administration", names.arg = names(ranks), ylim = c(0, max(ranks) * 1.1), axes = FALSE) text(b,ranks, labels = ranks, pos = 3) # -------- Colors in R --------- # There are many ways to specify a color in R: # - specify an index within a colorscheme # - by name # - using rgb, hsv, hex values # 1. Using index within colorscheme (palettes) barplot(c(1:10), col=c(1:10)) #list colors in the current palette: palette() pie(rep(1,length(palette())), col=palette()) #some available palettes: # rainbow(), heat.colors(), terrain.colors(), topo.colors(), cm.colors(), gray.colors() num.col=15 barplot(c(1:num.col), col=rainbow(num.col)) barplot(c(1:num.col), col=heat.colors(num.col)) barplot(c(1:num.col), col=terrain.colors(num.col)) barplot(c(1:num.col), col=topo.colors(num.col)) barplot(c(1:num.col), col=cm.colors(num.col)) barplot(c(1:num.col), col=gray.colors(num.col)) #A popular palettes could be found in package RColorBrewer #install.packages("RColorBrewer") library(RColorBrewer) display.brewer.all() barplot(c(1:8), col=brewer.pal(8,"Dark2") ) barplot(c(1:9), col=brewer.pal(9,"Set1") ) # 2. Specify colors by name colors() # list all colornames barplot(c(21,19,16,14,12,10,8,5,4,3,2,1), col=c("steelblue4", "darkred", "dodgerblue", "forestgreen", "gold", "steelblue", "firebrick", "darkorange", "darkgrey", "tomato", "olivedrab", "dimgray")) # A lits of some of my favorite colors: #steelblue, steelblue4 #firebrick #tomato #skyblue #olivedrab #forestgreen #gold #dodgerblue #dimgray #darkred #darkorange #darkgray #springgreen #violetred #slateblue #sienna #seagreen #sandybrown #salmon #saddlebrown #maroon #limegreen barplot(c(21,19,17,14,12,8,5,4,3,2,1), col=c("springgreen", "skyblue", "violetred", "slateblue", "sienna", "seagreen", "sandybrown", "salmon", "saddlebrown", "maroon", "limegreen")) # 3. Specify rgb values col1 <- rgb(0.1, 0.5, 0.1) col2 <- rgb(0.9, 0.7, 0.0) barplot(c(7,11), col=c(col1, col2)) #Conversion one color format to another # col2rgb(), rgb2hsv() # Additional help related to colors help(package=colorspace) # -------- Boxplot-------- boxplot(salaries$salary) #let's display 2 boxplots: one for male and one for female names boxplot( formula = salary ~ discipline, data = salaries, names = c("A","B"), col = c("gold","forestgreen"), horizontal = TRUE, main = "Salary range for Disciplines A and B") legend("topright", legend=c("A","B"), pch=15, col = c("gold","forestgreen")) # Side by side boxplot for ecoli dataset boxplot(genome_size ~ cit, ecoli) #simple #Add colors, title, labels boxplot(genome_size ~ cit, ecoli, col=c("pink","purple", "darkgrey"), main="Average expression differences between celltypes", ylab="Expression") # ------- Plot ------- plot(salaries$salary~salaries$yrs.service) # or plot(salaries$yrs.service, salaries$salary) #Function plot could be used to plot points or lines #Explore various symbols ?points #Draw a table with all symbols plot(c(-1, 26), 0:1, type = "n", axes = FALSE, xlab="", ylab="") text(0:25, 0.6, 0:25, cex=0.75) points(0:25, rep(0.5, 26), pch = 0:25, bg = "grey") plot(x = salaries$yrs.service, y = salaries$salary, pch = 19) # Some often used graphics parameters: # - pch: plotting symbol (default - open circle) # - lty: line type (default - solid line) # - lwd: line width # - las: orientation of the axis labels on the plot # - bg: background color # - fg: foreground color # - mar: margin size # - oma: outer margin size levels(salaries$rank) col3 <- c("steelblue4","firebrick","forestgreen") cols <- col3 [ as.numeric(salaries$rank) ] plot(x = salaries$yrs.service, y = salaries$salary, pch = 19, main = "Analysis of dependency of 2 variables", xlab = "Years of Service", ylab = "Salary", col = cols, cex=1.5) lm.fit <- lm(salary ~ yrs.service, data = salaries) abline(lm.fit, lty = 4, lwd = 2, col="darkgray") legend("topleft",levels(salaries$rank),fill=col3 ) eq <- substitute(italic(salary) == a + b %.% italic(yrs.service)*","~~italic(r)^2~"="~r2, list(a = format(coef(lm.fit)[1], digits = 2), b = format(coef(lm.fit)[2], digits = 2), r2 = format(summary(lm.fit)$r.squared, digits = 3))) #R-squared: what proportion of the variation in the outcome can be explained by the covariates/predictors # if R-squared is low, than the outcome is poorly predicted by the covariates/ text(15,170000,labels=as.expression(eq), pos=4, cex=.8 ) # -------- Histogram -------- #plot simple histogram hist(salaries$salary) hist(salaries$salary, main="Histogram of Salaries of professors in Disciplines A and B", col="steelblue3", xlab="Salary, dollars") # Same plot with probability along the y axes hist(salaries$salary, main="Histogram of Salaries of professors in Disciplines A and B", col="steelblue3", xlab="Salary, dollars", prob=TRUE) #Add rugplot rug(salaries$salary) #Add density line lines(density(salaries$salary), col="darkblue", lwd=2) # ----- Pairs ----- # Delays per airport pairs( salaries[ ,c("yrs.since.phd", "yrs.service","salary") ] ) #An alternative way using formula pairs( ~ yrs.since.phd + yrs.service + salary, data = salaries) panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y)) txt <- format(c(r, 0.123456789), digits = digits)[1] txt <- paste0(prefix, txt) if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex.cor * r) } pairs( ~ yrs.since.phd + yrs.service + salary, data = salaries, lower.panel = panel.smooth, upper.panel = panel.cor) # ----- Multiple plots in one graph ----- par.save <- par() # save current graphics parameters print( par() ) # print current graphics parameters par( mfrow = c(1,2) ) # 1 row and 2 columns # first graph pie( ranks, labels = labels, col = mycol , init.angle = 90, main = "Teaching Faculty") # second graph b<-barplot(table(salaries$rank), col=c("steelblue4","firebrick","forestgreen"), main = "Professor Ranks", names.arg = c("Associate Professor", "Assistant Professor", "Professor"), ylim = c(0, max(table(salaries$rank)) * 1.25), axes=FALSE) text(b,table(salaries$rank),table(salaries$rank), pos=3) mtext("Disciplines A and B") par ( mfrow = c(1,1) ) # A few useful links with Many examples: # http://shinyapps.org/apps/RGraphCompendium/index.php # http://manuals.bioinformatics.ucr.edu/home/R_BioCondManual #TOC-Some-Great-R-Functions # http://www.cookbook-r.com/Graphs/ #---------------------------------------------------- # # ggplot2 tutorial with many useful examples # http://r-statistics.co/ggplot2-Tutorial-With-R.html # #---------------------------------------------------- # Very basic ggplot2 introduction: library(ggplot2) #basic plot with points ggplot(salaries, aes( yrs.service, salary)) + geom_point() ggplot(salaries, aes( yrs.service, salary)) + geom_point() + # type of geom objects theme_bw() + # theme ggtitle(" Salary analysis") # plot title # color the points according to the faculty type ggplot(salaries, aes( yrs.service, salary)) + geom_point( aes(color = factor(rank) ), size=5) + # type of geom objects ggtitle(" Salary analysis") + # plot title theme_bw( ) + # theme theme ( plot.title=element_text(size=14,face="bold" ) ) #Add a regression line to the plot ggplot(salaries, aes( yrs.service, salary)) + geom_point( aes(color = factor(rank) ), size=5) + # type of geom objects ggtitle(" Salary analysis") + # plot title geom_smooth() + theme_bw( ) + # theme theme ( plot.title=element_text(size=14,face="bold" ) ) # Change the line to a simple lm line ggplot(salaries, aes( yrs.service, salary)) + geom_point( aes(color = factor(rank) ), size=5) + # type of geom objects ggtitle(" Salary analysis") + # plot title geom_smooth( method='lm' , col = "black") + theme_bw( ) + # theme theme ( plot.title=element_text(size=14,face="bold" ) ) ggplot(ecoli) + geom_point(aes(x = sample, y= genome_size, color = generation, shape = cit), size = rel(3.0)) + theme(axis.text.x = element_text(angle=45, hjust=1)) #boxplot ggplot(salaries, aes( x=discipline, y=salary ) ) + geom_boxplot() #adding addional features to the boxplot ggplot(salaries, aes( x=discipline, y=salary ) ) + geom_boxplot() + geom_jitter(width = 0.2) ggplot(salaries, aes( x=rank, y=salary) ) + geom_boxplot( varwidth = TRUE ) + #width is proportional to the square roots of the number of observations geom_jitter(width = 0.2) + ggtitle(" Salary analysis") + coord_flip() + theme ( plot.title=element_text(size=14,face="bold" ) ) ggplot(ecoli) + geom_boxplot(aes(x = cit, y = genome_size, fill = cit)) + ggtitle('Boxplot of genome size by citrate mutant type') + xlab('citrate mutant') + ylab('genome size') + theme(panel.grid.major = element_line(size = .5, color = "grey"), axis.text.x = element_text(angle=45, hjust=1), axis.title = element_text(size = rel(1.5)), axis.text = element_text(size = rel(1.25))) #histogram ggplot(salaries, aes(x=salary)) + geom_histogram() ggplot(salaries, aes(x=salary)) + geom_histogram(binwidth=10000) #improving histogram ggplot(salaries, aes(x=salary)) + geom_histogram(binwidth=10000, color="black", fill="white") ggplot(salaries, aes(x=salary)) + geom_histogram(binwidth=10000, color="black", fill="white", aes(y=..density..)) + geom_density(alpha=.2, fill="darkorange") #------------------------------------------------------------- #To build Bioconductor package: #source("http://bioconductor.org/biocLite.R") #biocLite("hgu95av2.db") library(annotate) library(geneplotter) library(hgu95av2.db) newChrom <- buildChromLocation("hgu95av2") #Explore the structure of newChrom object str(newChrom) #Print the content print(newChrom) #geneplotter package is used to plot all or selected chromosomes cPlot(newChrom, c("1","5","10","X","Y"),scale="max", bg="darkgray") cPlot(newChrom, c("1","5","10","X","Y"),scale="relative", bg="darkgray") #ggbio is a Bioconductor package built on ggplot2 library(ggbio) library(Homo.sapiens) #Make gene model from OrganismDb object class(Homo.sapiens) ## data(genesymbol, package = "biovizBase") wh <- genesymbol[c("BRCA1", "NBR1")] wh <- range(wh, ignore.strand = TRUE) p.txdb <- autoplot(Homo.sapiens, which = wh) p.txdb autoplot(Homo.sapiens, which = wh, label.color = "black", color = "brown", fill = "brown") #ggbio supports visualization of alignments file stored in bam files: fl.bam <- system.file("extdata", "wg-brca1.sorted.bam", package = "biovizBase") wh <- keepSeqlevels(wh, "chr17") autoplot(fl.bam, which = wh)