A few useful links with many examples:

Installing Packages

The following packages from R-CRAN will be used in this session. Use install.packages() function to install them:

install.packages(c("ggplot2", "RColorBrewer", "dplyr"))

If the package comes from Bioconductor, then to install it:

#To build Bioconductor package:
#source("http://bioconductor.org/biocLite.R")
#biocLite("annotate")
#biocLite("geneplotter")
#biocLite("hgu95av2.db")
#biocLite("ggbio")
#biocLite("Homo.sapiens")

To load packages in R use library() function. It’s a common practice to have all library() statements at the beginning of R script:

library(ggplot2)
library(RColorBrewer)
library(annotate)

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: IRanges

## Loading required package: S4Vectors

## Loading required package: XML

#library(geneplotter)
#library(hgu95av2.db)
#library(Homo.sapiens)
#library(ggbio)

Reading input datasets;

salaries <- read.csv( "http://scv.bu.edu/classes/BI594/Salaries.csv")

Functions like head(), str(), summary() are a great way to explore the dataset, learn about the size of the data, column names, type of each column and get a quick summary for each variable there.

head( salaries )

##   rank discipline yrs.since.phd yrs.service  sex salary
## 1 Prof          B            56          49 Male 186960
## 2 Prof          A            12           6 Male  93000
## 3 Prof          A            23          20 Male 110515
## 4 Prof          A            40          31 Male 131205
## 5 Prof          B            20          18 Male 104800
## 6 Prof          A            20          20 Male 122400

str( salaries )

## 'data.frame':    78 obs. of  6 variables:
##  $ rank         : Factor w/ 3 levels "AssocProf","AsstProf",..: 3 3 3 3 3 3 1 3 3 3 ...
##  $ discipline   : Factor w/ 2 levels "A","B": 2 1 1 1 2 1 1 1 1 1 ...
##  $ yrs.since.phd: int  56 12 23 40 20 20 20 18 29 51 ...
##  $ yrs.service  : int  49 6 20 31 18 20 17 18 19 51 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ salary       : int  186960 93000 110515 131205 104800 122400 81285 126300 94350 57800 ...

summary( salaries )

##         rank    discipline yrs.since.phd    yrs.service        sex    
##  AssocProf:13   A:36       Min.   : 1.00   Min.   : 0.00   Female:39  
##  AsstProf :19   B:42       1st Qu.:10.25   1st Qu.: 5.25   Male  :39  
##  Prof     :46              Median :18.50   Median :14.50              
##                            Mean   :19.71   Mean   :15.05              
##                            3rd Qu.:27.75   3rd Qu.:20.75              
##                            Max.   :56.00   Max.   :51.00              
##      salary      
##  Min.   : 57800  
##  1st Qu.: 88612  
##  Median :104671  
##  Mean   :108024  
##  3rd Qu.:126775  
##  Max.   :186960

Exercise Read and explore Ecoli dataset:

##     sample generation   clade strain     cit       run genome_size
## 1   REL606          0    <NA> REL606 unknown                  4.62
## 2 REL1166A       2000 unknown REL606 unknown SRR098028        4.63
## 3   ZDB409       5000 unknown REL606 unknown SRR098281        4.60
## 4   ZDB429      10000      UC REL606 unknown SRR098282        4.59
## 5   ZDB446      15000      UC REL606 unknown SRR098283        4.66
## 6   ZDB458      20000 (C1,C2) REL606 unknown SRR098284        4.63

## 'data.frame':    30 obs. of  7 variables:
##  $ sample     : Factor w/ 30 levels "CZB152","CZB154",..: 7 6 18 19 20 21 22 23 24 25 ...
##  $ generation : int  0 2000 5000 10000 15000 20000 20000 20000 25000 25000 ...
##  $ clade      : Factor w/ 7 levels "C1","(C1,C2)",..: NA 7 7 6 6 2 2 2 1 4 ...
##  $ strain     : Factor w/ 1 level "REL606": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cit        : Factor w/ 3 levels "minus","plus",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ run        : Factor w/ 30 levels "","SRR097977",..: 1 5 22 23 24 25 26 27 28 29 ...
##  $ genome_size: num  4.62 4.63 4.6 4.59 4.66 4.63 4.62 4.61 4.65 4.59 ...

##       sample     generation        clade      strain        cit    
##  CZB152  : 1   Min.   :    0   Cit+   :9   REL606:30   minus  : 9  
##  CZB154  : 1   1st Qu.:21250   C2     :6               plus   : 9  
##  CZB199  : 1   Median :31750   C1     :5               unknown:12  
##  REL10979: 1   Mean   :27350   (C1,C2):3                           
##  REL10988: 1   3rd Qu.:33750   C3     :2                           
##  REL1166A: 1   Max.   :40000   (Other):4                           
##  (Other) :24                   NA's   :1                           
##         run      genome_size   
##           : 1   Min.   :4.590  
##  SRR097977: 1   1st Qu.:4.610  
##  SRR098026: 1   Median :4.625  
##  SRR098027: 1   Mean   :4.663  
##  SRR098028: 1   3rd Qu.:4.740  
##  SRR098029: 1   Max.   :4.800  
##  (Other)  :24

Sometimes categorical variables are coded as string (“male”, “female”), numeric (0, 1) or logical (TRUE, FALSE). When R reads the data by default it will treat all string variables as categorical variables. Note: To disable this behaviour use stringAsFactors=FALSE argument in read.csv() and other similar functions.

Use factor() function to convert any type of a vector to a factor:

gender <- c(0,1,1,1,1,0,0,0)    # define numeric variable
summary(gender )                # summary will calculate basic statistics for a numeric variable

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.5     0.5     1.0     1.0

gender  <- factor(gender , labels=c("male", "female"))  # convert vector to a categorical (factor) variable
summary(gender )                                        # summary produces appropriate output

##   male female 
##      4      4

In case of ecoli dataset some columns are not categorical variables but were read as such. We need to convert them back to “character” type:

str(ecoli)

## 'data.frame':    30 obs. of  7 variables:
##  $ sample     : Factor w/ 30 levels "CZB152","CZB154",..: 7 6 18 19 20 21 22 23 24 25 ...
##  $ generation : int  0 2000 5000 10000 15000 20000 20000 20000 25000 25000 ...
##  $ clade      : Factor w/ 7 levels "C1","(C1,C2)",..: NA 7 7 6 6 2 2 2 1 4 ...
##  $ strain     : Factor w/ 1 level "REL606": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cit        : Factor w/ 3 levels "minus","plus",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ run        : Factor w/ 30 levels "","SRR097977",..: 1 5 22 23 24 25 26 27 28 29 ...
##  $ genome_size: num  4.62 4.63 4.6 4.59 4.66 4.63 4.62 4.61 4.65 4.59 ...

ecoli$sample <- as.character(ecoli$sample)
ecoli$run <- as.character(ecoli$run)
str(ecoli)

## 'data.frame':    30 obs. of  7 variables:
##  $ sample     : chr  "REL606" "REL1166A" "ZDB409" "ZDB429" ...
##  $ generation : int  0 2000 5000 10000 15000 20000 20000 20000 25000 25000 ...
##  $ clade      : Factor w/ 7 levels "C1","(C1,C2)",..: NA 7 7 6 6 2 2 2 1 4 ...
##  $ strain     : Factor w/ 1 level "REL606": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cit        : Factor w/ 3 levels "minus","plus",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ run        : chr  "" "SRR098028" "SRR098281" "SRR098282" ...
##  $ genome_size: num  4.62 4.63 4.6 4.59 4.66 4.63 4.62 4.61 4.65 4.59 ...

#Notice a different kind of summary output for character, factor and numeric columns
summary(ecoli)

##     sample            generation        clade      strain        cit    
##  Length:30          Min.   :    0   Cit+   :9   REL606:30   minus  : 9  
##  Class :character   1st Qu.:21250   C2     :6               plus   : 9  
##  Mode  :character   Median :31750   C1     :5               unknown:12  
##                     Mean   :27350   (C1,C2):3                           
##                     3rd Qu.:33750   C3     :2                           
##                     Max.   :40000   (Other):4                           
##                                     NA's   :1                           
##      run             genome_size   
##  Length:30          Min.   :4.590  
##  Class :character   1st Qu.:4.610  
##  Mode  :character   Median :4.625  
##                     Mean   :4.663  
##                     3rd Qu.:4.740  
##                     Max.   :4.800  
##

dplyr package

dplyr - R package that provide easy tools for data manipulation

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:AnnotationDbi':
## 
##     select

## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union

## The following objects are masked from 'package:S4Vectors':
## 
##     intersect, rename, setdiff, union

## The following object is masked from 'package:Biobase':
## 
##     combine

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Chose specific columns from a dataframe
select(ecoli, sample, clade, cit, genome_size)

##      sample   clade     cit genome_size
## 1    REL606    <NA> unknown        4.62
## 2  REL1166A unknown unknown        4.63
## 3    ZDB409 unknown unknown        4.60
## 4    ZDB429      UC unknown        4.59
## 5    ZDB446      UC unknown        4.66
## 6    ZDB458 (C1,C2) unknown        4.63
## 7   ZDB464* (C1,C2) unknown        4.62
## 8    ZDB467 (C1,C2) unknown        4.61
## 9    ZDB477      C1 unknown        4.65
## 10   ZDB483      C3 unknown        4.59
## 11    ZDB16      C1 unknown        4.61
## 12   ZDB357      C2 unknown        4.62
## 13  ZDB199*      C1   minus        4.62
## 14   ZDB200      C2   minus        4.63
## 15   ZDB564    Cit+    plus        4.74
## 16   ZDB30*      C3   minus        4.61
## 17   ZDB172    Cit+    plus        4.77
## 18   ZDB158      C2   minus        4.63
## 19   ZDB143    Cit+    plus        4.79
## 20   CZB199      C1   minus        4.59
## 21   CZB152    Cit+    plus        4.80
## 22   CZB154    Cit+    plus        4.76
## 23    ZDB83    Cit+   minus        4.60
## 24    ZDB87      C2    plus        4.75
## 25    ZDB96    Cit+    plus        4.74
## 26    ZDB99      C1   minus        4.61
## 27   ZDB107    Cit+    plus        4.79
## 28   ZDB111      C2   minus        4.62
## 29 REL10979    Cit+    plus        4.78
## 30 REL10988      C2   minus        4.62

# Use "pipeline synax" to perform several operations at once:
ecoli %>% select( sample, clade, cit, genome_size)

##      sample   clade     cit genome_size
## 1    REL606    <NA> unknown        4.62
## 2  REL1166A unknown unknown        4.63
## 3    ZDB409 unknown unknown        4.60
## 4    ZDB429      UC unknown        4.59
## 5    ZDB446      UC unknown        4.66
## 6    ZDB458 (C1,C2) unknown        4.63
## 7   ZDB464* (C1,C2) unknown        4.62
## 8    ZDB467 (C1,C2) unknown        4.61
## 9    ZDB477      C1 unknown        4.65
## 10   ZDB483      C3 unknown        4.59
## 11    ZDB16      C1 unknown        4.61
## 12   ZDB357      C2 unknown        4.62
## 13  ZDB199*      C1   minus        4.62
## 14   ZDB200      C2   minus        4.63
## 15   ZDB564    Cit+    plus        4.74
## 16   ZDB30*      C3   minus        4.61
## 17   ZDB172    Cit+    plus        4.77
## 18   ZDB158      C2   minus        4.63
## 19   ZDB143    Cit+    plus        4.79
## 20   CZB199      C1   minus        4.59
## 21   CZB152    Cit+    plus        4.80
## 22   CZB154    Cit+    plus        4.76
## 23    ZDB83    Cit+   minus        4.60
## 24    ZDB87      C2    plus        4.75
## 25    ZDB96    Cit+    plus        4.74
## 26    ZDB99      C1   minus        4.61
## 27   ZDB107    Cit+    plus        4.79
## 28   ZDB111      C2   minus        4.62
## 29 REL10979    Cit+    plus        4.78
## 30 REL10988      C2   minus        4.62

# Chose rows (filter the dataframe based on specific condition):
filter(ecoli, cit == "plus")

##     sample generation clade strain  cit       run genome_size
## 1   ZDB564      31500  Cit+ REL606 plus SRR098289        4.74
## 2   ZDB172      32000  Cit+ REL606 plus SRR098042        4.77
## 3   ZDB143      32500  Cit+ REL606 plus SRR098040        4.79
## 4   CZB152      33000  Cit+ REL606 plus SRR097977        4.80
## 5   CZB154      33000  Cit+ REL606 plus SRR098026        4.76
## 6    ZDB87      34000    C2 REL606 plus SRR098035        4.75
## 7    ZDB96      36000  Cit+ REL606 plus SRR098036        4.74
## 8   ZDB107      38000  Cit+ REL606 plus SRR098038        4.79
## 9 REL10979      40000  Cit+ REL606 plus SRR098029        4.78

#Pipes (filter and then select columns)
ecoli %>%
  filter(cit == "plus") %>%
  select(sample, generation, clade)

##     sample generation clade
## 1   ZDB564      31500  Cit+
## 2   ZDB172      32000  Cit+
## 3   ZDB143      32500  Cit+
## 4   CZB152      33000  Cit+
## 5   CZB154      33000  Cit+
## 6    ZDB87      34000    C2
## 7    ZDB96      36000  Cit+
## 8   ZDB107      38000  Cit+
## 9 REL10979      40000  Cit+

#To save the resulting object into a dataframe""
ecoli_citplus <- ecoli %>%
  filter(cit == "plus") %>%
  select(sample, generation, clade)

# View the result:
str(ecoli_citplus )

## 'data.frame':    9 obs. of  3 variables:
##  $ sample    : chr  "ZDB564" "ZDB172" "ZDB143" "CZB152" ...
##  $ generation: int  31500 32000 32500 33000 33000 34000 36000 38000 40000
##  $ clade     : Factor w/ 7 levels "C1","(C1,C2)",..: 5 5 5 5 5 3 5 5 5

# Split - apply - combine ( n() is a function within dplyr package to find count for each group )
ecoli %>%
  group_by(cit) %>%
  summarize( n() )

## # A tibble: 3 × 2
##       cit `n()`
##    <fctr> <int>
## 1   minus     9
## 2    plus     9
## 3 unknown    12

#To calculate mean for each subgroup
ecoli %>%
  group_by(cit, clade) %>%
  summarize( mean_size = mean(genome_size, na.rm = TRUE) )

## Source: local data frame [13 x 3]
## Groups: cit [?]
## 
##        cit   clade mean_size
##     <fctr>  <fctr>     <dbl>
## 1    minus      C1  4.606667
## 2    minus      C2  4.625000
## 3    minus      C3  4.610000
## 4    minus    Cit+  4.600000
## 5     plus      C2  4.750000
## 6     plus    Cit+  4.771250
## 7  unknown      C1  4.630000
## 8  unknown (C1,C2)  4.620000
## 9  unknown      C2  4.620000
## 10 unknown      C3  4.590000
## 11 unknown      UC  4.625000
## 12 unknown unknown  4.615000
## 13 unknown      NA  4.620000

R graphics

There are 3 main environments in R to create graphics:

R base utilities
lattice package
ggplot2 package

There is also powerful “grid” package and a couple in-development interactive packages

Base R Graphics

Generic(high level) plot functions:

-plot: generic x-y plots, scatterplots
-matplot
-barplot
-boxplot
-hist: histogram
-pie: pie charts
-dotchart: cleveland dot plots
-image, heatmap, contour, persp: image-like plots
-qqnorm, qqline, qqplot: distribution analysis plots
-pairs, coplot: display of multivariant data
-dotchart
-mosaicplot
-stripchart
-contour
-hclust: hierarchial clustering

Low level - functions that add to the existing plot created with a high-level plot functions:

-points
-lines
-abline
-segments
-text
-mtext
-title
-axis
-grid
-legend
-rug
-polygon
-box

# create points for sqrt(x) and log(x) functions
x <- 1:10
y1 <- log(x)
y2 <- sqrt(x)

# First attempt to draw the points:
plot(x,y1)
lines(x,y2)

There are many problems with this plot: Line did not fit into the plot, log() line is displayed using points, while sqrt() is displayed as a line, there is no title or annotations.

R provides many options to every plot function to control various parameters:

# Using R graphics parameters we can convert this plot into high quality one:
plot(x, y1,                                          # specify the coordinates
     type = "b",                                     # type of the plot
     ylim = c(min(c(y1,y2)), max(c(y1,y2))),         # set the axis limits
     main = "Logarithm and Square root functions",   # title
     ylab = "y",     #                               # y label
     col = "darkgreen",                              # line color
     lwd = 2,                                        # line tickness
     pch=15,                                         # symbol to display the point
     cex.lab=1.5,                                    # size of the font used for labels
     font.lab = 3,                                   # font type
     cex.main = 2)                                   # size of the font used for the title

#Additional line
lines(x,y2,                        # coordinates
      col = "steelblue",           # color
      lwd =2,                      # line width
      type="b",                    # line type (b - both, lines and points) 
      pch=19,                      # symbol used to display points
      cex.lab=1.5)                 # font size for labels

#Add grid to the plot
abline(h = seq(0,3,by=0.5), v = c(0:10), col='lightgray',lty="dotted")

#Add legend
legend("topleft",                             # position
       col=c("darkgreen","steelblue"),        # colors
       lty=1,                                 # draw line
       pch=c(15,19),                          # draw points
       cex=1.5,                               # text size
       legend=c("logarithm","square root"))   # text

# Add text to the graph
text(x = 6, y = log(6),  pos = 2, labels=expression(paste("y = ",ln(x))),   cex=2, offset=0.75)
text(x = 6, y = sqrt(6), pos = 2, labels=expression(paste("y = ",sqrt(x))), cex=2, offset=0.75)

For our ecoli dataset

plot(ecoli$genome_size, 
     pch=8, 
     main="Scatter plot of genome sizes", 
     cex=2,
     col="steelblue", 
     ylab="Genome Size", 
     xlab="Sample")

Barplot

# Basic Barplot
ranks <- table(salaries$rank)
barplot(ranks)

# Improved barplot
# Notice how the main plot returns the position of each bar which we will store in the variable b.
b<-barplot(ranks,
           col = "darkgreen",
           main = "Teaching Faculty",
           sub = "Source: University Administration",
           names.arg = names(ranks), 
           ylim = c(0, max(ranks) * 1.1),
           axes = FALSE)

# Values stored in the variable b are used as x coordinates to place the text on top of each bar
text(b,ranks,
     labels = ranks, 
     pos = 3)

Colors in R

There are many ways to specify a color in R:

specify an index within a colorscheme
by name
using rgb, hsv, hex values

#1. Using index within colorscheme (palettes)
barplot(c(1:10), col=c(1:10))

      #list colors in the current palette:
      palette()

## [1] "black"   "red"     "green3"  "blue"    "cyan"    "magenta" "yellow" 
## [8] "gray"

      pie(rep(1,length(palette())),
          col=palette())

Some available palettes:

rainbow()
heat.colors()
terrain.colors()
topo.colors()
cm.colors()
gray.colors()

num.col=15
barplot(c(1:num.col), col=rainbow(num.col))

barplot(c(1:num.col), col=heat.colors(num.col))

barplot(c(1:num.col), col=terrain.colors(num.col))

barplot(c(1:num.col), col=topo.colors(num.col))

barplot(c(1:num.col), col=cm.colors(num.col))

barplot(c(1:num.col), col=gray.colors(num.col))

A popular palettes could be found in package RColorBrewer:

#display.brewer.all()
barplot(c(1:8), col=brewer.pal(8,"Dark2") )

barplot(c(1:9), col=brewer.pal(9,"Set1") )

# 2. Specify colors by name
colors() # list all colornames

barplot(c(21,19,16,14,12,10,8,5,4,3,2,1), col=c("steelblue4",
                                "darkred",
                                "dodgerblue",
                                "forestgreen",
                                "gold",
                                "steelblue",
                                "firebrick",
                                "darkorange",
                                "darkgrey",
                                "tomato",
                                "olivedrab",
                                "dimgray"))

A lits of some of my favorite colors:

steelblue, steelblue4
firebrick
tomato
skyblue
olivedrab
forestgreen
gold
dodgerblue
dimgray
darkred
darkorange
darkgray
springgreen
violetred
slateblue
sienna
seagreen
sandybrown
salmon
saddlebrown
maroon
limegreen

barplot(c(21,19,17,14,12,8,5,4,3,2,1), col=c("springgreen",
                                          "skyblue",
                                 "violetred",
                                 "slateblue",
                                 "sienna",
                                 "seagreen",
                                 "sandybrown",
                                 "salmon",
                                 "saddlebrown",
                                 "maroon",
                                 "limegreen"))

You can find a color table in this pdf document: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf

# 3. Specify rgb values (4th argument is used to control transparency)
col1 <- rgb(0.1, 0.5, 0.1, alpha=0.8)
col2 <- rgb(0.9, 0.7, 0.0)
barplot(c(7,11), col=c(col1, col2))

#Conversion one color format to another
col2rgb(), rgb2hsv()

# Additional help related to colors
help(package=colorspace)

Boxplot

#Simple boxplot
boxplot(salaries$salary)

#let's display 2 boxplots: one for male and one for female names
boxplot( formula = salary ~ discipline,
         data = salaries,
         names = c("A","B"),
         col = c("gold","forestgreen"),
         horizontal = TRUE,
         main = "Salary range for Disciplines A and B")

legend("topright", 
       legend=c("A","B"), 
       pch=15,
       col = c("gold","forestgreen"))

# Side by side boxplot for ecoli dataset
boxplot(genome_size ~ cit, ecoli)   #simple

#Add colors, title, labels
boxplot(genome_size ~ cit, ecoli,  col=c("pink","purple", "darkgrey"),
        main="Average expression differences between celltypes", ylab="Expression")

Explore various symbols View help of points() function:

?points

#Draw a table with all symbols
plot(c(-1, 26), 0:1, type = "n", axes = FALSE, xlab="", ylab="")
text(0:25, 0.6, 0:25, cex=0.75)
points(0:25, rep(0.5, 26), pch = 0:25, bg = "grey")

graphics parameters:

pch: plotting symbol (default - open circle)
lty: line type (default - solid line)
lwd: line width
las: orientation of the axis labels on the plot
bg: background color
fg: foreground color
mar: margin size
oma: outer margin size

levels(salaries$rank)

## [1] "AssocProf" "AsstProf"  "Prof"

col3 <- c("steelblue4","firebrick","forestgreen")
cols <- col3 [ as.numeric(salaries$rank) ]
plot(x = salaries$yrs.service,
     y = salaries$salary, 
     pch = 19, 
     main = "Analysis of dependency of 2 variables",
     xlab = "Years of Service",
     ylab = "Salary", 
     col = cols,
     cex=1.5)

lm.fit <- lm(salary ~ yrs.service, data = salaries)
abline(lm.fit, lty = 4, lwd = 2, col="darkgray")
legend("topleft",levels(salaries$rank),fill=col3 )


eq <- substitute(italic(salary) == a + b %.% italic(yrs.service)*","~~italic(r)^2~"="~r2, 
                 list(a = format(coef(lm.fit)[1], digits = 2), 
                      b = format(coef(lm.fit)[2], digits = 2), 
                      r2 = format(summary(lm.fit)$r.squared, digits = 3)))
#R-squared: what proportion of the variation in the outcome can be explained by the covariates/predictors
# if R-squared is low, than the outcome is poorly predicted by the covariates/


text(15,170000,labels=as.expression(eq), pos=4, cex=.8 )

Histogram

#plot simple histogram
hist(salaries$salary)

# Add color and title:
hist(salaries$salary, 
     main="Histogram of Salaries of professors in Disciplines A and B",
     col="steelblue3",
     xlab="Salary, dollars")

# Same plot with probability along the y axes
hist(salaries$salary, 
     main="Histogram of Salaries of professors in Disciplines A and B",
     col="steelblue3",
     xlab="Salary, dollars", 
     prob=TRUE)

#Add rugplot
rug(salaries$salary)

#Add density line
lines(density(salaries$salary), col="darkblue", lwd=2)

Pairs

# Delays per airport
pairs( salaries[ ,c("yrs.since.phd", "yrs.service","salary") ] ) 

#An alternative way using formula
pairs( ~ yrs.since.phd + yrs.service + salary, 
       data = salaries)

Improve the plot appearance using custom function:

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)
}

pairs( ~ yrs.since.phd + yrs.service + salary, 
       data = salaries,
       lower.panel = panel.smooth, 
       upper.panel = panel.cor)

Multiple plots in one graph

#par.save <- par()  # save current graphics parameters
#print( par() )     # print current graphics parameters

par( mfrow = c(1,2) )  # 1 row and 2 columns

    # first graph
    hist(salaries$salary, 
     main="Histogram of Salaries of professors in Disciplines A and B",
     col="steelblue3",
     xlab="Salary, dollars", 
     prob=TRUE)

    #Add rugplot
    rug(salaries$salary)

    #Add density line
    lines(density(salaries$salary), col="darkblue", lwd=2)

    # second graph
    b<-barplot(table(salaries$rank),
           col=c("steelblue4","firebrick","forestgreen"),
           main = "Professor Ranks",
           names.arg = c("Associate Professor", "Assistant Professor", "Professor"),
           ylim = c(0, max(table(salaries$rank)) * 1.25), 
           axes=FALSE)
    text(b,table(salaries$rank),table(salaries$rank), pos=3)
    mtext("Disciplines A and B")

par ( mfrow = c(1,1) )

#basic plot with points
ggplot(salaries, aes( yrs.service, salary)) + geom_point()

ggplot(salaries, aes( yrs.service, salary)) + 
  geom_point() +    # type of geom objects
  theme_bw() +      # theme
  ggtitle(" Salary analysis")   # plot title

# color the points according to the faculty type
ggplot(salaries, aes( yrs.service, salary)) + 
  geom_point( aes(color = factor(rank)  ), size=5) +    # type of geom objects
  ggtitle(" Salary analysis") +  # plot title
  theme_bw( ) +      # theme
  theme ( plot.title=element_text(size=14,face="bold" ) )

#Add a regression line to the plot
ggplot(salaries, aes( yrs.service, salary)) + 
  geom_point( aes(color = factor(rank)  ), size=5) +    # type of geom objects
  ggtitle(" Salary analysis") +  # plot title
  geom_smooth() + 
  theme_bw( ) +      # theme
  theme ( plot.title=element_text(size=14,face="bold" ) )

## `geom_smooth()` using method = 'loess'

# Change the line to a simple lm line
ggplot(salaries, aes( yrs.service, salary)) + 
  geom_point( aes(color = factor(rank)  ), size=5) +    # type of geom objects
  ggtitle(" Salary analysis") +  # plot title
  geom_smooth( method='lm' , col = "black") + 
  theme_bw( ) +      # theme
  theme ( plot.title=element_text(size=14,face="bold" ) )

ggplot(ecoli) +
  geom_point(aes(x = sample, y= genome_size, 
                 color = generation, 
                 shape = cit), size = rel(3.0)) +
  theme(axis.text.x = element_text(angle=45, hjust=1))

#boxplot
ggplot(salaries, aes( x=discipline, y=salary ) ) + geom_boxplot()

#adding addional features to the boxplot
ggplot(salaries, aes( x=discipline, y=salary ) ) + 
  geom_boxplot() +
  geom_jitter(width = 0.2)

ggplot(salaries, aes( x=rank, y=salary) ) + 
  geom_boxplot( varwidth = TRUE ) + #width is proportional to the square roots of the number of observations
  geom_jitter(width = 0.2) + 
  ggtitle(" Salary analysis") + 
  coord_flip() + 
  theme ( plot.title=element_text(size=14,face="bold" ) )

ggplot(ecoli) +
  geom_boxplot(aes(x = cit, y = genome_size, fill = cit)) +
  ggtitle('Boxplot of genome size by citrate mutant type') +
  xlab('citrate mutant') +
  ylab('genome size') +
  theme(panel.grid.major = element_line(size = .5, color = "grey"),
        axis.text.x = element_text(angle=45, hjust=1),
        axis.title = element_text(size = rel(1.5)),
        axis.text = element_text(size = rel(1.25)))

graph <- ggplot(ecoli) +
  geom_boxplot(aes(x = cit, y = genome_size, fill = cit))

graph +ggtitle('Boxplot of genome size by citrate mutant type') +
  xlab('citrate mutant') +
  ylab('genome size') +
  theme(panel.grid.major = element_line(size = .5, color = "grey"),
        axis.text.x = element_text(angle=45, hjust=1),
        axis.title = element_text(size = rel(1.5)),
        axis.text = element_text(size = rel(1.25)))

#histogram
ggplot(salaries, aes(x=salary)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(salaries, aes(x=salary)) + geom_histogram(binwidth=10000)

#improving histogram
ggplot(salaries, aes(x=salary)) + 
  geom_histogram(binwidth=10000, 
                 color="black", fill="white")

ggplot(salaries, aes(x=salary)) + 
  geom_histogram(binwidth=10000, 
                 color="black", fill="white", aes(y=..density..)) + 
  geom_density(alpha=.3, fill="darkorange")

R intro (session 2)

Katia Oleinik (koleinik@bu.edu)

10/19/2017

Installing Packages

Reading input datasets;

dplyr package

R graphics

Base R Graphics