BINF_tut


Project maintained by sarbal Hosted on GitHub Pages — Theme by mattgraham

Week 9: Data visualisation and exploratory analysis

Objectives

Some visual things you can do with R and ways to explore your data systematically. In the tutorial you will:

Downloads

Once more, download these files into your working directory:

To check your working directory:

# Print the current working directory
getwd()

To set your working directory:

# "X:/project" is an example path; use the folder that holds your downloaded files
setwd("X:/project")

Run this to install/load libraries

# Run helper.R (looked up relative to the working directory)
source("helper.R") 

Setting up

Start a new notebook file by selecting “File” -> “New File” -> “R Notebook”. Save the file as “yourname_week9.Rmd”. Delete the instructions starting from “This is an [R…”. Insert each piece of code below as an R chunk. An R chunk is code placed after a line that starts with ```{r} and before a line that contains only ```.
As before, copy the code chunks into your R notebook as R chunks.

Flow control

# Report the sign of i.
# At top level, `else` has to follow the closing brace of the previous branch
# on the same line; if it starts a new line, R considers the `if` statement
# finished and stops with "unexpected 'else'".
i <- 2
if (i > 0) {
  print("Positive!")
} else if (i < 0) {
  print("Negative!")
} else {
  print("Zero!")
}

Loops and apply functions

lapply(X, FUN)
sapply(X, FUN) 
# Draw 100 values from 1..10 (with replacement), label them alternately
# "X" and "Y", then summarise the values within each label.
x <- sample(10, size = 100, replace = TRUE)
y <- rep(c("X", "Y"), 50)
tapply(x, y, sum)   # one total per label
tapply(x, y, list)  # the values themselves, grouped per label

Visuals

scatterplot

# Petal length against sepal length. Five colours are picked at random from
# the first 600 built-in colour names; each point is coloured by the integer
# code of its species, so only the first three picks are used.
plot(
  iris$Sepal.Length, iris$Petal.Length,
  type = "b",
  pch = 12, cex = 3,
  lwd = 4, lty = 4,
  col = colors()[sample(600, 5)][as.numeric(iris$Species)]
)

scatterplot

scatterplot

scatterplot

# Boxplot of sepal width for each species; boxes take colours 1 to 3 of the current palette
boxplot(iris$Sepal.Width~ iris$Species, col=1:3 )

scatterplot

# The same comparison drawn with beanplot() (not base R; presumably loaded by helper.R, confirm)
beanplot(iris$Sepal.Width~ iris$Species, col=list(1,2,3))

scatterplot

# Sepal widths grouped by species, as an unnamed list in factor-level order
# (for iris this is also the order in which the species first appear).
iris.list <- unname(split(iris$Sepal.Width, iris$Species))
# One violin per species
vioplot(iris.list[[1]], iris.list[[2]], iris.list[[3]], col = "darkgreen")

scatterplot

# Histogram of petal width over all flowers
hist(iris$Petal.Width, col="lightblue")

scatterplot

# Count, per species, the flowers with a petal width below 0.75
tapply(iris$Petal.Width < 0.75, iris$Species, sum)
# Histogram of all petal widths with one overlay per species. No breaks are
# passed to the overlays, so each one chooses its own bins.
hist(iris$Petal.Width, col = "lightblue")
species_cols <- c(setosa = "red", versicolor = "blue", virginica = "purple")
for (sp in names(species_cols)) {
  hist(iris$Petal.Width[iris$Species == sp], col = species_cols[[sp]], add = TRUE)
}

scatterplot

# Keep the histogram object so that its bin edges can be reused:
# breaks = h$breaks makes every per-species overlay use the bins of the
# full histogram.
h <- hist(iris$Petal.Width, col = "lightblue")
h  # print the stored histogram object
species_cols <- c(setosa = "red", versicolor = "blue", virginica = "purple")
for (sp in names(species_cols)) {
  hist(iris$Petal.Width[iris$Species == sp],
       breaks = h$breaks, col = species_cols[[sp]], add = TRUE)
}

scatterplot

# Draw the full histogram with col = 0 and border = 0, keeping the object for
# its bin edges, then overlay each species in a transparent colour.
# makeTransparent() is not defined in this section (presumably helper.R; confirm).
h <- hist(iris$Petal.Width, col = 0, border = 0)
species_cols <- c(setosa = "red", versicolor = "blue", virginica = "purple")
for (sp in names(species_cols)) {
  hist(iris$Petal.Width[iris$Species == sp],
       breaks = h$breaks, col = makeTransparent(species_cols[[sp]]), add = TRUE)
}

scatterplot

# Histogram on the density scale (freq = FALSE), so that a kernel density
# estimate can be drawn over it on the same y-axis.
h <- hist(iris$Petal.Width, freq = FALSE)
d_all <- density(iris$Petal.Width)
lines(d_all, col = "black")

scatterplot

points()
polygon()
segments()
abline()
rug()
text()
mtext()
legend()
...
## Ignore what these functions do for now, but copy them over 
## Diagonal panel for pairs(): histogram of one variable.
## Bar heights are scaled so the tallest bar is 1, and the panel's y-range is
## set to [0, 1.5] while drawing.
##   x    values shown in the panel
##   ...  further arguments passed on to rect()
panel.hist <- function(x, ...) {
  usr <- par("usr")
  # par() returns the previous values of the settings it changes; hand those
  # back on exit. Calling par(usr) with the bare numeric vector restores
  # nothing and warns "argument 1 does not name a graphical parameter".
  old_par <- par(usr = c(usr[1:2], 0, 1.5))
  on.exit(par(old_par), add = TRUE)
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  n_breaks <- length(breaks)
  heights <- h$counts / max(h$counts)
  rect(breaks[-n_breaks], 0, breaks[-1], heights, col = "lightgreen", ...)
}
## Upper panel for pairs(): print the absolute correlation of x and y,
## with size proportional to the correlations.
##   x, y     the two variables of the panel
##   digits   significant digits used when formatting the correlation
##   prefix   text pasted in front of the number
##   cex.cor  base text size; if missing, chosen from the width of the label
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  # Work in a unit square and restore the previous coordinates on exit.
  # par(usr) with the bare numeric vector restores nothing and warns.
  old_par <- par(usr = c(0, 1, 0, 1))
  on.exit(par(old_par), add = TRUE)
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if (missing(cex.cor)) cex.cor <- 0.8 / strwidth(txt)
  # Palette position 1..100 for r in [0, 1]. Rounding r * 100 to an integer
  # avoids truncation of values such as 0.29 * 100 (28.999...), and the clamp
  # keeps very small correlations from producing index 0.
  col_idx <- min(100L, max(1L, as.integer(round(r * 100))))
  text(0.5, 0.5, txt, cex = cex.cor * r, col = plasma(100)[col_idx])
}
## Scatterplot matrix of iris: panel.smooth below the diagonal, panel.hist on it, panel.cor above it
pairs(iris, bg=1:3,lower.panel = panel.smooth, pch=19, upper.panel = panel.cor, diag.panel = panel.hist, cex.labels = 2, font.labels = 2)

scatterplot

# Numeric matrix of the four measurement columns (one row per flower)
iris2  = apply(iris[,1:4], 2, as.numeric)
# Heatmap of the raw values with a per-row colour taken from the species code.
# heatmap.3() and cols7 are not defined in this section (presumably helper.R; confirm).
heatmap.3(iris2, RowSideCol=cols7[as.numeric(iris$Species)] , col=viridis(100))

heatmap

# Rank the four measurements within each flower (row); t() puts flowers back on the rows
iris.r  = t(apply(iris[,1:4], 1, rank))
# heatmap.3() and cols7 are not defined in this section (presumably helper.R; confirm)
heatmap.3(iris.r, RowSideCol=cols7[as.numeric(iris$Species)] , col=viridis(100))

heatmap


# Rank the flowers within each measurement (column)
iris.r2  = apply(iris[,1:4], 2, rank)
# heatmap.3() and cols7 are not defined in this section (presumably helper.R; confirm)
heatmap.3(iris.r2, RowSideCol=cols7[as.numeric(iris$Species)] , col=viridis(100))

heatmap

# Correlation between every pair of flowers: cor() works on columns, so the
# matrix is transposed first. Uses iris2 from the earlier heatmap snippet.
samples.cor = cor( t(iris2) )
# heatmap.3() and cols7 are not defined in this section (presumably helper.R; confirm)
heatmap.3(samples.cor, col=plasma(100), ColSideCol=cols7[as.numeric(iris$Species)])

heatmap

“Tidyverse” versions

We can do almost all of this with ggplot2. There are fewer finicky things to worry about, and it is generally more intuitive.

# Start a plot object: the data and the x/y mappings only, no layers yet
g <- ggplot(iris, aes(x = Sepal.Length, y = Petal.Length)) 
# Typing the object's name renders it
g

ggplot

# Add a point layer to g and render it
g <- g + geom_point()
g

ggplot

# Add a point layer coloured by species, on top of whatever layers g already has
g <- g + geom_point(aes(color = Species))
g

ggplot

# Rebuild the plot with colour mapped to species for every layer, and add a
# straight-line fit ("lm") without a confidence band (se = FALSE).
g <- ggplot(iris, aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
g

ggplot

# Boxplot of sepal length per species, boxes filled by species
g <- ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length))
g +
  geom_boxplot(mapping = aes(fill = Species)) +
  labs(y = "Sepal Length", title = "Iris Boxplot")

ggplot

# Histogram of petal width in bins of 0.2, bars filled by species
g <- ggplot(data = iris, mapping = aes(x = Petal.Width))
g +
  geom_histogram(mapping = aes(fill = Species), binwidth = 0.2, color = "black") +
  labs(x = "Petal Width", y = "Frequency", title = "Histogram of Petal Width")

ggplot

# Bar plot whose bar heights are the mean sepal length of each species
g <- ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length))
g +
  geom_bar(stat = "summary", fun = "mean") +
  labs(x = "Species", y = "Mean", title = "Bar plot of mean Sepal Length")

ggplot More here

Colors and palettes

# Built-in colour names and the current palette
colors()
palette()

# A colour can be given by name, by palette index or by hex code.
# c() turns all three into character strings.
blacks <- c("black", 1, "#000000")
reds <- c("red", 2, "#FF0000")

# Every built-in colour whose name contains "red", and a ramp through them
allreds <- grep("red", colors(), value = TRUE)
allredsRamp <- colorRampPalette(allreds)
allredsRamp(100)

# colorpanel() is not base R (presumably provided via helper.R; confirm)
grey2blue <- colorpanel(100, "lightgrey", "blue", "darkblue")

# Palettes that ship with R
rainbow(5)
heat.colors(10)
terrain.colors(100)
topo.colors(10)
cm.colors(5)

# Palettes from the viridis package
library(viridis)
n <- 10
magma(n)
plasma(n)
inferno(n)
viridis(n)
cividis(n)
turbo(n)

# Show palettes as large coloured dots
plot(1:n, col = magma(n), pch = 19, cex = 5)
plot(1:n, col = plasma(n), pch = 19, cex = 5)
n <- 100
plot(1:n, col = turbo(n), pch = 19, cex = 5)

plot plot plot

Test yourself!

  1. Download and load the file “R_dataviz.Rdata” into your environment.
  2. Plot three plots from the dataset “X” to answer the next set of questions. There are multiple columns in the dataset, so feel free to explore and play around first. a. What are the average weights per sex? Show as a histogram. b. What is the range of scute numbers per set? Show as a boxplot. c. What are the correlations between the biometrics? Show as a correlation plot.
  3. Now, using the dataset “Y”, plot a heatmap. Aim for clarity!
  4. And finally, look at dataset “Z”. Plot it the best way you think would show its key feature.
  5. “Knit” your R markdown file into an html page or a pdf.

Solutions

## Resources

Back to the homepage