hobrien.github.io

Tidyverse Tutorial

GeneID Sample1 Sample2
Gene1 473 526
Gene2 7203 6405
Gene3 59487 51467
GeneID Sample Count
Gene1 Sample1 473
Gene2 Sample1 7203
Gene3 Sample1 59487
Gene1 Sample2 526
Gene2 Sample2 6405
Gene3 Sample2 51467

The pipe function:

# magrittr provides the %>% pipe operator ("Ceci n'est pas une pipe").
library(magrittr)
# Fix the RNG seed so the ten normal draws below are reproducible.
set.seed(69)
# Pipe the draws into mean(); this is the same as mean(rnorm(10)).
rnorm(10) %>% mean()

Ceci n'est pas une pipe

tibble/readr

# Base R reader. The output below shows ReadLength was not read as numeric
# (presumably a factor, as with stringsAsFactors = TRUE on R < 4.0 -- one
# entry in the column is not a number).
df <- read.delim("examples/SampleInfo.txt")
# Converting directly yields small integer codes rather than the read lengths.
2 * as.numeric(df[["ReadLength"]])
# [1] 6 6 6 6 6 6 6 8 4 6 6 4 2 4 4 2 2 6 4 2 2 4 4 2 4 4 2...

# Going through character first recovers the real values (NA where the entry
# is not a number).
2 * as.numeric(as.character(df[["ReadLength"]]))
# [1] 152 152 152 152 152 152 152  NA 250 152 152 250 200

# readr reader: the same conversion works directly on the column it returns.
tibble <- read_tsv("examples/SampleInfo.txt")
2 * as.numeric(tibble[["ReadLength"]])
# [1] 152 152 152 152 152 152 152  NA 250 152 152 250 200

tidyr

# Small wide-format example: one row per gene, one count column per sample.
wide <- tribble(
  ~Gene, ~Sample1, ~Sample2,
  "Gene1_APOE1", 473, 526,
  "Gene2_SETD1A", 7203, 6405,
  "Gene3_TCF4", 59487, 51467
)

# Wide -> long: collapse every column except Gene into Sample/Value pairs.
# The input must be `wide`; the original passed `tibble` (the SampleInfo
# table read earlier), so the example table was never reshaped.
long <- gather(wide, Sample, Value, -Gene)
# Long -> wide: spread the Sample/Value pairs back into one column per sample.
wide <- spread(long, Sample, Value)

# Split Gene into two columns; separate()'s default separator is any run of
# non-alphanumeric characters, here the underscore.
separate(wide, Gene, c("GeneID", "Gene_name"))

dplyr

# Each base R idiom below is followed by its dplyr equivalent. All of them
# operate on `tibble`; the original referred to an undefined `SchoolData`
# in two places.

# Select columns.
head(tibble[, c("Sample", "Sex")])
select(tibble, Sample, Sex) %>% head()

# Filter rows.
head(subset(tibble, PCW < 14))
filter(tibble, PCW < 14) %>% head()

# Sort rows.
head(tibble[order(tibble$RIN), ])
arrange(tibble, RIN) %>% head()

# Grouped mean. A name assigned inside FUN is local to that function and does
# not rename the result, so the aggregate() output is renamed with setNames().
head(setNames(
  aggregate(PCW ~ Sex, data = tibble, FUN = mean),
  c("Sex", "av_age")
))
tibble %>% group_by(Sex) %>% summarise(av_age = mean(PCW)) %>% head()

# Several summary statistics over two grouping variables.
head(aggregate(
  RIN ~ Sex + PCW,
  data = tibble,
  FUN = function(x) c(mean = mean(x), var = var(x))
))
tibble %>%
  group_by(Sex, PCW) %>%
  summarise(mean = mean(RIN), var = var(RIN)) %>%
  head()

# Group sizes.
head(setNames(
  aggregate(RIN ~ Sex, data = tibble, FUN = length),
  c("Sex", "num_samples")
))
tibble %>% group_by(Sex) %>% summarise(num_samples = n()) %>% head()

# Add a derived column: in place with base R, then with mutate().
tibble$total_length <- 2 * as.numeric(tibble$ReadLength)
head(tibble)
tibble %>% mutate(total_length = 2 * as.numeric(ReadLength)) %>% head()
library(ggplot2)

# Bar chart of mean PCW per sex, with error bars of +/- one standard error.
tibble %>%
  group_by(Sex) %>%
  summarise(mean = mean(PCW), se = sd(PCW) / sqrt(n())) %>%
  ggplot(aes(x = Sex, y = mean)) +
  geom_bar(stat = "identity", fill = "royalblue4", alpha = 0.5) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    colour = "royalblue4",
    alpha = 0.5
  )

# Same summary drawn as a point with error bars, over the jittered raw PCW
# values; the jitter layer takes the unsummarised table through `data`.
tibble %>%
  group_by(Sex) %>%
  summarise(mean = mean(PCW), se = sd(PCW) / sqrt(n())) %>%
  ggplot(aes(x = Sex, y = mean)) +
  geom_jitter(
    aes(x = Sex, y = PCW),
    data = tibble,
    position = position_jitter(width = 0.2),
    colour = "royalblue4",
    alpha = 0.1
  ) +
  geom_point(
    stat = "identity",
    shape = 5,
    size = 2,
    colour = "royalblue4",
    alpha = 2 / 3
  ) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se),
    colour = "royalblue4",
    alpha = 2 / 3
  )

# Rows holding the top-ranked (highest) RIN within each PCW group; tied rows
# share rank 1 and are all kept.
tibble %>%
  group_by(PCW) %>%
  filter(min_rank(desc(RIN)) == 1)

# Per-sex mean PCW and each row's deviation from it, computed in one grouped
# mutate before the grouping is dropped.
tibble %>%
  group_by(Sex) %>%
  mutate(mean = mean(PCW), deviation = PCW - mean) %>%
  ungroup()
counts <- read_tsv("Counts.txt")

# Rows are matched on tibble$Sample == counts$SampleID in every join below.

# Inner join: keeps only rows common to both datasets.
tibble %>% inner_join(counts, by = c(Sample = "SampleID"))
# Left join: keeps all rows of the left table, NA where the right has no match.
tibble %>% left_join(counts, by = c(Sample = "SampleID"))
# Right join: keeps all rows of the right table, NA where the left has no match.
tibble %>% right_join(counts, by = c(Sample = "SampleID"))
# Full join: keeps all rows of both tables, NA where either side has no match.
tibble %>% full_join(counts, by = c(Sample = "SampleID"))

stringr

# Read the whole file as one string, take the first run of digits that
# follows "fragment size = " (look-behind keeps the label out of the match),
# and convert it to a number.
frag_size <- as.numeric(
  str_extract(
    read_file("examples/peak_calling.txt"),
    "(?<=fragment size = )\\d+"
  )
)

purrr

# Correlate RIN with each of the statistics held in columns 7:13 of the
# joined table: stack those columns into stat/value pairs, nest the rows per
# stat, then run cor.test() once per nested data frame.
inner_join(tibble, counts, by = c("Sample" = "SampleID")) %>%
  gather(stat, value, 7:13) %>%
  group_by(stat) %>%
  nest() %>%
  mutate(
    # After nest(), RIN lives inside each element of `data`, so it has to be
    # taken from the nested frame (.$RIN); a bare RIN is not in scope here.
    cor = map(data, ~ cor.test(.$RIN, .$value)),
    # Pull results out of the htest object by name instead of by position
    # (the original used element 4 for the estimate and 3 for the p-value).
    r = map_dbl(cor, "estimate"),
    p = map_dbl(cor, "p.value")
  ) %>%
  select(-data, -cor)

Useful resources

Alternatives to dplyr