Compact (AKA wide) data vs. tidy (AKA long) data:

GeneID | Sample1 | Sample2 |
---|---|---|
Gene1 | 473 | 526 |
Gene2 | 7203 | 6405 |
Gene3 | 59487 | 51467 |

The same data in tidy (long) form:

GeneID | Sample | Count |
---|---|---|
Gene1 | Sample1 | 473 |
Gene2 | Sample1 | 7203 |
Gene3 | Sample1 | 59487 |
Gene1 | Sample2 | 526 |
Gene2 | Sample2 | 6405 |
Gene3 | Sample2 | 51467 |
# magrittr supplies the %>% pipe; the package name is a pun on Magritte's
# painting caption "Ceci n'est pas une pipe" ("This is not a pipe").
library(magrittr)
# Fix the random seed so the ten draws below are reproducible.
set.seed(69)
# Pipe ten standard-normal draws into mean().
rnorm(10) %>% mean()
readr is the tidyverse replacement for base R's read.delim() family of functions.
readr returns tibbles. It is also fast and flexible.
# Base R: read the tab-delimited sample sheet into a data.frame.
# stringsAsFactors = TRUE is spelled out because this demonstration relies on
# ReadLength being read as a factor; that was the default before R 4.0.0 but
# is not any more.
df <- read.delim("examples/SampleInfo.txt", stringsAsFactors = TRUE)
# Pitfall: as.numeric() on a factor returns the internal level codes, not the
# values shown as labels.
2 * as.numeric(df$ReadLength)
# [1] 6 6 6 6 6 6 6 8 4 6 6 4 2 4 4 2 2 6 4 2 2 4 4 2 4 4 2...
# Workaround: convert to character first so the labels themselves are parsed
# (labels that are not numbers become NA).
2 * as.numeric(as.character(df$ReadLength))
# [1] 152 152 152 152 152 152 152 NA 250 152 152 250 200
# readr: read_tsv() returns a tibble and does not create factors on its own,
# so the column can be converted directly.
tibble <- read_tsv("examples/SampleInfo.txt")
2 * as.numeric(tibble$ReadLength)
# [1] 152 152 152 152 152 152 152 NA 250 152 152 250 200
# Small wide-format example: one row per gene, one count column per sample.
wide <- tribble(
  ~Gene,          ~Sample1, ~Sample2,
  "Gene1_APOE1",  473,      526,
  "Gene2_SETD1A", 7203,     6405,
  "Gene3_TCF4",   59487,    51467
)
# Wide -> long: stack every column of `wide` except Gene into Sample/Value
# pairs (one row per gene-sample combination).
long <- gather(wide, Sample, Value, -Gene)
# Long -> wide: spread the Sample values back out into one column each.
wide <- spread(long, Sample, Value)
# Split Gene into two columns. separate() splits on runs of non-alphanumeric
# characters by default, which here is the underscore.
separate(wide, Gene, c("GeneID", "Gene_name"))
# Each task is shown twice on the same sample sheet: base R first, then the
# dplyr equivalent.

# Keep only the Sample and Sex columns.
head(tibble[, c("Sample", "Sex")])
select(tibble, Sample, Sex) %>% head()

# Keep only rows where PCW is below 14.
head(subset(tibble, PCW < 14))
filter(tibble, PCW < 14) %>% head()

# Sort rows by ascending RIN.
head(tibble[order(tibble$RIN), ])
arrange(tibble, RIN) %>% head()
# Grouped summaries: base R aggregate() first, then the dplyr equivalent.

# Mean PCW per sex.
# aggregate() names the result column after the left-hand side of the formula,
# so the response is wrapped in cbind() to give it a new name. (An `=` inside
# the FUN body is only a local assignment and does not name the output.)
head(aggregate(cbind(av_age = PCW) ~ Sex, data = tibble, FUN = mean))
tibble %>% group_by(Sex) %>% summarise(av_age = mean(PCW)) %>% head()

# Mean and variance of RIN for every Sex/PCW combination.
# In the base R result, RIN is a single matrix column holding both statistics;
# summarise() returns them as two ordinary columns.
head(aggregate(RIN ~ Sex + PCW, data = tibble,
               FUN = function(x) c(mean = mean(x), var = var(x))))
tibble %>%
  group_by(Sex, PCW) %>%
  summarise(mean = mean(RIN), var = var(RIN)) %>%
  head()

# Number of samples per sex.
# NOTE(review): the formula interface drops rows with missing RIN or Sex by
# default, so this count can be lower than n() if the data contain NAs.
head(aggregate(cbind(num_samples = RIN) ~ Sex, data = tibble, FUN = length))
tibble %>% group_by(Sex) %>% summarise(num_samples = n()) %>% head()
# Add a derived column: total_length = 2 * ReadLength.
# Base R: assigning to tibble$total_length modifies `tibble` itself.
tibble$total_length <- 2 * as.numeric(tibble$ReadLength)
head(tibble)
# dplyr: mutate() returns a new table; `tibble` only changes if the result is
# assigned back to it.
tibble %>% mutate(total_length = 2 * as.numeric(ReadLength)) %>% head()
library(ggplot2)
# Bar chart of mean PCW per sex, with error bars of +/- one standard error
# (sd / sqrt(group size)).
tibble %>%
  group_by(Sex) %>%
  summarise(
    mean = mean(PCW),
    se = sd(PCW) / sqrt(n())
  ) %>%
  ggplot(mapping = aes(x = Sex, y = mean)) +
  geom_bar(stat = "identity", alpha = 0.5, fill = "royalblue4") +
  geom_errorbar(
    mapping = aes(ymin = mean - se, ymax = mean + se),
    alpha = 0.5,
    colour = "royalblue4"
  )
# Same per-sex summary, drawn as a point with error bars on top of the
# individual samples.
tibble %>%
  group_by(Sex) %>%
  summarise(
    mean = mean(PCW),
    se = sd(PCW) / sqrt(n())
  ) %>%
  ggplot(mapping = aes(x = Sex, y = mean)) +
  # Raw observations come from the full table rather than the summary, and are
  # jittered horizontally so overlapping points separate.
  geom_jitter(
    data = tibble,
    mapping = aes(x = Sex, y = PCW),
    position = position_jitter(width = 0.2),
    colour = "royalblue4",
    alpha = 0.1
  ) +
  geom_point(
    stat = "identity",
    shape = 5,
    size = 2,
    colour = "royalblue4",
    alpha = 2 / 3
  ) +
  geom_errorbar(
    mapping = aes(ymin = mean - se, ymax = mean + se),
    colour = "royalblue4",
    alpha = 2 / 3
  )
# Within each PCW group, keep the row(s) with the highest RIN (rank 1 when
# ranked in descending order; ties are all kept).
tibble %>%
  group_by(PCW) %>%
  filter(min_rank(desc(RIN)) == 1)

# Attach each sex's mean PCW to every row, then drop the grouping and compute
# how far each sample's PCW lies from its group mean.
tibble %>%
  group_by(Sex) %>%
  mutate(mean = mean(PCW)) %>%
  ungroup() %>%
  mutate(deviation = PCW - mean)
# Load the second table; its SampleID column is matched against Sample in the
# sample sheet for every join below.
counts <- read_tsv("Counts.txt")

# Inner join: keeps only rows common to both datasets.
inner_join(tibble, counts, by = c(Sample = "SampleID"))

# Left join: keeps all rows of the left table, filling with NA where the right
# table has no matching row.
left_join(tibble, counts, by = c(Sample = "SampleID"))

# Right join: keeps all rows of the right table, filling with NA where the
# left table has no matching row.
right_join(tibble, counts, by = c(Sample = "SampleID"))

# Full join: keeps all rows of both tables, filling with NA where either side
# has no matching row.
full_join(tibble, counts, by = c(Sample = "SampleID"))
stringr: a package for string manipulation
Example: extract the fragment size estimate from HOMER peak-calling output
# Read the whole file as a single string, take the first run of digits that
# directly follows "fragment size = ", and store it as a number.
frag_size <- as.numeric(
  str_extract(
    read_file("examples/peak_calling.txt"),
    "(?<=fragment size = )\\d+"
  )
)
purrr: apply a function across subsets of a data frame (among other things)
# For each statistic column, test its correlation with RIN across samples and
# report the correlation estimate (r) and p-value (p), one row per statistic.
inner_join(tibble, counts, by = c("Sample" = "SampleID")) %>%
  # NOTE(review): the columns to stack are chosen by position (7 to 13), so
  # this depends on the column order of the joined table -- confirm that these
  # are the intended columns.
  gather(stat, value, 7:13) %>%
  group_by(stat) %>%
  # One row per statistic; the remaining columns (including RIN and value) are
  # packed into the `data` list-column.
  nest() %>%
  mutate(
    # RIN lives inside each nested data frame, so it has to be reached through
    # `.` just like value.
    cor = map(data, ~ cor.test(.$RIN, .$value)),
    # Pull the components out of each test result by name rather than by
    # position.
    r = map_dbl(cor, "estimate"),
    p = map_dbl(cor, "p.value")
  ) %>%
  select(-data, -cor)