Fork me on GitHub

Differential Expression

It contains my assignment in Bioinformatics course talking about Differential Expression and this semesters’ code

My assignment in bioinformatics course

Codes using different packages producing differential expression files

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
setwd('61samplesmatrix')
#rnanames = 'miRNA'
rnaname = 'miRNA'
commonpath = '04.counts/hcc_lulab.sequentialMap.homer.'
savepath = '05.diffexp/hcc_lulab.sequentialMap.homer.'
mx <- read.table(paste(commonpath,rnaname,".mx",sep=""),sep="\t",header=T)
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)
filter_genes <- apply(
mx[,2:ncol(mx)],
1,
function(x) length(x[x > 2]) >= 2
)
mx_filterGenes <- mx[filter_genes,]
####################################################################################
####################################################################################
library(DESeq2)
countData <- mx_filterGenes
colData <- design
dds <- DESeqDataSetFromMatrix(countData, colData, design=~Treatment, tidy=TRUE)
norm <- rlog(dds,blind=FALSE)
norm_matrix <- assay(norm)
norm_df <- data.frame(Gene=rownames(norm_matrix), norm_matrix)
if(!file.exists(paste(savepath,rnaname,".DESeq2.rlog.mx",sep=""))){
write.table(norm_df, paste(savepath,rnaname,".DESeq2.rlog.mx",sep=""), row.names = FALSE,sep="\t")}
deg <- DESeq(dds)

res <- results(deg,contrast=c("Treatment","normal","Before"),tidy=TRUE)
merged_res <- merge(norm_df,res,by.x="Gene",by.y="row")
if(!file.exists(paste(savepath,rnaname,".NormalvsBefore.DESeq2.tsv",sep=""))){
write.table(merged_res,file=paste(savepath,rnaname,".NormalvsBefore.DESeq2.tsv",sep=""),sep="\t",row.names=FALSE)
}

res1 <- results(deg,contrast=c("Treatment","normal","After"),tidy=TRUE)
merged_res1 <- merge(norm_df,res1,by.x="Gene",by.y="row")
if(!file.exists(paste(savepath,rnaname,".NormalvsAfter.DESeq2.tsv",sep=""))){
write.table(merged_res1,file=paste(savepath,rnaname,".NormalvsAfter.DESeq2.tsv",sep=""),sep="\t",row.names=FALSE)
}

res2 <- results(deg,contrast=c("Treatment","After","Before"),tidy=TRUE)
merged_res2 <- merge(norm_df,res2,by.x="Gene",by.y="row")
if(!file.exists(paste(savepath,rnaname,".AftervsBefore.DESeq2.tsv",sep=""))){
write.table(merged_res2,file=paste(savepath,rnaname,".AftervsBefore.DESeq2.tsv",sep=""),sep="\t",row.names=FALSE)
}

####################################################################################
####################################################################################
library(edgeR)
#Read Data in
countData <- mx_filterGenes[,-1]
rownames(countData) <- mx_filterGenes[,1]
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)
colData <- design
y <- DGEList(countData, samples=colData, group=colData$Treatment)
y <- calcNormFactors(y)
design <-model.matrix(~Treatment, data=colData)
y <- estimateDisp(y, design)
et <- exactTest(y)
res <- topTags(et,Inf)
tidyResult <- data.frame(Gene=rownames(res$table), res$table)
if(!file.exists(paste(savepath,rnaname,".NCvsHCC.edgeR.classic.tsv",sep=""))){
write.table(tidyResult,file=paste(savepath,rnaname,".NCvsHCC.edgeR.classic.tsv",sep=""),sep="\t",row.names=FALSE)}
fit <- glmFit(y,design)
lrt <- glmLRT(fit,contrast = c(1,-1))
FDR <- p.adjust(lrt$table$PValue, method="BH")
padj_lrt <- cbind(lrt$table,FDR)
fit_df <- lrt$fitted.values
if(!file.exists(paste(savepath,rnaname,".homer.edgeR.TMM.mx",sep=""))){
write.table(fit_df,file = paste(savepath,rnaname,".homer.edgeR.TMM.mx",sep=""),row.names = T, sep="\t", quote=F)}
merged_lrt <- merge(fit_df,padj_lrt,by="row.names")
colnames(merged_lrt)[1] <- "Genes"
if(!file.exists(paste(savepath,rnaname,".NCvsHCC.edgeR.tsv",sep=""))){
write.table(merged_lrt,file =paste(savepath,rnaname,".NCvsHCC.edgeR.tsv",sep=""),row.names = F, sep="\t", quote=F)}
####################################################################################
####################################################################################

cpmMx <- read.table(paste(commonpath,rnaname,".homer.rpm.m",sep=""),sep="\t",header=T)
filter_cpm <- apply(
mx[,2:ncol(cpmMx)],
1,
function(x) length(x[x > 0]) >= 2
)
mx_filterCPM <- cpmMx[filter_cpm,]
myFun <- function(x){
x = as.numeric(x)
v1 = x[2:4]
v2 = x[5:10]
out <- wilcox.test(v1,v2,exact = F)
out <- out$p.value
}
p_value <- apply(mx_filterCPM,1,myFun)
p_value[is.nan(p_value)] <- 1
FDR <- p.adjust(p_value,method = "BH")
mx_filterCPM$avgNC <- apply(mx_filterCPM[,2:4],1,mean)
mx_filterCPM$avgHCC <- apply(mx_filterCPM[,5:10],1,mean)
mx_filterCPM$log2fc <- log2((mx_filterCPM$avgNC+0.25)/(mx_filterCPM$avgHCC+0.25))
results <- cbind(mx_filterCPM,p_value,FDR)
if(!file.exists(paste(savepath,rnaname,".NCvsHCC.wilcox.tsv",sep=""))){
write.table(results,file = paste(savepath,rnaname,".NCvsHCC.wilcox.tsv",sep=""),row.names = F, sep="\t", quote=F)}

some plot from DESeq2

Here is the pdf version of the following codes and its output.

If you can’t see pdf, please try this more interactive way by click here

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
### DESeq2
{r warning=FALSE}
# filter expression matrix
mx <- read.table("04.counts/hcc_example.miRNA.homer.ct.mx",sep="\t",header=T)

# filter genes
filter_genes <- apply(
mx[,2:ncol(mx)],
1,
function(x) length(x[x > 2]) >= 2
)

mx_filterGenes <- mx[filter_genes,]

# load experimential design
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)

#-----------------------------------------
# basic script for normalizing with DESeq2
library(DESeq2)
#Read Data in
countData <- mx_filterGenes
colData <- design

# generate Dataset
dds <- DESeqDataSetFromMatrix(countData, colData, design=~Treatment, tidy=TRUE)

# normlize using rlog method
norm <- rlog(dds,blind=FALSE)
norm_matrix <- assay(norm)
norm_df <- data.frame(Gene=rownames(norm_matrix), norm_matrix)

if(!file.exists("05.diffexp/hcc_example.miRNA.homer.DESeq2.rlog.mx")){
write.table(norm_df, "05.diffexp/hcc_example.miRNA.homer.DESeq2.rlog.mx", row.names = FALSE,sep="\t")}

deg <- DESeq(dds)
res <- results(deg,tidy=TRUE)
merged_res <- merge(norm_df,res,by.x="Gene",by.y="row")

if(!file.exists("05.diffexp/hcc_example.miRNA.NCvsHCC.DESeq2.tsv")){
write.table(merged_res,file="05.diffexp/hcc_example.miRNA.NCvsHCC.DESeq2.tsv",sep="\t",row.names=FALSE)
}

the codes use DESeq2 and produces files:

05.diffexp/hcc_example.miRNA.homer.DESeq2.rlog.mx

05.diffexp/hcc_example.miRNA.NCvsHCC.DESeq2.tsv

edgeR

warning
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# 0.filter expression matrix
mx <- read.table("04.counts/hcc_example.miRNA.homer.ct.mx",sep="\t",header=T)

# filter genes
filter_genes <- apply(
mx[,2:ncol(mx)],
1,
function(x) length(x[x > 2]) >= 2
)

mx_filterGenes <- mx[filter_genes,]

# load experimential design
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)

#--------------------------------
# basic script for running edgeR
library(edgeR)
#Read Data in
countData <- mx_filterGenes[,-1]
rownames(countData) <- mx_filterGenes[,1]
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)
colData <- design

# generate DGE object
y <- DGEList(countData, samples=colData, group=colData$Treatment)
y <- calcNormFactors(y)

#Estimate Error Model
design <-model.matrix(~Treatment, data=colData)
y <- estimateDisp(y, design)

# classic methods: compute p-values, then output
et <- exactTest(y)
res <- topTags(et,Inf)
tidyResult <- data.frame(Gene=rownames(res$table), res$table)

if(!file.exists("05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.classic.tsv")){
write.table(tidyResult,file="05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.classic.tsv",sep="\t",row.names=FALSE)}

# Generalized linear models
fit <- glmFit(y,design)
# likelihood ratio test
lrt <- glmLRT(fit,contrast = c(1,-1))
FDR <- p.adjust(lrt$table$PValue, method="BH")
padj_lrt <- cbind(lrt$table,FDR)
fit_df <- lrt$fitted.values
if(!file.exists("05.diffexp/hcc_example.miRNA.homer.edgeR.TMM.mx")){
write.table(fit_df,file = "05.diffexp/hcc_example.miRNA.homer.edgeR.TMM.mx",row.names = T, sep="\t", quote=F)}
merged_lrt <- merge(fit_df,padj_lrt,by="row.names")
colnames(merged_lrt)[1] <- "Genes"

if(!file.exists("05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.tsv")){
write.table(merged_lrt,file ="05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.tsv",row.names = F, sep="\t", quote=F)}

the codes use edgeR and produces files:

05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.classic.tsv

05.diffexp/hcc_example.miRNA.homer.edgeR.TMM.mx

05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.tsv

Wilcox/Mann-Whitney-U Test

1. normalize the reads by library size (edgeR)

2. identify differential expressed gene using wilcoxon.test()

warning
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
cpmMx <- read.table("04.counts/hcc_example.miRNA.homer.rpm.mx",sep="\t",header=T)
filter_cpm <- apply(
mx[,2:ncol(cpmMx)],
1,
function(x) length(x[x > 0]) >= 2
)
mx_filterCPM <- cpmMx[filter_cpm,]

myFun <- function(x){
x = as.numeric(x)
v1 = x[2:4]
v2 = x[5:10]
out <- wilcox.test(v1,v2,exact = F)
out <- out$p.value
}
p_value <- apply(mx_filterCPM,1,myFun)
p_value[is.nan(p_value)] <- 1
FDR <- p.adjust(p_value,method = "BH")
mx_filterCPM$avgNC <- apply(mx_filterCPM[,2:4],1,mean)
mx_filterCPM$avgHCC <- apply(mx_filterCPM[,5:10],1,mean)
mx_filterCPM$log2fc <- log2((mx_filterCPM$avgNC+0.25)/(mx_filterCPM$avgHCC+0.25))
results <- cbind(mx_filterCPM,p_value,FDR)
if(!file.exists("05.diffexp/hcc_example.miRNA.NCvsHCC.wilcox__.tsv")){
write.table(results,file = "05.diffexp/hcc_example.miRNA.NCvsHCC.wilcox__.tsv",row.names = F, sep="\t", quote=F)}

the codes use wilconx test and produces files:

05.diffexp/hccexample.miRNA.NCvsHCC.wilcox.tsv

exploring results

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# filter expression matrix
mx <- read.table("04.counts/hcc_example.miRNA.homer.ct.mx",sep="\t",header=T)

# filter genes
filter_genes <- apply(
mx[,2:ncol(mx)],
1,
function(x) length(x[x > 2]) >= 2
)

mx_filterGenes <- mx[filter_genes,]

# load experimential design
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)

#-----------------------------------------
# basic script for normalizing with DESeq2
library(DESeq2)
#Read Data in
countData <- mx_filterGenes
colData <- design

# generate Dataset
dds <- DESeqDataSetFromMatrix(countData, colData, design=~Treatment, tidy=TRUE)

# normlize using rlog mathod
norm <- rlog(dds,blind=FALSE)
norm_matrix <- assay(norm)

# diff-exp analysis using DESeq2
deg <- DESeq(dds)
res <- results(deg,tidy=TRUE)

Heatmap for count matrix

1
2
3
4
library("pheatmap")
select <- order(rowMeans(counts(deg,normalized=TRUE)), decreasing=TRUE)[1:25]
df <- as.data.frame(colData(deg)[,c("Treatment","Sample")])
pheatmap(assay(norm)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=TRUE, annotation_col=df)
-----The ---- end ------- Thanks --- for --- Reading----