Fork me on GitHub

Differential Expression&作业

It contains my assignment in Bioinformatics course talking about Differential Expression and this semesters’ code

My assignment in bioinformatics course

Codes using different packages producing differential expression files

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
setwd('61samplesmatrix')
#rnanames = 'miRNA'
rnaname = 'miRNA'
commonpath = '04.counts/hcc_lulab.sequentialMap.homer.'
savepath = '05.diffexp/hcc_lulab.sequentialMap.homer.'
mx <- read.table(paste(commonpath,rnaname,".mx",sep=""),sep="\t",header=T)
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)
filter_genes <- apply(
mx[,2:ncol(mx)],
1,
function(x) length(x[x > 2]) >= 2
)
mx_filterGenes <- mx[filter_genes,]
####################################################################################
####################################################################################
library(DESeq2)
countData <- mx_filterGenes
colData <- design
dds <- DESeqDataSetFromMatrix(countData, colData, design=~Treatment, tidy=TRUE)
norm <- rlog(dds,blind=FALSE)
norm_matrix <- assay(norm)
norm_df <- data.frame(Gene=rownames(norm_matrix), norm_matrix)
if(!file.exists(paste(savepath,rnaname,".DESeq2.rlog.mx",sep=""))){
write.table(norm_df, paste(savepath,rnaname,".DESeq2.rlog.mx",sep=""), row.names = FALSE,sep="\t")}
deg <- DESeq(dds)

res <- results(deg,contrast=c("Treatment","normal","Before"),tidy=TRUE)
merged_res <- merge(norm_df,res,by.x="Gene",by.y="row")
if(!file.exists(paste(savepath,rnaname,".NormalvsBefore.DESeq2.tsv",sep=""))){
write.table(merged_res,file=paste(savepath,rnaname,".NormalvsBefore.DESeq2.tsv",sep=""),sep="\t",row.names=FALSE)
}

res1 <- results(deg,contrast=c("Treatment","normal","After"),tidy=TRUE)
merged_res1 <- merge(norm_df,res1,by.x="Gene",by.y="row")
if(!file.exists(paste(savepath,rnaname,".NormalvsAfter.DESeq2.tsv",sep=""))){
write.table(merged_res1,file=paste(savepath,rnaname,".NormalvsAfter.DESeq2.tsv",sep=""),sep="\t",row.names=FALSE)
}

res2 <- results(deg,contrast=c("Treatment","After","Before"),tidy=TRUE)
merged_res2 <- merge(norm_df,res2,by.x="Gene",by.y="row")
if(!file.exists(paste(savepath,rnaname,".AftervsBefore.DESeq2.tsv",sep=""))){
write.table(merged_res2,file=paste(savepath,rnaname,".AftervsBefore.DESeq2.tsv",sep=""),sep="\t",row.names=FALSE)
}

####################################################################################
####################################################################################
library(edgeR)
#Read Data in
countData <- mx_filterGenes[,-1]
rownames(countData) <- mx_filterGenes[,1]
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)
colData <- design
y <- DGEList(countData, samples=colData, group=colData$Treatment)
y <- calcNormFactors(y)
design <-model.matrix(~Treatment, data=colData)
y <- estimateDisp(y, design)
et <- exactTest(y)
res <- topTags(et,Inf)
tidyResult <- data.frame(Gene=rownames(res$table), res$table)
if(!file.exists(paste(savepath,rnaname,".NCvsHCC.edgeR.classic.tsv",sep=""))){
write.table(tidyResult,file=paste(savepath,rnaname,".NCvsHCC.edgeR.classic.tsv",sep=""),sep="\t",row.names=FALSE)}
fit <- glmFit(y,design)
lrt <- glmLRT(fit,contrast = c(1,-1))
FDR <- p.adjust(lrt$table$PValue, method="BH")
padj_lrt <- cbind(lrt$table,FDR)
fit_df <- lrt$fitted.values
if(!file.exists(paste(savepath,rnaname,".homer.edgeR.TMM.mx",sep=""))){
write.table(fit_df,file = paste(savepath,rnaname,".homer.edgeR.TMM.mx",sep=""),row.names = T, sep="\t", quote=F)}
merged_lrt <- merge(fit_df,padj_lrt,by="row.names")
colnames(merged_lrt)[1] <- "Genes"
if(!file.exists(paste(savepath,rnaname,".NCvsHCC.edgeR.tsv",sep=""))){
write.table(merged_lrt,file =paste(savepath,rnaname,".NCvsHCC.edgeR.tsv",sep=""),row.names = F, sep="\t", quote=F)}
####################################################################################
####################################################################################

cpmMx <- read.table(paste(commonpath,rnaname,".homer.rpm.m",sep=""),sep="\t",header=T)
filter_cpm <- apply(
mx[,2:ncol(cpmMx)],
1,
function(x) length(x[x > 0]) >= 2
)
mx_filterCPM <- cpmMx[filter_cpm,]
myFun <- function(x){
x = as.numeric(x)
v1 = x[2:4]
v2 = x[5:10]
out <- wilcox.test(v1,v2,exact = F)
out <- out$p.value
}
p_value <- apply(mx_filterCPM,1,myFun)
p_value[is.nan(p_value)] <- 1
FDR <- p.adjust(p_value,method = "BH")
mx_filterCPM$avgNC <- apply(mx_filterCPM[,2:4],1,mean)
mx_filterCPM$avgHCC <- apply(mx_filterCPM[,5:10],1,mean)
mx_filterCPM$log2fc <- log2((mx_filterCPM$avgNC+0.25)/(mx_filterCPM$avgHCC+0.25))
results <- cbind(mx_filterCPM,p_value,FDR)
if(!file.exists(paste(savepath,rnaname,".NCvsHCC.wilcox.tsv",sep=""))){
write.table(results,file = paste(savepath,rnaname,".NCvsHCC.wilcox.tsv",sep=""),row.names = F, sep="\t", quote=F)}

some plot from DESeq2

Here is the pdf version of the following codes and its output.

If you can’t see pdf, please try this more interactive way by click here

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
### DESeq2
{r warning=FALSE}
# filter expression matrix
mx <- read.table("04.counts/hcc_example.miRNA.homer.ct.mx",sep="\t",header=T)

# filter genes
filter_genes <- apply(
mx[,2:ncol(mx)],
1,
function(x) length(x[x > 2]) >= 2
)

mx_filterGenes <- mx[filter_genes,]

# load experimential design
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)

#-----------------------------------------
# basic script for normalizing with DESeq2
library(DESeq2)
#Read Data in
countData <- mx_filterGenes
colData <- design

# generate Dataset
dds <- DESeqDataSetFromMatrix(countData, colData, design=~Treatment, tidy=TRUE)

# normlize using rlog method
norm <- rlog(dds,blind=FALSE)
norm_matrix <- assay(norm)
norm_df <- data.frame(Gene=rownames(norm_matrix), norm_matrix)

if(!file.exists("05.diffexp/hcc_example.miRNA.homer.DESeq2.rlog.mx")){
write.table(norm_df, "05.diffexp/hcc_example.miRNA.homer.DESeq2.rlog.mx", row.names = FALSE,sep="\t")}

deg <- DESeq(dds)
res <- results(deg,tidy=TRUE)
merged_res <- merge(norm_df,res,by.x="Gene",by.y="row")

if(!file.exists("05.diffexp/hcc_example.miRNA.NCvsHCC.DESeq2.tsv")){
write.table(merged_res,file="05.diffexp/hcc_example.miRNA.NCvsHCC.DESeq2.tsv",sep="\t",row.names=FALSE)
}

the codes use DESeq2 and produces files:

05.diffexp/hcc_example.miRNA.homer.DESeq2.rlog.mx

05.diffexp/hcc_example.miRNA.NCvsHCC.DESeq2.tsv

edgeR

warning
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# 0.filter expression matrix
mx <- read.table("04.counts/hcc_example.miRNA.homer.ct.mx",sep="\t",header=T)

# filter genes
filter_genes <- apply(
mx[,2:ncol(mx)],
1,
function(x) length(x[x > 2]) >= 2
)

mx_filterGenes <- mx[filter_genes,]

# load experimential design
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)

#--------------------------------
# basic script for running edgeR
library(edgeR)
#Read Data in
countData <- mx_filterGenes[,-1]
rownames(countData) <- mx_filterGenes[,1]
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)
colData <- design

# generate DGE object
y <- DGEList(countData, samples=colData, group=colData$Treatment)
y <- calcNormFactors(y)

#Estimate Error Model
design <-model.matrix(~Treatment, data=colData)
y <- estimateDisp(y, design)

# classic methods: compute p-values, then output
et <- exactTest(y)
res <- topTags(et,Inf)
tidyResult <- data.frame(Gene=rownames(res$table), res$table)

if(!file.exists("05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.classic.tsv")){
write.table(tidyResult,file="05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.classic.tsv",sep="\t",row.names=FALSE)}

# Generalized linear models
fit <- glmFit(y,design)
# likelihood ratio test
lrt <- glmLRT(fit,contrast = c(1,-1))
FDR <- p.adjust(lrt$table$PValue, method="BH")
padj_lrt <- cbind(lrt$table,FDR)
fit_df <- lrt$fitted.values
if(!file.exists("05.diffexp/hcc_example.miRNA.homer.edgeR.TMM.mx")){
write.table(fit_df,file = "05.diffexp/hcc_example.miRNA.homer.edgeR.TMM.mx",row.names = T, sep="\t", quote=F)}
merged_lrt <- merge(fit_df,padj_lrt,by="row.names")
colnames(merged_lrt)[1] <- "Genes"

if(!file.exists("05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.tsv")){
write.table(merged_lrt,file ="05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.tsv",row.names = F, sep="\t", quote=F)}

the codes use edgeR and produces files:

05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.classic.tsv

05.diffexp/hcc_example.miRNA.homer.edgeR.TMM.mx

05.diffexp/hcc_example.miRNA.NCvsHCC.edgeR.tsv

Wilcox/Mann-Whitney-U Test

1. normalize the reads by library size (edgeR)

2. identify differential expressed gene using wilcoxon.test()

warning
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
cpmMx <- read.table("04.counts/hcc_example.miRNA.homer.rpm.mx",sep="\t",header=T)
filter_cpm <- apply(
mx[,2:ncol(cpmMx)],
1,
function(x) length(x[x > 0]) >= 2
)
mx_filterCPM <- cpmMx[filter_cpm,]

myFun <- function(x){
x = as.numeric(x)
v1 = x[2:4]
v2 = x[5:10]
out <- wilcox.test(v1,v2,exact = F)
out <- out$p.value
}
p_value <- apply(mx_filterCPM,1,myFun)
p_value[is.nan(p_value)] <- 1
FDR <- p.adjust(p_value,method = "BH")
mx_filterCPM$avgNC <- apply(mx_filterCPM[,2:4],1,mean)
mx_filterCPM$avgHCC <- apply(mx_filterCPM[,5:10],1,mean)
mx_filterCPM$log2fc <- log2((mx_filterCPM$avgNC+0.25)/(mx_filterCPM$avgHCC+0.25))
results <- cbind(mx_filterCPM,p_value,FDR)
if(!file.exists("05.diffexp/hcc_example.miRNA.NCvsHCC.wilcox__.tsv")){
write.table(results,file = "05.diffexp/hcc_example.miRNA.NCvsHCC.wilcox__.tsv",row.names = F, sep="\t", quote=F)}

the codes use wilconx test and produces files:

05.diffexp/hccexample.miRNA.NCvsHCC.wilcox.tsv

exploring results

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# filter expression matrix
mx <- read.table("04.counts/hcc_example.miRNA.homer.ct.mx",sep="\t",header=T)

# filter genes
filter_genes <- apply(
mx[,2:ncol(mx)],
1,
function(x) length(x[x > 2]) >= 2
)

mx_filterGenes <- mx[filter_genes,]

# load experimential design
design <- read.table("05.diffexp/design.tsv",sep="\t",header=T)

#-----------------------------------------
# basic script for normalizing with DESeq2
library(DESeq2)
#Read Data in
countData <- mx_filterGenes
colData <- design

# generate Dataset
dds <- DESeqDataSetFromMatrix(countData, colData, design=~Treatment, tidy=TRUE)

# normlize using rlog mathod
norm <- rlog(dds,blind=FALSE)
norm_matrix <- assay(norm)

# diff-exp analysis using DESeq2
deg <- DESeq(dds)
res <- results(deg,tidy=TRUE)

Heatmap for count matrix

1
2
3
4
library("pheatmap")
select <- order(rowMeans(counts(deg,normalized=TRUE)), decreasing=TRUE)[1:25]
df <- as.data.frame(colData(deg)[,c("Treatment","Sample")])
pheatmap(assay(norm)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=TRUE, annotation_col=df)
-----The ---- end ------- Thanks --- for --- Reading----