This protocol processes RNA-seq data using the R programming environment and specialized packages from Bioconductor to create genes lists. Novice users can copy and paste commands into the R console. To create gene expression data for Protocol step 6B, we downloaded gene expression data from the Ovarian Serous Cystadenocarcinoma project of The Cancer Genome Atlas (TCGA)1, http://cancergenome.nih.gov via the Genomic Data Commons (GDC) portal2 on 2017-06-14 using TCGABiolinks R package3. The data includes 544 samples available as RMA-normalized microarray data (Affymetrix HG-U133A), and 309 samples available as RNA-seq data, with reads mapped to a reference genome using MapSplice4 and read counts per transcript determined using the RSEM method5. RNA-seq data are labeled as ‘RNA-seq V2’, see details at: https://wiki.nci.nih.gov/display/TCGA/RNASeq+Version+2). The RNA-seqV2 data consists of raw counts similar to regular RNA-seq but RSEM (RNA-seq by Expectation Maximization) data can be used with the edgeR method.
This part of the supplementary protocol demonstrates filtering and scoring RNA-seq data using normalized RNA-seq count data with the edgeR R package. The protocol can be used to produce input data for pathway enrichment methods like g:Profiler, GSEA and others. This RNA-seq analysis protocol follows conceptually similar steps to microarray analysis shown above.
knitr::opts_knit$set(cache = TRUE)
tryCatch(expr = { library("edgeR")},
error = function(e) {
source("https://bioconductor.org/biocLite.R")
biocLite("edgeR")},
finally = library("edgeR"))
working_dir <- file.path(getwd(),"data")
#The field in the class definition file that defines the classes of the data.
data_classes <- "SUBTYPE"
RNASeq <- read.table(
file.path(working_dir,"Supplementary_Table10_TCGA_RNASeq_rawcounts.txt"),
header = TRUE, sep = "\t", quote="\"", stringsAsFactors = FALSE)
classDefinitions_RNASeq <- read.table(
file.path(working_dir, "Supplementary_Table11_RNASeq_classdefinitions.txt"),
header = TRUE, sep = "\t", quote="\"", stringsAsFactors = FALSE)
cpms <- cpm(RNASeq)
keep <- rowSums(cpms > 1) >= 50
counts <- RNASeq[keep,]
# create data structure to hold counts and subtype information for each sample.
d <- DGEList(counts=counts, group=classDefinitions_RNASeq$SUBTYPE)
#Normalize the data
d <- calcNormFactors(d)
#create multidimensional scaling(MDS) plot. The command below will automatically
# generate the plot containing all samples where each subtype is a different color.
#Ideally there should be a good separation between the different classes.
mds_filename <- file.path(working_dir, "mdsplot_allsamples.png")
png(filename = mds_filename)
mds_output <- plotMDS(d, labels=NULL, pch = 1,
col= c("darkgreen","blue","red", "orange")[factor(classDefinitions_RNASeq$SUBTYPE)],
xlim = c(-2.5,4), ylim = c(-2.5,4))
legend("topright",
legend=levels(factor(classDefinitions_RNASeq$SUBTYPE)),
pch=c(1), col= c("darkgreen","blue","red", "orange"),title="Class",
bty = 'n', cex = 0.75)
dev.off()
#calculate dispersion
d <- estimateCommonDisp(d)
d <- estimateTagwiseDisp(d)
#the below regular expression excludes gene names that are ? or that start with LOC
# any number of additional terms can be added to the regular expresion, for example
# to exclude any genes that start with "His" add |^His to the regular expression
exclude <- grep("\\?|^LOC", rownames(d), value=T)
d <- d[which(!rownames(d) %in% exclude),]
#calculate differential expression statistics with a simple design
de <- exactTest(d, pair=c("Immunoreactive","Mesenchymal"))
tt_exact_test <- topTags(de,n=nrow(d))
#alternately you can also use the glm model using contrasts.
#For a simple 2 class comparison this is not required but if you want to compare
# 1 class to the remaining 3 classes then this sort of model is useful.
classes <- factor(classDefinitions_RNASeq[,data_classes])
modelDesign <- model.matrix(~ 0 + classes)
contrast_mesenvsimmuno <- makeContrasts(
mesenvsimmuno ="classesMesenchymal-classesImmunoreactive",
levels=modelDesign)
fit_glm <- glmFit(d,modelDesign)
mesenvsimmuno <- glmLRT(fit_glm , contrast = contrast_mesenvsimmuno)
tt_mesenvsimmuno <- topTags(mesenvsimmuno,n=nrow(d))
Examples of different designs that can be used for this dataset. Instead of simple two class design you can also compare one class to the remaining three classes.
classes <- factor(classDefinitions_RNASeq[,data_classes])
modelDesign <- model.matrix(~ 0 + classes)
contrast_immuno <- makeContrasts(
immunovsrest ="classesImmunoreactive-(classesMesenchymal +
classesProliferative +classesDifferentiated)/3",
levels=modelDesign)
fit_glm <- glmFit(d,modelDesign)
immunovsrest <- glmLRT(fit_glm , contrast = contrast_immuno)
tt_immunovsrest <- topTags(immunovsrest,n=nrow(d))
contrast_mesen <- makeContrasts(
mesenvsrest = "classesMesenchymal-(classesImmunoreactive +
classesProliferative +classesDifferentiated)/3",
levels=modelDesign)
fit_glm <- glmFit(d,modelDesign)
mesenvsrest <- glmLRT(fit_glm , contrast = contrast_mesen)
tt_mesenvsrest <- topTags(mesenvsrest,n=nrow(d))
contrast_prolif <- makeContrasts(
prolifvsrest = "classesProliferative-(classesMesenchymal +
classesImmunoreactive +classesDifferentiated)/3",
levels=modelDesign)
fit_glm <- glmFit(d,modelDesign)
prolifvsrest <- glmLRT(fit_glm , contrast = contrast_prolif)
tt_prolifvsrest <- topTags(prolifvsrest,n=nrow(d))
contrast_diff <- makeContrasts(
diffvsrest = "classesDifferentiated-(classesMesenchymal +
classesImmunoreactive +classesProliferative)/3",
levels=modelDesign)
fit_glm <- glmFit(d,modelDesign)
diffvsrest <- glmLRT(fit_glm , contrast = contrast_diff)
tt_diffvsrest <- topTags(diffvsrest,n=nrow(d))
8a. Create the gene list for use in g:Profiler or another thresholded enrichment tool. The list may comprise all genes that have a significant FDR-corrected p-value (code shown below), all significant and FDR-corrected up-regulated genes and all down-regulated genes separately, or some other combination of thresholds. Also see analogous step in the microarray protocol.
tt <- tt_exact_test
#get the indices of scored dataset that have FDR < 0.05
select_genes = which(tt$table$FDR < 0.05)
#output how many genes there are in the set that have FDR < 0.05
length(select_genes)
#gene names from the TCGA set contain gene name and entrez gene ids separated by ‘|’
# for all subsequent enrichment analysis we need to have just one id. Separate the names
# into their two ids and keep the gene symbols
topgenes_qvalue005 <- unlist(lapply(rownames(tt$table)[select_genes],
function(data) {unlist(strsplit(data,"\\|"))[1]}))
#output the top 5 entries in the list of top genes
head(topgenes_qvalue005)
#write results out to the file. This is an example of a set that can be used for
# Protocol 1
write.table(topgenes_qvalue005,
file.path(working_dir,"MesenchymalvsImmunoreactive_RNAseq_allsignificantgenes.txt"),
col.names=FALSE, sep="\t", row.names=FALSE, quote=FALSE)
8b. Create a two-column rank (.RNK) file of all gene IDs and corresponding scores to for GSEA pre-ranked analysis. To run GSEA in pre-ranked mode, you need a two column RNK file with gene/protein/probe name (column 1) and the associated score (column 2). The first column should contain the same type of gene IDs used in the pathway gene-set (GMT) file. GSEA will look for enrichment in the set of most differentially expressed genes at the top of the list as well as those at the bottom of the list. Genes at the top of the list are more highly expressed in class A of samples (e.g., Mesenchymal) while genes at the bottom are highly expressed in class B (e.g., Immunoreactive). A score can be computed by multiplying direction (sign) of fold change and logarithm of p-value for each gene.
#calculate ranks
ranks_RNAseq = sign(tt$table$logFC) * -log10(tt$table$PValue)
#gene names from the TCGA set contain gene name and entrez gene ids separated by ‘|’
# for all subsequent enrichment analysis we need to have just one id. Separate the names
# into their two ids.
genenames <- unlist(lapply( rownames(tt$table), function(data)
{unlist(strsplit(data,"\\|"))[1]}))
geneids <- unlist(lapply( rownames(tt$table),
function(data) {unlist(strsplit(data,"\\|"))[2]}))
#create ranks file
ranks_RNAseq <- cbind(genenames, ranks_RNAseq)
colnames(ranks_RNAseq) <- c("GeneName","rank")
#sort ranks in decreasing order
ranks_RNAseq <- ranks_RNAseq[order(as.numeric(ranks_RNAseq[,2]),decreasing = TRUE),]
write.table(ranks_RNAseq, file.path(working_dir,
"Supplementary_Table2_MesenvsImmuno_RNASeq_ranks.rnk"),
col.name = TRUE, sep="\t", row.names = FALSE, quote = FALSE)
Small section of the top and bottom of the resulting rank file:
head(ranks_RNAseq)
tail(ranks_RNAseq)
9a. (Optional) Create an expression file for the enrichment map and save it to a file in the working folder. The optional expression file is created from the original RNA-seq expression matrix used for tha anlysis (variable d above) with the addition of a column on the left edge of the matrix. The additional field often contains a gene description however any text value can be added.
#fix issue with biomart not working because of url redirection
options(RCurlOptions=list(followlocation=TRUE, postredir=2L))
normalized_expression_RNAseq <- cpm(d, normalized.lib.size=TRUE)
#From the rownames parse out the gene name and the geneids
genenames <- unlist(lapply( rownames(normalized_expression_RNAseq),
function(data) {unlist(strsplit(data,"\\|"))[1]}))
geneids <- unlist(lapply( rownames(normalized_expression_RNAseq),
function(data) {unlist(strsplit(data,"\\|"))[2]}))
EM_expressionFile_RNAseq <- data.frame(Name = genenames, normalized_expression_RNAseq)
rownames(EM_expressionFile_RNAseq) <- rownames(normalized_expression_RNAseq)
colnames(EM_expressionFile_RNAseq) <- substring(colnames(EM_expressionFile_RNAseq),1,12)
#Add descriptions instead of geneids
tryCatch(expr = { library("biomaRt")},
error = function(e) {
source("https://bioconductor.org/biocLite.R")
biocLite("biomaRt")},
finally = library("biomaRt"))
mart = useMart(biomart="ENSEMBL_MART_ENSEMBL")
mart = useDataset(mart, dataset="hsapiens_gene_ensembl" )
genes = getBM(attributes = c( 'hgnc_symbol', 'description'), filters='hgnc_symbol',
values=genenames, mart=mart);
genes$description = gsub("\\[Source.*", "", genes$description);
EM_expressionFile_RNAseq <- merge(genes,EM_expressionFile_RNAseq,
all.y=TRUE,by.x=1, by.y=1)
colnames(EM_expressionFile_RNAseq)[1] <- "Name"
colnames(EM_expressionFile_RNAseq)[2] <- "Description"
write.table(EM_expressionFile_RNAseq,
file.path(working_dir,
"Supplementary_Table6_TCGA_OV_RNAseq_expression.txt"),
col.name=TRUE,sep="\t", row.names=FALSE, quote=FALSE)
9b. (Optional) GSEA CLS file defining the phenotype (i.e. biological conditions) of each sample in the expression file, for example, see Supplementary_Table7_TCGA_OV_RNAseq_classes.cls. This file is only required for phenotype randomization in GSEA, however providing it to EnrichmentMap will label the columns of the expression file in the EnrichmentMap heat map viewer by phenotype.
#write out a GSEA classes file. (optional)
fileConn <- file(
file.path(working_dir,"Supplementary_Table7_TCGA_OV_RNAseq_classes.cls"))
writeLines(c(paste(length(classDefinitions_RNASeq[,data_classes]), "4 1"),
paste("# ", unique(classDefinitions_RNASeq[,data_classes])[1], " ",
unique(classDefinitions_RNASeq[,data_classes])[2], " ",
unique(classDefinitions_RNASeq[,data_classes])[3], " ",
unique(classDefinitions_RNASeq[,data_classes])[4])), fileConn)
write.table(t(classDefinitions_RNASeq[,data_classes]),
file.path(working_dir,"Supplementary_Table7_TCGA_OV_RNAseq_classes.cls"),
col.name=FALSE, sep="\t",
row.names=FALSE, quote=FALSE, append=TRUE)
close(fileConn)
9c. (Optional) Examine gene expression data using heat maps. Heat maps can easily show the separation between sample classes, labeled by colors in the heat map header. By limiting to the most significantly differentially expressed list of genes (FDR-corrected p<0.05) we can verify whether the scoring accurately separates class A from class B. Resulting heatmap is shown in Figure 2.
tryCatch(expr = { library("pheatmap")},
error = function(e) {
install.packages("pheatmap")},
finally = library("pheatmap"))
tryCatch(expr = { library("RColorBrewer")},
error = function(e) {
install.packages("RColorBrewer")},
finally = library("RColorBrewer"))
annotation_col <- data.frame(SUBTYPE=factor(classDefinitions_RNASeq[,data_classes]))
rownames(annotation_col) <- substr(classDefinitions_RNASeq[,2],1,12)
ann_colors = list(SUBTYPE = c(Immunoreactive="blue", Mesenchymal="red",
Proliferative = "orange",Differentiated="darkgreen"))
col.pal <- rev(brewer.pal(11, "RdBu"))
genes_to_select <- unlist(lapply( rownames(tt$table)[which(tt$table$FDR<0.05)],
function(data) {unlist(strsplit(data,"\\|"))[1]}))
matrix_for_heatmap <- as.matrix(EM_expressionFile_RNAseq[which(EM_expressionFile_RNAseq[,1]
%in% genes_to_select ),3:dim(EM_expressionFile_RNAseq)[2] ])
class(matrix_for_heatmap) <- "numeric"
matrix_for_heatmap[matrix_for_heatmap == 0] <- 0.0000001
Example of other comparison that can be done with this dataset. Rank files for each are also created
#Immuno vs rest
tt <- tt_immunovsrest
select_genes = which(tt$table$FDR<0.05)
length(select_genes)
topgenes_qvalue005 <- unlist(lapply( rownames(tt$table)[select_genes],
function(data) {unlist(strsplit(data,"\\|"))[1]}))
head(topgenes_qvalue005)
write.table(topgenes_qvalue005,
file.path(working_dir,"ImmunovsRest_RNAseq_allsignificantgenes.txt"),
col.names=FALSE, sep="\t", row.names=FALSE, quote=FALSE)
ranks_RNAseq = sign(tt$table$logFC) * -log10(tt$table$PValue)
genenames <- unlist(lapply( rownames(tt$table), function(data)
{unlist(strsplit(data,"\\|"))[1]}))
geneids <- unlist(lapply( rownames(tt$table),
function(data) {unlist(strsplit(data,"\\|"))[2]}))
ranks_RNAseq <- cbind(genenames, ranks_RNAseq)
colnames(ranks_RNAseq) <- c("GeneName","rank")
write.table(ranks_RNAseq, file.path(working_dir,"ImmunovsRest_RNASeq_ranks.rnk"),
col.name = TRUE, sep="\t", row.names = FALSE, quote = FALSE)
#Mesen vs rest data
tt <- tt_mesenvsrest
select_genes = which(tt$table$FDR<0.05)
length(select_genes)
topgenes_qvalue005 <- unlist(lapply( rownames(tt$table)[select_genes],
function(data) {unlist(strsplit(data,"\\|"))[1]}))
head(topgenes_qvalue005)
write.table(topgenes_qvalue005,
file.path(working_dir,"MesenvsRest_RNAseq_allsignificantgenes.txt"),
col.names=FALSE, sep="\t", row.names=FALSE, quote=FALSE)
ranks_RNAseq = sign(tt$table$logFC) * -log10(tt$table$PValue)
genenames <- unlist(lapply( rownames(tt$table), function(data)
{unlist(strsplit(data,"\\|"))[1]}))
geneids <- unlist(lapply( rownames(tt$table),
function(data) {unlist(strsplit(data,"\\|"))[2]}))
ranks_RNAseq <- cbind(genenames, ranks_RNAseq)
colnames(ranks_RNAseq) <- c("GeneName","rank")
write.table(ranks_RNAseq, file.path(working_dir,"MesenvsRest_RNASeq_ranks.rnk"),
col.name = TRUE, sep="\t", row.names = FALSE, quote = FALSE)
#Differentiated vs rest data
tt <- tt_diffvsrest
select_genes = which(tt$table$FDR<0.05)
length(select_genes)
topgenes_qvalue005 <- unlist(lapply( rownames(tt$table)[select_genes],
function(data) {unlist(strsplit(data,"\\|"))[1]}))
head(topgenes_qvalue005)
write.table(topgenes_qvalue005,
file.path(working_dir,"DiffvsRest_RNAseq_allsignificantgenes.txt"),
col.names=FALSE, sep="\t", row.names=FALSE, quote=FALSE)
ranks_RNAseq = sign(tt$table$logFC) * -log10(tt$table$PValue)
genenames <- unlist(lapply( rownames(tt$table), function(data)
{unlist(strsplit(data,"\\|"))[1]}))
geneids <- unlist(lapply( rownames(tt$table),
function(data) {unlist(strsplit(data,"\\|"))[2]}))
ranks_RNAseq <- cbind(genenames, ranks_RNAseq)
colnames(ranks_RNAseq) <- c("GeneName","rank")
write.table(ranks_RNAseq, file.path(working_dir,"DiffvsRest_RNASeq_ranks.rnk"),
col.name = TRUE, sep="\t", row.names = FALSE, quote = FALSE)
#Proliferative vs rest data
tt <- tt_prolifvsrest
select_genes = which(tt$table$FDR<0.05)
length(select_genes)
topgenes_qvalue005 <- unlist(lapply( rownames(tt$table)[select_genes],
function(data) {unlist(strsplit(data,"\\|"))[1]}))
head(topgenes_qvalue005)
write.table(topgenes_qvalue005,
file.path(working_dir,"ProlifvsRest_RNAseq_allsignificantgenes.txt"),
col.names=FALSE, sep="\t", row.names=FALSE, quote=FALSE)
ranks_RNAseq = sign(tt$table$logFC) * -log10(tt$table$PValue)
genenames <- unlist(lapply( rownames(tt$table), function(data)
{unlist(strsplit(data,"\\|"))[1]}))
geneids <- unlist(lapply( rownames(tt$table),
function(data) {unlist(strsplit(data,"\\|"))[2]}))
ranks_RNAseq <- cbind(genenames, ranks_RNAseq)
colnames(ranks_RNAseq) <- c("GeneName","rank")
write.table(ranks_RNAseq, file.path(working_dir,"ProlifvsRest_RNASeq_ranks.rnk"),
col.name = TRUE, sep="\t", row.names = FALSE, quote = FALSE)
1. International Cancer Genome, C. et al. International network of cancer genome projects. Nature 464, 993–8 (2010).
2. Grossman, R. L. et al. Toward a shared vision for cancer genomic data. N Engl J Med 375, 1109–12 (2016).
3. Colaprico, A. et al. TCGAbiolinks: An r/bioconductor package for integrative analysis of tcga data. Nucleic Acids Res 44, e71 (2016).
4. Wang, K. et al. MapSplice: Accurate mapping of rna-seq reads for splice junction discovery. Nucleic Acids Res 38, e178 (2010).
5. Li, B. & Dewey, C. N. RSEM: Accurate transcript quantification from rna-seq data with or without a reference genome. BMC Bioinformatics 12, 323 (2011).
6. Verhaak, R. G. et al. Prognostically relevant gene signatures of high-grade serous ovarian carcinoma. J Clin Invest 123, 517–25 (2013).
7. Robinson, M. D., McCarthy, D. J. & Smyth, G. K. EdgeR: A bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics 26, 139–40 (2010).