Bioinformatics analysis is often carried out in R. Pathway enrichment analysis can be also performed directly in R. This protocol will demonstrate the use of R packages ROAST1 and Camera2. Each method requires an expressionSet that minimally contains a matrix of expression values for a set of genes and conditions. The expression matrix generated in supplementary protocol part 1 or 2 is suitable for the analysis.
#working_dir <- “path/to/data”
working_dir <- "./data"
tryCatch(expr = { library("limma")},
error = function(e) { source("https://bioconductor.org/biocLite.R")
biocLite("limma")},
finally = library("limma"))
tryCatch(expr = { library("GSA")},
error = function(e) { source("https://bioconductor.org/biocLite.R")
biocLite("GSA")},
finally = library("GSA"))
tryCatch(expr = { library("RCurl")},
error = function(e) {
install.packages("RCurl")},
finally = library("RCurl"))
# This protocol can use RNA-seq expression data or microarray expression data.
#Specify which you would like to use
dataType_rnaseq <- TRUE
#The field in the class definition file that defines the classes of the data.
data_classes <- "SUBTYPE"
#string to name the analysis
analysis_name <- "Mesen_vs_Immuno"
#from Supplementary protocol 1
expression_file <- "Supplementary_Table6_TCGA_OV_RNAseq_expression.txt"
Only Human, Mouse and Rat gene set files are currently available on the Baderlab downloads site. If you are working with a species other than human (and it is either rat or mouse) change the gmt_url below to correct species. Check here to see all available species.
This can be done automatically through R or can be done manually through the website. To download it programmatically follow below:
gmt_url = "http://download.baderlab.org/EM_Genesets/current_release/Human/symbol/"
#list all the files on the server
filenames = getURL(gmt_url)
tc = textConnection(filenames)
contents = readLines(tc)
close(tc)
#get the gmt that has all the pathways and does not include terms
#inferred from electronic annotations(IEA)
#start with gmt file that has pathways only
rx = gregexpr("(?<=<a href=\")(.*.GOBP_AllPathways_no_GO_iea.*.)(.gmt)(?=\">)",
contents, perl = TRUE)
gmt_file = unlist(regmatches(contents, rx))
dest_gmt_file <- file.path(working_dir,paste("Supplementary_Table3_",gmt_file,sep="") )
download.file(
paste(gmt_url,gmt_file,sep=""),
destfile=dest_gmt_file
)
Load in the newly downloaded GMT file
#if you haven't automatically downloaded the gmt file set
#the path to the gmt file below.
gmt_file <- dest_gmt_file
capture.output(
genesets <- GSA.read.gmt(gmt_file),
file="./gsea_load_output.txt"
)
names(genesets$genesets) <- genesets$geneset.names
(optional) Recreate the DGEList from supplementary protocol rnaseq (Steps 1 -5 in Supplementary protocol 1 rnaseq) or if you are within the same sesison you can use the d variable.
if(dataType_rnaseq){
tryCatch(expr = { library("edgeR")},
error = function(e) { source("https://bioconductor.org/biocLite.R")
biocLite("edgeR")},
finally = library("edgeR"))
RNAseq <- read.table(
file.path(working_dir,"Supplementary_Table10_TCGA_RNASeq_rawcounts.txt"),
header = TRUE, sep = "\t", quote="\"", stringsAsFactors = FALSE)
classDefinitions_RNAseq <-read.table(
file.path(working_dir,"Supplementary_Table11_RNASeq_classdefinitions.txt"),
header = TRUE, sep = "\t", quote="\"", stringsAsFactors = FALSE)
cpms <- cpm(RNAseq)
keep <- rowSums(cpms > 1) >= 50
counts <- RNAseq [keep,]
#round counts to create whole numbers
counts <- round(counts)
exclude <- rownames(counts)[union(grep("\\?",rownames(counts)), grep("^LOC",
rownames(counts)))]
counts <- counts[which(!rownames(counts) %in% exclude),]
d <- DGEList(counts=counts, group=classDefinitions_RNAseq[,data_classes])
d <- calcNormFactors(d)
d <- estimateCommonDisp(d)
d <- estimateTagwiseDisp(d)
temp_names <- rownames(d)
rownames(d) <-
unlist(lapply(temp_names,function(x){ unlist(strsplit(x,"\\|"))[1]}))
data_for_gs_analysis <- d
classes <- data_for_gs_analysis$samples$group
}
Or alternately load data as a MinimalSet from microarray data (Steps 9 - 14 in Supplementary protocol 2 microarray)
if(!dataType_rnaseq){
tryCatch(expr = { library("Biobase")},
error = function(e) {
source("https://bioconductor.org/biocLite.R")
biocLite("Biobase")},
finally = library("Biobase"))
#load in files
expressionMatrix <- as.matrix(read.table(
file.path(working_dir, "Supplementary_Table12_TCGA_Microarray_rmanormalized.txt"),
header = TRUE, sep = "\t", quote="\"", stringsAsFactors = FALSE))
classDefinitions <- read.table(
file.path(working_dir,"Supplementary_Table13_Microarray_classdefinitions.txt"),
header = TRUE, sep = "\t", quote="\"", stringsAsFactors = FALSE)
identical(colnames(expressionMatrix), classDefinitions$patient)
#create minimal set
minimalSet <- ExpressionSet(assayData=expressionMatrix)
classes <- factor(classDefinitions[,data_classes])
#create model
modelDesign <- model.matrix(~ 0 + classes)
#assign data set for the analysis
data_for_gs_analysis <- minimalSet
}
If you haven’t defined the data above using RNA-seq data or microarray data, Set data to be your DGEList of MinimalSet.
#Should be a minimalSet or DGEList
#data_for_gs_analysis <- data you want to use for the analysis.
#classes <- class definitions for each of the samples in the dataset
genesets_filtered <- ids2indices(genesets$genesets, rownames(data_for_gs_analysis),
remove.empty=TRUE)
geneset_sizes <- unlist(lapply(genesets_filtered, length))
geneset_indices <- which(geneset_sizes>=15 & geneset_sizes<200)
design <- model.matrix(~ 0 + classes)
contrast_mesenvsimmuno <- makeContrasts(
mesenvsimmuno ="classesImmunoreactive-classesMesenchymal",levels=design)
mroast_results <- mroast(data_for_gs_analysis, genesets_filtered[geneset_indices],
design,contrast=contrast_mesenvsimmuno, nrot=10000)
mroast_descr <- unlist(lapply(rownames(mroast_results),
function(x){unlist(strsplit(x,"\\%"))[1]}))
In order to create an enrichment map from the results of the ROAST analysis we need to create a file that can be uploaded into the EnrichmentMap and translated into a network. EnrichmentMap requires a file that contains the pathway name, description, p-value associated with the enrichment, corrected p-value associated with the enrichment, a phenotype and a list of genes associated with the given pathway. The list of genes is optional bu in the absence of a list of genes a gmt file must be provided. If any of the required columns are missig from your analysis you can supply fake values in order to build an enrichment map.
mroast_results_file <- "mroast_results_generic_em.txt"
Phenotype <- unlist(lapply(mroast_results[,"Direction"],function(x)
{if(x=="Up"){1}else{(-1)}}))
genes <- c()
for(i in 1:length(rownames(mroast_results))){
current_geneset <- unlist(genesets_filtered
[ which( names(genesets_filtered) %in% rownames(mroast_results)[i])])
current_genes <- c()
for(j in 1:length(current_geneset)){
if(j==length(current_geneset)){
current_genes <- paste(current_genes,
rownames(data_for_gs_analysis)[current_geneset[j]],
sep="")
} else {
current_genes <- paste(current_genes,
rownames(data_for_gs_analysis)[current_geneset[j]],
",", sep="")
}
}
genes <- rbind(genes, current_genes)
}
rownames(genes) <- rownames(mroast_results)
mroast_results_generic_em <- data.frame( rownames(mroast_results), mroast_descr,
PValue=mroast_results[,"PValue"], FDR=mroast_results[,"FDR"], Phenotype, genes)
write.table(mroast_results_generic_em, file.path(working_dir,mroast_results_file),
col.name=TRUE, sep="\t", row.names=FALSE, quote=FALSE)
camera_results_file <- "camera_results_generic_em.txt"
camera_results <- camera(data_for_gs_analysis,
genesets_filtered[geneset_indices], design, contrast=contrast_mesenvsimmuno)
camera_descr <- unlist(lapply(rownames(camera_results),
function(x){unlist(strsplit(x,"\\%"))[1]}))
camera_Phenotype <- unlist(lapply(camera_results[,"Direction"],
function(x){if(x=="Up"){1}else{(-1)}}))
camera_genes <- c()
for(i in 1:length(rownames(camera_results))){
current_geneset <- unlist(
genesets_filtered[ which( names( genesets_filtered ) %in%
rownames(camera_results)[i])])
current_genes <- c()
for(j in 1:length(current_geneset)){
if(j==length(current_geneset)){
current_genes <- paste( current_genes,
rownames(data_for_gs_analysis) [current_geneset[j]],
sep="")
} else {
current_genes <- paste( current_genes,
rownames(data_for_gs_analysis)[ current_geneset[j]], ",",
sep="")
}
}
camera_genes <- rbind(camera_genes, current_genes)
}
rownames(camera_genes) <- rownames(camera_results)
camera_results_generic_em <- data.frame(rownames(camera_results), camera_descr,
PValue = camera_results[,"PValue"], FDR=camera_results[,"FDR"], Phenotype, genes )
write.table(camera_results_generic_em, file.path(working_dir,camera_results_file),
col.name=TRUE, sep="\t", row.names=FALSE, quote=FALSE)
The results from Camera2 or ROAST1 can be input to Enrichment Map, following the protocol(start from step 6) in the main text.
Optional: Build Enrichment map from the above results
Instead of creating an enrichment map through the Cytoscape user interface it is also possible to create it directly from R using cyrest commands. Below is an example of how to create an enrichment map directly from R using the Camera2 and ROAST1 results that have been created in this protocol.
Make sure that you have launch Cytoscape and installed all required apps as listed in the main Protocol (Step 1 - 4)
Build a network with Camera2 results:
#use easy cyRest library to communicate with cytoscape.
tryCatch(expr = { library(RCy3)},
error = function(e) { install_github("cytoscape/RCy3")}, finally = library(RCy3))
#defined threshold for GSEA enrichments (need to be strings for cyrest call)
pvalue_threshold <- "0.05"
qvalue_threshold <- "0.001"
similarity_threshold <- "0.25"
similarity_metric = "JACCARD"
generic_gmt_file <- file.path(getwd(),gmt_file)
cur_model_name <- paste("camera",analysis_name,sep="_")
results_filename <- file.path(getwd(),working_dir,camera_results_file)
#######################################
#create EM - camera results
#######################################
current_network_name <- paste(cur_model_name,pvalue_threshold,qvalue_threshold,sep="_")
em_command = paste('enrichmentmap build analysisType="generic"',
'gmtFile=',generic_gmt_file,
'pvalue=',pvalue_threshold,
'qvalue=',qvalue_threshold,
'similaritycutoff=',similarity_threshold,
'coefficients=',similarity_metric,
'enrichmentsDataset1=',results_filename,
'expressionDataset1=',file.path(getwd(),working_dir,
expression_file),
sep=" ")
#enrichment map command will return the suid of newly created network.
response <- commandsGET(em_command)
current_network_suid <- 0
#enrichment map command will return the suid of newly created network unless it Failed.
#If it failed it will contain the word failed
if(grepl(pattern="Failed", response)){
paste(response)
} else {
current_network_suid <- response
}
response <- renameNetwork(current_network_name, as.numeric(current_network_suid))
When building a network if the commandsGet returns an error similar to this:
“RCy3::commandsGET, HTTP Error Code: 500
url=http://localhost:1234/v1/commands/enrichmentmap/build?analysisType=generic
&gmtFile=[path/to/file]/data/Supplementary_Table3_Human_GOBP_
AllPathways_no_GO_iea_March_01_2018_symbol.gmt
&pvalue=%200.05&qvalue=%200.0001&similaritycutoff=%200.25&coefficients=%20JACCARD
&enrichmentsDataset1=[path/to/file]/data/mroast_results_generic_em.txt
&expressionDataset1=[path/to/file]/data/Supplementary_Table6_TCGA_OV_RNAseq_expression.txt
Error in commandsGET(em_command) : "
Copy the url (for example, from the above error use:
“http://localhost:1234/v1/commands/enrichmentmap/build?analysisType=generic
&gmtFile=[path/to/file]/data/Supplementary_Table3_Human_GOBP_
AllPathways_no_GO_iea_March_01_2018_symbol.gmt
&pvalue=%200.05&qvalue=%200.0001&similaritycutoff=%200.25&coefficients=%20JACCARD
&enrichmentsDataset1=[path/to/file]/data/mroast_results_generic_em.txt
&expressionDataset1=[path/to/file]/data/Supplementary_Table6_TCGA_OV_RNAseq_expression.txt“)
and paste it into a web browser to get a more descriptive error message. Do not copy the above link to your web browser. The url is specific to the machine you have run the notebook on. copy the url from your error message.
Sometimes the above error will come back when everything is fine. If there are no results returned because nothing passes the thresholds you specified the above error with appear in R. In the web browser after following the above link will show the error “Failed: None of the gene sets have passed the filter. Try relaxing the gene set filter parameters.”
Build a network with the ROAST1 results
cur_model_name <- paste("roast",analysis_name,sep="_")
results_filename <- file.path(getwd(),working_dir,mroast_results_file)
#######################################
#create EM -roast results
#######################################
current_network_name <- paste(cur_model_name,pvalue_threshold,qvalue_threshold,sep="_")
em_command = paste('enrichmentmap build analysisType="generic"',
'gmtFile=',generic_gmt_file,
'pvalue=',pvalue_threshold,
'qvalue=',qvalue_threshold,
'similaritycutoff=',similarity_threshold,
'coefficients=',similarity_metric,
'enrichmentsDataset1=',results_filename,
'expressionDataset1=',file.path(getwd(),working_dir,
expression_file),
sep=" ")
#enrichment map command will return the suid of newly created network.
response <- commandsGET(em_command)
current_network_suid <- 0
#enrichment map command will return the suid of newly created network unless it Failed.
# If it failed it will contain the word failed
if(grepl(pattern="Failed", response)){
paste(response)
} else {
current_network_suid <- response
}
response <- renameNetwork(current_network_name, network = as.numeric(current_network_suid))
1. Wu, D. et al. ROAST: Rotation gene set tests for complex microarray experiments. Bioinformatics 26, 2176–82 (2010).
2. Wu, D. & Smyth, G. K. Camera: A competitive gene set test accounting for inter-gene correlation. Nucleic Acids Res 40, e133 (2012).