This protocol processes RNA-seq data using the R programming environment and specialized packages from Bioconductor to create gene lists. The scripts are available for download, and novice users can copy and paste the commands into the R console. To create gene expression data for Protocol 1B, we downloaded gene expression data from the Ovarian Serous Cystadenocarcinoma project of The Cancer Genome Atlas (TCGA; http://cancergenome.nih.gov) via the Genomic Data Commons (GDC) portal on 2017-06-14 using the TCGAbiolinks R package. The dataset includes 544 samples available as RMA-normalized microarray data (Affymetrix HG-U133A) and 309 samples available as RNA-seq data, with reads mapped to a reference genome using MapSplice [58] and read counts per transcript determined using the RSEM method [59]. RNA-seq data are labeled in the dataset as ‘RNA-Seq V2’ (see details at: https://wiki.nci.nih.gov/display/TCGA/RNA-Seq+Version+2). The RNA-SeqV2 dataset consists of raw counts similar to regular RNA-seq, but RSEM data can be used with the edgeR method.
The TCGAbiolinks R package allows users to download raw or scored data directly from the GDC portal. Both new datasets and legacy TCGA data are available for download.
#for the latest version install from github
#devtools::install_github(repo = "BioinformaticsFMRP/TCGAbiolinks")
#source("https://bioconductor.org/biocLite.R")
#biocLite("TCGAbiolinks")
#make sure the installed version is at least 2.5.3
library("TCGAbiolinks")
library("SummarizedExperiment")
For our analysis, we want to restrict the samples in our supplementary files to just a few subtypes, but for a general analysis this is not required.
# Read the Verhaak subtype assignments (tab-delimited, with a header row);
# text columns are kept as character vectors rather than factors.
classDefinitions_verhaak <- read.table(
  "data/Verhaak_supplementary_table1_OV_subtypes.txt",
  header = TRUE,
  sep = "\t",
  quote = "\"",
  stringsAsFactors = FALSE
)
# Query the legacy GDC data for the open-access TCGA-OV gene expression
# quantification files on the HT_HG-U133A microarray platform, then download
# the matched files and assemble them into OVMicroarray.
query_microarray <- GDCquery(
  project = "TCGA-OV",
  data.category = "Gene expression",
  data.type = "Gene expression quantification",
  platform = "HT_HG-U133A",
  access = "open",
  legacy = TRUE
)
GDCdownload(query_microarray)
OVMicroarray <- GDCprepare(query_microarray)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|====================== | 35%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
# Build the microarray expression table and its matching class-definition table,
# restricted to patients with a subtype in the Verhaak class definitions, and
# write both to ./data.
microarray <- assay(OVMicroarray)
# Remove the duplicate genes (keep the first row per gene name) so the row names
# are unique. drop = FALSE keeps a matrix even if a single row or column remains.
microarray <- microarray[!duplicated(rownames(microarray)), , drop = FALSE]
# Pair every sample barcode (column 1) with its 12-character patient barcode
# (column 2), using "-" separators so it can be compared with the Verhaak IDs.
microarrayPatients <- cbind(
  colnames(microarray),
  gsub("\\.", "-", substring(colnames(microarray), 1, 12))
)
# Only include patients that were included in the Verhaak dataset with a
# non-missing subtype.
microarray <- microarray[
  ,
  microarrayPatients[, 2] %in%
    classDefinitions_verhaak[!is.na(classDefinitions_verhaak$SUBTYPE), "ID"],
  drop = FALSE
]
# Attach the subtype to every sample. merge() places the key column first, so the
# result columns are: 12-character patient ID, full sample barcode, SUBTYPE.
microarrayPatients <- merge(
  microarrayPatients,
  classDefinitions_verhaak[, c("ID", "SUBTYPE")],
  by.x = 2,
  by.y = 1
)
# NOTE: given that column order, "barcode" holds the 12-character patient ID and
# "patient" holds the full sample barcode (the value matching colnames(microarray)).
# The names are left unchanged because they become the header of the written file.
colnames(microarrayPatients) <- c("barcode", "patient", "SUBTYPE")
# Only include samples that are still present in the expression matrix.
microarrayPatients <- microarrayPatients[
  microarrayPatients$patient %in% colnames(microarray),
]
microarrayPatients <- microarrayPatients[order(microarrayPatients$SUBTYPE), ]
# Convert the barcodes so that they will be compatible with colnames
# (R doesn't like "-" in column names).
microarrayPatients$patient <- gsub("-", "\\.", microarrayPatients$patient)
colnames(microarray) <- gsub("-", "\\.", colnames(microarray))
# Reorder the expression columns to follow the subtype-sorted sample table.
microarray <- microarray[
  ,
  order(match(colnames(microarray), microarrayPatients$patient)),
  drop = FALSE
]
# col.names is spelled out in full rather than relying on partial matching.
write.table(
  microarray,
  "./data/Supplementary_Table10_TCGA_Microarray_rmanormalized.txt",
  col.names = TRUE, sep = "\t", row.names = TRUE, quote = FALSE
)
write.table(
  microarrayPatients,
  "./data/Supplementary_Table11_Microarray_classdefinitions.txt",
  col.names = TRUE, sep = "\t", row.names = TRUE, quote = FALSE
)
Sometimes the server times out. If you initially get an error, try running this block again.
Also, if you ran the code above immediately before this section, the downloaded tar file will have the same name but a different format. In that case, delete all tar files in the directory and re-run this block.
# Query the legacy GDC data for the open-access TCGA-OV RNA-Seq gene expression
# quantification files (Illumina HiSeq platform, file type "results").
query <- GDCquery(
  project = "TCGA-OV",
  data.category = "Gene expression",
  data.type = "Gene expression quantification",
  experimental.strategy = "RNA-Seq",
  platform = "Illumina HiSeq",
  access = "open",
  file.type = "results",
  legacy = TRUE
)
## --------------------------------------
## o GDCquery: Searching in GDC database
## --------------------------------------
## Genome of reference: hg19
## --------------------------------------------
## oo Accessing GDC. This might take a while...
## --------------------------------------------
## ooo Project: TCGA-OV
## --------------------
## oo Filtering results
## --------------------
## ooo By platform
## ooo By access
## ooo By experimental.strategy
## ooo By data.type
## ooo By file.type
## ----------------
## oo Checking data
## ----------------
## ooo Check if there are duplicated cases
## ooo Check if there results for the query
## -------------------
## o Preparing output
## -------------------
# Download the files matched by the RNA-seq query.
GDCdownload(query)
## Downloading data for project TCGA-OV
## Of the 309 files for download 309 already exist.
## All samples have been already downloaded
# Read the downloaded RNA-seq files into a single object; the expression matrix
# is extracted from it further down with assay().
OVRnaseqSE <- GDCprepare(query)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
## Downloading genome information (try:0) Using: Homo sapiens genes (GRCh37.p13)
## Loading from disk
## Starting to add information to samples
## => Add clinical information to samples
## Add FFPE information. More information at:
## => https://cancergenome.nih.gov/cancersselected/biospeccriteria
## => http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html
## => Adding subtype information to samples
# Build the RNA-seq count table and its matching class-definition table,
# restricted to patients with a subtype in the Verhaak class definitions, and
# write both to ./data.
rnaseq <- assay(OVRnaseqSE)
# Remove the duplicate genes (keep the first row per gene name) so the row names
# are unique. drop = FALSE keeps a matrix even if a single row or column remains.
rnaseq <- rnaseq[!duplicated(rownames(rnaseq)), , drop = FALSE]
# Pair every sample barcode (column 1) with its 12-character patient barcode
# (column 2), using "-" separators so it can be compared with the Verhaak IDs.
rnaseqPatients <- cbind(
  colnames(rnaseq),
  gsub("\\.", "-", substring(colnames(rnaseq), 1, 12))
)
# Only include patients that were included in the Verhaak dataset with a
# non-missing subtype.
rnaseq <- rnaseq[
  ,
  rnaseqPatients[, 2] %in%
    classDefinitions_verhaak[!is.na(classDefinitions_verhaak$SUBTYPE), "ID"],
  drop = FALSE
]
# Attach the subtype to every sample. merge() places the key column first, so the
# result columns are: 12-character patient ID, full sample barcode, SUBTYPE.
rnaseqPatients <- merge(
  rnaseqPatients,
  classDefinitions_verhaak[, c("ID", "SUBTYPE")],
  by.x = 2,
  by.y = 1
)
# NOTE: given that column order, "barcode" holds the 12-character patient ID and
# "patient" holds the full sample barcode (the value matching colnames(rnaseq)).
# The names are left unchanged because they become the header of the written file.
colnames(rnaseqPatients) <- c("barcode", "patient", "SUBTYPE")
# Change the order of the classes so mesenchymal and immunoreactive are first.
# Rows whose SUBTYPE is missing or not one of these four values are dropped here.
rnaseqPatients <- do.call(
  rbind,
  lapply(
    c("Mesenchymal", "Immunoreactive", "Differentiated", "Proliferative"),
    function(subtype) rnaseqPatients[which(rnaseqPatients$SUBTYPE == subtype), ]
  )
)
# Alternative (disabled): sort alphabetically by subtype instead.
#rnaseqPatients <- rnaseqPatients[order(rnaseqPatients$SUBTYPE),]
# Convert the barcodes so that they will be compatible with colnames
# (R doesn't like "-" in column names).
rnaseqPatients$patient <- gsub("-", "\\.", rnaseqPatients$patient)
colnames(rnaseq) <- gsub("-", "\\.", colnames(rnaseq))
# Reorder the count columns to follow the subtype-ordered sample table.
rnaseq <- rnaseq[
  ,
  order(match(colnames(rnaseq), rnaseqPatients$patient)),
  drop = FALSE
]
# Optional (disabled): keep only the Immunoreactive and Mesenchymal samples.
#rnaseq <- rnaseq[,which(colnames(rnaseq) %in% rnaseqPatients[which(rnaseqPatients$SUBTYPE== "Immunoreactive" | rnaseqPatients$SUBTYPE == "Mesenchymal" ),"patient"])]
# col.names is spelled out in full rather than relying on partial matching.
write.table(
  rnaseq,
  "./data/Supplementary_Table12_TCGA_RNASeq_rawcounts.txt",
  col.names = TRUE, sep = "\t", row.names = TRUE, quote = FALSE
)
write.table(
  rnaseqPatients,
  "./data/Supplementary_Table13_RNASeq_classdefinitions.txt",
  col.names = TRUE, sep = "\t", row.names = TRUE, quote = FALSE
)