Metadata

Sample metadata

date_types <- cols(lib_date = "c", seq_date = "c")

metadata         <- list()
metadata$patient <- 
  read_tsv_quiet(paths$patient_md) %>% 
  mutate(clinical_variant = fct_relevel(clinical_variant, "Sporadic"))

metadata$genome  <- read_tsv_quiet(paths$genome_md, col_types = date_types)
metadata$mrna    <- read_tsv_quiet(paths$mrna_md, col_types = date_types)
metadata$mirna   <- read_tsv_quiet(paths$mirna_md, col_types = date_types)

# Append site information to genome and mrna metadata
metadata <- map_at(metadata, c("genome", "mrna", "mirna"), 
                   append_info, "site", "clinical_variant")

# Set clinical_variant to centroblast/centrocyte for tonsil samples
metadata <- map_at(metadata, c("mrna", "mirna"), fix_clinical_variant)

We have incomplete HIV status for this cohort. For this reason, we only consider the sporadic and endemic variants. We will exclude any HIV-positive cases (that we know of).

# Remove HIV-positive cases
hiv_patients <- filter(metadata$patient, hiv_status == "Positive") %$% patient
metadata <- purrr::map(metadata, ~filter(.x, !patient %in% hiv_patients))

# Remove unpaired genome samples
metadata$genome <- 
  metadata$genome %>% 
  group_by(patient) %>% 
  filter(all(c("Tumor", "Normal") %in% tissue_status))

There happens to be a few patients with more than one RNA-seq dataset because both the FF and FFPE tumor tissue underwent library construction and sequencing. For now, we’ll simply be excluding the FFPE samples when an FF sample is available for the same patient.

metadata %<>% map_if(~ "ff_or_ffpe" %in% names(.x), remove_ffpe)

testthat::expect_true(all(table(metadata$mrna$sample) == 1))
testthat::expect_true(all(table(metadata$genome$patient) == 2))
metadata$patient <- 
  read_tsv_quiet(paths$patient_icgc_md) %>% {
  suppressWarnings(bind_rows(metadata$patient, .))} %>% 
  mutate(clinical_variant = as.factor(clinical_variant))

metadata$genome <- 
  read_tsv_quiet(paths$genome_icgc_md) %>% {
  suppressWarnings(bind_rows(metadata$genome, .))} %>% 
  mutate(clinical_variant = as.factor(clinical_variant))

get_icgc_donor <- 
  metadata$genome %>% 
  filter(grepl("^SP", biospecimen_id)) %$%
  setNames(patient_barcode, biospecimen_id)

Reference

HGNC gene symbols

To improve the interpretability of the results, I will use official HGNC symbols as gene names instead of Ensembl gene IDs.

ens86 <- useEnsembl("ENSEMBL_MART_ENSEMBL", "hsapiens_gene_ensembl",
                    host = "oct2016.archive.ensembl.org")

symbols <-
  getBM(c("ensembl_gene_id", "hgnc_symbol"), mart = ens86) %>% 
  rename(gene_id = ensembl_gene_id, symbol = hgnc_symbol) %>% 
  distinct() %>% 
  mutate(symbol = ifelse(symbol == "", gene_id, symbol)) %>% 
  group_by(gene_id) %>% 
  slice(1)

Gene/transcript metadata

This gene-to-transcript mapping will allow tximport to summarize the transcript counts reported by Salmon at the gene level. We also use this data frame to convert gene IDs into gene symbols.

tx2gene_cols <- c("transcript", "gene_id", "gene")
tx2gene      <- 
  read_tsv_quiet(paths$tx2gene, col_names = tx2gene_cols) %>%
  mutate(gene_id = fix_gene_ids(gene_id)) %>% 
  left_join(symbols, by = "gene_id") %>% 
  mutate(symbol = ifelse(grepl("ebv_|_PAR_Y", transcript), gene, symbol)) %>% 
  select(transcript, symbol, gene_id) %>% 
  as.data.frame()

# Useful functions and variables
ebv_genes   <- filter(tx2gene, grepl("^ebv_", transcript)) %$% symbol
get_gene_id <- function(name) filter(tx2gene, symbol %in% name) %$% unique(gene_id)
get_gene    <- function(id) slice(tx2gene, match(id, gene_id)) %$% symbol

Gene lists

genes         <- list()
genes$bl      <- read_lines(paths$bl_genes)
genes$mbl     <- read_lines(paths$mbl_genes) %>% intersect(tx2gene$symbol)
genes$morgan  <- read_lines(paths$morgan_genes) %>% intersect(tx2gene$symbol)
genes$wright  <- read_lines(paths$wright_genes) %>% get_gene()
genes$malaria <- read_lines(paths$malaria_genes) %>% get_gene()
latency_genes <- read_tsv(paths$latency_genes)

Gene Annotations

chroms <- paste0("chr", c(1:22, "X"))

ens75 <- useEnsembl("ENSEMBL_MART_ENSEMBL", "hsapiens_gene_ensembl",
                    host = "feb2014.archive.ensembl.org")

genes_gr <-
  getBM(mart = ens75,
        attributes = c("ensembl_gene_id", "hgnc_symbol", 
                       "chromosome_name", "start_position", "end_position"),
        filters = c("with_hgnc"),
        values = list(c(TRUE))) %>% 
  rename(gene_id = ensembl_gene_id, symbol = hgnc_symbol,
         chrom = chromosome_name, start = start_position, end = end_position) %>% 
  distinct() %>% 
  mutate(chrom = sub("^", "chr", chrom)) %>% 
  filter(chrom %in% chroms) %>% 
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

Results

Sex and EBV status

We inferred sex and EBV status from the sequencing data because the clinical annotations were incomplete.

status <- list()

status$sex <- 
  read_tsv_quiet(paths$sex) %>% 
  mutate(
    patient = ifelse(
      grepl("^SP", sample), get_icgc_donor[sample], get_patient_id(sample))) %>% 
  select(-sample, -chrY_chrX_read_ratio)

status$ebv_dna <- 
  read_tsv_quiet(paths$ebv_dna) %>%
  mutate(
    patient = ifelse(
      grepl("^SP", sample), get_icgc_donor[sample], get_patient_id(sample)),
    ebv_status = ifelse(
      grepl("^SP", sample), "NA", ebv_status),
    ebv_type = ifelse(
      grepl("^SP", sample), "NA", ebv_type)) %>% 
  select(-sample)

status$ebv_rna <- 
  read_tsv_quiet(paths$ebv_rna) %>%
  mutate(
    patient = ifelse(
      grepl("^SP", sample), get_icgc_donor[sample], get_patient_id(sample))) %>% 
  select(-sample, -ebv_status, -ebv_type)

# Track EBV read ratios
ebv_ratios <- 
  status$ebv_dna %>% 
  select(patient, ebv_status, dna_read_ratio = ebv_read_ratio) %>% 
  left_join(status$ebv_rna, by = "patient") %>% 
  rename(rna_read_ratio = ebv_read_ratio)

# Check if inferred sex status is consistent with clinical annotations
is_sex_consistent <- 
  metadata$patient %>% 
  inner_join(status$sex, by = "patient") %$% 
  ifelse(!is.na(annotated_sex), annotated_sex == sex, TRUE)
testthat::expect_true(all(is_sex_consistent))

# Fill in any blanks using the clinical annotations
status$sex <- 
  metadata$patient %>% 
  left_join(status$sex, by = "patient") %>% 
  mutate(sex = ifelse(is.na(sex), annotated_sex, sex)) %>% 
  select(patient, sex)

status$ebv_dna <- 
  metadata$patient %>% 
  left_join(status$ebv_dna, by = "patient") %>% 
  mutate(
    ebv_status = ifelse(grepl("^TS", patient), "Negative", ebv_status),
    ebv_type   = ifelse(grepl("^TS", patient), "None", ebv_type)) %>% 
  select(patient, ebv_status, ebv_type)

metadata <- map(metadata, ~left_join(.x, status$sex, by = "patient"))
metadata <- map(metadata, ~left_join(.x, status$ebv_dna, by = "patient"))
metadata$genome <- 
  read_tsv_quiet(paths$tc) %>%
  bind_rows(read_tsv_quiet(paths$tc_icgc)) %>% 
  select(biospecimen_id = Tumor_Sample_Barcode, tumour_content = Tumour_Content) %>% 
  left_join(metadata$genome, ., by = "biospecimen_id")

metadata$genome <- 
  read_tsv_quiet(paths$tn) %>%
  select(patient_barcode = bcr_patient_barcode, tumour_nuclei = tumor_nuclei_percent) %>% 
  left_join(metadata$genome, ., by = "patient_barcode") %>% 
  mutate(tumour_nuclei = ifelse(tissue_status == "Tumor", tumour_nuclei, NA_real_))

Strelka

maf <- read_maf(paths$mmaf)

# Update variant classification with simpler categories
maf@data$Variant_Classification <- get_categories(maf@data$Consequence)
maf@maf.silent$Variant_Classification <- get_categories(maf@maf.silent$Consequence)

# Restrict to cases with metadata
maf <- 
  metadata$genome %$% 
  gsub("-", ".", biospecimen_id) %>% 
  subsetMaf(maf, tsb = ., includeSyn = TRUE, mafObj = TRUE) %>% 
  process_maf(get_icgc_donor)

# Estimate tumour content
maf@data <- 
  estimate_tc(maf@data) %>% 
  as.data.table() %>% 
  setkey(patient) %>% 
  merge(setkey(maf@data, patient), .)

# Add FF/FFPE status and tumour_content to MAF
maf@data <- 
  metadata$genome %>% 
  filter(tissue_status == "Tumor") %>% 
  select(patient, ff_or_ffpe, ebv_type, ebv_status, clinical_variant, sex) %>% 
  as.data.table() %>% 
  setkey(patient) %>% 
  merge(maf@data, ., all = FALSE)
maf_all <- suppressWarnings(fread(paths$maf_all, showProgress = FALSE))
  
# Restrict to cases with metadata
maf_all <- maf_all[!grepl("^SP", Tumor_Sample_Barcode), 
                   patient := get_patient_id(Tumor_Sample_Barcode)]
maf_all <- maf_all[grepl("^SP", Tumor_Sample_Barcode), 
                   patient := get_icgc_donor[as.character(Tumor_Sample_Barcode)]]
maf_all <- maf_all[, biospecimen_id := Tumor_Sample_Barcode]
maf_all <- maf_all[, t_vaf := t_alt_count / (t_alt_count + t_ref_count)]
setkey(maf_all, patient)

maf_all <- 
  estimate_tc(maf_all) %>% 
  as.data.table() %>% 
  setkey(patient) %>% 
  merge(maf_all, .)

maf_all <- 
  metadata$genome %>% 
  filter(biospecimen_id %in% maf_all$Tumor_Sample_Barcode) %>% 
  select(patient, ff_or_ffpe) %>%
  distinct() %>% 
  as.data.table() %>% 
  setkey(patient) %>% 
  merge(maf_all, .)
mmaf <- read_maf(paths$mmaf)

# Update variant classification with simpler categories
mmaf@data$Variant_Classification       <- get_categories(mmaf@data$Consequence)
mmaf@maf.silent$Variant_Classification <- get_categories(mmaf@maf.silent$Consequence)

mmaf <- subsetMaf(
  mmaf, query = "Variant_Classification %in% c('Truncation', 'Missense', 'Splicing')", 
  includeSyn = TRUE, mafObj = TRUE)

setkey(mmaf@data, Tumor_Sample_Barcode)

mmaf@data <-
  metadata$genome %>% 
  select(Tumor_Sample_Barcode = biospecimen_id, clinical_variant, ebv_type) %>% 
  mutate(Tumor_Sample_Barcode = gsub("-", ".", Tumor_Sample_Barcode)) %>% 
  as.data.table() %>% 
  setkey(Tumor_Sample_Barcode) %>% 
  merge(mmaf@data, ., all.x = TRUE) %>% 
  replace_na(list(clinical_variant = "Sporadic", ebv_type = "None"))

Significantly mutated genes

smgs <- 
  read_tsv_quiet(paths$smgs, col_names = c("gene", "num_methods")) %>% 
  filter(num_methods > 1) %$%
  gene

Sequenza

ffpe_samples <- metadata$genome %$% biospecimen_id[ff_or_ffpe == "FFPE"]

sequenza <- 
  Sys.glob(file.path(paths$sequenza, "*", "*_segments.txt")) %>% 
  set_names(get_sample_id(.)) %>% 
  map_df(read_tsv_quiet, .id = "biospecimen_id") %>% 
  rename(start = start.pos, end = end.pos) %>% 
  filter(
    !biospecimen_id %in% ffpe_samples,
    chromosome %in% paste0("chr", seq(1, 22)),
    !is.na(CNt)) %>% 
  mutate(patient = get_patient_id(biospecimen_id))

Manta

manta_raw <- 
  Sys.glob(paths$manta) %>% 
  set_names(get_sample_id(.)) %>% 
  map(VariantAnnotation::readVcf) %>% 
  map(filter_pass) %>% 
  map(calc_tvaf) %>% 
  map_df(vcf_to_df, .id = "biospecimen_id") %>% 
  as_tibble() %>% 
  inner_join(select(metadata$genome, biospecimen_id, patient, ff_or_ffpe), 
             by = "biospecimen_id") %>% 
  inner_join(estimate_tc(maf@data), by = "patient")

Annotations

Now that the sex and EBV status are incorporated into the metadata data frames, we can create an annotations data frame. This is useful for certain tools like DEseq2 and pheatmap.

annotations <- 
  metadata$mrna %>% 
  mutate(cell_sorting = ifelse(is_cell_sorted(biospecimen_id), "Sorted", "Unsorted")) %>% 
  select(sample, clinical_variant, ebv_type, sex, ff_or_ffpe, 
         cell_sorting, site, tissue, lib_date) %>% 
  mutate_at(c("lib_date"), function(x) substr(x, 1, 7)) %>% 
  mutate_if(function(x) !is.factor(x), funs(fill_na)) %>% 
  mutate_each("as.factor", -sample) %>% 
  mutate(
    clinical_variant = fct_relevel(clinical_variant, "Sporadic", "Endemic"),
    ebv_type = fct_relevel(ebv_type, "Type 1", "Type 2")) %>% 
  as.data.frame() %>% 
  column_to_rownames("sample")

annotations_mirna <- 
  metadata$mirna %>% 
  mutate(cell_sorting = ifelse(is_cell_sorted(biospecimen_id), "Sorted", "Unsorted")) %>% 
  select(sample, clinical_variant, ebv_type, sex, ff_or_ffpe, 
         cell_sorting, site, tissue) %>% 
  mutate_if(function(x) !is.factor(x), funs(fill_na)) %>% 
  mutate_each("as.factor", -sample) %>% 
  mutate(
    clinical_variant = fct_relevel(clinical_variant, "Sporadic", "Endemic"),
    ebv_type = fct_relevel(ebv_type, "Type 1", "Type 2")) %>% 
  as.data.frame() %>% 
  column_to_rownames("sample")

gannotations <- 
  metadata$genome %>% 
  filter(tissue_status == "Tumor") %>% 
  mutate(cell_sorting = ifelse(is_cell_sorted(biospecimen_id), "Sorted", "Unsorted")) %>% 
  select(sample, clinical_variant, ebv_type, sex, 
         ff_or_ffpe, cell_sorting, site, tissue) %>% 
  mutate_if(function(x) !is.factor(x), funs(fill_na)) %>% 
  mutate_each("as.factor", -sample) %>% 
  mutate(
    clinical_variant = fct_relevel(clinical_variant, "Sporadic", "Endemic"),
    ebv_type = fct_relevel(ebv_type, "Type 1", "Type 2")) %>% 
  as.data.frame() %>% 
  column_to_rownames("sample")

Legend colours

Another thing we can do now that the metadata is loaded is define the legend colours.

colours <- 
  suppressWarnings(bind_rows(annotations, gannotations)) %>% 
  get_legend_colours()

display_colours(colours, 2)

matrix(rnorm(nrow(annotations)), nrow = 1) %>% 
  set_colnames(rownames(annotations)) %>% 
  pheatmap::pheatmap(annotation_col = rev(annotations), annotation_colors = colours, 
                     cluster_rows = FALSE, cluster_cols = FALSE)

Salmon

We measured expression using Salmon (quasi-alignment). The transcriptome we used it Gencode v25 supplemented with EBV transcripts.

# Load Salmon data
salmon     <- list()
salmon$txi <- load_salmon(paths$salmon, tx2gene, metadata$mrna$biospecimen_id)

# Load raw data as DESeq2 dataset
salmon$raw        <- list()
salmon$col_data   <- annotations[colnames(salmon$txi$counts),]
salmon$raw$dds    <- DESeqDataSetFromTximport(salmon$txi, salmon$col_data, ~ 1)
salmon$raw$dds    <- estimateSizeFactors(salmon$raw$dds)
salmon$raw$counts <- counts(salmon$raw$dds, normalized = TRUE)

miRNA Expression

mirna     <- list()
mirna$raw <- list()

outliers <- c("BL081")

mirna$raw$df <- 
  read_tsv_quiet(paths$mirna) %>% 
  select(-ends_with("-31")) %>% 
  set_names(names(.) %>% get_sample_id()) %>% 
  select(gene = Gene, one_of(metadata$mirna$biospecimen_id)) %>%
  set_names(get_sample_short_id(names(.))) %>% 
  select(-one_of(outliers))

mirna$raw$mat <- 
  mirna$raw$df %>% 
  as.data.frame() %>% 
  column_to_rownames("gene") %>% 
  as.matrix()

mirna$raw$dds  <- DESeqDataSetFromMatrix(
  mirna$raw$mat, annotations_mirna[colnames(mirna$raw$mat),], design = ~1)
mirna$raw$dds  <- estimateSizeFactors(mirna$raw$dds)
mirna$raw$cts  <- counts(mirna$raw$dds, normalized = TRUE)
mirna$raw$lcts <- log(1 + mirna$raw$cts)
mirna$norm <- list()

mirna$norm$df <- 
  mirna$raw$df %>% 
  mutate_at(vars(-gene), function(x) {x / sum(x) * 1000000})

mirna$norm$mat <- 
  mirna$norm$df %>% 
  as.data.frame() %>% 
  column_to_rownames("gene") %>% 
  as.matrix()