Packages

I load the packages I use the most while trying to avoid namespace conflicts. Any others are used with the :: syntax.

# Load packages in order of increasing importance
library(magrittr)
library(data.table)
library(matrixStats)
library(biomaRt)
library(NMF)
library(DESeq2)
library(maftools)
library(forcats)
library(tidyverse)

File Paths

Below are the paths to the files that will be used in this report.

paths <- list()

# Metadata
paths$patient_md      <- project("data/metadata/patient_metadata.tsv")
paths$patient_icgc_md <- project("data/metadata/patient_metadata.icgc.tsv")
paths$genome_md       <- project("data/metadata/genome_metadata.bams.tsv")
paths$genome_icgc_md  <- project("data/metadata/genome_metadata.icgc.tsv")
paths$mrna_md         <- project("data/metadata/mrna_metadata.bams.tsv")
paths$mirna_md        <- project("data/metadata/mirna_metadata.bams.tsv")
paths$tn              <- project("data/metadata/tumour_nuclei_percent.tsv")

# Reference
paths$bl_genes      <- project("etc/bl.genes.txt")
paths$mbl_genes     <- project("etc/mbl_signature.genes.txt")
paths$morgan_genes  <- project("etc/morgan.genes.txt")
paths$wright_genes  <- project("etc/wright.genes.txt")
paths$malaria_genes <- project("etc/malaria.genes.txt")
paths$latency_genes <- project("etc/latency.genes.txt")
paths$seq           <- project("ref/GRCh38/Sequence")
paths$seq_grch37    <- project("ref/GRCh37-lite/Sequence")
paths$annot         <- project("ref/GRCh38/Annotation")
paths$genome        <- file.path(paths$seq, "WholeGenomeFasta/genome.simple_headers.fa")
paths$trans         <- file.path(paths$seq, "GencodeTranscriptome/25")
paths$tx2gene       <- file.path(paths$trans, "gencode_v25_and_NC_007605.tx2gene.tsv")
paths$gaps          <- file.path(paths$annot, "ExcludedRegions/gaps.bed")
paths$genome_grch37 <- file.path(paths$seq_grch37, "WholeGenomeFasta/genome.simple_headers.fa")

# Results
paths$ebv_dna  <- project("results/status/ebv_status.dna.txt")
paths$ebv_rna  <- project("results/status/ebv_status.rna.txt")
paths$sex      <- project("results/status/sex_status.txt")
paths$strelka  <- project("results/strelka/3-tabulate")
paths$tc       <- file.path(paths$strelka, "tumour_content.tsv")
paths$tc_icgc  <- file.path(paths$strelka, "icgc.tumour_content.tsv")
paths$maf      <- file.path(paths$strelka, "strelka.noncanonical.qss_filt.vaf_filt.aug.maf")
paths$maf_all  <- file.path(paths$strelka, "all.strelka.canonical.qss_filt.aug.maf")
paths$mmaf     <- project("results/smgs/data/BL_META.grch37.maf")
paths$smgs     <- project("results/smgs/results/meta_analysis/BL_META.genes.txt")
paths$sequenza <- project("results/sequenza/2-sequenza/force_tc")
paths$gistic   <- project("results/gistic")
paths$manta    <- project("results/manta/1-manta/*/results/variants/somaticSV.vcf.gz")
paths$salmon   <- project("results/salmon")
paths$mirna    <- project("results/gsc_mirna/mirna_expression.tsv")

Global Variables

We define variables that are generally useful below.

# Set RNG seed for reproducibility
rng_seed <- 87510475  # Generated by runif(1, 0, 10^8)
set.seed(rng_seed)

# Use all available cores when multithreading
num_cores <- parallel::detectCores()
doMC::registerDoMC(cores = num_cores)
BiocParallel::register(BiocParallel::MulticoreParam(workers = num_cores))

# Effective genome size
genome_size <- 2934876451

# Number of most variably expressed genes
ntop <- 1000

# Other options
options(scipen=10)