barplot_maf_mutfreq_data <-
  maf@data %>%
  dplyr::count(patient, ff_or_ffpe, Variant_Type) %>%
  dplyr::ungroup() %>%
  tidyr::spread(Variant_Type, n) %>%
  dplyr::mutate(patient = forcats::fct_reorder(patient, -SNP)) %>%
  tidyr::gather(Variant_Type, n, -patient, -ff_or_ffpe) %>%
  dplyr::mutate(
    Variant_Type = dplyr::recode_factor(
      Variant_Type, SNP = "SNV", INS = "Insertion", DEL = "Deletion"),
    Variant_Class = dplyr::recode_factor(
      Variant_Type, SNV = "SNVs", Insertion = "Indels", Deletion = "Indels"),
    cohort = ifelse(grepl("^BL", patient), "BLGSP", "ICGC"))

barplot_maf_mutfreq_stats <-
  barplot_maf_mutfreq_data %>%
  dplyr::group_by(Variant_Class, patient) %>%
  dplyr::summarise(n = sum(n)) %>%
  dplyr::summarise(n_patients = dplyr::n_distinct(patient),
                   max = max(n),
                   mean = round(mean(n)))

barplot_maf_mutfreq_ff_or_ffpe <- 
  ggplot(barplot_maf_mutfreq_data) +
  geom_col(aes(x = patient, y = n, fill = Variant_Type)) +
  geom_hline(data = barplot_maf_mutfreq_stats, aes(yintercept = mean),
             colour = "red", linetype = 2) +
  facet_grid(Variant_Class ~ ff_or_ffpe, scales = "free", space = "free_x") +
  scale_x_discrete(labels = NULL) +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "top") +
  rotate_x_text() +
  labs(x = "Patient", y = "Frequency", fill = "Variant Type")

barplot_maf_mutfreq_ff_or_ffpe

barplot_maf_mutfreq_cohort <- 
  barplot_maf_mutfreq_ff_or_ffpe +
  facet_grid(Variant_Class ~ cohort, scales = "free", space = "free_x")

barplot_maf_mutfreq_cohort

maf_mutfreq_age <- 
  metadata$patient %>% 
  select(patient, age = age_at_initial_pathologic_diagnosis, ebv_type, clinical_variant) %>% 
  inner_join(barplot_maf_mutfreq_data, by = "patient") %>% 
  drop_na() %>% 
  filter(
    Variant_Class == "SNVs",
    patient != "BL142",
    age < 20,
    n < 50000) 

scatterplot_maf_mutfreq_age <- 
  maf_mutfreq_age %>% 
  ggplot(aes(age, n)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
  labs(x = "Age", y  = "SNV frequency")

scatterplot_maf_mutfreq_age

maf_titv <- titv(maf, useSyn = TRUE, plot = FALSE)

plotTiTv(maf_titv)

maf_tnm <- trinucleotideMatrix(maf, paths$genome_grch37, useSyn = TRUE)
maf_sig <- extractSignatures(maf_tnm, nTry = 6, plotBestFitRes = FALSE)
   method   seed rng metric rank sparseness.basis sparseness.coef      rss      evar
1: brunet random   3     KL    2        0.3029940       0.3938290 18005848 0.9513736
2: brunet random   5     KL    3        0.3498756       0.3776821  5358215 0.9855297
3: brunet random   4     KL    4        0.3803052       0.3181127  3448177 0.9906879
4: brunet random   2     KL    5        0.3890917       0.2558792  2991051 0.9919224
5: brunet random   1     KL    6        0.4013064       0.2792355  2541249 0.9931371
   silhouette.coef silhouette.basis residuals niter   cpu cpu.all nrun cophenetic dispersion
1:       1.0000000        1.0000000  25270.52   510 0.208  15.686   10  0.9950794  0.9498402
2:       0.8275758        0.7747530  18418.03  1060 0.386  15.708   10  0.9819025  0.8571356
3:       0.7297531        0.7486916  13965.73   780 0.238  16.756   10  0.9761288  0.8512303
4:       0.5195405        0.6024173  12421.42   880 0.353  19.530   10  0.9420168  0.7367003
5:       0.3865825        0.5544854  11294.23  1970 0.823  18.969   10  0.9326400  0.7234075
   silhouette.consensus
1:            0.9790992
2:            0.9059049
3:            0.9029230
4:            0.7321543
5:            0.6582883
plotSignatures(maf_sig)

corrplot::corrplot(maf_sig$coSineSimMat, is.corr = FALSE, tl.cex = 0.6, 
                   tl.col = 'black', cl.cex = 0.6)

maf_sig$contributions %>% 
  set_colnames(get_patient_id(colnames(.))) %>% 
  plot_heatmap(colours, gannotations, cluster_rows = FALSE)

mut_counts <- maf@data[Variant_Type == "SNP", 
                       .(N = .N, ebv_status), 
                       by = "Tumor_Sample_Barcode"] %>% 
  distinct()

maf_signatures_mut_burden <- 
  maf_sig$contributions %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column("Tumor_Sample_Barcode") %>% 
  as.data.table() %>% 
  merge(mut_counts, by = "Tumor_Sample_Barcode") %>% 
  gather(signature, value, starts_with("Sig"))

scatterplot_maf_signatures_mut_burden <- 
  maf_signatures_mut_burden %>% 
  filter(N < 50000) %>%
  ggplot(aes(N, value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_grid(~ signature) +
  labs(x = "SNV frequency", y = "Signature strength")

scatterplot_maf_signatures_mut_burden

boxplot_maf_signatures_ebv <- 
  maf_signatures_mut_burden %>% 
  filter(N < 50000) %>%
  ggplot(aes(ebv_status, value, fill = ebv_status)) +
  geom_boxplot(notch = TRUE) +
  facet_grid(~ signature) +
  scale_fill_manual(values = colours$ebv_status, breaks = NULL) +
  labs(x = "EBV status", y = "Signature strength")

boxplot_maf_signatures_ebv

boxplot_maf_vaf <- 
  maf@data %>% 
  mutate(
    patient = fct_reorder(patient, t_vaf, function(x) -median(x)),
    is_bl_gene = Hugo_Symbol %in% genes$bl & is_nonsynonymous(Consequence)) %>% 
  ggplot(aes(patient, t_vaf)) + 
  geom_boxplot(fill = "grey", outlier.colour = NA) +
  geom_hline(yintercept = c(0.25)) +
  scale_x_discrete(labels = NULL) +
  scale_y_continuous(breaks = seq(0, 1, 0.1), limits = c(0, 1)) +
  facet_grid(~ff_or_ffpe, scales = "free_x", space = "free_x") +
  rotate_x_text() + 
  labs(x = "Patient", y = "Tumor variant allele fraction")

boxplot_maf_vaf

nonsyn_query <- "Variant_Classification %in% c('Missense', 'Truncation', 'Splicing', 'Multi_Hit')"
maf_ebl <- subsetMaf(maf, query = paste("clinical_variant == 'Endemic' &", nonsyn_query), 
                     includeSyn = TRUE, mafObj = TRUE, genes = smgs)
maf_sbl <- subsetMaf(maf, query = paste("clinical_variant == 'Sporadic' &", nonsyn_query), 
                     includeSyn = TRUE, mafObj = TRUE, genes = smgs)

maf_ebl_vs_sbl <- mafCompare(maf_ebl, maf_sbl, 'Endemic BL', 'Sporadic BL', minMut = 4)

sig_genes_ebv_vs_sbl <- 
  maf_ebl_vs_sbl$results %>% 
  filter(Hugo_Symbol %in% smgs) %$% 
  Hugo_Symbol[pval < 0.05][order(or[pval < 0.05])]

if (length(sig_genes_ebv_vs_sbl) > 0) {
  cooncoplot_maf_ebl_vs_sbl <- coOncoplot(
    maf_sbl, maf_ebl, sig_genes_ebv_vs_sbl, 
    m1Name = 'Sporadic BL', m2Name = 'Endemic BL',
    removeNonMutated = FALSE, colors = colours$categs)
  
  # print(cooncoplot_maf_ebl_vs_sbl)
}

maf_ebvneg <- subsetMaf(maf, query = paste("ebv_status == 'Negative' &", nonsyn_query), 
                         includeSyn = TRUE, mafObj = TRUE, genes = smgs)
maf_ebvpos <- subsetMaf(maf, query = paste("ebv_status == 'Positive' &", nonsyn_query), 
                        includeSyn = TRUE, mafObj = TRUE, genes = smgs)

maf_ebvneg_vs_ebvpos <- mafCompare(maf_ebvneg, maf_ebvpos, 
                                   'EBV-negative', 'EBV-positive', minMut = 4)

sig_genes_ebvneg_vs_ebvpos <- 
  maf_ebvneg_vs_ebvpos$results %>% 
  filter(Hugo_Symbol %in% smgs) %$% 
  Hugo_Symbol[pval < 0.05][order(or[pval < 0.05])]

if (length(sig_genes_ebvneg_vs_ebvpos) > 0) {
  cooncoplot_maf_ebvneg_vs_ebvpos <- coOncoplot(
    maf_ebvneg, maf_ebvpos, sig_genes_ebvneg_vs_ebvpos, 
    m1Name = 'EBV-negative', m2Name = 'EBV-positive',
    removeNonMutated = FALSE, colors = colours$categs)
  
  # print(cooncoplot_maf_ebvneg_vs_ebvpos)
}

maf_ebvneg <- subsetMaf(maf, query = paste("ebv_status %in% c('Negative', 'NA') &", nonsyn_query), 
                         includeSyn = TRUE, mafObj = TRUE, genes = smgs)
maf_ebvpos <- subsetMaf(maf, query = paste("ebv_status == 'Positive' &", nonsyn_query), 
                        includeSyn = TRUE, mafObj = TRUE, genes = smgs)

maf_ebvneg_vs_ebvpos <- mafCompare(maf_ebvneg, maf_ebvpos, 
                                   'EBV-negative', 'EBV-positive', minMut = 4)

sig_genes_ebvneg_vs_ebvpos <- 
  maf_ebvneg_vs_ebvpos$results %>% 
  filter(Hugo_Symbol %in% smgs) %$% 
  Hugo_Symbol[pval < 0.05][order(or[pval < 0.05])]

if (length(sig_genes_ebvneg_vs_ebvpos) > 0) {
  cooncoplot_maf_ebvneg_vs_ebvpos <- coOncoplot(
    maf_ebvneg, maf_ebvpos, sig_genes_ebvneg_vs_ebvpos, 
    m1Name = 'EBV-negative', m2Name = 'EBV-positive',
    removeNonMutated = FALSE, colors = colours$categs)
  
  # print(cooncoplot_maf_ebvneg_vs_ebvpos)
}

maf_ebv1 <- subsetMaf(maf, query = paste("ebv_type == 'Type 1' &", nonsyn_query), 
                         includeSyn = TRUE, mafObj = TRUE, genes = smgs)
maf_ebv2 <- subsetMaf(maf, query = paste("ebv_type == 'Type 2' &", nonsyn_query), 
                        includeSyn = TRUE, mafObj = TRUE, genes = smgs)


maf_ebv1_vs_ebv2 <- mafCompare(maf_ebv1, maf_ebv2, 
                               'EBV Type 1', 'EBV Type 2', minMut = 4)

sig_genes_ebv1_vs_ebv2 <- 
  maf_ebv1_vs_ebv2$results %>% 
  filter(Hugo_Symbol %in% smgs) %$% 
  Hugo_Symbol[pval < 0.05][order(or[pval < 0.05])]

if (length(sig_genes_ebv1_vs_ebv2) > 0) {
  cooncoplot_maf_ebv1_vs_ebv2 <- coOncoplot(
    maf_ebv1, maf_ebv2, c(sig_genes_ebv1_vs_ebv2, "TP53"), 
    m1Name = 'EBV Type 1', m2Name = 'EBV Type 2',
    removeNonMutated = FALSE, colors = colours$categs)

  # print(cooncoplot_maf_ebv1_vs_ebv2)
}
boxplot_maf_compare_all_mutfreq_cv <- 
  maf@data %>% 
  count(clinical_variant, Tumor_Sample_Barcode) %>% 
  filter(n < 50000) %>% 
  ggplot(aes(clinical_variant, n, fill = clinical_variant)) + 
  geom_boxplot(notch = TRUE) +
  scale_fill_manual(values = colours$clinical_variant, breaks = NULL) +
  labs(x = "Clinical variant", y = "Mutation frequency")

boxplot_maf_compare_all_mutfreq_ebv <- 
  maf@data %>% 
  count(ebv_status, Tumor_Sample_Barcode) %>% 
  filter(n < 50000) %>% 
  ggplot(aes(ebv_status, n, fill = ebv_status)) + 
  geom_boxplot(notch = TRUE) +
  scale_fill_manual(values = colours$ebv_status, breaks = NULL) +
  # scale_x_discrete(limits = c("Positive", "Negative")) +
  labs(x = "EBV type", y = "Mutation frequency")

gridExtra::grid.arrange(
  boxplot_maf_compare_all_mutfreq_cv, boxplot_maf_compare_all_mutfreq_ebv, ncol = 2)

boxplot_maf_compare_nonsyn_smg_mutfreq_cv <- 
  maf@data %>% 
  filter(is_nonsynonymous(Consequence), Hugo_Symbol %in% smgs) %>% 
  count(clinical_variant, Tumor_Sample_Barcode) %>% 
  ggplot(aes(clinical_variant, n, fill = clinical_variant)) + 
  geom_boxplot(notch = TRUE) +
  scale_fill_manual(values = colours$clinical_variant, breaks = NULL) +
  labs(x = "Clinical variant", y = "Mutation frequency")

boxplot_maf_compare_nonsyn_smg_mutfreq_ebv <- 
  maf@data %>% 
  filter(is_nonsynonymous(Consequence), Hugo_Symbol %in% smgs) %>% 
  count(ebv_status, Tumor_Sample_Barcode) %>% 
  ggplot(aes(ebv_status, n, fill = ebv_status)) + 
  geom_boxplot(notch = TRUE) +
  scale_fill_manual(values = colours$ebv_status, breaks = NULL) +
  labs(x = "EBV status", y = "Mutation frequency")

gridExtra::grid.arrange(
  boxplot_maf_compare_nonsyn_smg_mutfreq_cv, boxplot_maf_compare_nonsyn_smg_mutfreq_ebv, ncol = 2)

ig_loci <- GRanges(
  c("chr2", "chr8", "chr14", "chr22"),
  IRanges(c(88999518, 128747246, 106020156, 22942470),
          c(89599757, 128753246, 107349540, 23342167)))

chroms    <- c(paste0("chr", 1:22), "chrX")
seqlens   <- seqlengths(BSgenome.Hsapiens.UCSC.hg19::Hsapiens)[chroms]
gen       <- data.frame(chroms, seqlens) %>% set_colnames(c("V1", "V2"))

gaps_gr <-
  read_tsv_quiet(paths$gaps) %>%
  rename(seqnames = `#chrom`, start = chromStart, end = chromEnd) %>%
  makeGRangesFromDataFrame()

bins <-
  tileGenome(seqlens, tilewidth = 5000, cut.last.tile.in.chrom = TRUE) %>%
  subsetByOverlaps(ig_loci, invert = TRUE)

maf_grl <-
  maf@data %>% 
  .[, .(biospecimen_id, Chromosome, Start_Position, End_Position, 
        HGVSp_Short, Consequence)] %>% 
  .[, Chromosome := sub("^", "chr", Chromosome)] %>% 
  .[Chromosome %in% chroms] %>% 
  as.data.frame() %>%
  filter_nonsyn(inverse = TRUE) %>%
  makeGRangesListFromDataFrame(
    split.field = "biospecimen_id",
    seqnames.field = "Chromosome",
    start.field = "Start_Position",
    end.field = "End_Position")

mut_counts <-
  map(as.list(maf_grl), ~countOverlaps(bins, .x)) %>%
  map(pmin, 1) %>%
  invoke(cbind, .) %>%
  rowSums()

mut_counts_fit <- fitdistrplus::fitdist(
  mut_counts, distr = "binom", start = list(prob = 0.01), 
  fix.arg = list(size = length(maf_grl)))

qval_thres <- 1e-9

mut_counts_df <-
  bins %>%
  as.data.frame() %>%
  inset("count", value = mut_counts) %>%
  select(seqnames, start, end, count) %>%
  mutate(
    pval = map_dbl(count, ~ binom.test(.x, n = length(maf_grl), 
                                       p = mut_counts_fit$estimate, 
                                       alternative = "greater")$p.value),
    qval = p.adjust(pval, "BH"),
    nlog_qval = -log10(qval),
    signif = qval < qval_thres)

closest_genes_idx <- 
  mut_counts_df %>% 
  filter(signif) %>% 
  makeGRangesFromDataFrame() %>%
  GenomicRanges::nearest(genes_gr) 

symbol_annotations <- 
  genes_gr[closest_genes_idx] %>% 
  as.data.frame() %>% 
  select(symbol) %>% 
  bind_cols(filter(mut_counts_df, signif), .) %>% 
  group_by(symbol) %>% 
  top_n(1, -qval) %>% 
  select(seqnames, start, end, symbol)
manhattanplot_maf_noncoding_mut_density <- 
  mut_counts_df %>%
  left_join(symbol_annotations, by = c("seqnames", "start", "end")) %>% 
  mutate(
    seqnames = sub("chr", "", seqnames),
    seqnames = factor(seqnames, levels = c(1:22, "X")),
    colour = ifelse(as.integer(seqnames) %% 2, "even", "odd")) %>% 
  filter(nlog_qval > 0) %>%
  ggplot(aes(start, nlog_qval, label = symbol, colour = colour)) +
  geom_point() +
  ggrepel::geom_text_repel(nudge_y = 3, fontface = "bold", size = 8) +
  geom_hline(yintercept = -log10(qval_thres), linetype = 2, colour = "grey40") +
  scale_x_continuous(labels = NULL) +
  scale_colour_manual(values = c(even = "#80b1d3", odd = "#b3de69"), breaks = NULL) +
  facet_grid(~seqnames, scales = "free_x", space = "free_x") +
  theme(
    panel.spacing = unit(0, "lines"),
    axis.ticks.x = element_blank()) + 
  labs(x = "Chromosome", y = "Negative log Q-value")

gt <- ggplot_gtable(ggplot_build(manhattanplot_maf_noncoding_mut_density))
gt$layout$clip[grep("panel", gt$layout$name)] <- "off"
grid::grid.draw(gt)

waterfallplot_maf <- 
  subsetMaf(maf, query = "Variant_Classification %in% c('Truncation', 'Missense', 'Splicing')", 
          mafObj = TRUE) %>% 
  oncoplot(genes = smgs, colors = colours$categs, removeNonMutated = FALSE)

# print(waterfallplot_maf)
gprotein_genes <- c("GNA13", "P2RY8", "RHOA", "GNAI2", "ARHGEF40")

# maf@data %>%
#   filter(Hugo_Symbol %in% gprotein_genes) %>%
#   select(Hugo_Symbol, Tumor_Sample_Barcode) %>%
#   bind_rows(tibble(Hugo_Symbol = "DROP", Tumor_Sample_Barcode = unique(maf@data$Tumor_Sample_Barcode))) %>%
#   mutate(status = 0) %>%
#   distinct() %>%
#   spread(Hugo_Symbol, status, fill = 1, drop = FALSE) %>%
#   select(-Tumor_Sample_Barcode, -DROP) %>%
#   map(as.factor) %>%
#   table() %>%
#   c() %>%
#   cometExactTest::comet_exact_test()

oncostrip_maf_gproteins <- oncostrip(maf, genes = gprotein_genes, 
                                     colors = colours$categs, removeNonMutated = FALSE)

print(oncostrip_maf_gproteins)

swisnf_genes <- c("ARID1A", "SMARCA4")

oncostrip_maf_swisnf <- oncostrip(
  maf, genes = swisnf_genes, colors = colours$categs, removeNonMutated = FALSE)

print(oncostrip_maf_swisnf)