library(ithi.utils)
load_base_libs()
library(survival)
library(rms)
library(edgeR)
library(limma)
library(gage)
library(pathview)
library(DT)
library(ape)
library(phytools)
library(phylobase)

library(ithi.meta)
library(ithi.xcr)
library(ithi.ihc)
library(ithi.seq)
library(ithi.clones)
library(ithi.expr)
library(ithi.external)

Colour palettes

pal_patient <- select_palette("patient")

Parameters

db_path <- snakemake@params$db

mmctm_sample_result_dir <- snakemake@input$mmctm_sample_dir
mmctm_sample_ad_result_dir <- snakemake@input$mmctm_sample_ad_dir
mmctm_patient_ad_result_dir <- snakemake@input$mmctm_patient_ad_dir
mmctm_ov_combined_result_dir <- snakemake@input$mmctm_ov_combined_dir
mmctm_final_patient_dir <- snakemake@input$mmctm_final_patient_dir

mmctm_sample_sigplot <- snakemake@input$mmctm_sample_sigplot
mmctm_sample_ad_sigplot <- snakemake@input$mmctm_sample_ad_sigplot
mmctm_patient_ad_sigplot <- snakemake@input$mmctm_patient_ad_sigplot
mmctm_ov_combined_sigplot <- snakemake@input$mmctm_ov_combined_sigplot
mmctm_final_patient_sigplot <- snakemake@input$mmctm_final_patient_sigplot

master_variant_file <- snakemake@input$master_variant_file
master_breakpoint_file <- snakemake@input$master_breakpoint_file

ihc_table_path <- snakemake@input$ihc_table
ihc_table_slide_path <- snakemake@input$ihc_table_slide
tiltypes <- snakemake@params$mutsig_tiltypes

tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity

nanostring_data_path <- snakemake@input$nanostring_data
nanostring_annotations_path <- snakemake@input$nanostring_annotations

molecular_subtype_file <- snakemake@input$molecular_subtypes

icgc_expr_mat_file <- snakemake@input$icgc_expr_mat
icgc_specimen_file <- snakemake@input$icgc_specimen
icgc_subtype_file <- snakemake@input$icgc_subtypes
icgc_expr_raw_file <- snakemake@input$icgc_expr_melted
icgc_donor_file <- snakemake@input$icgc_clinical

tcga_expr_mat_file <- snakemake@input$tcga_expr_mat
tcga_ov_annotation_file <- snakemake@input$tcga_ov_annotation
tcga_donor_file <- snakemake@input$tcga_clinical

proportion_subclonality_file <- snakemake@input$subclonality

wang_icgc_fbi_status_file <- snakemake@input$wang_fbi_status

snv_cluster_files <- snakemake@input$snv_cluster_files
snv_cluster_patients <- snakemake@params$snv_cluster_patients

clone_tree_file <- snakemake@input$clone_tree_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file

ith_icgc_batch_corrected_expression_file <- snakemake@input$ith_icgc_bc

Metadata

db <- src_sqlite(db_path, create = FALSE)
samples <- collect(tbl(db, "samples"))
read_mutsig_output <- function(dir, sample_table, external = FALSE) {
    prop_table_path <- Sys.glob(file.path(dir, "*_props.tsv"))
    sigs_path <- Sys.glob(file.path(dir, "../plots/*_multipanel.pdf"))
    
    prop_table <- fread(prop_table_path)
    
    sample_cols <- colnames(prop_table)[colnames(prop_table) != "signature"]
    ith_cols <- str_detect(sample_cols, "patient")
    
    patients <- str_extract(sample_cols[ith_cols], "(?<=patient_?)[0-9]+")
    site_ids <- str_replace_all(sample_cols[ith_cols], "patient_?[0-9]+_", "")
    
    FROM <- c("rectum_site_1", "rectum_site_2", "rectum_site_3", "rectum_site_4", 
        "pelvis_site_1", "pelvis_site_2", "cul_de_sac_site_1")
    TO <- c("rectum_site_1", "rectum_site_1", "rectum_site_2", "rectum_site_2", 
        "pelvis_site_1", "pelvis_site_1", "cul-de-sac_site_1")
    
    site_ids_fixed <- mapvalues(site_ids, from = FROM, to = TO)
    df <- data.frame(patient_id = patients, site_id = site_ids_fixed, old_id = sample_cols[ith_cols])
    if (length(df[with(df, patient_id == "11" & site_id == "left_ovary_site_2"), 
        ]$site_id) > 0) {
        df[with(df, patient_id == "11" & site_id == "left_ovary_site_2"), ]$site_id <- "left_ovary_site_3"
    }
    
    sample_subset <- unique(subset(sample_table, select = c("condensed_id", 
        "patient_id", "site_id")))
    
    df <- plyr::join(df, sample_subset)
    if (nrow(df) > 0) {
        df$is_ancestral <- 0
        df$is_ancestral[df$site_id == "ancestral"] <- 1
    }
    
    # colnames(prop_table)[ith_cols] <- make.unique(df$condensed_id)
    
    prop_final <- melt(prop_table, id.vars = c("signature"), measure.vars = sample_cols, 
        variable.name = "old_id", value.name = "proportion")
    if (external) {
        prop_final_merged <- plyr::join(df, prop_final, by = c("old_id"), type = "full")
    } else {
        prop_final_merged <- plyr::join(df, prop_final, by = c("old_id"), type = "inner")
    }
    
    prop_final_merged <- prop_final_merged %>% mutate(new_id = ifelse(site_id %in% 
        c("ancestral", "residual"), yes = paste(patient_id, site_id, sep = "_"), 
        no = condensed_id))
    na_idx <- is.na(prop_final_merged$new_id)
    prop_final_merged$new_id[na_idx] <- as.character(prop_final_merged$old_id[na_idx])
    
    result <- list(prop = prop_final_merged, sigplot = sigs_path)
    return(result)
}

sum_total_variant_counts <- function(variant_sample) {
    variant_sample %>% group_by(condensed_id, patient_id) %>% summarise(snv_count = sum(snv_count), 
        bkpt_count = sum(bkpt_count))
}

compute_mutsig_counts <- function(df) {
    df <- df %>% mutate(is_sv = as.numeric(str_detect(signature, "^SV")), sigcount = ifelse(as.character(site_id) == 
        "ancestral", yes = ifelse(is_sv, yes = ancestral_bkpt * proportion, 
        no = ancestral_snv * proportion), no = ifelse(is_sv, yes = bkpt_count * 
        proportion, no = snv_count * proportion)))
    return(df)
}

create_mutsig_matrix <- function(df, col = "proportion") {
    mat <- acast(df, new_id ~ signature, fun.aggregate = mean, value.var = col)
    return(mat)
}
variant_res <- summarize_variant_counts(master_variant_file, master_breakpoint_file, 
    db_path)

Read 5.9% of 682516 rows
Read 682516 rows and 9 (of 9) columns from 0.026 GB file in 00:00:03
variant_sample <- variant_res$sample
variant_patient <- variant_res$patient

variant_sample_sum <- sum_total_variant_counts(variant_sample)

ihc_table <- fread(ihc_table_path)
ihc_table_subset <- subset(ihc_table, select = c("condensed_id", tiltypes))
ihc_table_slide <- fread(ihc_table_slide_path)

Sample level

Signatures

Results

props_sample <- read_mutsig_output(mmctm_sample_result_dir, samples)
props_sample_prop <- subset(props_sample$prop, patient_id != "11")

props_sample_mat <- create_mutsig_matrix(props_sample_prop, col = "proportion")

props_sample_mat_scaled <- clip_values(scale(props_sample_mat), 2, -2)
sample_heat <- pheatmap(props_sample_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation", 
    clustering_method = "ward.D2")

props_sample_merged <- Reduce(function(x, y) plyr::join(x, y), list(props_sample_prop, 
    variant_sample_sum, variant_patient, ihc_table_subset))

props_sample_merged <- compute_mutsig_counts(props_sample_merged)
COL <- "E_CD8_density"
measure <- "sigcount"

pvals <- setNames(ddply(props_sample_merged, .(signature), function(x) {
    df <- as.data.frame(x)
    df <- df[!is.na(df[, COL]), ]
    corres <- summary(lmer(as.formula(paste0(measure, " ~ ", COL, " + (1|patient_id)")), 
        df))
    
    pval <- unname(corres$coefficients[, 5][2])
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_sample_merged, aes_string(x = COL, y = measure)) + geom_point(aes(colour = patient_id)) + 
    scale_colour_manual(values = pal_patient) + facet_wrap(~signature, scales = "free") + 
    geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value), hjust = 1.1, 
        vjust = 1.5, size = 3, parse = TRUE) + theme_bw()

# x <- Reduce(function(x,y) merge(x,y, by=c('condensed_id')),
# list(rownames_to_column(as.data.frame(t(prop_matrix)), var =
# 'condensed_id'), ihc_table_subset)) x$patient_id <- map_id(x$condensed_id,
# from = 'condensed_id', to='patient_id', db_path) x <- subset(x, patient_id
# != '11') cols <- colnames(x)[!colnames(x) %in% c('condensed_id',
# 'patient_id')] rmat <- matrix(nrow=length(cols),ncol=length(cols)) pmat <-
# matrix(nrow=length(cols),ncol=length(cols)) for (i in 1:length(cols)) {
# for (j in i:length(cols)) { if (i != j) { col1 <- cols[i] col2 <- cols[j]
# formula <- paste0('`', col2, '`', '~', '`', col1, '`', '+ (1|patient_id)',
# sep=' ') res <- summary(lmer(formula, x)) pval <- tryCatch({
# unname(res$coefficients[,5][2]) }, error = function(e) { NA }) r <-
# unname(res$coefficients[,1][2]) rmat[i,j] <- rmat[j,i] <- r pmat[i,j] <-
# pmat[j,i] <- pval } else { rmat[i,j] <- 1 pmat[i,j] <- NA } } }
# resmat <- log10(pmat)*-sign(rmat) resmat[resmat == Inf] <- NA
# rownames(resmat) <- colnames(resmat) <- cols pheatmap(resmat,
# display_numbers = signif(pmat, 3), cluster_rows = FALSE, cluster_cols =
# FALSE, fontsize = 5)
# xmat <- x %>% select(-one_of('condensed_id', 'patient_id')) cormat <-
# corr.test(xmat, method='spearman', adjust='fdr') pheatmap(cormat$r,
# display_numbers = signif(cormat$p, 3), fontsize = 5)

Ancestral-descendant level (samples)

Signatures

Results

props_ad <- read_mutsig_output(mmctm_sample_ad_result_dir, samples)
props_ad_prop <- subset(props_ad$prop, patient_id != "11")

props_ad_mat <- create_mutsig_matrix(props_ad_prop, col = "proportion")

props_ad_mat_scaled <- clip_values(scale(props_ad_mat), 2, -2)
patient_heat <- pheatmap(props_ad_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation")

props_ad_merged <- Reduce(function(x, y) plyr::join(x, y), list(props_ad_prop, 
    subset(variant_sample, is_ancestral == 0), variant_patient, ihc_table_subset))

props_ad_merged <- compute_mutsig_counts(props_ad_merged)
COL <- "E_CD8_density"
measure <- "sigcount"

props_ad_merged_descendant <- subset(props_ad_merged, site_id != "ancestral")

pvals <- setNames(ddply(props_ad_merged_descendant, .(signature), function(x) {
    df <- as.data.frame(x)
    df <- df[!is.na(df[, COL]), ]
    corres <- summary(lmer(as.formula(paste0(measure, " ~ ", COL, " + (1|patient_id)")), 
        df))
    
    pval <- unname(corres$coefficients[, 5][2])
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_ad_merged_descendant, aes_string(x = COL, y = measure)) + geom_point(aes(colour = patient_id)) + 
    scale_colour_manual(values = pal_patient) + facet_wrap(~signature, scales = "free") + 
    geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value), hjust = 1.1, 
        vjust = 1.5, size = 3, parse = TRUE) + theme_bw()

What we can also see from these plots is that not resolving SNV-7 (the noise cluster) is going to be a problem. The signature proportion values from that cluster are fairly similar to those from SNV-6, the APOBEC signature.

Ancestral-descendant level (patients)

Signatures

Results

props_patient_ad <- read_mutsig_output(mmctm_patient_ad_result_dir, samples)
props_patient_ad_prop <- subset(props_patient_ad$prop, patient_id != "11")

props_patient_ad_mat <- create_mutsig_matrix(props_patient_ad_prop, col = "proportion")

props_patient_ad_mat_scaled <- clip_values(scale(props_patient_ad_mat), 2, -2)
patient_ad_heat <- pheatmap(props_patient_ad_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation")

props_patient_ad_merged <- Reduce(function(x, y) plyr::join(x, y), list(props_patient_ad_prop, 
    subset(variant_sample, is_ancestral == 0), variant_patient))

props_patient_ad_merged <- compute_mutsig_counts(props_patient_ad_merged)

Correlation matrices

ihc_mat <- as.data.frame(ihc_table_subset %>% dplyr::select(-condensed_id))
rownames(ihc_mat) <- ihc_table_subset$condensed_id

# TOTAL_TIL_COLS <- c('T_CD8_density', 'T_CD4_density', 'T_CD20_density',
# 'T_Plasma_density') ihc_mat <- subset(ihc_mat, select=TOTAL_TIL_COLS)
compute_ihc_mutsig_cor <- function(ihc_mat, mutsig_mat, patient_summary = FALSE, 
    ancestral = FALSE) {
    if (patient_summary) {
        ihc_mat <- rownames_to_column(as.data.frame(ihc_mat), "id")
        mutsig_mat <- rownames_to_column(as.data.frame(mutsig_mat), "id")
        
        if (ancestral) {
            mutsig_mat <- subset(mutsig_mat, str_detect(id, "ancestral"))
        } else {
            mutsig_mat <- subset(mutsig_mat, !str_detect(id, "ancestral"))
        }
        
        ihc_mat$patient_id <- str_extract(ihc_mat$id, "^[0-9]+")
        mutsig_mat$patient_id <- str_extract(mutsig_mat$id, "^[0-9]+")
        
        ihc_mat <- ihc_mat %>% dplyr::select(-id) %>% group_by(patient_id) %>% 
            summarise_each(funs(mean(., na.rm = TRUE))) %>% column_to_rownames(var = "patient_id")
        mutsig_mat <- mutsig_mat %>% dplyr::select(-id) %>% group_by(patient_id) %>% 
            summarise_each(funs(mean(., na.rm = TRUE))) %>% column_to_rownames(var = "patient_id")
        
        ihc_mat <- as.matrix(ihc_mat)
        mutsig_mat <- as.matrix(mutsig_mat)
    }
    
    intersect_samples <- intersect(rownames(mutsig_mat), rownames(ihc_mat))
    mutsig <- mutsig_mat[intersect_samples, , drop = FALSE]
    ihc <- ihc_mat[intersect_samples, , drop = FALSE]
    
    pmat <- matrix(nrow = ncol(mutsig), ncol = ncol(ihc))
    rmat <- matrix(nrow = ncol(mutsig), ncol = ncol(ihc))
    rownames(pmat) <- rownames(rmat) <- colnames(mutsig)
    colnames(pmat) <- colnames(rmat) <- colnames(ihc)
    
    for (i in 1:ncol(mutsig)) {
        for (j in 1:ncol(ihc)) {
            corres <- cor.test(mutsig[, i], ihc[, j], method = "spearman")
            pmat[i, j] <- corres$p.value
            rmat[i, j] <- corres$estimate
        }
    }
    
    return(list(p = pmat, r = rmat))
}

For adjusted p-values just run p.adjust.mat on the p-value labels.

Sample-level

ihc_sample_cor <- compute_ihc_mutsig_cor(ihc_mat, props_sample_mat)

pheatmap(ihc_sample_cor$r, display_numbers = signif(ihc_sample_cor$p, 3))

ihc_sample_cor_summary <- compute_ihc_mutsig_cor(ihc_mat, props_sample_mat, 
    patient_summary = TRUE)

pheatmap(ihc_sample_cor_summary$r, display_numbers = signif(ihc_sample_cor_summary$p, 
    3))

Ancestral-descendant level

ihc_ad_cor <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat)

pheatmap(ihc_ad_cor$r, display_numbers = signif(ihc_ad_cor$p, 3))

ihc_ad_cor_summary <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat, patient_summary = TRUE)

pheatmap(ihc_ad_cor_summary$r, display_numbers = signif(ihc_ad_cor_summary$p, 
    3))

ihc_ad_cor_summary_ancestral <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat, 
    patient_summary = TRUE, ancestral = TRUE)

pheatmap(ihc_ad_cor_summary_ancestral$r, display_numbers = signif(ihc_ad_cor_summary_ancestral$p, 
    3))

Note: p-values shown are uncorrected. Patient-summarized correlations are insignificant after FDR correction.

Cluster-level analysis

Finding correlations at the level of individual signatures can be difficult. Even moreso because some published signatures are combinations of these signatures – e.g. SV-3 and SV-6 are both foldback signatures in the ancestral-descendant analysis.

Hence, we can look at the level of clusters from our heatmaps.

make_cluster_frame <- function(clusters) {
    clusts <- rownames_to_column(as.data.frame(clusters), "new_id")
    colnames(clusts)[2] <- "cluster"
    clusts$cluster <- factor(clusts$cluster)
    return(clusts)
}

NCLUST <- 2
selected_cols <- c("new_id", "condensed_id", "patient_id", "old_id", "cluster", 
    tiltypes)

Sample-level

clusters <- cutree(sample_heat$tree_row, NCLUST)
sample_clusts <- make_cluster_frame(clusters)

props_sample_merged_clusts <- join(props_sample_merged, sample_clusts)
sample_df <- unique(subset(props_sample_merged_clusts, select = selected_cols))

sample_df_melted <- melt(sample_df, id.vars = colnames(sample_df)[!colnames(sample_df) %in% 
    tiltypes], measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
pvals <- setNames(ddply(sample_df_melted, .(tiltype), function(x) {
    df <- as.data.frame(x)
    corres <- wilcox.test(density ~ cluster, df)
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))


ggplot(sample_df_melted, aes(x = cluster, y = density)) + geom_boxplot() + theme_bw() + 
    theme_Publication() + geom_jitter(aes(colour = patient_id), position = position_jitter(width = 0.2, 
    height = 0)) + scale_color_manual(values = pal_patient) + facet_wrap(~tiltype, 
    scales = "free") + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value), 
    hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

diversity_column <- "observedDiversity_mean"

diversity_files <- list(tcr = tcr_diversity_file, bcr = bcr_diversity_file)

diversity <- lapply(names(diversity_files), function(segment) {
    f <- diversity_files[[segment]]
    xcr_diversity <- read_xcr_diversity_file(f, db_path)
    
    df <- subset(xcr_diversity, select = c("condensed_id", "patient_id", diversity_column))
    colnames(df) <- mapvalues(colnames(df), from = diversity_column, to = segment)
    return(df)
})
names(diversity) <- names(diversity_files)

xcr_diversity <- Reduce(plyr::join, diversity)

XCR_VARS <- c("tcr", "bcr")
molsubtypes <- fread(molecular_subtype_file)
molsubtypes <- setNames(molsubtypes, c("condensed_id", "subtype"))

proportion_subclonality <- fread(proportion_subclonality_file)
proportion_subclonality_subset <- subset(proportion_subclonality, select = c("condensed_id", 
    "proportion_subclonal"))

exprs <- fread(nanostring_data_path)
labels <- fread(nanostring_annotations_path)

celltype_matrix <- create_celltype_matrix(exprs, labels, db_path)
pathway_matrix <- create_pathway_matrix(exprs, labels, db_path)

celltype_df <- rownames_to_column(as.data.frame(t(celltype_matrix)), var = "condensed_id")
pathway_df <- rownames_to_column(as.data.frame(t(pathway_matrix)), var = "condensed_id")

NANOSTRING_VARS <- c(rownames(celltype_matrix), rownames(pathway_matrix))
colnames(sample_clusts) <- mapvalues(colnames(sample_clusts), "new_id", "condensed_id")

ihc_stabilize_subset <- stabilize_ihc_variances(ihc_table_slide, ihc_table, 
    tiltypes)

data_matrices <- list(sample_clusts, ihc_stabilize_subset, xcr_diversity, celltype_df, 
    pathway_df, proportion_subclonality_subset)
data_matrices <- lapply(data_matrices, function(x) {
    x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "full"), data_matrices)
combined_df <- subset(combined_df, !is.na(cluster) & !condensed_id %in% c("7_BrnM"))

combined_mat <- subset(combined_df, select = -c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
# ref_dendrogram <- prune(as.dendrogram(sample_heat$tree_row), '7_BrnM')

sample_order <- sample_heat$tree_row$labels[sample_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select = c(condensed_id, cluster))

SIGS <- c("SV-3", "SNV-5", "SV-1", "SV-2", "SNV-1")
sig_annotations <- rownames_to_column(data.frame(props_sample_mat[, SIGS]), 
    var = "condensed_id")

## Take logarithms to ~variance stabilize
variant_count_annotations <- variant_sample %>% group_by_(.dots = "condensed_id") %>% 
    summarise(snv_count = log(sum(snv_count)), bkpt_count = log(sum(bkpt_count))) %>% 
    subset(select = c("condensed_id", "snv_count", "bkpt_count"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, molsubtypes, 
    sig_annotations, variant_count_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")

combined_mat_scaled <- scale(combined_mat)

# hcs <- lapply(unique(combined_df$cluster), function(clust) { ids <-
# subset(combined_df, cluster == clust)$condensed_id dists <-
# dist(combined_mat_scaled[ids,], method = 'euclidean') #dists <-
# proxy::dist(combined_mat_scaled[ids,], method = function(x,y)
# pairwise_dist(x,y,method='canberra')) hclust(dists, method = 'ward.D') })
# min_height <- max(sapply(hcs, function(x) max(unique(cophenetic(x)))))*1.1
# hc <- Reduce(function(x,y) merge(as.dendrogram(x),as.dendrogram(y),height
# = min_height), hcs) hc <- as.dendrogram(ref_dendrogram) hc <-
# as.dendrogram(sample_heat2$tree_row) ha <-
# HeatmapAnnotation(row_annotations[rownames(combined_mat_scaled),],
# which='row') Heatmap(clip_values(combined_mat_scaled, 2, -2), cluster_rows
# = hc2, cluster_columns = TRUE, split = 2, clustering_method_columns =
# 'ward.D2') + ha

pheatmap(clip_values(combined_mat_scaled, 2, -2)[sample_order, ], cluster_rows = FALSE, 
    cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6)

What we can see is:

  • H-HRD cluster is relatively homogeneous.
  • Looks like there are two clusters within the H-FBI group – one characterized by high levels of immune activity (cytotoxicity, etc.) and one characterized by low levels.

This is a pretty significant finding.

Additionally, an interesting thing is that the patients/samples with the highest proportions of foldbacks – patients 2, 3, and 9 – are actually the ones with high immune response in the foldback group! Suggestive that perhaps foldback inversions can create neoepitopes. Of course very preliminary and low sample size though.

If you’re wondering why there’s no dendrogram on the vertical axis it’s because the plotting functions I’m currently using don’t allow for self-specified dendrograms. Trying to make one that lets me do so but it’s taking a bit of acrobatics and I’ve wasted a lot of time already …

To see ICGC validation, go to that section. I’ll add a link later …

TCR/BCR diversity

combined_df$patient_id <- map_id(combined_df$condensed_id, from = "condensed_id", 
    to = "patient_id", db_path)
combined_df_xcr <- melt(combined_df, id.vars = c("condensed_id", "patient_id", 
    "cluster"), measure.vars = XCR_VARS, variable.name = "type", value.name = "value")

pvals <- setNames(ddply(combined_df_xcr, .(type), function(x) {
    df <- as.data.frame(x)
    corres <- wilcox.test(value ~ cluster, df)
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("type", "p.value"))


ggplot(combined_df_xcr, aes(x = cluster, y = value)) + geom_boxplot() + theme_bw() + 
    theme_Publication() + geom_jitter(aes(colour = patient_id), position = position_jitter(width = 0.2, 
    height = 0)) + scale_color_manual(values = pal_patient) + facet_wrap(~type, 
    scales = "free") + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value), 
    hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

Celltypes and pathways

combined_df$patient_id <- map_id(combined_df$condensed_id, from = "condensed_id", 
    to = "patient_id", db_path)
combined_df_nanostring <- melt(combined_df, id.vars = c("condensed_id", "patient_id", 
    "cluster"), measure.vars = NANOSTRING_VARS, variable.name = "type", value.name = "value")

pvals <- setNames(ddply(combined_df_nanostring, .(type), function(x) {
    df <- as.data.frame(x)
    corres <- wilcox.test(value ~ cluster, df)
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("type", "p.value"))


ggplot(combined_df_nanostring, aes(x = cluster, y = value)) + geom_boxplot() + 
    theme_bw() + theme_Publication() + geom_jitter(aes(colour = patient_id), 
    position = position_jitter(width = 0.2, height = 0)) + scale_color_manual(values = pal_patient) + 
    facet_wrap(~type, scales = "free") + geom_text(data = pvals, aes(x = Inf, 
    y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

Ancestral-descendant level

clusters <- cutree(patient_heat$tree_row, NCLUST)
patient_clusts <- make_cluster_frame(clusters)

props_ad_merged_clusts <- join(props_ad_merged, patient_clusts)
patient_df <- unique(subset(props_ad_merged_clusts, select = selected_cols))

patient_df_melted <- melt(patient_df, id.vars = colnames(patient_df)[!colnames(patient_df) %in% 
    tiltypes], measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
pvals <- setNames(ddply(patient_df_melted, .(tiltype), function(x) {
    df <- as.data.frame(x)
    corres <- wilcox.test(density ~ cluster, df)
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))


ggplot(patient_df_melted, aes(x = cluster, y = density)) + geom_boxplot() + 
    theme_bw() + theme_Publication() + geom_jitter(aes(colour = patient_id), 
    position = position_jitter(width = 0.2, height = 0)) + scale_color_manual(values = pal_patient) + 
    facet_wrap(~tiltype, scales = "free") + geom_text(data = pvals, aes(x = Inf, 
    y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

Hence, the foldback group (cluster 2) has lower CD8+ and CD4+ densities than the HRD group, as expected.

A caveat is that p-values are computed with Wilcoxon, irrespective of patient number. We can’t really opt for a nonparametric nested ranks test because very few patients (4) are actually present in both clusters.

Ancestral analysis

Ancestral variants may have different properties from descendant variants.

Sample-specific

Here we’ll just naively allow for subclonal variants to be counted multiple times.

props_ad_merged$is_ancestral <- as.factor(props_ad_merged$is_ancestral)

pvals <- setNames(ddply(props_ad_merged, .(signature), function(x) {
    df <- as.data.frame(x)
    corres <- wilcox.test(proportion ~ is_ancestral, df)
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_ad_merged, aes(x = is_ancestral, y = proportion)) + geom_boxplot() + 
    geom_jitter(aes(colour = patient_id), position = position_jitter(width = 0.2, 
        height = 0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) + 
    facet_wrap(~signature, scales = "free") + geom_text(data = pvals, aes(x = Inf, 
    y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

P-values are uncorrected.

Unsurprisingly, ancestral samples have significantly more of SNV-4, the age signature. Insignificant, but they also have less SV-1, which is one of the BRCA’s (small deletions).

Union

Here we’ll actually take the union of subclonal variants for each patient so we don’t overcount.

props_patient_ad_merged$is_ancestral <- as.factor(props_patient_ad_merged$is_ancestral)

pvals <- setNames(ddply(props_patient_ad_merged, .(signature), function(x) {
    df <- as.data.frame(x)
    corres <- wilcox.test(proportion ~ is_ancestral, df, paired = TRUE)
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_patient_ad_merged, aes(x = is_ancestral, y = proportion)) + geom_boxplot() + 
    geom_jitter(aes(colour = patient_id), position = position_jitter(width = 0.2, 
        height = 0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) + 
    facet_wrap(~signature, scales = "free") + geom_text(data = pvals, aes(x = Inf, 
    y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

Aside from age and HRD which were described in McPherson et al., there’s additionally the SV-3 signature – a translocation signature. Implying that translocations are early (ancestral) events in HGSC – perhaps this is interesting? I have to do a literature search.

anc_desc_patient <- dcast(props_patient_ad_merged, formula = patient_id + signature ~ 
    is_ancestral, value.var = "proportion")

pvals <- setNames(ddply(anc_desc_patient, .(signature), function(x) {
    df <- as.data.frame(x)
    corres <- with(df, cor.test(`0`, `1`, method = "spearman"))
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(anc_desc_patient, aes(x = `1`, y = `0`)) + geom_point() + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) + 
    facet_wrap(~signature, scales = "free") + geom_text(data = pvals, aes(x = Inf, 
    y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

ICGC validation

Signatures

FBI subclusters

specimen_data <- fread(icgc_specimen_file)
icgc_subtypes <- fread(icgc_subtype_file)
icgc_expr_mat <- fread(icgc_expr_mat_file)
icgc_expr_df <- icgc_expr_mat %>% as.data.frame %>% column_to_rownames("icgc_donor_id") %>% 
    t %>% as.data.frame %>% rownames_to_column("Name")
icgc_celltype_matrix <- create_pathway_matrix(icgc_expr_df, labels, db_path, 
    convert_ids = FALSE)

icgc_immune <- rownames_to_column(as.data.frame(t(icgc_celltype_matrix)), var = "icgc_donor_id")

# cytotoxic_markers <- c('PRF1', 'GZMA', 'HLA-A', 'HLA-B', 'HLA-C', 'GZMK',
# 'GZMM', 'GZMH', 'GNLY', 'GZMB') icgc_immune <-
# as.data.frame(subset(icgc_expr_mat, select=c('icgc_donor_id',
# cytotoxic_markers))) for (i in 2:ncol(icgc_immune)) { icgc_immune[,i] <-
# log10(icgc_immune[,i]) }
props_ov_combined <- read_mutsig_output(mmctm_ov_combined_result_dir, samples, 
    external = TRUE)
props_ov_combined_prop <- props_ov_combined$prop

props_ov_combined_mat <- create_mutsig_matrix(props_ov_combined_prop, col = "proportion")

# props_ov_combined_mat_scaled <- as.data.frame(apply(props_ov_combined_mat,
# 2, function(x) { #x <- logit(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)
# }))

props_ov_combined_mat_scaled <- scale(props_ov_combined_mat)

# props_ov_combined_mat_scaled <- clip_values(props_ov_combined_mat_scaled,
# 3, -3) clip_values(scale(props_ov_combined_mat), 2, -2)

ov_combined_heat <- pheatmap(props_ov_combined_mat_scaled, fontsize_row = 6, 
    clustering_distance_rows = "correlation", clustering_method = "ward.D2")

NCLUST <- 3

clusters <- cutree(ov_combined_heat$tree_row, NCLUST)
ov_combined_clusts <- make_cluster_frame(clusters)

colnames(icgc_immune) <- mapvalues(colnames(icgc_immune), "icgc_donor_id", "new_id")

data_matrices <- list(ov_combined_clusts, icgc_immune)
data_matrices <- lapply(data_matrices, function(x) {
    x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "inner"), data_matrices)
# combined_df <- subset(combined_df, cluster == 1)

combined_mat <- subset(combined_df, select = -c(new_id, cluster))
rownames(combined_mat) <- combined_df$new_id
sample_order <- ov_combined_heat$tree_row$labels[ov_combined_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select = c(new_id, cluster))

SIGS <- c("SV-8", "SNV-1", "SV-4", "SV-7", "SV-5", "SNV-7", "SV-1")
sig_annotations <- rownames_to_column(data.frame(props_ov_combined_mat[, SIGS], 
    check.names = FALSE), var = "new_id")

subtype_annotations <- subset(icgc_subtypes, select = c("icgc_donor_id", "subtype", 
    "nmf_subtype"))
colnames(subtype_annotations)[1] <- "new_id"

row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations, 
    subtype_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "new_id")

select_rows <- rownames(subset(row_annotations, `SV-4` < 1))
notx <- subset(specimen_data, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type == 
    "no treatment")$icgc_donor_id
sample_order <- intersect(sample_order, select_rows)
sample_order <- intersect(sample_order, notx)

# order_fbi <- order(row_annotations[rownames(combined_mat_scaled),]$`SV-8`)

# combined_mat_scaled <- scale(combined_mat)
combined_mat_scaled <- as.data.frame(apply(combined_mat, 2, function(x) (x - 
    median(x, na.rm = TRUE))/mad(x, na.rm = TRUE)))

combined_mat_scaled <- combined_mat_scaled[sample_order, ]
other_mat <- props_ov_combined_mat_scaled[sample_order, ]

pheatmap(clip_values(cbind(combined_mat_scaled, other_mat), 2, -2), cluster_rows = FALSE, 
    cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation", 
    clustering_method = "ward.D2")

# rowmean <- rowMeans(combined_mat_scaled) a <- merge(row_annotations,
# as.data.frame(rowmean), by='row.names') ggplot(subset(a, cluster != 1),
# aes(x=rowmean > median(rowmean), y = `SNV-7`)) + geom_boxplot() +
# theme_bw()
clusters <- setNames(cluster_annotations, c("icgc_donor_id", "cluster"))

Differential expression

icgc_expr_melted <- fread(icgc_expr_raw_file)

Read 0.0% of 3709596 rows
Read 11.6% of 3709596 rows
Read 23.5% of 3709596 rows
Read 35.3% of 3709596 rows
Read 47.4% of 3709596 rows
Read 59.3% of 3709596 rows
Read 71.2% of 3709596 rows
Read 83.0% of 3709596 rows
Read 95.2% of 3709596 rows
Read 3709596 rows and 15 (of 15) columns from 0.484 GB file in 00:00:16
icgc_expr_casted_matrix <- expression_df_to_matrix(icgc_expr_melted, summarize_over = "icgc_donor_id", 
    measure_var = "raw_read_count")

icgc_counts <- icgc_expr_casted_matrix %>% column_to_rownames(var = "icgc_donor_id")
icgc_counts_filtered <- icgc_counts[clusters$icgc_donor_id, ]
icgc_counts_filtered <- icgc_counts_filtered %>% t %>% as.data.frame

dge <- DGEList(counts = icgc_counts_filtered)
dge <- calcNormFactors(dge)

library_sizes <- colSums(icgc_counts_filtered)
library_sizes
  DO46325   DO46327   DO46328   DO46329   DO46330   DO46331   DO46332 
 78792862 106156028 146714663 100782140  66648443  94187980 128035792 
  DO46333   DO46334   DO46336   DO46338   DO46340   DO46342   DO46344 
 70763442  87776088 106659467  67264480  72586319  91076278  97575765 
  DO46346   DO46350   DO46352   DO46354   DO46356   DO46358   DO46360 
 78219082  65895620  95169618  92438598  46265120  90175214  96687938 
  DO46362   DO46364   DO46366   DO46368   DO46370   DO46372   DO46374 
 95856331 101399457  90792124  97051216 100193800  92056106 116039147 
  DO46376   DO46378   DO46380   DO46382   DO46384   DO46386   DO46388 
 81664713 159721278  90029262  80192737  80784914  88436986  95577226 
  DO46390   DO46392   DO46394   DO46396   DO46398   DO46400   DO46402 
 88612126  91723742  91209587  87119354  79391516  93620832  94446982 
  DO46404   DO46408   DO46412   DO46448   DO46493   DO46551   DO46560 
 97746414  91333342  95814730  88596500 123302138  84121097 109112881 
  DO46561   DO46566   DO46568   DO46571   DO46576   DO46581   DO46586 
 99873236  79993818  95909367  75957744  91988944  98993304  95877151 
  DO46588   DO46591   DO46597   DO46602   DO46606   DO46611 
 78452166 113011642  94472903  96109428  98892638  61422278 
max(library_sizes)/min(library_sizes)
[1] 3.452304
design <- model.matrix(~0 + factor(clusters$cluster))
colnames(design) <- paste0("clust", 1:NCLUST)
contrast.matrix <- makeContrasts(clust1 - clust3, clust2 - (clust1 + clust3)/2, 
    clust2 - clust1, levels = design)

limma-trend

We straddle close to threshold for doing this but let’s do it anyways.

fit_trend <- rnaseq_DE_initial_fit(dge, design, type = "limma_trend")
fit_trend_contrasts <- contrasts.fit(fit_trend, contrast.matrix)
fit_trend_contrasts <- eBayes(fit_trend_contrasts)

hrd_fbi_genes <- topTable(fit_trend_contrasts, coef = "clust2 - (clust1 + clust3)/2", 
    adjust.method = "BH", number = Inf, sort = "p")
fbi_immune_genes <- topTable(fit_trend_contrasts, coef = "clust1 - clust3", 
    number = Inf, sort = "p")
hrd_fbi_highimmune_genes <- topTable(fit_trend_contrasts, coef = "clust2 - clust1", 
    number = Inf, sort = "p")
hrd_fbi_pathways <- pathway_analysis(hrd_fbi_genes, gs_type = "go")
fbi_immune_pathways <- pathway_analysis(fbi_immune_genes, gs_type = "go")
hrd_fbi_highimmune_pathways <- pathway_analysis(hrd_fbi_highimmune_genes, gs_type = "go")
HRD vs. FBI samples (clust2 vs. clust1+clust3)
datatable(hrd_fbi_pathways, extensions = "Buttons", options = list(pageLength = 5, 
    scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf", 
        "print")))
High immune FBI vs. low immune FBI (clust1 vs. clust3)
datatable(fbi_immune_pathways, extensions = "Buttons", options = list(pageLength = 5, 
    scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf", 
        "print")))
HRD vs. high immune FBI (clust2 vs. clust1)
datatable(hrd_fbi_highimmune_pathways, extensions = "Buttons", options = list(pageLength = 5, 
    scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf", 
        "print")))

voom

We have greater than 3-fold variability in library size, so we’re better off using the voom method for DE analysis.

fit_voom <- rnaseq_DE_initial_fit(dge, design, type = "voom")
fit_voom_contrasts <- contrasts.fit(fit_voom, contrast.matrix)
fit_voom_contrasts <- eBayes(fit_voom_contrasts)

hrd_fbi_genes <- topTable(fit_voom_contrasts, coef = "clust2 - (clust1 + clust3)/2", 
    adjust.method = "BH", number = Inf, sort = "p")
fbi_immune_genes <- topTable(fit_voom_contrasts, coef = "clust1 - clust3", number = Inf, 
    sort = "p")
hrd_fbi_highimmune_genes <- topTable(fit_voom_contrasts, coef = "clust2 - clust1", 
    number = Inf, sort = "p")
hrd_fbi_pathways <- pathway_analysis(hrd_fbi_genes, gs_type = "kegg")
fbi_immune_pathways <- pathway_analysis(fbi_immune_genes, gs_type = "kegg")
hrd_fbi_highimmune_pathways <- pathway_analysis(hrd_fbi_highimmune_genes, gs_type = "kegg")
# pv.out.list <- sapply(path.ids2, function(pid) pathview( gene.data =
# exp.fc, pathway.id = pid, species = 'hsa', out.suffix=out.suffix))

Note: BRCA1, BRCA2, and RPA are also downregulated in the HRD group – the entire HR-associated gene cluster isn’t significantly downregulated though.

One of the key reasons why the entire HR pathway isn’t significantly downregulated is because PolQ is part of it and PolQ is upregulated. Of note, PolQ promotes MMEJ, providing further evidence for MMEJ activity in FBI tumours.

HRD vs. FBI samples (clust2 vs. clust1+clust3)
datatable(hrd_fbi_pathways, extensions = "Buttons", options = list(pageLength = 5, 
    scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf", 
        "print")))
High immune FBI vs. low immune FBI (clust1 vs. clust3)
datatable(fbi_immune_pathways, extensions = "Buttons", options = list(pageLength = 5, 
    scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf", 
        "print")))
HRD vs. high immune FBI (clust2 vs. clust1)
datatable(hrd_fbi_highimmune_pathways, extensions = "Buttons", options = list(pageLength = 5, 
    scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf", 
        "print")))

Some DNA repair genes

v <- voom(icgc_counts_filtered, design, normalize = "quantile", plot = FALSE)
icgc_expr_df_wide <- v$E %>% t %>% as.data.frame %>% rownames_to_column("icgc_donor_id")

icgc_expr_df_melted <- reshape2::melt(icgc_expr_df_wide, id.vars = c("icgc_donor_id"), 
    measure.vars = colnames(icgc_expr_df_wide)[2:ncol(icgc_expr_df_wide)], variable.name = "Name", 
    value.name = "expression")

icgc_expr_df_melted_labeled <- plyr::join(icgc_expr_df_melted, clusters)

REPAIR_GENES <- c("BRCA1", "BRCA2", "POLQ", "FEN1", "XRCC1", "PARP1", "LIG3")

icgc_expr_df_melted_labeled_filtered <- subset(icgc_expr_df_melted_labeled, 
    Name %in% REPAIR_GENES)

ggplot(subset(icgc_expr_df_melted_labeled_filtered, !is.na(cluster)), aes(x = cluster, 
    y = expression)) + geom_boxplot() + geom_jitter(position = position_jitter(width = 0.2, 
    height = 0)) + theme_bw() + theme_Publication() + facet_wrap(~Name, scales = "free")

Survival

icgc_clinical <- read_donor_specimen_survival(icgc_donor_file, icgc_specimen_file)

icgc_clinical_labeled <- plyr::join(clusters, icgc_clinical)

icgc_clinical_labeled$SurvObj <- with(icgc_clinical_labeled, Surv(donor_survival_time, 
    donor_vital_status == "deceased"))

Survival by FBI status …

simple_survival_analysis(SurvObj ~ cluster != 2, data = icgc_clinical_labeled)

Call:
survival::survdiff(formula = formula, data = data)

                    N Observed Expected (O-E)^2/E (O-E)^2/V
cluster != 2=FALSE 27       25     29.3     0.641      1.34
cluster != 2=TRUE  38       34     29.7     0.634      1.34

 Chisq= 1.3  on 1 degrees of freedom, p= 0.246 
simple_survival_analysis(SurvObj ~ cluster, data = icgc_clinical_labeled)

Call:
survival::survdiff(formula = formula, data = data)

           N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=1 21       18     15.8     0.293     0.413
cluster=2 27       25     29.3     0.641     1.344
cluster=3 17       16     13.8     0.344     0.473

 Chisq= 1.3  on 2 degrees of freedom, p= 0.509 

TCGA validation

While we can’t directly do mutation signature analysis with MMCTM on exome data (I suppose this is theoretically possible but we’d probably be restricted in the types of events we can call, like long SVs), we can look at the FBI-HLAMP finding from Yikan’s paper and see if that lines up with immune signatures.

tcga_expr_mat <- read_tcga_exprs(tcga_expr_mat_file)

Read 0.0% of 570 rows
Read 570 rows and 12497 (of 12497) columns from 0.121 GB file in 00:00:05
tcga_ov_annotation <- fread(tcga_ov_annotation_file)
# tcga_fbi_hlamp_proportions <- fread(tcga_fbi_hlamp_proportions_file)
tcga_expr_df <- tcga_expr_mat %>% as.data.frame %>% column_to_rownames("tcga_sample_id") %>% 
    t %>% as.data.frame %>% rownames_to_column("Name")
tcga_celltype_matrix <- create_pathway_matrix(tcga_expr_df, labels, db_path, 
    convert_ids = FALSE)

tcga_immune <- rownames_to_column(as.data.frame(t(tcga_celltype_matrix)), var = "tcga_sample_id")
mat <- tcga_immune %>% column_to_rownames("tcga_sample_id") %>% scale  #%>% clip_values(2, -2)
row_annotations <- tcga_ov_annotation %>% as.data.frame %>% column_to_rownames("tcga_sample_id")

tcgaheat <- pheatmap(mat, annotation_row = row_annotations, cluster_rows = TRUE, 
    cluster_cols = TRUE, clustering_method = "ward.D", show_rownames = FALSE)

# tcga_clusters <-
# rownames_to_column(data.frame(cluster=factor(cutree(tcgaheat$tree_row,
# 2))), var = 'tcga_sample_id')

col <- "Cytotoxicity"
tcga_clusters <- rownames_to_column(data.frame(cluster = mat[, col] > median(mat[, 
    col])), "tcga_sample_id") %>% mutate(cluster = as.factor(as.numeric(cluster)))

# row_annotations <- plyr::join(tcga_ov_annotation, tcga_clusters) %>%
# as.data.frame %>% column_to_rownames('tcga_sample_id') pheatmap(mat,
# annotation_row = row_annotations, cluster_rows = TRUE, cluster_cols =
# TRUE, clustering_method = 'ward.D', show_rownames = FALSE)
df_merged <- plyr::join(tcga_immune, tcga_ov_annotation)

expr_measure_vars <- colnames(tcga_immune)[2:ncol(tcga_immune)]

df_merged_melted <- melt(df_merged, id.vars = c("tcga_sample_id", "Subgroup", 
    "MolecularSubtype", "BRCA.status"), measure.vars = expr_measure_vars, variable.name = "Measure", 
    value.name = "Expression")
pvals <- setNames(ddply(subset(df_merged_melted, !is.na(Subgroup)), .(Measure), 
    function(x) {
        df <- as.data.frame(x)
        testres <- kruskal.test(Expression ~ factor(Subgroup), data = df)
        
        pval <- testres$p.value
        eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
        return(as.character(as.expression(eq)))
    }), c("Measure", "p.value"))

ggplot(subset(df_merged_melted, !is.na(Subgroup)), aes(x = Subgroup, y = Expression)) + 
    geom_boxplot() + theme_bw() + facet_wrap(~Measure, scales = "free", ncol = 3) + 
    theme_Publication() + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value), 
    hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

P-values are uncorrected. So after correction, we aren’t going to get anything significant.

In conclusion, the immune subgroups constitute a new subgrouping independent of FBI-HLAMP.

I’ve also done correlation testing using the logR values that Yikan’s provided me (rather than the discrete groupings of no AMP, FBI-AMP low, and FBI-AMP high), but the correlations are very poor (rho values of ~ -0.05 or so, p-values of >=0.3).

Survival

tcga_clinical <- read_tcga_clinical(tcga_donor_file, type = "synapse2", filter = TRUE, 
    unique = FALSE)

tcga_clinical_labeled <- Reduce(function(x, y) plyr::join(x, y, type = "full"), 
    list(tcga_clusters, tcga_clinical, tcga_ov_annotation))

tcga_clinical_labeled$SurvObj <- with(tcga_clinical_labeled, Surv(ifelse(vital_status == 
    "Dead", death_days_to, last_contact_days_to), vital_status == "Dead"))

# tcga_clinical_labeled <- subset(tcga_clinical_labeled,
# additional_drug_therapy == 'NO' & additional_immuno_therapy == 'NO' &
# additional_pharmaceutical_therapy == 'NO' & targeted_molecular_therapy ==
# 'NO' & radiation_therapy == 'NO') tcga_clinical_labeled <-
# subset(tcga_clinical_labeled, str_detect(tumor_stage, '^III'))

FBI-AMP colocalization

Let’s first stratify by Yikan’s groupings:

simple_survival_analysis(SurvObj ~ Subgroup, data = tcga_clinical_labeled)

Call:
survival::survdiff(formula = formula, data = data)

n=430, 147 observations deleted due to missingness.

                        N Observed Expected (O-E)^2/E (O-E)^2/V
Subgroup=FBI-AMP High 173       86     67.7      4.94      7.43
Subgroup=FBI-AMP Low  182       80     92.1      1.60      2.87
Subgroup=No AMP        75       43     49.2      0.77      1.03

 Chisq= 7.4  on 2 degrees of freedom, p= 0.0243 

Note: The logrank p-values in the paper are incorrect, the one’s I’ve computed are the right ones. In the paper, the death days from the dead patients were not correctly combined into the computation, so p-values were computed without the dead patients.

Immune activity

We’ll now stratify by cluster, a variable that indicates whether or not we’re above median in terms of Cytotoxicity. In other words, 1 means we’re high immune, and 0 means we’re low immune.

simple_survival_analysis(SurvObj ~ cluster, data = tcga_clinical_labeled)

Call:
survival::survdiff(formula = formula, data = data)

n=562, 15 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 280      162      148      1.38      2.79
cluster=1 282      132      146      1.39      2.79

 Chisq= 2.8  on 1 degrees of freedom, p= 0.095 

Low immune tumours trend towards poor survival but this is not significant at all.

Now let’s try by treating immune activity as a continuous variable, controlling for other covariates:

tcga_immune_continuous <- tcga_celltype_matrix %>% t %>% as.data.frame %>% rownames_to_column("tcga_sample_id")

tcga_clinical_labeled_continuous <- plyr::join(tcga_clinical_labeled, tcga_immune_continuous)

coxph(SurvObj ~ Cytotoxicity + age_at_initial_pathologic_diagnosis + tumor_grade + 
    str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy + additional_drug_therapy + 
    additional_immuno_therapy, tcga_clinical_labeled_continuous)
Call:
coxph(formula = SurvObj ~ Cytotoxicity + age_at_initial_pathologic_diagnosis + 
    tumor_grade + str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy + 
    additional_drug_therapy + additional_immuno_therapy, data = tcga_clinical_labeled_continuous)

                                              coef exp(coef)  se(coef)
Cytotoxicity                             -1.68e-01  8.45e-01  8.25e-02
age_at_initial_pathologic_diagnosis       1.95e-02  1.02e+00  5.50e-03
tumor_gradeG1                            -4.72e-01  6.24e-01  1.27e+00
tumor_gradeG2                            -2.99e-01  7.42e-01  1.06e+00
tumor_gradeG3                            -1.19e-02  9.88e-01  1.06e+00
tumor_gradeG4                             3.76e-01  1.46e+00  1.45e+00
tumor_gradeGB                             1.14e-02  1.01e+00  1.45e+00
tumor_gradeGX                             3.20e-01  1.38e+00  1.16e+00
str_extract(tumor_stage, "[IV]+")II       1.12e-01  1.12e+00  7.00e-01
str_extract(tumor_stage, "[IV]+")III      9.32e-01  2.54e+00  5.91e-01
str_extract(tumor_stage, "[IV]+")IV       1.19e+00  3.29e+00  6.07e-01
additional_chemo_therapy[Not Available]  -9.30e-03  9.91e-01  3.53e+03
additional_chemo_therapyNO               -1.14e+00  3.19e-01  3.53e+03
additional_chemo_therapyYES              -9.73e-01  3.78e-01  3.53e+03
additional_drug_therapy[Not Available]   -1.49e+01  3.44e-07  2.42e+03
additional_drug_therapy[Pending]          5.16e-01  1.68e+00  3.53e+03
additional_drug_therapyNO                 4.78e-01  1.61e+00  3.53e+03
additional_drug_therapyYES               -1.28e-02  9.87e-01  3.53e+03
additional_immuno_therapy[Not Available]  1.44e+01  1.87e+06  2.58e+03
additional_immuno_therapy[Pending]       -1.40e+01  8.52e-07  4.19e+03
additional_immuno_therapyNO              -1.95e-01  8.23e-01  3.30e-01
additional_immuno_therapyYES                    NA        NA  0.00e+00
                                             z       p
Cytotoxicity                             -2.04 0.04158
age_at_initial_pathologic_diagnosis       3.55 0.00038
tumor_gradeG1                            -0.37 0.71118
tumor_gradeG2                            -0.28 0.77908
tumor_gradeG3                            -0.01 0.99097
tumor_gradeG4                             0.26 0.79603
tumor_gradeGB                             0.01 0.99372
tumor_gradeGX                             0.28 0.78225
str_extract(tumor_stage, "[IV]+")II       0.16 0.87332
str_extract(tumor_stage, "[IV]+")III      1.58 0.11449
str_extract(tumor_stage, "[IV]+")IV       1.96 0.04961
additional_chemo_therapy[Not Available]   0.00 1.00000
additional_chemo_therapyNO                0.00 0.99974
additional_chemo_therapyYES               0.00 0.99978
additional_drug_therapy[Not Available]   -0.01 0.99509
additional_drug_therapy[Pending]          0.00 0.99988
additional_drug_therapyNO                 0.00 0.99989
additional_drug_therapyYES                0.00 1.00000
additional_immuno_therapy[Not Available]  0.01 0.99553
additional_immuno_therapy[Pending]        0.00 0.99734
additional_immuno_therapyNO              -0.59 0.55541
additional_immuno_therapyYES                NA      NA

Likelihood ratio test=46.8  on 21 df, p=0.00101
n= 559, number of events= 292 
   (18 observations deleted due to missingness)

Indeed, immune activity is significantly associated with prolonged survival (negative coefficient = fewer death events).

Stratification by foldback-AMP status

fbi_high <- subset(tcga_clinical_labeled, Subgroup == "FBI-AMP High")
fbi_low <- subset(tcga_clinical_labeled, Subgroup == "FBI-AMP Low")
fbi_no <- subset(tcga_clinical_labeled, Subgroup == "No AMP")
fbi_nothigh <- subset(tcga_clinical_labeled, Subgroup %in% c("FBI-AMP Low", 
    "No AMP"))
simple_survival_analysis(SurvObj ~ cluster, data = fbi_high)

Call:
survival::survdiff(formula = formula, data = data)

n=172, 2 observations deleted due to missingness.

           N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 90       45     48.9     0.310     0.739
cluster=1 82       40     36.1     0.419     0.739

 Chisq= 0.7  on 1 degrees of freedom, p= 0.39 
simple_survival_analysis(SurvObj ~ cluster, data = fbi_low)

Call:
survival::survdiff(formula = formula, data = data)

n=181, 5 observations deleted due to missingness.

           N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 91       42     43.5    0.0514     0.116
cluster=1 90       37     35.5    0.0630     0.116

 Chisq= 0.1  on 1 degrees of freedom, p= 0.733 
simple_survival_analysis(SurvObj ~ cluster, data = fbi_no)

Call:
survival::survdiff(formula = formula, data = data)

           N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 34       24     18.2      1.84      3.23
cluster=1 41       19     24.8      1.35      3.23

 Chisq= 3.2  on 1 degrees of freedom, p= 0.0724 

What’s curious about this is that it seems only the no AMP group gets any benefit from having a high immune response. So there’s no benefit within the foldback group of high/low immune response.

simple_survival_analysis(SurvObj ~ cluster, data = fbi_nothigh)

Call:
survival::survdiff(formula = formula, data = data)

n=256, 5 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 125       66     61.9     0.275     0.559
cluster=1 131       56     60.1     0.283     0.559

 Chisq= 0.6  on 1 degrees of freedom, p= 0.455 

Stratification by immune activity

immune_low <- subset(tcga_clinical_labeled, cluster == 0)
immune_high <- subset(tcga_clinical_labeled, cluster == 1)
simple_survival_analysis(SurvObj ~ Subgroup, data = immune_high)

Call:
survival::survdiff(formula = formula, data = data)

n=213, 72 observations deleted due to missingness.

                       N Observed Expected (O-E)^2/E (O-E)^2/V
Subgroup=FBI-AMP High 82       40     28.1     5.005     7.303
Subgroup=FBI-AMP Low  90       37     40.5     0.299     0.531
Subgroup=No AMP       41       19     27.4     2.569     3.935

 Chisq= 8.3  on 2 degrees of freedom, p= 0.0155 
simple_survival_analysis(SurvObj ~ Subgroup, data = immune_low)

Call:
survival::survdiff(formula = formula, data = data)

n=215, 70 observations deleted due to missingness.

                       N Observed Expected (O-E)^2/E (O-E)^2/V
Subgroup=FBI-AMP High 90       45     38.6     1.060     1.651
Subgroup=FBI-AMP Low  91       42     50.8     1.535     2.843
Subgroup=No AMP       34       24     21.6     0.275     0.345

 Chisq= 2.9  on 2 degrees of freedom, p= 0.236 

Likewise, the only benefit of being non-foldback is derived from the immune-high group – there’s no difference between being foldback or not if you’re in the low immune group.

Multivariate models

simple_survival_analysis(SurvObj ~ Subgroup + cluster, data = tcga_clinical_labeled)

Call:
survival::survdiff(formula = formula, data = data)

n=428, 149 observations deleted due to missingness.

                                  N Observed Expected (O-E)^2/E (O-E)^2/V
Subgroup=FBI-AMP High, cluster=0 90       45     38.4     1.118     1.380
Subgroup=FBI-AMP High, cluster=1 82       40     28.5     4.624     5.436
Subgroup=FBI-AMP Low, cluster=0  91       42     50.1     1.307     1.730
Subgroup=FBI-AMP Low, cluster=1  90       37     41.1     0.415     0.523
Subgroup=No AMP, cluster=0       34       24     21.0     0.444     0.496
Subgroup=No AMP, cluster=1       41       19     27.9     2.819     3.331

 Chisq= 10.9  on 5 degrees of freedom, p= 0.0525 

We’ll next combined FBI-AMP low and No AMP into a single category called FBI-NotHigh.

tcga_clinical_labeled$newgroup <- ifelse(tcga_clinical_labeled$Subgroup == "FBI-AMP High", 
    "FBI-High", "FBI-NotHigh")

simple_survival_analysis(SurvObj ~ newgroup + cluster, data = tcga_clinical_labeled)

Call:
survival::survdiff(formula = formula, data = data)

n=428, 149 observations deleted due to missingness.

                                  N Observed Expected (O-E)^2/E (O-E)^2/V
newgroup=FBI-High, cluster=0     90       45     38.4     1.118     1.380
newgroup=FBI-High, cluster=1     82       40     28.5     4.624     5.436
newgroup=FBI-NotHigh, cluster=0 125       66     71.0     0.358     0.549
newgroup=FBI-NotHigh, cluster=1 131       56     69.0     2.448     3.686

 Chisq= 8.7  on 3 degrees of freedom, p= 0.0337 

Let’s also do Cox proportional hazards models:

mod1 <- coxph(SurvObj ~ Cytotoxicity + Subgroup + age_at_initial_pathologic_diagnosis + 
    tumor_grade + str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy + 
    additional_drug_therapy + additional_immuno_therapy, tcga_clinical_labeled_continuous)
anova(mod1)
mod2 <- coxph(SurvObj ~ Cytotoxicity + Subgroup + age_at_initial_pathologic_diagnosis, 
    tcga_clinical_labeled_continuous)
anova(mod2)
mod3 <- coxph(SurvObj ~ Cytotoxicity + (Subgroup == "FBI-AMP High") + age_at_initial_pathologic_diagnosis, 
    subset(tcga_clinical_labeled_continuous, !is.na(Subgroup)))
anova(mod3)

So the effects of immune activity and FBI/HRD status are collinear.

Final run

Instead of overwriting the previous, this is its own section since the underlying implementation has changed substantially.

variant_table <- read_variant_file(master_variant_file, db_path)
breakpoint_table <- read_variant_file(master_breakpoint_file, db_path)

sig_results <- read_signature_files(mmctm_final_patient_dir, variant_table, 
    breakpoint_table, db_path)

sig_results_snv <- subset(sig_results$snv, is_present == 1)
sig_results_sv <- subset(sig_results$sv, is_present == 1)
sig_results_nonith <- sig_results$nonith %>% as.data.frame %>% column_to_rownames("signature") %>% 
    t %>% as.data.frame %>% rownames_to_column("condensed_id")

signature_labels <- str_extract(c(colnames(sig_results_snv), colnames(sig_results_sv)), 
    "SN?V-[0-9]+")
signature_labels <- signature_labels[!is.na(signature_labels)]
summarize_signature_proportions <- function(x, by = c("condensed_id", "patient_id"), 
    signature_labels, report_count = FALSE) {
    props <- subset(x, select = c(by, colnames(x)[colnames(x) %in% signature_labels])) %>% 
        group_by_(.dots = by) %>% summarise_each(funs(sum))
    
    if (report_count) {
        counts <- subset(x, select = c(by, colnames(x)[colnames(x) %in% signature_labels])) %>% 
            group_by_(.dots = by) %>% summarise(n = n())
        props <- plyr::join(props %>% as.data.frame, counts %>% as.data.frame)
    }
    sums <- rowSums(subset(props, select = colnames(props[colnames(props) %in% 
        signature_labels])))
    sigs <- intersect(signature_labels, colnames(props))
    for (sig in sigs) {
        props[, sig] <- props[, sig]/sums
    }
    return(props)
}

Signatures

Sample signature proportions

These proportions won’t be exactly equal to the topic proportions that the model outputs (they are variational estimates), but we’re actually doing pretty darn well.

TODO: Make a QC plot. From a glance it seems that proportion error is usually within 1-5% relative error.

sample_props_snv <- summarize_signature_proportions(sig_results_snv, by = c("condensed_id", 
    "patient_id"), signature_labels)
sample_props_sv <- summarize_signature_proportions(sig_results_sv, by = c("condensed_id", 
    "patient_id"), signature_labels)

patient_props_snv <- sig_results_snv %>% subset(select = c("patient_id", "chrom", 
    "coord", "ref", "alt", intersect(signature_labels, colnames(sig_results_snv)))) %>% 
    unique %>% summarize_signature_proportions(by = c("patient_id"), signature_labels)
patient_props_sv <- sig_results_sv %>% subset(select = c("patient_id", "prediction_id", 
    intersect(signature_labels, colnames(sig_results_sv)))) %>% unique %>% summarize_signature_proportions(by = c("patient_id"), 
    signature_labels)
sample_props <- plyr::join(sample_props_snv %>% as.data.frame, sample_props_sv %>% 
    as.data.frame)
sample_props <- rbind.fill(sample_props, sig_results_nonith)

sample_props_mat <- sample_props %>% column_to_rownames("condensed_id") %>% 
    subset(select = c(colnames(sample_props)[colnames(sample_props) %in% signature_labels]))

sample_props_mat_scaled <- scale(sample_props_mat)
# [,c('SNV-2', 'SNV-5', 'SV-3', 'SV-6', 'SV-8', 'SV-1', 'SV-7', 'SV-5',
# 'SNV-3')]
sample_props_heat <- pheatmap(sample_props_mat_scaled, fontsize_row = 5, clustering_method = "ward.D2")

patient_props <- plyr::join(patient_props_snv %>% as.data.frame, patient_props_sv %>% 
    as.data.frame)
patient_props <- rbind.fill(patient_props, sig_results_nonith %>% plyr::rename(c(condensed_id = "patient_id")))

patient_props_mat <- patient_props %>% column_to_rownames("patient_id") %>% 
    subset(select = c(colnames(patient_props)[colnames(patient_props) %in% signature_labels]))

patient_props_mat_scaled <- scale(patient_props_mat)
# [,c('SNV-2', 'SNV-5', 'SV-3', 'SV-6', 'SV-8', 'SV-1', 'SV-7', 'SV-5',
# 'SNV-3')]
patient_props_heat <- pheatmap(patient_props_mat_scaled, fontsize_row = 5, clustering_method = "ward.D2", 
    clustering_distance_rows = "correlation")

ITH cohort

sample_clusts <- as.data.frame(cutree(sample_props_heat$tree_row, 4)) %>% setNames(c("cluster")) %>% 
    rownames_to_column("condensed_id")

ihc_stabilize_subset <- stabilize_ihc_variances(ihc_table_slide, ihc_table, 
    tiltypes)

data_matrices <- list(sample_clusts, ihc_stabilize_subset, xcr_diversity, celltype_df, 
    pathway_df, proportion_subclonality_subset)
data_matrices <- lapply(data_matrices, function(x) {
    x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "full"), data_matrices)
combined_df <- subset(combined_df, !is.na(cluster) & !condensed_id %in% c("7_BrnM"))
combined_df <- subset(combined_df, condensed_id %in% subset(samples, project_code == 
    "ITH")$condensed_id)

combined_mat <- subset(combined_df, select = -c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
# ref_dendrogram <- prune(as.dendrogram(sample_heat$tree_row), '7_BrnM')

sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select = c(condensed_id, cluster))

SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(sample_props_mat[, SIGS], check.names = FALSE), 
    var = "condensed_id")

## Take logarithms to ~variance stabilize
variant_count_annotations <- variant_sample %>% group_by_(.dots = "condensed_id") %>% 
    summarise(snv_count = log(sum(snv_count)), bkpt_count = log(sum(bkpt_count))) %>% 
    subset(select = c("condensed_id", "snv_count", "bkpt_count"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, molsubtypes, 
    sig_annotations, variant_count_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")
row_annotations_ith <- row_annotations

combined_mat_scaled <- scale(combined_mat)

pheatmap(clip_values(combined_mat_scaled, 2, -2)[sample_order, ], cluster_rows = FALSE, 
    cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6)

combined_mat_scaled_ith <- combined_mat_scaled

ICGC validation

icgc_fbi_status <- fread(wang_icgc_fbi_status_file)
colnames(icgc_fbi_status) <- mapvalues(colnames(icgc_fbi_status), from = c("Case_ID"), 
    to = c("condensed_id"))
colnames(icgc_immune) <- mapvalues(colnames(icgc_immune), "new_id", "condensed_id")

data_matrices <- list(sample_clusts, icgc_immune)
data_matrices <- lapply(data_matrices, function(x) {
    x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "inner"), data_matrices)
# combined_df <- subset(combined_df, cluster == 1)

combined_mat <- subset(combined_df, select = -c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select = c(condensed_id, cluster))

SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(sample_props_mat[, SIGS], check.names = FALSE), 
    var = "condensed_id")

subtype_annotations <- subset(icgc_subtypes, select = c("icgc_donor_id", "subtype", 
    "nmf_subtype"))
colnames(subtype_annotations)[1] <- "condensed_id"

icgc_fbi_annotations <- subset(icgc_fbi_status, select = c("condensed_id", "Patch et al. Class", 
    "Patch et al. Molecular Subgroup", "BRCA.status", "Subgroup"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations, 
    subtype_annotations, icgc_fbi_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")
row_annotations_noith <- row_annotations

# select_rows <- rownames(subset(row_annotations, `SV-4` < 1))
notx <- subset(specimen_data, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type == 
    "no treatment")$icgc_donor_id
# sample_order <- intersect(sample_order, select_rows)
sample_order <- intersect(sample_order, notx)

# order_fbi <- order(row_annotations[rownames(combined_mat_scaled),]$`SV-8`)

# combined_mat_scaled <- scale(combined_mat)
combined_mat_scaled <- as.data.frame(apply(combined_mat, 2, function(x) (x - 
    median(x, na.rm = TRUE))/mad(x, na.rm = TRUE)))

combined_mat_scaled <- combined_mat_scaled[sample_order, ]
other_mat <- sample_props_mat_scaled[sample_order, ]

pheatmap(clip_values(cbind(combined_mat_scaled, other_mat), 2, -2), cluster_rows = FALSE, 
    cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation", 
    clustering_method = "ward.D2")

combined_mat_scaled_noith <- combined_mat_scaled
icgc_clinical_labeled <- Reduce(function(x, y) plyr::join(x, y, type = "full"), 
    list(setNames(cluster_annotations, c("icgc_donor_id", "cluster")), icgc_clinical, 
        setNames(subset(icgc_fbi_annotations, select = c("condensed_id", "BRCA.status", 
            "Subgroup")), c("icgc_donor_id", "BRCA.status", "Wang_subgroup"))))

icgc_clinical_labeled$SurvObj <- with(icgc_clinical_labeled, Surv(donor_survival_time, 
    donor_vital_status == "deceased"))

Survival by FBI status …

simple_survival_analysis(SurvObj ~ cluster != 1, data = subset(icgc_clinical_labeled, 
    cluster != 4))

Call:
survival::survdiff(formula = formula, data = data)

                    N Observed Expected (O-E)^2/E (O-E)^2/V
cluster != 1=FALSE 28       26     28.9     0.298     0.627
cluster != 1=TRUE  36       32     29.1     0.297     0.627

 Chisq= 0.6  on 1 degrees of freedom, p= 0.428 
simple_survival_analysis(SurvObj ~ cluster, data = subset(icgc_clinical_labeled, 
    cluster != 4))

Call:
survival::survdiff(formula = formula, data = data)

           N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=1 28       26    28.94     0.298     0.627
cluster=2 25       23    21.32     0.133     0.223
cluster=3 11        9     7.75     0.203     0.249

 Chisq= 0.7  on 2 degrees of freedom, p= 0.716 
simple_survival_analysis(SurvObj ~ Wang_subgroup, data = icgc_clinical_labeled)

Call:
survival::survdiff(formula = formula, data = data)

n=66, 19 observations deleted due to missingness.

                        N Observed Expected (O-E)^2/E (O-E)^2/V
Wang_subgroup=High FBI 36       35     28.6      1.45      2.89
Wang_subgroup=Low FBI  30       25     31.4      1.32      2.89

 Chisq= 2.9  on 1 degrees of freedom, p= 0.0889 

Combined

Sample-level
combined_mat_scaled_all <- rbind.fill(combined_mat_scaled_ith %>% as.data.frame %>% 
    rownames_to_column("condensed_id"), combined_mat_scaled_noith %>% as.data.frame %>% 
    rownames_to_column("condensed_id")) %>% column_to_rownames("condensed_id")

row_annotations_all <- rbind.fill(row_annotations_ith %>% rownames_to_column("condensed_id"), 
    row_annotations_noith %>% rownames_to_column("condensed_id")) %>% column_to_rownames("condensed_id")

sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat_scaled_all))

# combined_mat_scaled <- as.data.frame(apply(combined_mat_all, 2,
# function(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)))
combined_mat_scaled <- combined_mat_scaled_all[sample_order, ]

nanostring_vars <- rownames(icgc_celltype_matrix)
combined_mat_scaled_subset <- subset(combined_mat_scaled, select = nanostring_vars)
combined_mat_scaled_subset <- combined_mat_scaled_subset[!apply(combined_mat_scaled_subset, 
    1, function(x) all(is.na(x))), ]
pheatmap(clip_values(combined_mat_scaled_subset, 2, -2), cluster_rows = FALSE, 
    cluster_cols = TRUE, annotation_row = row_annotations_all, fontsize = 6, 
    clustering_distance_cols = "correlation", clustering_method = "ward.D2")

Doesn’t really cluster consistently between ICGC and our samples.

TODO: Batch correct ICGC and our data together (I think I already did this somewhere) – and normalize the final matrix in one step rather than normalizing separately for each subcohort and combining those together. Otherwise we could always be subject to the scenario where the ITH cohort may be skewed in immune response (either all relatively low or high) compared to the ICGC cohort.

TODO: Use absolute counts of mutations for each signature, and cluster based on those. It may be that samples with too low/high mutation counts are not clustering properly.

Patient-level
# ith_icgc_batch_corrected_expression_file <-
# '~/shahlab/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv'

ith_icgc_expression <- fread(ith_icgc_batch_corrected_expression_file)

ith_icgc_celltype_expr <- create_celltype_matrix(ith_icgc_expression, labels, 
    db_path, convert_ids = FALSE)
ith_icgc_pathway_expr <- create_pathway_matrix(ith_icgc_expression, labels, 
    db_path, convert_ids = FALSE)

icgc_specimen_data <- fread(icgc_specimen_file)
summarize_expression_by_patient <- function(expr) {
    primary_specimen_dat <- subset(icgc_specimen_data, str_detect(specimen_type, 
        "Primary"))
    
    notx <- subset(primary_specimen_dat, str_detect(specimen_type, "Primary") & 
        specimen_donor_treatment_type == "no treatment")$icgc_donor_id
    primary_specimen_dat <- subset(primary_specimen_dat, icgc_donor_id %in% 
        notx)
    
    x <- setNames(melt(as.matrix(expr)), c("Name", "sample", "expr"))
    x$patient_id <- map_id(as.character(x$sample), from = "condensed_id", to = "patient_id", 
        db_path)
    idx <- is.na(x$patient_id)
    donor_labels <- df_as_map(primary_specimen_dat, as.character(x$sample[idx]), 
        from = "icgc_specimen_id", to = "icgc_donor_id")
    x$patient_id[idx] <- donor_labels
    x <- subset(x, !is.na(patient_id))
    x_sum <- x %>% group_by(Name, patient_id) %>% summarise(expr = mean(expr, 
        na.rm = TRUE))
    
    res <- dcast(x_sum, formula = Name ~ patient_id, value.var = "expr")
    return(res)
}
NCLUST <- 3
clusters <- cutree(patient_props_heat$tree_row, NCLUST)
patient_clusts <- make_cluster_frame(clusters)

cluster_annotations <- subset(plyr::rename(patient_clusts, c(new_id = "patient_id")), 
    select = c(patient_id, cluster))

ith_icgc_celltype_expr_patient <- summarize_expression_by_patient(ith_icgc_celltype_expr)
ith_icgc_pathway_expr_patient <- summarize_expression_by_patient(ith_icgc_pathway_expr)

expr_dat <- ith_icgc_pathway_expr_patient
combined_patient_mat <- expr_dat %>% as.data.frame %>% column_to_rownames(var = "Name") %>% 
    t %>% as.data.frame %>% rownames_to_column("patient_id")

data_matrices <- list(cluster_annotations, combined_patient_mat)
data_matrices <- lapply(data_matrices, function(x) {
    x
})

combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "inner"), data_matrices)
combined_mat <- subset(combined_df, select = -c(cluster))
rownames(combined_mat) <- NULL
combined_mat <- combined_mat %>% column_to_rownames("patient_id")

SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(patient_props_mat[, SIGS], 
    check.names = FALSE), var = "patient_id")

subtype_annotations <- subset(icgc_subtypes, select = c("icgc_donor_id", "subtype", 
    "nmf_subtype"))
colnames(subtype_annotations)[1] <- "patient_id"

icgc_fbi_status_patient <- fread(wang_icgc_fbi_status_file)
colnames(icgc_fbi_status_patient) <- mapvalues(colnames(icgc_fbi_status_patient), 
    from = c("Case_ID"), to = c("patient_id"))

icgc_fbi_annotations <- subset(icgc_fbi_status_patient, select = c("patient_id", 
    "Patch et al. Class", "Patch et al. Molecular Subgroup", "BRCA.status", 
    "Subgroup"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations, 
    subtype_annotations, icgc_fbi_annotations))

row_annotations <- row_annotations %>% column_to_rownames("patient_id")

combined_patient_mat_scaled <- combined_mat %>% scale

patient_order <- patient_props_heat$tree_row$labels[patient_props_heat$tree_row$order]
patient_order <- intersect(patient_order, rownames(combined_patient_mat_scaled))
pheatmap(clip_values(combined_patient_mat_scaled[patient_order, ], 2, -2), cluster_rows = FALSE, 
    cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation", 
    clustering_method = "ward.D2")

Ancestral-descendant signature proportions

ancdesc_props_snv <- unique(subset(sig_results_snv, select = c("patient_id", 
    "is_ancestral", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% 
        colnames(sig_results_snv)]))) %>% summarize_signature_proportions(by = c("patient_id", 
    "is_ancestral"), signature_labels)
ancdesc_props_sv <- unique(subset(sig_results_sv, select = c("patient_id", "prediction_id", 
    "is_ancestral", signature_labels[signature_labels %in% colnames(sig_results_sv)]))) %>% 
    summarize_signature_proportions(by = c("patient_id", "is_ancestral"), signature_labels)

ancdesc_props <- plyr::join(ancdesc_props_snv %>% as.data.frame, ancdesc_props_sv %>% 
    as.data.frame)

ancdesc_props_mat <- subset(ancdesc_props, select = colnames(ancdesc_props)[colnames(ancdesc_props) %in% 
    signature_labels])
ancdesc_props_scaled <- scale(ancdesc_props_mat)
rownames(ancdesc_props_scaled) <- with(ancdesc_props, paste(patient_id, is_ancestral, 
    sep = "_"))

pheatmap(ancdesc_props_scaled, clustering_distance_rows = "euclidean", clustering_method = "ward.D2")

ancdesc_props_melted <- melt(ancdesc_props, id.vars = c("patient_id", "is_ancestral"), 
    measure.vars = intersect(colnames(ancdesc_props), signature_labels), variable.name = "signature", 
    value.name = "proportion")

pvals <- setNames(ddply(ancdesc_props_melted, .(signature), function(x) {
    df <- as.data.frame(x)
    corres <- wilcox.test(proportion ~ is_ancestral, df, paired = TRUE)
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(ancdesc_props_melted, aes(x = factor(is_ancestral), y = proportion)) + 
    geom_boxplot() + facet_wrap(~signature, scales = "free") + geom_text(data = pvals, 
    aes(x = Inf, y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, 
    parse = TRUE) + theme_bw() + theme_Publication()

Origin node signature proportions

Note: We cannot get node-specific rearrangement signatures.

Note: These node labels DO NOT correspond to node labels within the clone phylogeny; they correspond to nodes within the Dollo model.

node_props_snv <- unique(subset(sig_results_snv, select = c("patient_id", "origin_node", 
    "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv)]))) %>% 
    summarize_signature_proportions(by = c("patient_id", "origin_node"), signature_labels, 
        report_count = TRUE)
node_props_mat <- subset(node_props_snv, select = colnames(node_props_snv)[colnames(node_props_snv) %in% 
    signature_labels])
node_props_scaled <- scale(node_props_mat)
rownames(node_props_scaled) <- with(node_props_snv, paste(patient_id, origin_node, 
    sep = "_"))

row_annotations <- subset(node_props_snv, select = c("patient_id", "n"))
row_annotations$n <- log2(row_annotations$n)
row_annotations$patient_id <- factor_id(row_annotations$patient_id, type = "patient_id", 
    db_path)
rownames(row_annotations) <- rownames(node_props_scaled)

clustering_colours <- list(patient_id = pal_patient)

pheatmap(clip_values(node_props_scaled, 2, -2), clustering_distance_rows = "euclidean", 
    clustering_method = "ward.D2", fontsize_row = 5, annotation_row = row_annotations, 
    annotation_colors = clustering_colours)

Looks like there are similar selection pressures acting at each part of the sample phylogeny – i.e. mutation signatures are ‘relatively’ consistent within patients throughout time.

Clonal phylogeny branch-specific signature proportions

NOTE: THE LABELS ON THESE TREES MAY NOT BE THE SAME AS THOSE IN THE MAPSCAPES (ugh).

tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, 
    clone_prevalence_file, db_path)
trees <- lapply(tree_branch_data, function(x) x$tree)
branch_lengths <- rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))
snv_cluster <- rbind.fill(lapply(seq_along(snv_cluster_files), function(i) {
    f <- snv_cluster_files[[i]]
    patient_id <- snv_cluster_patients[[i]]
    snv_cluster <- read_snv_cluster(f, clone_branch_length_file, db_path)
    return(snv_cluster)
}))

snv_cluster <- plyr::join(snv_cluster, branch_lengths)
snv_cluster_filtered <- subset(snv_cluster, !is.na(label))

sig_results_snv_cluster <- plyr::join(sig_results_snv, snv_cluster_filtered)
branch_props_snv <- unique(subset(sig_results_snv_cluster, select = c("patient_id", 
    "label", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% 
        colnames(sig_results_snv_cluster)]))) %>% summarize_signature_proportions(by = c("patient_id", 
    "label"), signature_labels, report_count = TRUE)
branch_props_snv_melted <- melt(branch_props_snv, id.vars = c("patient_id", 
    "label", "n"), measure.vars = intersect(signature_labels, colnames(branch_props_snv)), 
    variable.name = "signature", value.name = "proportion")
ggplot(branch_props_snv_melted, aes(x = label, y = proportion)) + geom_bar(aes(fill = signature), 
    stat = "identity") + facet_wrap(~patient_id, ncol = 1, scales = "free_x") + 
    theme_bw() + theme_Publication() + geom_text(data = unique(subset(branch_props_snv_melted, 
    select = c("patient_id", "label", "n"))), aes(x = label, y = 1, label = n), 
    vjust = -0.2, stat = "identity") + ylim(c(0, 1.2))

What if we use MAP assignments? (This would allow us to apply chi-square tests.)

TODO: Figure out how to apply tests between k > 2 groups of proportions… in other words, a test of homogeneity.

id_vars <- colnames(sig_results_snv_cluster)[!colnames(sig_results_snv_cluster) %in% 
    signature_labels]

sig_results_snv_cluster_melted <- melt(sig_results_snv_cluster, id.vars = id_vars, 
    measure.vars = intersect(signature_labels, colnames(sig_results_snv_cluster)), 
    variable.name = "signature", value.name = "proportion")
maxes <- sig_results_snv_cluster_melted %>% group_by_(.dots = id_vars) %>% summarise(maxprop = max(proportion))
sig_results_snv_cluster_melted <- plyr::join(sig_results_snv_cluster_melted, 
    maxes)
sig_results_snv_cluster_melted$proportion_map <- ifelse(sig_results_snv_cluster_melted$proportion == 
    sig_results_snv_cluster_melted$maxprop, 1, 0)

sig_results_snv_cluster_casted <- dcast(sig_results_snv_cluster_melted, formula = paste0(paste(id_vars, 
    collapse = "+"), "~ signature"), value.var = "proportion_map")

branch_props_snv_map <- unique(subset(sig_results_snv_cluster_casted, select = c("patient_id", 
    "label", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% 
        colnames(sig_results_snv_cluster_casted)]))) %>% summarize_signature_proportions(by = c("patient_id", 
    "label"), signature_labels, report_count = TRUE)

branch_props_snv_map_melted <- melt(branch_props_snv_map, id.vars = c("patient_id", 
    "label", "n"), measure.vars = intersect(signature_labels, colnames(branch_props_snv_map)), 
    variable.name = "signature", value.name = "proportion")
ggplot(branch_props_snv_map_melted, aes(x = label, y = proportion)) + geom_bar(aes(fill = signature), 
    stat = "identity") + facet_wrap(~patient_id, ncol = 1, scales = "free_x") + 
    theme_bw() + theme_Publication() + geom_text(data = unique(subset(branch_props_snv_map_melted, 
    select = c("patient_id", "label", "n"))), aes(x = label, y = 1, label = n), 
    vjust = -0.2, stat = "identity") + ylim(c(0, 1.2))

Adjusted clone trees

trees_age <- trees
age_signature <- "SNV-5"

branch_props_dat <- branch_props_snv_map_melted

tree_objects <- lapply(seq_along(trees_age), function(i) {
    tree <- trees_age[[i]]
    tree_old <- tree
    patient <- names(trees_age)[i]
    branch_props_dat_sub <- subset(branch_props_dat, patient_id == as.numeric(patient) & 
        signature == age_signature)
    branch_props_dat_sub$length <- with(branch_props_dat_sub, proportion * n)
    
    edge_lengths <- tree@edge.length
    idx <- which(edge_lengths == 0)
    to_labels <- tree@label[str_extract(names(edge_lengths), "[0-9]+$")]
    lengths <- df_as_map(branch_props_dat_sub, unname(to_labels), from = "label", 
        to = "length")
    if (length(idx) > 0) {
        lengths[idx] <- 0
    }
    tree@edge.length <- lengths
    names(tree@edge.length) <- names(edge_lengths)
    
    return(list(all = tree_old, age = tree))
})
names(tree_objects) <- names(trees_age)
tmp <- unlist(tree_objects)
# ignore <- lapply(seq_along(tmp), function(i) { print(plot(tmp[[i]],
# show.node.label = TRUE, main = names(tmp)[i])) })
find_ancestors <- function(tree) {
    x <- phylobase:::.phylo4ToDataFrame(tree)
    root <- subset(x, node.type == "root")$label
    root_number <- names(which(tree@label == root))
    direct_descendants <- subset(x, ancestor == as.numeric(root_number))$label
    if (length(direct_descendants) == 1) {
        return(c(root, direct_descendants))
    } else {
        return(root)
    }
}
patients <- unique(branch_props_dat$patient_id)
root_data <- rbind.fill(lapply(patients, function(pat) {
    tree <- trees[[as.character(pat)]]
    ancestors <- find_ancestors(tree)
    rbind.fill(lapply(ancestors, function(x) {
        data.frame(label = x, patient_id = pat)
    }))
}))
root_data$node_type <- "root"
branch_data_annotated <- plyr::join(branch_props_dat, root_data)
branch_data_annotated$node_type[is.na(branch_data_annotated$node_type)] <- "descendant"

root_proportions <- branch_data_annotated %>% subset(node_type == "root") %>% 
    group_by(patient_id, signature) %>% summarise(proportion = weighted.mean(proportion, 
    w = n)) %>% plyr::rename(c(proportion = "root_proportion"))

branch_data_annotated <- plyr::join(branch_data_annotated, root_proportions)
branch_data_annotated$proportion_diff <- with(branch_data_annotated, proportion - 
    root_proportion)
ggplot(branch_data_annotated %>% subset(node_type == "descendant" & n > 40), 
    aes(x = proportion_diff, fill = signature)) + geom_histogram(binwidth = 0.02, 
    alpha = 0.4, position = "identity") + theme_bw() + theme_Publication()

---
title: "Mutation signatures"
output: 
  html_notebook:
    toc: true
    toc_depth: 5
    toc_float: true
params:
  rmd: "mutation_signatures.Rmd"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_final_patient_sigplot.png', '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_sample_sigplot.png', '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_ov_combined_sigplot.png', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_sample_ad_sigplot.png', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_1.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_2.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_3.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_4.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_7.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_9.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_10.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_11.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_12.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_13.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_14.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_15.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_16.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_17.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', 'Rmd/mutation_signatures.Rmd', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table_slide.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', '/shahlab/alzhang/data/ICGC/specimen.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_patient_ad_sigplot.png', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv', "tcga_clinical" = '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', "master_breakpoint_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "mmctm_patient_ad_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', "mmctm_sample_ad_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "master_variant_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "icgc_expr_melted" = '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', "mmctm_final_patient_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_final_patient_sigplot.png', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "mmctm_sample_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_sample_sigplot.png', "icgc_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "mmctm_ov_combined_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_ov_combined_sigplot.png', "mmctm_sample_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_sample_ad_sigplot.png', "mmctm_ov_combined_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', "subclonality" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "snv_cluster_files" = c('/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_1.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_2.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_3.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_4.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_7.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_9.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_10.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_11.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_12.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_13.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_14.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_15.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_16.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_17.tsv'), "ith_icgc_bc" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', "molecular_subtypes" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "notebook" = 'Rmd/mutation_signatures.Rmd', "ihc_table_slide" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table_slide.tsv', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "mmctm_sample_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', "icgc_specimen" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "icgc_expr_mat" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "mmctm_patient_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_patient_ad_sigplot.png', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "wang_fbi_status" = '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', "tcga_ov_annotation" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "icgc_clinical" = '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "tcga_expr_mat" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/web/mutation_signatures.nb.html'),
    params = list(c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), 'ithi-analysis-mutation-signature-notebook', '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', c('1', '2', '3', '4', '7', '9', '10', '11', '12', '13', '14', '15', '16', '17'), "mutsig_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "name" = 'ithi-analysis-mutation-signature-notebook', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "snv_cluster_patients" = c('1', '2', '3', '4', '7', '9', '10', '11', '12', '13', '14', '15', '16', '17')),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperanalysis2/mutation_signatures.log'),
    resources = list(),
    config = list("notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web', "nclusts" = 3, "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2', "mmctm_ov_combined_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/plots/ov_snv-sv_sigs_multipanel.pdf', "mvclust_nclust" = 3, "mutation_signature_notebook" = 'Rmd/mutation_signatures.Rmd', "example_msa_plot" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/old/alignment_plots/msa/ith2_2/clust9/indel_reversed.html', "default_sampler" = 'HMC', "mmctm_patient_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', "bcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/IGH_clonotypes_filtered.txt', "master_variant_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "icgc_expr_melted" = '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', "spatial_notebook" = 'Rmd/spatial_analysis.Rmd', "tcga_clinical" = '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "ith_statistics_notebook" = 'Rmd/ith_statistics.Rmd', "prevalence_threshold" = 0.01, "til_classifier_notebook" = 'Rmd/til_classifier.Rmd', "example_annotations" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/final_partitions/ith2_2/clust9/annotations_flagged.tsv', "variability_type" = 'stabilize', "immune_variability_notebook" = 'Rmd/immune_variability.Rmd', "immtyper_models" = '/shahlab/alzhang/projects/ITH_Immune/results/immtyper_results/klarenbeek/aa_vj/gradboost', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperanalysis2', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "xcr_mapping_notebook" = 'Rmd/xcr_mapping.Rmd', "ihc_run1" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd8cd3cd20/validated_stats_weighted_new.rdata', "phenotype_threshold" = 0.85, "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run4', "tcga_ov_annotations" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "logdir" = '/shahlab/alzhang/clusttmp/paperanalysis2', "xcr_distance_method" = 'horn', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "icgc_molecular_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "nanostring_signature_notebook" = 'Rmd/nanostring_signatures.Rmd', "intermediate_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2', "PNG_DENSITY" = 300, "bcrphylo_correlations_notebook" = 'Rmd/bcr_phylo_correlations.Rmd', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "mmctm_sample_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/plots/ith-by-sample_snv-sv_sigs_multipanel.pdf', "ith_til_notebook" = 'Rmd/ith_til_densities.Rmd', "immtyper_lengths" = '11 12 13 14 15 16 17 18', "xcr_clones_notebook" = 'Rmd/xcr_clones_analysis.Rmd', "neoantigen_editing_notebook" = 'Rmd/immunoediting.Rmd', "library_sizes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/library_sizes.tsv', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "driver_analysis_notebook" = 'Rmd/driver_analysis.Rmd', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "v_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBV.fasta', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clone_data.tsv', "ihc_xcr_stats_notebook" = 'Rmd/ihc_xcr_stats.Rmd', "mmctm_sample_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', "mmctm_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', "multiviewclustering_notebook" = 'Rmd/multiviewclustering.Rmd', "clonal_samplers" = c('HMC', 'NUTS'), "ith_project_results" = '/shahlab/alzhang/projects/ith3/data/results', "master_breakpoint_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "bcrphylo_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "PNG_QUALITY" = 300, "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/tree_data.tsv', "mmctm_final_patient_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov/plots/ith-by-patient_snv-sv_sigs_multipanel.pdf', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "mmctm_sample_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/plots/ith-by-ancestral-sample_snv-sv_sigs_multipanel.pdf', "sad_notebook" = 'Rmd/species_abundance_distributions.Rmd', "xcr_qc_notebook" = 'Rmd/replicates.Rmd', "mvclust_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "mutsig_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "icgc_normalized_reads_matrix" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "xcrmapscape_notebook" = 'Rmd/xcrmapscape.Rmd', "known_subtype_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "tcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/TRB_clonotypes_filtered.txt', "classifier_type" = 'knn', "tcga_expr_matrix" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv', "figure_gallery_notebook" = 'Rmd/figures.Rmd', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/branch_data.tsv', "wang_fbi_status" = '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', "icgc_clinical" = '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', "ihc_run2" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd79cd138cd68/validated_stats_weighted.rdata', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "icgc_specimen_file" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "proportion_subclonal_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "molsubtype_notebook" = 'Rmd/molecular_subtypes.Rmd', "driver_map" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/drivers/data/gene_list_mapped.bed', "j_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBJ.fasta', "ihc_xcr_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "subtype_marker_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/subtype_markers.tsv', "mmctm_ov_combined_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', "ith_stats_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', "molsubtype_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "tils_for_variability" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "spatial_result_dirs" = list("epithelial" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith3/abc', "stromal" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith5/abc'), "known_subtypes_merged" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/known_subtypes_merged.tsv', "mmctm_patient_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/plots/ith-by-patient-ancestry_snv-sv_sigs_multipanel.pdf', "index_notebook" = 'Rmd/index.Rmd', "mapscape_notebook" = 'Rmd/mapscape.Rmd', "bcrphylo_examples_notebook" = 'Rmd/bcr_phylo_examples.Rmd'),
    rule = 'mutation_signature_notebook'
)
######## Original script #########

                        ```


```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE)
```


```{r}
library(ithi.utils)
load_base_libs()
library(survival)
library(rms)
library(edgeR)
library(limma)
library(gage)
library(pathview)
library(DT)
library(ape)
library(phytools)
library(phylobase)

library(ithi.meta)
library(ithi.xcr)
library(ithi.ihc)
library(ithi.seq)
library(ithi.clones)
library(ithi.expr)
library(ithi.external)
```

## Colour palettes

```{r}
pal_patient <- select_palette("patient")
```

## Parameters

```{r}
db_path <- snakemake@params$db

mmctm_sample_result_dir <- snakemake@input$mmctm_sample_dir
mmctm_sample_ad_result_dir <- snakemake@input$mmctm_sample_ad_dir
mmctm_patient_ad_result_dir <- snakemake@input$mmctm_patient_ad_dir
mmctm_ov_combined_result_dir <- snakemake@input$mmctm_ov_combined_dir
mmctm_final_patient_dir <- snakemake@input$mmctm_final_patient_dir

mmctm_sample_sigplot <- snakemake@input$mmctm_sample_sigplot
mmctm_sample_ad_sigplot <- snakemake@input$mmctm_sample_ad_sigplot
mmctm_patient_ad_sigplot <- snakemake@input$mmctm_patient_ad_sigplot
mmctm_ov_combined_sigplot <- snakemake@input$mmctm_ov_combined_sigplot
mmctm_final_patient_sigplot <- snakemake@input$mmctm_final_patient_sigplot

master_variant_file <- snakemake@input$master_variant_file
master_breakpoint_file <- snakemake@input$master_breakpoint_file

ihc_table_path <- snakemake@input$ihc_table
ihc_table_slide_path <- snakemake@input$ihc_table_slide
tiltypes <- snakemake@params$mutsig_tiltypes

tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity

nanostring_data_path <- snakemake@input$nanostring_data
nanostring_annotations_path <- snakemake@input$nanostring_annotations

molecular_subtype_file <- snakemake@input$molecular_subtypes

icgc_expr_mat_file <- snakemake@input$icgc_expr_mat
icgc_specimen_file <- snakemake@input$icgc_specimen
icgc_subtype_file <- snakemake@input$icgc_subtypes
icgc_expr_raw_file <- snakemake@input$icgc_expr_melted
icgc_donor_file <- snakemake@input$icgc_clinical

tcga_expr_mat_file <- snakemake@input$tcga_expr_mat
tcga_ov_annotation_file <- snakemake@input$tcga_ov_annotation
tcga_donor_file <- snakemake@input$tcga_clinical

proportion_subclonality_file <- snakemake@input$subclonality

wang_icgc_fbi_status_file <- snakemake@input$wang_fbi_status

snv_cluster_files <- snakemake@input$snv_cluster_files
snv_cluster_patients <- snakemake@params$snv_cluster_patients

clone_tree_file <- snakemake@input$clone_tree_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file

ith_icgc_batch_corrected_expression_file <- snakemake@input$ith_icgc_bc
```

## Metadata

```{r}
db <- src_sqlite(db_path, create=FALSE)
samples <- collect(tbl(db, "samples"))
```

```{r}
read_mutsig_output <- function(dir, sample_table, external=FALSE) {
  prop_table_path <- Sys.glob(file.path(dir, "*_props.tsv"))
  sigs_path <- Sys.glob(file.path(dir, "../plots/*_multipanel.pdf"))
  
  prop_table <- fread(prop_table_path)
  
  sample_cols <- colnames(prop_table)[colnames(prop_table) != "signature"]
  ith_cols <- str_detect(sample_cols, "patient")
  
  patients <- str_extract(sample_cols[ith_cols], "(?<=patient_?)[0-9]+")
  site_ids <- str_replace_all(sample_cols[ith_cols], "patient_?[0-9]+_", "")
  
  FROM <- c("rectum_site_1", "rectum_site_2", "rectum_site_3", "rectum_site_4",
            "pelvis_site_1", "pelvis_site_2", "cul_de_sac_site_1")
  TO <- c("rectum_site_1", "rectum_site_1", "rectum_site_2", "rectum_site_2",
          "pelvis_site_1", "pelvis_site_1", "cul-de-sac_site_1")
  
  site_ids_fixed <- mapvalues(site_ids, from = FROM, to=TO)
  df <- data.frame(patient_id=patients, site_id=site_ids_fixed, old_id=sample_cols[ith_cols])
  if (length(df[with(df, patient_id == "11" & site_id == "left_ovary_site_2"),]$site_id) > 0) {
    df[with(df, patient_id == "11" & site_id == "left_ovary_site_2"),]$site_id <- "left_ovary_site_3"
  }
  
  sample_subset <- unique(subset(sample_table, select=c("condensed_id", "patient_id", "site_id")))
  
  df <- plyr::join(df, sample_subset)
  if (nrow(df) > 0) {
    df$is_ancestral <- 0
    df$is_ancestral[df$site_id == "ancestral"] <- 1
  }
  
  #colnames(prop_table)[ith_cols] <- make.unique(df$condensed_id)
  
  prop_final <- melt(prop_table, id.vars = c('signature'), measure.vars = sample_cols,
                     variable.name = "old_id", value.name = "proportion")
  if (external) {
    prop_final_merged <- plyr::join(df, prop_final, by=c("old_id"), type='full')
  } else {
    prop_final_merged <- plyr::join(df, prop_final, by=c("old_id"), type='inner')
  }
  
  prop_final_merged <- prop_final_merged %>% mutate(new_id = ifelse(site_id %in% c("ancestral", "residual"),
                                               yes = paste(patient_id, site_id, sep="_"),
                                               no = condensed_id))
  na_idx <- is.na(prop_final_merged$new_id)
  prop_final_merged$new_id[na_idx] <- as.character(prop_final_merged$old_id[na_idx])
  
  result <- list(prop=prop_final_merged, sigplot=sigs_path)
  return(result)
}

sum_total_variant_counts <- function(variant_sample) {
  variant_sample %>% group_by(condensed_id, patient_id) %>% 
    summarise(snv_count=sum(snv_count), bkpt_count=sum(bkpt_count))
}

compute_mutsig_counts <- function(df) {
  df <- df %>% mutate(is_sv = as.numeric(str_detect(signature, "^SV")),
                sigcount = ifelse(as.character(site_id) == "ancestral",
                  yes = ifelse(is_sv, yes = ancestral_bkpt*proportion, no = ancestral_snv*proportion),
                  no = ifelse(is_sv, yes=bkpt_count * proportion, no = snv_count * proportion)))
  return(df)
}

create_mutsig_matrix <- function(df, col = "proportion") {
  mat <- acast(df, new_id ~ signature, fun.aggregate = mean, value.var = col)
  return(mat)
}
```

```{r}
variant_res <- summarize_variant_counts(master_variant_file, master_breakpoint_file, db_path)
variant_sample <- variant_res$sample
variant_patient <- variant_res$patient

variant_sample_sum <- sum_total_variant_counts(variant_sample)

ihc_table <- fread(ihc_table_path)
ihc_table_subset <- subset(ihc_table, select=c("condensed_id", tiltypes))
ihc_table_slide <- fread(ihc_table_slide_path)
```

## Sample level

### Signatures

![](`r mmctm_sample_sigplot`)


### Results

```{r}
props_sample <- read_mutsig_output(mmctm_sample_result_dir, samples)
props_sample_prop <- subset(props_sample$prop, patient_id != "11")

props_sample_mat <- create_mutsig_matrix(props_sample_prop, col = "proportion")

props_sample_mat_scaled <- clip_values(scale(props_sample_mat), 2, -2)
sample_heat <- pheatmap(props_sample_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation", clustering_method = "ward.D2")
```

```{r}
props_sample_merged <- Reduce(function(x,y) plyr::join(x,y),list(props_sample_prop, variant_sample_sum, variant_patient, ihc_table_subset))

props_sample_merged <- compute_mutsig_counts(props_sample_merged)
```

```{r}
COL <- "E_CD8_density"
measure <- "sigcount"

pvals <- setNames(ddply(props_sample_merged, .(signature), function(x) {
  df <- as.data.frame(x)
  df <- df[!is.na(df[,COL]),]
  corres <- summary(lmer(as.formula(paste0(measure, " ~ ", COL, " + (1|patient_id)")), df))
  
  pval <- unname(corres$coefficients[,5][2])
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_sample_merged, aes_string(x=COL, y=measure)) + geom_point(aes(colour=patient_id)) + 
  scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales="free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) + 
  theme_bw()
```

```{r}
# x <- Reduce(function(x,y) merge(x,y, by=c("condensed_id")), list(rownames_to_column(as.data.frame(t(prop_matrix)), var = "condensed_id"), 
#            ihc_table_subset))
# x$patient_id <- map_id(x$condensed_id, from = "condensed_id", to="patient_id", db_path)
# x <- subset(x, patient_id != "11")
# 
# cols <- colnames(x)[!colnames(x) %in% c("condensed_id", "patient_id")]
# rmat <- matrix(nrow=length(cols),ncol=length(cols))
# pmat <- matrix(nrow=length(cols),ncol=length(cols))
# 
# for (i in 1:length(cols)) {
#   for (j in i:length(cols)) {
#     if (i != j) {
#       col1 <- cols[i]
#       col2 <- cols[j]
#       formula <- paste0("`", col2, "`", "~", "`", col1, "`", "+ (1|patient_id)", sep=" ")
#       res <- summary(lmer(formula, x))
#       pval <- tryCatch({
#         unname(res$coefficients[,5][2])
#       }, error = function(e) {
#         NA
#       })
#       r <- unname(res$coefficients[,1][2])
#       rmat[i,j] <- rmat[j,i] <- r
#       pmat[i,j] <- pmat[j,i] <- pval
#     } else {
#       rmat[i,j] <- 1
#       pmat[i,j] <- NA
#     }
#   }
# }

```

```{r}
# resmat <- log10(pmat)*-sign(rmat)
# resmat[resmat == Inf] <- NA
# rownames(resmat) <- colnames(resmat) <- cols
# 
# pheatmap(resmat, display_numbers = signif(pmat, 3), cluster_rows = FALSE, cluster_cols = FALSE, fontsize = 5)
```

```{r}
# xmat <- x %>% select(-one_of("condensed_id", "patient_id"))
# 
# cormat <- corr.test(xmat, method="spearman", adjust="fdr")
# pheatmap(cormat$r, display_numbers = signif(cormat$p, 3), fontsize = 5)
```


## Ancestral-descendant level (samples)

### Signatures

![](`r mmctm_sample_ad_sigplot`)

### Results

```{r}
props_ad <- read_mutsig_output(mmctm_sample_ad_result_dir, samples)
props_ad_prop <- subset(props_ad$prop, patient_id != "11")

props_ad_mat <- create_mutsig_matrix(props_ad_prop, col = "proportion")

props_ad_mat_scaled <- clip_values(scale(props_ad_mat), 2, -2)
patient_heat <- pheatmap(props_ad_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation")
```

```{r}
props_ad_merged <- Reduce(function(x,y) plyr::join(x,y),list(props_ad_prop, subset(variant_sample, is_ancestral == 0), variant_patient, ihc_table_subset))

props_ad_merged <- compute_mutsig_counts(props_ad_merged)
```

```{r}
COL <- "E_CD8_density"
measure <- "sigcount"

props_ad_merged_descendant <- subset(props_ad_merged, site_id != "ancestral")

pvals <- setNames(ddply(props_ad_merged_descendant, .(signature), function(x) {
  df <- as.data.frame(x)
  df <- df[!is.na(df[,COL]),]
  corres <- summary(lmer(as.formula(paste0(measure, " ~ ", COL, " + (1|patient_id)")), df))
  
  pval <- unname(corres$coefficients[,5][2])
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_ad_merged_descendant, aes_string(x=COL, y=measure)) + geom_point(aes(colour=patient_id)) + 
  scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales="free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) + 
  theme_bw()
```

What we can also see from these plots is that not resolving SNV-7 (the noise cluster) is going to be a problem. The signature proportion values from that cluster are fairly similar to those from SNV-6, the APOBEC signature. 


## Ancestral-descendant level (patients)

### Signatures

![](`r mmctm_patient_ad_sigplot`)

### Results

```{r}
props_patient_ad <- read_mutsig_output(mmctm_patient_ad_result_dir, samples)
props_patient_ad_prop <- subset(props_patient_ad$prop, patient_id != "11")

props_patient_ad_mat <- create_mutsig_matrix(props_patient_ad_prop, col = "proportion")

props_patient_ad_mat_scaled <- clip_values(scale(props_patient_ad_mat), 2, -2)
patient_ad_heat <- pheatmap(props_patient_ad_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation")
```

```{r}
props_patient_ad_merged <- Reduce(function(x,y) plyr::join(x,y),list(props_patient_ad_prop, subset(variant_sample, is_ancestral == 0), variant_patient))

props_patient_ad_merged <- compute_mutsig_counts(props_patient_ad_merged)
```


## Correlation matrices

```{r}
ihc_mat <- as.data.frame(ihc_table_subset %>% dplyr::select(-condensed_id))
rownames(ihc_mat) <- ihc_table_subset$condensed_id

#TOTAL_TIL_COLS <- c("T_CD8_density", "T_CD4_density", "T_CD20_density", "T_Plasma_density")
#ihc_mat <- subset(ihc_mat, select=TOTAL_TIL_COLS)
```

```{r}
compute_ihc_mutsig_cor <- function(ihc_mat, mutsig_mat, patient_summary = FALSE, ancestral = FALSE) {
  if (patient_summary) {
    ihc_mat <- rownames_to_column(as.data.frame(ihc_mat), "id")
    mutsig_mat <- rownames_to_column(as.data.frame(mutsig_mat), "id")
    
    if (ancestral) {
      mutsig_mat <- subset(mutsig_mat, str_detect(id, "ancestral"))
    } else {
      mutsig_mat <- subset(mutsig_mat, !str_detect(id, "ancestral"))
    }
    
    ihc_mat$patient_id <- str_extract(ihc_mat$id, "^[0-9]+")
    mutsig_mat$patient_id <- str_extract(mutsig_mat$id, "^[0-9]+")
    
    ihc_mat <- ihc_mat %>% dplyr::select(-id) %>% group_by(patient_id) %>% summarise_each(funs(mean(., na.rm=TRUE))) %>% column_to_rownames(var = "patient_id")
    mutsig_mat <- mutsig_mat %>% dplyr::select(-id) %>% group_by(patient_id) %>% summarise_each(funs(mean(., na.rm=TRUE))) %>% column_to_rownames(var = "patient_id")
    
    ihc_mat <- as.matrix(ihc_mat)
    mutsig_mat <- as.matrix(mutsig_mat)
  }
  
  intersect_samples <- intersect(rownames(mutsig_mat), rownames(ihc_mat))
  mutsig <- mutsig_mat[intersect_samples,,drop=FALSE]
  ihc <- ihc_mat[intersect_samples,,drop=FALSE]
  
  pmat <- matrix(nrow=ncol(mutsig), ncol=ncol(ihc))
  rmat <- matrix(nrow=ncol(mutsig), ncol=ncol(ihc))
  rownames(pmat) <- rownames(rmat) <- colnames(mutsig)
  colnames(pmat) <- colnames(rmat) <- colnames(ihc)
  
  for (i in 1:ncol(mutsig)) {
    for (j in 1:ncol(ihc)) {
      corres <- cor.test(mutsig[,i], ihc[,j], method="spearman")
      pmat[i,j] <- corres$p.value
      rmat[i,j] <- corres$estimate
    }
  }
  
  return(list(p=pmat, r=rmat))
}
```

For adjusted p-values just run `p.adjust.mat` on the p-value labels. 

### Sample-level

```{r}
ihc_sample_cor <- compute_ihc_mutsig_cor(ihc_mat, props_sample_mat)

pheatmap(ihc_sample_cor$r, display_numbers = signif(ihc_sample_cor$p, 3))

ihc_sample_cor_summary <- compute_ihc_mutsig_cor(ihc_mat, props_sample_mat, patient_summary = TRUE)

pheatmap(ihc_sample_cor_summary$r, display_numbers = signif(ihc_sample_cor_summary$p, 3))
```

### Ancestral-descendant level

```{r}
ihc_ad_cor <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat)

pheatmap(ihc_ad_cor$r, display_numbers = signif(ihc_ad_cor$p, 3))

ihc_ad_cor_summary <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat, patient_summary = TRUE)

pheatmap(ihc_ad_cor_summary$r, display_numbers = signif(ihc_ad_cor_summary$p, 3))

ihc_ad_cor_summary_ancestral <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat, patient_summary = TRUE, ancestral = TRUE)

pheatmap(ihc_ad_cor_summary_ancestral$r, display_numbers = signif(ihc_ad_cor_summary_ancestral$p, 3))
```

Note: p-values shown are uncorrected. Patient-summarized correlations are insignificant after FDR correction. 

## Cluster-level analysis

Finding correlations at the level of individual signatures can be difficult. Even moreso because some published signatures are combinations of these signatures -- e.g. SV-3 and SV-6 are both foldback signatures in the ancestral-descendant analysis. 

Hence, we can look at the level of clusters from our heatmaps. 

```{r}
make_cluster_frame <- function(clusters) {
  clusts <- rownames_to_column(as.data.frame(clusters), "new_id")
  colnames(clusts)[2] <- "cluster"
  clusts$cluster <- factor(clusts$cluster)
  return(clusts)
}

NCLUST <- 2
selected_cols <- c("new_id", "condensed_id", "patient_id", "old_id", "cluster", tiltypes)
```

### Sample-level

```{r}
clusters <- cutree(sample_heat$tree_row, NCLUST)
sample_clusts <- make_cluster_frame(clusters)

props_sample_merged_clusts <- join(props_sample_merged, sample_clusts)
sample_df <- unique(subset(props_sample_merged_clusts, select=selected_cols))

sample_df_melted <- melt(sample_df, id.vars = colnames(sample_df)[!colnames(sample_df) %in% tiltypes], measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
```

```{r}
pvals <- setNames(ddply(sample_df_melted, .(tiltype), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(density ~ cluster, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))


ggplot(sample_df_melted, aes(x=cluster, y=density)) + geom_boxplot() + theme_bw() + theme_Publication() +  geom_jitter(aes(colour=patient_id), position = position_jitter(width=0.2, height=0)) + scale_color_manual(values = pal_patient) + facet_wrap(~ tiltype, scales = "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```


```{r}
diversity_column <- "observedDiversity_mean"

diversity_files <- list(tcr=tcr_diversity_file, bcr=bcr_diversity_file)

diversity <- lapply(names(diversity_files), function(segment) {
  f <- diversity_files[[segment]]
  xcr_diversity <- read_xcr_diversity_file(f, db_path)
  
  df <- subset(xcr_diversity, select=c("condensed_id", "patient_id", diversity_column))
  colnames(df) <- mapvalues(colnames(df), from=diversity_column, to=segment)
  return(df)
})
names(diversity) <- names(diversity_files)

xcr_diversity <- Reduce(plyr::join, diversity)

XCR_VARS <- c("tcr", "bcr")
```


```{r}
molsubtypes <- fread(molecular_subtype_file)
molsubtypes <- setNames(molsubtypes, c("condensed_id", "subtype"))

proportion_subclonality <- fread(proportion_subclonality_file)
proportion_subclonality_subset <- subset(proportion_subclonality, select=c("condensed_id", "proportion_subclonal"))

exprs <- fread(nanostring_data_path)
labels <- fread(nanostring_annotations_path)

celltype_matrix <- create_celltype_matrix(exprs, labels, db_path)
pathway_matrix <- create_pathway_matrix(exprs, labels, db_path)

celltype_df <- rownames_to_column(as.data.frame(t(celltype_matrix)), var = "condensed_id")
pathway_df <- rownames_to_column(as.data.frame(t(pathway_matrix)), var = "condensed_id")

NANOSTRING_VARS <- c(rownames(celltype_matrix), rownames(pathway_matrix))
```



```{r}
colnames(sample_clusts) <- mapvalues(colnames(sample_clusts), "new_id", "condensed_id")

ihc_stabilize_subset <- stabilize_ihc_variances(ihc_table_slide, ihc_table, tiltypes)

data_matrices <- list(sample_clusts, ihc_stabilize_subset, xcr_diversity, celltype_df, pathway_df, proportion_subclonality_subset)
data_matrices <- lapply(data_matrices, function(x) {
  x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='full'), data_matrices)
combined_df <- subset(combined_df, !is.na(cluster) & !condensed_id %in% c("7_BrnM"))

combined_mat <- subset(combined_df, select=-c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
```

```{r, fig.width=8, fig.height=8}
#ref_dendrogram <- prune(as.dendrogram(sample_heat$tree_row), "7_BrnM")

sample_order <- sample_heat$tree_row$labels[sample_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select=c(condensed_id, cluster))

SIGS <- c("SV-3", "SNV-5", "SV-1", "SV-2", "SNV-1")
sig_annotations <- rownames_to_column(data.frame(props_sample_mat[,SIGS]), var = "condensed_id")

## Take logarithms to ~variance stabilize
variant_count_annotations <- variant_sample %>% group_by_(.dots="condensed_id") %>% 
  summarise(snv_count=log(sum(snv_count)), bkpt_count=log(sum(bkpt_count))) %>%
  subset(select=c("condensed_id", "snv_count", "bkpt_count"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, molsubtypes, sig_annotations,
                                           variant_count_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")

combined_mat_scaled <- scale(combined_mat)

# hcs <- lapply(unique(combined_df$cluster), function(clust) {
#   ids <- subset(combined_df, cluster == clust)$condensed_id
#   dists <- dist(combined_mat_scaled[ids,], method = "euclidean")
#   #dists <- proxy::dist(combined_mat_scaled[ids,], method = function(x,y) pairwise_dist(x,y,method="canberra"))
#   hclust(dists, method = "ward.D")
# })
# min_height <- max(sapply(hcs, function(x) max(unique(cophenetic(x)))))*1.1
# 
# hc <- Reduce(function(x,y) merge(as.dendrogram(x),as.dendrogram(y),height = min_height), hcs)
# hc <- as.dendrogram(ref_dendrogram)
# hc <- as.dendrogram(sample_heat2$tree_row)
# 
# ha <- HeatmapAnnotation(row_annotations[rownames(combined_mat_scaled),], which="row")
# 
# Heatmap(clip_values(combined_mat_scaled, 2, -2), cluster_rows = hc2, cluster_columns = TRUE, split = 2, clustering_method_columns = "ward.D2") + ha

pheatmap(clip_values(combined_mat_scaled, 2, -2)[sample_order,], cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6)
```

What we can see is:

* H-HRD cluster is relatively homogeneous.
* Looks like there are two clusters within the H-FBI group -- one characterized by high levels of immune activity (cytotoxicity, etc.) and one characterized by low levels. 

This is a pretty significant finding. 

Additionally, an interesting thing is that the patients/samples with the highest proportions of foldbacks -- patients 2, 3, and 9 -- are actually the ones with high immune response in the foldback group! Suggestive that perhaps foldback inversions can create neoepitopes. Of course very preliminary and low sample size though. 

If you're wondering why there's no dendrogram on the vertical axis it's because the plotting functions I'm currently using don't allow for self-specified dendrograms. Trying to make one that lets me do so but it's taking a bit of acrobatics and I've wasted a lot of time already ... 

To see ICGC validation, go to that section. I'll add a link later ...

#### TCR/BCR diversity

```{r}
combined_df$patient_id <- map_id(combined_df$condensed_id, from = "condensed_id", to="patient_id", db_path)
combined_df_xcr <- melt(combined_df, id.vars = c("condensed_id", "patient_id", "cluster"), measure.vars = XCR_VARS, variable.name = "type", value.name = "value")

pvals <- setNames(ddply(combined_df_xcr, .(type), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(value ~ cluster, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("type", "p.value"))


ggplot(combined_df_xcr, aes(x=cluster, y=value)) + geom_boxplot() + theme_bw() + theme_Publication() +  geom_jitter(aes(colour=patient_id), position = position_jitter(width=0.2, height=0)) + scale_color_manual(values = pal_patient) + facet_wrap(~ type, scales = "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

#### Celltypes and pathways

```{r, fig.width=10, fig.height=10}
combined_df$patient_id <- map_id(combined_df$condensed_id, from = "condensed_id", to="patient_id", db_path)
combined_df_nanostring <- melt(combined_df, id.vars = c("condensed_id", "patient_id", "cluster"), measure.vars = NANOSTRING_VARS, variable.name = "type", value.name = "value")

pvals <- setNames(ddply(combined_df_nanostring, .(type), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(value ~ cluster, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("type", "p.value"))


ggplot(combined_df_nanostring, aes(x=cluster, y=value)) + geom_boxplot() + theme_bw() + theme_Publication() +  geom_jitter(aes(colour=patient_id), position = position_jitter(width=0.2, height=0)) + scale_color_manual(values = pal_patient) + facet_wrap(~ type, scales = "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

### Ancestral-descendant level

```{r}
clusters <- cutree(patient_heat$tree_row, NCLUST)
patient_clusts <- make_cluster_frame(clusters)

props_ad_merged_clusts <- join(props_ad_merged, patient_clusts)
patient_df <- unique(subset(props_ad_merged_clusts, select=selected_cols))

patient_df_melted <- melt(patient_df, id.vars = colnames(patient_df)[!colnames(patient_df) %in% tiltypes], measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
```

```{r}
pvals <- setNames(ddply(patient_df_melted, .(tiltype), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(density ~ cluster, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))


ggplot(patient_df_melted, aes(x=cluster, y=density)) + geom_boxplot() + theme_bw() + theme_Publication() +  geom_jitter(aes(colour=patient_id), position = position_jitter(width=0.2, height=0)) + scale_color_manual(values = pal_patient) + facet_wrap(~ tiltype, scales = "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```


Hence, the foldback group (cluster 2) has lower CD8+ and CD4+ densities than the HRD group, as expected. 

A caveat is that p-values are computed with Wilcoxon, irrespective of patient number. We can't really opt for a nonparametric nested ranks test because very few patients (4) are actually present in both clusters. 

## Ancestral analysis

Ancestral variants may have different properties from descendant variants. 

### Sample-specific

Here we'll just naively allow for subclonal variants to be counted multiple times. 

```{r}
props_ad_merged$is_ancestral <- as.factor(props_ad_merged$is_ancestral)

pvals <- setNames(ddply(props_ad_merged, .(signature), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(proportion ~ is_ancestral, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_ad_merged, aes(x=is_ancestral, y=proportion)) + geom_boxplot() + geom_jitter(aes(colour=patient_id), position=position_jitter(width=0.2, height=0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales= "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

P-values are uncorrected. 

Unsurprisingly, ancestral samples have significantly more of SNV-4, the age signature. Insignificant, but they also have less SV-1, which is one of the BRCA's (small deletions). 

### Union

Here we'll actually take the union of subclonal variants for each patient so we don't overcount. 

```{r}
props_patient_ad_merged$is_ancestral <- as.factor(props_patient_ad_merged$is_ancestral)

pvals <- setNames(ddply(props_patient_ad_merged, .(signature), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(proportion ~ is_ancestral, df, paired=TRUE)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_patient_ad_merged, aes(x=is_ancestral, y=proportion)) + geom_boxplot() + geom_jitter(aes(colour=patient_id), position=position_jitter(width=0.2, height=0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales= "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

Aside from age and HRD which were described in McPherson et al., there's additionally the SV-3 signature -- a translocation signature. Implying that translocations are early (ancestral) events in HGSC -- perhaps this is interesting? I have to do a literature search. 


```{r}
anc_desc_patient <- dcast(props_patient_ad_merged, formula = patient_id + signature ~ is_ancestral, value.var = "proportion")

pvals <- setNames(ddply(anc_desc_patient, .(signature), function(x) {
  df <- as.data.frame(x)
  corres <- with(df, cor.test(`0`, `1`, method="spearman"))
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(anc_desc_patient, aes(x=`1`, y=`0`)) + geom_point() + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales= "free") + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

## ICGC validation

### Signatures

![](`r mmctm_ov_combined_sigplot`)

### FBI subclusters

```{r}
specimen_data <- fread(icgc_specimen_file)
icgc_subtypes <- fread(icgc_subtype_file)
icgc_expr_mat <- fread(icgc_expr_mat_file)
icgc_expr_df <- icgc_expr_mat %>% as.data.frame %>% column_to_rownames("icgc_donor_id") %>% t %>% as.data.frame %>% rownames_to_column("Name")
icgc_celltype_matrix <- create_pathway_matrix(icgc_expr_df, labels, db_path, convert_ids = FALSE)

icgc_immune <- rownames_to_column(as.data.frame(t(icgc_celltype_matrix)), var = "icgc_donor_id")

# cytotoxic_markers <- c("PRF1", "GZMA", "HLA-A", "HLA-B", "HLA-C", "GZMK", "GZMM", "GZMH", "GNLY", "GZMB")
# icgc_immune <- as.data.frame(subset(icgc_expr_mat, select=c("icgc_donor_id", cytotoxic_markers)))
# for (i in 2:ncol(icgc_immune)) {
#  icgc_immune[,i] <- log10(icgc_immune[,i])
# }
```

```{r, fig.height=12}
props_ov_combined <- read_mutsig_output(mmctm_ov_combined_result_dir, samples, external = TRUE)
props_ov_combined_prop <- props_ov_combined$prop

props_ov_combined_mat <- create_mutsig_matrix(props_ov_combined_prop, col = "proportion")

# props_ov_combined_mat_scaled <- as.data.frame(apply(props_ov_combined_mat, 2, function(x) {
#   #x <- logit(x)
#   (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)
# }))

props_ov_combined_mat_scaled <- scale(props_ov_combined_mat)

#props_ov_combined_mat_scaled <- clip_values(props_ov_combined_mat_scaled, 3, -3)
#clip_values(scale(props_ov_combined_mat), 2, -2)

ov_combined_heat <- pheatmap(props_ov_combined_mat_scaled, fontsize_row = 6, clustering_distance_rows = "correlation", clustering_method = "ward.D2")
```

```{r}
NCLUST <- 3

clusters <- cutree(ov_combined_heat$tree_row, NCLUST)
ov_combined_clusts <- make_cluster_frame(clusters)

colnames(icgc_immune) <- mapvalues(colnames(icgc_immune), "icgc_donor_id", "new_id")

data_matrices <- list(ov_combined_clusts, icgc_immune)
data_matrices <- lapply(data_matrices, function(x) {
  x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='inner'), data_matrices)
#combined_df <- subset(combined_df, cluster == 1)

combined_mat <- subset(combined_df, select=-c(new_id, cluster))
rownames(combined_mat) <- combined_df$new_id
```

```{r, fig.width=8, fig.height=8}
sample_order <- ov_combined_heat$tree_row$labels[ov_combined_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select=c(new_id, cluster))

SIGS <- c("SV-8", "SNV-1", "SV-4", "SV-7", "SV-5", "SNV-7", "SV-1")
sig_annotations <- rownames_to_column(data.frame(props_ov_combined_mat[,SIGS], check.names = FALSE), var = "new_id")

subtype_annotations <- subset(icgc_subtypes, select=c("icgc_donor_id", "subtype", "nmf_subtype"))
colnames(subtype_annotations)[1] <- "new_id"

row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations, subtype_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "new_id")

select_rows <- rownames(subset(row_annotations, `SV-4` < 1))
notx <- subset(specimen_data, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type == "no treatment")$icgc_donor_id
sample_order <- intersect(sample_order, select_rows)
sample_order <- intersect(sample_order, notx)

#order_fbi <- order(row_annotations[rownames(combined_mat_scaled),]$`SV-8`)

#combined_mat_scaled <- scale(combined_mat)
combined_mat_scaled <- as.data.frame(apply(combined_mat, 2, function(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)))

combined_mat_scaled <- combined_mat_scaled[sample_order,]
other_mat <- props_ov_combined_mat_scaled[sample_order,]

pheatmap(clip_values(cbind(combined_mat_scaled, other_mat), 2, -2), cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation", clustering_method = "ward.D2")

#rowmean <- rowMeans(combined_mat_scaled)
#a <- merge(row_annotations, as.data.frame(rowmean), by="row.names")
#ggplot(subset(a, cluster != 1), aes(x=rowmean > median(rowmean), y = `SNV-7`)) + geom_boxplot() + theme_bw()
```

```{r}
clusters <- setNames(cluster_annotations, c("icgc_donor_id", "cluster"))
```

### Differential expression

```{r}
icgc_expr_melted <- fread(icgc_expr_raw_file)
icgc_expr_casted_matrix <- expression_df_to_matrix(icgc_expr_melted, summarize_over = "icgc_donor_id", measure_var = "raw_read_count")

icgc_counts <- icgc_expr_casted_matrix %>% column_to_rownames(var = "icgc_donor_id") 
icgc_counts_filtered <- icgc_counts[clusters$icgc_donor_id,]
icgc_counts_filtered <- icgc_counts_filtered %>% t %>% as.data.frame

dge <- DGEList(counts = icgc_counts_filtered)
dge <- calcNormFactors(dge)

library_sizes <- colSums(icgc_counts_filtered)
library_sizes
max(library_sizes)/min(library_sizes)

design <- model.matrix(~0 + factor(clusters$cluster))
colnames(design) <- paste0("clust", 1:NCLUST)
contrast.matrix <- makeContrasts(clust1-clust3, clust2-(clust1+clust3)/2,clust2-clust1, levels=design)
```

#### limma-trend

We straddle close to threshold for doing this but let's do it anyways. 

```{r}
fit_trend <- rnaseq_DE_initial_fit(dge, design, type = "limma_trend")
```

```{r}
fit_trend_contrasts <- contrasts.fit(fit_trend, contrast.matrix)
fit_trend_contrasts <- eBayes(fit_trend_contrasts)

hrd_fbi_genes <- topTable(fit_trend_contrasts, coef = "clust2 - (clust1 + clust3)/2", adjust.method = "BH", number = Inf, sort="p")
fbi_immune_genes <- topTable(fit_trend_contrasts, coef = "clust1 - clust3", number = Inf, sort="p")
hrd_fbi_highimmune_genes <- topTable(fit_trend_contrasts, coef = "clust2 - clust1", number = Inf, sort="p")
```

```{r}
hrd_fbi_pathways <- pathway_analysis(hrd_fbi_genes, gs_type = "go")
fbi_immune_pathways <- pathway_analysis(fbi_immune_genes, gs_type = "go")
hrd_fbi_highimmune_pathways <- pathway_analysis(hrd_fbi_highimmune_genes, gs_type = "go")
```

##### HRD vs. FBI samples (clust2 vs. clust1+clust3)

```{r}
datatable(hrd_fbi_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```



##### High immune FBI vs. low immune FBI (clust1 vs. clust3)

```{r}
datatable(fbi_immune_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```


##### HRD vs. high immune FBI (clust2 vs. clust1)

```{r}
datatable(hrd_fbi_highimmune_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```

#### voom

We have greater than 3-fold variability in library size, so we're better off using the voom method for DE analysis. 

```{r}
fit_voom <- rnaseq_DE_initial_fit(dge, design, type = "voom")
```

```{r}
fit_voom_contrasts <- contrasts.fit(fit_voom, contrast.matrix)
fit_voom_contrasts <- eBayes(fit_voom_contrasts)

hrd_fbi_genes <- topTable(fit_voom_contrasts, coef = "clust2 - (clust1 + clust3)/2", adjust.method = "BH", number =Inf, sort="p")
fbi_immune_genes <- topTable(fit_voom_contrasts, coef = "clust1 - clust3", number = Inf, sort="p")
hrd_fbi_highimmune_genes <- topTable(fit_voom_contrasts, coef = "clust2 - clust1", number = Inf, sort="p")
```


```{r, message=TRUE, warning=TRUE}
hrd_fbi_pathways <- pathway_analysis(hrd_fbi_genes, gs_type = "kegg")
fbi_immune_pathways <- pathway_analysis(fbi_immune_genes, gs_type = "kegg")
hrd_fbi_highimmune_pathways <- pathway_analysis(hrd_fbi_highimmune_genes, gs_type = "kegg")
# pv.out.list <- sapply(path.ids2, function(pid) pathview(
#                       gene.data =  exp.fc, pathway.id = pid,
#                       species = "hsa", out.suffix=out.suffix))
```

**Note:** BRCA1, BRCA2, and RPA are also downregulated in the HRD group -- the entire HR-associated gene cluster isn't significantly downregulated though. 

One of the key reasons why the entire HR pathway isn't significantly downregulated is because PolQ is part of it and PolQ is upregulated. Of note, PolQ promotes MMEJ, providing further evidence for MMEJ activity in FBI tumours. 

##### HRD vs. FBI samples (clust2 vs. clust1+clust3)

```{r}
datatable(hrd_fbi_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```



##### High immune FBI vs. low immune FBI (clust1 vs. clust3)

```{r}
datatable(fbi_immune_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```


##### HRD vs. high immune FBI (clust2 vs. clust1)

```{r}
datatable(hrd_fbi_highimmune_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```


#### Some DNA repair genes

```{r}
v <- voom(icgc_counts_filtered, design, normalize="quantile", plot=FALSE)
```

```{r}
icgc_expr_df_wide <- v$E %>% t %>% as.data.frame %>% rownames_to_column("icgc_donor_id")

icgc_expr_df_melted <- reshape2::melt(icgc_expr_df_wide, id.vars = c("icgc_donor_id"), measure.vars = colnames(icgc_expr_df_wide)[2:ncol(icgc_expr_df_wide)], variable.name = "Name", value.name = "expression")

icgc_expr_df_melted_labeled <- plyr::join(icgc_expr_df_melted, clusters)

REPAIR_GENES <- c("BRCA1", "BRCA2", "POLQ", "FEN1", "XRCC1", "PARP1", "LIG3")

icgc_expr_df_melted_labeled_filtered <- subset(icgc_expr_df_melted_labeled, Name %in% REPAIR_GENES)

ggplot(subset(icgc_expr_df_melted_labeled_filtered, !is.na(cluster)), aes(x=cluster, y=expression)) + geom_boxplot() + geom_jitter(position=position_jitter(width=0.2, height=0)) + theme_bw() + theme_Publication() + facet_wrap(~ Name, scales = "free")
```

### Survival

```{r}
icgc_clinical <- read_donor_specimen_survival(icgc_donor_file, icgc_specimen_file)

icgc_clinical_labeled <- plyr::join(clusters, icgc_clinical)

icgc_clinical_labeled$SurvObj <- with(icgc_clinical_labeled, Surv(donor_survival_time, donor_vital_status == "deceased"))
```

Survival by FBI status ...

```{r}
simple_survival_analysis(SurvObj ~ cluster != 2, data = icgc_clinical_labeled)
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = icgc_clinical_labeled)
```

## TCGA validation

While we can't directly do mutation signature analysis with MMCTM on exome data (I suppose this is theoretically possible but we'd probably be restricted in the types of events we can call, like long SVs), we can look at the FBI-HLAMP finding from Yikan's paper and see if that lines up with immune signatures. 

```{r}
tcga_expr_mat <- read_tcga_exprs(tcga_expr_mat_file)
tcga_ov_annotation <- fread(tcga_ov_annotation_file)
#tcga_fbi_hlamp_proportions <- fread(tcga_fbi_hlamp_proportions_file)
```

```{r}
tcga_expr_df <- tcga_expr_mat %>% as.data.frame %>% column_to_rownames("tcga_sample_id") %>% t %>% as.data.frame %>% rownames_to_column("Name")
tcga_celltype_matrix <- create_pathway_matrix(tcga_expr_df, labels, db_path, convert_ids = FALSE)

tcga_immune <- rownames_to_column(as.data.frame(t(tcga_celltype_matrix)), var = "tcga_sample_id")
```

```{r, fig.width=8, fig.height=16}
mat <- tcga_immune %>% column_to_rownames("tcga_sample_id") %>% scale #%>% clip_values(2, -2)
row_annotations <- tcga_ov_annotation %>% as.data.frame %>% column_to_rownames("tcga_sample_id")

tcgaheat <- pheatmap(mat, annotation_row = row_annotations, cluster_rows = TRUE, cluster_cols = TRUE, clustering_method = "ward.D", show_rownames = FALSE)

#tcga_clusters <- rownames_to_column(data.frame(cluster=factor(cutree(tcgaheat$tree_row, 2))), var = "tcga_sample_id")

col <- "Cytotoxicity"
tcga_clusters <- rownames_to_column(data.frame(cluster=mat[,col] > median(mat[,col])), "tcga_sample_id") %>% mutate(cluster = as.factor(as.numeric(cluster)))

#row_annotations <- plyr::join(tcga_ov_annotation, tcga_clusters) %>% as.data.frame %>% column_to_rownames("tcga_sample_id")
#pheatmap(mat, annotation_row = row_annotations, cluster_rows = TRUE, cluster_cols = TRUE, clustering_method = "ward.D", show_rownames = FALSE)
```


```{r}
df_merged <- plyr::join(tcga_immune, tcga_ov_annotation)

expr_measure_vars <- colnames(tcga_immune)[2:ncol(tcga_immune)]

df_merged_melted <- melt(df_merged, id.vars = c("tcga_sample_id", "Subgroup", "MolecularSubtype", "BRCA.status"), measure.vars = expr_measure_vars, variable.name = "Measure", value.name = "Expression")
```

```{r, fig.height = 18}
pvals <- setNames(ddply(subset(df_merged_melted, !is.na(Subgroup)), .(Measure), function(x) {
  df <- as.data.frame(x)
  testres <- kruskal.test(Expression ~ factor(Subgroup), data = df)
  
  pval <- testres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("Measure", "p.value"))

ggplot(subset(df_merged_melted, !is.na(Subgroup)), aes(x=Subgroup, y=Expression)) + geom_boxplot() + theme_bw() + facet_wrap(~ Measure, scales = "free", ncol=3) + theme_Publication() + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) 
```

P-values are uncorrected. So after correction, we aren't going to get anything significant. 

In conclusion, the immune subgroups constitute a new subgrouping independent of FBI-HLAMP. 

I've also done correlation testing using the logR values that Yikan's provided me (rather than the discrete groupings of no AMP, FBI-AMP low, and FBI-AMP high), but the correlations are very poor (rho values of ~ -0.05 or so, p-values of >=0.3). 

### Survival


```{r}
tcga_clinical <- read_tcga_clinical(tcga_donor_file, type = "synapse2", filter = TRUE, unique = FALSE)

tcga_clinical_labeled <- Reduce(function(x,y) plyr::join(x,y, type = 'full'), list(tcga_clusters, tcga_clinical, tcga_ov_annotation))

tcga_clinical_labeled$SurvObj <- with(tcga_clinical_labeled, Surv(ifelse(vital_status == "Dead", death_days_to, last_contact_days_to), vital_status == "Dead"))

#tcga_clinical_labeled <- subset(tcga_clinical_labeled, additional_drug_therapy == "NO" & additional_immuno_therapy == "NO" & additional_pharmaceutical_therapy == "NO" &  targeted_molecular_therapy == "NO" & radiation_therapy == "NO")
#tcga_clinical_labeled <- subset(tcga_clinical_labeled, str_detect(tumor_stage, "^III"))
```

#### FBI-AMP colocalization

Let's first stratify by Yikan's groupings:

```{r}
simple_survival_analysis(SurvObj ~ Subgroup, data = tcga_clinical_labeled)
```

Note: The logrank p-values in the paper are incorrect, the one's I've computed are the right ones. In the paper, the death days from the dead patients were not correctly combined into the computation, so p-values were computed without the dead patients. 

#### Immune activity

We'll now stratify by `cluster`, a variable that indicates whether or not we're above median in terms of `r col`. In other words, 1 means we're high immune, and 0 means we're low immune. 

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = tcga_clinical_labeled)
```

Low immune tumours trend towards poor survival but this is not significant at all. 

Now let's try by treating immune activity as a continuous variable, controlling for other covariates:

```{r}
tcga_immune_continuous <-  tcga_celltype_matrix %>% t %>% as.data.frame %>% rownames_to_column("tcga_sample_id")

tcga_clinical_labeled_continuous <- plyr::join(tcga_clinical_labeled, tcga_immune_continuous)

coxph(SurvObj ~ Cytotoxicity + age_at_initial_pathologic_diagnosis + tumor_grade + str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy + additional_drug_therapy + additional_immuno_therapy, tcga_clinical_labeled_continuous)
```

Indeed, immune activity is significantly associated with prolonged survival (negative coefficient = fewer death events). 

#### Stratification by foldback-AMP status

```{r}
fbi_high <- subset(tcga_clinical_labeled, Subgroup == "FBI-AMP High")
fbi_low <- subset(tcga_clinical_labeled, Subgroup == "FBI-AMP Low")
fbi_no <- subset(tcga_clinical_labeled, Subgroup == "No AMP")
fbi_nothigh <- subset(tcga_clinical_labeled, Subgroup %in% c("FBI-AMP Low", "No AMP"))
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = fbi_high)
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = fbi_low)
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = fbi_no)
```

What's curious about this is that it seems only the no AMP group gets any benefit from having a high immune response. So there's no benefit within the foldback group of high/low immune response. 
```{r}
simple_survival_analysis(SurvObj ~ cluster, data = fbi_nothigh)
```

#### Stratification by immune activity

```{r}
immune_low <- subset(tcga_clinical_labeled, cluster == 0)
immune_high <- subset(tcga_clinical_labeled, cluster == 1)
```

```{r}
simple_survival_analysis(SurvObj ~ Subgroup, data = immune_high)

simple_survival_analysis(SurvObj ~ Subgroup, data = immune_low)
```

Likewise, the only benefit of being non-foldback is derived from the immune-high group -- there's no difference between being foldback or not if you're in the low immune group. 

#### Multivariate models

```{r}
simple_survival_analysis(SurvObj ~ Subgroup + cluster, data = tcga_clinical_labeled)
```

We'll next combined FBI-AMP low and No AMP into a single category called FBI-NotHigh. 

```{r}
tcga_clinical_labeled$newgroup <- ifelse(tcga_clinical_labeled$Subgroup == "FBI-AMP High", "FBI-High", "FBI-NotHigh")

simple_survival_analysis(SurvObj ~ newgroup + cluster, data = tcga_clinical_labeled)
```


Let's also do Cox proportional hazards models:

```{r}
mod1 <- coxph(SurvObj ~ Cytotoxicity + Subgroup + age_at_initial_pathologic_diagnosis + tumor_grade + str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy + additional_drug_therapy + additional_immuno_therapy, tcga_clinical_labeled_continuous)
anova(mod1)
```

```{r}
mod2 <- coxph(SurvObj ~ Cytotoxicity + Subgroup + age_at_initial_pathologic_diagnosis, tcga_clinical_labeled_continuous)
anova(mod2)
```

```{r}
mod3 <- coxph(SurvObj ~ Cytotoxicity + (Subgroup == "FBI-AMP High") + age_at_initial_pathologic_diagnosis, subset(tcga_clinical_labeled_continuous, !is.na(Subgroup)))
anova(mod3)
```

So the effects of immune activity and FBI/HRD status are collinear. 

## Final run

Instead of overwriting the previous, this is its own section since the underlying implementation has changed substantially. 

```{r}
variant_table <- read_variant_file(master_variant_file, db_path)
breakpoint_table <- read_variant_file(master_breakpoint_file, db_path)

sig_results <- read_signature_files(mmctm_final_patient_dir, variant_table, breakpoint_table, db_path)

sig_results_snv <- subset(sig_results$snv, is_present == 1)
sig_results_sv <- subset(sig_results$sv, is_present == 1)
sig_results_nonith <- sig_results$nonith %>% as.data.frame %>% column_to_rownames("signature") %>% t %>% as.data.frame %>% rownames_to_column("condensed_id")

signature_labels <- str_extract(c(colnames(sig_results_snv), colnames(sig_results_sv)), "SN?V-[0-9]+")
signature_labels <- signature_labels[!is.na(signature_labels)]
```

```{r}
summarize_signature_proportions <- function(x, by=c("condensed_id", "patient_id"), signature_labels, report_count = FALSE) {
  props <- subset(x, select=c(by, colnames(x)[colnames(x) %in% signature_labels])) %>% group_by_(.dots=by) %>% summarise_each(funs(sum))
  
  if (report_count) {
    counts <- subset(x, select=c(by, colnames(x)[colnames(x) %in% signature_labels])) %>% group_by_(.dots=by) %>% summarise(n=n())
    props <- plyr::join(props %>% as.data.frame, counts %>% as.data.frame)
  }
  sums <- rowSums(subset(props, select=colnames(props[colnames(props) %in% signature_labels])))
  sigs <- intersect(signature_labels, colnames(props))
  for (sig in sigs) {
    props[,sig] <- props[,sig]/sums
  }
  return(props)
}
```

### Signatures

![](`r mmctm_final_patient_sigplot`)

### Sample signature proportions

These proportions won't be exactly equal to the topic proportions that the model outputs (they are variational estimates), but we're actually doing pretty darn well. 

TODO: Make a QC plot. From a glance it seems that proportion error is usually within 1-5% relative error. 

```{r}
sample_props_snv <- summarize_signature_proportions(sig_results_snv, by=c("condensed_id", "patient_id"), signature_labels)
sample_props_sv <- summarize_signature_proportions(sig_results_sv, by=c("condensed_id", "patient_id"), signature_labels)

patient_props_snv <- sig_results_snv %>% subset(select=c("patient_id", "chrom", "coord", "ref", "alt", intersect(signature_labels, colnames(sig_results_snv)))) %>% unique %>% summarize_signature_proportions(by=c("patient_id"), signature_labels)
patient_props_sv <- sig_results_sv %>% subset(select=c("patient_id", "prediction_id", intersect(signature_labels, colnames(sig_results_sv)))) %>% unique %>% summarize_signature_proportions(by=c("patient_id"), signature_labels)
```

```{r}
sample_props <- plyr::join(sample_props_snv %>% as.data.frame, sample_props_sv %>% as.data.frame)
sample_props <- rbind.fill(sample_props, sig_results_nonith)

sample_props_mat <- sample_props %>% column_to_rownames("condensed_id") %>% subset(select=c(colnames(sample_props)[colnames(sample_props) %in% signature_labels]))

sample_props_mat_scaled <- scale(sample_props_mat)
#[,c("SNV-2", "SNV-5", "SV-3", "SV-6", "SV-8", "SV-1", "SV-7", "SV-5", "SNV-3")]
sample_props_heat <- pheatmap(sample_props_mat_scaled, fontsize_row = 5,
                              clustering_method = "ward.D2")
```

```{r}
patient_props <- plyr::join(patient_props_snv %>% as.data.frame, patient_props_sv %>% as.data.frame)
patient_props <- rbind.fill(patient_props, sig_results_nonith %>% plyr::rename(c("condensed_id"="patient_id")))

patient_props_mat <- patient_props %>% column_to_rownames("patient_id") %>% subset(select=c(colnames(patient_props)[colnames(patient_props) %in% signature_labels]))

patient_props_mat_scaled <- scale(patient_props_mat)
#[,c("SNV-2", "SNV-5", "SV-3", "SV-6", "SV-8", "SV-1", "SV-7", "SV-5", "SNV-3")]
patient_props_heat <- pheatmap(patient_props_mat_scaled, fontsize_row = 5,
                              clustering_method = "ward.D2", clustering_distance_rows = "correlation")
```

#### ITH cohort

```{r}
sample_clusts <- as.data.frame(cutree(sample_props_heat$tree_row, 4)) %>% setNames(c("cluster")) %>% rownames_to_column("condensed_id")

ihc_stabilize_subset <- stabilize_ihc_variances(ihc_table_slide, ihc_table, tiltypes)

data_matrices <- list(sample_clusts, ihc_stabilize_subset, xcr_diversity, celltype_df, pathway_df, proportion_subclonality_subset)
data_matrices <- lapply(data_matrices, function(x) {
  x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='full'), data_matrices)
combined_df <- subset(combined_df, !is.na(cluster) & !condensed_id %in% c("7_BrnM"))
combined_df <- subset(combined_df, condensed_id %in% subset(samples, project_code == "ITH")$condensed_id)

combined_mat <- subset(combined_df, select=-c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
```

```{r, fig.width=8, fig.height=8}
#ref_dendrogram <- prune(as.dendrogram(sample_heat$tree_row), "7_BrnM")

sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select=c(condensed_id, cluster))

SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(sample_props_mat[,SIGS], check.names = FALSE), var = "condensed_id")

## Take logarithms to ~variance stabilize
variant_count_annotations <- variant_sample %>% group_by_(.dots="condensed_id") %>% 
  summarise(snv_count=log(sum(snv_count)), bkpt_count=log(sum(bkpt_count))) %>%
  subset(select=c("condensed_id", "snv_count", "bkpt_count"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, molsubtypes, sig_annotations,
                                           variant_count_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")
row_annotations_ith <- row_annotations

combined_mat_scaled <- scale(combined_mat)

pheatmap(clip_values(combined_mat_scaled, 2, -2)[sample_order,], cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6)

combined_mat_scaled_ith <- combined_mat_scaled
```

#### ICGC validation

```{r}
icgc_fbi_status <- fread(wang_icgc_fbi_status_file)
colnames(icgc_fbi_status) <- mapvalues(colnames(icgc_fbi_status), from = c("Case_ID"), to = c("condensed_id"))
```

```{r}
colnames(icgc_immune) <- mapvalues(colnames(icgc_immune), "new_id", "condensed_id")

data_matrices <- list(sample_clusts, icgc_immune)
data_matrices <- lapply(data_matrices, function(x) {
  x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='inner'), data_matrices)
#combined_df <- subset(combined_df, cluster == 1)

combined_mat <- subset(combined_df, select=-c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
```

```{r, fig.width=8, fig.height=8}
sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select=c(condensed_id, cluster))

SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(sample_props_mat[,SIGS], check.names = FALSE), var = "condensed_id")

subtype_annotations <- subset(icgc_subtypes, select=c("icgc_donor_id", "subtype", "nmf_subtype"))
colnames(subtype_annotations)[1] <- "condensed_id"

icgc_fbi_annotations <- subset(icgc_fbi_status, select=c("condensed_id", "Patch et al. Class", "Patch et al. Molecular Subgroup", "BRCA.status", "Subgroup"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations, subtype_annotations, icgc_fbi_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")
row_annotations_noith <- row_annotations

#select_rows <- rownames(subset(row_annotations, `SV-4` < 1))
notx <- subset(specimen_data, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type == "no treatment")$icgc_donor_id
#sample_order <- intersect(sample_order, select_rows)
sample_order <- intersect(sample_order, notx)

#order_fbi <- order(row_annotations[rownames(combined_mat_scaled),]$`SV-8`)

#combined_mat_scaled <- scale(combined_mat)
combined_mat_scaled <- as.data.frame(apply(combined_mat, 2, function(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)))

combined_mat_scaled <- combined_mat_scaled[sample_order,]
other_mat <- sample_props_mat_scaled[sample_order,]

pheatmap(clip_values(cbind(combined_mat_scaled, other_mat), 2, -2), cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation", clustering_method = "ward.D2")

combined_mat_scaled_noith <- combined_mat_scaled
```


```{r}
icgc_clinical_labeled <- Reduce(function(x,y) plyr::join(x,y,type='full'), list(setNames(cluster_annotations, c("icgc_donor_id", "cluster")), icgc_clinical, setNames(subset(icgc_fbi_annotations, select=c("condensed_id", "BRCA.status", "Subgroup")), c("icgc_donor_id", "BRCA.status", "Wang_subgroup"))))

icgc_clinical_labeled$SurvObj <- with(icgc_clinical_labeled, Surv(donor_survival_time, donor_vital_status == "deceased"))
```

Survival by FBI status ...

```{r}
simple_survival_analysis(SurvObj ~ cluster != 1, data = subset(icgc_clinical_labeled, cluster != 4))
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = subset(icgc_clinical_labeled, cluster != 4))
```

```{r}
simple_survival_analysis(SurvObj ~ Wang_subgroup, data = icgc_clinical_labeled)
```

#### Combined

##### Sample-level

```{r}
combined_mat_scaled_all <- rbind.fill(combined_mat_scaled_ith %>% as.data.frame %>% rownames_to_column("condensed_id"), combined_mat_scaled_noith %>% as.data.frame %>% rownames_to_column("condensed_id")) %>% column_to_rownames("condensed_id")

row_annotations_all <- rbind.fill(row_annotations_ith %>% rownames_to_column("condensed_id"), row_annotations_noith %>% rownames_to_column("condensed_id")) %>% column_to_rownames("condensed_id")

sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat_scaled_all))

#combined_mat_scaled <- as.data.frame(apply(combined_mat_all, 2, function(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)))
combined_mat_scaled <- combined_mat_scaled_all[sample_order,]

nanostring_vars <- rownames(icgc_celltype_matrix)
combined_mat_scaled_subset <- subset(combined_mat_scaled, select=nanostring_vars)
combined_mat_scaled_subset <- combined_mat_scaled_subset[!apply(combined_mat_scaled_subset, 1, function(x) all(is.na(x))),]
```

```{r, fig.width=7, fig.height=12}
pheatmap(clip_values(combined_mat_scaled_subset, 2, -2), cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations_all, fontsize = 6, clustering_distance_cols = "correlation", clustering_method = "ward.D2")
```

Doesn't really cluster consistently between ICGC and our samples. 

TODO: Batch correct ICGC and our data together (I think I already did this somewhere) -- and normalize the final matrix in one step rather than normalizing separately for each subcohort and combining those together. Otherwise we could always be subject to the scenario where the ITH cohort may be skewed in immune response (either all relatively low or high) compared to the ICGC cohort. 

TODO: Use absolute counts of mutations for each signature, and cluster based on those. It may be that samples with too low/high mutation counts are not clustering properly. 

##### Patient-level

```{r}
#ith_icgc_batch_corrected_expression_file <- "~/shahlab/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv"

ith_icgc_expression <- fread(ith_icgc_batch_corrected_expression_file)

ith_icgc_celltype_expr <- create_celltype_matrix(ith_icgc_expression, labels, db_path, convert_ids = FALSE)
ith_icgc_pathway_expr <- create_pathway_matrix(ith_icgc_expression, labels, db_path, convert_ids = FALSE)

icgc_specimen_data <- fread(icgc_specimen_file)
```

```{r}
summarize_expression_by_patient <- function(expr) {
  primary_specimen_dat <- subset(icgc_specimen_data, str_detect(specimen_type, "Primary"))
  
  notx <- subset(primary_specimen_dat, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type == "no treatment")$icgc_donor_id
  primary_specimen_dat <- subset(primary_specimen_dat, icgc_donor_id %in% notx)
  
  x <- setNames(melt(as.matrix(expr)), c("Name", "sample", "expr"))
  x$patient_id <- map_id(as.character(x$sample), from = "condensed_id", to="patient_id", db_path)
  idx <- is.na(x$patient_id)
  donor_labels <- df_as_map(primary_specimen_dat, as.character(x$sample[idx]), from = "icgc_specimen_id",to = "icgc_donor_id")
  x$patient_id[idx] <- donor_labels
  x <- subset(x, !is.na(patient_id))
  x_sum <- x %>% group_by(Name, patient_id) %>% summarise(expr=mean(expr, na.rm=TRUE))
  
  res <- dcast(x_sum, formula = Name ~ patient_id, value.var = "expr")
  return(res)
}
```

```{r}
NCLUST <- 3
clusters <- cutree(patient_props_heat$tree_row, NCLUST)
patient_clusts <- make_cluster_frame(clusters)

cluster_annotations <- subset(plyr::rename(patient_clusts, c('new_id'='patient_id')), select=c(patient_id, cluster))

ith_icgc_celltype_expr_patient <- summarize_expression_by_patient(ith_icgc_celltype_expr)
ith_icgc_pathway_expr_patient <- summarize_expression_by_patient(ith_icgc_pathway_expr)

expr_dat <- ith_icgc_pathway_expr_patient
combined_patient_mat <- expr_dat %>% as.data.frame %>% column_to_rownames(var = "Name") %>% 
  t %>% as.data.frame %>% rownames_to_column("patient_id")

data_matrices <- list(cluster_annotations, combined_patient_mat)
data_matrices <- lapply(data_matrices, function(x) {
  x
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='inner'), data_matrices)
combined_mat <- subset(combined_df, select=-c(cluster))
rownames(combined_mat) <- NULL
combined_mat <- combined_mat %>% column_to_rownames("patient_id")

SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(patient_props_mat[,SIGS], check.names = FALSE), var = "patient_id")

subtype_annotations <- subset(icgc_subtypes, select=c("icgc_donor_id", "subtype", "nmf_subtype"))
colnames(subtype_annotations)[1] <- "patient_id"

icgc_fbi_status_patient <- fread(wang_icgc_fbi_status_file)
colnames(icgc_fbi_status_patient) <- mapvalues(colnames(icgc_fbi_status_patient), from = c("Case_ID"), to = c("patient_id"))

icgc_fbi_annotations <- subset(icgc_fbi_status_patient, select=c("patient_id", "Patch et al. Class", "Patch et al. Molecular Subgroup", "BRCA.status", "Subgroup"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations, subtype_annotations, icgc_fbi_annotations))

row_annotations <- row_annotations %>% column_to_rownames("patient_id")

combined_patient_mat_scaled <- combined_mat %>% scale

patient_order <- patient_props_heat$tree_row$labels[patient_props_heat$tree_row$order]
patient_order <- intersect(patient_order, rownames(combined_patient_mat_scaled))
```

```{r, fig.width=7, fig.height=10}
pheatmap(clip_values(combined_patient_mat_scaled[patient_order,], 2, -2), cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation", clustering_method = "ward.D2")
```



### Ancestral-descendant signature proportions

```{r}
ancdesc_props_snv <- unique(subset(sig_results_snv, select=c("patient_id", "is_ancestral", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv)]))) %>% summarize_signature_proportions(by=c("patient_id", "is_ancestral"), signature_labels)
ancdesc_props_sv <- unique(subset(sig_results_sv, select=c("patient_id", "prediction_id", "is_ancestral", signature_labels[signature_labels %in% colnames(sig_results_sv)]))) %>% summarize_signature_proportions(by=c("patient_id", "is_ancestral"), signature_labels)

ancdesc_props <- plyr::join(ancdesc_props_snv %>% as.data.frame, ancdesc_props_sv %>% as.data.frame)

ancdesc_props_mat <- subset(ancdesc_props, select=colnames(ancdesc_props)[colnames(ancdesc_props) %in% signature_labels])
```

```{r}
ancdesc_props_scaled <- scale(ancdesc_props_mat)
rownames(ancdesc_props_scaled) <- with(ancdesc_props, paste(patient_id, is_ancestral, sep="_"))

pheatmap(ancdesc_props_scaled, clustering_distance_rows = "euclidean", clustering_method = "ward.D2")
```

```{r}
ancdesc_props_melted <- melt(ancdesc_props, id.vars = c("patient_id", "is_ancestral"), measure.vars = intersect(colnames(ancdesc_props), signature_labels), variable.name = "signature", value.name = "proportion")

pvals <- setNames(ddply(ancdesc_props_melted, .(signature), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(proportion ~ is_ancestral, df, paired=TRUE)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(ancdesc_props_melted, aes(x=factor(is_ancestral), y=proportion)) + geom_boxplot() + facet_wrap(~ signature, scales="free") + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) + 
theme_bw() + theme_Publication()
```

### Origin node signature proportions

**Note**: We cannot get node-specific rearrangement signatures. 

**Note**: These node labels DO NOT correspond to node labels within the clone phylogeny; they correspond to nodes within the Dollo model. 

```{r}
node_props_snv <- unique(subset(sig_results_snv, select=c("patient_id", "origin_node", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv)]))) %>% summarize_signature_proportions(by=c("patient_id", "origin_node"), signature_labels, report_count = TRUE)
```
```{r}
node_props_mat <- subset(node_props_snv, select=colnames(node_props_snv)[colnames(node_props_snv) %in% signature_labels])
```

```{r, fig.width=7, fig.height=10}
node_props_scaled <- scale(node_props_mat)
rownames(node_props_scaled) <- with(node_props_snv, paste(patient_id, origin_node, sep="_"))

row_annotations <- subset(node_props_snv, select=c("patient_id", "n"))
row_annotations$n <- log2(row_annotations$n)
row_annotations$patient_id <- factor_id(row_annotations$patient_id, type = "patient_id", db_path)
rownames(row_annotations) <- rownames(node_props_scaled)

clustering_colours <- list(patient_id = pal_patient)

pheatmap(clip_values(node_props_scaled, 2, -2), clustering_distance_rows = "euclidean", clustering_method = "ward.D2", fontsize_row = 5, annotation_row = row_annotations, annotation_colors = clustering_colours)
```

Looks like there are similar selection pressures acting at each part of the sample phylogeny -- i.e. mutation signatures are 'relatively' consistent within patients throughout time. 

### Clonal phylogeny branch-specific signature proportions

NOTE: THE LABELS ON THESE TREES MAY NOT BE THE SAME AS THOSE IN THE MAPSCAPES (ugh). 

```{r}
tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, clone_prevalence_file, db_path)
trees <- lapply(tree_branch_data, function(x) x$tree)
branch_lengths <- rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))
```

```{r}
snv_cluster <- rbind.fill(lapply(seq_along(snv_cluster_files), function(i) {
  f <- snv_cluster_files[[i]]
  patient_id <- snv_cluster_patients[[i]]
  snv_cluster <- read_snv_cluster(f, clone_branch_length_file, db_path)
  return(snv_cluster)
}))

snv_cluster <- plyr::join(snv_cluster, branch_lengths)
```

```{r}
snv_cluster_filtered <- subset(snv_cluster, !is.na(label))

sig_results_snv_cluster <- plyr::join(sig_results_snv, snv_cluster_filtered)
```

```{r}
branch_props_snv <- unique(subset(sig_results_snv_cluster, select=c("patient_id", "label", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv_cluster)]))) %>% summarize_signature_proportions(by=c("patient_id", "label"), signature_labels, report_count = TRUE)
```

```{r}
branch_props_snv_melted <- melt(branch_props_snv, id.vars = c("patient_id", "label", "n"), measure.vars = intersect(signature_labels, colnames(branch_props_snv)), variable.name = "signature", value.name = "proportion")
```

```{r, fig.width=7, fig.height=25}
ggplot(branch_props_snv_melted, aes(x=label, y=proportion)) + geom_bar(aes(fill=signature), stat = "identity") + facet_wrap(~ patient_id, ncol=1, scales = "free_x") + theme_bw() + theme_Publication() + geom_text(data=unique(subset(branch_props_snv_melted, select=c("patient_id", "label", "n"))), aes(x=label, y=1, label=n),vjust=-.2,stat="identity") + ylim(c(0, 1.2))
```

What if we use MAP assignments? (This would allow us to apply chi-square tests.) 

TODO: Figure out how to apply tests between k > 2 groups of proportions... in other words, a test of homogeneity. 

```{r}
id_vars <- colnames(sig_results_snv_cluster)[!colnames(sig_results_snv_cluster) %in% signature_labels]

sig_results_snv_cluster_melted <- melt(sig_results_snv_cluster, id.vars = id_vars, measure.vars = intersect(signature_labels, colnames(sig_results_snv_cluster)), variable.name = "signature", value.name = "proportion")
maxes <- sig_results_snv_cluster_melted %>% group_by_(.dots = id_vars) %>% summarise(maxprop=max(proportion))
sig_results_snv_cluster_melted <- plyr::join(sig_results_snv_cluster_melted,maxes)
sig_results_snv_cluster_melted$proportion_map <- ifelse(sig_results_snv_cluster_melted$proportion == sig_results_snv_cluster_melted$maxprop, 1, 0)

sig_results_snv_cluster_casted <- dcast(sig_results_snv_cluster_melted, formula = paste0(paste(id_vars, collapse="+"), "~ signature"), value.var = "proportion_map")

branch_props_snv_map <- unique(subset(sig_results_snv_cluster_casted, select=c("patient_id", "label", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv_cluster_casted)]))) %>% summarize_signature_proportions(by=c("patient_id", "label"), signature_labels, report_count = TRUE)

branch_props_snv_map_melted <- melt(branch_props_snv_map, id.vars = c("patient_id", "label", "n"), measure.vars = intersect(signature_labels, colnames(branch_props_snv_map)), variable.name = "signature", value.name = "proportion")
```


```{r, fig.width=7, fig.height=25}
ggplot(branch_props_snv_map_melted, aes(x=label, y=proportion)) + geom_bar(aes(fill=signature), stat = "identity") + facet_wrap(~ patient_id, ncol=1, scales = "free_x") + theme_bw() + theme_Publication() + geom_text(data=unique(subset(branch_props_snv_map_melted, select=c("patient_id", "label", "n"))), aes(x=label, y=1, label=n),vjust=-.2,stat="identity") + ylim(c(0, 1.2))
```

### Adjusted clone trees

```{r}
trees_age <- trees
age_signature <- "SNV-5"

branch_props_dat <- branch_props_snv_map_melted

tree_objects <- lapply(seq_along(trees_age), function(i) {
  tree <- trees_age[[i]]
  tree_old <- tree
  patient <- names(trees_age)[i]
  branch_props_dat_sub <- subset(branch_props_dat, patient_id == as.numeric(patient) & signature == age_signature)
  branch_props_dat_sub$length <- with(branch_props_dat_sub, proportion*n)
  
  edge_lengths <- tree@edge.length
  idx <- which(edge_lengths == 0)
  to_labels <- tree@label[str_extract(names(edge_lengths), "[0-9]+$")]
  lengths <- df_as_map(branch_props_dat_sub, unname(to_labels), from = "label", to="length")
  if (length(idx) > 0) {
    lengths[idx] <- 0
  }
  tree@edge.length <- lengths
  names(tree@edge.length) <- names(edge_lengths)
  
  return(list(all=tree_old, age=tree))
})
names(tree_objects) <- names(trees_age)
```

```{r, fig.width=20, fig.height=20}
tmp <- unlist(tree_objects)
#ignore <- lapply(seq_along(tmp), function(i) {
#  print(plot(tmp[[i]], show.node.label = TRUE, main = names(tmp)[i]))
#})
```

```{r}
find_ancestors <- function(tree) {
  x <- phylobase:::.phylo4ToDataFrame(tree)
  root <- subset(x, node.type == "root")$label
  root_number <- names(which(tree@label == root))
  direct_descendants <- subset(x, ancestor == as.numeric(root_number))$label
  if (length(direct_descendants) == 1) {
    return(c(root, direct_descendants))
  } else {
    return(root)
  }
}
```

```{r}
patients <- unique(branch_props_dat$patient_id)
root_data <- rbind.fill(lapply(patients, function(pat) {
  tree <- trees[[as.character(pat)]]
  ancestors <- find_ancestors(tree)
  rbind.fill(lapply(ancestors, function(x) {
    data.frame(label=x, patient_id=pat)
  }))
}))
root_data$node_type <- "root"
```

```{r}
branch_data_annotated <- plyr::join(branch_props_dat, root_data)
branch_data_annotated$node_type[is.na(branch_data_annotated$node_type)] <- "descendant"

root_proportions <- branch_data_annotated %>% subset(node_type == "root") %>% group_by(patient_id, signature) %>% summarise(proportion=weighted.mean(proportion, w = n)) %>% plyr::rename(c('proportion'='root_proportion'))

branch_data_annotated <- plyr::join(branch_data_annotated, root_proportions)
branch_data_annotated$proportion_diff <- with(branch_data_annotated, proportion - root_proportion)
```

```{r}
ggplot(branch_data_annotated %>% subset(node_type == "descendant" & n > 40), aes(x=proportion_diff, fill=signature)) + geom_histogram(binwidth=0.02, alpha=0.4, position='identity') + theme_bw() + theme_Publication()
```