library(ithi.utils)
load_base_libs()

library(ithi.meta)
library(ithi.ihc)
library(ithi.clones)

Colour palettes

pal_patient <- select_palette("patient")

Parameters

db_path <- snakemake@params$db

ihc_table_path <- snakemake@input$ihc_table
ihc_table_slide_path <- snakemake@input$ihc_table_slide
tils_for_cluster <- unlist(snakemake@params$tils_for_cluster)
nclusts <- as.numeric(snakemake@params$nclusts)

tils_for_variability <- unlist(snakemake@params$tils_for_variability)
variability_type <- snakemake@params$variability_type

# clone_tree_dir <- snakemake@input$clone_tree_dir
clone_tree_file <- snakemake@input$clone_tree_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file
clonal_measures_file <- snakemake@input$ith_stats
prevalence_threshold <- as.numeric(snakemake@params$prevalence_threshold)

proportion_subclonality_file <- snakemake@input$subclonality

Metadata

id_type <- "condensed_id"

db <- src_sqlite(db_path, create = FALSE)
duplicates <- collect(tbl(db, "duplicates"))

TIL-based clusters

We’ll use the following TIL subsets for clustering E_CD8_density, E_CD4_density, E_CD20_density, E_Plasma_density, S_CD8_density, S_CD4_density, S_CD20_density, S_Plasma_density.

ihc_table <- fread(ihc_table_path)
ihc_data <- subset(ihc_table, select = c(tils_for_cluster))
valid_rows <- apply(ihc_data, 1, function(x) all(!is.na(x)))

patient_ids <- as.data.frame(subset(ihc_table, select = c(patient_id)))
ihc_data <- as.data.frame(ihc_data[valid_rows, ])

patient_ids <- patient_ids[valid_rows, , drop = FALSE]
patient_ids$patient_id <- factor(patient_ids$patient_id)
colnames(patient_ids) <- "Patient"

rownames(ihc_data) <- rownames(patient_ids) <- as.data.frame(ihc_table)[, id_type][valid_rows]
dat_processed <- t(clip_values(scale(ihc_data)))
rownames(dat_processed) <- mapvalues(rownames(dat_processed), from = c("E_CD8_density", 
    "E_CD4_density", "E_CD20_density", "E_Plasma_density", "S_CD8_density", 
    "S_CD4_density", "S_CD20_density", "S_Plasma_density"), to = c("E CD8+", 
    "E CD4+", "E CD20+", "E Plasma", "S CD8+", "S CD4+", "S CD20+", "S Plasma"))

p <- pheatmap(dat_processed, clustering_method = "ward.D", annotation_col = patient_ids, 
    annotation_colors = list(Patient = pal_patient), cluster_rows = FALSE, show_colnames = FALSE, 
    legend = TRUE, cellwidth = 10, cellheight = 10, annotation_legend = FALSE)

clusts <- data.frame(cutree(p$tree_col, nclusts))
setDT(clusts, keep.rownames = TRUE)
clusts <- setNames(clusts, c(id_type, "cluster"))

clusts$cluster <- factor(clusts$cluster)
ihc_table_annotated <- merge(ihc_table, clusts, by = c(id_type))

Clonal diversity

tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, 
    clone_prevalence_file, db_path)
trees <- lapply(tree_branch_data, function(x) x$tree)
prevalences <- rbind.fill(lapply(tree_branch_data, function(x) x$prevalence_dat))
branch_lengths <- rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))

ccfs <- compute_ccf(prevalences, trees, id_type = id_type)
ccfs_labeled <- merge(ccfs, branch_lengths, by = c("label", "node", "patient_id"))

clonal_measures <- read_ith_stats(clonal_measures_file, db_path, duplicates = FALSE)
prevalence_indices <- prevalences %>% group_by(condensed_id, patient_id) %>% 
    summarise(shannon = vegan::diversity(clonal_prevalence, index = "shannon"), 
        simpson = vegan::diversity(clonal_prevalence, index = "simpson"))

prevalences_filtered <- subset(prevalences, clonal_prevalence >= prevalence_threshold)
# vafs_filtered <- subset(vafs, prevalence >= prevalence_threshold) ## Not
# exactly right but close enough
divergence <- compute_clone_divergence(prevalences_filtered, trees, id_type)
# divergence_weighted <- compute_clone_divergence(prevalences_filtered,
# vafs_filtered, clone_trees, branch_lengths, id_type, weighted=TRUE)
# colnames(divergence_weighted) <- mapvalues(colnames(divergence_weighted),
# from = 'clone_divergence', to = 'clone_divergence_weighted')
diversity <- Reduce(function(x, y) merge(x, y, by = c(id_type, "patient_id"), 
    all = TRUE), list(prevalence_indices, clonal_measures))

ITH-TIL correlations

Old measures

We’ll use proportion_subclonality as the surrogate.

proportion_subclonal <- fread(proportion_subclonality_file)

res <- plyr::join(clusts, proportion_subclonal)

if (nclusts == 2) {
    corres <- wilcox.test(proportion_subclonal ~ cluster, data = res)
} else {
    corres <- kruskal.test(proportion_subclonal ~ cluster, data = res)
}

pval <- corres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))

ggplot(res, aes(x = cluster, y = proportion_subclonal)) + geom_boxplot(aes(fill = cluster)) + 
    theme_bw() + theme_Publication() + scale_fill_Publication() + annotate("text", 
    x = Inf, y = Inf, parse = TRUE, hjust = 1, vjust = 1, label = as.character(as.expression(eq)))

By a Kruskal-Wallis test.

If we instead choose to compare cluster 3 (high epithelial TILs) against everything else, then:

wilcox.test(proportion_subclonal ~ cluster == 3, data = res)

    Wilcoxon rank sum test

data:  proportion_subclonal by cluster == 3
W = 424, p-value = 0.005338
alternative hypothesis: true location shift is not equal to 0

New measures (in progress)

res <- merge(diversity, ihc_table_annotated, by = c(id_type, "patient_id"))

ith_measures <- c("shannon", "entropy", "simpson", "divergence", "postprocessed_divergence", 
    "proportion_subclonal", "combined_ith_normalized", "combined_ith_raw")
res_melted <- reshape2::melt(res, id.vars = c(id_type, "patient_id", "cluster"), 
    measure.vars = ith_measures, variable.name = "ith_type", value.name = "ith")
res_melted$ith_type <- factor(res_melted$ith_type)
stats <- setNames(ddply(res_melted, .(ith_type), function(x) {
    if (nclusts == 2) {
        res <- wilcox.test(ith ~ cluster, data = x)
    } else {
        res <- kruskal.test(ith ~ cluster, data = x)
    }
    pvalue <- res$p.value
    eq <- substitute(italic(P) == p, list(p = format(pvalue, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("ith_type", "p.value"))
ggplot(res_melted, aes(x = cluster, y = ith)) + geom_boxplot(aes(fill = cluster)) + 
    theme_bw() + theme_Publication() + scale_fill_Publication() + facet_wrap(~ith_type, 
    scales = "free") + geom_text(data = stats, aes(x = Inf, y = Inf, label = p.value), 
    hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

Check with Andrew as to how he computed clone divergence – my computations based on what I believe the definition is are different. Perhaps it’s a 1-minus issue?

ITH-TIL variability

ihc_table_slide <- fread(ihc_table_slide_path)

ihc_slide_data <- subset(ihc_table_slide, select = c("condensed_id", "patient_id", 
    "slide", tils_for_variability))
ihc_melted <- melt(ihc_slide_data, id.vars = c("condensed_id", "patient_id", 
    "slide"), measure.vars = tils_for_variability, variable.name = "tiltype", 
    value.name = "density")
ihc_melted$tiltype <- as.character(ihc_melted$tiltype)

intersect_samples <- intersect(unique(ihc_melted$condensed_id), unique(ccfs_labeled$condensed_id))

til_var <- compute_ihc_var(ihc_melted, ihc_table, samples = intersect_samples, 
    til_variance_option = variability_type, tils_for_variability)

clonedists <- clone_distances(ccfs_labeled, normalize = FALSE, id_type = id_type)
clonedists$patient_id <- as.character(clonedists$patient_id)

variabilities <- inner_join(til_var, subset(clonedists, select = -c(dist_clones_weighted)), 
    by = c("patient_id"))
variabilities_melted <- melt(variabilities, id.vars = c("patient_id", "mean_weighted_dist"), 
    measure.vars = tils_for_variability, variable.name = "tiltype", value.name = "tilvar")
variabilities_melted$patient_id <- factor(variabilities_melted$patient_id)
stats <- setNames(ddply(variabilities_melted, .(tiltype), function(x) {
    res <- with(x, cor.test(mean_weighted_dist, tilvar, method = "spearman"))
    pvalue <- res$p.value
    eq <- substitute(italic(P) == p, list(p = format(pvalue, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))
p <- ggplot(variabilities_melted, aes(x = mean_weighted_dist, y = tilvar)) + 
    geom_point(aes(colour = patient_id)) + theme_bw() + theme_Publication()
p <- p + facet_wrap(~tiltype, scales = "free", nrow = 1)
p <- p + theme(axis.text.x = element_text(size = 6, family = "Helvetica"), legend.text = element_text(size = 6, 
    family = "Helvetica"), axis.text.y = element_text(size = 6, family = "Helvetica"), 
    axis.title = element_text(size = 8, family = "Helvetica"), strip.text = element_text(size = 7, 
        family = "Helvetica"), strip.background = element_rect(fill = "white"))
p <- p + ylab("Variability") + xlab("Mean clonal distance")
p <- p + geom_text(data = stats, aes(x = Inf, y = Inf, label = p.value), hjust = 1.1, 
    vjust = 1.5, size = 3, parse = TRUE)
p <- p + scale_colour_manual(values = pal_patient) + guides(colour = guide_legend(title = "Patient"))

plot(p)

---
title: "ITH-TIL analysis"
output: 
  html_notebook:
    toc: true
    toc_depth: 5
    toc_float: true
params:
  rmd: "ith_til_densities.Rmd"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table_slide.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', 'Rmd/ith_til_densities.Rmd', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "ith_stats" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "ihc_table_slide" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table_slide.tsv', "subclonality" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "notebook" = 'Rmd/ith_til_densities.Rmd'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/web/ith_til_densities.nb.html'),
    params = list('/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), 3, c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), 'ithi-analysis-ith-til-analysis', 0.01, 'stabilize', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "tils_for_variability" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "nclusts" = 3, "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "name" = 'ithi-analysis-ith-til-analysis', "prevalence_threshold" = 0.01, "variability_type" = 'stabilize'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperanalysis2/ith_til_analysis.log'),
    resources = list(),
    config = list("mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "known_subtype_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "tils_for_variability" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "ihc_run2" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd79cd138cd68/validated_stats_weighted.rdata', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run4', "known_subtypes_merged" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/known_subtypes_merged.tsv', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "tcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/TRB_clonotypes_filtered.txt', "ith_project_results" = '/shahlab/alzhang/projects/ith3/data/results', "ihc_xcr_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "prevalence_threshold" = 0.01, "clonal_samplers" = c('HMC', 'NUTS'), "master_breakpoint_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "multiviewclustering_notebook" = 'Rmd/multiviewclustering.Rmd', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperanalysis2', "mutation_signature_notebook" = 'Rmd/mutation_signatures.Rmd', "icgc_specimen_file" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web', "classifier_type" = 'knn', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "proportion_subclonal_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "variability_type" = 'stabilize', "tcga_expr_matrix" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv', "PNG_DENSITY" = 300, "PNG_QUALITY" = 300, "icgc_normalized_reads_matrix" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "sad_notebook" = 'Rmd/species_abundance_distributions.Rmd', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "xcrmapscape_notebook" = 'Rmd/xcrmapscape.Rmd', "tcga_clinical" = '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', "molsubtype_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "spatial_notebook" = 'Rmd/spatial_analysis.Rmd', "ihc_xcr_stats_notebook" = 'Rmd/ihc_xcr_stats.Rmd', "mvclust_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "index_notebook" = 'Rmd/index.Rmd', "example_msa_plot" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/old/alignment_plots/msa/ith2_2/clust9/indel_reversed.html', "icgc_molecular_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "mmctm_patient_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/tree_data.tsv', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "example_annotations" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/final_partitions/ith2_2/clust9/annotations_flagged.tsv', "mmctm_sample_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/plots/ith-by-sample_snv-sv_sigs_multipanel.pdf', "nclusts" = 3, "spatial_result_dirs" = list("epithelial" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith3/abc', "stromal" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith5/abc'), "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "j_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBJ.fasta', "xcr_qc_notebook" = 'Rmd/replicates.Rmd', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/branch_data.tsv', "driver_map" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/drivers/data/gene_list_mapped.bed', "xcr_clones_notebook" = 'Rmd/xcr_clones_analysis.Rmd', "mvclust_nclust" = 3, "immune_variability_notebook" = 'Rmd/immune_variability.Rmd', "mmctm_ov_combined_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', "neoantigen_editing_notebook" = 'Rmd/immunoediting.Rmd', "mmctm_sample_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', "v_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBV.fasta', "xcr_distance_method" = 'horn', "molsubtype_notebook" = 'Rmd/molecular_subtypes.Rmd', "xcr_mapping_notebook" = 'Rmd/xcr_mapping.Rmd', "ith_til_notebook" = 'Rmd/ith_til_densities.Rmd', "mutsig_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "driver_analysis_notebook" = 'Rmd/driver_analysis.Rmd', "intermediate_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2', "immtyper_models" = '/shahlab/alzhang/projects/ITH_Immune/results/immtyper_results/klarenbeek/aa_vj/gradboost', "wang_fbi_status" = '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', "mmctm_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', "logdir" = '/shahlab/alzhang/clusttmp/paperanalysis2', "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clone_data.tsv', "bcrphylo_examples_notebook" = 'Rmd/bcr_phylo_examples.Rmd', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "bcrphylo_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "mmctm_ov_combined_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/plots/ov_snv-sv_sigs_multipanel.pdf', "icgc_clinical" = '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', "subtype_marker_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/subtype_markers.tsv', "mapscape_notebook" = 'Rmd/mapscape.Rmd', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "mmctm_final_patient_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov/plots/ith-by-patient_snv-sv_sigs_multipanel.pdf', "figure_gallery_notebook" = 'Rmd/figures.Rmd', "master_variant_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "bcrphylo_correlations_notebook" = 'Rmd/bcr_phylo_correlations.Rmd', "nanostring_signature_notebook" = 'Rmd/nanostring_signatures.Rmd', "tcga_ov_annotations" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "immtyper_lengths" = '11 12 13 14 15 16 17 18', "til_classifier_notebook" = 'Rmd/til_classifier.Rmd', "ith_stats_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', "mmctm_sample_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/plots/ith-by-ancestral-sample_snv-sv_sigs_multipanel.pdf', "bcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/IGH_clonotypes_filtered.txt', "ihc_run1" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd8cd3cd20/validated_stats_weighted_new.rdata', "ith_statistics_notebook" = 'Rmd/ith_statistics.Rmd', "library_sizes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/library_sizes.tsv', "default_sampler" = 'HMC', "mmctm_patient_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/plots/ith-by-patient-ancestry_snv-sv_sigs_multipanel.pdf', "icgc_expr_melted" = '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', "phenotype_threshold" = 0.85, "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2'),
    rule = 'ith_til_analysis'
)
######## Original script #########

                        ```


```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE)
```


```{r}
library(ithi.utils)
load_base_libs()

library(ithi.meta)
library(ithi.ihc)
library(ithi.clones)
```

## Colour palettes

```{r}
pal_patient <- select_palette("patient")
```

## Parameters

```{r}
db_path <- snakemake@params$db

ihc_table_path <- snakemake@input$ihc_table
ihc_table_slide_path <- snakemake@input$ihc_table_slide
tils_for_cluster <- unlist(snakemake@params$tils_for_cluster)
nclusts <- as.numeric(snakemake@params$nclusts)

tils_for_variability <- unlist(snakemake@params$tils_for_variability)
variability_type <- snakemake@params$variability_type

#clone_tree_dir <- snakemake@input$clone_tree_dir
clone_tree_file <- snakemake@input$clone_tree_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file
clonal_measures_file <- snakemake@input$ith_stats
prevalence_threshold <- as.numeric(snakemake@params$prevalence_threshold)

proportion_subclonality_file <- snakemake@input$subclonality
```

## Metadata

```{r}
id_type <- "condensed_id"

db <- src_sqlite(db_path, create=FALSE)
duplicates <- collect(tbl(db, "duplicates"))
```

## TIL-based clusters

We'll use the following TIL subsets for clustering `r tils_for_cluster`.

```{r}
ihc_table <- fread(ihc_table_path)
```

```{r}
ihc_data <- subset(ihc_table, select=c(tils_for_cluster))
valid_rows <- apply(ihc_data, 1, function(x) all(!is.na(x)))

patient_ids <- as.data.frame(subset(ihc_table, select=c(patient_id)))
ihc_data <- as.data.frame(ihc_data[valid_rows,])

patient_ids <- patient_ids[valid_rows,,drop=FALSE]
patient_ids$patient_id <- factor(patient_ids$patient_id)
colnames(patient_ids) <- "Patient"

rownames(ihc_data) <- rownames(patient_ids) <- as.data.frame(ihc_table)[,id_type][valid_rows]
```

```{r, fig.width=18, fig.height=4}
dat_processed <- t(clip_values(scale(ihc_data)))
rownames(dat_processed) <- mapvalues(rownames(dat_processed), 
                                     from = c("E_CD8_density", "E_CD4_density", "E_CD20_density", "E_Plasma_density", "S_CD8_density", "S_CD4_density", "S_CD20_density", "S_Plasma_density"),
          to=c("E CD8+", "E CD4+", "E CD20+", "E Plasma", "S CD8+", "S CD4+", "S CD20+", "S Plasma"))

p <- pheatmap(dat_processed, clustering_method = "ward.D", annotation_col = patient_ids, 
              annotation_colors = list(Patient=pal_patient), cluster_rows = FALSE,
              show_colnames = FALSE, legend = TRUE, cellwidth=10, cellheight=10,
              annotation_legend = FALSE)
```



```{r}
clusts <- data.frame(cutree(p$tree_col, nclusts))
setDT(clusts, keep.rownames = TRUE)
clusts <- setNames(clusts, c(id_type, "cluster"))

clusts$cluster <- factor(clusts$cluster)
ihc_table_annotated <- merge(ihc_table, clusts, by=c(id_type))
```

## Clonal diversity

```{r}
tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, clone_prevalence_file, db_path)
trees <- lapply(tree_branch_data, function(x) x$tree)
prevalences <- rbind.fill(lapply(tree_branch_data, function(x) x$prevalence_dat))
branch_lengths <- rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))

ccfs <- compute_ccf(prevalences, trees, id_type = id_type)
ccfs_labeled <- merge(ccfs, branch_lengths, by=c("label", "node", "patient_id"))

clonal_measures <- read_ith_stats(clonal_measures_file, db_path, duplicates = FALSE)
```

```{r}
prevalence_indices <- prevalences %>% group_by(condensed_id, patient_id) %>% summarise(shannon=vegan::diversity(clonal_prevalence, index="shannon"), simpson=vegan::diversity(clonal_prevalence, index="simpson"))

prevalences_filtered <- subset(prevalences, clonal_prevalence >= prevalence_threshold)
#vafs_filtered <- subset(vafs, prevalence >= prevalence_threshold) ## Not exactly right but close enough
divergence <- compute_clone_divergence(prevalences_filtered, trees, id_type)
#divergence_weighted <- compute_clone_divergence(prevalences_filtered, vafs_filtered, clone_trees, branch_lengths, id_type, weighted=TRUE)
#colnames(divergence_weighted) <- mapvalues(colnames(divergence_weighted), from = "clone_divergence", 
#                                           to = "clone_divergence_weighted")
diversity <- Reduce(function(x,y) merge(x,y,by=c(id_type, "patient_id"), all=TRUE),
                    list(prevalence_indices, clonal_measures))
```

## ITH-TIL correlations

### Old measures 

We'll use proportion_subclonality as the surrogate. 

```{r}
proportion_subclonal <- fread(proportion_subclonality_file)

res <- plyr::join(clusts, proportion_subclonal)

if (nclusts == 2) {
  corres <- wilcox.test(proportion_subclonal ~ cluster, data=res)
} else {
  corres <- kruskal.test(proportion_subclonal ~ cluster, data=res)
}

pval <- corres$p.value
eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))

ggplot(res, aes(x=cluster, y=proportion_subclonal)) + geom_boxplot(aes(fill=cluster)) + theme_bw() + theme_Publication() + scale_fill_Publication() + annotate('text', x=Inf, y=Inf, parse=TRUE, hjust=1, vjust=1, label=as.character(as.expression(eq)))
```

By a Kruskal-Wallis test.

If we instead choose to compare cluster 3 (high epithelial TILs) against everything else, then:

```{r}
wilcox.test(proportion_subclonal ~ cluster == 3, data=res)
```

### New measures (in progress)

```{r}
res <- merge(diversity, ihc_table_annotated, by=c(id_type, "patient_id"))

ith_measures <- c("shannon", "entropy", "simpson", "divergence", "postprocessed_divergence", "proportion_subclonal",
                  "combined_ith_normalized", "combined_ith_raw")
res_melted <- reshape2::melt(res, id.vars = c(id_type, "patient_id", "cluster"),
                   measure.vars = ith_measures, variable.name = "ith_type", value.name = "ith")
res_melted$ith_type <- factor(res_melted$ith_type)
```

```{r}
stats <- setNames(ddply(res_melted, .(ith_type), function(x) {
  if (nclusts == 2) {
    res <- wilcox.test(ith ~ cluster, data=x)
  } else {
    res <- kruskal.test(ith ~ cluster, data=x)
  }
  pvalue <- res$p.value
  eq <- substitute(italic(P)==p, list(p=format(pvalue, digits=3)))
  return(as.character(as.expression(eq)))
}), c("ith_type", "p.value"))
```

```{r}
ggplot(res_melted, aes(x=cluster, y=ith)) + geom_boxplot(aes(fill=cluster)) + theme_bw() + theme_Publication() + scale_fill_Publication() + facet_wrap(~ ith_type, scales="free") + geom_text(data=stats, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5, size=3, parse=TRUE)
```

Check with Andrew as to how he computed clone divergence -- my computations based on what I believe the definition is are different. Perhaps it's a 1-minus issue? 


## ITH-TIL variability

```{r}
ihc_table_slide <- fread(ihc_table_slide_path)

ihc_slide_data <- subset(ihc_table_slide, select=c("condensed_id", "patient_id", "slide", tils_for_variability))
ihc_melted <- melt(ihc_slide_data, id.vars = c("condensed_id", "patient_id", "slide"),
                   measure.vars = tils_for_variability, variable.name = "tiltype",
                   value.name = "density")
ihc_melted$tiltype <- as.character(ihc_melted$tiltype)

intersect_samples <- intersect(unique(ihc_melted$condensed_id), unique(ccfs_labeled$condensed_id))

til_var <- compute_ihc_var(ihc_melted, ihc_table, samples = intersect_samples, til_variance_option = variability_type, tils_for_variability)

clonedists <- clone_distances(ccfs_labeled, normalize = FALSE, id_type = id_type)
clonedists$patient_id <- as.character(clonedists$patient_id)

variabilities <- inner_join(til_var, subset(clonedists, select=-c(dist_clones_weighted)), by=c("patient_id"))
variabilities_melted <- melt(variabilities, id.vars = c("patient_id", "mean_weighted_dist"),
                             measure.vars = tils_for_variability, variable.name = "tiltype",
                             value.name = "tilvar")
variabilities_melted$patient_id <- factor(variabilities_melted$patient_id)
```

```{r}
stats <- setNames(ddply(variabilities_melted, .(tiltype), function(x) {
  res <- with(x, cor.test(mean_weighted_dist, tilvar, method="spearman"))
  pvalue <- res$p.value
  eq <- substitute(italic(P)==p, list(p=format(pvalue, digits=3)))
  return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))
```

```{r, fig.width=12, fig.height=3}
p <- ggplot(variabilities_melted, aes(x=mean_weighted_dist, y=tilvar)) + geom_point(aes(colour=patient_id)) + 
  theme_bw() + theme_Publication()
p <- p + facet_wrap(~ tiltype, scales = "free", nrow=1)
p <- p + theme(axis.text.x = element_text(size=6, family="Helvetica"),
               legend.text=element_text(size=6,family="Helvetica"),
               axis.text.y=element_text(size=6,family="Helvetica"),
               axis.title=element_text(size=8,family="Helvetica"),
               strip.text=element_text(size=7,family="Helvetica"),
               strip.background = element_rect(fill="white"))
p <- p + ylab("Variability") + xlab("Mean clonal distance")
p <- p + geom_text(data=stats, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5, 
                   size=3, parse=TRUE)
p <- p + scale_colour_manual(values = pal_patient) + 
  guides(colour=guide_legend(title="Patient"))

plot(p)
```



