library(ithi.utils)
load_base_libs()
library(nestedRanksTest)

library(ithi.meta)
library(ithi.xcr)
library(ithi.ihc)
library(ithi.expr)
library(ithi.clones)
library(nanotools)

Colour palettes

pal_patient <- select_palette("patient")

Parameters

db_path <- snakemake@params$db

molecular_subtype_file <- snakemake@input$molecular_subtypes

ihc_table_path <- snakemake@input$ihc_table
tiltypes <- unlist(snakemake@params$tiltypes)

tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity

xcr_table_path <- snakemake@input$xcr_table

nanostring_data_path <- snakemake@input$nanostring_data
nanostring_annotations_path <- snakemake@input$nanostring_annotations

clone_tree_file <- snakemake@input$clone_tree_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file

Metadata

db <- src_sqlite(db_path, create = FALSE)
samples <- collect(tbl(db, "samples"))

Molecular subtype classification

molsubtypes <- fread(molecular_subtype_file)
molsubtypes <- setNames(molsubtypes, c("condensed_id", "subtype"))

Counts

table(molsubtypes$subtype)

C1 C2 C4 C5 
24 36 15 41 

IHC

ihc_table <- fread(ihc_table_path)
ihc_table$patient_id <- factor(ihc_table$patient_id)

ihcm <- melt(ihc_table, id.vars = c("condensed_id", "patient_id"), measure.vars = tiltypes, 
    variable.name = "tiltype", value.name = "density")

ihcm_subtype <- merge(molsubtypes, ihcm, by = c("condensed_id"))
stats <- setNames(ddply(ihcm_subtype, .(tiltype), function(x) {
    res <- kruskal.test(density ~ factor(subtype), data = x)
    pvalue <- res$p.value
    eq <- substitute(italic(P) == p, list(p = format(pvalue, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))

ggplot(ihcm_subtype, aes(x = subtype, y = density)) + geom_boxplot() + geom_jitter(position = position_jitter(width = 0.2, 
    height = 0), aes(colour = patient_id)) + theme_bw() + theme_Publication() + 
    scale_color_manual(values = pal_patient) + facet_wrap(~tiltype, scales = "free", 
    nrow = 2) + geom_text(data = stats, aes(x = Inf, y = Inf, label = p.value), 
    hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

TCR/BCR diversity

diversity_files <- list(tcr = tcr_diversity_file, bcr = bcr_diversity_file)

diversity <- lapply(diversity_files, function(f) {
    dat <- read_xcr_diversity_file(f, db_path)
    merge(dat, molsubtypes, by = c("condensed_id"))
})

xcr_table <- read_clonotypes(xcr_table_path, duplicates = FALSE, db_path)

Read 49.2% of 304822 rows
Read 85.3% of 304822 rows
Read 88.6% of 304822 rows
Read 304822 rows and 18 (of 18) columns from 0.070 GB file in 00:00:05
xcr_table$condensed_id <- as.character(xcr_table$condensed_id)
patients <- unique(xcr_table$patient_id)

tcr_segment_type <- "TRB"
bcr_segment_type <- "IGH"
id_type <- "condensed_id"

# not actually beta diversity but still fun
betas <- rbind.fill(lapply(patients, function(patient) {
    tcr_clonotypes <- subset(xcr_table, type == tcr_segment_type & patient_id == 
        patient)
    bcr_clonotypes <- subset(xcr_table, type == bcr_segment_type & patient_id == 
        patient)
    tcr_cross_table <- cross_tabulate(tcr_clonotypes, id_type = id_type)
    bcr_cross_table <- cross_tabulate(bcr_clonotypes, id_type = id_type)
    
    cross_tables <- list(tcr = tcr_cross_table, bcr = bcr_cross_table)
    
    betavals <- rbind.fill(lapply(seq_along(cross_tables), function(i) {
        cross_table <- cross_tables[[i]]
        df <- cbind(type = names(cross_tables)[i], compute_beta_diversity(cross_table, 
            mode = "union", measure = "count"))
        return(df)
    }))
    return(cbind(patient_id = patient, betavals))
}))
betas_list <- split(betas, f = betas$type)

compute_diversity_index <- function(xcr_table, type = "rareness", rare_threshold = 1e-04) {
    if (type == "rareness") {
        PREVALENCE_TYPE <- "freq"
        
        res <- xcr_table %>% group_by_(.dots = c("condensed_id", "patient_id", 
            "type")) %>% summarise_(rareness = lazyeval::interp(~sum(var[var < 
            rare_threshold]), var = as.name(PREVALENCE_TYPE)))
        res <- split(res, f = res$type)
    } else if (type == "berger_parker") {
        res <- xcr_table %>% group_by_(.dots = c("condensed_id", "patient_id", 
            "type")) %>% summarise(berger_parker = 1/max(freq))
        res <- split(res, f = res$type)
    }
    return(res)
}

cdf_clonotype_freqs <- function(xcr_table, segment = "TRB") {
    ggplot(xcr_table, aes(x = freq)) + stat_ecdf(geom = "step") + scale_x_log10() + 
        facet_wrap(~condensed_id, ncol = 5)
}

plot_diversity_subtype <- function(diversity, col) {
    ignore <- lapply(diversity, function(df) {
        p <- ggplot(df, aes_string(x = "subtype", y = col)) + geom_boxplot() + 
            geom_jitter(position = position_jitter(width = 0.2, height = 0), 
                aes(colour = patient_id)) + theme_bw() + theme_Publication() + 
            scale_color_manual(values = pal_patient) + xlab("Subtype") + ylab("Diversity")
        print(p)
    })
}

Richness

plot_diversity_subtype(diversity, "observedDiversity_mean")

Shannon entropy

plot_diversity_subtype(diversity, "shannonWienerIndex_mean")

Divergence

plot_diversity_subtype(diversity, "inverseSimpsonIndex_mean")

D50 index

plot_diversity_subtype(diversity, "d50Index_mean")

Rareness

rareness <- compute_diversity_index(xcr_table, type = "rareness", rare_threshold = 1e-04)
rareness <- lapply(rareness, function(x) {
    x$patient_id <- as.character(x$patient_id)
    merge(x, molsubtypes, by = c("condensed_id"))
})
plot_diversity_subtype(rareness, "rareness")

Berger-Parker

dominance <- compute_diversity_index(xcr_table, type = "berger_parker")
dominance <- lapply(dominance, function(x) {
    x$patient_id <- as.character(x$patient_id)
    merge(x, molsubtypes, by = c("condensed_id"))
})
plot_diversity_subtype(dominance, "berger_parker")

Sample-specific clonotype proportion

betas_list <- lapply(betas_list, function(x) {
    x$patient_id <- as.character(x$patient_id)
    merge(x, molsubtypes, by = c("condensed_id"))
})
plot_diversity_subtype(betas_list, "prop_unique")

CDFs

cdf_clonotype_freqs(xcr_table, "TRB")

cdf_clonotype_freqs(xcr_table, "IGH")

Nanostring

exprs <- fread(nanostring_data_path)
labels <- fread(nanostring_annotations_path)

celltype_matrix <- rownames_to_column(as.data.frame(t(create_celltype_matrix(exprs, 
    labels, db_path))), var = "condensed_id")
celltype_vars <- colnames(celltype_matrix)[!colnames(celltype_matrix) == "condensed_id"]

celltype_matrix_melted <- melt(celltype_matrix, id.vars = c("condensed_id"), 
    measure.vars = celltype_vars, variable.name = "type", value.name = "expr")

celltype_subtype <- merge(celltype_matrix_melted, molsubtypes, by = c("condensed_id"))
celltype_subtype$patient_id <- factor(map_id(celltype_subtype$condensed_id, 
    from = "condensed_id", to = "patient_id", db_path))
stats <- setNames(ddply(celltype_subtype, .(type), function(x) {
    res <- kruskal.test(expr ~ factor(subtype), data = x)
    pvalue <- res$p.value
    eq <- substitute(italic(P) == p, list(p = format(pvalue, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("type", "p.value"))

ggplot(celltype_subtype, aes(x = subtype, y = expr)) + geom_boxplot() + geom_jitter(position = position_jitter(width = 0.2, 
    height = 0), aes(colour = patient_id)) + theme_bw() + theme_Publication() + 
    scale_color_manual(values = pal_patient) + facet_wrap(~type, scales = "free", 
    nrow = 4) + geom_text(data = stats, aes(x = Inf, y = Inf, label = p.value), 
    hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

pathway_matrix <- rownames_to_column(as.data.frame(t(create_pathway_matrix(exprs, 
    labels, db_path))), var = "condensed_id")
pathway_vars <- colnames(pathway_matrix)[!colnames(pathway_matrix) == "condensed_id"]

pathway_matrix_melted <- melt(pathway_matrix, id.vars = c("condensed_id"), measure.vars = pathway_vars, 
    variable.name = "type", value.name = "expr")

pathway_subtype <- merge(pathway_matrix_melted, molsubtypes, by = c("condensed_id"))
pathway_subtype$patient_id <- factor(map_id(pathway_subtype$condensed_id, from = "condensed_id", 
    to = "patient_id", db_path))
stats <- setNames(ddply(pathway_subtype, .(type), function(x) {
    res <- kruskal.test(expr ~ factor(subtype), data = x)
    pvalue <- res$p.value
    eq <- substitute(italic(P) == p, list(p = format(pvalue, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("type", "p.value"))

ggplot(pathway_subtype, aes(x = subtype, y = expr)) + geom_boxplot() + geom_jitter(position = position_jitter(width = 0.2, 
    height = 0), aes(colour = patient_id)) + theme_bw() + theme_Publication() + 
    scale_color_manual(values = pal_patient) + facet_wrap(~type, scales = "free", 
    nrow = 4) + geom_text(data = stats, aes(x = Inf, y = Inf, label = p.value), 
    hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

Relation to clonal composition

Here we’ll relate intrapatient differences in molecular subtype to differences in clonal composition. Questions: * Are tumours with different molecular subtypes more clonally different?

tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, 
    clone_prevalence_file, db_path)
trees <- lapply(tree_branch_data, function(x) x$tree)
prevalences <- rbind.fill(lapply(tree_branch_data, function(x) x$prevalence_dat))
branch_lengths <- rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))

ccfs <- compute_ccf(prevalences, trees, id_type = "condensed_id")
ccfs_labeled <- merge(ccfs, branch_lengths, by = c("label", "node", "patient_id"))

## Need non-normalized distances
clone_dists <- clone_distances(ccfs_labeled, normalize = TRUE, id_type = "condensed_id")

clone_dists$patient_id <- factor(clone_dists$patient_id)

clone_dists_melted <- rbind.fill(lapply(1:nrow(clone_dists), function(i) {
    patient <- clone_dists$patient_id[i]
    distmat <- clone_dists$dist_clones_weighted[[i]]
    
    distdf <- setNames(melt(as.matrix(distmat)), c("id1", "id2", "clone_dist"))
    distdf$id1 <- factor(distdf$id1)
    distdf$id2 <- factor(distdf$id2)
    distdf <- subset(distdf, as.numeric(id1) > as.numeric(id2))
    return(distdf)
}))
df <- merge(clone_dists_melted, setNames(molsubtypes, c("id1", "subtype1")))
df <- merge(df, setNames(molsubtypes, c("id2", "subtype2")))
df$equal_subtypes <- df$subtype1 == df$subtype2
df$patient_id <- map_id(df$id1, from = "condensed_id", to = "patient_id", db_path)
ggplot(df, aes(x = equal_subtypes, y = clone_dist)) + geom_boxplot(outlier.size = -1) + 
    geom_jitter(aes(colour = patient_id), position = position_jitter(width = 0.2, 
        height = 0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient)

Making this comparison across patients is unfair, as clone distance values (even when normalized) may be very different in scale from patient to patient. It’s obvious there’s no pattern ACROSS patients, but how about WITHIN patients?

If we assume a Gaussian response variable (which is definitely not true):

summary(lmer(clone_dist ~ equal_subtypes + (1 | patient_id), df))
Linear mixed model fit by REML t-tests use Satterthwaite approximations
  to degrees of freedom [lmerMod]
Formula: clone_dist ~ equal_subtypes + (1 | patient_id)
   Data: df

REML criterion at convergence: -244.6

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.4180 -0.4943 -0.0424  0.5238  2.3332 

Random effects:
 Groups     Name        Variance Std.Dev.
 patient_id (Intercept) 0.02447  0.1564  
 Residual               0.01821  0.1349  
Number of obs: 252, groups:  patient_id, 14

Fixed effects:
                     Estimate Std. Error         df t value Pr(>|t|)    
(Intercept)          0.250493   0.044280  14.090000   5.657 5.77e-05 ***
equal_subtypesTRUE  -0.005325   0.020301 242.570000  -0.262    0.793    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr)
eql_sbtTRUE -0.221

There’s a trend toward lower clone distance between pairs of tumours with equal subtype, but this is not significant.

What if we use a nonparametric test?

tmp <- df %>% group_by(patient_id) %>% summarise(both_levels = !Reduce("&", 
    equal_subtypes) & Reduce("|", equal_subtypes))

df_filtered <- subset(df, patient_id %in% subset(tmp, both_levels)$patient_id)

res <- nestedRanksTest(clone_dist ~ !equal_subtypes | patient_id, data = df_filtered, 
    n.iter = 5000)
res

    Nested Ranks Test

data:  clone_dist by !equal_subtypes grouped by patient_id
Z = 0.00081235, p-value = 0.4866
alternative hypothesis: Z lies above bootstrapped null values
null values:
      0%       1%       5%      10%      25%      50%      75%      90% 
-0.36149 -0.25509 -0.17872 -0.14054 -0.07392 -0.00244  0.07311  0.14054 
     95%      99%     100% 
 0.17551  0.24696  0.34768 

bootstrap iterations: 5000 
group weights:
         10          11          12          13          14          15 
0.158407799 0.017059301 0.017059301 0.019496344 0.012997563 0.006498781 
         17           2           3           4           7           9 
0.019496344 0.004061738 0.604386677 0.089358245 0.043866775 0.007311129 
plot(res)

We do see a trend! We can only do this for patients with samples in both clusters. So, samples with different molecular subtypes are also more likely to have different clonal compositions. But this does not appear to be significant when I run it with the new subtyping (combined using ICGC for subtyping …).

---
title: "Molecular subtypes"
output: 
  html_notebook:
    toc: true
    toc_depth: 5
    toc_float: true
params:
  rmd: "molecular_subtypes.Rmd"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', 'Rmd/molecular_subtypes.Rmd', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "xcr_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "notebook" = 'Rmd/molecular_subtypes.Rmd', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "ith_stats" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', "molecular_subtypes" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/web/molecular_subtypes.nb.html'),
    params = list('/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), 'ithi-analysis-molecular-subtype-analysis', "tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "name" = 'ithi-analysis-molecular-subtype-analysis'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperanalysis2/molecular_subtype_analysis.log'),
    resources = list(),
    config = list("xcr_distance_method" = 'horn', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run4', "tcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/TRB_clonotypes_filtered.txt', "tils_for_variability" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "example_msa_plot" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/old/alignment_plots/msa/ith2_2/clust9/indel_reversed.html', "xcrmapscape_tcr_patient_order" = c(15, 1, 3, 4, 2, 17, 7, 14, 9, 10, 12, 13, 11, 16), "nanostring_signature_notebook" = 'Rmd/nanostring_signatures.Rmd', "known_subtypes_merged" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/known_subtypes_merged.tsv', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clone_data.tsv', "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "mutsig_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "mmctm_sample_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', "figure_gallery_notebook" = 'Rmd/figures.Rmd', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "ihc_run1" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd8cd3cd20/validated_stats_weighted_new.rdata', "subtype_marker_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/subtype_markers.tsv', "variability_type" = 'stabilize', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "nscatters" = 6, "spatial_result_dirs" = list("stromal" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith5/abc', "epithelial" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith3/abc'), "classifier_type" = 'knn', "master_variant_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/branch_data.tsv', "icgc_normalized_reads_matrix" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "ith_project_results" = '/shahlab/alzhang/projects/ith3/data/results', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "mmctm_ov_combined_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/plots/ov_snv-sv_sigs_multipanel.pdf', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "multiviewclustering_notebook" = 'Rmd/multiviewclustering.Rmd', "tcga_clinical" = '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', "mvclust_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "immune_variability_notebook" = 'Rmd/immune_variability.Rmd', "molsubtype_notebook" = 'Rmd/molecular_subtypes.Rmd', "j_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBJ.fasta', "ihc_run2" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd79cd138cd68/validated_stats_weighted.rdata', "example_annotations" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/final_partitions/ith2_2/clust9/annotations_flagged.tsv', "bcrphylo_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "clonal_samplers" = c('HMC', 'NUTS'), "ihc_xcr_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "xcrmapscape_files" = c('/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/1.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/2.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/3.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/4.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/7.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/9.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/10.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/11.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/12.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/13.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/14.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/15.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/16.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/17.svg'), "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "xcr_clones_notebook" = 'Rmd/xcr_clones_analysis.Rmd', "molsubtype_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "icgc_specimen_file" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "ihc_xcr_stats_notebook" = 'Rmd/ihc_xcr_stats.Rmd', "PNG_QUALITY" = 300, "mmctm_sample_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/plots/ith-by-sample_snv-sv_sigs_multipanel.pdf', "logdir" = '/shahlab/alzhang/clusttmp/paperanalysis2', "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "mmctm_patient_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', "nclusts" = 3, "neoantigen_editing_notebook" = 'Rmd/immunoediting.Rmd', "default_sampler" = 'HMC', "clonal_figure_template" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/templates/clonal_figure.svg', "prevalence_threshold" = 0.01, "mapscape_notebook" = 'Rmd/mapscape.Rmd', "master_breakpoint_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "icgc_clinical" = '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', "bcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/IGH_clonotypes_filtered.txt', "spatial_notebook" = 'Rmd/spatial_analysis.Rmd', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "driver_analysis_notebook" = 'Rmd/driver_analysis.Rmd', "driver_map" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/drivers/data/gene_list_mapped.bed', "til_classifier_notebook" = 'Rmd/til_classifier.Rmd', "mutation_signature_notebook" = 'Rmd/mutation_signatures.Rmd', "wang_fbi_status" = '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', "v_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBV.fasta', "ith_til_notebook" = 'Rmd/ith_til_densities.Rmd', "mmctm_patient_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/plots/ith-by-patient-ancestry_snv-sv_sigs_multipanel.pdf', "icgc_molecular_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "proportion_subclonal_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2', "icgc_expr_melted" = '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', "PNG_DENSITY" = 300, "library_sizes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/library_sizes.tsv', "mvclust_nclust" = 3, "xcr_qc_notebook" = 'Rmd/replicates.Rmd', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperanalysis2', "known_subtype_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "sad_notebook" = 'Rmd/species_abundance_distributions.Rmd', "mmctm_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', "mmctm_sample_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/plots/ith-by-ancestral-sample_snv-sv_sigs_multipanel.pdf', "bcrphylo_correlations_notebook" = 'Rmd/bcr_phylo_correlations.Rmd', "bcrphylo_examples_notebook" = 'Rmd/bcr_phylo_examples.Rmd', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/tree_data.tsv', "index_notebook" = 'Rmd/index.Rmd', "ith_stats_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', "immtyper_models" = '/shahlab/alzhang/projects/ITH_Immune/results/immtyper_results/klarenbeek/aa_vj/gradboost', "mmctm_final_patient_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov/plots/ith-by-patient_snv-sv_sigs_multipanel.pdf', "mmctm_ov_combined_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', "smooth_type" = 'loess', "immtyper_lengths" = '11 12 13 14 15 16 17 18', "ith_statistics_notebook" = 'Rmd/ith_statistics.Rmd', "intermediate_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2', "xcr_mapping_notebook" = 'Rmd/xcr_mapping.Rmd', "tcga_ov_annotations" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "tcga_expr_matrix" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv', "xcrmapscape_notebook" = 'Rmd/xcrmapscape.Rmd', "phenotype_threshold" = 0.85),
    rule = 'molecular_subtype_analysis'
)
######## Original script #########

                        ```


```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE)
```


```{r}
library(ithi.utils)
load_base_libs()
library(nestedRanksTest)

library(ithi.meta)
library(ithi.xcr)
library(ithi.ihc)
library(ithi.expr)
library(ithi.clones)
library(nanotools)
```

## Colour palettes

```{r}
pal_patient <- select_palette("patient")
```

## Parameters

```{r}
db_path <- snakemake@params$db

molecular_subtype_file <- snakemake@input$molecular_subtypes

ihc_table_path <- snakemake@input$ihc_table
tiltypes <- unlist(snakemake@params$tiltypes)

tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity

xcr_table_path <- snakemake@input$xcr_table

nanostring_data_path <- snakemake@input$nanostring_data
nanostring_annotations_path <- snakemake@input$nanostring_annotations

clone_tree_file <- snakemake@input$clone_tree_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file
```

## Metadata

```{r}
db <- src_sqlite(db_path, create=FALSE)
samples <- collect(tbl(db, "samples"))
```


## Molecular subtype classification

```{r}
molsubtypes <- fread(molecular_subtype_file)
molsubtypes <- setNames(molsubtypes, c("condensed_id", "subtype"))
```

### Counts

```{r}
table(molsubtypes$subtype)
```

## IHC

```{r}
ihc_table <- fread(ihc_table_path)
ihc_table$patient_id <- factor(ihc_table$patient_id)

ihcm <- melt(ihc_table, id.vars = c("condensed_id", "patient_id"), measure.vars = tiltypes,
             variable.name = "tiltype", value.name = "density")

ihcm_subtype <- merge(molsubtypes, ihcm, by=c("condensed_id"))
```

```{r, fig.width=8, fig.height=4}
stats <- setNames(ddply(ihcm_subtype, .(tiltype), function(x) {
  res <- kruskal.test(density ~ factor(subtype), data=x)
  pvalue <- res$p.value
  eq <- substitute(italic(P)==p, list(p=format(pvalue, digits=3)))
  return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))

ggplot(ihcm_subtype, aes(x=subtype, y=density)) + geom_boxplot() + geom_jitter(position = position_jitter(width=0.2, height=0), aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + facet_wrap(~ tiltype, scales="free", nrow=2) + geom_text(data=stats, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5, size=3, parse=TRUE)
```


## TCR/BCR diversity

```{r}
diversity_files <- list(tcr=tcr_diversity_file,
                        bcr=bcr_diversity_file)

diversity <- lapply(diversity_files, function(f) {
  dat <- read_xcr_diversity_file(f, db_path)
  merge(dat, molsubtypes, by=c("condensed_id"))
})

xcr_table <- read_clonotypes(xcr_table_path, duplicates = FALSE, db_path)
xcr_table$condensed_id <- as.character(xcr_table$condensed_id)
patients <- unique(xcr_table$patient_id)

tcr_segment_type <- "TRB"
bcr_segment_type <- "IGH"
id_type <- "condensed_id"

# not actually beta diversity but still fun
betas <- rbind.fill(lapply(patients, function(patient) {
  tcr_clonotypes <- subset(xcr_table, type == tcr_segment_type & patient_id == patient)
  bcr_clonotypes <- subset(xcr_table, type == bcr_segment_type & patient_id == patient)
  tcr_cross_table <- cross_tabulate(tcr_clonotypes, id_type = id_type)
  bcr_cross_table <- cross_tabulate(bcr_clonotypes, id_type = id_type)
  
  cross_tables <- list(tcr=tcr_cross_table, bcr=bcr_cross_table)
  
  betavals <- rbind.fill(lapply(seq_along(cross_tables), function(i) {
    cross_table <- cross_tables[[i]]
    df <- cbind(type=names(cross_tables)[i], compute_beta_diversity(cross_table, mode = "union", measure = "count"))
    return(df)
  }))
  return(cbind(patient_id=patient, betavals))
}))
betas_list <- split(betas, f = betas$type)

compute_diversity_index <- function(xcr_table, type = "rareness", rare_threshold = 1e-4) {
  if (type == "rareness") {
    PREVALENCE_TYPE <- "freq"
    
    res <- xcr_table %>% group_by_(.dots=c("condensed_id", "patient_id", "type")) %>%
      summarise_(rareness = lazyeval::interp(~sum(var[var < rare_threshold]), var = as.name(PREVALENCE_TYPE)))
    res <- split(res, f=res$type)
  } else if (type == "berger_parker") {
    res <- xcr_table %>% group_by_(.dots=c("condensed_id", "patient_id", "type")) %>%
      summarise(berger_parker = 1/max(freq))
    res <- split(res, f=res$type)
  }
  return(res)
}

cdf_clonotype_freqs <- function(xcr_table, segment = "TRB") {
  ggplot(xcr_table, aes(x=freq)) + stat_ecdf(geom="step") + scale_x_log10() + facet_wrap(~ condensed_id, ncol = 5)
}

plot_diversity_subtype <- function(diversity, col) {
  ignore <- lapply(diversity, function(df) {
    p <- ggplot(df, aes_string(x="subtype", y=col)) + geom_boxplot() + geom_jitter(position=position_jitter(width=0.2, height=0),aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + xlab("Subtype") + ylab("Diversity")
    print(p)
  })
}
```

### Richness

```{r}
plot_diversity_subtype(diversity, "observedDiversity_mean")
```

### Shannon entropy

```{r}
plot_diversity_subtype(diversity, "shannonWienerIndex_mean")
```

### Divergence

```{r}
plot_diversity_subtype(diversity, "inverseSimpsonIndex_mean")
```

### D50 index

```{r}
plot_diversity_subtype(diversity, "d50Index_mean")
```

### Rareness

```{r}
rareness <- compute_diversity_index(xcr_table, type = "rareness", rare_threshold = 1e-4)
rareness <- lapply(rareness, function(x) {
  x$patient_id <- as.character(x$patient_id)
  merge(x, molsubtypes, by=c("condensed_id"))
})
plot_diversity_subtype(rareness, "rareness")
```

### Berger-Parker

```{r}
dominance <- compute_diversity_index(xcr_table, type = "berger_parker")
dominance <- lapply(dominance, function(x) {
  x$patient_id <- as.character(x$patient_id)
  merge(x, molsubtypes, by=c("condensed_id"))
})
plot_diversity_subtype(dominance, "berger_parker")
```

### Sample-specific clonotype proportion

```{r}
betas_list <- lapply(betas_list, function(x) {
  x$patient_id <- as.character(x$patient_id)
  merge(x, molsubtypes, by=c("condensed_id"))
})
plot_diversity_subtype(betas_list, "prop_unique")
```

### CDFs

```{r, fig.width=20, fig.height=100}
cdf_clonotype_freqs(xcr_table, "TRB")
```

```{r, fig.width=20, fig.height=100}
cdf_clonotype_freqs(xcr_table, "IGH")
```

## Nanostring

```{r}
exprs <- fread(nanostring_data_path)
labels <- fread(nanostring_annotations_path)

celltype_matrix <- rownames_to_column(as.data.frame(t(create_celltype_matrix(exprs, labels, db_path))), var = "condensed_id")
celltype_vars <- colnames(celltype_matrix)[!colnames(celltype_matrix) == "condensed_id"]

celltype_matrix_melted <- melt(celltype_matrix, id.vars = c("condensed_id"), measure.vars = celltype_vars,
    variable.name = "type", value.name = "expr")

celltype_subtype <- merge(celltype_matrix_melted, molsubtypes, by=c("condensed_id"))
celltype_subtype$patient_id <- factor(map_id(celltype_subtype$condensed_id, from="condensed_id",
                                      to="patient_id", db_path))
```

```{r, fig.width=8, fig.height=8}
stats <- setNames(ddply(celltype_subtype, .(type), function(x) {
  res <- kruskal.test(expr ~ factor(subtype), data=x)
  pvalue <- res$p.value
  eq <- substitute(italic(P)==p, list(p=format(pvalue, digits=3)))
  return(as.character(as.expression(eq)))
}), c("type", "p.value"))

ggplot(celltype_subtype, aes(x=subtype, y=expr)) + geom_boxplot() + geom_jitter(position = position_jitter(width=0.2, height=0), aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + facet_wrap(~ type, scales="free", nrow=4) + geom_text(data=stats, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5, size=3, parse=TRUE)
```


```{r}
pathway_matrix <- rownames_to_column(as.data.frame(t(create_pathway_matrix(exprs, labels, db_path))), var = "condensed_id")
pathway_vars <- colnames(pathway_matrix)[!colnames(pathway_matrix) == "condensed_id"]

pathway_matrix_melted <- melt(pathway_matrix, id.vars = c("condensed_id"), measure.vars = pathway_vars, variable.name = "type", value.name = "expr")

pathway_subtype <- merge(pathway_matrix_melted, molsubtypes, by=c("condensed_id"))
pathway_subtype$patient_id <- factor(map_id(pathway_subtype$condensed_id, from="condensed_id",
                                      to="patient_id", db_path))
```

```{r, fig.width=8, fig.height=8}
stats <- setNames(ddply(pathway_subtype, .(type), function(x) {
  res <- kruskal.test(expr ~ factor(subtype), data=x)
  pvalue <- res$p.value
  eq <- substitute(italic(P)==p, list(p=format(pvalue, digits=3)))
  return(as.character(as.expression(eq)))
}), c("type", "p.value"))

ggplot(pathway_subtype, aes(x=subtype, y=expr)) + geom_boxplot() + geom_jitter(position = position_jitter(width=0.2, height=0), aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + facet_wrap(~ type, scales="free", nrow=4) + geom_text(data=stats, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5, size=3, parse=TRUE)
```

## Relation to clonal composition

Here we'll relate intrapatient differences in molecular subtype to differences in clonal composition. 
Questions:
* Are tumours with different molecular subtypes more clonally different?

```{r}
tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, clone_prevalence_file, db_path)
trees <- lapply(tree_branch_data, function(x) x$tree)
prevalences <- rbind.fill(lapply(tree_branch_data, function(x) x$prevalence_dat))
branch_lengths <- rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))

ccfs <- compute_ccf(prevalences, trees, id_type = "condensed_id")
ccfs_labeled <- merge(ccfs, branch_lengths, by=c("label", "node", "patient_id"))

## Need non-normalized distances
clone_dists <- clone_distances(ccfs_labeled, normalize = TRUE, id_type = "condensed_id")

clone_dists$patient_id <- factor(clone_dists$patient_id)

clone_dists_melted <- rbind.fill(lapply(1:nrow(clone_dists), function(i) {
  patient <- clone_dists$patient_id[i]
  distmat <- clone_dists$dist_clones_weighted[[i]]
  
  distdf <- setNames(melt(as.matrix(distmat)), c("id1", "id2", "clone_dist"))
  distdf$id1 <- factor(distdf$id1)
  distdf$id2 <- factor(distdf$id2)
  distdf <- subset(distdf, as.numeric(id1) > as.numeric(id2))
  return(distdf)
}))
```

```{r}
df <- merge(clone_dists_melted, setNames(molsubtypes, c("id1", "subtype1")))
df <- merge(df, setNames(molsubtypes, c("id2", "subtype2")))
df$equal_subtypes <- df$subtype1 == df$subtype2
df$patient_id <- map_id(df$id1, from = "condensed_id", to = "patient_id", db_path)
```

```{r}
ggplot(df, aes(x=equal_subtypes, y=clone_dist)) + geom_boxplot(outlier.size = -1) + geom_jitter(aes(colour=patient_id), position=position_jitter(width=0.2, height=0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient)
```

Making this comparison across patients is unfair, as clone distance values (even when normalized) may be very different in scale from patient to patient. It's obvious there's no pattern ACROSS patients, but how about WITHIN patients? 

If we assume a Gaussian response variable (which is definitely not true):

```{r}
summary(lmer(clone_dist ~ equal_subtypes + (1|patient_id), df))
```

There's a trend toward lower clone distance between pairs of tumours with equal subtype, but this is not significant. 

What if we use a nonparametric test? 

```{r}
tmp <- df %>% group_by(patient_id) %>% summarise(both_levels = !Reduce('&', equal_subtypes) & Reduce('|', equal_subtypes))

df_filtered <- subset(df, patient_id %in% subset(tmp, both_levels)$patient_id)

res <- nestedRanksTest(clone_dist ~ !equal_subtypes | patient_id, data = df_filtered, n.iter = 5000)
res

plot(res)
```

We do see a trend! We can only do this for patients with samples in both clusters. So, samples with different molecular subtypes are also more likely to have different clonal compositions. But this does not appear to be significant when I run it with the new subtyping (combined using ICGC for subtyping ...). 


