library(ithi.utils)
load_base_libs()

library(ithi.meta)
library(ithi.xcr)

Colour palettes

pal_patient <- select_palette("patient")

Parameters

db_path <- snakemake@params$db

xcr_table_path <- snakemake@input$xcr_table
prevalence_option <- snakemake@params$prevalence_option
distance_method <- snakemake@params$xcr_distance_method
tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity

Metadata

db <- src_sqlite(db_path, create = FALSE)
samples <- collect(tbl(db, "samples"))
duplicates <- collect(tbl(db, "duplicates"))
xcrseq <- collect(tbl(db, "xcrseq"))

TCR/BCR

xcr_table <- read_clonotypes(xcr_table_path, duplicates = TRUE, db_path)

Read 19.7% of 304822 rows
Read 52.5% of 304822 rows
Read 88.6% of 304822 rows
Read 304822 rows and 18 (of 18) columns from 0.070 GB file in 00:00:05
# xcr_table <- subset(xcr_table, count >= 200)

technical_duplicates <- unique(subset(xcrseq, voa_unique != voa)$voa)

duplicate_sites <- subset(duplicates, duplicate == 1)
duplicate_sites <- merge(samples, duplicate_sites, by = "voa")
duplicate_categories <- subset(duplicate_sites, select = c(patient_id, site_id))

duplicates_mapped <- merge(duplicate_categories, samples, by = c("patient_id", 
    "site_id"))
duplicates_mapped <- merge(duplicates_mapped, xcrseq, by = c("voa"))
mapped_duplicate_categories <- duplicates_mapped %>% group_by(patient_id, site_id) %>% 
    summarise(num_voa = length(unique(voa)))
mapped_categories <- subset(mapped_duplicate_categories, num_voa > 1, select = c("patient_id", 
    "site_id"))

biological_duplicates <- unique(merge(duplicates_mapped, mapped_categories, 
    by = c("patient_id", "site_id"))$voa)
tcr_segment_type <- "TRB"
bcr_segment_type <- "IGH"

id_type <- "voa_unique"

Technical duplicates

xcr_table_tdups <- subset(xcr_table, voa %in% technical_duplicates)
tcr_tdups <- subset(xcr_table_tdups, type == tcr_segment_type)
tcr_cross_table <- cross_tabulate(tcr_tdups, id_type = id_type)

bcr_tdups <- subset(xcr_table_tdups, type == bcr_segment_type)
bcr_cross_table <- cross_tabulate(bcr_tdups, id_type = id_type)

cross_tdups <- list(tcr = tcr_cross_table, bcr = bcr_cross_table)
distance_matrices <- lapply(cross_tdups, function(cross_table) {
    distmat <- compute_immune_distance_matrix(cross_table, method = distance_method)
    mat <- as.matrix(distmat)
    return(mat)
})
sims_raw <- lapply(distance_matrices, function(mat) {
    sims <- average_intrapatient_similarity(mat, filter_tissue = FALSE, id_type = id_type, 
        db_path = db_path, raw = TRUE, group = "voa", table = "xcrseq")
    sims <- setNames(subset(sims, select = -c(tissue1, tissue2, patient2)), 
        c("sample1", "sample2", "dist", "group"))
    sims$patient_id <- map_id(sims$group, from = "voa", to = "patient_id", db_path = db_path, 
        table = "samples")
    sims$condensed_id <- map_id(sims$group, from = "voa", to = "condensed_id", 
        db_path = db_path, table = "samples")
    project1 <- map_id(sims$sample1, from = "voa_unique", to = "project_id", 
        db_path = db_path, table = "xcrseq")
    project2 <- map_id(sims$sample2, from = "voa_unique", to = "project_id", 
        db_path = db_path, table = "xcrseq")
    sims$comparison_type <- c("different", "identical")[as.numeric(project1 == 
        project2) + 1]
    sims_summary <- sims %>% group_by(condensed_id, patient_id, comparison_type) %>% 
        summarise(mean_sim = 1 - mean(dist))
    return(sims_summary)
})
ignore <- lapply(list("tcr", "bcr"), function(segment) {
    dat <- sims_raw[[segment]]
    dat$patient_id <- factor(dat$patient_id)
    dat$comparison_type <- factor(dat$comparison_type)
    
    p <- ggplot(dat, aes(x = patient_id, y = mean_sim)) + geom_boxplot(aes(fill = comparison_type)) + 
        geom_jitter(position = position_jitter(width = 0.2, height = 0)) + scale_fill_manual(values = get_colour_palette(dat$comparison_type)) + 
        theme_bw() + theme_Publication() + xlab("Patient") + ylab("Pairwise repertoire similarity")
    print(p)
})

We might have good reason to suspect that duplicates conducted at different times (i.e. the original TCR/BCR-seq experiment years ago vs. the one now) might not give the same results. The RIN values for the old samples that we re-ran were quite low (3-5, compared to the usual >7). Indeed, the BCR data shows that duplicates from the same run are more similar to each other than those from different runs.

However, one case is alarming – patient 21 in the TCR data. I made an ID mapping error and missed most samples from patient 21 in the first round of processing. However, the refactoring and database construction stimulated by the newest run allowed me to recover these samples. Something fishy definitely seems to be going on here. Subsetting to clonotypes with >= 50, 200 reads did not help, indicating that the issue isn’t likely to be caused by low-level contamination. It may still be caused by high-level contamination – this is worth investigating as patient 21 has a very low total clonotype count.

Next steps: Check RIN values from the original run for patient 21’s samples. Answer: They are on the low side for IX3248 (median = 5.2), but there are several patients around the 5.4-5.7 range. So I’m tempted to say that isn’t the cause of the issue.

TODO: Also evaluate variation in diversity values.

Biological duplicates

Another class of duplicates we tested were parts of samples coming from the same tumour sample. That’s a bit confusing – what it means is that each tumour sample was split into 2 fresh frozen and 1 FFPE aliquot, and for some cases we subjected both fresh frozen aliquots to TCR/BCR-seq. I loosely refer to these cases as ‘biological duplicates’.

id_type <- "voa"

xcr_table_bdups <- subset(xcr_table, voa_unique %in% biological_duplicates)
tcr_bdups <- subset(xcr_table_bdups, type == tcr_segment_type)
tcr_cross_table <- cross_tabulate(tcr_bdups, id_type = id_type)

bcr_bdups <- subset(xcr_table_bdups, type == bcr_segment_type)
bcr_cross_table <- cross_tabulate(bcr_bdups, id_type = id_type)

cross_bdups <- list(tcr = tcr_cross_table, bcr = bcr_cross_table)
distance_matrices <- lapply(cross_bdups, function(cross_table) {
    distmat <- compute_immune_distance_matrix(cross_table, method = distance_method)
    mat <- as.matrix(distmat)
    return(mat)
})
sims_raw <- lapply(distance_matrices, function(mat) {
    sims <- average_intrapatient_similarity(mat, filter_tissue = FALSE, id_type = id_type, 
        db_path = db_path, raw = TRUE, group = "condensed_id", table = "samples")
    sims <- setNames(subset(sims, select = -c(tissue1, tissue2, patient2)), 
        c("sample1", "sample2", "dist", "condensed_id"))
    sims$patient_id <- map_id(sims$condensed_id, from = "condensed_id", to = "patient_id", 
        db_path = db_path, table = "samples")
    project1 <- map_id(sims$sample1, from = "voa_unique", to = "project_id", 
        db_path = db_path, table = "xcrseq")
    project2 <- map_id(sims$sample2, from = "voa_unique", to = "project_id", 
        db_path = db_path, table = "xcrseq")
    sims$comparison_type <- c("different", "identical")[as.numeric(project1 == 
        project2) + 1]
    sims_summary <- sims %>% group_by(condensed_id, patient_id, comparison_type) %>% 
        summarise(mean_sim = 1 - mean(dist))
    return(sims_summary)
})
ignore <- lapply(list("tcr", "bcr"), function(segment) {
    dat <- sims_raw[[segment]]
    dat$patient_id <- factor(dat$patient_id)
    dat$comparison_type <- factor(dat$comparison_type)
    
    p <- ggplot(dat, aes(x = patient_id, y = mean_sim)) + geom_boxplot(aes(fill = comparison_type)) + 
        geom_jitter(position = position_jitter(width = 0.2, height = 0)) + scale_fill_manual(values = get_colour_palette(dat$comparison_type)) + 
        theme_bw() + theme_Publication() + xlab("Patient") + ylab("Pairwise repertoire similarity")
    print(p)
})

A little surprising is how low the repertoire similarity is between biological duplicates – from aliquots that are taken from the same tumour sample! Consistent with our findings that high T-cell abundance = high levels of intrapatient similarity, patient 15 stands out as a beacon of consistency.

---
title: "XCR replicate QC"
output: 
  html_notebook:
    toc: true
    toc_depth: 5
    toc_float: true
params:
  rmd: "replicates.Rmd"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('Rmd/replicates.Rmd', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "notebook" = 'Rmd/replicates.Rmd', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "xcr_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/web/xcr_qc.nb.html'),
    params = list('ithi-analysis-xcr-replicate-qc', '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', 'clones', 'horn', "name" = 'ithi-analysis-xcr-replicate-qc', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "prevalence_option" = 'clones', "xcr_distance_method" = 'horn'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperanalysis2/xcr_qc.log'),
    resources = list(),
    config = list("molsubtype_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "xcr_distance_method" = 'horn', "tcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/TRB_clonotypes_filtered.txt', "spatial_result_dirs" = list("stromal" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith5/abc', "epithelial" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith3/abc'), "ihc_run2" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd79cd138cd68/validated_stats_weighted.rdata', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run4', "mmctm_sample_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', "bcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/IGH_clonotypes_filtered.txt', "phenotype_threshold" = 0.85, "xcrmapscape_files" = c('/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/1.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/2.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/3.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/4.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/7.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/9.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/10.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/11.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/12.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/13.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/14.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/15.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/16.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/17.svg'), "known_subtypes_merged" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/known_subtypes_merged.tsv', "example_msa_plot" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/old/alignment_plots/msa/ith2_2/clust9/indel_reversed.html', "immtyper_lengths" = '11 12 13 14 15 16 17 18', "ith_project_results" = '/shahlab/alzhang/projects/ith3/data/results', "xcrmapscape_notebook" = 'Rmd/xcrmapscape.Rmd', "ihc_xcr_stats_notebook" = 'Rmd/ihc_xcr_stats.Rmd', "clonal_samplers" = c('HMC', 'NUTS'), "figure_gallery_notebook" = 'Rmd/figures.Rmd', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperanalysis2', "known_subtype_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "mutsig_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "subtype_marker_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/subtype_markers.tsv', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "ith_statistics_notebook" = 'Rmd/ith_statistics.Rmd', "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2', "v_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBV.fasta', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "wang_fbi_status" = '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', "j_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBJ.fasta', "mmctm_ov_combined_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', "mmctm_patient_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', "logdir" = '/shahlab/alzhang/clusttmp/paperanalysis2', "default_sampler" = 'HMC', "intermediate_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "clonal_figure_template" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/templates/clonal_figure.svg', "driver_map" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/drivers/data/gene_list_mapped.bed', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "immune_variability_notebook" = 'Rmd/immune_variability.Rmd', "mvclust_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clone_data.tsv', "master_variant_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "xcr_qc_notebook" = 'Rmd/replicates.Rmd', "spatial_notebook" = 'Rmd/spatial_analysis.Rmd', "icgc_normalized_reads_matrix" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "icgc_clinical" = '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', "mapscape_notebook" = 'Rmd/mapscape.Rmd', "mmctm_sample_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/plots/ith-by-sample_snv-sv_sigs_multipanel.pdf', "ith_til_notebook" = 'Rmd/ith_til_densities.Rmd', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/tree_data.tsv', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "mvclust_nclust" = 3, "ihc_xcr_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "tils_for_variability" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "bcrphylo_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "index_notebook" = 'Rmd/index.Rmd', "variability_type" = 'stabilize', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/branch_data.tsv', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "xcr_mapping_notebook" = 'Rmd/xcr_mapping.Rmd', "bcrphylo_correlations_notebook" = 'Rmd/bcr_phylo_correlations.Rmd', "xcrmapscape_tcr_patient_order" = c(15, 1, 3, 4, 2, 17, 7, 14, 9, 10, 12, 13, 11, 16), "til_classifier_notebook" = 'Rmd/til_classifier.Rmd', "bcrphylo_examples_notebook" = 'Rmd/bcr_phylo_examples.Rmd', "PNG_DENSITY" = 300, "master_breakpoint_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "mmctm_final_patient_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov/plots/ith-by-patient_snv-sv_sigs_multipanel.pdf', "mmctm_patient_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/plots/ith-by-patient-ancestry_snv-sv_sigs_multipanel.pdf', "ith_stats_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', "ihc_run1" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd8cd3cd20/validated_stats_weighted_new.rdata', "nscatters" = 6, "multiviewclustering_notebook" = 'Rmd/multiviewclustering.Rmd', "smooth_type" = 'loess', "classifier_type" = 'knn', "icgc_specimen_file" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "mmctm_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web', "immtyper_models" = '/shahlab/alzhang/projects/ITH_Immune/results/immtyper_results/klarenbeek/aa_vj/gradboost', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "mutation_signature_notebook" = 'Rmd/mutation_signatures.Rmd', "tcga_clinical" = '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "proportion_subclonal_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "sad_notebook" = 'Rmd/species_abundance_distributions.Rmd', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "nclusts" = 3, "driver_analysis_notebook" = 'Rmd/driver_analysis.Rmd', "xcr_clones_notebook" = 'Rmd/xcr_clones_analysis.Rmd', "tcga_ov_annotations" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "tcga_expr_matrix" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv', "example_annotations" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/final_partitions/ith2_2/clust9/annotations_flagged.tsv', "neoantigen_editing_notebook" = 'Rmd/immunoediting.Rmd', "icgc_expr_melted" = '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', "nanostring_signature_notebook" = 'Rmd/nanostring_signatures.Rmd', "PNG_QUALITY" = 300, "mmctm_sample_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/plots/ith-by-ancestral-sample_snv-sv_sigs_multipanel.pdf', "prevalence_threshold" = 0.01, "icgc_molecular_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "molsubtype_notebook" = 'Rmd/molecular_subtypes.Rmd', "library_sizes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/library_sizes.tsv', "mmctm_ov_combined_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/plots/ov_snv-sv_sigs_multipanel.pdf'),
    rule = 'xcr_replicate_qc'
)
######## Original script #########

                        ```


```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE)
```


```{r}
library(ithi.utils)
load_base_libs()

library(ithi.meta)
library(ithi.xcr)
```

## Colour palettes

```{r}
pal_patient <- select_palette("patient")
```

## Parameters

```{r}
db_path <- snakemake@params$db

xcr_table_path <- snakemake@input$xcr_table
prevalence_option <- snakemake@params$prevalence_option
distance_method <- snakemake@params$xcr_distance_method
tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity
```

## Metadata

```{r}
db <- src_sqlite(db_path, create=FALSE)
samples <- collect(tbl(db, "samples"))
duplicates <- collect(tbl(db, "duplicates"))
xcrseq <- collect(tbl(db, "xcrseq"))
```

## TCR/BCR

```{r}
xcr_table <- read_clonotypes(xcr_table_path, duplicates=TRUE, db_path)
#xcr_table <- subset(xcr_table, count >= 200)

technical_duplicates <- unique(subset(xcrseq, voa_unique != voa)$voa)

duplicate_sites <- subset(duplicates, duplicate == 1)
duplicate_sites <- merge(samples, duplicate_sites, by="voa")
duplicate_categories <- subset(duplicate_sites, select=c(patient_id, site_id))

duplicates_mapped <- merge(duplicate_categories, samples, by=c("patient_id", "site_id"))
duplicates_mapped <- merge(duplicates_mapped, xcrseq, by=c("voa"))
mapped_duplicate_categories <- duplicates_mapped %>% group_by(patient_id, site_id) %>%
  summarise(num_voa=length(unique(voa)))
mapped_categories <- subset(mapped_duplicate_categories, num_voa > 1, select=c("patient_id", "site_id"))

biological_duplicates <- unique(merge(duplicates_mapped, mapped_categories, by=c("patient_id", "site_id"))$voa)
```

````{r}
tcr_segment_type <- "TRB"
bcr_segment_type <- "IGH"

id_type <- "voa_unique"
```

### Technical duplicates

```{r}
xcr_table_tdups <- subset(xcr_table, voa %in% technical_duplicates)
```

```{r}
tcr_tdups <- subset(xcr_table_tdups, type == tcr_segment_type)
tcr_cross_table <- cross_tabulate(tcr_tdups, id_type = id_type)

bcr_tdups <- subset(xcr_table_tdups, type == bcr_segment_type)
bcr_cross_table <- cross_tabulate(bcr_tdups, id_type = id_type)

cross_tdups <- list(tcr=tcr_cross_table, bcr=bcr_cross_table)
```

```{r}
distance_matrices <- lapply(cross_tdups, function(cross_table) {
  distmat <- compute_immune_distance_matrix(cross_table, method = distance_method)
  mat <- as.matrix(distmat)
  return(mat)
})
```


```{r}
sims_raw <- lapply(distance_matrices, function(mat) {
  sims <- average_intrapatient_similarity(mat, filter_tissue = FALSE, id_type = id_type,
                                  db_path = db_path, raw = TRUE, group = "voa", table = "xcrseq")
  sims <- setNames(subset(sims, select=-c(tissue1, tissue2, patient2)), c("sample1", "sample2", 
                                                                "dist", "group"))
  sims$patient_id <- map_id(sims$group, from = "voa", to="patient_id", db_path = db_path, table = "samples")
  sims$condensed_id <- map_id(sims$group, from="voa", to="condensed_id", db_path=db_path, table="samples")
  project1 <- map_id(sims$sample1, from="voa_unique", to="project_id", db_path=db_path,
                          table="xcrseq")
  project2 <- map_id(sims$sample2, from="voa_unique", to="project_id", db_path=db_path,
                          table="xcrseq")
  sims$comparison_type <- c("different", "identical")[as.numeric(project1 == project2)+1]
  sims_summary <- sims %>% group_by(condensed_id, patient_id, comparison_type) %>% summarise(mean_sim=1-mean(dist))
  return(sims_summary)
})
```

```{r}
ignore <- lapply(list("tcr", "bcr"), function(segment) {
  dat <- sims_raw[[segment]]
  dat$patient_id <- factor(dat$patient_id)
  dat$comparison_type <- factor(dat$comparison_type)
  
  p <- ggplot(dat, aes(x=patient_id, y=mean_sim)) + geom_boxplot(aes(fill=comparison_type)) + geom_jitter(position=position_jitter(width=0.2, height=0)) + scale_fill_manual(values = get_colour_palette(dat$comparison_type)) + theme_bw() + theme_Publication() + xlab("Patient") + ylab("Pairwise repertoire similarity")
  print(p)
})
```


We might have good reason to suspect that duplicates conducted at different times (i.e. the original TCR/BCR-seq experiment years ago vs. the one now) might not give the same results. The RIN values for the old samples that we re-ran were quite low (3-5, compared to the usual >7). Indeed, the BCR data shows that duplicates from the same run are more similar to each other than those from different runs. 

However, one case is alarming -- patient 21 in the TCR data. I made an ID mapping error and missed most samples from patient 21 in the first round of processing. However, the refactoring and database construction stimulated by the newest run allowed me to recover these samples. Something fishy definitely seems to be going on here. Subsetting to clonotypes with >= 50, 200 reads did not help, indicating that the issue isn't likely to be caused by low-level contamination. It may still be caused by high-level contamination -- this is *worth investigating* as patient 21 has a very low total clonotype count. 

Next steps: Check RIN values from the original run for patient 21's samples. 
Answer: They are on the low side for IX3248 (median = 5.2), but there are several patients around the 5.4-5.7 range. So I'm tempted to say that isn't the cause of the issue. 


TODO: Also evaluate variation in diversity values. 

## Biological duplicates

Another class of duplicates we tested were parts of samples coming from the same tumour sample. That's a bit confusing -- what it means is that each tumour sample was split into 2 fresh frozen and 1 FFPE aliquot, and for some cases we subjected both fresh frozen aliquots to TCR/BCR-seq. I loosely refer to these cases as 'biological duplicates'. 

```{r}
id_type <- "voa"

xcr_table_bdups <- subset(xcr_table, voa_unique %in% biological_duplicates)
```

```{r}
tcr_bdups <- subset(xcr_table_bdups, type == tcr_segment_type)
tcr_cross_table <- cross_tabulate(tcr_bdups, id_type = id_type)

bcr_bdups <- subset(xcr_table_bdups, type == bcr_segment_type)
bcr_cross_table <- cross_tabulate(bcr_bdups, id_type = id_type)

cross_bdups <- list(tcr=tcr_cross_table, bcr=bcr_cross_table)
```

```{r}
distance_matrices <- lapply(cross_bdups, function(cross_table) {
  distmat <- compute_immune_distance_matrix(cross_table, method = distance_method)
  mat <- as.matrix(distmat)
  return(mat)
})
```


```{r}
sims_raw <- lapply(distance_matrices, function(mat) {
  sims <- average_intrapatient_similarity(mat, filter_tissue = FALSE, id_type = id_type,
                                  db_path = db_path, raw = TRUE, group = "condensed_id", table = "samples")
  sims <- setNames(subset(sims, select=-c(tissue1, tissue2, patient2)), c("sample1", "sample2", 
                                                                "dist", "condensed_id"))
  sims$patient_id <- map_id(sims$condensed_id, from = "condensed_id", to="patient_id", db_path = db_path, table = "samples")
  project1 <- map_id(sims$sample1, from="voa_unique", to="project_id", db_path=db_path,
                          table="xcrseq")
  project2 <- map_id(sims$sample2, from="voa_unique", to="project_id", db_path=db_path,
                          table="xcrseq")
  sims$comparison_type <- c("different", "identical")[as.numeric(project1 == project2)+1]
  sims_summary <- sims %>% group_by(condensed_id, patient_id, comparison_type) %>% summarise(mean_sim=1-mean(dist))
  return(sims_summary)
})
```

```{r}
ignore <- lapply(list("tcr", "bcr"), function(segment) {
  dat <- sims_raw[[segment]]
  dat$patient_id <- factor(dat$patient_id)
  dat$comparison_type <- factor(dat$comparison_type)
  
  p <- ggplot(dat, aes(x=patient_id, y=mean_sim)) + geom_boxplot(aes(fill=comparison_type)) + geom_jitter(position=position_jitter(width=0.2, height=0)) + scale_fill_manual(values = get_colour_palette(dat$comparison_type)) + theme_bw() + theme_Publication() + xlab("Patient") + ylab("Pairwise repertoire similarity")
  print(p)
})
```

A little surprising is how low the repertoire similarity is between biological duplicates -- from aliquots that are taken from the same tumour sample! Consistent with our findings that high T-cell abundance = high levels of intrapatient similarity, patient 15 stands out as a beacon of consistency. 