library(ithi.utils)
load_base_libs()

library(ithi.meta)
library(ithi.clones)
library(ithi.seq)

Colour palettes

pal_patient <- select_palette("patient")

Parameters

db_path <- snakemake@params$db

ith_stats_file <- snakemake@input$ith_stats
# proportion_subclonality_file <- snakemake@input$subclonality
tumour_purity_file <- snakemake@input$tumour_purity

old_result_file <- "/shahlab/alzhang/projects/ITH_Immune/data/ith/old_ith_comparison.tsv"

Metadata

db <- src_sqlite(db_path, create = FALSE)
samples <- collect(tbl(db, "samples"))

Description

New clonal predictions are still under a lot of work so we have to validate their quality.

Old clonal predictions

Here’s what things looked like under the old set of predictions:

old_comparison <- fread(old_result_file)
old_comparison$patient_id <- factor(old_comparison$patient_id)

ith_vars <- c("divergence", "entropy", "composite_ith")
other_vars <- colnames(old_comparison)[!colnames(old_comparison) %in% ith_vars]
old_comparison_melted <- melt(old_comparison, id.vars = other_vars, measure.vars = ith_vars, 
    variable.name = "ithtype", value.name = "ith")
pvals <- setNames(ddply(old_comparison_melted, .(ithtype), function(x) {
    df <- x
    corres <- with(df, cor.test(proportion_subclonal, ith, method = "spearman"))
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("ithtype", "p.value"))

ggplot(old_comparison_melted, aes(x = proportion_subclonal, y = ith)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + facet_wrap(~ithtype, scales = "free") + 
    scale_color_manual(values = pal_patient) + geom_text(data = pvals, aes(x = Inf, 
    y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

This is what we expect to see: a reasonably good correlation between naive measures of ITH and more “rigorous” measures.

New clonal predictions

ith_stats <- read_ith_stats(ith_stats_file, db_path, duplicates = FALSE)
# subclonality <- fread(proportion_subclonality_file)

# new_comparison <- merge(ith_stats, subclonality, by=c('condensed_id',
# 'patient_id'))
new_comparison <- ith_stats

ith_vars <- c("divergence", "entropy", "postprocessed_divergence", "combined_ith_raw", 
    "combined_ith_normalized")
other_vars <- colnames(new_comparison)[!colnames(new_comparison) %in% ith_vars]
new_comparison_melted <- melt(new_comparison, id.vars = other_vars, measure.vars = ith_vars, 
    variable.name = "ithtype", value.name = "ith")
pvals <- setNames(ddply(new_comparison_melted, .(ithtype), function(x) {
    df <- x
    corres <- with(df, cor.test(proportion_subclonal, ith, method = "spearman"))
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("ithtype", "p.value"))

ggplot(new_comparison_melted, aes(x = proportion_subclonal, y = ith)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + facet_wrap(~ithtype, scales = "free") + 
    scale_color_manual(values = pal_patient) + geom_text(data = pvals, aes(x = Inf, 
    y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

As of now, these are pretty terrible, suggesting that the clonal predictions probably need to be worked out. Or perhaps we can just modify our measures.

Tumour purity vs. ITH

A potential issue for inferring clonal diversity values is tumour purity. Low tumour purities reduce the effective read count contributed by non-malignant cells and hence may reduce the power of clonal inference.

tumour_purity <- read_tumour_purity(tumour_purity_file, db_path)

Old estimates

ith_vars <- c("divergence", "entropy", "composite_ith", "proportion_subclonal")
other_vars <- colnames(old_comparison)[!colnames(old_comparison) %in% ith_vars]
old_comparison_melted2 <- melt(old_comparison, id.vars = other_vars, measure.vars = ith_vars, 
    variable.name = "ithtype", value.name = "ith")

old_comparison_melted2 <- plyr::join(old_comparison_melted2, tumour_purity)
pvals <- setNames(ddply(old_comparison_melted2, .(ithtype), function(x) {
    df <- x
    corres <- with(df, cor.test(tumour_content, ith, method = "spearman"))
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("ithtype", "p.value"))

ggplot(old_comparison_melted2, aes(x = tumour_content, y = ith)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + facet_wrap(~ithtype, scales = "free") + 
    scale_color_manual(values = pal_patient) + geom_text(data = pvals, aes(x = Inf, 
    y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

Oddly enough, tumour purity has a slight negative correlation with ITH. So, more pure samples have less ITH. That’s the opposite of what we’d expect naively.

New estimates

ith_vars <- c("divergence", "entropy", "postprocessed_divergence", "proportion_subclonal", 
    "combined_ith_raw", "combined_ith_normalized")
other_vars <- colnames(new_comparison)[!colnames(new_comparison) %in% ith_vars]
new_comparison_melted2 <- melt(new_comparison, id.vars = other_vars, measure.vars = ith_vars, 
    variable.name = "ithtype", value.name = "ith")

new_comparison_melted2 <- plyr::join(new_comparison_melted2, tumour_purity)
pvals <- setNames(ddply(new_comparison_melted2, .(ithtype), function(x) {
    df <- x
    corres <- with(df, cor.test(tumour_content, ith, method = "spearman"))
    
    pval <- corres$p.value
    eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("ithtype", "p.value"))

ggplot(new_comparison_melted2, aes(x = tumour_content, y = ith)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + facet_wrap(~ithtype, scales = "free") + 
    scale_color_manual(values = pal_patient) + geom_text(data = pvals, aes(x = Inf, 
    y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)

---
title: "Clonal statistics"
output: 
  html_notebook:
    toc: true
    toc_depth: 5
    toc_float: true
params:
  rmd: "ith_statistics.Rmd"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('Rmd/ith_statistics.Rmd', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/tumour_purity.tsv', "notebook" = 'Rmd/ith_statistics.Rmd', "ith_stats_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "tumour_purity" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/tumour_purity.tsv'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/web/ith_statistics.nb.html'),
    params = list('ithi-analysis-ith-statistics', '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "name" = 'ithi-analysis-ith-statistics', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperanalysis2/ith_statistics.log'),
    resources = list(),
    config = list("tcga_ov_annotations" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "mmctm_ov_combined_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "mmctm_sample_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/plots/ith-by-ancestral-sample_snv-sv_sigs_multipanel.pdf', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/tree_data.tsv', "master_variant_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "mmctm_sample_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/plots/ith-by-sample_snv-sv_sigs_multipanel.pdf', "xcr_qc_notebook" = 'Rmd/replicates.Rmd', "ith_stats_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', "molsubtype_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "j_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBJ.fasta', "tcga_expr_matrix" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run4', "PNG_DENSITY" = 300, "mvclust_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "figure_gallery_notebook" = 'Rmd/figures.Rmd', "master_breakpoint_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "multiviewclustering_notebook" = 'Rmd/multiviewclustering.Rmd', "ith_statistics_notebook" = 'Rmd/ith_statistics.Rmd', "proportion_subclonal_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "v_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBV.fasta', "nclusts" = 3, "index_notebook" = 'Rmd/index.Rmd', "icgc_expr_melted" = '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', "ihc_run2" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd79cd138cd68/validated_stats_weighted.rdata', "driver_map" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/drivers/data/gene_list_mapped.bed', "ith_project_results" = '/shahlab/alzhang/projects/ith3/data/results', "bcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/IGH_clonotypes_filtered.txt', "library_sizes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/library_sizes.tsv', "mutsig_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "xcr_distance_method" = 'horn', "icgc_clinical" = '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', "ihc_xcr_stats_notebook" = 'Rmd/ihc_xcr_stats.Rmd', "til_classifier_notebook" = 'Rmd/til_classifier.Rmd', "tcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/TRB_clonotypes_filtered.txt', "immtyper_models" = '/shahlab/alzhang/projects/ITH_Immune/results/immtyper_results/klarenbeek/aa_vj/gradboost', "ihc_run1" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd8cd3cd20/validated_stats_weighted_new.rdata', "phenotype_threshold" = 0.85, "default_sampler" = 'HMC', "mmctm_ov_combined_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/plots/ov_snv-sv_sigs_multipanel.pdf', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "bcrphylo_correlations_notebook" = 'Rmd/bcr_phylo_correlations.Rmd', "clonal_samplers" = c('HMC', 'NUTS'), "wang_fbi_status" = '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', "mmctm_patient_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clone_data.tsv', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "xcr_clones_notebook" = 'Rmd/xcr_clones_analysis.Rmd', "subtype_marker_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/subtype_markers.tsv', "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "tils_for_variability" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "immune_variability_notebook" = 'Rmd/immune_variability.Rmd', "driver_analysis_notebook" = 'Rmd/driver_analysis.Rmd', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "ihc_xcr_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "example_annotations" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/final_partitions/ith2_2/clust9/annotations_flagged.tsv', "known_subtype_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "mapscape_notebook" = 'Rmd/mapscape.Rmd', "icgc_molecular_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "example_msa_plot" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/old/alignment_plots/msa/ith2_2/clust9/indel_reversed.html', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "variability_type" = 'stabilize', "xcrmapscape_notebook" = 'Rmd/xcrmapscape.Rmd', "mvclust_nclust" = 3, "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/branch_data.tsv', "sad_notebook" = 'Rmd/species_abundance_distributions.Rmd', "xcr_mapping_notebook" = 'Rmd/xcr_mapping.Rmd', "neoantigen_editing_notebook" = 'Rmd/immunoediting.Rmd', "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2', "icgc_specimen_file" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "mmctm_final_patient_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov/plots/ith-by-patient_snv-sv_sigs_multipanel.pdf', "tcga_clinical" = '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', "mmctm_sample_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', "immtyper_lengths" = '11 12 13 14 15 16 17 18', "bcrphylo_examples_notebook" = 'Rmd/bcr_phylo_examples.Rmd', "mmctm_patient_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/plots/ith-by-patient-ancestry_snv-sv_sigs_multipanel.pdf', "bcrphylo_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "classifier_type" = 'knn', "ith_til_notebook" = 'Rmd/ith_til_densities.Rmd', "PNG_QUALITY" = 300, "known_subtypes_merged" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/known_subtypes_merged.tsv', "intermediate_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2', "nanostring_signature_notebook" = 'Rmd/nanostring_signatures.Rmd', "mmctm_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperanalysis2', "molsubtype_notebook" = 'Rmd/molecular_subtypes.Rmd', "spatial_result_dirs" = list("epithelial" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith3/abc', "stromal" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith5/abc'), "spatial_notebook" = 'Rmd/spatial_analysis.Rmd', "icgc_normalized_reads_matrix" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "mutation_signature_notebook" = 'Rmd/mutation_signatures.Rmd', "prevalence_threshold" = 0.01, "logdir" = '/shahlab/alzhang/clusttmp/paperanalysis2'),
    rule = 'ith_statistics'
)
######## Original script #########

                        ```


```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE)
```


```{r}
library(ithi.utils)
load_base_libs()

library(ithi.meta)
library(ithi.clones)
library(ithi.seq)
```

## Colour palettes

```{r}
pal_patient <- select_palette("patient")
```

## Parameters

```{r}
db_path <- snakemake@params$db

ith_stats_file <- snakemake@input$ith_stats
#proportion_subclonality_file <- snakemake@input$subclonality
tumour_purity_file <- snakemake@input$tumour_purity

old_result_file <- "/shahlab/alzhang/projects/ITH_Immune/data/ith/old_ith_comparison.tsv"
```

## Metadata

```{r}
db <- src_sqlite(db_path, create=FALSE)
samples <- collect(tbl(db, "samples"))
```

## Description

New clonal predictions are still under a lot of work so we have to validate their quality. 

### Old clonal predictions

Here's what things looked like under the old set of predictions:

```{r}
old_comparison <- fread(old_result_file)
old_comparison$patient_id <- factor(old_comparison$patient_id)

ith_vars <- c("divergence", "entropy", "composite_ith")
other_vars <- colnames(old_comparison)[!colnames(old_comparison) %in% ith_vars]
old_comparison_melted <- melt(old_comparison, id.vars = other_vars, measure.vars = ith_vars,
                              variable.name = "ithtype", value.name = "ith")
```

```{r}
pvals <- setNames(ddply(old_comparison_melted, .(ithtype), function(x) {
  df <- x
  corres <- with(df, cor.test(proportion_subclonal, ith, method="spearman"))
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("ithtype", "p.value"))

ggplot(old_comparison_melted, aes(x=proportion_subclonal, y=ith)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + facet_wrap(~ ithtype, scales = "free") + scale_color_manual(values = pal_patient) + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) 
```

This is what we expect to see: a reasonably good correlation between naive measures of ITH and more "rigorous" measures. 


### New clonal predictions

```{r}
ith_stats <- read_ith_stats(ith_stats_file, db_path, duplicates=FALSE)
#subclonality <- fread(proportion_subclonality_file)

#new_comparison <- merge(ith_stats, subclonality, by=c("condensed_id", "patient_id"))
new_comparison <- ith_stats

ith_vars <- c("divergence", "entropy", "postprocessed_divergence", "combined_ith_raw", "combined_ith_normalized")
other_vars <- colnames(new_comparison)[!colnames(new_comparison) %in% ith_vars]
new_comparison_melted <- melt(new_comparison, id.vars = other_vars, measure.vars = ith_vars,
                              variable.name = "ithtype", value.name = "ith")
```



```{r}
pvals <- setNames(ddply(new_comparison_melted, .(ithtype), function(x) {
  df <- x
  corres <- with(df, cor.test(proportion_subclonal, ith, method="spearman"))
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("ithtype", "p.value"))

ggplot(new_comparison_melted, aes(x=proportion_subclonal, y=ith)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + facet_wrap(~ ithtype, scales = "free") + scale_color_manual(values = pal_patient) + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) 
```

As of now, these are pretty terrible, suggesting that the clonal predictions probably need to be worked out. Or perhaps we can just modify our measures. 

## Tumour purity vs. ITH

A potential issue for inferring clonal diversity values is tumour purity. Low tumour purities reduce the effective read count contributed by non-malignant cells and hence may reduce the power of clonal inference. 

```{r}
tumour_purity <- read_tumour_purity(tumour_purity_file, db_path)
```

### Old estimates

```{r}
ith_vars <- c("divergence", "entropy", "composite_ith", "proportion_subclonal")
other_vars <- colnames(old_comparison)[!colnames(old_comparison) %in% ith_vars]
old_comparison_melted2 <- melt(old_comparison, id.vars = other_vars, measure.vars = ith_vars,
                              variable.name = "ithtype", value.name = "ith")

old_comparison_melted2 <- plyr::join(old_comparison_melted2, tumour_purity)
```

```{r}
pvals <- setNames(ddply(old_comparison_melted2, .(ithtype), function(x) {
  df <- x
  corres <- with(df, cor.test(tumour_content, ith, method="spearman"))
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("ithtype", "p.value"))

ggplot(old_comparison_melted2, aes(x=tumour_content, y=ith)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + facet_wrap(~ ithtype, scales = "free") + scale_color_manual(values = pal_patient) + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) 
```


Oddly enough, tumour purity has a slight negative correlation with ITH. So, more pure samples have less ITH. That's the opposite of what we'd expect naively. 

### New estimates

```{r}
ith_vars <- c("divergence", "entropy", "postprocessed_divergence", "proportion_subclonal", "combined_ith_raw", "combined_ith_normalized")
other_vars <- colnames(new_comparison)[!colnames(new_comparison) %in% ith_vars]
new_comparison_melted2 <- melt(new_comparison, id.vars = other_vars, measure.vars = ith_vars,
                              variable.name = "ithtype", value.name = "ith")

new_comparison_melted2 <- plyr::join(new_comparison_melted2, tumour_purity)
```

```{r}
pvals <- setNames(ddply(new_comparison_melted2, .(ithtype), function(x) {
  df <- x
  corres <- with(df, cor.test(tumour_content, ith, method="spearman"))
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("ithtype", "p.value"))

ggplot(new_comparison_melted2, aes(x=tumour_content, y=ith)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + facet_wrap(~ ithtype, scales = "free") + scale_color_manual(values = pal_patient) + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) 
```