library(ithi.utils)
load_base_libs()
library(GenomicRanges)

library(nanotools)
library(ithi.meta)
library(ithi.ihc)
library(ithi.xcr)
library(ithi.expr)

Colour palettes

pal_patient <- select_palette("patient")

Parameters

db_path <- snakemake@params$db
ihc_table_path <- snakemake@input$ihc_table
ihc_table_slide_path <- snakemake@input$ihc_table_slide

xcr_table_path <- snakemake@input$xcr_table
prevalence_option <- snakemake@params$prevalence_option
distance_method <- snakemake@params$xcr_distance_method
filter_tissue <- snakemake@params$xcr_filter_tissue == "filter"
tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity

nanostring_data_path <- snakemake@input$nanostring_data
nanostring_annotations_path <- snakemake@input$nanostring_annotations

normalize_option <- snakemake@params$normalize %in% c("true", "TRUE", "True")
til_variance_option <- snakemake@params$til_variance_option

Metadata

db <- src_sqlite(db_path, create = FALSE)
samples <- collect(tbl(db, "samples"))

IHC analysis

ihc_table <- fread(ihc_table_path)
ihc_table_slide <- fread(ihc_table_slide_path)
ihc_samples <- ihc_table$condensed_id

if (any(table(ihc_samples) != 1)) {
    stop("Duplicate condensed_ids from the same analysis type (IHC).")
}

Total TIL densities

tiltypes <- c("T_CD8_density", "T_CD4_density", "T_CD20_density", "T_Plasma_density")

plot_ihc_barplot(ihc_table, tiltypes, labs = c("CD8+ T cell", "CD4+ T cell", 
    "CD20+ B cell", "Plasma cell"), Y_MAX = 1500, db_path = db_path)

TIL anatomic differences

tiltypes <- c("T_CD8_density", "T_CD4_density", "T_CD20_density", "T_Plasma_density")
ihc_data <- subset(ihc_table, select = c("condensed_id", "patient_id", tiltypes))
ihc_data$tissue <- extract_tissue(ihc_data$condensed_id, type = "general", shorten = TRUE)

ihc_data_melted <- melt(ihc_data, id.vars = c("condensed_id", "patient_id", 
    "tissue"), measure.vars = tiltypes, variable.name = "Type", value.name = "Density")

RENAME_MAP <- list(T_CD8_density = "CD8+", T_CD4_density = "CD4+", T_CD20_density = "CD20+", 
    T_Plasma_density = "Plasma")

ihc_data_melted$Type <- mapvalues(ihc_data_melted$Type, from = names(RENAME_MAP), 
    to = unname(unlist(RENAME_MAP)))

## Calculate p-values

# We have multiple levels and multiple random effect levels. Residuals are
# ~approx normal.  Let's use a GLM. I don't know of a nonparametric
# alternative for >=3 Tx levels.
pvals <- setNames(ddply(ihc_data_melted, .(Type), function(x) {
    df <- subset(as.data.frame(x), !is.na(Density))
    model <- lme(Density ~ tissue, random = ~1 | patient_id, data = df, method = "REML")
    result <- anova.lme(model, type = "sequential", adjustSigma = FALSE)
    
    tissue_pval <- result$`p-value`[2]
    eq <- substitute(italic(P) == p, list(p = format(tissue_pval, digits = 3)))
    return(as.character(as.expression(eq)))
}), c("Type", "p.value"))
p <- ggplot(ihc_data_melted, aes(x = Type, y = Density)) + geom_boxplot(aes(fill = tissue)) + 
    theme_bw() + theme_Publication() + scale_fill_Publication() + facet_wrap(~Type, 
    scales = "free", ncol = 2) + theme(axis.text.x = element_text(size = 0), 
    legend.text = element_text(size = 6, family = "Helvetica"), axis.text.y = element_text(size = 6, 
        family = "Helvetica"), axis.title = element_text(size = 8, family = "Helvetica")) + 
    geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value), hjust = 1.1, 
        vjust = 1.5, size = 3, parse = TRUE) + scale_y_continuous(trans = log10_trans(), 
    breaks = c(1, 10, 100, 1000))
p

TIL correlations

ihcmat <- subset(ihc_table, select = c("E_CD8_density", "E_CD4_density", "E_CD20_density", 
    "E_Plasma_density", "S_CD8_density", "S_CD4_density", "S_CD20_density", 
    "S_Plasma_density"))
colnames(ihcmat) <- c("E CD8+", "E CD4+", "E CD20+", "E Plasma", "S CD8+", "S CD4+", 
    "S CD20+", "S Plasma")

corres <- corr.test(ihcmat, adjust = "fdr")
corres.r <- corres$r
corres.p <- corres$p
hc <- hclust(dist(corres.r), method = "ward.D")

diag(corres.r) <- NA
diag(corres.p) <- NA

corres.labels <- format(signif(corres.p, 2), 2)

cols <- colorRampPalette(c("#95cbee", "#ffd73e", "#ce472e"))(100)
par(family = "helvetica")
pheatmap(corres.r[hc$order, hc$order], display_numbers = corres.labels, color = cols, 
    number_color = "white", cluster_rows = FALSE, cluster_cols = FALSE)

TCR/BCR analysis

xcr_table <- read_clonotypes(xcr_table_path, duplicates = FALSE, db_path, verbose = 1)

Read 29.5% of 304822 rows
Read 68.9% of 304822 rows
Read 304822 rows and 18 (of 18) columns from 0.070 GB file in 00:00:05
xcr_samples <- unique(xcr_table$condensed_id)
tcr_segment_type <- "TRB"
bcr_segment_type <- "IGH"

id_type <- "condensed_id"

Number of unique VOAs: 95, number of unique condensed_ids: 95.

Clonotype repertoires

Circos

TCR
tcr_clonotypes <- subset(xcr_table, type == tcr_segment_type)

tcr_cross_table <- cross_tabulate(tcr_clonotypes, id_type = id_type)

if (prevalence_option != "freq") {
    summary <- tcr_clonotypes %>% group_by(condensed_id) %>% summarise(nclones = length(cdr3nt))
    dists <- compute_immune_distance_matrix(tcr_cross_table, method = "num_overlaps")
} else {
    summary <- tcr_clonotypes %>% group_by(condensed_id) %>% summarise(nreads = sum(count))
    dists <- compute_immune_distance_matrix(tcr_cross_table, method = "num_reads")
}

overlapmat <- setNames(reshape2::melt(dists), c("sample1", "sample2", "divshared"))
overlapmat <- merge(overlapmat, setNames(summary, c("sample1", "div1")))
overlapmat <- merge(overlapmat, setNames(summary, c("sample2", "div2")))

overlapmat <- subset(overlapmat, sample1 != sample2)
overlapmat$sample1 <- as.character(overlapmat$sample1)
overlapmat$sample2 <- as.character(overlapmat$sample2)

df <- data.frame(sample1 = overlapmat$sample1, sample2 = overlapmat$sample2, 
    shared = overlapmat$divshared)
circos_plot(df, "shared", id_type = id_type, db_path = db_path)

BCR
bcr_clonotypes <- subset(xcr_table, type == bcr_segment_type)

bcr_cross_table <- cross_tabulate(bcr_clonotypes, id_type = id_type)

if (prevalence_option != "freq") {
    summary <- bcr_clonotypes %>% group_by(condensed_id) %>% summarise(nclones = length(cdr3nt))
    dists <- compute_immune_distance_matrix(bcr_cross_table, method = "num_overlaps")
} else {
    summary <- bcr_clonotypes %>% group_by(condensed_id) %>% summarise(nreads = sum(count))
    dists <- compute_immune_distance_matrix(bcr_cross_table, method = "num_reads")
}

overlapmat <- setNames(reshape2::melt(dists), c("sample1", "sample2", "divshared"))
overlapmat <- merge(overlapmat, setNames(summary, c("sample1", "div1")))
overlapmat <- merge(overlapmat, setNames(summary, c("sample2", "div2")))

overlapmat <- subset(overlapmat, sample1 != sample2)
overlapmat$sample1 <- as.character(overlapmat$sample1)
overlapmat$sample2 <- as.character(overlapmat$sample2)

df <- data.frame(sample1 = overlapmat$sample1, sample2 = overlapmat$sample2, 
    shared = overlapmat$divshared)
circos_plot(df, "shared", id_type = id_type, db_path = db_path)

Clonotype repertoire similarity

cross_tables <- list(tcr = tcr_cross_table, bcr = bcr_cross_table)

The distance metric being used is horn.

distance_matrices <- lapply(cross_tables, function(cross_table) {
    distmat <- compute_immune_distance_matrix(cross_table, method = distance_method)
    mat <- as.matrix(distmat)
    return(mat)
})
sims_raw <- lapply(distance_matrices, function(mat) {
    sims <- average_intrapatient_similarity(mat, filter_tissue = filter_tissue, 
        id_type = id_type, db_path = db_path, raw = TRUE)
    sims <- subset(sims, select = -c(patient2))
    colnames(sims) <- mapvalues(colnames(sims), from = c("patient1"), to = c("patient"))
    return(sims)
})

sims_avg <- lapply(distance_matrices, function(mat) {
    average_intrapatient_similarity(mat, filter_tissue = filter_tissue, id_type = id_type, 
        db_path = db_path, raw = FALSE)
})
ignore <- lapply(list("tcr", "bcr"), function(segment) {
    dat <- sims_raw[[segment]]
    p <- ggplot(dat, aes(x = factor(patient), y = 1 - dist)) + geom_boxplot(aes(fill = factor(patient))) + 
        theme_bw() + theme_Publication() + scale_fill_manual(values = pal_patient) + 
        xlab("Patient") + ylab("Pairwise repertoire similarity") + guides(fill = guide_legend(title = "Patient"))
    print(p)
})

Correlation with TIL densities

mean_til_densities <- subset(ihc_table, select = -c(voa, site_id, condensed_id)) %>% 
    group_by(patient_id) %>% summarise_all(function(x) mean(x, na.rm = TRUE))

sims_avg_tcr_til <- merge(sims_avg$tcr, mean_til_densities, by = "patient_id")
sims_avg_bcr_til <- merge(sims_avg$bcr, mean_til_densities, by = "patient_id")

sims_avg_tcr_til$patient_id <- factor(sims_avg_tcr_til$patient_id)
sims_avg_bcr_til$patient_id <- factor(sims_avg_bcr_til$patient_id)

sims_avg_tcr_til <- subset(sims_avg_tcr_til, ncomparisons >= 6)
sims_avg_bcr_til <- subset(sims_avg_bcr_til, ncomparisons >= 6)
tiltypenames <- colnames(ihc_table)[str_detect(colnames(ihc_table), "^(T).*_density$")]

sims_avg_melted <- lapply(list(tcr = sims_avg_tcr_til, bcr = sims_avg_bcr_til), 
    function(merged_sims) {
        other_cols <- colnames(merged_sims)[!colnames(merged_sims) %in% tiltypenames]
        melt(merged_sims, id.vars = other_cols, measure.vars = tiltypenames, 
            variable.name = "Type", value.name = "value")
    })

pvals <- lapply(sims_avg_melted, function(merged_sims_melt) {
    setNames(ddply(merged_sims_melt, .(Type), function(x) {
        df <- subset(as.data.frame(x), !is.na(value))
        pval <- with(df, cor.test(median_sim, value, method = "spearman")$p.value)
        # eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
        # return(as.character(as.expression(eq)))
        return(pval)
    }), c("Type", "p.value"))
})

pvals <- lapply(pvals, function(x) {
    x$p.value <- p.adjust(x$p.value, method = "fdr")
    x$p.value <- unlist(sapply(x$p.value, function(pval) as.character(as.expression(substitute(italic(P) == 
        p, list(p = format(pval, digits = 3)))))))
    return(x)
})

print(pvals$tcr)
                 Type                p.value
1       T_CD8_density italic(P) == "0.00704"
2       T_CD4_density   italic(P) == "0.142"
3      T_CD20_density   italic(P) == "0.275"
4    T_Plasma_density   italic(P) == "0.449"
5 T_Nonplasma_density   italic(P) == "0.142"
print(pvals$bcr)
                 Type              p.value
1       T_CD8_density italic(P) == "0.383"
2       T_CD4_density italic(P) == "0.383"
3      T_CD20_density italic(P) == "0.383"
4    T_Plasma_density italic(P) == "0.383"
5 T_Nonplasma_density italic(P) == "0.383"
ggplot(sims_avg_tcr_til, aes(x = median_sim, y = T_CD8_density)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$tcr, Type == "T_CD8_density")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE) + scale_y_log10()

ggplot(sims_avg_tcr_til, aes(x = median_sim, y = T_CD4_density)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$tcr, Type == "T_CD4_density")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE) + scale_y_log10()

ggplot(sims_avg_bcr_til, aes(x = median_sim, y = T_CD20_density)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$bcr, Type == "T_CD20_density")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE) + scale_y_log10()

ggplot(sims_avg_bcr_til, aes(x = median_sim, y = T_Plasma_density)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$bcr, Type == "T_Plasma_density")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE) + scale_y_log10()

Perhaps a better way of doing this is to use generalized linear models?

summary(glm(median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density + T_Plasma_density, 
    data = sims_avg_tcr_til))

Call:
glm(formula = median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density + 
    T_Plasma_density, data = sims_avg_tcr_til)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.17954  -0.12617  -0.09175   0.06449   0.41226  

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)  
(Intercept)       0.1766786  0.1120449   1.577   0.1408  
T_CD8_density     0.0019203  0.0008616   2.229   0.0457 *
T_CD4_density    -0.0013891  0.0025360  -0.548   0.5939  
T_CD20_density    0.0044570  0.0077225   0.577   0.5745  
T_Plasma_density -0.0026832  0.0032726  -0.820   0.4283  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.04466831)

    Null deviance: 0.89980  on 16  degrees of freedom
Residual deviance: 0.53602  on 12  degrees of freedom
AIC: 1.4783

Number of Fisher Scoring iterations: 2
summary(glm(median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density + T_Plasma_density, 
    data = sims_avg_bcr_til))

Call:
glm(formula = median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density + 
    T_Plasma_density, data = sims_avg_bcr_til)

Deviance Residuals: 
      Min         1Q     Median         3Q        Max  
-0.169546  -0.084801   0.002978   0.051342   0.262292  

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)   
(Intercept)       0.1645175  0.0662052   2.485  0.02870 * 
T_CD8_density     0.0009342  0.0005091   1.835  0.09140 . 
T_CD4_density    -0.0001514  0.0014985  -0.101  0.92119   
T_CD20_density    0.0074466  0.0045631   1.632  0.12864   
T_Plasma_density -0.0060288  0.0019337  -3.118  0.00889 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.01559549)

    Null deviance: 0.37101  on 16  degrees of freedom
Residual deviance: 0.18715  on 12  degrees of freedom
AIC: -16.41

Number of Fisher Scoring iterations: 2

Summary stats

Diversity

richness_column <- "observedDiversity_mean"
shannon_column <- "shannonWienerIndex_mean"
inverse_simpson_column <- "inverseSimpsonIndex_mean"
diversity_files <- list(tcr = tcr_diversity_file, bcr = bcr_diversity_file)

diversity <- lapply(diversity_files, function(f) {
    read_xcr_diversity_file(f, db_path)
})

plot_diversity <- function(col) {
    ignore <- lapply(diversity, function(df) {
        p <- ggplot(df, aes_string(x = "patient_id", y = col)) + geom_boxplot(aes(fill = patient_id)) + 
            theme_bw() + theme_Publication() + scale_fill_manual(values = pal_patient) + 
            xlab("Patient") + ylab("Diversity")
        print(p)
    })
}
Richness
plot_diversity(richness_column)

Shannon entropy
plot_diversity(shannon_column)

Inverse Simpson
plot_diversity(inverse_simpson_column)

Nanostring

exprs <- fread(nanostring_data_path)
labels <- fread(nanostring_annotations_path)
exprmat <- as.data.frame(getExprs(exprs))
annomat <- getAnnos(exprs)
rownames(exprmat) <- annomat$Name

nanostring_samples <- map_id(colnames(exprmat), from = "voa", to = "condensed_id", 
    db_path = db_path)
if (any(table(nanostring_samples) != 1)) {
    stop("Duplicate condensed_ids from the same analysis type (Nanostring).")
}

Cell-type expression signatures

patient_strings <- as.character(map_id(colnames(exprmat), from = "voa", to = "patient_id", 
    db_path = db_path))

## There are no duplicate condensed_ids when mapping from FFPE voa's, so we
## can convert here for readability
colnames(exprmat) <- nanostring_samples

annotations <- merge(annomat, labels, by.x = "Name", by.y = "Gene")
patients <- data.frame(Patient = patient_strings)
rownames(patients) <- colnames(exprmat)

cell_types <- subset(annotations, cell_type != "")$cell_type
cell_type_genes <- subset(annotations, cell_type != "")$Name[order(cell_types)]

annorow <- as.data.frame(subset(annotations, select = c(cell_type)))
rownames(annorow) <- annotations$Name
colnames(annorow)[1] <- "Cell Type"

annocol <- patients

annocolors <- list(`Cell Type` = get_colour_palette(unique(cell_types)), Patient = select_palette("patient"))

mat <- clip_values(t(scale(t(exprmat[cell_type_genes, ]))), hi = 2, lo = -2)

gaps <- which(rev(!duplicated(rev(cell_types[order(cell_types)]))))
pheatmap(mat, annotation_row = annorow, annotation_col = annocol, annotation_colors = annocolors, 
    cluster_rows = FALSE, show_colnames = FALSE, clustering_method = "ward.D", 
    gaps_row = gaps)

Cell-type metagenes

celltype_matrix <- create_celltype_matrix(exprs, labels, db_path)
celltype_matrix_scaled <- clip_values(t(scale(t(celltype_matrix))), hi = 2, 
    lo = -2)

pheatmap(celltype_matrix_scaled, annotation_col = annocol, annotation_colors = annocolors, 
    cluster_rows = TRUE, show_colnames = FALSE, clustering_method = "ward.D")

Correlation with intrapatient XCR variability

celltype_df <- data.frame(celltype_matrix %>% t, check.names = FALSE) %>% rownames_to_column(var = "condensed_id")
celltype_df$patient_id <- map_id(celltype_df$condensed_id, from = "condensed_id", 
    to = "patient_id", db_path)

mean_celltype_expression <- subset(celltype_df, select = -c(condensed_id)) %>% 
    group_by(patient_id) %>% summarise_all(function(x) mean(x, na.rm = TRUE))

sims_avg_tcr_expr <- merge(sims_avg$tcr, mean_celltype_expression, by = "patient_id")
sims_avg_bcr_expr <- merge(sims_avg$bcr, mean_celltype_expression, by = "patient_id")

sims_avg_tcr_expr$patient_id <- factor(sims_avg_tcr_expr$patient_id)
sims_avg_bcr_expr$patient_id <- factor(sims_avg_bcr_expr$patient_id)
exprnames <- colnames(celltype_df)[!colnames(celltype_df) %in% c("condensed_id", 
    "patient_id")]

sims_avg_melted <- lapply(list(tcr = sims_avg_tcr_expr, bcr = sims_avg_bcr_expr), 
    function(merged_sims) {
        other_cols <- colnames(merged_sims)[!colnames(merged_sims) %in% exprnames]
        melt(merged_sims, id.vars = other_cols, measure.vars = exprnames, variable.name = "Type", 
            value.name = "value")
    })

pvals <- lapply(sims_avg_melted, function(merged_sims_melt) {
    setNames(ddply(merged_sims_melt, .(Type), function(x) {
        df <- subset(as.data.frame(x), !is.na(value))
        pval <- with(df, cor.test(mean_sim, value, method = "spearman")$p.value)
        eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
        return(as.character(as.expression(eq)))
    }), c("Type", "p.value"))
})

print(pvals$tcr)
                 Type                 p.value
1                 aDC   italic(P) == "0.0138"
2              B-cell    italic(P) == "0.177"
3          CD8 T-cell  italic(P) == "0.00941"
4      Cytotoxic cell italic(P) == "0.000607"
5                  DC    italic(P) == "0.992"
6         Eosinophils   italic(P) == "0.0111"
7                 iDC    italic(P) == "0.199"
8         Macrophages    italic(P) == "0.211"
9           Mast cell   italic(P) == "0.0598"
10        Neutrophils    italic(P) == "0.368"
11 NK CD56bright cell    italic(P) == "0.042"
12    NK CD56dim cell  italic(P) == "0.00021"
13            NK cell    italic(P) == "0.219"
14                pDC    italic(P) == "0.807"
15      T helper cell    italic(P) == "0.677"
16             T-cell italic(P) == "0.000655"
17                Tcm  italic(P) == "0.00029"
18                Tem    italic(P) == "0.538"
19                TFH  italic(P) == "0.00151"
20                Tgd    italic(P) == "0.617"
21           Th1 cell   italic(P) == "0.0132"
22          Th17 cell    italic(P) == "0.347"
23           Th2 cell    italic(P) == "0.967"
24               Treg    italic(P) == "0.368"
print(pvals$bcr)
                 Type               p.value
1                 aDC  italic(P) == "0.202"
2              B-cell italic(P) == "0.0442"
3          CD8 T-cell  italic(P) == "0.237"
4      Cytotoxic cell  italic(P) == "0.199"
5                  DC   italic(P) == "0.46"
6         Eosinophils  italic(P) == "0.695"
7                 iDC  italic(P) == "0.183"
8         Macrophages  italic(P) == "0.649"
9           Mast cell italic(P) == "0.0712"
10        Neutrophils italic(P) == "0.0197"
11 NK CD56bright cell  italic(P) == "0.916"
12    NK CD56dim cell  italic(P) == "0.129"
13            NK cell  italic(P) == "0.846"
14                pDC  italic(P) == "0.382"
15      T helper cell  italic(P) == "0.972"
16             T-cell  italic(P) == "0.237"
17                Tcm  italic(P) == "0.129"
18                Tem  italic(P) == "0.767"
19                TFH  italic(P) == "0.239"
20                Tgd  italic(P) == "0.337"
21           Th1 cell  italic(P) == "0.429"
22          Th17 cell italic(P) == "0.0524"
23           Th2 cell italic(P) == "0.0757"
24               Treg  italic(P) == "0.382"
ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `CD8 T-cell`)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$tcr, Type == "CD8 T-cell")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)

ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `T helper cell`)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$tcr, Type == "T helper cell")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)

ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `T-cell`)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$tcr, Type == "T-cell")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)

ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `Th1 cell`)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$tcr, Type == "Th1 cell")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)

ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `Th2 cell`)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$tcr, Type == "Th2 cell")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)

ggplot(sims_avg_bcr_expr, aes(x = mean_sim, y = `B-cell`)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + 
    annotate(geom = "text", label = subset(pvals$bcr, Type == "B-cell")$p.value, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)

Intrapatient variability (overall)

Requires that condensed_ids are never duplicated in any analysis type. I’ve inserted checks into each of the sections above to ensure this.

Nanostring

We’ll work with cell-type metagenes as working with all expression values will be biased by the number of genes associated with each cell type.

TIL densities

tiltypes <- c("T_CD8_density", "T_CD4_density", "T_CD20_density", "T_Plasma_density")

ihc_slide_data <- subset(ihc_table_slide, select = c("condensed_id", "patient_id", 
    "slide", tiltypes))
ihc_melted <- melt(ihc_slide_data, id.vars = c("condensed_id", "patient_id", 
    "slide"), measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
ihc_melted$tiltype <- as.character(ihc_melted$tiltype)

XCR

Correlations

T_CELL_TYPES <- c("T_CD8_density", "T_CD4_density")
B_CELL_TYPES <- c("T_CD20_density", "T_Plasma_density")
NANOSTRING_T_TYPES <- c("CD8 T-cell", "Cytotoxic cell", "T-cell", "T helper cell", 
    "Th1 cell", "Th2 cell", "Th17 cell", "Treg", "Tcm", "Tem")
NANOSTRING_B_TYPES <- c("B-cell", "TFH")
XCR_T_TYPES <- c("tcr")
XCR_B_TYPES <- c("bcr")

cols <- colorRampPalette(c("#95cbee", "#ffd73e", "#ce472e"))(100)

corplot <- function(var1, var2, sub1, sub2) {
    cor_df <- Reduce(function(x, y) merge(x, y, by = "patient_id"), list(var1, 
        var2))
    cor_matrix <- subset(cor_df, select = -c(patient_id))
    
    corres <- corr.test(cor_matrix, method = "pearson", adjust = "none")
    # hc <- hclust(dist(corres$r), method='ward.D')
    diag(corres$r) <- NA
    diag(corres$p) <- NA
    
    corres.r <- corres$r[sub1, sub2]
    corres.p <- corres$p[sub1, sub2]
    
    corres.p.labels <- format(signif(corres.p, 2), 2)
    
    pheatmap(corres.r, display_numbers = corres.p.labels, color = cols, number_color = "white", 
        cluster_rows = TRUE, cluster_cols = TRUE)
}

TIL densities vs. XCR

intersect_samples <- compute_sample_intersection(samples_list = list(ihc_samples, 
    xcr_samples))

til_var <- compute_ihc_var(ihc_melted, ihc_table, samples = intersect_samples, 
    til_variance_option = "stabilize", tiltypes)
xcr_var <- compute_xcr_var(distance_matrices, samples = intersect_samples, filter_tissue = FALSE, 
    id_type = "condensed_id", db_path)

ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)

corplot(til_var, xcr_var, ihc_entries, xcr_entries)

TIL densities vs. Nanostring

intersect_samples <- compute_sample_intersection(samples_list = list(ihc_samples, 
    nanostring_samples))

til_var <- compute_ihc_var(ihc_melted, ihc_table, samples = intersect_samples, 
    til_variance_option = "stabilize", tiltypes)
expr_var <- compute_expr_var(celltype_matrix, samples = intersect_samples, scale = TRUE)

ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)

corplot(til_var, expr_var, ihc_entries, expr_entries)

XCR vs. Nanostring

intersect_samples <- compute_sample_intersection(samples_list = list(xcr_samples, 
    nanostring_samples))

xcr_var <- compute_xcr_var(distance_matrices, samples = intersect_samples, filter_tissue = FALSE, 
    id_type = "condensed_id", db_path)
expr_var <- compute_expr_var(celltype_matrix, samples = intersect_samples, scale = TRUE)

xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)

corplot(xcr_var, expr_var, xcr_entries, expr_entries)

Intrapatient variability (pairwise)

We can also look at consistency in intrapatient similarity on a pairwise basis.

scalar_dist_method <- "manhattan"

ihc_stabilized <- stabilize_ihc_variances(ihc_slide_data, ihc_table, tiltypes)
ihc_stabilized$condensed_id <- factor_id(ihc_stabilized$condensed_id, "condensed_id", 
    db_path)
ihc_sims <- pairwise_similarity_frame(ihc_stabilized, id_col = id_type, group = "patient_id", 
    method = scalar_dist_method)

## always filter_tissue = FALSE
xcr_sims_raw_all <- lapply(names(distance_matrices), function(segment) {
    mat <- distance_matrices[[segment]]
    sims <- average_intrapatient_similarity(mat, filter_tissue = FALSE, id_type = id_type, 
        db_path = db_path, raw = TRUE)
    sims <- subset(sims, select = -c(patient2))
    colnames(sims) <- mapvalues(colnames(sims), from = c("patient1", "dist"), 
        to = c("patient_id", segment))
    return(sims)
})
names(xcr_sims_raw_all) <- names(distance_matrices)

celltype_df <- rownames_to_column(as.data.frame(t(celltype_matrix)), var = id_type)
celltype_df$patient_id <- map_id(celltype_df$condensed_id, from = "condensed_id", 
    to = "patient_id", db_path)
celltype_sims <- pairwise_similarity_frame(celltype_df, id_col = id_type, group = "patient_id", 
    method = scalar_dist_method)

TIL densities vs XCR

ihc_xcr_sims <- Reduce(plyr::join, c(xcr_sims_raw_all, list(ihc_sims)))

ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)

cor_matrix <- subset(ihc_xcr_sims, select = c(ihc_entries, xcr_entries))

corres <- corr.test(cor_matrix, method = "spearman", adjust = "none")
# hc <- hclust(dist(corres$r), method='ward.D')
diag(corres$r) <- NA
diag(corres$p) <- NA

corres.r <- corres$r[ihc_entries, xcr_entries]
corres.p <- corres$p[ihc_entries, xcr_entries]

corres.p.labels <- format(signif(corres.p, 2), 2)

pheatmap(corres.r, display_numbers = corres.p.labels, color = cols, number_color = "white", 
    cluster_rows = TRUE, cluster_cols = TRUE)

xcr_sims_melted <- Reduce(plyr::join, xcr_sims_raw_all)

xcr_sims_melted <- melt(xcr_sims_melted, id.vars = colnames(xcr_sims_melted)[!colnames(xcr_sims_melted) %in% 
    xcr_entries], measure.vars = xcr_entries, variable.name = "segment", value.name = "xcrsim")

ihc_sims_melted <- melt(ihc_sims, id.vars = colnames(ihc_sims)[!colnames(ihc_sims) %in% 
    ihc_entries], measure.vars = ihc_entries, variable.name = "tiltype", value.name = "ihcsim")

ihc_xcr_melted <- merge(ihc_sims_melted, xcr_sims_melted, by = c("sample1", 
    "sample2", "patient_id"))

ggplot(ihc_xcr_melted, aes(x = ihcsim, y = xcrsim)) + geom_point(aes(colour = factor(patient_id))) + 
    facet_wrap(tiltype ~ segment, scales = "free") + theme_bw() + theme_Publication() + 
    scale_colour_manual(values = pal_patient)

summary(lmer(xcrsim ~ ihcsim + (1 | patient_id), data = subset(ihc_xcr_melted, 
    tiltype == "T_CD8_density" & segment == "tcr" & !is.na(ihcsim) & !is.na(xcrsim))))
Linear mixed model fit by REML t-tests use Satterthwaite approximations
  to degrees of freedom [lmerMod]
Formula: xcrsim ~ ihcsim + (1 | patient_id)
   Data: subset(ihc_xcr_melted, tiltype == "T_CD8_density" & segment ==  
    "tcr" & !is.na(ihcsim) & !is.na(xcrsim))

REML criterion at convergence: -87.7

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.4652 -0.4055  0.0996  0.4997  1.5398 

Random effects:
 Groups     Name        Variance Std.Dev.
 patient_id (Intercept) 0.05215  0.2284  
 Residual               0.01781  0.1334  
Number of obs: 121, groups:  patient_id, 17

Fixed effects:
             Estimate Std. Error        df t value Pr(>|t|)    
(Intercept) 6.911e-01  5.951e-02 1.853e+01  11.613 6.07e-10 ***
ihcsim      7.561e-03  4.369e-02 1.116e+02   0.173    0.863    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
       (Intr)
ihcsim -0.271

Critically, TCR similarity does not correlate with CD8+ or CD4+ T-cell density similarity.

TIL densities vs. Nanostring

ihc_expr_sims <- Reduce(plyr::join, list(ihc_sims, celltype_sims))

ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)

cor_matrix <- subset(ihc_expr_sims, select = c(ihc_entries, expr_entries))

corres <- corr.test(cor_matrix, method = "spearman", adjust = "none")
# hc <- hclust(dist(corres$r), method='ward.D')
diag(corres$r) <- NA
diag(corres$p) <- NA

corres.r <- corres$r[ihc_entries, expr_entries]
corres.p <- corres$p[ihc_entries, expr_entries]

corres.p.labels <- format(signif(corres.p, 2), 2)

pheatmap(corres.r, display_numbers = corres.p.labels, color = cols, number_color = "white", 
    cluster_rows = TRUE, cluster_cols = TRUE)

XCR vs. Nanostring

xcr_expr_sims <- Reduce(plyr::join, c(xcr_sims_raw_all, list(celltype_sims)))

xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)

cor_matrix <- subset(xcr_expr_sims, select = c(xcr_entries, expr_entries))

corres <- corr.test(cor_matrix, method = "spearman", adjust = "none")
# hc <- hclust(dist(corres$r), method='ward.D')
diag(corres$r) <- NA
diag(corres$p) <- NA

corres.r <- corres$r[xcr_entries, expr_entries]
corres.p <- corres$p[xcr_entries, expr_entries]

corres.p.labels <- format(signif(corres.p, 2), 2)

pheatmap(corres.r, display_numbers = corres.p.labels, color = cols, number_color = "white", 
    cluster_rows = TRUE, cluster_cols = TRUE)

These results aren’t really consistent with the patient-level results. It’s important to note, however, that these correlations are calculated naive to the patient labels. i.e. there’s no controlling for patient with a random intercept and slope, etc.

Things become more difficult to interpret if we incorporate random intercepts and slopes. There could be high correlations within patients but low correlations between patients, and vice versa.

Think about how we can show this in a statistically robust way.

IHC pairwise significance testing

pairwise_results_filtered <- ihc_pairwise_significance(ihc_table_slide, tiltypes, 
    db_path, slide_count_filter = 10)
Q_VALUE_THRESHOLD <- 0.01

pairwise_results_filtered$significant <- pairwise_results_filtered$q.value < 
    Q_VALUE_THRESHOLD

pairwise_tiltype_summary <- pairwise_results_filtered %>% group_by(patient_id, 
    tiltype) %>% summarise(any_significant = any(significant))

pairwise_tiltype_count <- pairwise_tiltype_summary %>% group_by(tiltype) %>% 
    summarise(npatients = sum(any_significant, na.rm = TRUE))

pairwise_tiltype_count
x <- pairwise_tiltype_summary %>% group_by(tiltype) %>% do(sigvec = .$any_significant)
x_pairs <- as_tibble(merge(x, x, by = c()))
x_pairs_p <- x_pairs %>% group_by(tiltype.x, tiltype.y) %>% summarise(p.value = pchisq(GenomicRanges::phicoef(table(unlist(sigvec.x), 
    unlist(sigvec.y)))^2 * sum(table(unlist(sigvec.x), unlist(sigvec.y))), df = 1, 
    lower.tail = FALSE))
x_pairs_p$q.value <- p.adjust(x_pairs_p$p.value, method = "fdr")

x_pairs_p
---
title: "Immune variability"
output: 
  html_notebook:
    toc: true
    toc_depth: 5
    toc_float: true
params:
  rmd: "immune_variability.Rmd"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', 'Rmd/immune_variability.Rmd', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table_slide.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "notebook" = 'Rmd/immune_variability.Rmd', "ihc_table_slide" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table_slide.tsv', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "xcr_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/web/immune_variability.nb.html'),
    params = list('clones', 'horn', 'stabilize', 'ithi-analysis-immune-variability', 'nofilter', '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', 'TRUE', "prevalence_option" = 'clones', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "til_variance_option" = 'stabilize', "name" = 'ithi-analysis-immune-variability', "xcr_filter_tissue" = 'nofilter', "xcr_distance_method" = 'horn', "normalize_option" = 'TRUE'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperanalysis2/immune_variability.log'),
    resources = list(),
    config = list("v_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBV.fasta', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/branch_data.tsv', "driver_analysis_notebook" = 'Rmd/driver_analysis.Rmd', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "intermediate_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2', "immtyper_lengths" = '11 12 13 14 15 16 17 18', "molsubtype_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2', "til_classifier_notebook" = 'Rmd/til_classifier.Rmd', "mmctm_ov_combined_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/plots/ov_snv-sv_sigs_multipanel.pdf', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "ihc_xcr_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "ith_til_notebook" = 'Rmd/ith_til_densities.Rmd', "ith_statistics_notebook" = 'Rmd/ith_statistics.Rmd', "spatial_result_dirs" = list("epithelial" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith3/abc', "stromal" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith5/abc'), "mvclust_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "mvclust_nclust" = 3, "smooth_type" = 'loess', "clonal_figure_template" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/templates/clonal_figure.svg', "classifier_type" = 'knn', "immtyper_models" = '/shahlab/alzhang/projects/ITH_Immune/results/immtyper_results/klarenbeek/aa_vj/gradboost', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "nclusts" = 3, "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clone_data.tsv', "xcrmapscape_files" = c('/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/1.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/2.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/3.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/4.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/7.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/9.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/10.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/11.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/12.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/13.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/14.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/15.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/16.svg', '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/xcrmapscape/17.svg'), "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "icgc_molecular_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "bcrphylo_correlations_notebook" = 'Rmd/bcr_phylo_correlations.Rmd', "mmctm_patient_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', "library_sizes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/library_sizes.tsv', "index_notebook" = 'Rmd/index.Rmd', "mutation_signature_notebook" = 'Rmd/mutation_signatures.Rmd', "mmctm_ov_combined_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', "logdir" = '/shahlab/alzhang/clusttmp/paperanalysis2', "tils_for_variability" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "PNG_QUALITY" = 300, "bcrphylo_examples_notebook" = 'Rmd/bcr_phylo_examples.Rmd', "mmctm_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', "example_msa_plot" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/old/alignment_plots/msa/ith2_2/clust9/indel_reversed.html', "ith_project_results" = '/shahlab/alzhang/projects/ith3/data/results', "mmctm_final_patient_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov/plots/ith-by-patient_snv-sv_sigs_multipanel.pdf', "known_subtypes_merged" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/known_subtypes_merged.tsv', "xcr_mapping_notebook" = 'Rmd/xcr_mapping.Rmd', "master_variant_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "known_subtype_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "tcga_clinical" = '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "proportion_subclonal_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "ihc_run1" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd8cd3cd20/validated_stats_weighted_new.rdata', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperanalysis2', "ith_stats_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "figure_gallery_notebook" = 'Rmd/figures.Rmd', "mapscape_notebook" = 'Rmd/mapscape.Rmd', "ihc_run2" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd79cd138cd68/validated_stats_weighted.rdata', "spatial_notebook" = 'Rmd/spatial_analysis.Rmd', "mmctm_sample_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/plots/ith-by-sample_snv-sv_sigs_multipanel.pdf', "ihc_xcr_stats_notebook" = 'Rmd/ihc_xcr_stats.Rmd', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "driver_map" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/drivers/data/gene_list_mapped.bed', "mmctm_sample_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', "multiviewclustering_notebook" = 'Rmd/multiviewclustering.Rmd', "icgc_specimen_file" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "molsubtype_notebook" = 'Rmd/molecular_subtypes.Rmd', "clonal_samplers" = c('HMC', 'NUTS'), "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/tree_data.tsv', "wang_fbi_status" = '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run4', "tcga_ov_annotations" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "xcr_distance_method" = 'horn', "master_breakpoint_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "nscatters" = 6, "bcrphylo_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "sad_notebook" = 'Rmd/species_abundance_distributions.Rmd', "immune_variability_notebook" = 'Rmd/immune_variability.Rmd', "mutsig_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "xcrmapscape_tcr_patient_order" = c(15, 1, 3, 4, 2, 17, 7, 14, 9, 10, 12, 13, 11, 16), "icgc_clinical" = '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', "tcga_expr_matrix" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv', "default_sampler" = 'HMC', "xcr_clones_notebook" = 'Rmd/xcr_clones_analysis.Rmd', "xcr_qc_notebook" = 'Rmd/replicates.Rmd', "j_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBJ.fasta', "PNG_DENSITY" = 300, "bcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/IGH_clonotypes_filtered.txt', "xcrmapscape_notebook" = 'Rmd/xcrmapscape.Rmd', "icgc_expr_melted" = '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', "nanostring_signature_notebook" = 'Rmd/nanostring_signatures.Rmd', "prevalence_threshold" = 0.01, "example_annotations" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/final_partitions/ith2_2/clust9/annotations_flagged.tsv', "subtype_marker_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/subtype_markers.tsv', "icgc_normalized_reads_matrix" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "mmctm_sample_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/plots/ith-by-ancestral-sample_snv-sv_sigs_multipanel.pdf', "mmctm_patient_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/plots/ith-by-patient-ancestry_snv-sv_sigs_multipanel.pdf', "variability_type" = 'stabilize', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "phenotype_threshold" = 0.85, "tcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/TRB_clonotypes_filtered.txt', "neoantigen_editing_notebook" = 'Rmd/immunoediting.Rmd'),
    rule = 'immune_variability'
)
######## Original script #########

                        ```


```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE)
```


```{r}
library(ithi.utils)
load_base_libs()
library(GenomicRanges)

library(nanotools)
library(ithi.meta)
library(ithi.ihc)
library(ithi.xcr)
library(ithi.expr)
```

## Colour palettes

```{r}
pal_patient <- select_palette("patient")
```

## Parameters

```{r}
db_path <- snakemake@params$db
ihc_table_path <- snakemake@input$ihc_table
ihc_table_slide_path <- snakemake@input$ihc_table_slide

xcr_table_path <- snakemake@input$xcr_table
prevalence_option <- snakemake@params$prevalence_option
distance_method <- snakemake@params$xcr_distance_method
filter_tissue <- snakemake@params$xcr_filter_tissue == "filter"
tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity

nanostring_data_path <- snakemake@input$nanostring_data
nanostring_annotations_path <- snakemake@input$nanostring_annotations

normalize_option <- snakemake@params$normalize %in% c("true", "TRUE", "True")
til_variance_option <- snakemake@params$til_variance_option
```

## Metadata

```{r}
db <- src_sqlite(db_path, create=FALSE)
samples <- collect(tbl(db, "samples"))
```

## IHC analysis

```{r}
ihc_table <- fread(ihc_table_path)
ihc_table_slide <- fread(ihc_table_slide_path)
```

```{r}
ihc_samples <- ihc_table$condensed_id

if (any(table(ihc_samples) != 1)) {
  stop("Duplicate condensed_ids from the same analysis type (IHC).")
}
```

### Total TIL densities

```{r, fig.width=12, fig.height=3.6}
tiltypes <- c("T_CD8_density", "T_CD4_density",
              "T_CD20_density", "T_Plasma_density")

plot_ihc_barplot(ihc_table, tiltypes, labs=c("CD8+ T cell", "CD4+ T cell", "CD20+ B cell", "Plasma cell"), Y_MAX=1500, db_path = db_path)
```

### TIL anatomic differences

```{r}
tiltypes <- c("T_CD8_density", "T_CD4_density", "T_CD20_density", "T_Plasma_density")
ihc_data <- subset(ihc_table, select=c("condensed_id", "patient_id", tiltypes))
ihc_data$tissue <- extract_tissue(ihc_data$condensed_id, type = "general", shorten=TRUE)

ihc_data_melted <- melt(ihc_data, id.vars = c("condensed_id", "patient_id", "tissue"),
     measure.vars = tiltypes, variable.name = "Type", value.name = "Density")

RENAME_MAP <- list(
  "T_CD8_density"="CD8+",
  "T_CD4_density"="CD4+",
  "T_CD20_density"="CD20+",
  "T_Plasma_density"="Plasma"
)

ihc_data_melted$Type <- mapvalues(ihc_data_melted$Type, from = names(RENAME_MAP),
                                  to=unname(unlist(RENAME_MAP)))

## Calculate p-values

# We have multiple levels and multiple random effect levels. Residuals are ~approx normal. 
# Let's use a GLM. I don't know of a nonparametric alternative for >=3 Tx levels. 
pvals <- setNames(ddply(ihc_data_melted, .(Type), function(x) {
  df <- subset(as.data.frame(x), !is.na(Density))
  model <- lme(Density ~ tissue, random=~1|patient_id, data=df, method="REML")
  result <- anova.lme(model, type="sequential", adjustSigma=FALSE)
  
  tissue_pval <- result$`p-value`[2]
  eq <- substitute(italic(P)==p, list(p=format(tissue_pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("Type", "p.value"))
```

```{r, fig.width=10, fig.height=10}
p <- ggplot(ihc_data_melted, aes(x=Type, y=Density)) + geom_boxplot(aes(fill=tissue)) + theme_bw() +
  theme_Publication() + scale_fill_Publication()  + facet_wrap(~ Type, scales="free", ncol=2) + 
  theme(axis.text.x = element_text(size=0),
        legend.text=element_text(size=6,family="Helvetica"),
        axis.text.y=element_text(size=6,family="Helvetica"),
        axis.title=element_text(size=8,family="Helvetica")) + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value),
                                                        hjust=1.1, vjust=1.5,size=3,parse=TRUE) +
  scale_y_continuous(trans=log10_trans(), breaks=c(1, 10, 100, 1000))
p
```


### TIL correlations

```{r}
ihcmat <- subset(ihc_table, select=c("E_CD8_density", "E_CD4_density",
                               "E_CD20_density", "E_Plasma_density",
                               "S_CD8_density", "S_CD4_density",
                               "S_CD20_density", "S_Plasma_density"))
colnames(ihcmat) <- c("E CD8+", "E CD4+", "E CD20+", "E Plasma", "S CD8+", "S CD4+", "S CD20+",
                      "S Plasma")

corres <- corr.test(ihcmat, adjust="fdr")
corres.r <- corres$r
corres.p <- corres$p
hc <- hclust(dist(corres.r), method="ward.D")

diag(corres.r) <- NA
diag(corres.p) <- NA

corres.labels <- format(signif(corres.p, 2), 2)

cols <- colorRampPalette(c("#95cbee", "#ffd73e","#ce472e"))(100)
```

```{r fig.width=10, fig.height=10}
par(family='helvetica')
pheatmap(corres.r[hc$order,hc$order], display_numbers = corres.labels, 
         color = cols, number_color = "white", 
         cluster_rows = FALSE, cluster_cols=FALSE)
```


## TCR/BCR analysis

```{r}
xcr_table <- read_clonotypes(xcr_table_path, duplicates=FALSE, db_path, verbose = 1)

xcr_samples <- unique(xcr_table$condensed_id)
```

````{r}
tcr_segment_type <- "TRB"
bcr_segment_type <- "IGH"

id_type <- "condensed_id"
```

Number of unique VOAs: `r length(unique(xcr_table$voa))`, number of unique condensed_ids: `r length(unique(xcr_table$condensed_id))`.

### Clonotype repertoires

#### Circos 

##### TCR

```{r}
tcr_clonotypes <- subset(xcr_table, type == tcr_segment_type)

tcr_cross_table <- cross_tabulate(tcr_clonotypes, id_type = id_type)

if (prevalence_option != "freq") {
  summary <- tcr_clonotypes %>% group_by(condensed_id) %>% summarise(nclones=length(cdr3nt))
  dists <- compute_immune_distance_matrix(tcr_cross_table, method="num_overlaps")
} else {
  summary <- tcr_clonotypes %>% group_by(condensed_id) %>% summarise(nreads=sum(count))
  dists <- compute_immune_distance_matrix(tcr_cross_table, method="num_reads")
}

overlapmat <- setNames(reshape2::melt(dists), c("sample1", "sample2", "divshared"))
overlapmat <- merge(overlapmat, setNames(summary, c("sample1", "div1")))
overlapmat <- merge(overlapmat, setNames(summary, c("sample2", "div2")))

overlapmat <- subset(overlapmat, sample1 != sample2)
overlapmat$sample1 <- as.character(overlapmat$sample1)
overlapmat$sample2 <- as.character(overlapmat$sample2)

df <- data.frame(sample1 = overlapmat$sample1,
                 sample2 = overlapmat$sample2,
                 shared = overlapmat$divshared)
```

```{r, fig.width=10, fig.height=10}
circos_plot(df, "shared", id_type = id_type, db_path = db_path)
```


##### BCR

```{r}
bcr_clonotypes <- subset(xcr_table, type == bcr_segment_type)

bcr_cross_table <- cross_tabulate(bcr_clonotypes, id_type = id_type)

if (prevalence_option != "freq") {
  summary <- bcr_clonotypes %>% group_by(condensed_id) %>% summarise(nclones=length(cdr3nt))
  dists <- compute_immune_distance_matrix(bcr_cross_table, method="num_overlaps")
} else {
  summary <- bcr_clonotypes %>% group_by(condensed_id) %>% summarise(nreads=sum(count))
  dists <- compute_immune_distance_matrix(bcr_cross_table, method="num_reads")
}

overlapmat <- setNames(reshape2::melt(dists), c("sample1", "sample2", "divshared"))
overlapmat <- merge(overlapmat, setNames(summary, c("sample1", "div1")))
overlapmat <- merge(overlapmat, setNames(summary, c("sample2", "div2")))

overlapmat <- subset(overlapmat, sample1 != sample2)
overlapmat$sample1 <- as.character(overlapmat$sample1)
overlapmat$sample2 <- as.character(overlapmat$sample2)

df <- data.frame(sample1 = overlapmat$sample1,
                 sample2 = overlapmat$sample2,
                 shared = overlapmat$divshared)
```

```{r fig.width=10, fig.height=10}
circos_plot(df, "shared", id_type = id_type, db_path = db_path)
```

### Clonotype repertoire similarity

```{r}
cross_tables <- list(tcr=tcr_cross_table, bcr=bcr_cross_table)
```

The distance metric being used is `r distance_method`. 

```{r}
distance_matrices <- lapply(cross_tables, function(cross_table) {
  distmat <- compute_immune_distance_matrix(cross_table, method = distance_method)
  mat <- as.matrix(distmat)
  return(mat)
})
```


```{r}
sims_raw <- lapply(distance_matrices, function(mat) {
  sims <- average_intrapatient_similarity(mat, filter_tissue = filter_tissue, id_type = id_type,
                                  db_path = db_path, raw = TRUE)
  sims <- subset(sims, select=-c(patient2))
  colnames(sims) <- mapvalues(colnames(sims), from = c("patient1"), to = c("patient"))
  return(sims)
})

sims_avg <- lapply(distance_matrices, function(mat) {
  average_intrapatient_similarity(mat, filter_tissue = filter_tissue, id_type = id_type,
                                  db_path = db_path, raw = FALSE)
})
```

```{r, fig.width=10, fig.height=6}
ignore <- lapply(list("tcr", "bcr"), function(segment) {
  dat <- sims_raw[[segment]]
  p <- ggplot(dat, aes(x=factor(patient), y=1-dist)) + geom_boxplot(aes(fill=factor(patient))) + theme_bw() + theme_Publication() + scale_fill_manual(values = pal_patient) + 
    xlab("Patient") + ylab("Pairwise repertoire similarity") + guides(fill=guide_legend(title="Patient"))
  print(p)
})
```


#### Correlation with TIL densities

```{r}
mean_til_densities <- subset(ihc_table, select=-c(voa, site_id, condensed_id)) %>% 
  group_by(patient_id) %>%
  summarise_all(function(x) mean(x, na.rm=TRUE))

sims_avg_tcr_til <- merge(sims_avg$tcr, mean_til_densities, by="patient_id")
sims_avg_bcr_til <- merge(sims_avg$bcr, mean_til_densities, by="patient_id")

sims_avg_tcr_til$patient_id <- factor(sims_avg_tcr_til$patient_id)
sims_avg_bcr_til$patient_id <- factor(sims_avg_bcr_til$patient_id)

sims_avg_tcr_til <- subset(sims_avg_tcr_til, ncomparisons >= 6)
sims_avg_bcr_til <- subset(sims_avg_bcr_til, ncomparisons >= 6)
```

```{r}
tiltypenames <- colnames(ihc_table)[str_detect(colnames(ihc_table), "^(T).*_density$")]

sims_avg_melted <- lapply(list(tcr=sims_avg_tcr_til, bcr=sims_avg_bcr_til), function(merged_sims) {
  other_cols <- colnames(merged_sims)[!colnames(merged_sims) %in% tiltypenames]
  melt(merged_sims, id.vars = other_cols, measure.vars = tiltypenames,
       variable.name = "Type", value.name = "value")
})

pvals <- lapply(sims_avg_melted, function(merged_sims_melt) {
  setNames(ddply(merged_sims_melt, .(Type), function(x) {
    df <- subset(as.data.frame(x), !is.na(value))
    pval <- with(df, cor.test(median_sim, value, method = "spearman")$p.value)
    #eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
    #return(as.character(as.expression(eq)))
    return(pval)
  }), c("Type", "p.value"))
})

pvals <- lapply(pvals, function(x) {
  x$p.value <- p.adjust(x$p.value, method = "fdr")
  x$p.value <- unlist(sapply(x$p.value, function(pval) as.character(as.expression(substitute(italic(P)==p, list(p=format(pval, digits=3)))))))
  return(x)
})

print(pvals$tcr)

print(pvals$bcr)
```

```{r, fig.width=6, fig.height=6}
ggplot(sims_avg_tcr_til, aes(x=median_sim, y=T_CD8_density)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$tcr, Type == "T_CD8_density")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE) + scale_y_log10()

ggplot(sims_avg_tcr_til, aes(x=median_sim, y=T_CD4_density)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$tcr, Type == "T_CD4_density")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE) + scale_y_log10()

ggplot(sims_avg_bcr_til, aes(x=median_sim, y=T_CD20_density)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$bcr, Type == "T_CD20_density")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE) + scale_y_log10()

ggplot(sims_avg_bcr_til, aes(x=median_sim, y=T_Plasma_density)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$bcr, Type == "T_Plasma_density")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE) + scale_y_log10()
```

Perhaps a better way of doing this is to use generalized linear models?

```{r}
summary(glm(median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density + T_Plasma_density, data=sims_avg_tcr_til))
summary(glm(median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density + T_Plasma_density, data=sims_avg_bcr_til))
```

### Summary stats

#### Diversity

```{r}
richness_column <- "observedDiversity_mean"
shannon_column <- "shannonWienerIndex_mean"
inverse_simpson_column <- "inverseSimpsonIndex_mean"
```


```{r}
diversity_files <- list(tcr=tcr_diversity_file,
                        bcr=bcr_diversity_file)

diversity <- lapply(diversity_files, function(f) {
  read_xcr_diversity_file(f, db_path)
})

plot_diversity <- function(col) {
  ignore <- lapply(diversity, function(df) {
    p <- ggplot(df, aes_string(x="patient_id", y=col)) + geom_boxplot(aes(fill=patient_id)) + theme_bw() + theme_Publication() + scale_fill_manual(values = pal_patient) + xlab("Patient") + ylab("Diversity")
    print(p)
  })
}
```

##### Richness

```{r fig.width=10, fig.height=6}
plot_diversity(richness_column)
```

##### Shannon entropy

```{r fig.width=10, fig.height=6}
plot_diversity(shannon_column)
```

##### Inverse Simpson

```{r fig.width=10, fig.height=6}
plot_diversity(inverse_simpson_column)
```

## Nanostring

```{r}
exprs <- fread(nanostring_data_path)
labels <- fread(nanostring_annotations_path)
```

```{r}
exprmat <- as.data.frame(getExprs(exprs))
annomat <- getAnnos(exprs)
rownames(exprmat) <- annomat$Name

nanostring_samples <- map_id(colnames(exprmat), from = "voa", to="condensed_id", db_path = db_path)
```

```{r}
if (any(table(nanostring_samples) != 1)) {
  stop("Duplicate condensed_ids from the same analysis type (Nanostring).")
}
```

### Cell-type expression signatures

```{r}
patient_strings <- as.character(map_id(colnames(exprmat), from = "voa", to="patient_id", db_path = db_path))

## There are no duplicate condensed_ids when mapping from FFPE voa's, so we can convert here for readability
colnames(exprmat) <- nanostring_samples

annotations <- merge(annomat, labels, by.x="Name", by.y="Gene")
patients <- data.frame(Patient=patient_strings)
rownames(patients) <- colnames(exprmat)

cell_types <- subset(annotations, cell_type != "")$cell_type
cell_type_genes <- subset(annotations, cell_type != "")$Name[order(cell_types)]

annorow <- as.data.frame(subset(annotations, select=c(cell_type)))
rownames(annorow) <- annotations$Name
colnames(annorow)[1] <- "Cell Type"

annocol <- patients

annocolors <- list("Cell Type"=get_colour_palette(unique(cell_types)), 
                   Patient=select_palette("patient"))

mat <- clip_values(t(scale(t(exprmat[cell_type_genes,]))), hi=2, lo=-2)

gaps <- which(rev(!duplicated(rev(cell_types[order(cell_types)]))))
```

```{r, fig.width=16, fig.height=16}
pheatmap(mat, annotation_row = annorow, annotation_col = annocol, annotation_colors = annocolors, 
         cluster_rows = FALSE, show_colnames = FALSE, clustering_method = "ward.D",
         gaps_row = gaps)
```

### Cell-type metagenes

```{r}
celltype_matrix <- create_celltype_matrix(exprs, labels, db_path)
```

```{r, fig.width=16, fig.height=4}
celltype_matrix_scaled <- clip_values(t(scale(t(celltype_matrix))), hi=2, lo=-2)

pheatmap(celltype_matrix_scaled, annotation_col = annocol, annotation_colors = annocolors, 
         cluster_rows = TRUE, show_colnames = FALSE, clustering_method = "ward.D")
```

#### Correlation with intrapatient XCR variability

```{r}
celltype_df <- data.frame(celltype_matrix %>% t, check.names = FALSE) %>% rownames_to_column(var = "condensed_id")
celltype_df$patient_id <- map_id(celltype_df$condensed_id, from = "condensed_id", to="patient_id",
                                 db_path)

mean_celltype_expression <- subset(celltype_df, select=-c(condensed_id)) %>% 
  group_by(patient_id) %>%
  summarise_all(function(x) mean(x, na.rm=TRUE))

sims_avg_tcr_expr <- merge(sims_avg$tcr, mean_celltype_expression, by="patient_id")
sims_avg_bcr_expr <- merge(sims_avg$bcr, mean_celltype_expression, by="patient_id")

sims_avg_tcr_expr$patient_id <- factor(sims_avg_tcr_expr$patient_id)
sims_avg_bcr_expr$patient_id <- factor(sims_avg_bcr_expr$patient_id)
```

```{r}
exprnames <- colnames(celltype_df)[!colnames(celltype_df) %in% c("condensed_id", "patient_id")]

sims_avg_melted <- lapply(list(tcr=sims_avg_tcr_expr, bcr=sims_avg_bcr_expr), function(merged_sims) {
  other_cols <- colnames(merged_sims)[!colnames(merged_sims) %in% exprnames]
  melt(merged_sims, id.vars = other_cols, measure.vars = exprnames,
       variable.name = "Type", value.name = "value")
})

pvals <- lapply(sims_avg_melted, function(merged_sims_melt) {
  setNames(ddply(merged_sims_melt, .(Type), function(x) {
    df <- subset(as.data.frame(x), !is.na(value))
    pval <- with(df, cor.test(mean_sim, value, method = "spearman")$p.value)
    eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
    return(as.character(as.expression(eq)))
  }), c("Type", "p.value"))
})

print(pvals$tcr)

print(pvals$bcr)
```

```{r, fig.width=6, fig.height=6}
ggplot(sims_avg_tcr_expr, aes(x=mean_sim, y=`CD8 T-cell`)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$tcr, Type == "CD8 T-cell")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE)

ggplot(sims_avg_tcr_expr, aes(x=mean_sim, y=`T helper cell`)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$tcr, Type == "T helper cell")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE)

ggplot(sims_avg_tcr_expr, aes(x=mean_sim, y=`T-cell`)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$tcr, Type == "T-cell")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE)

ggplot(sims_avg_tcr_expr, aes(x=mean_sim, y=`Th1 cell`)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$tcr, Type == "Th1 cell")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE)

ggplot(sims_avg_tcr_expr, aes(x=mean_sim, y=`Th2 cell`)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$tcr, Type == "Th2 cell")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE)

ggplot(sims_avg_bcr_expr, aes(x=mean_sim, y=`B-cell`)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) + annotate(geom="text", label = subset(pvals$bcr, Type == "B-cell")$p.value, x=Inf, y=Inf, hjust=1, vjust=1, parse=TRUE)
```

## Intrapatient variability (overall)

Requires that condensed_ids are never duplicated in any analysis type. I've inserted checks into each of the sections above to ensure this. 

### Nanostring

We'll work with cell-type metagenes as working with all expression values will be biased by the number of genes associated with each cell type. 

### TIL densities

```{r}
tiltypes <- c("T_CD8_density", "T_CD4_density",
              "T_CD20_density", "T_Plasma_density")

ihc_slide_data <- subset(ihc_table_slide, select=c("condensed_id", "patient_id", "slide", tiltypes))
ihc_melted <- melt(ihc_slide_data, id.vars = c("condensed_id", "patient_id", "slide"),
                   measure.vars = tiltypes, variable.name = "tiltype",
                   value.name = "density")
ihc_melted$tiltype <- as.character(ihc_melted$tiltype)
```


### XCR

```{r}

```

### Correlations

```{r}
T_CELL_TYPES <- c("T_CD8_density", "T_CD4_density")
B_CELL_TYPES <- c("T_CD20_density", "T_Plasma_density")
NANOSTRING_T_TYPES <- c("CD8 T-cell", "Cytotoxic cell", "T-cell", "T helper cell", "Th1 cell", "Th2 cell", "Th17 cell", "Treg", "Tcm", "Tem")
NANOSTRING_B_TYPES <- c("B-cell", "TFH")
XCR_T_TYPES <- c("tcr")
XCR_B_TYPES <- c("bcr")

cols <- colorRampPalette(c("#95cbee", "#ffd73e","#ce472e"))(100)

corplot <- function(var1, var2, sub1, sub2) {
  cor_df <- Reduce(function(x,y) merge(x,y, by="patient_id"), list(var1, var2))
  cor_matrix <- subset(cor_df, select=-c(patient_id))
  
  corres <- corr.test(cor_matrix, method = "pearson", adjust = "none")
  #hc <- hclust(dist(corres$r), method="ward.D")
  diag(corres$r) <- NA
  diag(corres$p) <- NA
  
  corres.r <- corres$r[sub1, sub2]
  corres.p <- corres$p[sub1, sub2]
  
  corres.p.labels <- format(signif(corres.p, 2), 2)
  
  pheatmap(corres.r, display_numbers = corres.p.labels, 
           color = cols, number_color = "white", 
           cluster_rows = TRUE, cluster_cols=TRUE)
}
```


#### TIL densities vs. XCR

```{r, fig.width=6, fig.height=4}
intersect_samples <- compute_sample_intersection(samples_list = list(ihc_samples, xcr_samples))

til_var <- compute_ihc_var(ihc_melted, ihc_table, samples = intersect_samples, til_variance_option = "stabilize", tiltypes)
xcr_var <- compute_xcr_var(distance_matrices, samples = intersect_samples, filter_tissue = FALSE, id_type = "condensed_id", db_path)

ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)

corplot(til_var, xcr_var, ihc_entries, xcr_entries)
```

#### TIL densities vs. Nanostring

```{r, fig.width=6, fig.height=4}
intersect_samples <- compute_sample_intersection(samples_list = list(ihc_samples, nanostring_samples))

til_var <- compute_ihc_var(ihc_melted, ihc_table, samples = intersect_samples, til_variance_option = "stabilize", tiltypes)
expr_var <- compute_expr_var(celltype_matrix, samples = intersect_samples, scale = TRUE)

ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)

corplot(til_var, expr_var, ihc_entries, expr_entries)
```

### XCR vs. Nanostring

```{r, fig.width=6, fig.height=4}
intersect_samples <- compute_sample_intersection(samples_list = list(xcr_samples, nanostring_samples))

xcr_var <- compute_xcr_var(distance_matrices, samples = intersect_samples, filter_tissue = FALSE, id_type = "condensed_id", db_path)
expr_var <- compute_expr_var(celltype_matrix, samples = intersect_samples, scale = TRUE)

xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)

corplot(xcr_var, expr_var, xcr_entries, expr_entries)
```

## Intrapatient variability (pairwise)

We can also look at consistency in intrapatient similarity on a pairwise basis. 

```{r}
scalar_dist_method <- "manhattan"

ihc_stabilized <- stabilize_ihc_variances(ihc_slide_data, ihc_table, tiltypes)
ihc_stabilized$condensed_id <- factor_id(ihc_stabilized$condensed_id, "condensed_id", db_path)
ihc_sims <- pairwise_similarity_frame(ihc_stabilized, id_col = id_type, group = "patient_id",
                          method = scalar_dist_method)

## always filter_tissue = FALSE
xcr_sims_raw_all <- lapply(names(distance_matrices), function(segment) {
  mat <- distance_matrices[[segment]]
  sims <- average_intrapatient_similarity(mat, filter_tissue = FALSE, id_type = id_type,
                                  db_path = db_path, raw = TRUE)
  sims <- subset(sims, select=-c(patient2))
  colnames(sims) <- mapvalues(colnames(sims), from = c("patient1", "dist"), to = c("patient_id", segment))
  return(sims)
})
names(xcr_sims_raw_all) <- names(distance_matrices)

celltype_df <- rownames_to_column(as.data.frame(t(celltype_matrix)), var = id_type)
celltype_df$patient_id <- map_id(celltype_df$condensed_id, from = "condensed_id", to="patient_id",
                                 db_path)
celltype_sims <- pairwise_similarity_frame(celltype_df, id_col = id_type, group = "patient_id",
                                           method=scalar_dist_method)
```

### TIL densities vs XCR

```{r}
ihc_xcr_sims <- Reduce(plyr::join, c(xcr_sims_raw_all, list(ihc_sims)))

ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)

cor_matrix <- subset(ihc_xcr_sims, select=c(ihc_entries, xcr_entries))

corres <- corr.test(cor_matrix, method = "spearman", adjust = "none")
#hc <- hclust(dist(corres$r), method="ward.D")
diag(corres$r) <- NA
diag(corres$p) <- NA

corres.r <- corres$r[ihc_entries, xcr_entries]
corres.p <- corres$p[ihc_entries, xcr_entries]

corres.p.labels <- format(signif(corres.p, 2), 2)

pheatmap(corres.r, display_numbers = corres.p.labels, 
         color = cols, number_color = "white", 
         cluster_rows = TRUE, cluster_cols=TRUE)
```


```{r}
xcr_sims_melted <- Reduce(plyr::join, xcr_sims_raw_all)

xcr_sims_melted <- melt(xcr_sims_melted, id.vars = colnames(xcr_sims_melted)[!colnames(xcr_sims_melted) %in% xcr_entries], measure.vars = xcr_entries, variable.name = "segment", value.name = "xcrsim")

ihc_sims_melted <- melt(ihc_sims, id.vars = colnames(ihc_sims)[!colnames(ihc_sims) %in% ihc_entries], measure.vars = ihc_entries, variable.name = "tiltype", value.name = "ihcsim")

ihc_xcr_melted <- merge(ihc_sims_melted, xcr_sims_melted, by=c("sample1", "sample2", "patient_id"))

ggplot(ihc_xcr_melted, aes(x=ihcsim, y=xcrsim)) + geom_point(aes(colour=factor(patient_id))) + 
  facet_wrap(tiltype ~ segment, scales = "free") + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient)

summary(lmer(xcrsim ~ ihcsim + (1|patient_id), data = subset(ihc_xcr_melted, tiltype == "T_CD8_density" & segment == "tcr" & !is.na(ihcsim) & !is.na(xcrsim))))
```

Critically, TCR similarity does not correlate with CD8+ or CD4+ T-cell density similarity. 

### TIL densities vs. Nanostring

```{r}
ihc_expr_sims <- Reduce(plyr::join, list(ihc_sims, celltype_sims))

ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)

cor_matrix <- subset(ihc_expr_sims, select=c(ihc_entries, expr_entries))

corres <- corr.test(cor_matrix, method = "spearman", adjust = "none")
#hc <- hclust(dist(corres$r), method="ward.D")
diag(corres$r) <- NA
diag(corres$p) <- NA

corres.r <- corres$r[ihc_entries, expr_entries]
corres.p <- corres$p[ihc_entries, expr_entries]

corres.p.labels <- format(signif(corres.p, 2), 2)

pheatmap(corres.r, display_numbers = corres.p.labels, 
         color = cols, number_color = "white", 
         cluster_rows = TRUE, cluster_cols=TRUE)
```

### XCR vs. Nanostring

```{r}
xcr_expr_sims <- Reduce(plyr::join, c(xcr_sims_raw_all, list(celltype_sims)))

xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)

cor_matrix <- subset(xcr_expr_sims, select=c(xcr_entries, expr_entries))

corres <- corr.test(cor_matrix, method = "spearman", adjust = "none")
#hc <- hclust(dist(corres$r), method="ward.D")
diag(corres$r) <- NA
diag(corres$p) <- NA

corres.r <- corres$r[xcr_entries, expr_entries]
corres.p <- corres$p[xcr_entries, expr_entries]

corres.p.labels <- format(signif(corres.p, 2), 2)

pheatmap(corres.r, display_numbers = corres.p.labels, 
         color = cols, number_color = "white", 
         cluster_rows = TRUE, cluster_cols=TRUE)
```

These results aren't really consistent with the patient-level results. It's important to note, however, that these correlations are calculated naive to the patient labels. i.e. there's no controlling for patient with a random intercept and slope, etc. 

Things become more difficult to interpret if we incorporate random intercepts and slopes. There could be high correlations within patients but low correlations between patients, and vice versa. 

Think about how we can show this in a statistically robust way. 

## IHC pairwise significance testing

```{r}
pairwise_results_filtered <- ihc_pairwise_significance(ihc_table_slide, tiltypes, db_path,
                                                       slide_count_filter = 10)
```

```{r}
Q_VALUE_THRESHOLD <- 0.01

pairwise_results_filtered$significant <- pairwise_results_filtered$q.value < Q_VALUE_THRESHOLD

pairwise_tiltype_summary <- pairwise_results_filtered %>% group_by(patient_id, tiltype) %>%
  summarise(any_significant=any(significant))

pairwise_tiltype_count <- pairwise_tiltype_summary %>% group_by(tiltype) %>% summarise(npatients=sum(any_significant, na.rm=TRUE))

pairwise_tiltype_count
```


```{r}
x <- pairwise_tiltype_summary %>% group_by(tiltype) %>% do(sigvec=.$any_significant)
x_pairs <- as_tibble(merge(x,x,by=c()))
x_pairs_p <- x_pairs %>% group_by(tiltype.x, tiltype.y) %>% summarise(p.value = pchisq(GenomicRanges::phicoef(table(unlist(sigvec.x), unlist(sigvec.y)))^2 * sum(table(unlist(sigvec.x), unlist(sigvec.y))), df = 1, lower.tail = FALSE))
x_pairs_p$q.value <- p.adjust(x_pairs_p$p.value, method = "fdr")

x_pairs_p
```
