%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad
import pickle
from pathlib import Path

plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['svg.fonttype'] = 'none'
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['font.size'] = 12

NOTEBOOK_DIR = Path('.').resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parents[1]
import sys
sys.path.insert(0, str(PROJECT_ROOT))

DATA_DIR = NOTEBOOK_DIR / 'data'
RESULTS_DIR = NOTEBOOK_DIR / 'results'
FIG_DIR = NOTEBOOK_DIR / 'figures'
FIG_DIR.mkdir(exist_ok=True)

from src.eval_utils import check_alignment, check_triangle_violations
from src.metacell_utils import unpack_metacell_matches

print(f'Project root: {PROJECT_ROOT}')

Project root: /hpc/group/singhlab/rawdata/ap756/1024_same/heart/SAME

CELL_TYPES = ['Endothelial cells', 'Epithelial cells', 'Fibroblasts',
              'Lymphoid cells', 'Myeloid cells']

# Set1 palette matching original notebooks
CT_COLORS = dict(zip(CELL_TYPES, sns.color_palette('Set1', n_colors=len(CELL_TYPES))))

# Protein = query (aligned), RNA = template (ref)
alignDF = pd.read_csv(DATA_DIR / 'prot_df.csv', index_col=0)
refDF = pd.read_csv(DATA_DIR / 'mer_df.csv', index_col=0)

alignDF['X'] = alignDF['transformed_x']
alignDF['Y'] = alignDF['transformed_y']
refDF['X'] = refDF['transformed_x']
refDF['Y'] = refDF['transformed_y']

alignDF[CELL_TYPES] = alignDF[CELL_TYPES] * 100
refDF[CELL_TYPES] = refDF[CELL_TYPES] * 100
alignDF['cell_type'] = alignDF[CELL_TYPES].idxmax(axis=1)
refDF['cell_type'] = refDF[CELL_TYPES].idxmax(axis=1)

print(f'Template (RNA):  {len(refDF)} cells')
print(f'Query (Protein): {len(alignDF)} cells')
print(f'\nCell type distribution (RNA template):')
print(refDF['cell_type'].value_counts())

Template (RNA):  3608 cells
Query (Protein): 4671 cells

Cell type distribution (RNA template):
cell_type
Epithelial cells     1789
Fibroblasts          1239
Endothelial cells     370
Lymphoid cells        137
Myeloid cells          73
Name: count, dtype: int64

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, df, title in zip(axes,
                          [refDF, alignDF],
                          ['a. RNA Template (MERSCOPE)', 'b. Protein Query (PCF)']):
    for ct in CELL_TYPES:
        mask = df['cell_type'] == ct
        if mask.sum() > 0:
            ax.scatter(df.loc[mask, 'X'], df.loc[mask, 'Y'],
                       s=5, alpha=0.7, label=ct, color=CT_COLORS[ct])
    ax.set_title(title, fontsize=14, loc='left', fontweight='bold')
    ax.set_aspect('equal')
    ax.invert_yaxis()
    ax.set_axis_off()

axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left', markerscale=3, fontsize=9)
plt.tight_layout()
plt.savefig(FIG_DIR / 'Fig4ab_cell_types.svg', dpi=300, bbox_inches='tight')
plt.show()

main_dir = RESULTS_DIR / 'dp10_knn8_MS1'
mc_align = pickle.load(open(main_dir / 'mc_align.pkl', 'rb'))
mc_ref = pickle.load(open(main_dir / 'mc_ref.pkl', 'rb'))
matchesDF = pd.read_csv(main_dir / 'matchedDF.csv')

matchesDF['cell_type'] = matchesDF[CELL_TYPES].idxmax(axis=1)
matchesDF['SAME_X'] = matchesDF['ref_X']
matchesDF['SAME_Y'] = matchesDF['ref_Y']

mc_ref_df = mc_ref.metacell_df.copy()
mc_ref_df['cell_type'] = mc_ref_df[CELL_TYPES].idxmax(axis=1)
mc_ref_df['SAME_X'] = mc_ref_df['X']
mc_ref_df['SAME_Y'] = mc_ref_df['Y']

print(f'Matched cells: {len(matchesDF)}')

Matched cells: 3579

alignDF_eval, asc1 = check_alignment(matchesDF, mc_ref_df,
                                      xcol='SAME_X', ycol='SAME_Y',
                                      ctype_col='cell_type', kNN=1)
accuracy = 100 * asc1

# Triangle violations
matchesDF.index = matchesDF['Aligned_metacell_id'].values
tri_df, stats = check_triangle_violations(
    matchesDF, mc_align,
    aligned_id_col='Aligned_metacell_id', ref_id_col='Ref_metacell_id',
    mapped_x_col='ref_X', mapped_y_col='ref_Y',
    cell_type_col='cell_type', ignore_same_type_triangles=False, verbose=False)
violations = stats['percent_flipped']

print(f'dp=10, knn=8, MS=1:')
print(f'  1-NN cell type accuracy: {accuracy:.1f}%')
print(f'  Triangle violations: {violations:.1f}%')
print(f'  Matched cells: {len(matchesDF)}')

dp=10, knn=8, MS=1:
  1-NN cell type accuracy: 84.2%
  Triangle violations: 18.0%
  Matched cells: 3579

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# a. RNA template
for ct in CELL_TYPES:
    mask = mc_ref_df['cell_type'] == ct
    axes[0].scatter(mc_ref_df.loc[mask, 'X'], mc_ref_df.loc[mask, 'Y'],
                    s=3, alpha=0.7, color=CT_COLORS[ct], label=ct)
axes[0].set_title('a. RNA Template', fontsize=13, loc='left', fontweight='bold')
axes[0].invert_yaxis()
axes[0].set_aspect('equal')
axes[0].set_axis_off()

# b. SAME-aligned protein
for ct in CELL_TYPES:
    mask = alignDF_eval['cell_type'] == ct
    axes[1].scatter(alignDF_eval.loc[mask, 'SAME_X'], alignDF_eval.loc[mask, 'SAME_Y'],
                    s=3, alpha=0.7, color=CT_COLORS[ct], label=ct)
axes[1].set_title(f'b. SAME-Aligned Protein ({accuracy:.1f}%)', fontsize=13, loc='left', fontweight='bold')
axes[1].invert_yaxis()
axes[1].set_aspect('equal')
axes[1].set_axis_off()

# c. Correct vs incorrect 1-NN
correct = alignDF_eval[alignDF_eval['_1NN_match'] == True]
incorrect = alignDF_eval[alignDF_eval['_1NN_match'] == False]
axes[2].scatter(correct['SAME_X'], correct['SAME_Y'], s=3, alpha=0.5, color='blue', label='Correct')
axes[2].scatter(incorrect['SAME_X'], incorrect['SAME_Y'], s=3, alpha=0.5, color='red', label='Incorrect')
axes[2].set_title('c. 1-NN Cell Type Match', fontsize=13, loc='left', fontweight='bold')
axes[2].invert_yaxis()
axes[2].set_aspect('equal')
axes[2].set_axis_off()
axes[2].legend(markerscale=3)

plt.tight_layout()
plt.savefig(FIG_DIR / 'Fig4_spatial_alignment.svg', dpi=300, bbox_inches='tight')
plt.savefig(FIG_DIR / 'Fig4_spatial_alignment.png', dpi=300, bbox_inches='tight')
plt.show()

# Load full anndata objects
pcfAD = sc.read_h5ad(DATA_DIR / 'pcfSubAD.h5ad')
merAD = sc.read_h5ad(DATA_DIR / 'merSubAD.h5ad')

# Unpack metacell matches to individual cells
matchesDF_raw = pd.read_csv(main_dir / 'matchedDF.csv')  # fresh copy
matchesDF_expanded = unpack_metacell_matches(
    matchesDF_raw, mc_align.metacell_df, mc_ref.metacell_df,
    aligned_df=mc_align.original_df, ref_df=mc_ref.original_df,
    strategy='distribute',
    aligned_original_idx_col=mc_align.original_idx_col,
    ref_original_idx_col=mc_ref.original_idx_col)

print(f'Expanded matches: {len(matchesDF_expanded)}')
matchesDF_expanded.head()

Expanded matches: 3579

# Index pcfAD by Object ID for subsetting
pcfAD.obs.index = pcfAD.obs['Object ID'].values.astype(str)

pcf_matched_idx = matchesDF_expanded['Aligned_cell_id'].values.astype(str)
mer_matched_idx = matchesDF_expanded['Ref_cell_id'].values.astype(str)

new_pcf = pcfAD[pcf_matched_idx].copy()
new_mer = merAD[mer_matched_idx].copy()

# Add SAME coordinates (use RNA template positions)
new_pcf.obs['CellType'] = new_pcf.obs['cell_type'].values
new_pcf.obs['SAME_X'] = new_mer.obs['transformed_x'].values
new_pcf.obs['SAME_Y'] = new_mer.obs['transformed_y'].values
new_pcf.obs['pred_cell_type'] = new_mer.obs['pred_cell_type'].values
new_pcf.obsm['X_SAME_spatial'] = new_pcf.obs[['SAME_X', 'SAME_Y']].values
new_pcf.obsm['X_spatial_rna'] = new_mer.obs[['transformed_x', 'transformed_y']].values

# Prefix var_names
new_pcf.var_names = ['protein_' + x for x in new_pcf.var_names]
new_mer.var_names = ['mRNA_' + x for x in new_mer.var_names]
sc.pp.normalize_total(new_mer, target_sum=1e2)
sc.pp.log1p(new_mer)

# Combine protein + RNA
combined_X = np.hstack([new_pcf.X, new_mer.X])
combined_var = pd.concat([new_pcf.var, new_mer.var])
combined_obs = new_pcf.obs.copy()
combined_ad = ad.AnnData(X=combined_X, obs=combined_obs, var=combined_var)
combined_ad.obsm['X_SAME_spatial'] = new_pcf.obsm['X_SAME_spatial']
combined_ad.obsm['X_spatial_rna'] = new_pcf.obsm['X_spatial_rna']

print(f'Combined AnnData: {combined_ad.shape}')
print(f'Cell type match rate: {(combined_ad.obs["cell_type"] == combined_ad.obs["pred_cell_type"]).mean():.1%}')

Combined AnnData: (3579, 330)
Cell type match rate: 84.2%

marker_genes = [
    'protein_Pan-Cytokerati', 'mRNA_KRT6A', 'mRNA_KRT4',
    'protein_SMA', 'mRNA_PLVAP', 'mRNA_SELE',
    'protein_VIM', 'mRNA_MGP', 'mRNA_SFRP2',
    'protein_CD3', 'mRNA_CD3E',
    'protein_CD68', 'mRNA_CD14',
]
xlab = ['PanCK', 'KRT6A', 'KRT4',
        'SMA', 'PLVAP', 'SELE',
        'VIM', 'MGP', 'SFRP2',
        'CD3', 'CD3E',
        'CD68', 'CD14']

# Use short names directly as var_names
plot_ad = combined_ad[:, marker_genes].copy()
plot_ad.var.index = pd.Index(xlab)

# Color map: protein=blue, mRNA=red
protein_color, mrna_color = '#1f77b4', '#d62728'
is_protein = [g.startswith('protein_') for g in marker_genes]
label_colors = {xlab[i]: protein_color if is_protein[i] else mrna_color
                for i in range(len(xlab))}

sc.pl.matrixplot(
    plot_ad, var_names=xlab,
    categories_order=['Epithelial cells', 'Endothelial cells', 'Fibroblasts',
                      'Lymphoid cells', 'Myeloid cells'],
    groupby='cell_type', standard_scale='var', cmap='cividis',
    swap_axes=True, show=False)

# Color y-axis labels by modality
fig = plt.gcf()
for ax in fig.axes:
    for label in ax.get_yticklabels():
        t = label.get_text()
        if t in label_colors:
            label.set_color(label_colors[t])
            label.set_fontweight('bold')

fig.savefig(FIG_DIR / 'Fig4_matrixplot.svg', bbox_inches='tight')
fig.savefig(FIG_DIR / 'Fig4_matrixplot.png', dpi=300, bbox_inches='tight')
plt.show()

bash run_same.sh --dp 10 --knn 8 --ms 1

Parameter	Value
`delaunay_penalty`	10
`window_size`	4000
`overlap`	300
`max_matches`	1
`radius` / `knn`	300 / 8
`metacell_size`	1
`lazy_constraints`	True

	Aligned_cell_id	Ref_cell_id
0	6c8df374-8566-4a60-b133-eab85bba713d	3443715200824100046
1	a2b8665d-1a3b-41c0-b377-cc7f0bc63f8d	3443715200751100112
2	8a6f6e48-4604-4dae-8e92-47f2ed17b539	3443715200751100121
3	e390053d-e183-4619-9404-8cd19b907c0d	3443715200751100113
4	18906cfb-2603-493d-ad17-7b904e729a51	3443715200751100115

Setup¶

1. Load Input Data¶

2. Fig 4a,b — Cell Types in RNA Template and Protein Query¶

3. Load SAME Results (dp=10)¶

4. Cell Type Accuracy¶

5. Fig 4c–e — Spatial Alignment Quality¶

(Optional) Full Reproduction¶

MERSCOPE Tongue — Protein + RNA Cross-Modal Alignment¶

Setup¶

1. Load Input Data¶

2. Fig 4a,b — Cell Types in RNA Template and Protein Query¶

3. Load SAME Results (dp=10)¶

4. Cell Type Accuracy¶

5. Fig 4c–e — Spatial Alignment Quality¶

6. Cross-Modal Integration — Combined Protein + RNA AnnData¶

7. Fig 4 — Cross-Modal Matrixplot¶

(Optional) Full Reproduction¶