作者,Evil Genius
今天我们要完成一项工程,那就是CNV层级轨迹,如下图:
首先来看一下什么是CNV谱系
拷贝数变异(CNVs)在细胞增殖过程中在基因组中积累,提供了系统发育谱系的特征。通过转录组数据上使用intercnv生成CNV谱,能够以空间和时间分辨率跟踪克隆系统发育。
文章空间转录组数据分析之CNV burden和CNV聚类(python版本)已经做了基础的CNV分析和聚类,这个是前提。
CNV谱系如下:
我们来完成这项工程,示例数据大家自行下载,大家都是生信资深人员了,中间过程就不啰嗦了。
from IPython.display import display, HTML, Image
display(HTML("<style>.container { width:95% !important; }</style>"))
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
import os
import copy
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from anndata import AnnData
import matplotlib
from matplotlib import cm
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
import sys
sys.path.append("../../lib")
from stpalette import palette_WM4007_rna, palette_WM4237_rna
from plots import plotSpatialAll, plotSpatialEdge, euclidean_p1
from cheatmap import sMED, cdist, cplot
from utils import loadAdImage, filterCNV
model = 'WM4007'
palette_rna = palette_WM4007_rna if model=='WM4007' else palette_WM4237_rna
preprocessedStDataPath = 'd:/from HPCC 11 28 2022/results_NF1-nod-t2t-k35/%s/' % model
dataPath = '../../data/'
ids = sorted(np.loadtxt(dataPath + 'ids_%s_ST.txt' % model, dtype=str))
sids = [id[7:12] for id in ids]
from scipy.spatial.distance import cdist, pdist, squareform
ad_all = sc.read(dataPath + 'ad_all_human_clustered_st_%s.h5ad' % model)
ad_all = ad_all[ad_all.obs['human_ratio']>=0.25]
filter_spots_index = ad_all.obs.index.copy()
读取空间数据
以下内容为付费内容,定价 ¥200.00点击修改
ad_cnv = sc.read(dataPath + 'ad_all_human_clustered_cnv_%s.h5ad' % model)
ad_cnv = ad_cnv[ad_cnv.obs.index.intersection(filter_spots_index), :]
plt.rcParams["figure.figsize"] = (4, 3)
sc.pl.embedding(ad_cnv[ad_cnv.obs.index.intersection(filter_spots_index), :], color=['cluster', 'cnv_clusters'], basis="X_umap", title='time', palette=palette_rna, size=25)
dfc = ad_cnv.obs[['cnv_clusters', 'sample']].value_counts().unstack().fillna(0).astype(int)
dfc
cnv Burden
use_raw_distances = False
metric = 'cosine'
if use_raw_distances:
df_cnv = ad_cnv.to_df()
df_cnv.index = ad_cnv.obs['cnv_clusters']
else:
df_cnv = pd.DataFrame(ad_cnv.obsm['X_pca'], index=ad_cnv.obs['cnv_clusters'])
uclusters = list(ad_cnv.obs['cnv_clusters'].unique())
res = dict()
for c1 in uclusters:
for c2 in uclusters:
if (c1!=c2) and (not (c1, c2) in res.keys()) and (not (c2, c1) in res.keys()):
print(c1, '\t', c2, '\t', df_cnv.loc[c1].shape[0], df_cnv.loc[c2].shape[0])
d = cdist(df_cnv.loc[c1], df_cnv.loc[c2], metric=metric).mean() # euclidean cosine
res.update({(c1, c2): d})
df_pdist = pd.Series(res).unstack()
df_ppdist = df_pdist.reindex(uclusters).T.reindex(uclusters).fillna(0.) + df_pdist.reindex(uclusters).T.reindex(uclusters).T.fillna(0.)
df_ppdist = df_pdist.reindex(uclusters).T.reindex(uclusters).fillna(0.) + df_pdist.reindex(uclusters).T.reindex(uclusters).T.fillna(0.)
df_styled = df_ppdist.style.background_gradient(axis=None, vmin=df_ppdist[df_ppdist!=0].min().min(), cmap='bwr')
df_styled.to_excel(f'{model}_PCA_cosine_distance_average_over_cnv_clusters.xlsx')
df_styled
Load spatial
images = {id: [(ad_temp := sc.read(preprocessedStDataPath + '%s/st_adata_processed.h5ad' % id)).uns['spatial'], ad_temp.obs.index, ad_temp.obsm['spatial']] for id in ids}
ads = dict()
for sample in ids:
print(sample, end='\t')
ads[sample] = ad_cnv[ad_cnv.obs['id']==sample, :].copy()
ads[sample].uns['spatial'] = images[sample][0]
ads[sample].obsm['spatial'] = pd.DataFrame(index=images[sample][1], data=images[sample][2]).reindex(ads[sample].obs['original_barcode']).values
ads[sample][:, :] = filterCNV(ads[sample].to_df().T, n=10).T
plotSpatialAll(ads, identity='cnv_clusters', palette=palette_rna, f=0.75)
plotSpatialAll({id: ads[id] for id in ids if not '_TE_' in id and '_S2_' in id}, identity='cnv_cosine_distance', palette=palette_rna, nx=6, f=0.6, cmap='nipy_spectral', uniformColorScale=True, wspace=0.3)
params = dict(fontsize=12, markerscale=1.5, linewidth_factor=2.0, alpha=0.5, alpha_img=0.5, decay_power=4, absolute_cutoff=None, R=1, # 20
alpha_quantile=0.99, method='cosine', cacheEdges=False, useSavedCache=False)
res=plotSpatialEdge({id: ads[id] for id in ids if not '_TE_' in id}, identity='cluster', palette=palette_rna, f=1.0, invert=False, use_raw_distances=False, noplot=False, **params)
params = dict(fontsize=12, markerscale=1.25, maintainScale=False, linewidth_factor=2.0, alpha=0.65, alpha_img=0.5, decay_power=4, absolute_cutoff=None, R=1, # 20
alpha_quantile=0.99, method='cosine', cacheEdges=False, useSavedCache=False)
res=plotSpatialEdge({id: ads[id] for id in ids if (('_T0_' in id) and (not '_S2_' in id)) or (('_T1_' in id) and (not '_S2_' in id)) or (('_TC_' in id) and (not '_S1_' in id))},
identity='cluster', palette=palette_rna, f=0.75, nx=3, invert=False, use_raw_distances=False, noplot=False, suptitle=False, **params)
params = dict(fontsize=12, markerscale=1.25, maintainScale=False, linewidth_factor=2.0, alpha=0.65, alpha_img=0.5, decay_power=4, absolute_cutoff=0.8, R=1, # 20
alpha_quantile=0.99, method='cosine', cacheEdges=False, useSavedCache=False)
res=plotSpatialEdge({id: ads[id] for id in ids if (('_T0_' in id) and (not '_S2_' in id)) or (('_T1_' in id) and (not '_S2_' in id)) or (('_TC_' in id) and (not '_S1_' in id))},
identity='cluster', palette=palette_rna, f=0.75, nx=3, invert=False, use_raw_distances=False, noplot=False, suptitle=False, **params)
params = dict(fontsize=12, markerscale=1.25, maintainScale=False, linewidth_factor=2.0, alpha=0.65, alpha_img=0.5, decay_power=4, absolute_cutoff=0.8, R=1, # 20
alpha_quantile=0.99, method='cosine', cacheEdges=False, useSavedCache=False)
res=plotSpatialEdge({id: ads[id] for id in ids if (('_T0_' in id) and (not '_S2_' in id)) or (('_T1_' in id) and (not '_S2_' in id)) or (('_TC_' in id) and (not '_S1_' in id))},
identity='cnv_clusters', palette=palette_rna, f=0.75, nx=3, invert=False, use_raw_distances=False, noplot=False, suptitle=False, **params)
cnv_palette = {str(i): cm.Set3(c-1) for i, c in enumerate([2,1,7,9,5,11,6,10,8,4,3])}
cnv_palette.update({'T0': '#DE8C00', 'T1': '#7CAE00', 'T2': '#00C08B', 'T3': '#B79F00', 'T4': '#C77CFF', 'TC': '#FF64B0', '-1': 'white'})
from vplot import wrapVplotRatio, wrapVplotScore, wrapVplotGene, vplot
params = dict(fontsize=12, markerscale=1.25, maintainScale=False, linewidth_factor=2.0, alpha=0.95, alpha_img=0.5, decay_power=4, absolute_cutoff=2.0, R=1, # 20
alpha_quantile=0.99, method='cosine', cacheEdges=False, useSavedCache=False)
res=plotSpatialEdge({id: ads[id] for id in ids if (('_T0_' in id) and (not '_S2_' in id)) or (('_T1_' in id) and (not '_S2_' in id)) or (('_TC_' in id) and (not '_S1_' in id))},
identity='cnv_clusters', palette=cnv_palette, f=0.75, nx=3, invert=False, use_raw_distances=False, noplot=False, suptitle=False, **params)
params = dict(fontsize=12, markerscale=1.25, maintainScale=False, linewidth_factor=2.0, alpha=0.95, alpha_img=0.5, decay_power=4, absolute_cutoff=0.8, R=1, # 20
alpha_quantile=0.99, method='cosine', cacheEdges=False, useSavedCache=False)
res=plotSpatialEdge(ads, identity='cnv_clusters', palette=cnv_palette, f=0.75, nx=4, invert=False, use_raw_distances=False, noplot=False, suptitle=False, **params)
def wrapVplotScoreLoc(score, genesDicts=None, identity=None, selected=[], palette=None, ads=None, ids=None, vprefix='score_', **kwargs):
if len(selected)==0:
selected = pd.Series(np.hstack([ads[id].obs[identity] for id in ids])).sort_values().unique().tolist()
df = pd.concat([pd.Series(ads[id][ads[id].obs[identity].isin(selected), :].obs[vprefix + score].fillna(0).values) for id in ids], keys=ids, axis=1)
dfc = pd.concat([pd.Series(ads[id][ads[id].obs[identity].isin(selected), :].obs[identity].values.astype(int)) for id in ids], keys=ids, axis=1)
return vplot(df, dfc, name=vprefix + score, palette=palette, **kwargs)
for sample in ids:
ads[sample].obs['score_avgdist'] = ads[sample].obs['avgdist']
wrapVplotScoreLoc('avgdist', identity='cnv_clusters', palette=cnv_palette, ads=ads, ids=ids, vmin=-0.1)
# wrapVplotScoreLoc('avgdist', identity='cluster', palette=palette_rna, ads=ads, ids=ids)
def sMED(v1, v2):
"""Simple Minimum Event Distance (MED), similar (but not equivalent!) to the one implemented in MEDALT.
MED is the minimal number and the series of single-copy gains or losses that are required to evolve one genome to another.
"""
a = v1 - v2
a = a[np.roll(a, -1) != a]
a = np.abs(a[a != 0]).sum()
return int(a)
v1 = np.array([0,0,0,0,1,1,1,1,0,0,0,0,-1,-1,-1,0,0,0,1,1,0,0,0,1])
v2 = np.array([0,0,0,0,0,1,1,0,0,0,0,0,-1,-1,-1,0,0,0,-1,-1,0,0,0,1])
plt.rcParams["figure.figsize"] = (4, 4)
ad_cnv.obs['resistant'] = ((ad_cnv.obs['cluster'].isin(['8'])) & (ad_cnv.obs['time'].isin(['T4']))).astype(int).astype(str).astype('category')
ad_cnv.obs['resistantEarly'] = ((ad_cnv.obs['cluster'].isin(['8'])) & (ad_cnv.obs['time'].isin(['T1', 'T2', 'T3']))).astype(int).astype(str).astype('category')
sc.pl.embedding(ad_cnv, color=['time', 'cluster', 'resistant', 'resistantEarly'], basis="X_umap", palette=palette_rna, size=25)
identity = 'resistant'
sc.tl.embedding_density(ad_cnv, basis='umap', groupby=identity)
sc.pl.embedding_density(ad_cnv, groupby=identity, ncols=10, s=5)
ad_all = sc.read(dataPath + 'ad_all_human_clustered_st_%s.h5ad' % model)
ad_all[(ad_all.obs['cluster'].isin(['8'])) & (ad_cnv.obs['time'].isin(['T1', 'T2', 'T3', 'T4']))
identity = 'resistantEarly'
sc.tl.embedding_density(ad_cnv, basis='umap', groupby=identity)
sc.pl.embedding_density(ad_cnv, groupby=identity, ncols=10, s=50)
sc.pl.embedding(ad_cnv, color=['resistant', 'resistantEarly'], basis="X_umap", palette=palette_rna, size=5)
Load images and create ADs
images = {id: [(ad_temp := sc.read(preprocessedStDataPath + '%s/st_adata_processed.h5ad' % id)).uns['spatial'], ad_temp.obs.index, ad_temp.obsm['spatial']] for id in ids}
ads = dict()
for sample in ids:
print(sample, end='\t')
ads[sample] = ad_cnv[ad_cnv.obs['id']==sample, :].copy()
ads[sample].uns['spatial'] = images[sample][0]
ads[sample].obsm['spatial'] = pd.DataFrame(index=images[sample][1], data=images[sample][2]).reindex(ads[sample].obs['original_barcode']).values
ads[sample][:, :] = filterCNV(ads[sample].to_df().T, n=10).T
params2 = dict(fontsize=12, markerscale=1.5, linewidth_factor=1.0, alpha=1.0, alpha_img=1.0, decay_power=1,
absolute_cutoff=0.4, alpha_quantile=0.75, method='pearson', cacheEdges=False)
df_info = plotSpatialEdge({(id:='WM4007_T3_S1_ST'): ads[id]}, identity='cluster', palette=palette_rna, f=2, invert=False, **params2)
ad = ads['WM4007_T3_S1_ST']
df_temp, df_meta_temp = ad.to_df().T, ad.obs[['time', 'cluster']].sort_values(by=['time', 'cluster'], ascending=False)
df_temp = df_temp[df_meta_temp.index]
cplot(df_temp, df_meta_temp, sampleMED=1000, clusterObsByGroups=False, optimalOrderingForObs=False,
palette=palette_rna, clusterVar=False, clusterObs=True, addLinesOnHeatmap=False, addLinesOnGroups=False, useMEDforObsGroups=True, useMEDforObs=True,
colorbarLabel='InferCNV CNV', colorbarLabels=['Ampl.', 'Del.'], safetyLimit=2000, figsize=[14,8], dendrogramLineWidth=1.0);
dataPath = '../../data/'
dfc_WM4007 = sc.read(dataPath + 'ad_all_scaled_unfiltered_st_WM4007.h5ad').obs.set_index(['T', 'sample'])['in_tissue'].groupby(level=[0,1]).count().replace(0, np.nan).dropna()
dfc_WM4007_h = sc.read(dataPath + 'ad_all_human_clustered_st_WM4007.h5ad').obs.set_index(['T', 'sample'])['in_tissue'].groupby(level=[0,1]).count().replace(0, np.nan).dropna()
params2 = dict(fontsize=12, markerscale=1.5, linewidth_factor=1.0, alpha=1.0, alpha_img=1.0, decay_power=2, absolute_cutoff=70, alpha_quantile=0.75, method='MED', cacheEdges=False)
plotSpatialEdge(ads, identity='cluster', palette=palette_rna, f=1., invert=False, **params2)
params = dict(fontsize=12, markerscale=1.5, linewidth_factor=1.0, alpha=1.0, alpha_img=1.0, decay_power=2, absolute_cutoff=70, method='MED', alpha_quantile=0.75, cacheEdges=True)
plotSpatialEdge(ads, identity='cluster', palette=palette_rna, f=1., invert=False, **params)