首页 > 其他分享 >scanpy 去批次pipeline

scanpy 去批次pipeline

时间:2023-07-06 15:12:32浏览次数:36  
标签:pipeline plt False 批次 genes scanpy sc adata out

1. 脚本主要内容

  * 批量读取下机数据
  * 计算双细胞比例 
  * BBKNN去除批次效应
  * 去除细胞周期的影响 
  * 转换为seurat对象

2. 脚本

点击查看代码
import scanpy as sc
import anndata as an
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scrublet as scr
import scipy.io
import numpy as np
import os
import pandas as pd
from math import sqrt
import bbknn  
import random
from matplotlib.pyplot import rc_context
import datetime
import DIY
from DIY import get_tsne
sc.logging.print_versions()
sc.set_figure_params(facecolor="white", figsize=(8, 8))
sc.settings.verbosity = 3

# Reproducibility: fix the RNG seed for the whole run.
random.seed(234)

# Output locations for figures, tables and the final AnnData file.
figure_out = './figures'
table_out = './write'
results_file = 'write/sce.h5ad'

# Root directory holding one CellRanger output folder per sample.
datadir = './CellrangerOut/'

# Per-sample bookkeeping (currently unused placeholders).
sample_type = {}
samplelist = []
def InputData(SampleInfo):
    """Read per-sample CellRanger matrices, merge them, and run basic QC.

    Parameters
    ----------
    SampleInfo : str
        Path to a tab-separated table whose index holds the sample folder
        names (one CellRanger ``filtered_feature_bc_matrix`` per sample);
        the remaining columns are copied into ``adata.obs`` as metadata.

    Returns
    -------
    AnnData
        Concatenated matrix with QC metrics computed and basic gene/cell
        filtering applied. QC plots are written to ``figure_out``.
    """
    metadata = pd.read_csv(SampleInfo, header=0, index_col=0, sep='\t')
    filenames = metadata.index
    # One AnnData per sample; cache=True speeds up repeated reads.
    adatas = [sc.read_10x_mtx(datadir + filename + '/filtered_feature_bc_matrix', cache=True)
              for filename in filenames]
    for i, sample_adata in enumerate(adatas):
        sample_adata.obs['sample'] = metadata.index[i]
        for col in metadata.columns:
            # .iloc avoids the deprecated integer-positional fallback of
            # label-based Series indexing (metadata[col][i]).
            sample_adata.obs[col] = metadata[col].iloc[i]
    # Sample names become the batch-category suffix on barcodes.
    adata = adatas[0].concatenate(adatas[1:], batch_categories=metadata.index)
    adata.var_names_make_unique()
    sc.pl.highest_expr_genes(adata, n_top=20, save=True)

    # Basic filtering: drop near-empty cells and rarely detected genes.
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    # Mitochondrial genes (human 'MT-', mouse 'mt-'/'Mt-') for QC percentages.
    adata.var['mito'] = adata.var.index.str.startswith(('MT-', 'mt-', 'Mt-'))
    sc.pp.calculate_qc_metrics(adata,
                               expr_type='counts',  # default value
                               var_type='genes',    # default value
                               qc_vars=['mito'],
                               percent_top=None,
                               log1p=False,
                               inplace=True)

    sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mito'],
                 jitter=0.4, multi_panel=True, save=True)

    # Counts-vs-mito and counts-vs-genes scatter panels, saved as one PDF.
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
    plt.subplots_adjust(wspace=.5)
    sc.pl.scatter(adata, x='total_counts', y='pct_counts_mito', show=False, ax=ax1)
    sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', show=False, ax=ax2)
    fig.savefig(os.path.join(figure_out, 'scatter.pdf'),
                dpi=300, bbox_inches='tight')
    return adata

# Doublet detection ------------------------------------------------------
# Number of simulated doublets per observed cell handed to Scrublet.
sim_doublet_ratio = 2
def Compute_Doublet(adata,rate):
    """Score doublets with Scrublet; save scores table and diagnostic plots.

    Parameters
    ----------
    adata : AnnData
        Raw counts matrix (cells x genes).
    rate : float
        Expected doublet rate for the library (e.g. 0.06).

    Returns
    -------
    (DataFrame, ndarray, ndarray)
        Per-barcode table plus the raw score and boolean-call arrays.
    """
    counts = adata.to_df()
    n_cells = adata.shape[0]
    # Neighborhood size scales with sqrt(#cells).
    scrub = scr.Scrublet(counts,
                         expected_doublet_rate=rate,
                         n_neighbors=round(0.5 * sqrt(n_cells)),
                         sim_doublet_ratio=sim_doublet_ratio)
    ### annoy-1.15.1
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    scrub.plot_histogram()
    plt.savefig(os.path.join(figure_out,"histogram.pdf"),dpi=300, bbox_inches='tight')
    plt.show()

    # Project the Scrublet manifold with t-SNE for visual inspection.
    scrub.set_embedding('TSNE', scr.get_tsne(scrub.manifold_obs_, 0.5,10))
    scrub.plot_embedding('TSNE', order_points=True)
    plt.savefig(os.path.join(figure_out,"predicted_doublets.pdf"))
    plt.show()

    # Persist per-barcode scores and calls for downstream filtering.
    out_df = pd.DataFrame({'barcodes': counts.index,
                           'doublet_scores': doublet_scores,
                           'predicted_doublets': predicted_doublets})
    out_df.to_csv(os.path.join(table_out,'doublet.txt'), index=False,header=True)
    out_df.head()
    return out_df, doublet_scores, predicted_doublets

def Filter_cells(adata,doublet_scores,predicted_doublets,mito):
    """Drop predicted doublets, gene-count outliers and high-mito cells.

    Parameters
    ----------
    adata : AnnData
        QC'd matrix from InputData.
    doublet_scores, predicted_doublets : arrays
        Outputs of Compute_Doublet, aligned with adata's cells.
    mito : float
        Maximum allowed mitochondrial percentage per cell.

    Returns
    -------
    (AnnData, float)
        Filtered object and the 98th-percentile gene-count cutoff applied.
    """
    adata.obs["doublet_scores"] = doublet_scores
    adata.obs["predicted_doublets"] = predicted_doublets

    # '~' negates the boolean mask: keep only the singlet calls.
    singlet_mask = ~adata.obs.predicted_doublets
    adata = adata[singlet_mask, :]

    # Trim the top 2% of cells by detected genes (likely multiplets),
    # then enforce the mitochondrial-content ceiling.
    upper_limit = np.quantile(adata.obs.n_genes_by_counts, 0.98)
    gene_mask = adata.obs.n_genes_by_counts < upper_limit
    adata = adata[gene_mask, :]
    adata = adata[adata.obs.pct_counts_mito < mito, :]
    return adata, upper_limit

def CellCycleScoring(adata,cycle,species):
    """Assign cell-cycle scores/phases and save the pre-HVG object to disk.

    Parameters
    ----------
    adata : AnnData
        Filtered raw counts.
    cycle : str
        CSV (columns 1-4 used) holding 'hsa_s.genes', 'hsa_g2m.genes',
        'mmu_s.genes' and 'mmu_g2m.genes' gene lists.
    species : str
        'hsa' selects the human gene lists; anything else selects mouse.

    Returns
    -------
    AnnData
        With S_score/G2M_score/phase plus a 'Difference' column
        (S_score - G2M_score) used later to regress out cycle effects.
    """
    data = pd.read_csv(cycle,sep=',',header='infer',usecols=[1,2,3,4])
    # Pick the species-specific column pair.
    prefix = 'hsa' if species == 'hsa' else 'mmu'
    s_genes = list(data.loc[:, prefix + '_s.genes'])
    g2m_genes = list(data.loc[:, prefix + '_g2m.genes'])
    cell_cycle_genes = s_genes + g2m_genes

    # Normalize/log/scale so scoring operates on comparable values.
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.scale(adata,zero_center=False)

    sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

    # PCA restricted to cycle genes visualizes how strongly phase drives variance.
    cell_cycle_genes = [g for g in cell_cycle_genes if g in adata.var_names]
    adata_cc_genes = adata[:, cell_cycle_genes]
    sc.tl.pca(adata_cc_genes)
    sc.pl.pca_scatter(adata_cc_genes, color='phase',save='.Cycle.pdf')

    adata.obs['Difference'] = adata.obs['S_score'] - adata.obs['G2M_score']
    adata.write('./write/sce_raw.h5ad')
    return adata



def Run_Normalization(adata,n_neighbors,n_pcs,resolution):
    """Normalize, pick HVGs, regress out cell cycle, then embed and cluster.

    Parameters
    ----------
    adata : AnnData
        Filtered counts with a 'Difference' obs column from CellCycleScoring.
    n_neighbors, n_pcs : int
        kNN-graph parameters for sc.pp.neighbors.
    resolution : float
        Leiden clustering resolution.

    Returns
    -------
    AnnData
        HVG-subset object with PCA/UMAP embeddings and 'leiden' labels;
        a sample/cluster UMAP panel is saved to ``figure_out``.
    """
    # Preserve raw counts before normalization overwrites X.
    adata.layers['count'] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata,
                            n_top_genes=3000,
                            flavor='seurat',
                            subset=False,
                            batch_key='sample')
    sc.pl.highly_variable_genes(adata,save=True)
    # Explicit copy: slicing returns a view, and the in-place ops below
    # (regress_out/scale) would otherwise rely on an implicit copy and
    # emit ImplicitModificationWarning.
    adata = adata[:, adata.var.highly_variable].copy()

    # Remove the residual cell-cycle signal captured by S_score - G2M_score.
    sc.pp.regress_out(adata, ['Difference'])
    # Scale each gene to unit variance, clipping extreme values.
    sc.pp.scale(adata, max_value=10)

    sc.tl.pca(adata)
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=resolution, key_added='leiden')

    # Side-by-side UMAP: colored by sample (batch effect) and by cluster.
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    plt.subplots_adjust(wspace=.5)
    sc.pl.umap(adata, color=['sample'], frameon=False, ax=ax1, show=False)
    sc.pl.umap(adata, color=['leiden'], frameon=False, legend_loc='on data', ax=ax2, show=False)

    plt.savefig(os.path.join(figure_out,"umap.pdf"))
    plt.show()
    return adata

def Run_BBKNN(adata,n_neighbors,n_pcs,resolution):
    """Batch-correct the kNN graph with BBKNN, then cluster and plot.

    Parameters
    ----------
    adata : AnnData
        Output of Run_Normalization (normalized, HVG-subset, scaled).
    n_neighbors : int
        Unused by BBKNN itself (it chooses neighbors per batch internally);
        kept for signature compatibility with Run_Normalization.
    n_pcs : int
        Number of principal components BBKNN builds the graph from.
    resolution : float
        Leiden clustering resolution.

    Returns
    -------
    AnnData
        With the batch-balanced neighbor graph, UMAP and 'leiden' labels;
        also written to ``results_file``.
    """
    sc.tl.pca(adata)
    # BBKNN constructs the (batch-balanced) neighbor graph directly from
    # the PCA space and overwrites any existing graph, so the original
    # separate sc.pp.neighbors() call was redundant work and is dropped.
    sc.external.pp.bbknn(adata, batch_key='sample', n_pcs=n_pcs)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=resolution, key_added='leiden')

    # Side-by-side UMAP: sample mixing (batch-correction check) vs clusters.
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    plt.subplots_adjust(wspace=.5)
    sc.pl.umap(adata, color=['sample'], frameon=False, ax=ax1, show=False)
    sc.pl.umap(adata, color=['leiden'], frameon=False, legend_loc='on data', ax=ax2, show=False)

    plt.savefig(os.path.join(figure_out,"umap-BBKNN.pdf"))
    plt.show()

    # Standalone, publication-style cluster plot.
    with rc_context({'figure.figsize': (5, 5)}):
        sc.pl.umap(adata, color='leiden', add_outline=True, legend_loc='on data',
                   legend_fontsize=12, legend_fontoutline=2,frameon=False,
                   title='clustering of cells', palette='Set1',save ='-BBKNN-re.pdf')

    adata.write(results_file)
    return adata

def Show_Markers(adata):
    """Rank marker genes per Leiden cluster and export tables and plots.

    Reloads the all-genes object saved by CellCycleScoring, transfers the
    clustering/QC annotations onto it (Seurat-style column names for the
    downstream R conversion), then runs Wilcoxon marker detection and a
    battery of marker plots. Finally invokes the sceasy R script to convert
    the h5ad into a Seurat object.

    Parameters
    ----------
    adata : AnnData
        HVG-subset, clustered object from Run_BBKNN.

    Returns
    -------
    AnnData
        The all-genes object with marker results in ``.uns``.
    """
    ax = sc.pl.correlation_matrix(adata, 'leiden', figsize=(5,3.5),save= True)

    # Re-load the all-genes object saved before HVG subsetting and carry
    # over the per-cell annotations computed on the reduced object.
    adata_raw = sc.read_h5ad('./write/sce_raw.h5ad')
    adata_raw.obs = adata.obs
    adata_raw.obsm['X_umap'] = adata.obsm['X_umap']
    # BUG FIX: these are per-cell annotations living in .obs, not .obsm —
    # the original code indexed .obsm and would raise a KeyError at runtime.
    adata_raw.obs['seurat_clusters'] = adata.obs['leiden']
    adata_raw.obs['nCount_RNA'] = adata.obs['total_counts']
    adata_raw.obs['nFeature_RNA'] = adata.obs['n_genes']
    adata_raw.obs['percent.mt'] = adata.obs['pct_counts_mito']

    adata_raw.write('./write/sce_raw.h5ad')
    adata = adata_raw

    # Marker detection runs on log-normalized data; keep raw counts as a layer.
    adata.layers['count'] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    sc.settings.verbosity=2
    sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
    markers = sc.get.rank_genes_groups_df(adata, group=None, pval_cutoff=.05, log2fc_min=.25)
    markers.to_csv(os.path.join(table_out,'all_markers.csv'), index=False,header=True)

    # Top-5 markers per cluster, one violin panel per cluster.
    top5 = pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)
    top5.to_csv(os.path.join(table_out,'top5_markers.csv'),index=False,header =True)
    fig=plt.figure(figsize=(6,24),dpi=100)
    for i in top5.columns:
        # 2x7 grid of panels (supports up to 14 clusters); cluster labels
        # are stringified integers, so int(i)+1 is the 1-based panel index.
        plt.subplot(2, 7, int(i)+1)
        sc.pl.rank_genes_groups_violin(adata, groups=str(i), n_genes=5,show=False)
        plt.tight_layout()
        # BUG FIX: original wrote `plt.axis = 'off'`, which rebinds the
        # pyplot function to a string instead of hiding the axes.
        plt.axis('off')
    plt.savefig(os.path.join(figure_out,"top5-markers.png"))
    plt.show()

    # Scaled layer for the heat/matrix plots (symmetric color range).
    adata.layers['scaled'] = sc.pp.scale(adata, copy=True).X
    sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

    sc.pl.rank_genes_groups_matrixplot(adata, n_genes=3, use_raw=False, vmin=-3, vmax=3, cmap='bwr', layer='scaled',save =True)
    sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=3, cmap='bwr',save = True)

    sc.pl.rank_genes_groups_dotplot(adata, n_genes=3, values_to_plot='logfoldchanges', min_logfoldchange=3, vmax=7, vmin=-7, cmap='bwr',save = True)
    sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, use_raw=False, swap_axes=True, show_gene_labels=False,
                                vmin=-3, vmax=3, cmap='bwr',save =True)
    # NOTE(review): hard-coded interpreter path; consider making configurable.
    os.system('/PERSONALBIO/work/singlecell/s01/.conda/envs/py4/bin/Rscript ./sceasy.R')
    return adata

    
if __name__ == '__main__':
    # Entry point: QC -> doublet removal -> cell-cycle scoring ->
    # normalization/clustering -> BBKNN batch correction -> marker export.
    start = datetime.datetime.now()
    SampleInfo = './sample_info.txt'
    cycle = './cycle.gene.csv'
    species = 'hsa' ## or mmu
    adata = InputData(SampleInfo)
    out_df,doublet_scores,predicted_doublets = Compute_Doublet(adata,0.06) # expected doublet rate (default 0.06)
    adata,upper_limit = Filter_cells(adata,doublet_scores,predicted_doublets,10) # 10 = max mito percentage (original comment said "resolution" — incorrect)
    adata = CellCycleScoring(adata,cycle,species)
    adata = Run_Normalization(adata,50,50,0.99) # n_neighbors, n_pcs, leiden resolution
    adata = Run_BBKNN(adata,50,50,0.99)
    Show_Markers(adata)
    end = datetime.datetime.now()
    print("程序运行时间:"+str((end-start).seconds/3600)+"h")  # elapsed wall time in hours

3. 主要结果展示

  1. 去除批次作用前

  2. 去除批次作用后

标签:pipeline,plt,False,批次,genes,scanpy,sc,adata,out
From: https://www.cnblogs.com/-bonjour/p/17532179.html

相关文章

  • postgresql大表分批次删除
    [root@localhost~]#cat/root/delete_big_table.sh#!/bin/bash#$1对应表名,$2对应主键列,$3对应一次删除多少行i=`psql-h127.0.0.1-Upostgres-dtenant_1011046-c"selectceil(count(1)/${3}::float)from${1}whereplatcreatetime<'2023-04-3023:59:59'......
  • 说说设计模式~管道模式(pipeline)
    说明复合的责任链,类似于管道模式,只要符合条件,说会向下传递,不会终止算法说明按最高优先级去使用,符合就用,不符合就走下一个策略具体链条,有点像pipeline管道模式BlackHandlerip=172.17.0.11RateLimitHandlerheader=is-blackWriteBlackHandlerheader=real-black继承......
  • pipeline流水线脚本
    Pipeline流水线脚本pipeline{ agent{ label'slave1-apitest' } stages{ stage("拉取自动化测试代码"){ steps{ gitcredentialsId:'65623c68-96bc-4037-ab73-db5c091f358f',url:'https://gitee.com/huangshao1989/api-framework.git' ......
  • Scrapy_ImagePipeline保存图片
    创建一个项目scrapystartprojectmyfrist(project_name)创建一个爬虫scrapygenspider爬虫名爬虫地址需要安装pillowpipinstallpillow报错:twisted.python.failure.FailureOpenSSL.SSL.Error解决方案pipuninstallcryptographypipinstallcryptography==36.0.2代......
  • Jenkins + Jenkinsfile + Go自动化Pipeline进行部署
    环境准备(因内容繁琐请自行搭建或问度娘)俺也会逐步更新相关文章Jenkins环境jenkins凭据管理Pipeline语法安装钉钉插件并配置钉钉机器人linux服务器go本地目录结构(微服务)服务器文件目录/home/ubuntuJenkinsfile文件文件名为: Jenkinsfilepipeline{agentanyenvironm......
  • transformers库的使用【一】——pipeline的简单使用
    transformers库的使用使用pipelineAPI来快速使用一些预训练模型使用预训练模型最简单的方法就是使用pipeline(),transformers提供了一些任务:1、情感分析(Sentmentanalysis):分析文本是正面的还是负面的2、文本生成(inEnglish):提供一个语句,模型将生成这条语句的下一句3、命名实体识......
  • 2.6 类神经网路训练不起来怎么办 (五):批次标准化 (Batch Normalization)简介
    1.提出背景  在前文,我们提过\(error\surface\)在不同方向的斜率不一样,因此采用固定的学习率很难将模型\(train\)起来,上节提出了自适应学习率,这里还有一个方法就是直接将e\(rror\surface\)铲平.  或许首先想要提出的是为什么会产生不同方向上斜率相差很大的现象.观察......
  • Jenkins Pipeline 密钥实现远程部署
    前提:已配置jenkins秘钥凭证 一、配置流程1.1片段生成1、按如下图选择2、新增密钥信息1.2脚本配置以上配置完成后,接下来就可以在Jenkinsfile中配置了,:stages{stage('xx启动'){steps{echo"xx启动"dir("${SRC_PATH}")......
  • 【Jenkins系列】-Pipeline语法全集
    Jenkins为您提供了两种开发Pipeline的方式:脚本式和声明式。脚本式流水线(也称为“传统”流水线)基于Groovy作为其特定于域的语言。而声明式流水线提供了简化且更友好的语法,并带有用于定义它们的特定语句,而无需学习Groovy。声明式流水线语法错误在脚本开始时报告。这是一个很好的功能,......
  • GitlabCI学习笔记之五:GitLabRunner pipeline语法之artifacts dependencies
    artifacts用于指定在作业成功或者失败时应附加到作业的文件或目录的列表。作业完成后,工件将被发送到GitLab,并可在GitLabUI中下载。artifacts:paths路径是相对于项目目录的,不能直接链接到项目目录之外。将制品设置为target目录artifacts:paths:-target/禁用工件......