# suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
import os
import time
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pybiomart as pbm
import pyranges as pr
import requests
import scanpy as sc
import scrublet as scr

import pycisTopic
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments
from pycisTopic.topic_binarization import binarize_topics
from pycisTopic.gene_activity import get_gene_activity
from pycisTopic.iterative_peak_calling import *
from pycisTopic.pseudobulk_peak_calling import peak_calling, export_pseudobulk
from pycisTopic.plotting.qc_plot import plot_sample_stats, plot_barcode_stats
from pycisTopic.qc import get_barcodes_passing_qc_for_sample
from pycisTopic.lda_models import run_cgs_models_mallet, evaluate_models
from pycisTopic.utils import region_names_to_coordinates, fig2img
from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation
from pycisTopic.clust_vis import plot_imputed_features
from pycisTopic.label_transfer import label_transfer

_stderr = sys.stderr
null = open(os.devnull, 'wb')

work_dir = 'pbmc_tutorial'
cpus = int(sys.argv[1])

sc.settings.set_figure_params(dpi=80, frameon=False, figsize=(5, 5), facecolor='white')

start_time = time.time()

if not os.path.exists(os.path.join(work_dir, 'scRNA')):
    os.makedirs(os.path.join(work_dir, 'scRNA'))

adata = sc.read_10x_h5(os.path.join(work_dir, 'data/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5'))
adata.var_names_make_unique()

#############################################################################################
# BASIC QUALITY CONTROL
#############################################################################################
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.external.pp.scrublet(adata)
adata = adata[adata.obs['predicted_doublet'] == False]  # do the actual doublet filtering
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
mito_filter = 25
n_counts_filter = 4300
# fig, axs = plt.subplots(ncols=2, figsize=(8, 4))
# sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', ax=axs[0], show=False)
# sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', ax=axs[1], show=False)
# # draw horizontal red lines indicating the thresholds
# axs[0].hlines(y=mito_filter, xmin=0, xmax=max(adata.obs['total_counts']), color='red', ls='dashed')
# axs[1].hlines(y=n_counts_filter, xmin=0, xmax=max(adata.obs['total_counts']), color='red', ls='dashed')
# fig.tight_layout()
# fig.savefig("./mitochondrial counts and total counts.png")
adata = adata[adata.obs.n_genes_by_counts < n_counts_filter, :]
adata = adata[adata.obs.pct_counts_mt < mito_filter, :]

#############################################################################################
# Data normalisation
#############################################################################################
adata.raw = adata
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata, max_value=10)

#############################################################################################
# Cell type annotation
#############################################################################################
adata_ref = sc.datasets.pbmc3k_processed()  # use the preprocessed data from the Scanpy tutorial as reference
var_names = adata_ref.var_names.intersection(adata.var_names)  # use genes present in both assays
adata_ref = adata_ref[:, var_names]
adata = adata[:, var_names]
sc.pp.pca(adata_ref)        # calculate PCA embedding
sc.pp.neighbors(adata_ref)  # calculate neighborhood graph
sc.tl.umap(adata_ref)       # calculate UMAP embedding
sc.tl.ingest(adata, adata_ref, obs='louvain')  # run label transfer
adata.obs.rename({'louvain': 'ingest_celltype_label'}, inplace=True, axis=1)
# sc.tl.pca(adata, svd_solver='arpack')
# sc.pl.pca_variance_ratio(adata, log=True)
# plt.savefig("./pca_variance.png")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=10)
sc.tl.umap(adata)
sc.pl.umap(adata, color='ingest_celltype_label', color_map='viridis')
# plt.savefig("./umap_cell_type_label.png")
sc.tl.leiden(adata, resolution=0.8, key_added='leiden_res_0.8')
sc.pl.umap(adata, color='leiden_res_0.8', color_map='viridis')
# plt.savefig("./umap_cell_type_label_cleaned.png")

tmp_df = adata.obs.groupby(['leiden_res_0.8', 'ingest_celltype_label']).size().unstack(fill_value=0)
tmp_df = (tmp_df / tmp_df.sum(0)).fillna(0)
leiden_to_annotation = tmp_df.idxmax(1).to_dict()
# print(leiden_to_annotation)
leiden_to_annotation['7'] = 'B cells 1'
leiden_to_annotation['11'] = 'B cells 2'
leiden_to_annotation = {cluster: leiden_to_annotation[cluster].replace(' ', '_') for cluster in leiden_to_annotation.keys()}
print(leiden_to_annotation)
adata.obs['celltype'] = [leiden_to_annotation[cluster_id] for cluster_id in adata.obs['leiden_res_0.8']]
del(leiden_to_annotation)
del(tmp_df)
sc.pl.umap(adata, color='celltype')
# plt.savefig("./umap_cell_type_label_annotated.png")

## SAVE RESULTS
adata.write(os.path.join(work_dir, 'scRNA/adata.h5ad'), compression='gzip')

#############################################################################################
# scATAC-seq preprocessing using pycisTopic
#############################################################################################
path = os.getcwd()
# make a directory to store the processed scATAC-seq data
if not os.path.exists(os.path.join(work_dir, 'scATAC')):
    os.makedirs(os.path.join(work_dir, 'scATAC'))
tmp_dir = path + "/tmp_scratch"
fragments_dict = {'10x_pbmc': os.path.join(work_dir, 'data/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz')}

#############################################################################################
# Pseudobulk export and peak calling
#############################################################################################
adata = sc.read_h5ad(os.path.join(work_dir, 'scRNA/adata.h5ad'))
cell_data = adata.obs
cell_data['sample_id'] = '10x_pbmc'
cell_data['celltype'] = cell_data['celltype'].astype(str)  # set the celltype column to str, otherwise export_pseudobulk will complain
del(adata)

target_url = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'
chromsizes = pd.read_csv(target_url, sep='\t', header=None)
chromsizes.columns = ['Chromosome', 'End']
chromsizes['Start'] = [0] * chromsizes.shape[0]
chromsizes = chromsizes.loc[:, ['Chromosome', 'Start', 'End']]
# Exceptionally in this case, rename chromosomes to agree with the CellRanger ARC annotation
chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].replace('v', '.') for x in range(len(chromsizes['Chromosome']))]
chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].split('_')[1] if len(chromsizes['Chromosome'][x].split('_')) > 1 else chromsizes['Chromosome'][x] for x in range(len(chromsizes['Chromosome']))]
chromsizes = pr.PyRanges(chromsizes)

bw_paths, bed_paths = export_pseudobulk(
    input_data=cell_data,
    variable='celltype',  # variable by which to generate pseudobulk profiles; here one pseudobulk per celltype
    sample_id_col='sample_id',
    chromsizes=chromsizes,
    bed_path=os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/'),  # where the pseudobulk bed files are stored
    bigwig_path=os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bw_files/'),  # where the pseudobulk bigwig files are stored
    path_to_fragments=fragments_dict,  # location of the fragment files
    n_cpu=cpus,  # number of cores to use; ray is used for multiprocessing
    normalize_bigwig=True,
    # remove_duplicates=True,  # generates an error
    temp_dir=os.path.join(tmp_dir, 'ray_spill'),
    split_pattern='-')

pickle.dump(bed_paths, open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/bed_paths.pkl'), 'wb'))
pickle.dump(bw_paths, open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/bw_paths.pkl'), 'wb'))
bed_paths = pickle.load(open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/bed_paths.pkl'), 'rb'))
bw_paths = pickle.load(open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/bw_paths.pkl'), 'rb'))

macs_path = 'macs2'
narrow_peaks_dict = peak_calling(
    macs_path,
    bed_paths,
    os.path.join(work_dir, 'scATAC/consensus_peak_calling/MACS/'),
    genome_size='hs',
    n_cpu=cpus,
    input_format='BEDPE',
    shift=73,
    ext_size=146,
    keep_dup='all',
    q_value=0.05,
    _temp_dir=None)  # alternatively os.getcwd() or os.path.join(tmp_dir, 'ray_spill')

pickle.dump(narrow_peaks_dict, open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/MACS/narrow_peaks_dict.pkl'), 'wb'))

peak_half_width = 250
path_to_blacklist = '/orfeo/LTS/CROAviano/LT_storage/Dati_OECS/pycisTopic/blacklist/hg38-blacklist.v2.bed'
# Get consensus peaks
consensus_peaks = get_consensus_peaks(narrow_peaks_dict, peak_half_width, chromsizes=chromsizes, path_to_blacklist=path_to_blacklist)
consensus_peaks.to_bed(
    path=os.path.join(work_dir, 'scATAC/consensus_peak_calling/consensus_regions.bed'),
    keep=True,
    compression='infer',
    chain=False)

#############################################################################################
# Quality control
#############################################################################################
dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].to_numpy(dtype=str)
filter = annot['Chromosome/scaffold name'].str.contains('CHR|GL|JH|MT')
annot = annot[~filter]
annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].str.replace(r'(\b\S)', r'chr\1', regex=True)
annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
annot = annot[annot.Transcript_type == 'protein_coding']

out_dir = "outs"
path_to_regions = {'10x_pbmc': os.path.join(work_dir, 'scATAC/consensus_peak_calling/consensus_regions.bed')}
regions_bed_filename = os.path.join(out_dir, "consensus_peak_calling/consensus_regions.bed")
tss_bed_filename = os.path.join(out_dir, "qc", "tss.bed")

pycistopic_qc_commands_filename = "pycistopic_qc_commands.txt"
# Create a text file with all pycistopic qc command lines.
# NOTE: these commands have to be executed (e.g. in a shell) before the QC outputs below can be read.
with open(pycistopic_qc_commands_filename, "w") as fh:
    for sample, fragment_filename in fragments_dict.items():
        print(
            "pycistopic qc",
            f"--fragments {fragment_filename}",
            f"--regions {regions_bed_filename}",
            f"--tss {tss_bed_filename}",
            f"--output {os.path.join(out_dir, 'qc')}/{sample}",
            sep=" ",
            file=fh,
        )

for sample_id in fragments_dict:
    fig = plot_sample_stats(
        sample_id=sample_id,
        pycistopic_qc_output_dir="outs/qc"
    )
    # fig.savefig("./outs/qc/to_visualize.png")

sample_id_to_barcodes_passing_filters = {}
sample_id_to_thresholds = {}
for sample_id in fragments_dict:
    (
        sample_id_to_barcodes_passing_filters[sample_id],
        sample_id_to_thresholds[sample_id]
    ) = get_barcodes_passing_qc_for_sample(
        sample_id=sample_id,
        pycistopic_qc_output_dir="outs/qc",
        unique_fragments_threshold=None,  # use automatic thresholding
        tss_enrichment_threshold=None,    # use automatic thresholding
        frip_threshold=0,
        use_automatic_thresholds=True,
    )

for sample_id in fragments_dict:
    fig = plot_barcode_stats(
        sample_id=sample_id,
        pycistopic_qc_output_dir="outs/qc",
        bc_passing_filters=sample_id_to_barcodes_passing_filters[sample_id],
        detailed_title=False,
        **sample_id_to_thresholds[sample_id]
    )
    # fig.savefig("./outs/qc/number of unique fragments in regions.png")

#############################################################################################
# Creating a cisTopic object
#############################################################################################
fragments_dict = {'10x_pbmc': os.path.join(work_dir, 'data/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz')}
adata = sc.read_h5ad(os.path.join(work_dir, 'scRNA/adata.h5ad'))
scRNA_bc = adata.obs_names
cell_data = adata.obs
cell_data['sample_id'] = '10x_pbmc'
cell_data['celltype'] = cell_data['celltype'].astype(str)  # set the celltype column to str, otherwise export_pseudobulk will complain
del(adata)

pycistopic_qc_output_dir = "outs/qc"
path_to_regions = os.path.join(out_dir, "consensus_peak_calling/consensus_regions.bed")

cistopic_obj_list = []
# print("before the usual bug in which pandas is involved!")
for sample_id in fragments_dict:
    sample_metrics = pl.read_parquet(
        os.path.join(pycistopic_qc_output_dir, f'{sample_id}.fragments_stats_per_cb.parquet')
    ).to_pandas().set_index("CB").loc[
        sample_id_to_barcodes_passing_filters[sample_id]
    ]
    cistopic_obj = create_cistopic_object_from_fragments(
        path_to_fragments=fragments_dict[sample_id],
        path_to_regions=path_to_regions,
        path_to_blacklist=path_to_blacklist,
        metrics=sample_metrics,
        valid_bc=sample_id_to_barcodes_passing_filters[sample_id],
        n_cpu=cpus,
        project=sample_id,
        split_pattern='-'
    )
    cistopic_obj_list.append(cistopic_obj)

# for a multi-sample analysis you would need to run the merge() function on your cisTopic object list
cistopic_obj = cistopic_obj_list[0]
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)

#############################################################################################
# Adding metadata to a cisTopic object
#############################################################################################
cell_data = pd.read_table("data/cell_data.tsv", index_col=0)
print(cell_data.head())
cistopic_obj.add_cell_data(cell_data, split_pattern='-')
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)
print("ADDING METADATA:", cistopic_obj.cell_data.keys())

#############################################################################################
# Running scrublet
#############################################################################################
scrub = scr.Scrublet(cistopic_obj.fragment_matrix.T, expected_doublet_rate=0.1)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
scrub.plot_histogram()
scrub.call_doublets(threshold=0.22)
scrub.plot_histogram()
scrublet = pd.DataFrame(
    [scrub.doublet_scores_obs_, scrub.predicted_doublets_],
    columns=cistopic_obj.cell_names,
    index=['Doublet_scores_fragments', 'Predicted_doublets_fragments']).T
cistopic_obj.add_cell_data(scrublet, split_pattern='-')
print(sum(cistopic_obj.cell_data.Predicted_doublets_fragments == True))
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)

# Remove doublets
singlets = cistopic_obj.cell_data[cistopic_obj.cell_data.Predicted_doublets_fragments == False].index.tolist()
# Subset cisTopic object
cistopic_obj_noDBL = cistopic_obj.subset(singlets, copy=True, split_pattern='-')
print(cistopic_obj_noDBL)
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)

#############################################################################################
# Run models
#############################################################################################
# os.environ['MALLET_MEMORY'] = '200G'
# mallet_path = "Mallet-202108/bin/mallet"
# models = run_cgs_models_mallet(
#     cistopic_obj,
#     n_topics=[2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
#     n_cpu=cpus,
#     n_iter=500,
#     random_state=555,
#     alpha=50,
#     alpha_by_topic=True,
#     eta=0.1,
#     eta_by_topic=False,
#     tmp_path="./tmp_scratch/ray_spill/mallet/tutorial",
#     save_path="./tmp_scratch/ray_spill/mallet/tutorial",
#     mallet_path=mallet_path,
# )
# pickle.dump(
#     models,
#     open(os.path.join(out_dir, "models.pkl"), "wb")
# )
with open(os.path.join(out_dir, "models.pkl"), "rb") as file:
    models = pickle.load(file)
#############################################################################################
# Model selection
#############################################################################################
model = evaluate_models(
    models,
    select_model=40,
    return_model=True
)
cistopic_obj.add_LDA_model(model)
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)

#############################################################################################
# Clustering and visualisation
#############################################################################################
# library for clustering and visualisation
from pycisTopic.clust_vis import (
    find_clusters,
    run_umap,
    run_tsne,
    plot_metadata,
    plot_topic,
    cell_topic_heatmap
)
find_clusters(
    cistopic_obj,
    target='cell',
    k=10,
    res=[0.6, 1.2, 3],
    prefix='pycisTopic_',
    scale=True,
    split_pattern='-'
)
run_umap(
    cistopic_obj,
    target='cell',
    scale=True)
run_tsne(
    cistopic_obj,
    target='cell',
    scale=True)

# plot_metadata(
#     cistopic_obj,
#     reduction_name='UMAP',
#     variables=['Seurat_cell_type', 'VSN_leiden_res0.3', 'Seurat_leiden_res0.6', 'Seurat_leiden_res1.2'],
#     target='cell', num_columns=4,
#     text_size=10,
#     dot_size=5,
#     save='images/metadata_plot.png')

# annot_dict = {}
# for resolution in [0.6, 1.2, 3]:
#     annot_dict[f"pycisTopic_leiden_10_{resolution}"] = {}
#     for cluster in set(cistopic_obj.cell_data[f"pycisTopic_leiden_10_{resolution}"]):
#         counts = cistopic_obj.cell_data.loc[
#             cistopic_obj.cell_data.loc[cistopic_obj.cell_data[f"pycisTopic_leiden_10_{resolution}"] == cluster].index,
#             "Seurat_cell_type"].value_counts()
#         annot_dict[f"pycisTopic_leiden_10_{resolution}"][cluster] = f"{counts.index[counts.argmax()]}({cluster})"
# print(annot_dict)

# for resolution in [0.3, 0.6, 1.2]:  # 3]:
#     cistopic_obj.cell_data[f'pycisTopic_leiden_10_{resolution}'] = [
#         annot_dict[f'pycisTopic_leiden_10_{resolution}'][x] for x in cistopic_obj.cell_data[f'pycisTopic_leiden_10_{resolution}'].tolist()
#     ]
# plot_metadata(
#     cistopic_obj,
#     reduction_name='UMAP',
#     variables=['Seurat_cell_type', 'VSN_leiden_res0.3', 'Seurat_leiden_res0.6', 'Seurat_leiden_res1.2'],
#     target='cell', num_columns=4,
#     text_size=10,
#     dot_size=5,
#     save='images/metadata_plot2.png'
# )

# for resolution in [0.6, 1.2, 3]:
#     cistopic_obj.cell_data[f'pycisTopic_leiden_10_{resolution}'] = [
#         annot_dict[f'pycisTopic_leiden_10_{resolution}'][x] for x in cistopic_obj.cell_data[f'pycisTopic_leiden_10_{resolution}'].tolist()
#     ]
# plot_metadata(
#     cistopic_obj,
#     reduction_name='UMAP',
#     variables=['Seurat_cell_type', 'pycisTopic_leiden_10_0.6', 'pycisTopic_leiden_10_1.2', 'pycisTopic_leiden_10_3'],
#     target='cell',
#     num_columns=4,
#     text_size=10,
#     dot_size=5,
#     save='images/metadata_plot3.png')

# plot_metadata(
#     cistopic_obj,
#     reduction_name='UMAP',
#     variables=['log10_unique_fragments_count', 'tss_enrichment', 'Doublet_scores_fragments', 'fraction_of_fragments_in_peaks'],
#     target='cell', num_columns=4,
#     text_size=10,
#     dot_size=5,
#     save='images/metadata_plot4.png')

# plot_topic(
#     cistopic_obj,
#     reduction_name='UMAP',
#     target='cell',
#     num_columns=5,
#     save='images/topic_plot.png'
# )

# cell_topic_heatmap(
#     cistopic_obj,
#     variables=['Seurat_cell_type'],
#     scale=False,
#     legend_loc_x=1.0,
#     legend_loc_y=-1.2,
#     legend_dist_y=-1,
#     figsize=(10, 10),
#     save='images/cell_topic_heatmap.png'
# )

#############################################################################################
# Topic binarization & QC
#############################################################################################
region_bin_topics_top_3k = binarize_topics(
    cistopic_obj,
    method='ntop',
    ntop=3_000,
    plot=True,
    num_columns=5,
    save='images/region_bin_topics.png'
)
topic_qc_metrics = compute_topic_metrics(cistopic_obj)

# fig_dict = {}
# fig_dict['CoherenceVSAssignments'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Log10_Assignments', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['AssignmentsVSCells_in_bin'] = plot_topic_qc(topic_qc_metrics, var_x='Log10_Assignments', var_y='Cells_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['CoherenceVSCells_in_bin'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Cells_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['CoherenceVSRegions_in_bin'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Regions_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['CoherenceVSMarginal_dist'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Marginal_topic_dist', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['CoherenceVSGini_index'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Gini_index', var_color='Gini_index', plot=False, return_fig=True)
# fig = plt.figure(figsize=(40, 43))
# i = 1
# for fig_ in fig_dict.keys():
#     plt.subplot(2, 3, i)
#     img = fig2img(fig_dict[fig_])  # convert the figures to png so they can be plotted together (see pycisTopic utils.py)
#     plt.imshow(img)
#     plt.axis('off')
#     i += 1
# plt.subplots_adjust(wspace=0, hspace=-0.70)
# plt.savefig('images/topic_stats.png', dpi=300, bbox_inches='tight')

binarized_cell_topic = binarize_topics(
    cistopic_obj,
    target='cell',
    method='li',
    plot=True,
    num_columns=5,
    nbins=100,
    save='images/binarized_cell_topic.png'
)
topic_annot = topic_annotation(
    cistopic_obj,
    annot_var='Seurat_cell_type',
    binarized_cell_topic=binarized_cell_topic,
    general_topic_thr=0.2
)

#############################################################################################
# Differentially Accessible Regions (DARs)
#############################################################################################
from pycisTopic.diff_features import (
    impute_accessibility,
    normalize_scores,
    find_highly_variable_features,
    find_diff_features
)
imputed_acc_obj = impute_accessibility(
    cistopic_obj,
    selected_cells=None,
    selected_regions=None,
    scale_factor=10**6
)
normalized_imputed_acc_obj = normalize_scores(imputed_acc_obj, scale_factor=10**4)
variable_regions = find_highly_variable_features(
    normalized_imputed_acc_obj,
    min_disp=0.05,
    min_mean=0.0125,
    max_mean=3,
    max_disp=np.inf,
    n_bins=20,
    n_top_features=None,
    plot=True
)
markers_dict = find_diff_features(
    cistopic_obj,
    imputed_acc_obj,
    variable='Seurat_cell_type',
    var_features=variable_regions,
    contrasts=None,
    adjpval_thr=0.05,
    log2fc_thr=np.log2(1.5),
    n_cpu=cpus,
    _temp_dir='/u/cdslab/astacchetti/scenicplus/tutorial',  # use this path, it avoids the temp-dir related issues
    split_pattern='-'
)
# plot_imputed_features(
#     cistopic_obj,
#     reduction_name='UMAP',
#     imputed_data=imputed_acc_obj,
#     features=[markers_dict[x].index.tolist()[0] for x in ['BG', 'GC', 'INH_SST', 'COP']],
#     scale=False,
#     num_columns=4,
#     save='images/imputed_features.png'
# )
print("Number of DARs found:")
print("---------------------")
for x in markers_dict:
    print(f"  {x}: {len(markers_dict[x])}")
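
# Note (assumption, not run in the original script): the "Topics_otsu" export in the next section is
# commented out and region_bin_topics_otsu is never computed here. If Otsu-binarized topic regions are
# also wanted, a call mirroring the 'ntop' binarization above could be added, for example:
# region_bin_topics_otsu = binarize_topics(
#     cistopic_obj,
#     method='otsu',
#     plot=True,
#     num_columns=5,
#     save='images/region_bin_topics_otsu.png'
# )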
#############################################################################################
# Save region sets
#############################################################################################
os.makedirs(os.path.join(out_dir, "region_sets"), exist_ok=True)
os.makedirs(os.path.join(out_dir, "region_sets", "Topics_otsu"), exist_ok=True)
os.makedirs(os.path.join(out_dir, "region_sets", "Topics_top_3k"), exist_ok=True)
os.makedirs(os.path.join(out_dir, "region_sets", "DARs_cell_type"), exist_ok=True)

# for topic in region_bin_topics_otsu:
#     region_names_to_coordinates(
#         region_bin_topics_otsu[topic].index
#     ).sort_values(
#         ["Chromosome", "Start", "End"]
#     ).to_csv(
#         os.path.join(out_dir, "region_sets", "Topics_otsu", f"{topic}.bed"),
#         sep="\t",
#         header=False, index=False
#     )

for topic in region_bin_topics_top_3k:
    region_names_to_coordinates(
        region_bin_topics_top_3k[topic].index
    ).sort_values(
        ["Chromosome", "Start", "End"]
    ).to_csv(
        os.path.join(out_dir, "region_sets", "Topics_top_3k", f"{topic}.bed"),
        sep="\t",
        header=False, index=False
    )

for cell_type in markers_dict:
    region_names_to_coordinates(
        markers_dict[cell_type].index
    ).sort_values(
        ["Chromosome", "Start", "End"]
    ).to_csv(
        os.path.join(out_dir, "region_sets", "DARs_cell_type", f"{cell_type}.bed"),
        sep="\t",
        header=False, index=False
    )

#############################################################################################
# Gene activity
#############################################################################################
chromsizes = pd.read_table(os.path.join(out_dir, "qc", "hg38.chrom_sizes_and_alias.tsv"))
chromsizes.rename({"# ucsc": "Chromosome", "length": "End"}, axis=1, inplace=True)
chromsizes["Start"] = 0
chromsizes = pr.PyRanges(chromsizes[["Chromosome", "Start", "End"]])

pr_annotation = pd.read_table(
    os.path.join(out_dir, "qc", "tss.bed")
).rename(
    {"Name": "Gene", "# Chromosome": "Chromosome"},
    axis=1)
pr_annotation["Transcription_Start_Site"] = pr_annotation["Start"]
pr_annotation = pr.PyRanges(pr_annotation)
print(pr_annotation)

gene_act, weights = get_gene_activity(
    imputed_acc_obj,
    pr_annotation,
    chromsizes,
    use_gene_boundaries=True,  # whether to use the whole search space or to stop when encountering another gene
    upstream=[1000, 100000],   # search space upstream; the minimum (1 kb here) is always taken, even if another gene is right next to it
    downstream=[1000, 100000], # search space downstream
    distance_weight=True,      # whether to add a distance weight (an exponential function; the weight decreases with distance)
    decay_rate=1,              # exponent of the distance exponential function (the higher, the faster the decrease)
    extend_gene_body_upstream=10000,   # number of bp upstream exempt from the distance weight (their weight is maximal)
    extend_gene_body_downstream=500,   # number of bp downstream exempt from the distance weight
    gene_size_weight=False,            # whether to add a weight based on the length of the gene
    gene_size_scale_factor='median',   # dividend used to calculate the gene size weight;
                                       # the default is the median length of all genes in the genome
    remove_promoters=False,  # whether to remove promoters when computing gene activity scores
    average_scores=True,     # whether to divide by the number of regions assigned to a gene when computing the gene activity score
    scale_factor=1,          # value by which to multiply the final gene activity matrix
    extend_tss=[10, 10],     # space to consider as a promoter
    gini_weight=True,        # whether to add a Gini index weight; the more unique a region, the higher its weight
    return_weights=True,     # whether to return the final weights
    project='Gene_activity') # project name for the gene activity object

DAG_markers_dict = find_diff_features(
    cistopic_obj,
    gene_act,
    variable='Seurat_cell_type',
    var_features=None,
    contrasts=None,
    adjpval_thr=0.05,
    log2fc_thr=np.log2(1.5),
    n_cpu=cpus,
    _temp_dir='/u/cdslab/astacchetti/scenicplus/tutorial',
    split_pattern='-')

# plot_imputed_features(
#     cistopic_obj,
#     reduction_name='UMAP',
#     imputed_data=gene_act,
#     features=['PDGFRA', 'OLIG2', 'MBP', 'SOX10',   # Olig differentiation
#               'CTNNA3', 'ENPP6', 'OLIG1',          # Olig differentiation
#               'GAD2', 'VIP', 'SST', 'CTXN3',       # Int
#               'NFIB', 'SOX9',                      # Ast
#               'LEF1',                              # Endo
#               'SPI1'],                             # Glia
#     scale=True,
#     num_columns=4
# )

print("Number of DAGs found:")
print("---------------------")
for x in DAG_markers_dict:
    print(f"  {x}: {len(DAG_markers_dict[x])}")

#############################################################################################
# Label transfer
#############################################################################################
rna_anndata = sc.read_h5ad(
    "./pbmc_tutorial/scRNA/adata.h5ad"
).raw.to_adata()
atac_anndata = sc.AnnData(
    gene_act.mtx.T,
    obs=pd.DataFrame(index=gene_act.cell_names),
    var=pd.DataFrame(index=gene_act.feature_names))
atac_anndata.obs["sample_id"] = "10x_multiome_brain"
rna_anndata.obs["sample_id"] = "10x_multiome_brain"
print(rna_anndata, atac_anndata)

label_dict = label_transfer(
    rna_anndata,
    atac_anndata,
    labels_to_transfer=['sample_id'],
    variable_genes=True,
    methods=['ingest', 'harmony', 'bbknn', 'scanorama', 'cca'],
    return_label_weights=False,
    _temp_dir='/u/cdslab/astacchetti/scenicplus/tutorial'
)
label_dict_x = [label_dict[key] for key in label_dict.keys()]
label_pd = pd.concat(label_dict_x, axis=1, sort=False)
label_pd.index = cistopic_obj.cell_names
label_pd.columns = ['pycisTopic_' + x for x in label_pd.columns]
cistopic_obj.add_cell_data(label_pd, split_pattern='-')

## TO BE CONTINUED
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
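
# Optional final save (assumption, not part of the original script): persist the cisTopic object so
# the transferred labels added above are kept on disk, mirroring the earlier pickle.dump calls.
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)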