# suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
import os
import time
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pybiomart as pbm
import pyranges as pr
import requests
import scanpy as sc
import scrublet as scr

import pycisTopic
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments
from pycisTopic.topic_binarization import binarize_topics
from pycisTopic.gene_activity import get_gene_activity
from pycisTopic.iterative_peak_calling import *
from pycisTopic.pseudobulk_peak_calling import peak_calling, export_pseudobulk
from pycisTopic.plotting.qc_plot import plot_sample_stats, plot_barcode_stats
from pycisTopic.qc import get_barcodes_passing_qc_for_sample
from pycisTopic.lda_models import run_cgs_models_mallet, evaluate_models
from pycisTopic.utils import region_names_to_coordinates, fig2img
from pycisTopic.topic_qc import compute_topic_metrics, plot_topic_qc, topic_annotation
from pycisTopic.clust_vis import plot_imputed_features
from pycisTopic.label_transfer import label_transfer

_stderr = sys.stderr
null = open(os.devnull, 'wb')

work_dir = 'pbmc_tutorial'
cpus = int(sys.argv[1])

sc.settings.set_figure_params(dpi=80, frameon=False, figsize=(5, 5), facecolor='white')

start_time = time.time()

if not os.path.exists(os.path.join(work_dir, 'scRNA')):
    os.makedirs(os.path.join(work_dir, 'scRNA'))

adata = sc.read_10x_h5(os.path.join(work_dir, 'data/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5'))
adata.var_names_make_unique()

#############################################################################################
# BASIC QUALITY CONTROL
#############################################################################################
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.external.pp.scrublet(adata)
adata = adata[adata.obs['predicted_doublet'] == False]  # do the actual doublet filtering
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
mito_filter = 25
n_counts_filter = 4300
# fig, axs = plt.subplots(ncols=2, figsize=(8, 4))
# sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', ax=axs[0], show=False)
# sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', ax=axs[1], show=False)
# # draw horizontal red lines indicating the thresholds
# axs[0].hlines(y=mito_filter, xmin=0, xmax=max(adata.obs['total_counts']), color='red', ls='dashed')
# axs[1].hlines(y=n_counts_filter, xmin=0, xmax=max(adata.obs['total_counts']), color='red', ls='dashed')
# fig.tight_layout()
# fig.savefig("./mitochondrial counts and total counts.png")
adata = adata[adata.obs.n_genes_by_counts < n_counts_filter, :]
adata = adata[adata.obs.pct_counts_mt < mito_filter, :]

#############################################################################################
# Data normalisation
#############################################################################################
adata.raw = adata
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata, max_value=10)

#############################################################################################
# Cell type annotation
#############################################################################################
adata_ref = sc.datasets.pbmc3k_processed()  # use the preprocessed data from the Scanpy tutorial as reference
var_names = adata_ref.var_names.intersection(adata.var_names)  # use genes present in both assays
adata_ref = adata_ref[:, var_names]
adata = adata[:, var_names]
sc.pp.pca(adata_ref)        # calculate PCA embedding
sc.pp.neighbors(adata_ref)  # calculate neighborhood graph
sc.tl.umap(adata_ref)       # calculate UMAP embedding
sc.tl.ingest(adata, adata_ref, obs='louvain')  # run label transfer
adata.obs.rename({'louvain': 'ingest_celltype_label'}, inplace=True, axis=1)
# sc.tl.pca(adata, svd_solver='arpack')
# sc.pl.pca_variance_ratio(adata, log=True)
# plt.savefig("./pca_variance.png")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=10)
sc.tl.umap(adata)
sc.pl.umap(adata, color='ingest_celltype_label', color_map='viridis')
# plt.savefig("./umap_cell_type_label.png")
sc.tl.leiden(adata, resolution=0.8, key_added='leiden_res_0.8')
sc.pl.umap(adata, color='leiden_res_0.8', color_map='viridis')
# plt.savefig("./umap_cell_type_label_cleaned.png")

tmp_df = adata.obs.groupby(['leiden_res_0.8', 'ingest_celltype_label']).size().unstack(fill_value=0)
tmp_df = (tmp_df / tmp_df.sum(0)).fillna(0)
leiden_to_annotation = tmp_df.idxmax(1).to_dict()
# print(leiden_to_annotation)
leiden_to_annotation['7'] = 'B cells 1'
leiden_to_annotation['11'] = 'B cells 2'
leiden_to_annotation = {cluster: leiden_to_annotation[cluster].replace(' ', '_') for cluster in leiden_to_annotation.keys()}
print(leiden_to_annotation)
adata.obs['celltype'] = [leiden_to_annotation[cluster_id] for cluster_id in adata.obs['leiden_res_0.8']]
del(leiden_to_annotation)
del(tmp_df)
sc.pl.umap(adata, color='celltype')
# plt.savefig("./umap_cell_type_label_annotated.png")

## SAVE RESULTS
adata.write(os.path.join(work_dir, 'scRNA/adata.h5ad'), compression='gzip')

#############################################################################################
# scATAC-seq preprocessing using pycisTopic
#############################################################################################
path = os.getcwd()
# make a directory to store the processed scATAC-seq data
if not os.path.exists(os.path.join(work_dir, 'scATAC')):
    os.makedirs(os.path.join(work_dir, 'scATAC'))
tmp_dir = path + "/tmp_scratch"
fragments_dict = {'10x_pbmc': os.path.join(work_dir, 'data/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz')}

#############################################################################################
# Pseudobulk export and peak calling
#############################################################################################
adata = sc.read_h5ad(os.path.join(work_dir, 'scRNA/adata.h5ad'))
cell_data = adata.obs
cell_data['sample_id'] = '10x_pbmc'
cell_data['celltype'] = cell_data['celltype'].astype(str)  # set the celltype column to str, otherwise export_pseudobulk will complain
del(adata)

target_url = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'
chromsizes = pd.read_csv(target_url, sep='\t', header=None)
chromsizes.columns = ['Chromosome', 'End']
chromsizes['Start'] = [0] * chromsizes.shape[0]
chromsizes = chromsizes.loc[:, ['Chromosome', 'Start', 'End']]
# Exceptionally in this case, rename chromosomes to agree with the CellRanger ARC annotation
chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].replace('v', '.') for x in range(len(chromsizes['Chromosome']))]
chromsizes['Chromosome'] = [chromsizes['Chromosome'][x].split('_')[1] if len(chromsizes['Chromosome'][x].split('_')) > 1 else chromsizes['Chromosome'][x] for x in range(len(chromsizes['Chromosome']))]
chromsizes = pr.PyRanges(chromsizes)

bw_paths, bed_paths = export_pseudobulk(
    input_data=cell_data,
    variable='celltype',  # variable by which to generate pseudobulk profiles; here one pseudobulk per celltype
    sample_id_col='sample_id',
    chromsizes=chromsizes,
    bed_path=os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/'),  # where the pseudobulk bed files are stored
    bigwig_path=os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bw_files/'),  # where the pseudobulk bigwig files are stored
    path_to_fragments=fragments_dict,  # location of the fragment files
    n_cpu=cpus,  # number of cores to use; ray is used for multiprocessing
    normalize_bigwig=True,
    # remove_duplicates=True,  # generates an error
    temp_dir=os.path.join(tmp_dir, 'ray_spill'),
    split_pattern='-')

pickle.dump(bed_paths, open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/bed_paths.pkl'), 'wb'))
pickle.dump(bw_paths, open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/bw_paths.pkl'), 'wb'))
bed_paths = pickle.load(open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/bed_paths.pkl'), 'rb'))
bw_paths = pickle.load(open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/pseudobulk_bed_files/bw_paths.pkl'), 'rb'))

macs_path = 'macs2'
narrow_peaks_dict = peak_calling(
    macs_path,
    bed_paths,
    os.path.join(work_dir, 'scATAC/consensus_peak_calling/MACS/'),
    genome_size='hs',
    n_cpu=cpus,
    input_format='BEDPE',
    shift=73,
    ext_size=146,
    keep_dup='all',
    q_value=0.05,
    _temp_dir=None)  # alternatively os.getcwd() or os.path.join(tmp_dir, 'ray_spill')

pickle.dump(narrow_peaks_dict, open(os.path.join(work_dir, 'scATAC/consensus_peak_calling/MACS/narrow_peaks_dict.pkl'), 'wb'))

peak_half_width = 250
path_to_blacklist = '/orfeo/LTS/CROAviano/LT_storage/Dati_OECS/pycisTopic/blacklist/hg38-blacklist.v2.bed'
# Get consensus peaks
consensus_peaks = get_consensus_peaks(narrow_peaks_dict, peak_half_width, chromsizes=chromsizes, path_to_blacklist=path_to_blacklist)
consensus_peaks.to_bed(
    path=os.path.join(work_dir, 'scATAC/consensus_peak_calling/consensus_regions.bed'),
    keep=True,
    compression='infer',
    chain=False)

#############################################################################################
# Quality control
#############################################################################################
dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].to_numpy(dtype=str)
filter = annot['Chromosome/scaffold name'].str.contains('CHR|GL|JH|MT')
annot = annot[~filter]
annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].str.replace(r'(\b\S)', r'chr\1', regex=True)
annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
annot = annot[annot.Transcript_type == 'protein_coding']

out_dir = "outs"
path_to_regions = {'10x_pbmc': os.path.join(work_dir, 'scATAC/consensus_peak_calling/consensus_regions.bed')}
regions_bed_filename = os.path.join(out_dir, "consensus_peak_calling/consensus_regions.bed")
tss_bed_filename = os.path.join(out_dir, "qc", "tss.bed")

pycistopic_qc_commands_filename = "pycistopic_qc_commands.txt"
# Create a text file with all pycistopic qc command lines.
# NOTE: these commands have to be executed (e.g. in a shell) before the QC outputs below can be read.
with open(pycistopic_qc_commands_filename, "w") as fh:
    for sample, fragment_filename in fragments_dict.items():
        print(
            "pycistopic qc",
            f"--fragments {fragment_filename}",
            f"--regions {regions_bed_filename}",
            f"--tss {tss_bed_filename}",
            f"--output {os.path.join(out_dir, 'qc')}/{sample}",
            sep=" ",
            file=fh,
        )

for sample_id in fragments_dict:
    fig = plot_sample_stats(
        sample_id=sample_id,
        pycistopic_qc_output_dir="outs/qc"
    )
    # fig.savefig("./outs/qc/to_visualize.png")

sample_id_to_barcodes_passing_filters = {}
sample_id_to_thresholds = {}
for sample_id in fragments_dict:
    (
        sample_id_to_barcodes_passing_filters[sample_id],
        sample_id_to_thresholds[sample_id]
    ) = get_barcodes_passing_qc_for_sample(
        sample_id=sample_id,
        pycistopic_qc_output_dir="outs/qc",
        unique_fragments_threshold=None,  # use automatic thresholding
        tss_enrichment_threshold=None,    # use automatic thresholding
        frip_threshold=0,
        use_automatic_thresholds=True,
    )

for sample_id in fragments_dict:
    fig = plot_barcode_stats(
        sample_id=sample_id,
        pycistopic_qc_output_dir="outs/qc",
        bc_passing_filters=sample_id_to_barcodes_passing_filters[sample_id],
        detailed_title=False,
        **sample_id_to_thresholds[sample_id]
    )
    # fig.savefig("./outs/qc/number of unique fragments in regions.png")

#############################################################################################
# Creating a cisTopic object
#############################################################################################
fragments_dict = {'10x_pbmc': os.path.join(work_dir, 'data/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz')}
adata = sc.read_h5ad(os.path.join(work_dir, 'scRNA/adata.h5ad'))
scRNA_bc = adata.obs_names
cell_data = adata.obs
cell_data['sample_id'] = '10x_pbmc'
cell_data['celltype'] = cell_data['celltype'].astype(str)  # set the celltype column to str, otherwise export_pseudobulk will complain
del(adata)

pycistopic_qc_output_dir = "outs/qc"
path_to_regions = os.path.join(out_dir, "consensus_peak_calling/consensus_regions.bed")

cistopic_obj_list = []
# print("before the usual bug in which pandas is involved!")
for sample_id in fragments_dict:
    sample_metrics = pl.read_parquet(
        os.path.join(pycistopic_qc_output_dir, f'{sample_id}.fragments_stats_per_cb.parquet')
    ).to_pandas().set_index("CB").loc[
        sample_id_to_barcodes_passing_filters[sample_id]
    ]
    cistopic_obj = create_cistopic_object_from_fragments(
        path_to_fragments=fragments_dict[sample_id],
        path_to_regions=path_to_regions,
        path_to_blacklist=path_to_blacklist,
        metrics=sample_metrics,
        valid_bc=sample_id_to_barcodes_passing_filters[sample_id],
        n_cpu=cpus,
        project=sample_id,
        split_pattern='-'
    )
    cistopic_obj_list.append(cistopic_obj)

# for a multi-sample analysis you would need to run the merge() function on your cisTopic object list
cistopic_obj = cistopic_obj_list[0]
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)

#############################################################################################
# Adding metadata to a cisTopic object
#############################################################################################
cell_data = pd.read_table("data/cell_data.tsv", index_col=0)
print(cell_data.head())
cistopic_obj.add_cell_data(cell_data, split_pattern='-')
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)
print("ADDING METADATA:", cistopic_obj.cell_data.keys())

#############################################################################################
# Running scrublet
#############################################################################################
scrub = scr.Scrublet(cistopic_obj.fragment_matrix.T, expected_doublet_rate=0.1)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
scrub.plot_histogram()
scrub.call_doublets(threshold=0.22)
scrub.plot_histogram()
scrublet = pd.DataFrame(
    [scrub.doublet_scores_obs_, scrub.predicted_doublets_],
    columns=cistopic_obj.cell_names,
    index=['Doublet_scores_fragments', 'Predicted_doublets_fragments']).T
cistopic_obj.add_cell_data(scrublet, split_pattern='-')
print(sum(cistopic_obj.cell_data.Predicted_doublets_fragments == True))
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)

# Remove doublets
singlets = cistopic_obj.cell_data[cistopic_obj.cell_data.Predicted_doublets_fragments == False].index.tolist()
# Subset cisTopic object
cistopic_obj_noDBL = cistopic_obj.subset(singlets, copy=True, split_pattern='-')
print(cistopic_obj_noDBL)
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)

#############################################################################################
# Run models
#############################################################################################
# os.environ['MALLET_MEMORY'] = '200G'
# mallet_path = "Mallet-202108/bin/mallet"
# models = run_cgs_models_mallet(
#     cistopic_obj,
#     n_topics=[2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
#     n_cpu=cpus,
#     n_iter=500,
#     random_state=555,
#     alpha=50,
#     alpha_by_topic=True,
#     eta=0.1,
#     eta_by_topic=False,
#     tmp_path="./tmp_scratch/ray_spill/mallet/tutorial",
#     save_path="./tmp_scratch/ray_spill/mallet/tutorial",
#     mallet_path=mallet_path,
# )
# pickle.dump(
#     models,
#     open(os.path.join(out_dir, "models.pkl"), "wb")
# )
with open(os.path.join(out_dir, "models.pkl"), "rb") as file:
    models = pickle.load(file)
#############################################################################################
# Model selection
#############################################################################################
model = evaluate_models(
    models,
    select_model=40,
    return_model=True
)
cistopic_obj.add_LDA_model(model)
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)

#############################################################################################
# Clustering and visualisation
#############################################################################################
# library for clustering and visualisation
from pycisTopic.clust_vis import (
    find_clusters,
    run_umap,
    run_tsne,
    plot_metadata,
    plot_topic,
    cell_topic_heatmap
)
find_clusters(
    cistopic_obj,
    target='cell',
    k=10,
    res=[0.6, 1.2, 3],
    prefix='pycisTopic_',
    scale=True,
    split_pattern='-'
)
run_umap(
    cistopic_obj,
    target='cell',
    scale=True)
run_tsne(
    cistopic_obj,
    target='cell',
    scale=True)

# plot_metadata(
#     cistopic_obj,
#     reduction_name='UMAP',
#     variables=['Seurat_cell_type', 'VSN_leiden_res0.3', 'Seurat_leiden_res0.6', 'Seurat_leiden_res1.2'],
#     target='cell', num_columns=4,
#     text_size=10,
#     dot_size=5,
#     save='images/metadata_plot.png')

# annot_dict = {}
# for resolution in [0.6, 1.2, 3]:
#     annot_dict[f"pycisTopic_leiden_10_{resolution}"] = {}
#     for cluster in set(cistopic_obj.cell_data[f"pycisTopic_leiden_10_{resolution}"]):
#         counts = cistopic_obj.cell_data.loc[
#             cistopic_obj.cell_data.loc[cistopic_obj.cell_data[f"pycisTopic_leiden_10_{resolution}"] == cluster].index,
#             "Seurat_cell_type"].value_counts()
#         annot_dict[f"pycisTopic_leiden_10_{resolution}"][cluster] = f"{counts.index[counts.argmax()]}({cluster})"
# print(annot_dict)

# for resolution in [0.3, 0.6, 1.2]:  # 3]:
#     cistopic_obj.cell_data[f'pycisTopic_leiden_10_{resolution}'] = [
#         annot_dict[f'pycisTopic_leiden_10_{resolution}'][x] for x in cistopic_obj.cell_data[f'pycisTopic_leiden_10_{resolution}'].tolist()
#     ]
# plot_metadata(
#     cistopic_obj,
#     reduction_name='UMAP',
#     variables=['Seurat_cell_type', 'VSN_leiden_res0.3', 'Seurat_leiden_res0.6', 'Seurat_leiden_res1.2'],
#     target='cell', num_columns=4,
#     text_size=10,
#     dot_size=5,
#     save='images/metadata_plot2.png'
# )

# for resolution in [0.6, 1.2, 3]:
#     cistopic_obj.cell_data[f'pycisTopic_leiden_10_{resolution}'] = [
#         annot_dict[f'pycisTopic_leiden_10_{resolution}'][x] for x in cistopic_obj.cell_data[f'pycisTopic_leiden_10_{resolution}'].tolist()
#     ]
# plot_metadata(
#     cistopic_obj,
#     reduction_name='UMAP',
#     variables=['Seurat_cell_type', 'pycisTopic_leiden_10_0.6', 'pycisTopic_leiden_10_1.2', 'pycisTopic_leiden_10_3'],
#     target='cell',
#     num_columns=4,
#     text_size=10,
#     dot_size=5,
#     save='images/metadata_plot3.png')

# plot_metadata(
#     cistopic_obj,
#     reduction_name='UMAP',
#     variables=['log10_unique_fragments_count', 'tss_enrichment', 'Doublet_scores_fragments', 'fraction_of_fragments_in_peaks'],
#     target='cell', num_columns=4,
#     text_size=10,
#     dot_size=5,
#     save='images/metadata_plot4.png')

# plot_topic(
#     cistopic_obj,
#     reduction_name='UMAP',
#     target='cell',
#     num_columns=5,
#     save='images/topic_plot.png'
# )

# cell_topic_heatmap(
#     cistopic_obj,
#     variables=['Seurat_cell_type'],
#     scale=False,
#     legend_loc_x=1.0,
#     legend_loc_y=-1.2,
#     legend_dist_y=-1,
#     figsize=(10, 10),
#     save='images/cell_topic_heatmap.png'
# )

#############################################################################################
# Topic binarization & QC
#############################################################################################
region_bin_topics_top_3k = binarize_topics(
    cistopic_obj,
    method='ntop',
    ntop=3_000,
    plot=True,
    num_columns=5,
    save='images/region_bin_topics.png'
)
topic_qc_metrics = compute_topic_metrics(cistopic_obj)

# fig_dict = {}
# fig_dict['CoherenceVSAssignments'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Log10_Assignments', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['AssignmentsVSCells_in_bin'] = plot_topic_qc(topic_qc_metrics, var_x='Log10_Assignments', var_y='Cells_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['CoherenceVSCells_in_bin'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Cells_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['CoherenceVSRegions_in_bin'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Regions_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['CoherenceVSMarginal_dist'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Marginal_topic_dist', var_color='Gini_index', plot=False, return_fig=True)
# fig_dict['CoherenceVSGini_index'] = plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Gini_index', var_color='Gini_index', plot=False, return_fig=True)
# fig = plt.figure(figsize=(40, 43))
# i = 1
# for fig_ in fig_dict.keys():
#     plt.subplot(2, 3, i)
#     img = fig2img(fig_dict[fig_])  # convert the figures to png so they can be plotted together (see pycisTopic utils.py)
#     plt.imshow(img)
#     plt.axis('off')
#     i += 1
# plt.subplots_adjust(wspace=0, hspace=-0.70)
# plt.savefig('images/topic_stats.png', dpi=300, bbox_inches='tight')

binarized_cell_topic = binarize_topics(
    cistopic_obj,
    target='cell',
    method='li',
    plot=True,
    num_columns=5,
    nbins=100,
    save='images/binarized_cell_topic.png'
)
topic_annot = topic_annotation(
    cistopic_obj,
    annot_var='Seurat_cell_type',
    binarized_cell_topic=binarized_cell_topic,
    general_topic_thr=0.2
)

#############################################################################################
# Differentially Accessible Regions (DARs)
#############################################################################################
from pycisTopic.diff_features import (
    impute_accessibility,
    normalize_scores,
    find_highly_variable_features,
    find_diff_features
)
imputed_acc_obj = impute_accessibility(
    cistopic_obj,
    selected_cells=None,
    selected_regions=None,
    scale_factor=10**6
)
normalized_imputed_acc_obj = normalize_scores(imputed_acc_obj, scale_factor=10**4)
variable_regions = find_highly_variable_features(
    normalized_imputed_acc_obj,
    min_disp=0.05,
    min_mean=0.0125,
    max_mean=3,
    max_disp=np.inf,
    n_bins=20,
    n_top_features=None,
    plot=True
)
markers_dict = find_diff_features(
    cistopic_obj,
    imputed_acc_obj,
    variable='Seurat_cell_type',
    var_features=variable_regions,
    contrasts=None,
    adjpval_thr=0.05,
    log2fc_thr=np.log2(1.5),
    n_cpu=cpus,
    _temp_dir='/u/cdslab/astacchetti/scenicplus/tutorial',  # use this path, it avoids the temp-dir related issues
    split_pattern='-'
)
# plot_imputed_features(
#     cistopic_obj,
#     reduction_name='UMAP',
#     imputed_data=imputed_acc_obj,
#     features=[markers_dict[x].index.tolist()[0] for x in ['BG', 'GC', 'INH_SST', 'COP']],
#     scale=False,
#     num_columns=4,
#     save='images/imputed_features.png'
# )
print("Number of DARs found:")
print("---------------------")
for x in markers_dict:
    print(f"  {x}: {len(markers_dict[x])}")
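
# Note (assumption, not run in the original script): the "Topics_otsu" export in the next section is
# commented out and region_bin_topics_otsu is never computed here. If Otsu-binarized topic regions are
# also wanted, a call mirroring the 'ntop' binarization above could be added, for example:
# region_bin_topics_otsu = binarize_topics(
#     cistopic_obj,
#     method='otsu',
#     plot=True,
#     num_columns=5,
#     save='images/region_bin_topics_otsu.png'
# )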
#############################################################################################
# Save region sets
#############################################################################################
os.makedirs(os.path.join(out_dir, "region_sets"), exist_ok=True)
os.makedirs(os.path.join(out_dir, "region_sets", "Topics_otsu"), exist_ok=True)
os.makedirs(os.path.join(out_dir, "region_sets", "Topics_top_3k"), exist_ok=True)
os.makedirs(os.path.join(out_dir, "region_sets", "DARs_cell_type"), exist_ok=True)

# for topic in region_bin_topics_otsu:
#     region_names_to_coordinates(
#         region_bin_topics_otsu[topic].index
#     ).sort_values(
#         ["Chromosome", "Start", "End"]
#     ).to_csv(
#         os.path.join(out_dir, "region_sets", "Topics_otsu", f"{topic}.bed"),
#         sep="\t",
#         header=False, index=False
#     )

for topic in region_bin_topics_top_3k:
    region_names_to_coordinates(
        region_bin_topics_top_3k[topic].index
    ).sort_values(
        ["Chromosome", "Start", "End"]
    ).to_csv(
        os.path.join(out_dir, "region_sets", "Topics_top_3k", f"{topic}.bed"),
        sep="\t",
        header=False, index=False
    )

for cell_type in markers_dict:
    region_names_to_coordinates(
        markers_dict[cell_type].index
    ).sort_values(
        ["Chromosome", "Start", "End"]
    ).to_csv(
        os.path.join(out_dir, "region_sets", "DARs_cell_type", f"{cell_type}.bed"),
        sep="\t",
        header=False, index=False
    )

#############################################################################################
# Gene activity
#############################################################################################
chromsizes = pd.read_table(os.path.join(out_dir, "qc", "hg38.chrom_sizes_and_alias.tsv"))
chromsizes.rename({"# ucsc": "Chromosome", "length": "End"}, axis=1, inplace=True)
chromsizes["Start"] = 0
chromsizes = pr.PyRanges(chromsizes[["Chromosome", "Start", "End"]])

pr_annotation = pd.read_table(
    os.path.join(out_dir, "qc", "tss.bed")
).rename(
    {"Name": "Gene", "# Chromosome": "Chromosome"},
    axis=1)
pr_annotation["Transcription_Start_Site"] = pr_annotation["Start"]
pr_annotation = pr.PyRanges(pr_annotation)
print(pr_annotation)

gene_act, weights = get_gene_activity(
    imputed_acc_obj,
    pr_annotation,
    chromsizes,
    use_gene_boundaries=True,  # whether to use the whole search space or to stop when encountering another gene
    upstream=[1000, 100000],   # search space upstream; the minimum (1 kb here) is always taken, even if another gene is right next to it
    downstream=[1000, 100000], # search space downstream
    distance_weight=True,      # whether to add a distance weight (an exponential function; the weight decreases with distance)
    decay_rate=1,              # exponent of the distance exponential function (the higher, the faster the decrease)
    extend_gene_body_upstream=10000,   # number of bp upstream exempt from the distance weight (their weight is maximal)
    extend_gene_body_downstream=500,   # number of bp downstream exempt from the distance weight
    gene_size_weight=False,            # whether to add a weight based on the length of the gene
    gene_size_scale_factor='median',   # dividend used to calculate the gene size weight;
                                       # the default is the median length of all genes in the genome
    remove_promoters=False,  # whether to remove promoters when computing gene activity scores
    average_scores=True,     # whether to divide by the number of regions assigned to a gene when computing the gene activity score
    scale_factor=1,          # value by which to multiply the final gene activity matrix
    extend_tss=[10, 10],     # space to consider as a promoter
    gini_weight=True,        # whether to add a Gini index weight; the more unique a region, the higher its weight
    return_weights=True,     # whether to return the final weights
    project='Gene_activity') # project name for the gene activity object

DAG_markers_dict = find_diff_features(
    cistopic_obj,
    gene_act,
    variable='Seurat_cell_type',
    var_features=None,
    contrasts=None,
    adjpval_thr=0.05,
    log2fc_thr=np.log2(1.5),
    n_cpu=cpus,
    _temp_dir='/u/cdslab/astacchetti/scenicplus/tutorial',
    split_pattern='-')

# plot_imputed_features(
#     cistopic_obj,
#     reduction_name='UMAP',
#     imputed_data=gene_act,
#     features=['PDGFRA', 'OLIG2', 'MBP', 'SOX10',   # Olig differentiation
#               'CTNNA3', 'ENPP6', 'OLIG1',          # Olig differentiation
#               'GAD2', 'VIP', 'SST', 'CTXN3',       # Int
#               'NFIB', 'SOX9',                      # Ast
#               'LEF1',                              # Endo
#               'SPI1'],                             # Glia
#     scale=True,
#     num_columns=4
# )

print("Number of DAGs found:")
print("---------------------")
for x in DAG_markers_dict:
    print(f"  {x}: {len(DAG_markers_dict[x])}")

#############################################################################################
# Label transfer
#############################################################################################
rna_anndata = sc.read_h5ad(
    "./pbmc_tutorial/scRNA/adata.h5ad"
).raw.to_adata()
atac_anndata = sc.AnnData(
    gene_act.mtx.T,
    obs=pd.DataFrame(index=gene_act.cell_names),
    var=pd.DataFrame(index=gene_act.feature_names))
atac_anndata.obs["sample_id"] = "10x_multiome_brain"
rna_anndata.obs["sample_id"] = "10x_multiome_brain"
print(rna_anndata, atac_anndata)

label_dict = label_transfer(
    rna_anndata,
    atac_anndata,
    labels_to_transfer=['sample_id'],
    variable_genes=True,
    methods=['ingest', 'harmony', 'bbknn', 'scanorama', 'cca'],
    return_label_weights=False,
    _temp_dir='/u/cdslab/astacchetti/scenicplus/tutorial'
)
label_dict_x = [label_dict[key] for key in label_dict.keys()]
label_pd = pd.concat(label_dict_x, axis=1, sort=False)
label_pd.index = cistopic_obj.cell_names
label_pd.columns = ['pycisTopic_' + x for x in label_pd.columns]
cistopic_obj.add_cell_data(label_pd, split_pattern='-')

## TO BE CONTINUED
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
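
# Optional final save (assumption, not part of the original script): persist the cisTopic object so
# the transferred labels added above are kept on disk, mirroring the earlier pickle.dump calls.
pickle.dump(
    cistopic_obj,
    open(os.path.join(out_dir, "cistopic_obj.pkl"), "wb")
)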