Skip to content

Commit

Permalink
Add "create_fragment_matrix_from_fragments" to create directly a spar…
Browse files Browse the repository at this point in the history
…se fragment matrix.

Add "create_fragment_matrix_from_fragments" to create directly a
sparse fragment matrix from a fragments file for consensus peaks
of interest. The new code uses a lot less memory as it never builds
a full dense matrix.

    import pycisTopic.cistopic_class
    import pycisTopic.fragments

    # Create fragments matrix for fragment file for consensus regions and
    # for cell barcodes selected after pycistopic QC.
    counts_fragments_matrix, cbs, region_ids = pycisTopic.fragments.create_fragment_matrix_from_fragments(
        fragments_bed_filename="fragments_GSM7822226_MM_566.tsv.gz",
        regions_bed_filename="consensus_regions.bed",
        barcodes_tsv_filename="cbs_after_pycistopic_qc.tsv",
        blacklist_bed_filename="hg38-blacklist.v2.bed",  # Or None
    )

    # Define sample ID (project name).
    sample_id = "Sample1"

    # Create cisTopic object from sparse fragment matrix.
    cistopic_obj = pycisTopic.cistopic_class.create_cistopic_object(
        fragment_matrix=counts_fragments_matrix,
        cell_names=cbs,
        region_names=region_ids,
        path_to_fragments={sample_id: "fragments.tsv.gz"},
        project=sample_id
    )
  • Loading branch information
ghuls committed Jul 18, 2024
1 parent 7b285a4 commit a40e47c
Showing 1 changed file with 209 additions and 1 deletion.
210 changes: 209 additions & 1 deletion src/pycisTopic/fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@

import gzip
from operator import itemgetter
from typing import Literal, Sequence
from typing import TYPE_CHECKING, Literal, Sequence

import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa # type: ignore[import]
import pyarrow.csv # type: ignore[import]
import pyranges as pr # type: ignore[import]
import scipy as sp
from pycisTopic.genomic_ranges import intersection as gr_intersection
from pycisTopic.utils import normalise_filepath

if TYPE_CHECKING:
from pathlib import Path

# Enable Polars global string cache so all categoricals are created with the same
# string cache.
pl.enable_string_cache()
Expand Down Expand Up @@ -1121,3 +1125,207 @@ def get_fragments_in_peaks(
)

return fragments_in_peaks_df_pl


def create_fragment_matrix_from_fragments(
fragments_bed_filename: str | Path,
regions_bed_filename: str | Path,
barcodes_tsv_filename: str | Path,
blacklist_bed_filename: str | Path | None = None,
sample_id: str | None = None,
cb_end_to_remove: str | None = "-1",
cb_sample_separator: str | None = "___",
):
"""
Create fragments matrix from a fragment file and BED file with regions.
Parameters
----------
fragments_bed_filename
Fragments BED filename.
regions_bed_filename
Consensus peaks / SCREEN regions BED file for which to make the fragments matrix per cell barcode.
barcodes_tsv_filename
TSV file with selected cell barcodes after pycisTopic QC filtering.
blacklist_bed_filename
BED file with blacklisted regions (Amemiya et al., 2019). Default: None.
sample_id
Optional sample ID to append after cell barcode after removing `cb_end_to_remove`
and appending `cb_sample_separator`.
cb_end_to_remove
Remove this string from the end of the cell barcode if `sample_id` is specified.
cb_sample_separator
Add this string to the cell barcode if `sample_id` is specified, after removing
`cb_end_to_remove` and before appending `sample_id`.
Returns
-------
(
counts_fragments_matrix,
cbs,
region_ids,
)
References
----------
Amemiya, H. M., Kundaje, A., & Boyle, A. P. (2019). The ENCODE blacklist: identification of problematic regions of the genome. Scientific reports, 9(1), 1-5.
"""
# Create logger
# level = logging.INFO
# log_format = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
# handlers = [logging.StreamHandler(stream=sys.stdout)]
# logging.basicConfig(level=level, format=log_format, handlers=handlers)
# log = logging.getLogger("cisTopic")

# Read file with cell barcodes as a Polars Series and add sample ID to cell barcodes.
cbs = read_barcodes_file_to_polars_series(
barcodes_tsv_filename=barcodes_tsv_filename,
sample_id=sample_id,
cb_end_to_remove=cb_end_to_remove,
cb_sample_separator=cb_sample_separator,
)

# log.info("Reading data for " + project)
# Read fragments file to Polars Dataframe and add sample ID to cell barcodes.
fragments_df_pl = read_fragments_to_polars_df(
fragments_bed_filename=fragments_bed_filename,
engine="pyarrow",
sample_id=sample_id,
cb_end_to_remove=cb_end_to_remove,
cb_sample_separator=cb_sample_separator,
)

# Only keep fragments with the requested cell barcode.
fragments_cb_filtered_df_pl = filter_fragments_by_cb(
fragments_df_pl=fragments_df_pl,
cbs=cbs,
).rename({"Name": "CB", "Score": "CB_count"})

del fragments_df_pl

# Read regions BED file as a Polars Dataframe.
regions_df_pl = (
read_bed_to_polars_df(
bed_filename=regions_bed_filename,
engine="polars",
min_column_count=3,
)
.with_columns(
(
pl.col("Chromosome")
+ ":"
+ pl.col("Start").cast(pl.Utf8)
+ "-"
+ pl.col("End").cast(pl.Utf8)
)
.cast(pl.Categorical)
.alias("RegionID")
)
.select(
pl.col("Chromosome"),
pl.col("Start"),
pl.col("End"),
pl.col("RegionID"),
)
)

if blacklist_bed_filename:
# Read BED file with blacklisted regions .
blacklist_df_pl = read_bed_to_polars_df(
bed_filename=blacklist_bed_filename,
engine="polars",
min_column_count=3,
).select(
pl.col("Chromosome"),
pl.col("Start"),
pl.col("End"),
)

# Filter out regions that overlap with blacklisted regions.
regions_df_pl = (
regions_df_pl.lazy()
.join(
# Get all regionIDs that overlap with blacklisted regions.
gr_overlap(
regions1_df_pl=regions_df_pl,
regions2_df_pl=blacklist_df_pl,
how="first",
)
.lazy()
.select(
pl.col("RegionID"),
),
on="RegionID",
how="anti",
)
.select(
pl.col("Chromosome"),
pl.col("Start"),
pl.col("End"),
pl.col("RegionID"),
)
.collect()
)

# Get accessibility (binary and counts) for each region ID and cell barcode.
region_cb_df_pl = (
gr_intersection(
regions1_df_pl=regions_df_pl,
regions2_df_pl=fragments_cb_filtered_df_pl,
# how: Literal["all", "containment", "first", "last"] | str | None = None,
how="all",
regions1_info=True,
regions2_info=True,
regions1_coord=False,
regions2_coord=False,
regions1_suffix="@1",
regions2_suffix="@2",
)
.rename({"CB@2": "CB"})
.lazy()
.group_by(["RegionID", "CB"])
.agg(
# Get accessibility in binary form.
pl.lit(1).cast(pl.Int8).alias("accessible_binary"),
# Get accessibility in count form.
pl.len().cast(pl.UInt32).alias("accessible_count"),
)
.join(
regions_df_pl.lazy()
.select(pl.col("RegionID"))
.with_row_index("region_idx"),
on="RegionID",
how="left",
)
.join(
cbs.to_frame().lazy().with_row_index("CB_idx"),
on="CB",
how="left",
)
.collect()
)

# Construct binary accessibility matrix as a sparse matrix
# (regions as rows and cells as columns).
counts_fragments_matrix = sp.sparse.csr_matrix(
(
# All data points are 1:
# - same as: region_cb_df_pl.get_column("accessible_binary").to_numpy()
# - for count matrix: region_cb_df_pl.get_column("accessible_count").to_numpy()
# np.ones(region_cb_df_pl.shape[0], dtype=np.int8),
region_cb_df_pl.get_column("accessible_count").to_numpy(),
(
# Row indices:
region_cb_df_pl.get_column("region_idx").to_numpy(),
# Column indices:
region_cb_df_pl.get_column("CB_idx").to_numpy(),
),
)
)

return (
counts_fragments_matrix,
cbs.to_list(),
regions_df_pl.get_column("RegionID").to_list(),
)

0 comments on commit a40e47c

Please sign in to comment.