Skip to content

Commit

Permalink
Speed up random function for many chroms
Browse files: browse the repository at this point in the history
  • Loading branch information
endrebak committed Dec 20, 2024
1 parent 861a81a commit ef79c69
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 35 deletions.
12 changes: 6 additions & 6 deletions docs/how_to_create.rst
Original file line number Diff line number Diff line change
Expand Up @@ -287,10 +287,10 @@ By default, the data refers to the human genome (hg19):
index | Chromosome Start End Strand
int64 | object int64 int64 object
------- --- ------------ --------- --------- --------
0 | chr12 46554011 46554061 +
1 | chr1 202415019 202415069 -
2 | chr3 89385747 89385797 -
3 | chr3 182842974 182843024 -
4 | chr3 89004288 89004338 -
0 | nan 0 50 +
1 | chr12 123591431 123591481 -
2 | chr3 54767920 54767970 +
3 | chr3 162329749 162329799 +
4 | chr3 176218669 176218719 +
PyRanges with 5 rows, 4 columns, and 1 index columns.
Contains 3 chromosomes and 2 strands.
Contains 2 chromosomes and 2 strands.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ line-length = 120
select = ["ALL"]
ignore = [
"E501", "ANN002", "ANN003", "ANN001", "ANN101", "ANN102", "ANN401", "PLR0913", "PLC0415",
"PD901", "D101", "D100", "D107", "CPY", "D105", "D104",
"PD901", "D101", "D100", "D107", "CPY", "D105", "D104", "D203", "D211", "D213", "COM812", "ISC001"
]

[tool.pytest-watcher]
Expand Down
12 changes: 6 additions & 6 deletions pyranges/core/pyranges_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1039,14 +1039,14 @@ def count_overlaps(
index | Chromosome Start End Strand transcript_id Feature NumberOverlaps
int64 | category int64 int64 category object category int64
------- --- ------------ ------- ------- ---------- --------------- ---------- ----------------
0 | 1 11868 14409 + nan gene 20
1 | 1 11868 14409 + ENST00000456328 transcript 20
0 | 1 11868 14409 + nan gene 17
1 | 1 11868 14409 + ENST00000456328 transcript 17
2 | 1 11868 12227 + ENST00000456328 exon 3
3 | 1 12612 12721 + ENST00000456328 exon 3
3 | 1 12612 12721 + ENST00000456328 exon 1
... | ... ... ... ... ... ... ...
7 | 1 120724 133723 - ENST00000610542 transcript 85
8 | 1 133373 133723 - ENST00000610542 exon 2
9 | 1 129054 129223 - ENST00000610542 exon 1
7 | 1 120724 133723 - ENST00000610542 transcript 76
8 | 1 133373 133723 - ENST00000610542 exon 1
9 | 1 129054 129223 - ENST00000610542 exon 3
10 | 1 120873 120932 - ENST00000610542 exon 1
PyRanges with 11 rows, 7 columns, and 1 index columns.
Contains 1 chromosomes and 2 strands.
Expand Down
57 changes: 35 additions & 22 deletions pyranges/core/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@
from pyranges.core.example_data import example_data
from pyranges.core.pyranges_helpers import mypy_ensure_pyranges

Chromsizes = dict[str, int] | dict[tuple[str, str], int]


def random(
n: int = 1000,
length: int = 100,
chromsizes: Chromsizes | None = None,
chromsizes: dict[str, int] | pd.DataFrame | None = None,
seed: int | None = None,
*,
strand: bool = True,
Expand Down Expand Up @@ -42,44 +40,59 @@ def random(
index | Chromosome Start End Strand
int64 | object int64 int64 object
------- --- ------------ --------- --------- --------
0 | chr4 130788360 130788460 +
1 | chr4 36129012 36129112 +
2 | chr4 69733790 69733890 -
3 | chr4 187723767 187723867 -
0 | chr11 25516829 25516929 +
1 | chr11 132583621 132583721 -
2 | chr11 2504795 2504895 +
3 | chr11 23816613 23816713 +
... | ... ... ... ...
996 | chr21 13544178 13544278 -
997 | chr21 33556472 33556572 +
998 | chr21 31438477 31438577 +
999 | chr21 38433522 38433622 -
996 | chr21 30756250 30756350 -
997 | chr21 22517078 22517178 +
998 | chr21 20605246 20605346 +
999 | chr21 21153142 21153242 -
PyRanges with 1000 rows, 4 columns, and 1 index columns.
Contains 24 chromosomes and 2 strands.
"""
rng = np.random.default_rng(seed=seed)

df: pd.DataFrame
if chromsizes is None:
df = example_data.chromsizes
elif isinstance(chromsizes, dict):
df = pd.DataFrame({names.CHROM_COL: list(chromsizes.keys()), names.END_COL: list(chromsizes.values())})
else:
df = chromsizes

# Probability of picking each chromosome proportional to its size
p = df.End / df.End.sum()

n_per_chrom = pd.Series(rng.choice(df.index, size=n, p=p)).value_counts(sort=False).to_frame()
n_per_chrom.insert(1, names.CHROM_COL, df.loc[n_per_chrom.index].Chromosome)
n_per_chrom.columns = pd.Index("Count Chromosome".split())
# Determine how many intervals per chromosome
chosen = rng.choice(df.index, size=n, p=p)
n_per_chrom = pd.Series(chosen).value_counts(sort=False).to_frame("Count")
n_per_chrom.insert(1, names.CHROM_COL, pd.Series(df.loc[n_per_chrom.index, names.CHROM_COL].values))

# Merge chromosome sizes into n_per_chrom for direct access
n_per_chrom = n_per_chrom.merge(df[[names.CHROM_COL, names.END_COL]], on=names.CHROM_COL, how="left")

# Extract arrays
counts_array = n_per_chrom["Count"].to_numpy()
chroms_array = n_per_chrom[names.CHROM_COL].to_numpy()
ends_array = n_per_chrom["End"].to_numpy() - length

# Repeat arrays according to the counts for vectorized generation
chroms_repeated = np.repeat(chroms_array, counts_array)
ends_repeated = np.repeat(ends_array, counts_array)

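# Generate random starts in [0, ends_repeated), where ends_repeated is each chromosome's End minus length.
# rng.random() draws uniformly from [0, 1); scaling by ends_repeated and truncating to int gives the start.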
random_starts = (rng.random(chroms_repeated.size) * ends_repeated).astype(int)

random_dfs = []
for _, (count, chrom) in n_per_chrom.iterrows():
r = rng.integers(0, df[df.Chromosome == chrom].End - length, size=count)
_df = pd.DataFrame({names.CHROM_COL: chrom, names.START_COL: r, "End": r + length})
random_dfs.append(_df)
# Build final DataFrame
random_df = pd.DataFrame(
{names.CHROM_COL: chroms_repeated, names.START_COL: random_starts, "End": random_starts + length},
)

random_df = pd.concat(random_dfs)
if strand:
s = rng.choice("+ -".split(), size=n)
s = rng.choice(["+", "-"], size=n)
random_df.insert(3, "Strand", s)

return mypy_ensure_pyranges(random_df.reset_index(drop=True))

0 comments on commit ef79c69

Please sign in to comment.