Speed up random function for many chroms

pyranges · Dec 20, 2024 · ef79c69 · ef79c69
1 parent 861a81a
commit ef79c69
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 35 deletions.
diff --git a/docs/how_to_create.rst b/docs/how_to_create.rst
@@ -287,10 +287,10 @@ By default, the data refers to the human genome (hg19):
     index  |    Chromosome        Start        End  Strand
     int64  |    object            int64      int64  object
   -------  ---  ------------  ---------  ---------  --------
-        0  |    chr12          46554011   46554061  +
-        1  |    chr1          202415019  202415069  -
-        2  |    chr3           89385747   89385797  -
-        3  |    chr3          182842974  182843024  -
-        4  |    chr3           89004288   89004338  -
+        0  |    nan                   0         50  +
+        1  |    chr12         123591431  123591481  -
+        2  |    chr3           54767920   54767970  +
+        3  |    chr3          162329749  162329799  +
+        4  |    chr3          176218669  176218719  +
   PyRanges with 5 rows, 4 columns, and 1 index columns.
-  Contains 3 chromosomes and 2 strands.
+  Contains 2 chromosomes and 2 strands.
diff --git a/pyproject.toml b/pyproject.toml
@@ -82,7 +82,7 @@ line-length = 120
 select = ["ALL"]
 ignore = [
     "E501", "ANN002", "ANN003", "ANN001", "ANN101", "ANN102", "ANN401", "PLR0913", "PLC0415",
-    "PD901", "D101", "D100", "D107", "CPY", "D105", "D104",
+    "PD901", "D101", "D100", "D107", "CPY", "D105", "D104", "D203", "D211", "D213", "COM812", "ISC001"
 ]
 
 [tool.pytest-watcher]

diff --git a/pyranges/core/pyranges_main.py b/pyranges/core/pyranges_main.py
@@ -1039,14 +1039,14 @@ def count_overlaps(
         index    |    Chromosome    Start    End      Strand      transcript_id    Feature     NumberOverlaps
         int64    |    category      int64    int64    category    object           category    int64
         -------  ---  ------------  -------  -------  ----------  ---------------  ----------  ----------------
-        0        |    1             11868    14409    +           nan              gene        20
-        1        |    1             11868    14409    +           ENST00000456328  transcript  20
+        0        |    1             11868    14409    +           nan              gene        17
+        1        |    1             11868    14409    +           ENST00000456328  transcript  17
         2        |    1             11868    12227    +           ENST00000456328  exon        3
-        3        |    1             12612    12721    +           ENST00000456328  exon        3
+        3        |    1             12612    12721    +           ENST00000456328  exon        1
         ...      |    ...           ...      ...      ...         ...              ...         ...
-        7        |    1             120724   133723   -           ENST00000610542  transcript  85
-        8        |    1             133373   133723   -           ENST00000610542  exon        2
-        9        |    1             129054   129223   -           ENST00000610542  exon        1
+        7        |    1             120724   133723   -           ENST00000610542  transcript  76
+        8        |    1             133373   133723   -           ENST00000610542  exon        1
+        9        |    1             129054   129223   -           ENST00000610542  exon        3
         10       |    1             120873   120932   -           ENST00000610542  exon        1
         PyRanges with 11 rows, 7 columns, and 1 index columns.
         Contains 1 chromosomes and 2 strands.

diff --git a/pyranges/core/random.py b/pyranges/core/random.py
@@ -6,13 +6,11 @@
 from pyranges.core.example_data import example_data
 from pyranges.core.pyranges_helpers import mypy_ensure_pyranges
 
-Chromsizes = dict[str, int] | dict[tuple[str, str], int]
-
 
 def random(
     n: int = 1000,
     length: int = 100,
-    chromsizes: Chromsizes | None = None,
+    chromsizes: dict[str, int] | pd.DataFrame | None = None,
     seed: int | None = None,
     *,
     strand: bool = True,
@@ -42,44 +40,59 @@ def random(
     index    |    Chromosome    Start      End        Strand
     int64    |    object        int64      int64      object
     -------  ---  ------------  ---------  ---------  --------
-    0        |    chr4          130788360  130788460  +
-    1        |    chr4          36129012   36129112   +
-    2        |    chr4          69733790   69733890   -
-    3        |    chr4          187723767  187723867  -
+    0        |    chr11         25516829   25516929   +
+    1        |    chr11         132583621  132583721  -
+    2        |    chr11         2504795    2504895    +
+    3        |    chr11         23816613   23816713   +
     ...      |    ...           ...        ...        ...
-    996      |    chr21         13544178   13544278   -
-    997      |    chr21         33556472   33556572   +
-    998      |    chr21         31438477   31438577   +
-    999      |    chr21         38433522   38433622   -
+    996      |    chr21         30756250   30756350   -
+    997      |    chr21         22517078   22517178   +
+    998      |    chr21         20605246   20605346   +
+    999      |    chr21         21153142   21153242   -
     PyRanges with 1000 rows, 4 columns, and 1 index columns.
     Contains 24 chromosomes and 2 strands.
 
     """
     rng = np.random.default_rng(seed=seed)
 
-    df: pd.DataFrame
     if chromsizes is None:
         df = example_data.chromsizes
     elif isinstance(chromsizes, dict):
         df = pd.DataFrame({names.CHROM_COL: list(chromsizes.keys()), names.END_COL: list(chromsizes.values())})
     else:
         df = chromsizes
 
+    # Probability of picking each chromosome proportional to its size
     p = df.End / df.End.sum()
 
-    n_per_chrom = pd.Series(rng.choice(df.index, size=n, p=p)).value_counts(sort=False).to_frame()
-    n_per_chrom.insert(1, names.CHROM_COL, df.loc[n_per_chrom.index].Chromosome)
-    n_per_chrom.columns = pd.Index("Count Chromosome".split())
+    # Determine how many intervals per chromosome
+    chosen = rng.choice(df.index, size=n, p=p)
+    n_per_chrom = pd.Series(chosen).value_counts(sort=False).to_frame("Count")
+    n_per_chrom.insert(1, names.CHROM_COL, pd.Series(df.loc[n_per_chrom.index, names.CHROM_COL].values))
+
+    # Merge chromosome sizes into n_per_chrom for direct access
+    n_per_chrom = n_per_chrom.merge(df[[names.CHROM_COL, names.END_COL]], on=names.CHROM_COL, how="left")
+
+    # Extract arrays
+    counts_array = n_per_chrom["Count"].to_numpy()
+    chroms_array = n_per_chrom[names.CHROM_COL].to_numpy()
+    ends_array = n_per_chrom["End"].to_numpy() - length
+
+    # Repeat arrays according to the counts for vectorized generation
+    chroms_repeated = np.repeat(chroms_array, counts_array)
+    ends_repeated = np.repeat(ends_array, counts_array)
+
+    # Generate random starts in [0, ends_repeated)
+    # rng.random() is uniform on [0, 1); scaling by ends_repeated and truncating yields integer starts
+    random_starts = (rng.random(chroms_repeated.size) * ends_repeated).astype(int)
 
-    random_dfs = []
-    for _, (count, chrom) in n_per_chrom.iterrows():
-        r = rng.integers(0, df[df.Chromosome == chrom].End - length, size=count)
-        _df = pd.DataFrame({names.CHROM_COL: chrom, names.START_COL: r, "End": r + length})
-        random_dfs.append(_df)
+    # Build final DataFrame
+    random_df = pd.DataFrame(
+        {names.CHROM_COL: chroms_repeated, names.START_COL: random_starts, "End": random_starts + length},
+    )
 
-    random_df = pd.concat(random_dfs)
     if strand:
-        s = rng.choice("+ -".split(), size=n)
+        s = rng.choice(["+", "-"], size=n)
         random_df.insert(3, "Strand", s)
 
     return mypy_ensure_pyranges(random_df.reset_index(drop=True))