From ef79c697634c08d23725be2cd2e948e82db7443e Mon Sep 17 00:00:00 2001 From: "endrebak.ada" Date: Fri, 20 Dec 2024 16:00:52 +0700 Subject: [PATCH] Speed up random function for many chroms --- docs/how_to_create.rst | 12 +++---- pyproject.toml | 2 +- pyranges/core/pyranges_main.py | 12 +++---- pyranges/core/random.py | 57 +++++++++++++++++++++------------- 4 files changed, 48 insertions(+), 35 deletions(-) diff --git a/docs/how_to_create.rst b/docs/how_to_create.rst index 82975d96..edca136d 100644 --- a/docs/how_to_create.rst +++ b/docs/how_to_create.rst @@ -287,10 +287,10 @@ By default, the data refers to the human genome (hg19): index | Chromosome Start End Strand int64 | object int64 int64 object ------- --- ------------ --------- --------- -------- - 0 | chr12 46554011 46554061 + - 1 | chr1 202415019 202415069 - - 2 | chr3 89385747 89385797 - - 3 | chr3 182842974 182843024 - - 4 | chr3 89004288 89004338 - + 0 | nan 0 50 + + 1 | chr12 123591431 123591481 - + 2 | chr3 54767920 54767970 + + 3 | chr3 162329749 162329799 + + 4 | chr3 176218669 176218719 + PyRanges with 5 rows, 4 columns, and 1 index columns. - Contains 3 chromosomes and 2 strands. + Contains 2 chromosomes and 2 strands. diff --git a/pyproject.toml b/pyproject.toml index e134b52d..dbaf48bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ line-length = 120 select = ["ALL"] ignore = [ "E501", "ANN002", "ANN003", "ANN001", "ANN101", "ANN102", "ANN401", "PLR0913", "PLC0415", - "PD901", "D101", "D100", "D107", "CPY", "D105", "D104", + "PD901", "D101", "D100", "D107", "CPY", "D105", "D104", "D203", "D211", "D213", "COM812", "ISC001" ] [tool.pytest-watcher] diff --git a/pyranges/core/pyranges_main.py b/pyranges/core/pyranges_main.py index 30977c80..3566f7e7 100644 --- a/pyranges/core/pyranges_main.py +++ b/pyranges/core/pyranges_main.py @@ -1039,14 +1039,14 @@ def count_overlaps( index | Chromosome Start End Strand transcript_id Feature NumberOverlaps int64 | category int64 int64 category object category int64 ------- --- ------------ ------- ------- ---------- --------------- ---------- ---------------- - 0 | 1 11868 14409 + nan gene 20 - 1 | 1 11868 14409 + ENST00000456328 transcript 20 + 0 | 1 11868 14409 + nan gene 17 + 1 | 1 11868 14409 + ENST00000456328 transcript 17 2 | 1 11868 12227 + ENST00000456328 exon 3 - 3 | 1 12612 12721 + ENST00000456328 exon 3 + 3 | 1 12612 12721 + ENST00000456328 exon 1 ... | ... ... ... ... ... ... ... - 7 | 1 120724 133723 - ENST00000610542 transcript 85 - 8 | 1 133373 133723 - ENST00000610542 exon 2 - 9 | 1 129054 129223 - ENST00000610542 exon 1 + 7 | 1 120724 133723 - ENST00000610542 transcript 76 + 8 | 1 133373 133723 - ENST00000610542 exon 1 + 9 | 1 129054 129223 - ENST00000610542 exon 3 10 | 1 120873 120932 - ENST00000610542 exon 1 PyRanges with 11 rows, 7 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. diff --git a/pyranges/core/random.py b/pyranges/core/random.py index 3038bd01..f52a3b74 100644 --- a/pyranges/core/random.py +++ b/pyranges/core/random.py @@ -6,13 +6,11 @@ from pyranges.core.example_data import example_data from pyranges.core.pyranges_helpers import mypy_ensure_pyranges -Chromsizes = dict[str, int] | dict[tuple[str, str], int] - def random( n: int = 1000, length: int = 100, - chromsizes: Chromsizes | None = None, + chromsizes: dict[str, int] | pd.DataFrame | None = None, seed: int | None = None, *, strand: bool = True, @@ -42,22 +40,21 @@ def random( index | Chromosome Start End Strand int64 | object int64 int64 object ------- --- ------------ --------- --------- -------- - 0 | chr4 130788360 130788460 + - 1 | chr4 36129012 36129112 + - 2 | chr4 69733790 69733890 - - 3 | chr4 187723767 187723867 - + 0 | chr11 25516829 25516929 + + 1 | chr11 132583621 132583721 - + 2 | chr11 2504795 2504895 + + 3 | chr11 23816613 23816713 + ... | ... ... ... ... - 996 | chr21 13544178 13544278 - - 997 | chr21 33556472 33556572 + - 998 | chr21 31438477 31438577 + - 999 | chr21 38433522 38433622 - + 996 | chr21 30756250 30756350 - + 997 | chr21 22517078 22517178 + + 998 | chr21 20605246 20605346 + + 999 | chr21 21153142 21153242 - PyRanges with 1000 rows, 4 columns, and 1 index columns. Contains 24 chromosomes and 2 strands. """ rng = np.random.default_rng(seed=seed) - df: pd.DataFrame if chromsizes is None: df = example_data.chromsizes elif isinstance(chromsizes, dict): @@ -65,21 +62,37 @@ def random( else: df = chromsizes + # Probability of picking each chromosome proportional to its size p = df.End / df.End.sum() - n_per_chrom = pd.Series(rng.choice(df.index, size=n, p=p)).value_counts(sort=False).to_frame() - n_per_chrom.insert(1, names.CHROM_COL, df.loc[n_per_chrom.index].Chromosome) - n_per_chrom.columns = pd.Index("Count Chromosome".split()) + # Determine how many intervals per chromosome + chosen = rng.choice(df.index, size=n, p=p) + n_per_chrom = pd.Series(chosen).value_counts(sort=False).to_frame("Count") + n_per_chrom.insert(1, names.CHROM_COL, pd.Series(df.loc[n_per_chrom.index, names.CHROM_COL].values)) + + # Merge chromosome sizes into n_per_chrom for direct access + n_per_chrom = n_per_chrom.merge(df[[names.CHROM_COL, names.END_COL]], on=names.CHROM_COL, how="left") + + # Extract arrays + counts_array = n_per_chrom["Count"].to_numpy() + chroms_array = n_per_chrom[names.CHROM_COL].to_numpy() + ends_array = n_per_chrom["End"].to_numpy() - length + + # Repeat arrays according to the counts for vectorized generation + chroms_repeated = np.repeat(chroms_array, counts_array) + ends_repeated = np.repeat(ends_array, counts_array) + + # Generate random starts in [0, ends_repeated) + # Using random() gives a uniform [0,1), we scale by ends_repeated + random_starts = (rng.random(chroms_repeated.size) * ends_repeated).astype(int) - random_dfs = [] - for _, (count, chrom) in n_per_chrom.iterrows(): - r = rng.integers(0, df[df.Chromosome == chrom].End - length, size=count) - _df = pd.DataFrame({names.CHROM_COL: chrom, names.START_COL: r, "End": r + length}) - random_dfs.append(_df) + # Build final DataFrame + random_df = pd.DataFrame( + {names.CHROM_COL: chroms_repeated, names.START_COL: random_starts, "End": random_starts + length}, + ) - random_df = pd.concat(random_dfs) if strand: - s = rng.choice("+ -".split(), size=n) + s = rng.choice(["+", "-"], size=n) random_df.insert(3, "Strand", s) return mypy_ensure_pyranges(random_df.reset_index(drop=True))