Update embeddings tools for 2025-01-30 LTS (#1354)
A variety of minor changes and README updates for our embedding preparation tooling, motivated as we exercised the tools for the new LTS.
mlin authored Feb 12, 2025
1 parent 2442b2e commit 77fdea0
Showing 17 changed files with 118 additions and 77 deletions.
@@ -15,7 +15,7 @@ class GeneformerTokenizer(CellDatasetBuilder):
cell in CELLxGENE Census ExperimentAxisQuery results (human).
This class requires the Geneformer package to be installed separately with:
`pip install git+https://huggingface.co/ctheodoris/Geneformer@eb038a6`
`pip install git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096`
Example usage:
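For orientation, typical usage of the tokenizer looks roughly like the sketch below (the docstring's own example is elided from this diff; the `cellxgene_census.experimental.ml.huggingface` module path, census version, and query filter are assumptions here). With this commit, the new defaults `max_input_tokens=4096` and `special_token=True` apply unless overridden.

```python
# Sketch of typical GeneformerTokenizer usage; module path, census version, and
# value_filter are assumptions, not copied verbatim from the docstring.
import cellxgene_census
import tiledbsoma
from cellxgene_census.experimental.ml.huggingface import GeneformerTokenizer

with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    with GeneformerTokenizer(
        census["census_data"]["homo_sapiens"],
        obs_query=tiledbsoma.AxisQuery(
            value_filter="is_primary_data == True and tissue_general == 'tongue'"
        ),
        obs_column_names=("soma_joinid", "cell_type_ontology_term_id"),
    ) as tokenizer:
        dataset = tokenizer.build()  # Hugging Face Dataset of tokenized cells
```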
@@ -64,8 +64,8 @@ def __init__(
*,
obs_column_names: Sequence[str] | None = None,
obs_attributes: Sequence[str] | None = None,
max_input_tokens: int = 2048,
special_token: bool = False,
max_input_tokens: int = 4096,
special_token: bool = True,
token_dictionary_file: str = "",
gene_median_file: str = "",
gene_mapping_file: str = "",
@@ -78,8 +78,8 @@ def __init__(
- `obs_query`: obs AxisQuery defining the set of Census cells to process (default all)
- `obs_column_names`: obs dataframe columns (cell metadata) to propagate into attributes
of each Dataset item
- `max_input_tokens`: maximum length of Geneformer input token sequence (default 2048)
- `special_token`: whether to affix separator tokens to the sequence (default False)
- `max_input_tokens`: maximum length of Geneformer input token sequence (default 4096)
- `special_token`: whether to affix separator tokens to the sequence (default True)
- `token_dictionary_file`, `gene_median_file`: pickle files supplying the mapping of
Ensembl human gene IDs onto Geneformer token numbers and median expression values.
By default, these will be loaded from the Geneformer package.
@@ -120,19 +120,21 @@ def _load_geneformer_data(
.set_index("soma_joinid")
)

if not (token_dictionary_file and gene_median_file):
if not (token_dictionary_file and gene_median_file and gene_mapping_file):
try:
import geneformer
except ImportError:
# pyproject.toml can't express Geneformer git+https dependency
raise ImportError(
"Please install Geneformer with: "
"pip install git+https://huggingface.co/ctheodoris/Geneformer@eb038a6"
"pip install git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096"
) from None
if not token_dictionary_file:
token_dictionary_file = geneformer.tokenizer.TOKEN_DICTIONARY_FILE
if not gene_median_file:
gene_median_file = geneformer.tokenizer.GENE_MEDIAN_FILE
if not gene_mapping_file:
gene_mapping_file = geneformer.tokenizer.ENSEMBL_MAPPING_FILE
with open(token_dictionary_file, "rb") as f:
gene_token_dict = pickle.load(f)
with open(gene_median_file, "rb") as f:
@@ -77,7 +77,8 @@ def test_GeneformerTokenizer_correctness(tmpdir: Path) -> None:
assert len(true_tokens) == len(cell_ids)
identical = 0
for i, cell_id in enumerate(cell_ids):
assert len(test_tokens[i]) == len(true_tokens[i])
if len(test_tokens[i]) != len(true_tokens[i]):
assert test_tokens[i] == true_tokens[i] # to show diff
rho, _ = spearmanr(test_tokens[i], true_tokens[i])
if rho < RHO_THRESHOLD:
# token sequences are too dissimilar; assert exact identity so that pytest -vv will
@@ -103,7 +104,9 @@ def test_GeneformerTokenizer_docstring_example() -> None:
"soma_joinid",
"cell_type_ontology_term_id",
),
max_input_tokens=2048,
special_token=False,
) as tokenizer:
dataset = tokenizer.build()
assert len(dataset) == 15020
assert sum(it.length for it in dataset.to_pandas().itertuples()) == 27798388
assert sum(it.length for it in dataset.to_pandas().itertuples()) == 27793772
4 changes: 2 additions & 2 deletions tools/census_contrib/pyproject.toml
@@ -5,8 +5,8 @@ dynamic = ["version"]
dependencies= [
"attrs",
"cattrs>=23.2.2",
"tiledbsoma==1.4.4", # IMPORTANT: this must match the Census Builder version
"cellxgene-census==1.6.0", # IMPORTANT: this must match the Census Builder version
"tiledbsoma==1.15.3", # IMPORTANT: this must match the Census Builder version
"cellxgene-census==1.15.0", # IMPORTANT: this must match the Census Builder version
"pyyaml",
"requests",
"typed-argument-parser",
21 changes: 17 additions & 4 deletions tools/census_embeddings_indexer/Dockerfile
@@ -1,9 +1,22 @@
FROM ubuntu:22.04
# TILEDB_VECTOR_SEARCH_VERSION should be the newest that doesn't need a newer version of tiledb
# than the client tiledbsoma: https://github.com/TileDB-Inc/TileDB-Vector-Search/blob/0.2.2/pyproject.toml
ARG TILEDB_VECTOR_SEARCH_VERSION=0.2.2

# TILEDB_PY_VERSION should be set such that the TileDB Embedded version will match that used by
# tiledbsoma in cellxgene_census_builder and census_contrib.
# https://github.com/single-cell-data/TileDB-SOMA/blob/1.15.3/libtiledbsoma/cmake/Modules/FindTileDB_EP.cmake#L93 (2.27.0)
# ==
# https://github.com/TileDB-Inc/TileDB-Py/blob/0.33.3/CMakeLists.txt#L49 (2.27.0)
ARG TILEDB_PY_VERSION=0.33.3
# TILEDB_VECTOR_SEARCH_VERSION should be the newest compatible with TILEDB_PY_VERSION.
# https://github.com/TileDB-Inc/TileDB-Vector-Search/blob/0.11.0/pyproject.toml#L23 (tiledb-py>=0.32.0)
ARG TILEDB_VECTOR_SEARCH_VERSION=0.11.0

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
python3-pip
RUN pip3 install \
cellxgene_census \
tiledb==$TILEDB_PY_VERSION \
tiledb-vector-search==$TILEDB_VECTOR_SEARCH_VERSION

# FIXME: monkey patch tiledb-vector-search 0.11 for https://github.com/TileDB-Inc/TileDB-Vector-Search/issues/564
# This should be removed when we update to a new version addressing that issue.
ADD ingestion.py.patch /tmp
RUN patch /usr/local/lib/python3.10/dist-packages/tiledb/vector_search/ingestion.py /tmp/ingestion.py.patch
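As a quick confirmation of the pinning logic in the comments above, one can check the TileDB Embedded (core) version each environment reports. The snippet below is a sketch: the calls are standard `tiledb`/`tiledbsoma` helpers, but treat the expected 2.27.0 value as an assumption tied to the specific pins above.

```python
# Sketch: confirm that tiledb-py inside this image links the same TileDB Embedded
# (core) version as tiledbsoma in the census builder / census_contrib environment.
import tiledb

print("tiledb-py version:", tiledb.version())        # e.g. (0, 33, 3)
print("TileDB core version:", tiledb.libtiledb.version())  # expected (2, 27, 0)

# In the environment that has tiledbsoma installed, compare against:
#   import tiledbsoma
#   tiledbsoma.show_package_versions()  # prints its TileDB core version
```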
24 changes: 11 additions & 13 deletions tools/census_embeddings_indexer/README.md
@@ -1,33 +1,31 @@
# census_embeddings_indexer

This is a Docker+WDL pipeline to build [TileDB-Vector-Search](https://github.com/TileDB-Inc/TileDB-Vector-Search) indexes for Census cell embeddings, supporting cell similarity search in embedding space. It's meant to run on the AWS HealthOmics workflow service using the [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) launcher (assuming account setup documented there).
This is a Docker+WDL pipeline to build [TileDB-Vector-Search](https://github.com/TileDB-Inc/TileDB-Vector-Search) indexes for Census cell embeddings, supporting cell similarity search in embedding space. It's meant to run on the AWS HealthOmics workflow service using the [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) launcher (`pip3 install miniwdl-omics-run`; one-time account setup steps documented there are probably already done in the relevant CZI AWS account).

The pipeline consumes one or more of the existing TileDB arrays for hosted and contributed [Census embeddings](https://cellxgene.cziscience.com/census-models) stored on S3. The resulting indexes are themselves TileDB groups to be stored on S3.
The pipeline consumes one or more of the existing TileDB arrays for [Census embeddings](https://cellxgene.cziscience.com/census-models) stored on S3. The resulting indexes are themselves TileDB groups to be stored on S3.

```bash
export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
export AWS_DEFAULT_REGION=$(aws configure get region)
export ECR_ENDPT=${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com
export WDL_OUTPUT_BUCKET=mlin-census-screatch
export WDL_OUTPUT_BUCKET=mlin-census-scratch

docker build -t ${ECR_ENDPT}/omics:census_embeddings_indexer .
docker build --platform linux/amd64 -t ${ECR_ENDPT}/omics:census_embeddings_indexer .
aws ecr get-login-password | docker login --username AWS --password-stdin "$ECR_ENDPT"
docker push ${ECR_ENDPT}/omics:census_embeddings_indexer

miniwdl-omics-run census_embeddings_indexer.wdl \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-czi-1 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-czi-4 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-czi-5 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-1 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-2 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-3 \
census_version=2023-12-15 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-6 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-7 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-8 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-contrib-7 \
census_version=2024-07-01 \
s3_region=$AWS_DEFAULT_REGION \
docker=${ECR_ENDPT}/omics:census_embeddings_indexer \
--output-uri s3://${WDL_OUTPUT_BUCKET}/census_embeddings_indexer/out/ \
--role poweromics
--role poweromics --storage-capacity 4800
```

(The `embeddings_s3_uris=s3_//...` with `s3_//` instead of `s3://` is a workaround for an AWS-side existence check that doesn't seem to work right on public buckets.)

The Dockerfile has an argument for the TileDB-Vector-Search version to use. We should use the newest version that doesn't need a newer version of TileDB than the intended client tiledbsoma/cellxgene_census.
The [Dockerfile](Dockerfile) has arguments for the TileDB-Py and TileDB-Vector-Search versions to use; see comments there for guidance on setting them.
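Once an index has been built and uploaded, nearest-neighbor queries against it look roughly like the sketch below (the index URI, embedding dimensionality, and the `k`/`nprobe` values are placeholders, not taken from this repo).

```python
# Sketch: similarity search against a built IVF_FLAT index (URI and dimensions assumed).
import numpy as np
from tiledb.vector_search.ivf_flat_index import IVFFlatIndex

index = IVFFlatIndex(uri="s3://MYBUCKET/census_embeddings_indexer/out/CxG-czi-6")
query = np.random.rand(1, 512).astype(np.float32)   # one query embedding vector
distances, neighbors = index.query(query, k=25, nprobe=20)
print(neighbors[0])  # soma_joinid's of the nearest Census cells
```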
16 changes: 13 additions & 3 deletions tools/census_embeddings_indexer/census_embeddings_indexer.wdl
@@ -43,6 +43,7 @@ task indexer {
set -euxo pipefail

python3 << 'EOF'
import sys
import math
import tiledb
import tiledb.vector_search as vs
@@ -52,14 +53,18 @@ task indexer {
source_uri = "~{embeddings_s3_uri}".replace("s3_//", "s3://")
with tiledb.open(source_uri, config=config) as emb_array:
N, M = emb_array.shape
(_, N), (_, M) = emb_array.nonempty_domain() # TODO use "current domain" when supported
N += 1 # ASSUMES contiguous soma_joinid's [0, N)
M += 1
input_vectors_per_work_item = 1_500_000_000 // M # controls memory usage
print(f"N={N} M={M} input_vectors_per_work_item={input_vectors_per_work_item}", file=sys.stderr)
vs.ingest(
config=config,
source_uri=source_uri,
source_type="TILEDB_SPARSE_ARRAY",
dimensions=M,
size=N,
dimensions_override=M, # FIXME: see Dockerfile
index_type="IVF_FLAT",
index_uri="./~{embeddings_name}",
partitions=math.ceil(math.sqrt(N)),
@@ -70,8 +75,12 @@
)
final_index = vs.ivf_flat_index.IVFFlatIndex(uri="./~{embeddings_name}", memory_budget=1024*1048756)
assert final_index.size == N
print(f"VACUUM", file=sys.stderr)
final_index.vacuum()
assert final_index.size == N, f"final_index.size=={final_index.size} != N=={N}"
EOF
>&2 ls -lR '~{embeddings_name}'
>>>
runtime {
@@ -100,6 +109,7 @@ task make_one_directory {
while read -r dir; do
cp -r "$dir" '~{directory_name}/'
done < '~{manifest}'
>&2 ls -lR '~{directory_name}'
>>>
output {
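The indexer task above derives N from `nonempty_domain()` and assumes contiguous `soma_joinid`'s in `[0, N)`. A standalone pre-check along these lines (the embeddings array URI and region are placeholders) can confirm the lower bound of that assumption before launching a run:

```python
# Sketch: pre-check the contiguous-soma_joinid assumption made by the indexer task.
import tiledb

uri = "s3://cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-6"
config = tiledb.Config({"vfs.s3.region": "us-west-2"})
with tiledb.open(uri, config=config) as emb_array:
    (joinid_min, joinid_max), (_, dim_max) = emb_array.nonempty_domain()
    N, M = joinid_max + 1, dim_max + 1
    assert joinid_min == 0, "soma_joinid domain does not start at 0"
    print(f"N={N} M={M}")
```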
5 changes: 5 additions & 0 deletions tools/census_embeddings_indexer/ingestion.py.patch
@@ -0,0 +1,5 @@
56a57
> dimensions_override: int = -1,
3144a3146,3147
> if dimensions_override >= 0:
> dimensions = min(dimensions, dimensions_override)
9 changes: 3 additions & 6 deletions tools/models/geneformer/Dockerfile
@@ -10,7 +10,7 @@ RUN git lfs install

ENV GIT_SSL_NO_VERIFY=true
RUN pip install --upgrade pip setuptools setuptools_scm
RUN pip install torch torchdata --index-url https://download.pytorch.org/whl/cu118
RUN pip install torch 'torchdata<0.10' --index-url https://download.pytorch.org/whl/cu118
# ^^^ match the base image CUDA version!
RUN pip install owlready2 boto3 transformers[torch]
# workaround for unknown problem blocking `import geneformer`:
@@ -19,9 +19,9 @@ RUN pip uninstall -y transformer-engine

# Set the tiledbsoma version used to write the embeddings SparseNDArray, to ensure
# compatibility with the Census embeddings curator
ARG EMBEDDINGS_TILEDBSOMA_VERSION=1.9.5
ARG EMBEDDINGS_TILEDBSOMA_VERSION=1.11.4
ARG CELLXGENE_CENSUS_VERSION=main
ARG GENEFORMER_VERSION=57f02a4
ARG GENEFORMER_VERSION=ebc1e096

RUN mkdir /census-geneformer
WORKDIR /census-geneformer
@@ -44,6 +44,3 @@ RUN python3 -m venv --system-site-packages embeddings_tiledbsoma_venv && \
COPY helpers ./helpers
COPY *.py ./
COPY finetune-geneformer.config.yml .

# FIXME: eliminate once model is published in Geneformer repo
COPY gf-95m/ ./gf-95m/
62 changes: 35 additions & 27 deletions tools/models/geneformer/README.md
@@ -3,40 +3,60 @@
These scripts automate:

1. preparing tokenized Geneformer datasets from CELLxGENE Census (`prepare-census-geneformer-dataset.py`)
2. fine-tuning a Geneformer cell classifier model (`finetune-geneformer.py`)
3. using the fine-tuned model to generate cell embedding vectors (`generate-geneformer-embeddings.py`)
2. **(deprecated)** fine-tuning a Geneformer cell classifier model (`finetune-geneformer.py`)
3. generating cell embedding vectors given a dataset & model (`generate-geneformer-embeddings.py`)

Embedding generation is computationally intensive on large datasets (e.g. all of Census). To make this practical, a WDL workflow (`wdl/generate_embeddings.wdl`) provides a way to distribute across many compute nodes. The other steps also have WDLs for encapsulation, even though they aren't distributed.

The `Dockerfile` provides the recipe for the docker image used by the WDLs, which packages the scripts together with `cellxgene_census`, Geneformer, pytorch, etc. It also bundles `finetune-geneformer.config.yml` with various fine-tuning settings; an alternate config file can be supplied at runtime.
The `Dockerfile` provides the recipe for the docker image used by the WDLs, which packages the scripts together with `cellxgene_census`, Geneformer, pytorch, etc.

(Starting with the 2024-07-01 LTS, [Geneformer includes a model fine-tuned with CELLxGENE](https://huggingface.co/ctheodoris/Geneformer/tree/main/fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522), which we use instead of our own fine-tuning. Our historical fine-tuning code remains here for reference.)

## Example invocations

Using [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) for the Amazon HealthOmics workflow service, and assuming the docker image has been built and pushed to a suitable repository like ECR (tagged `$DOCKER_TAG`).
Using [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) for the Amazon HealthOmics workflow service, and assuming the docker image has been built and pushed to ECR (tagged `$DOCKER_TAG`).

Preparing a tokenized training dataset with 2,500 primary cells per human cell type:
Preparing a tokenized dataset for all of Census (>500GB, sharded):

```bash
miniwdl-omics-run wdl/prepare_datasets.wdl \
docker=$DOCKER_TAG \
census_version=s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/ \
N=2500 sampling_column=cell_type output_name=2500_per_cell_type \
census_version=s3://cellxgene-census-public-us-west-2/cell-census/2025-01-30/soma/ \
value_filter='is_primary_data==True or is_primary_data==False' \
output_name=2025-01-30 shards=500 --storage-capacity 4800 \
--role poweromics --output-uri s3://MYBUCKET/geneformer/datasets/
```

And a tokenized dataset for all of Census (>300GiB, sharded):
(We set `census_version` to the SOMACollection S3 URI because the HealthOmics workers don't have internet access to the Census release directory endpoint.) The run produces a folder containing 500 shard subfolders named e.g. `shard-123`, under the output URI and HealthOmics run ID.
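Each shard is a Hugging Face dataset folder; assuming the standard `datasets` on-disk format (an assumption about how `prepare-census-geneformer-dataset.py` saves its output), a downloaded shard can be inspected locally roughly like so:

```python
# Sketch: inspect one tokenized shard after copying it locally.
# Assumes the shard was written with datasets.Dataset.save_to_disk().
from datasets import load_from_disk

shard = load_from_disk("shard-123")
print(shard)  # column names and row count
print(shard[0]["length"], shard[0]["input_ids"][:10])  # token count and first tokens of one cell
```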

Generating cell embeddings (takes 8-12h on up to 500 g5.4xlarge, generates 200GB `tiledbsoma.SparseNDArray` on S3):

```bash
seq 0 499 \
| xargs -n 1 printf 'dataset_shards=s3://MYBUCKET/geneformer/datasets/1234567/out/dataset/2025-01-30/shard-%03d/\n' \
| xargs -n 9999 miniwdl-omics-run \
--role poweromics --output-uri s3://MYBUCKET/geneformer/embs \
wdl/generate_embeddings.wdl \
docker=$DOCKER_TAG \
emb_mode=cls emb_layer=0 model_type=Pretrained \
model=s3://MYBUCKET/geneformer/models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522/ \
output_uri=s3_//MYBUCKET/geneformer/embs/$(date '+%s')/2025-01-30/
```

The `model` input folder can be [copied from upstream](https://huggingface.co/ctheodoris/Geneformer/tree/main/fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522). The `s3_//MYBUCKET` is a workaround for the workflow service rejecting our submission if the specified S3 output folder doesn't yet exist; this workflow creates it using TileDB.

### (deprecated) Fine-tuning procedure

Preparing a tokenized training dataset with 2,500 primary cells per human cell type:

```bash
miniwdl-omics-run wdl/prepare_datasets.wdl \
docker=$DOCKER_TAG \
census_version=s3://cellxgene-census-public-us-west-2/cell-census/2024-05-20/soma/ \
value_filter='is_primary_data==True or is_primary_data==False' \
output_name=2024-05-20 shards=256 \
census_version=s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/ \
N=2500 sampling_column=cell_type output_name=2500_per_cell_type \
--role poweromics --output-uri s3://MYBUCKET/geneformer/datasets/
```

(We set `census_version` to the SOMACollection URI because the HealthOmics workers don't have internet access to the Census release directory endpoint.)

Fine-tuning for 8 epochs (takes ~36h on g5.8xlarge):

```bash
@@ -47,18 +67,6 @@ miniwdl-omics-run wdl/finetune_geneformer.wdl \
--role poweromics --output-uri s3://MYBUCKET/geneformer/models/
```

Generating cell embeddings (takes 8-12h on up to 256 g5.2xlarge, generates 130GiB `tiledbsoma.SparseNDArray` on S3):

```bash
seq 0 255 \
| xargs -n 1 printf 'dataset_shards=s3://MYBUCKET/geneformer/datasets/census-2024-05-20/shard-%03d/\n' \
| xargs -n 9999 miniwdl-omics-run \
--role poweromics --output-uri s3://MYBUCKET/geneformer/embs \
wdl/generate_embeddings.wdl \
docker=$DOCKER_TAG \
emb_layer=0 model_type=Pretrained \
model=s3://MYBUCKET/geneformer/gf-95m/fine_tuned_model/ \
output_uri=s3_//MYBUCKET/geneformer/embs/$(date '+%s')/census-2024-05-20/
```
Then the output model folder can be supplied to the `model` input to `generate_embeddings.wdl`.

(The `s3_//MYBUCKET` is a workaround for the workflow service rejecting our submission if the specified S3 output folder doesn't yet exist; this workflow has TileDB create it.)
To change fine-tuning parameters, customize the default `finetune-geneformer.config.yml` file and supply it via the `config` argument to `finetune_geneformer.wdl`.
1 change: 0 additions & 1 deletion tools/models/geneformer/buildspec.yml
@@ -8,7 +8,6 @@ phases:
- aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 699936264352.dkr.ecr.us-west-2.amazonaws.com
build:
commands:
- aws s3 cp s3://mlin-census-scratch/geneformer/gf-95m/ tools/models/geneformer/gf-95m/ --recursive
- docker build -t 699936264352.dkr.ecr.us-west-2.amazonaws.com/omics:census-geneformer --build-arg CELLXGENE_CENSUS_VERSION=$CODEBUILD_RESOLVED_SOURCE_VERSION tools/models/geneformer
post_build:
commands:
1 change: 1 addition & 0 deletions tools/models/geneformer/finetune-geneformer.config.yml
@@ -1,3 +1,4 @@
# DEPRECATED: see README.md
# Name of a categorical column/feature in the Dataset to use as the classifier label
label_feature: cell_subclass
# Specific labels to exclude from training and evaluation
2 changes: 2 additions & 0 deletions tools/models/geneformer/finetune-geneformer.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python3
# mypy: ignore-errors

# DEPRECATED: see README.md

import argparse
import json
import logging