Update embeddings tools for 2025-01-30 LTS (#1354)
A variety of minor changes and README updates for our embedding preparation tooling, motivated as we exercised the tools for the new LTS.
mlin authored Feb 12, 2025
1 parent 2442b2e commit 77fdea0
Showing 17 changed files with 118 additions and 77 deletions.
@@ -15,7 +15,7 @@ class GeneformerTokenizer(CellDatasetBuilder):
cell in CELLxGENE Census ExperimentAxisQuery results (human).
This class requires the Geneformer package to be installed separately with:
`pip install git+https://huggingface.co/ctheodoris/Geneformer@eb038a6`
`pip install git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096`
Example usage:
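For orientation, typical usage of the tokenizer looks roughly like the sketch below (the docstring's own example is elided from this diff; the `cellxgene_census.experimental.ml.huggingface` module path, census version, and query filter are assumptions here). With this commit, the new defaults `max_input_tokens=4096` and `special_token=True` apply unless overridden.

```python
# Sketch of typical GeneformerTokenizer usage; module path, census version, and
# value_filter are assumptions, not copied verbatim from the docstring.
import cellxgene_census
import tiledbsoma
from cellxgene_census.experimental.ml.huggingface import GeneformerTokenizer

with cellxgene_census.open_soma(census_version="2025-01-30") as census:
    with GeneformerTokenizer(
        census["census_data"]["homo_sapiens"],
        obs_query=tiledbsoma.AxisQuery(
            value_filter="is_primary_data == True and tissue_general == 'tongue'"
        ),
        obs_column_names=("soma_joinid", "cell_type_ontology_term_id"),
    ) as tokenizer:
        dataset = tokenizer.build()  # Hugging Face Dataset of tokenized cells
```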
@@ -64,8 +64,8 @@ def __init__(
*,
obs_column_names: Sequence[str] | None = None,
obs_attributes: Sequence[str] | None = None,
max_input_tokens: int = 2048,
special_token: bool = False,
max_input_tokens: int = 4096,
special_token: bool = True,
token_dictionary_file: str = "",
gene_median_file: str = "",
gene_mapping_file: str = "",
@@ -78,8 +78,8 @@ def __init__(
- `obs_query`: obs AxisQuery defining the set of Census cells to process (default all)
- `obs_column_names`: obs dataframe columns (cell metadata) to propagate into attributes
of each Dataset item
- `max_input_tokens`: maximum length of Geneformer input token sequence (default 2048)
- `special_token`: whether to affix separator tokens to the sequence (default False)
- `max_input_tokens`: maximum length of Geneformer input token sequence (default 4096)
- `special_token`: whether to affix separator tokens to the sequence (default True)
- `token_dictionary_file`, `gene_median_file`: pickle files supplying the mapping of
Ensembl human gene IDs onto Geneformer token numbers and median expression values.
By default, these will be loaded from the Geneformer package.
@@ -120,19 +120,21 @@ def _load_geneformer_data(
.set_index("soma_joinid")
)

if not (token_dictionary_file and gene_median_file):
if not (token_dictionary_file and gene_median_file and gene_mapping_file):
try:
import geneformer
except ImportError:
# pyproject.toml can't express Geneformer git+https dependency
raise ImportError(
"Please install Geneformer with: "
"pip install git+https://huggingface.co/ctheodoris/Geneformer@eb038a6"
"pip install git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096"
) from None
if not token_dictionary_file:
token_dictionary_file = geneformer.tokenizer.TOKEN_DICTIONARY_FILE
if not gene_median_file:
gene_median_file = geneformer.tokenizer.GENE_MEDIAN_FILE
if not gene_mapping_file:
gene_mapping_file = geneformer.tokenizer.ENSEMBL_MAPPING_FILE
with open(token_dictionary_file, "rb") as f:
gene_token_dict = pickle.load(f)
with open(gene_median_file, "rb") as f:
@@ -77,7 +77,8 @@ def test_GeneformerTokenizer_correctness(tmpdir: Path) -> None:
assert len(true_tokens) == len(cell_ids)
identical = 0
for i, cell_id in enumerate(cell_ids):
assert len(test_tokens[i]) == len(true_tokens[i])
if len(test_tokens[i]) != len(true_tokens[i]):
assert test_tokens[i] == true_tokens[i] # to show diff
rho, _ = spearmanr(test_tokens[i], true_tokens[i])
if rho < RHO_THRESHOLD:
# token sequences are too dissimilar; assert exact identity so that pytest -vv will
@@ -103,7 +104,9 @@ def test_GeneformerTokenizer_docstring_example() -> None:
"soma_joinid",
"cell_type_ontology_term_id",
),
max_input_tokens=2048,
special_token=False,
) as tokenizer:
dataset = tokenizer.build()
assert len(dataset) == 15020
assert sum(it.length for it in dataset.to_pandas().itertuples()) == 27798388
assert sum(it.length for it in dataset.to_pandas().itertuples()) == 27793772
4 changes: 2 additions & 2 deletions tools/census_contrib/pyproject.toml
@@ -5,8 +5,8 @@ dynamic = ["version"]
dependencies= [
"attrs",
"cattrs>=23.2.2",
"tiledbsoma==1.4.4", # IMPORTANT: this must match the Census Builder version
"cellxgene-census==1.6.0", # IMPORTANT: this must match the Census Builder version
"tiledbsoma==1.15.3", # IMPORTANT: this must match the Census Builder version
"cellxgene-census==1.15.0", # IMPORTANT: this must match the Census Builder version
"pyyaml",
"requests",
"typed-argument-parser",
21 changes: 17 additions & 4 deletions tools/census_embeddings_indexer/Dockerfile
@@ -1,9 +1,22 @@
FROM ubuntu:22.04
# TILEDB_VECTOR_SEARCH_VERSION should be the newest that doesn't need a newer version of tiledb
# than the client tiledbsoma: https://github.com/TileDB-Inc/TileDB-Vector-Search/blob/0.2.2/pyproject.toml
ARG TILEDB_VECTOR_SEARCH_VERSION=0.2.2

# TILEDB_PY_VERSION should be set such that the TileDB Embedded version will match that used by
# tiledbsoma in cellxgene_census_builder and census_contrib.
# https://github.com/single-cell-data/TileDB-SOMA/blob/1.15.3/libtiledbsoma/cmake/Modules/FindTileDB_EP.cmake#L93 (2.27.0)
# ==
# https://github.com/TileDB-Inc/TileDB-Py/blob/0.33.3/CMakeLists.txt#L49 (2.27.0)
ARG TILEDB_PY_VERSION=0.33.3
# TILEDB_VECTOR_SEARCH_VERSION should be the newest compatible with TILEDB_PY_VERSION.
# https://github.com/TileDB-Inc/TileDB-Vector-Search/blob/0.11.0/pyproject.toml#L23 (tiledb-py>=0.32.0)
ARG TILEDB_VECTOR_SEARCH_VERSION=0.11.0

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
python3-pip
RUN pip3 install \
cellxgene_census \
tiledb==$TILEDB_PY_VERSION \
tiledb-vector-search==$TILEDB_VECTOR_SEARCH_VERSION

# FIXME: monkey patch tiledb-vector-search 0.11 for https://github.com/TileDB-Inc/TileDB-Vector-Search/issues/564
# This should be removed when we update to a new version addressing that issue.
ADD ingestion.py.patch /tmp
RUN patch /usr/local/lib/python3.10/dist-packages/tiledb/vector_search/ingestion.py /tmp/ingestion.py.patch
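As a quick confirmation of the pinning logic in the comments above, one can check the TileDB Embedded (core) version each environment reports. The snippet below is a sketch: the calls are standard `tiledb`/`tiledbsoma` helpers, but treat the expected 2.27.0 value as an assumption tied to the specific pins above.

```python
# Sketch: confirm that tiledb-py inside this image links the same TileDB Embedded
# (core) version as tiledbsoma in the census builder / census_contrib environment.
import tiledb

print("tiledb-py version:", tiledb.version())        # e.g. (0, 33, 3)
print("TileDB core version:", tiledb.libtiledb.version())  # expected (2, 27, 0)

# In the environment that has tiledbsoma installed, compare against:
#   import tiledbsoma
#   tiledbsoma.show_package_versions()  # prints its TileDB core version
```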
24 changes: 11 additions & 13 deletions tools/census_embeddings_indexer/README.md
@@ -1,33 +1,31 @@
# census_embeddings_indexer

This is a Docker+WDL pipeline to build [TileDB-Vector-Search](https://github.com/TileDB-Inc/TileDB-Vector-Search) indexes for Census cell embeddings, supporting cell similarity search in embedding space. It's meant to run on the AWS HealthOmics workflow service using the [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) launcher (assuming account setup documented there).
This is a Docker+WDL pipeline to build [TileDB-Vector-Search](https://github.com/TileDB-Inc/TileDB-Vector-Search) indexes for Census cell embeddings, supporting cell similarity search in embedding space. It's meant to run on the AWS HealthOmics workflow service using the [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) launcher (`pip3 install miniwdl-omics-run`; one-time account setup steps documented there are probably already done in the relevant CZI AWS account).

The pipeline consumes one or more of the existing TileDB arrays for hosted and contributed [Census embeddings](https://cellxgene.cziscience.com/census-models) stored on S3. The resulting indexes are themselves TileDB groups to be stored on S3.
The pipeline consumes one or more of the existing TileDB arrays for [Census embeddings](https://cellxgene.cziscience.com/census-models) stored on S3. The resulting indexes are themselves TileDB groups to be stored on S3.

```bash
export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
export AWS_DEFAULT_REGION=$(aws configure get region)
export ECR_ENDPT=${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com
export WDL_OUTPUT_BUCKET=mlin-census-screatch
export WDL_OUTPUT_BUCKET=mlin-census-scratch

docker build -t ${ECR_ENDPT}/omics:census_embeddings_indexer .
docker build --platform linux/amd64 -t ${ECR_ENDPT}/omics:census_embeddings_indexer .
aws ecr get-login-password | docker login --username AWS --password-stdin "$ECR_ENDPT"
docker push ${ECR_ENDPT}/omics:census_embeddings_indexer

miniwdl-omics-run census_embeddings_indexer.wdl \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-czi-1 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-czi-4 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-czi-5 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-1 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-2 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-3 \
census_version=2023-12-15 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-6 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-7 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-8 \
embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-contrib-7 \
census_version=2024-07-01 \
s3_region=$AWS_DEFAULT_REGION \
docker=${ECR_ENDPT}/omics:census_embeddings_indexer \
--output-uri s3://${WDL_OUTPUT_BUCKET}/census_embeddings_indexer/out/ \
--role poweromics
--role poweromics --storage-capacity 4800
```

(The `embeddings_s3_uris=s3_//...` with `s3_//` instead of `s3://` is a workaround for an AWS-side existence check that doesn't seem to work right on public buckets.)

The Dockerfile has an argument for the TileDB-Vector-Search version to use. We should use the newest version that doesn't need a newer version of TileDB than the intended client tiledbsoma/cellxgene_census.
The [Dockerfile](Dockerfile) has arguments for the TileDB-Py and TileDB-Vector-Search versions to use; see comments there for guidance on setting them.
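Once an index has been built and uploaded, nearest-neighbor queries against it look roughly like the sketch below (the index URI, embedding dimensionality, and the `k`/`nprobe` values are placeholders, not taken from this repo).

```python
# Sketch: similarity search against a built IVF_FLAT index (URI and dimensions assumed).
import numpy as np
from tiledb.vector_search.ivf_flat_index import IVFFlatIndex

index = IVFFlatIndex(uri="s3://MYBUCKET/census_embeddings_indexer/out/CxG-czi-6")
query = np.random.rand(1, 512).astype(np.float32)   # one query embedding vector
distances, neighbors = index.query(query, k=25, nprobe=20)
print(neighbors[0])  # soma_joinid's of the nearest Census cells
```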
16 changes: 13 additions & 3 deletions tools/census_embeddings_indexer/census_embeddings_indexer.wdl
@@ -43,6 +43,7 @@ task indexer {
set -euxo pipefail

python3 << 'EOF'
import sys
import math
import tiledb
import tiledb.vector_search as vs
@@ -52,14 +53,18 @@ task indexer {
source_uri = "~{embeddings_s3_uri}".replace("s3_//", "s3://")
with tiledb.open(source_uri, config=config) as emb_array:
N, M = emb_array.shape
(_, N), (_, M) = emb_array.nonempty_domain() # TODO use "current domain" when supported
N += 1 # ASSUMES contiguous soma_joinid's [0, N)
M += 1
input_vectors_per_work_item = 1_500_000_000 // M # controls memory usage
print(f"N={N} M={M} input_vectors_per_work_item={input_vectors_per_work_item}", file=sys.stderr)
vs.ingest(
config=config,
source_uri=source_uri,
source_type="TILEDB_SPARSE_ARRAY",
dimensions=M,
size=N,
dimensions_override=M, # FIXME: see Dockerfile
index_type="IVF_FLAT",
index_uri="./~{embeddings_name}",
partitions=math.ceil(math.sqrt(N)),
@@ -70,8 +75,12 @@
)
final_index = vs.ivf_flat_index.IVFFlatIndex(uri="./~{embeddings_name}", memory_budget=1024*1048756)
assert final_index.size == N
print(f"VACUUM", file=sys.stderr)
final_index.vacuum()
assert final_index.size == N, f"final_index.size=={final_index.size} != N=={N}"
EOF
>&2 ls -lR '~{embeddings_name}'
>>>
runtime {
@@ -100,6 +109,7 @@ task make_one_directory {
while read -r dir; do
cp -r "$dir" '~{directory_name}/'
done < '~{manifest}'
>&2 ls -lR '~{directory_name}'
>>>
output {
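The indexer task above derives N from `nonempty_domain()` and assumes contiguous `soma_joinid`'s in `[0, N)`. A standalone pre-check along these lines (the embeddings array URI and region are placeholders) can confirm the lower bound of that assumption before launching a run:

```python
# Sketch: pre-check the contiguous-soma_joinid assumption made by the indexer task.
import tiledb

uri = "s3://cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-6"
config = tiledb.Config({"vfs.s3.region": "us-west-2"})
with tiledb.open(uri, config=config) as emb_array:
    (joinid_min, joinid_max), (_, dim_max) = emb_array.nonempty_domain()
    N, M = joinid_max + 1, dim_max + 1
    assert joinid_min == 0, "soma_joinid domain does not start at 0"
    print(f"N={N} M={M}")
```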
5 changes: 5 additions & 0 deletions tools/census_embeddings_indexer/ingestion.py.patch
@@ -0,0 +1,5 @@
56a57
> dimensions_override: int = -1,
3144a3146,3147
> if dimensions_override >= 0:
> dimensions = min(dimensions, dimensions_override)
9 changes: 3 additions & 6 deletions tools/models/geneformer/Dockerfile
@@ -10,7 +10,7 @@ RUN git lfs install

ENV GIT_SSL_NO_VERIFY=true
RUN pip install --upgrade pip setuptools setuptools_scm
RUN pip install torch torchdata --index-url https://download.pytorch.org/whl/cu118
RUN pip install torch 'torchdata<0.10' --index-url https://download.pytorch.org/whl/cu118
# ^^^ match the base image CUDA version!
RUN pip install owlready2 boto3 transformers[torch]
# workaround for unknown problem blocking `import geneformer`:
@@ -19,9 +19,9 @@ RUN pip uninstall -y transformer-engine

# Set the tiledbsoma version used to write the embeddings SparseNDArray, to ensure
# compatibility with the Census embeddings curator
ARG EMBEDDINGS_TILEDBSOMA_VERSION=1.9.5
ARG EMBEDDINGS_TILEDBSOMA_VERSION=1.11.4
ARG CELLXGENE_CENSUS_VERSION=main
ARG GENEFORMER_VERSION=57f02a4
ARG GENEFORMER_VERSION=ebc1e096

RUN mkdir /census-geneformer
WORKDIR /census-geneformer
@@ -44,6 +44,3 @@ RUN python3 -m venv --system-site-packages embeddings_tiledbsoma_venv && \
COPY helpers ./helpers
COPY *.py ./
COPY finetune-geneformer.config.yml .

# FIXME: eliminate once model is published in Geneformer repo
COPY gf-95m/ ./gf-95m/
62 changes: 35 additions & 27 deletions tools/models/geneformer/README.md
@@ -3,40 +3,60 @@
These scripts automate:

1. preparing tokenized Geneformer datasets from CELLxGENE Census (`prepare-census-geneformer-dataset.py`)
2. fine-tuning a Geneformer cell classifier model (`finetune-geneformer.py`)
3. using the fine-tuned model to generate cell embedding vectors (`generate-geneformer-embeddings.py`)
2. **(deprecated)** fine-tuning a Geneformer cell classifier model (`finetune-geneformer.py`)
3. generating cell embedding vectors given a dataset & model (`generate-geneformer-embeddings.py`)

Embedding generation is computationally intensive on large datasets (e.g. all of Census). To make this practical, a WDL workflow (`wdl/generate_embeddings.wdl`) provides a way to distribute across many compute nodes. The other steps also have WDLs for encapsulation, even though they aren't distributed.

The `Dockerfile` provides the recipe for the docker image used by the WDLs, which packages the scripts together with `cellxgene_census`, Geneformer, pytorch, etc. It also bundles `finetune-geneformer.config.yml` with various fine-tuning settings; an alternate config file can be supplied at runtime.
The `Dockerfile` provides the recipe for the docker image used by the WDLs, which packages the scripts together with `cellxgene_census`, Geneformer, pytorch, etc.

(Starting with the 2024-07-01 LTS, [Geneformer includes a model fine-tuned with CELLxGENE](https://huggingface.co/ctheodoris/Geneformer/tree/main/fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522), which we use instead of our own fine-tuning. Our historical fine-tuning code remains here for reference.)

## Example invocations

Using [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) for the Amazon HealthOmics workflow service, and assuming the docker image has been built and pushed to a suitable repository like ECR (tagged `$DOCKER_TAG`).
Using [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) for the Amazon HealthOmics workflow service, and assuming the docker image has been built and pushed to ECR (tagged `$DOCKER_TAG`).

Preparing a tokenized training dataset with 2,500 primary cells per human cell type:
Preparing a tokenized dataset for all of Census (>500GB, sharded):

```bash
miniwdl-omics-run wdl/prepare_datasets.wdl \
docker=$DOCKER_TAG \
census_version=s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/ \
N=2500 sampling_column=cell_type output_name=2500_per_cell_type \
census_version=s3://cellxgene-census-public-us-west-2/cell-census/2025-01-30/soma/ \
value_filter='is_primary_data==True or is_primary_data==False' \
output_name=2025-01-30 shards=500 --storage-capacity 4800 \
--role poweromics --output-uri s3://MYBUCKET/geneformer/datasets/
```

And a tokenized dataset for all of Census (>300GiB, sharded):
(We set `census_version` to the SOMACollection S3 URI because the HealthOmics workers don't have internet access to the Census release directory endpoint.) The run produces a folder containing 500 shard subfolders named e.g. `shard-123`, under the output URI and HealthOmics run ID.
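Each shard is a Hugging Face dataset folder; assuming the standard `datasets` on-disk format (an assumption about how `prepare-census-geneformer-dataset.py` saves its output), a downloaded shard can be inspected locally roughly like so:

```python
# Sketch: inspect one tokenized shard after copying it locally.
# Assumes the shard was written with datasets.Dataset.save_to_disk().
from datasets import load_from_disk

shard = load_from_disk("shard-123")
print(shard)  # column names and row count
print(shard[0]["length"], shard[0]["input_ids"][:10])  # token count and first tokens of one cell
```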

Generating cell embeddings (takes 8-12h on up to 500 g5.4xlarge, generates 200GB `tiledbsoma.SparseNDArray` on S3):

```bash
seq 0 499 \
| xargs -n 1 printf 'dataset_shards=s3://MYBUCKET/geneformer/datasets/1234567/out/dataset/2025-01-30/shard-%03d/\n' \
| xargs -n 9999 miniwdl-omics-run \
--role poweromics --output-uri s3://MYBUCKET/geneformer/embs \
wdl/generate_embeddings.wdl \
docker=$DOCKER_TAG \
emb_mode=cls emb_layer=0 model_type=Pretrained \
model=s3://MYBUCKET/geneformer/models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522/ \
output_uri=s3_//MYBUCKET/geneformer/embs/$(date '+%s')/2025-01-30/
```

The `model` input folder can be [copied from upstream](https://huggingface.co/ctheodoris/Geneformer/tree/main/fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522). The `s3_//MYBUCKET` is a workaround for the workflow service rejecting our submission if the specified S3 output folder doesn't yet exist; this workflow creates it using TileDB.

### (deprecated) Fine-tuning procedure

Preparing a tokenized training dataset with 2,500 primary cells per human cell type:

```bash
miniwdl-omics-run wdl/prepare_datasets.wdl \
docker=$DOCKER_TAG \
census_version=s3://cellxgene-census-public-us-west-2/cell-census/2024-05-20/soma/ \
value_filter='is_primary_data==True or is_primary_data==False' \
output_name=2024-05-20 shards=256 \
census_version=s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/ \
N=2500 sampling_column=cell_type output_name=2500_per_cell_type \
--role poweromics --output-uri s3://MYBUCKET/geneformer/datasets/
```

(We set `census_version` to the SOMACollection URI because the HealthOmics workers don't have internet access to the Census release directory endpoint.)

Fine-tuning for 8 epochs (takes ~36h on g5.8xlarge):

```bash
@@ -47,18 +67,6 @@ miniwdl-omics-run wdl/finetune_geneformer.wdl \
--role poweromics --output-uri s3://MYBUCKET/geneformer/models/
```

Generating cell embeddings (takes 8-12h on up to 256 g5.2xlarge, generates 130GiB `tiledbsoma.SparseNDArray` on S3):

```bash
seq 0 255 \
| xargs -n 1 printf 'dataset_shards=s3://MYBUCKET/geneformer/datasets/census-2024-05-20/shard-%03d/\n' \
| xargs -n 9999 miniwdl-omics-run \
--role poweromics --output-uri s3://MYBUCKET/geneformer/embs \
wdl/generate_embeddings.wdl \
docker=$DOCKER_TAG \
emb_layer=0 model_type=Pretrained \
model=s3://MYBUCKET/geneformer/gf-95m/fine_tuned_model/ \
output_uri=s3_//MYBUCKET/geneformer/embs/$(date '+%s')/census-2024-05-20/
```
Then the output model folder can be supplied to the `model` input to `generate_embeddings.wdl`.

(The `s3_//MYBUCKET` is a workaround for the workflow service rejecting our submission if the specified S3 output folder doesn't yet exist; this workflow has TileDB create it.)
To change fine-tuning parameters, customize the default `finetune-geneformer.config.yml` file and supply it via the `config` argument to `finetune_geneformer.wdl`.
1 change: 0 additions & 1 deletion tools/models/geneformer/buildspec.yml
@@ -8,7 +8,6 @@ phases:
- aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 699936264352.dkr.ecr.us-west-2.amazonaws.com
build:
commands:
- aws s3 cp s3://mlin-census-scratch/geneformer/gf-95m/ tools/models/geneformer/gf-95m/ --recursive
- docker build -t 699936264352.dkr.ecr.us-west-2.amazonaws.com/omics:census-geneformer --build-arg CELLXGENE_CENSUS_VERSION=$CODEBUILD_RESOLVED_SOURCE_VERSION tools/models/geneformer
post_build:
commands:
1 change: 1 addition & 0 deletions tools/models/geneformer/finetune-geneformer.config.yml
@@ -1,3 +1,4 @@
# DEPRECATED: see README.md
# Name of a categorical column/feature in the Dataset to use as the classifier label
label_feature: cell_subclass
# Specific labels to exclude from training and evaluation
2 changes: 2 additions & 0 deletions tools/models/geneformer/finetune-geneformer.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python3
# mypy: ignore-errors

# DEPRECATED: see README.md

import argparse
import json
import logging