diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py index 7303d3bbf..2c1ba37c6 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py @@ -15,7 +15,7 @@ class GeneformerTokenizer(CellDatasetBuilder): cell in CELLxGENE Census ExperimentAxisQuery results (human). This class requires the Geneformer package to be installed separately with: - `pip install git+https://huggingface.co/ctheodoris/Geneformer@eb038a6` + `pip install git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096` Example usage: @@ -64,8 +64,8 @@ def __init__( *, obs_column_names: Sequence[str] | None = None, obs_attributes: Sequence[str] | None = None, - max_input_tokens: int = 2048, - special_token: bool = False, + max_input_tokens: int = 4096, + special_token: bool = True, token_dictionary_file: str = "", gene_median_file: str = "", gene_mapping_file: str = "", @@ -78,8 +78,8 @@ def __init__( - `obs_query`: obs AxisQuery defining the set of Census cells to process (default all) - `obs_column_names`: obs dataframe columns (cell metadata) to propagate into attributes of each Dataset item - - `max_input_tokens`: maximum length of Geneformer input token sequence (default 2048) - - `special_token`: whether to affix separator tokens to the sequence (default False) + - `max_input_tokens`: maximum length of Geneformer input token sequence (default 4096) + - `special_token`: whether to affix separator tokens to the sequence (default True) - `token_dictionary_file`, `gene_median_file`: pickle files supplying the mapping of Ensembl human gene IDs onto Geneformer token numbers and median expression values. By default, these will be loaded from the Geneformer package. 
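The new defaults above track the Geneformer 95M-parameter checkpoint (4096-token input size, `<cls>`/`<eos>` special tokens), so callers relying on the previous behavior must now pin it explicitly, as the updated docstring test below does. A minimal sketch, assuming a recent Census release and an illustrative `value_filter` (the query shown is an example, not part of this patch):

```python
import cellxgene_census
import tiledbsoma
from cellxgene_census.experimental.ml.huggingface import GeneformerTokenizer

with cellxgene_census.open_soma(census_version="stable") as census:
    with GeneformerTokenizer(
        census["census_data"]["homo_sapiens"],
        # illustrative cell selection; any obs AxisQuery works here
        obs_query=tiledbsoma.AxisQuery(
            value_filter="is_primary_data == True and tissue_general == 'tongue'"
        ),
        obs_column_names=("soma_joinid", "cell_type_ontology_term_id"),
        max_input_tokens=2048,  # pre-change default
        special_token=False,    # pre-change default
    ) as tokenizer:
        dataset = tokenizer.build()  # Hugging Face Dataset of tokenized cells
```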
@@ -120,19 +120,21 @@ def _load_geneformer_data( .set_index("soma_joinid") ) - if not (token_dictionary_file and gene_median_file): + if not (token_dictionary_file and gene_median_file and gene_mapping_file): try: import geneformer except ImportError: # pyproject.toml can't express Geneformer git+https dependency raise ImportError( "Please install Geneformer with: " - "pip install git+https://huggingface.co/ctheodoris/Geneformer@eb038a6" + "pip install git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096" ) from None if not token_dictionary_file: token_dictionary_file = geneformer.tokenizer.TOKEN_DICTIONARY_FILE if not gene_median_file: gene_median_file = geneformer.tokenizer.GENE_MEDIAN_FILE + if not gene_mapping_file: + gene_mapping_file = geneformer.tokenizer.ENSEMBL_MAPPING_FILE with open(token_dictionary_file, "rb") as f: gene_token_dict = pickle.load(f) with open(gene_median_file, "rb") as f: diff --git a/api/python/cellxgene_census/tests/experimental/ml/huggingface/test_geneformer.py b/api/python/cellxgene_census/tests/experimental/ml/huggingface/test_geneformer.py index e95992a14..952a1f314 100644 --- a/api/python/cellxgene_census/tests/experimental/ml/huggingface/test_geneformer.py +++ b/api/python/cellxgene_census/tests/experimental/ml/huggingface/test_geneformer.py @@ -77,7 +77,8 @@ def test_GeneformerTokenizer_correctness(tmpdir: Path) -> None: assert len(true_tokens) == len(cell_ids) identical = 0 for i, cell_id in enumerate(cell_ids): - assert len(test_tokens[i]) == len(true_tokens[i]) + if len(test_tokens[i]) != len(true_tokens[i]): + assert test_tokens[i] == true_tokens[i] # to show diff rho, _ = spearmanr(test_tokens[i], true_tokens[i]) if rho < RHO_THRESHOLD: # token sequences are too dissimilar; assert exact identity so that pytest -vv will @@ -103,7 +104,9 @@ def test_GeneformerTokenizer_docstring_example() -> None: "soma_joinid", "cell_type_ontology_term_id", ), + max_input_tokens=2048, + special_token=False, ) as tokenizer: dataset = tokenizer.build() assert len(dataset) == 15020 - assert sum(it.length for it in dataset.to_pandas().itertuples()) == 27798388 + assert sum(it.length for it in dataset.to_pandas().itertuples()) == 27793772 diff --git a/tools/census_contrib/pyproject.toml b/tools/census_contrib/pyproject.toml index 4b2ea2d27..ac7075a7d 100644 --- a/tools/census_contrib/pyproject.toml +++ b/tools/census_contrib/pyproject.toml @@ -5,8 +5,8 @@ dynamic = ["version"] dependencies= [ "attrs", "cattrs>=23.2.2", - "tiledbsoma==1.4.4", # IMPORTANT: this must match the Census Builder version - "cellxgene-census==1.6.0", # IMPORTANT: this must match the Census Builder version + "tiledbsoma==1.15.3", # IMPORTANT: this must match the Census Builder version + "cellxgene-census==1.15.0", # IMPORTANT: this must match the Census Builder version "pyyaml", "requests", "typed-argument-parser", diff --git a/tools/census_embeddings_indexer/Dockerfile b/tools/census_embeddings_indexer/Dockerfile index e1bab42d6..958c49ba6 100644 --- a/tools/census_embeddings_indexer/Dockerfile +++ b/tools/census_embeddings_indexer/Dockerfile @@ -1,9 +1,22 @@ FROM ubuntu:22.04 -# TILEDB_VECTOR_SEARCH_VERSION should be the newest that doesn't need a newer version of tiledb -# than the client tiledbsoma: https://github.com/TileDB-Inc/TileDB-Vector-Search/blob/0.2.2/pyproject.toml -ARG TILEDB_VECTOR_SEARCH_VERSION=0.2.2 + +# TILEDB_PY_VERSION should be set such that the TileDB Embedded version will match that used by +# tiledbsoma in cellxgene_census_builder and census_contrib. 
+# https://github.com/single-cell-data/TileDB-SOMA/blob/1.15.3/libtiledbsoma/cmake/Modules/FindTileDB_EP.cmake#L93 (2.27.0) +# == +# https://github.com/TileDB-Inc/TileDB-Py/blob/0.33.3/CMakeLists.txt#L49 (2.27.0) +ARG TILEDB_PY_VERSION=0.33.3 +# TILEDB_VECTOR_SEARCH_VERSION should be the newest compatible with TILEDB_PY_VERSION. +# https://github.com/TileDB-Inc/TileDB-Vector-Search/blob/0.11.0/pyproject.toml#L23 (tiledb-py>=0.32.0) +ARG TILEDB_VECTOR_SEARCH_VERSION=0.11.0 + RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ python3-pip RUN pip3 install \ - cellxgene_census \ + tiledb==$TILEDB_PY_VERSION \ tiledb-vector-search==$TILEDB_VECTOR_SEARCH_VERSION + +# FIXME: monkey patch tiledb-vector-search 0.11 for https://github.com/TileDB-Inc/TileDB-Vector-Search/issues/564 +# This should be removed when we update to a new version addressing that issue. +ADD ingestion.py.patch /tmp +RUN patch /usr/local/lib/python3.10/dist-packages/tiledb/vector_search/ingestion.py /tmp/ingestion.py.patch diff --git a/tools/census_embeddings_indexer/README.md b/tools/census_embeddings_indexer/README.md index cf18aa82d..b715511c4 100644 --- a/tools/census_embeddings_indexer/README.md +++ b/tools/census_embeddings_indexer/README.md @@ -1,33 +1,31 @@ # census_embeddings_indexer -This is a Docker+WDL pipeline to build [TileDB-Vector-Search](https://github.com/TileDB-Inc/TileDB-Vector-Search) indexes for Census cell embeddings, supporting cell similarity search in embedding space. It's meant to run on the AWS HealthOmics workflow service using the [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) launcher (assuming account setup documented there). +This is a Docker+WDL pipeline to build [TileDB-Vector-Search](https://github.com/TileDB-Inc/TileDB-Vector-Search) indexes for Census cell embeddings, supporting cell similarity search in embedding space. It's meant to run on the AWS HealthOmics workflow service using the [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) launcher (`pip3 install miniwdl-omics-run`; one-time account setup steps documented there are probably already done in the relevant CZI AWS account). -The pipeline consumes one or more of the existing TileDB arrays for hosted and contributed [Census embeddings](https://cellxgene.cziscience.com/census-models) stored on S3. The resulting indexes are themselves TileDB groups to be stored on S3. +The pipeline consumes one or more of the existing TileDB arrays for [Census embeddings](https://cellxgene.cziscience.com/census-models) stored on S3. The resulting indexes are themselves TileDB groups to be stored on S3. ```bash export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) export AWS_DEFAULT_REGION=$(aws configure get region) export ECR_ENDPT=${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com -export WDL_OUTPUT_BUCKET=mlin-census-screatch +export WDL_OUTPUT_BUCKET=mlin-census-scratch -docker build -t ${ECR_ENDPT}/omics:census_embeddings_indexer . +docker build --platform linux/amd64 -t ${ECR_ENDPT}/omics:census_embeddings_indexer . 
aws ecr get-login-password | docker login --username AWS --password-stdin "$ECR_ENDPT" docker push ${ECR_ENDPT}/omics:census_embeddings_indexer miniwdl-omics-run census_embeddings_indexer.wdl \ - embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-czi-1 \ - embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-czi-4 \ - embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-czi-5 \ - embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-1 \ - embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-2 \ - embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-3 \ - census_version=2023-12-15 \ + embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-6 \ + embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-7 \ + embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-czi-8 \ + embeddings_s3_uris=s3_//cellxgene-contrib-public/contrib/cell-census/soma/2024-07-01/CxG-contrib-7 \ + census_version=2024-07-01 \ s3_region=$AWS_DEFAULT_REGION \ docker=${ECR_ENDPT}/omics:census_embeddings_indexer \ --output-uri s3://${WDL_OUTPUT_BUCKET}/census_embeddings_indexer/out/ \ - --role poweromics + --role poweromics --storage-capacity 4800 ``` (The `embeddings_s3_uris=s3_//...` with `s3_//` instead of `s3://` is a workaround for an AWS-side existence check that doesn't seem to work right on public buckets.) -The Dockerfile has an argument for the TileDB-Vector-Search version to use. We should use the newest version that doesn't need a newer version of TileDB than the intended client tiledbsoma/cellxgene_census. +The [Dockerfile](Dockerfile) has arguments for the TileDB-Py and TileDB-Vector-Search versions to use; see comments there for guidance on setting them. 
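As a quick sanity check of the version alignment the Dockerfile comments describe, one can compare the TileDB core version embedded by TileDB-Py against the one tiledbsoma reports; a hedged sketch, to be run in an environment where both packages are installed (the expected 2.27.0 comes from the links in the Dockerfile comments):

```python
import tiledb
import tiledbsoma

# TileDB-Py 0.33.3 embeds TileDB core 2.27.0 (per its CMakeLists.txt);
# tiledbsoma 1.15.3 builds against the same core (per FindTileDB_EP.cmake).
print("TileDB core via TileDB-Py:", tiledb.libtiledb.version())  # e.g. (2, 27, 0)
tiledbsoma.show_package_versions()  # prints tiledbsoma's libtiledb version, among others
```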
diff --git a/tools/census_embeddings_indexer/census_embeddings_indexer.wdl b/tools/census_embeddings_indexer/census_embeddings_indexer.wdl index 4087642eb..76e5ccfbd 100644 --- a/tools/census_embeddings_indexer/census_embeddings_indexer.wdl +++ b/tools/census_embeddings_indexer/census_embeddings_indexer.wdl @@ -43,6 +43,7 @@ task indexer { set -euxo pipefail python3 << 'EOF' + import sys import math import tiledb import tiledb.vector_search as vs @@ -52,14 +53,18 @@ task indexer { source_uri = "~{embeddings_s3_uri}".replace("s3_//", "s3://") with tiledb.open(source_uri, config=config) as emb_array: - N, M = emb_array.shape + (_, N), (_, M) = emb_array.nonempty_domain() # TODO use "current domain" when supported + N += 1 # ASSUMES contiguous soma_joinid's [0, N) + M += 1 input_vectors_per_work_item = 1_500_000_000 // M # controls memory usage + print(f"N={N} M={M} input_vectors_per_work_item={input_vectors_per_work_item}", file=sys.stderr) vs.ingest( config=config, source_uri=source_uri, source_type="TILEDB_SPARSE_ARRAY", - dimensions=M, + size=N, + dimensions_override=M, # FIXME: see Dockerfile index_type="IVF_FLAT", index_uri="./~{embeddings_name}", partitions=math.ceil(math.sqrt(N)), @@ -70,8 +75,12 @@ task indexer { ) final_index = vs.ivf_flat_index.IVFFlatIndex(uri="./~{embeddings_name}", memory_budget=1024*1048756) - assert final_index.size == N + print(f"VACUUM", file=sys.stderr) + final_index.vacuum() + assert final_index.size == N, f"final_index.size=={final_index.size} != N=={N}" EOF + + >&2 ls -lR '~{embeddings_name}' >>> runtime { @@ -100,6 +109,7 @@ task make_one_directory { while read -r dir; do cp -r "$dir" '~{directory_name}/' done < '~{manifest}' + >&2 ls -lR '~{directory_name}' >>> output { diff --git a/tools/census_embeddings_indexer/ingestion.py.patch b/tools/census_embeddings_indexer/ingestion.py.patch new file mode 100644 index 000000000..27ddfe479 --- /dev/null +++ b/tools/census_embeddings_indexer/ingestion.py.patch @@ -0,0 +1,5 @@ +56a57 +> dimensions_override: int = -1, +3144a3146,3147 +> if dimensions_override >= 0: +> dimensions = min(dimensions, dimensions_override) diff --git a/tools/models/geneformer/Dockerfile b/tools/models/geneformer/Dockerfile index af92edaf2..f1472199b 100644 --- a/tools/models/geneformer/Dockerfile +++ b/tools/models/geneformer/Dockerfile @@ -10,7 +10,7 @@ RUN git lfs install ENV GIT_SSL_NO_VERIFY=true RUN pip install --upgrade pip setuptools setuptools_scm -RUN pip install torch torchdata --index-url https://download.pytorch.org/whl/cu118 +RUN pip install torch 'torchdata<0.10' --index-url https://download.pytorch.org/whl/cu118 # ^^^ match the base image CUDA version! RUN pip install owlready2 boto3 transformers[torch] # workaround for unknown problem blocking `import geneformer`: @@ -19,9 +19,9 @@ RUN pip uninstall -y transformer-engine # Set the tiledbsoma version used to write the embeddings SparseNDArray, to ensure # compatibility with the Census embeddings curator -ARG EMBEDDINGS_TILEDBSOMA_VERSION=1.9.5 +ARG EMBEDDINGS_TILEDBSOMA_VERSION=1.11.4 ARG CELLXGENE_CENSUS_VERSION=main -ARG GENEFORMER_VERSION=57f02a4 +ARG GENEFORMER_VERSION=ebc1e096 RUN mkdir /census-geneformer WORKDIR /census-geneformer @@ -44,6 +44,3 @@ RUN python3 -m venv --system-site-packages embeddings_tiledbsoma_venv && \ COPY helpers ./helpers COPY *.py ./ COPY finetune-geneformer.config.yml . 
-
-# FIXME: eliminate once model is published in Geneformer repo
-COPY gf-95m/ ./gf-95m/
diff --git a/tools/models/geneformer/README.md b/tools/models/geneformer/README.md
index c360fb382..18753d02d 100644
--- a/tools/models/geneformer/README.md
+++ b/tools/models/geneformer/README.md
@@ -3,40 +3,60 @@
 These scripts automate:
 
 1. preparing tokenized Geneformer datasets from CELLxGENE Census (`prepare-census-geneformer-dataset.py`)
-2. fine-tuning a Geneformer cell classifier model (`finetune-geneformer.py`)
-3. using the fine-tuned model to generate cell embedding vectors (`generate-geneformer-embeddings.py`)
+2. **(deprecated)** fine-tuning a Geneformer cell classifier model (`finetune-geneformer.py`)
+3. generating cell embedding vectors given a dataset & model (`generate-geneformer-embeddings.py`)
 
 Embedding generation is computationally intensive on large datasets (e.g. all of Census). To make this practical, a WDL workflow (`wdl/generate_embeddings.wdl`) provides a way to distribute across many compute nodes. The other steps also have WDLs for encapsulation, even though they aren't distributed.
 
-The `Dockerfile` provides the recipe for the docker image used by the WDLs, which packages the scripts together with `cellxgene_census`, Geneformer, pytorch, etc. It also bundles `finetune-geneformer.config.yml` with various fine-tuning settings; an alternate config file can be supplied at runtime.
+The `Dockerfile` provides the recipe for the docker image used by the WDLs, which packages the scripts together with `cellxgene_census`, Geneformer, pytorch, etc.
+
+(Starting with the 2024-07-01 LTS, [Geneformer includes a model fine-tuned with CELLxGENE](https://huggingface.co/ctheodoris/Geneformer/tree/main/fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522), which we use instead of our own fine-tuning. Our historical fine-tuning code remains here for reference.)
 
 ## Example invocations
 
-Using [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) for the Amazon HealthOmics workflow service, and assuming the docker image has been built and pushed to a suitable repository like ECR (tagged `$DOCKER_TAG`).
+Using [miniwdl-omics-run](https://github.com/miniwdl-ext/miniwdl-omics-run) for the Amazon HealthOmics workflow service, and assuming the docker image has been built and pushed to ECR (tagged `$DOCKER_TAG`).
 
-Preparing a tokenized training dataset with 2,500 primary cells per human cell type:
+Preparing a tokenized dataset for all of Census (>500GB, sharded):
 
 ```bash
 miniwdl-omics-run wdl/prepare_datasets.wdl \
     docker=$DOCKER_TAG \
-    census_version=s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/ \
-    N=2500 sampling_column=cell_type output_name=2500_per_cell_type \
+    census_version=s3://cellxgene-census-public-us-west-2/cell-census/2025-01-30/soma/ \
+    value_filter='is_primary_data==True or is_primary_data==False' \
+    output_name=2025-01-30 shards=500 --storage-capacity 4800 \
     --role poweromics --output-uri s3://MYBUCKET/geneformer/datasets/
 ```
 
-And a tokenized dataset for all of Census (>300GiB, sharded):
+(We set `census_version` to the SOMACollection S3 URI because the HealthOmics workers don't have internet access to the Census release directory endpoint.) The run produces a folder containing 500 shard subfolders named e.g. `shard-123`, under the output URI and HealthOmics run ID.
+
+Generating cell embeddings (takes 8-12h on up to 500 g5.4xlarge, generates 200GB `tiledbsoma.SparseNDArray` on S3):
+
+```bash
+seq 0 499 \
+  | xargs -n 1 printf 'dataset_shards=s3://MYBUCKET/geneformer/datasets/1234567/out/dataset/2025-01-30/shard-%03d/\n' \
+  | xargs -n 9999 miniwdl-omics-run \
+    --role poweromics --output-uri s3://MYBUCKET/geneformer/embs \
+    wdl/generate_embeddings.wdl \
+    docker=$DOCKER_TAG \
+    emb_mode=cls emb_layer=0 model_type=Pretrained \
+    model=s3://MYBUCKET/geneformer/models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522/ \
+    output_uri=s3_//MYBUCKET/geneformer/embs/$(date '+%s')/2025-01-30/
+```
+
+The `model` input folder can be [copied from upstream](https://huggingface.co/ctheodoris/Geneformer/tree/main/fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522). The `s3_//MYBUCKET` is a workaround for the workflow service rejecting our submission if the specified S3 output folder doesn't yet exist; this workflow creates it using TileDB.
+
+### (deprecated) Fine-tuning procedure
+
+Preparing a tokenized training dataset with 2,500 primary cells per human cell type:
 
 ```bash
 miniwdl-omics-run wdl/prepare_datasets.wdl \
     docker=$DOCKER_TAG \
-    census_version=s3://cellxgene-census-public-us-west-2/cell-census/2024-05-20/soma/ \
-    value_filter='is_primary_data==True or is_primary_data==False' \
-    output_name=2024-05-20 shards=256 \
+    census_version=s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/ \
+    N=2500 sampling_column=cell_type output_name=2500_per_cell_type \
     --role poweromics --output-uri s3://MYBUCKET/geneformer/datasets/
 ```
 
-(We set `census_version` to the SOMACollection URI because the HealthOmics workers don't have internet access to the Census release directory endpoint.)
-
 Fine-tuning for 8 epochs (takes ~36h on g5.8xlarge):
 
 ```bash
@@ -47,18 +67,6 @@ miniwdl-omics-run wdl/finetune_geneformer.wdl \
     --role poweromics --output-uri s3://MYBUCKET/geneformer/models/
 ```
 
-Generating cell embeddings (takes 8-12h on up to 256 g5.2xlarge, generates 130GiB `tiledbsoma.SparseNDArray` on S3):
-
-```bash
-seq 0 255 \
-  | xargs -n 1 printf 'dataset_shards=s3://MYBUCKET/geneformer/datasets/census-2024-05-20/shard-%03d/\n' \
-  | xargs -n 9999 miniwdl-omics-run \
-    --role poweromics --output-uri s3://MYBUCKET/geneformer/embs \
-    wdl/generate_embeddings.wdl \
-    docker=$DOCKER_TAG \
-    emb_layer=0 model_type=Pretrained \
-    model=s3://MYBUCKET/geneformer/gf-95m/fine_tuned_model/ \
-    output_uri=s3_//MYBUCKET/geneformer/embs/$(date '+%s')/census-2024-05-20/
-```
+Then the output model folder can be supplied as the `model` input to `generate_embeddings.wdl`.
 
-(The `s3_//MYBUCKET` is a workaround for the workflow service rejecting our submission if the specified S3 output folder doesn't yet exist; this workflow has TileDB create it.)
+To change fine-tuning parameters, customize the default `finetune-geneformer.config.yml` file and supply that as the `config` input to `finetune_geneformer.wdl`.
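For orientation, the `emb_mode`/`emb_layer`/`model_type` workflow inputs above are passed through to Geneformer's `EmbExtractor`. A rough sketch of the per-shard extraction, assuming the `EmbExtractor` API at the pinned commit (argument names and defaults may differ; the file paths are placeholders):

```python
from geneformer import EmbExtractor

# emb_mode="cls" uses the <cls> token embedding, which is why the tokenizer's
# special_token default is now True.
extractor = EmbExtractor(
    model_type="Pretrained",  # workflow input: model_type
    emb_mode="cls",           # workflow input: emb_mode
    emb_layer=0,              # workflow input: emb_layer (final hidden layer)
    max_ncells=None,          # embed every cell in the shard
)
embs = extractor.extract_embs(
    model_directory="gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522",  # placeholder local model copy
    input_data_file="shard-000",  # placeholder tokenized dataset shard
    output_directory="out",
    output_prefix="emb",
)
```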
diff --git a/tools/models/geneformer/buildspec.yml b/tools/models/geneformer/buildspec.yml index 0c439d650..140e8f139 100644 --- a/tools/models/geneformer/buildspec.yml +++ b/tools/models/geneformer/buildspec.yml @@ -8,7 +8,6 @@ phases: - aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 699936264352.dkr.ecr.us-west-2.amazonaws.com build: commands: - - aws s3 cp s3://mlin-census-scratch/geneformer/gf-95m/ tools/models/geneformer/gf-95m/ --recursive - docker build -t 699936264352.dkr.ecr.us-west-2.amazonaws.com/omics:census-geneformer --build-arg CELLXGENE_CENSUS_VERSION=$CODEBUILD_RESOLVED_SOURCE_VERSION tools/models/geneformer post_build: commands: diff --git a/tools/models/geneformer/finetune-geneformer.config.yml b/tools/models/geneformer/finetune-geneformer.config.yml index 734fa3b1a..e3c415d19 100644 --- a/tools/models/geneformer/finetune-geneformer.config.yml +++ b/tools/models/geneformer/finetune-geneformer.config.yml @@ -1,3 +1,4 @@ +# DEPRECATED: see README.md # Name of a categorical column/feature in the Dataset to use as the classifier label label_feature: cell_subclass # Specific labels to exclude from training and evaluation diff --git a/tools/models/geneformer/finetune-geneformer.py b/tools/models/geneformer/finetune-geneformer.py index cf3f3af5a..16928b5da 100644 --- a/tools/models/geneformer/finetune-geneformer.py +++ b/tools/models/geneformer/finetune-geneformer.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # mypy: ignore-errors +# DEPRECATED: see README.md + import argparse import json import logging diff --git a/tools/models/geneformer/wdl/finetune_geneformer.wdl b/tools/models/geneformer/wdl/finetune_geneformer.wdl index 6471fdd20..519a9e0a4 100644 --- a/tools/models/geneformer/wdl/finetune_geneformer.wdl +++ b/tools/models/geneformer/wdl/finetune_geneformer.wdl @@ -1,5 +1,7 @@ version development +# DEPRECATED: see ../README.md + task finetune_geneformer { input { Directory dataset diff --git a/tools/models/geneformer/wdl/generate_embeddings.wdl b/tools/models/geneformer/wdl/generate_embeddings.wdl index 2f28a41d2..52c1a0107 100644 --- a/tools/models/geneformer/wdl/generate_embeddings.wdl +++ b/tools/models/geneformer/wdl/generate_embeddings.wdl @@ -108,9 +108,9 @@ task generate_embeddings { >>> runtime { - # sizing to g5.2xlarge since EmbExtractor uses only one GPU - cpu: 8 - memory: "30G" + # sizing to g5.4xlarge; note EmbExtractor uses only one GPU + cpu: 16 + memory: "60G" gpu: true acceleratorCount: 1 acceleratorType: "nvidia-tesla-a10g" diff --git a/tools/models/scvi/README.md b/tools/models/scvi/README.md index 9d4193b43..b6dd063e0 100644 --- a/tools/models/scvi/README.md +++ b/tools/models/scvi/README.md @@ -41,4 +41,4 @@ The final selection of parameters for the training phase was based on a hyper pa ## Environment setup -The training has been performed on an AWS EC2 machine (instance type: g4dn.12xlarge), running on Ubuntu 20.04. Run [scvi-init.sh](scvi-init.sh) to set up the environment required to run the pipeline. It is also necessary to mount the instance storage as swap since the loader requires the AnnData to be in memory. You can use [this script](https://github.com/chanzuckerberg/cellxgene-census/blob/main/tools/scripts/aws/swapon_instance_storage.sh) to automatically mount the drives as swap. +The training has been performed on an AWS EC2 machine (instance type: g5.16xlarge), running on Ubuntu 20.04. 
Run [`source scvi-init.sh`](scvi-init.sh) to set up the system packages and Python virtualenv required to run the pipeline. It is also necessary to mount the instance storage as swap since the loader requires the AnnData to be in memory. You can run [this script](https://github.com/chanzuckerberg/cellxgene-census/blob/main/tools/scripts/aws/swapon_instance_storage.sh) with `sudo` to automatically mount the drives as swap.
diff --git a/tools/models/scvi/scvi-create-latent-update.py b/tools/models/scvi/scvi-create-latent-update.py
index 0a70e2596..9268ff6df 100644
--- a/tools/models/scvi/scvi-create-latent-update.py
+++ b/tools/models/scvi/scvi-create-latent-update.py
@@ -71,7 +71,7 @@
 gc.collect()
 
 with open("latent-idx.npy", "wb") as f:
-    np.save(f, idx)
+    np.save(f, idx.flatten())
 
 with open("latent.npy", "wb") as f:
     np.save(f, qz_m)
diff --git a/tools/models/scvi/scvi-init.sh b/tools/models/scvi/scvi-init.sh
index 94bda3d02..c0d9dbecd 100644
--- a/tools/models/scvi/scvi-init.sh
+++ b/tools/models/scvi/scvi-init.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Can be used to bootstrap a g4dn.* instance with scvi-tools and cellxgene-census
+# Can be used to bootstrap a {g4dn,g5}.* instance with scvi-tools and cellxgene-census
 
 export DEBIAN_FRONTEND=noninteractive
 
@@ -10,8 +10,7 @@ sudo add-apt-repository -y ppa:deadsnakes/ppa
 sudo apt -y update
 sudo apt -y install python3.11
 sudo apt -y install python3.11-venv
-curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
-sudo update-alternatives --install /usr/bin/python python /usr/bin/python2 1
+curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
 sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.8 2
 sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 3
 sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
@@ -22,13 +21,15 @@ sudo cp /usr/lib/python3/dist-packages/apt_pkg.cpython-38-x86_64-linux-gnu.so /u
 
 sudo apt -y install libnvidia-gl-535 libnvidia-common-535 libnvidia-compute-535 libnvidia-encode-535 libnvidia-decode-535 nvidia-compute-utils-535 libnvidia-fbc1-535 nvidia-driver-535
 
-ipip install --upgrade "jax[cuda11_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
-pip install pathlib torch click ray hyperopt
-pip install git+https://github.com/scverse/scvi-tools.git
+python3 -m venv scvi_venv
+source scvi_venv/bin/activate
+
+pip install --upgrade "jax[cuda11_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+pip install torch click ray hyperopt scvi-tools
 # pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
 # pip install nvidia-cusolver-cu11
 pip install scikit-misc
-pip install git+https://github.com/chanzuckerberg/cellxgene-census#subdirectory=api/python/cellxgene_census
\ No newline at end of file
+pip install git+https://github.com/chanzuckerberg/cellxgene-census#subdirectory=api/python/cellxgene_census
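On the `idx.flatten()` change in `scvi-create-latent-update.py` above: if `idx` arrives as a 2-D column vector, `np.save` persists it as 2-D, and downstream consumers expecting a flat array of cell indices mis-align. A self-contained illustration (array values are made up):

```python
import numpy as np

# A column vector of soma_joinid values, shape (3, 1); some query paths
# return indices in this form rather than as a flat array.
idx = np.array([[10], [11], [12]])

with open("latent-idx.npy", "wb") as f:
    np.save(f, idx.flatten())  # persists shape (3,) instead of (3, 1)

assert np.load("latent-idx.npy").shape == (3,)
```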