Skip to content

Commit

Permalink
Less copying of local data (#217)
Browse files Browse the repository at this point in the history
* Start to use anywidget

* Update widget esm

* Remove JS stuff

* Lint

* Update

* Update

* Use esm.sh

* No-copy image.ome-zarr and anndata.zarr local support

* Fix space filepath issue, implement remote image.ome-zarr wrapper

* Lint, update tests

* Docs

* CI

* Update readme

* Constants

* Constants

* Update dryrun_environment.yml

* Update test_demos.yml

* Update test_demos.yml

* CI

* Extra file

* Update

* Add CsvWrapper

* Update

* Update docs, update notebooks, update data_utils, add ome_tiff data_utils

* Lint

* Revert out_dir

* Update constants

* Update extras_require

* Lint

* Version

* Cache js

* Update launch_vitessce_io

* Use uvicorn, add ways to stop servers, use vitessce@2.0.3-beta.0 to fix style bugs

* Lint

* Version, chainable config update

* Component -> ViewType

* Docs

* Add base url placeholder constant

* Update base_url_placeholder
  • Loading branch information
keller-mark authored Jan 16, 2023
1 parent f206823 commit a17e8cc
Show file tree
Hide file tree
Showing 30 changed files with 973 additions and 354 deletions.
18 changes: 4 additions & 14 deletions .github/workflows/test_demos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,14 @@ on: [push, pull_request]
jobs:
test_demos:
runs-on: ubuntu-22.04
defaults:
run:
shell: bash -l {0}
steps:
- uses: actions/checkout@v2
- uses: conda-incubator/setup-miniconda@v2
- uses: actions/setup-python@v2
with:
activate-environment: vitessce-python-demos-dryrun
environment-file: demos/dryrun_environment.yml
python-version: 3.8
use-mamba: true
mamba-version: "*"
auto-activate-base: false
channels: conda-forge,bioconda,defaults
channel-priority: true
python-version: '3.8'
- run: |
conda info
conda list
pip install snakemake
pip install pyyaml
- name: Install vitessce
run: pip install -e .[testing]
- name: Run tests
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# vitessce-python

[![PyPI](https://img.shields.io/pypi/v/vitessce)](https://pypi.org/project/vitessce)
[![Vitessce JS dependency version](https://img.shields.io/badge/dynamic/json.svg?url=https%3A%2F%2Fraw.githubusercontent.com%2Fvitessce%2Fvitessce-python%2Fmain%2Fjs%2Fpackage.json&label=vitessce&query=$.dependencies.vitessce&colorB=blue)](https://github.com/vitessce/vitessce/blob/main/CHANGELOG.md)
[![docs](https://img.shields.io/badge/docs-📖-57B4E9.svg)](https://vitessce.github.io/vitessce-python/)

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/vitessce/vitessce-python/main?filepath=docs/notebooks/widget_pbmc.ipynb)
Expand Down
8 changes: 0 additions & 8 deletions demos/dryrun_environment.yml

This file was deleted.

14 changes: 4 additions & 10 deletions demos/human-lymph-node-10x-visium/src/create_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@
import scanpy as sc
import numpy as np
import scipy.cluster
from scipy import sparse
from vitessce.data_utils import (
to_diamond,
rgb_img_to_ome_zarr,
optimize_adata,
to_dense,
)


Expand Down Expand Up @@ -60,12 +58,6 @@ def get_orig_index(gene_id):
# Create a new *ordered* gene expression dataframe.
adata = adata[:, var_index_ordering].copy()
adata.obsm["X_hvg"] = adata[:, adata.var['highly_variable']].X.copy()
# Vitessce plays nicely with dense matrices saved with chunking
# and this one is small enough that dense is not a huge overhead.
# TODO: automate conversion to csc in optimize_adata function
if isinstance(adata.X, sparse.spmatrix):
adata.X = adata.X.todense()
# adata.obsm["X_hvg"] = adata[:, adata.var['highly_variable']].X.copy()

# It is unclear what exact scale factor is required to align
# the spots to the image. Through trial and error / manual binary search
Expand All @@ -91,9 +83,11 @@ def get_orig_index(gene_id):
obs_cols=["clusters"],
var_cols=["highly_variable"],
obsm_keys=["X_hvg", "spatial", "segmentations", "X_umap", "X_pca"],
preserve_X=True,
optimize_X=True,
# Vitessce plays nicely with dense matrices saved with chunking
# and this one is small enough that dense is not a huge overhead.
to_dense_X=True,
)
adata.X = np.array(to_dense(adata.X))
adata.write_zarr(output_adata, chunks=[adata.shape[0], 10])


Expand Down
11 changes: 2 additions & 9 deletions demos/kuppe-2022/src/convert_to_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pandas as pd
import json
from anndata import read_h5ad, AnnData
from scipy import sparse
import imageio.v2 as imageio
from vitessce.data_utils import (
to_diamond,
Expand Down Expand Up @@ -48,13 +47,7 @@ def process_h5ad_files(args):
rna_adata.layers['X_uint8'] = to_uint8(rna_adata.X, norm_along="global")
atac_adata.layers['X_uint8'] = to_uint8(atac_adata.X, norm_along="global")

# TODO: automate conversion to csc in optimize_adata function
visium_adata.layers['X_uint8'] = to_uint8(visium_adata.X, norm_along="var")
# Vitessce plays nicely with csc at the moment but not csr.
if isinstance(rna_adata.X, sparse.spmatrix):
rna_adata.X = rna_adata.X.tocsc()
if isinstance(atac_adata.X, sparse.spmatrix):
atac_adata.X = atac_adata.X.tocsc()

joint_cols = ['cell_type', 'development_stage', 'disease', 'sex']
joint_obs_df = pd.concat([
Expand All @@ -75,15 +68,15 @@ def process_h5ad_files(args):
obsm_keys=["X_umap", "X_pca"],
var_cols=["feature_name"],
layer_keys=["X_uint8"],
preserve_X=True,
optimize_X=False,
)

atac_adata = optimize_adata(
atac_adata,
obsm_keys=["X_umap"],
var_cols=["feature_name"],
layer_keys=["X_uint8"],
preserve_X=True,
optimize_X=False,
)

# Use chunks in case data is not sparse.
Expand Down
6 changes: 0 additions & 6 deletions demos/marshall-2022/src/convert_to_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from anndata import read_h5ad
import numpy as np
import scanpy as sc
from scipy import sparse
from vitessce.data_utils import (
to_diamond,
to_uint8,
Expand Down Expand Up @@ -38,17 +37,12 @@ def convert_h5ad_to_zarr(input_path, output_path):
for i in range(num_cells):
adata.obsm['X_segmentations'][i, :, :] = to_diamond(adata.obsm['X_spatial'][i, 0], adata.obsm['X_spatial'][i, 1], radius)

# TODO: automate conversion to csc in optimize_adata function
if isinstance(adata.X, sparse.spmatrix):
adata.X = adata.X.tocsc()

adata = optimize_adata(
adata,
obs_cols=["cell_type"],
var_cols=["feature_name"],
obsm_keys=["X_hvg", "X_hvg_uint8", "X_umap", "X_spatial", "X_segmentations"],
layer_keys=[],
preserve_X=True
)

adata.write_zarr(output_path, chunks=[adata.shape[0], 10])
Expand Down
8 changes: 8 additions & 0 deletions docs/api_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,12 @@ vitessce.export
*****************

.. automodule:: vitessce.export
:members:

vitessce.data_utils
*******************

.. automodule:: vitessce.data_utils.ome
:members:
.. automodule:: vitessce.data_utils.anndata
:members:
3 changes: 2 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@
'sphinx.ext.linkcode',
'sphinx.ext.intersphinx',
'sphinx_rtd_theme',
'nbsphinx'
'nbsphinx',
'IPython.sphinxext.ipython_console_highlighting'
]

# Add any paths that contain templates here, relative to this directory.
Expand Down
3 changes: 1 addition & 2 deletions docs/data_examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,4 @@ Data preparation examples
:maxdepth: 2

notebooks/data_export_s3
notebooks/data_export_files
notebooks/data_conversion
notebooks/data_export_files
31 changes: 27 additions & 4 deletions docs/notebooks/data_export_files.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
"import os\n",
"import json\n",
"from urllib.parse import quote_plus\n",
"from os.path import join\n",
"from os.path import join, isfile, isdir\n",
"from urllib.request import urlretrieve\n",
"from anndata import read_h5ad\n",
"import scanpy as sc\n",
Expand All @@ -45,6 +45,10 @@
" Component as cm,\n",
" CoordinationType as ct,\n",
" AnnDataWrapper,\n",
")\n",
"from vitessce.data_utils import (\n",
" optimize_adata,\n",
" VAR_CHUNK_SIZE,\n",
")"
]
},
Expand All @@ -63,9 +67,10 @@
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(\"data\", exist_ok=True)\n",
"adata_filepath = join(\"data\", \"habib17.processed.h5ad\")\n",
"urlretrieve('https://covid19.cog.sanger.ac.uk/habib17.processed.h5ad', adata_filepath)\n",
"if not isfile(adata_filepath):\n",
" os.makedirs(\"data\", exist_ok=True)\n",
" urlretrieve('https://covid19.cog.sanger.ac.uk/habib17.processed.h5ad', adata_filepath)\n",
"\n",
"adata = read_h5ad(adata_filepath)\n",
"top_dispersion = adata.var[\"dispersions_norm\"][\n",
Expand All @@ -79,6 +84,24 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"zarr_filepath = join(\"data\", \"habib17.processed.zarr\")\n",
"if not isdir(zarr_filepath):\n",
" adata = optimize_adata(\n",
" adata,\n",
" obs_cols=[\"CellType\"],\n",
" obsm_keys=[\"X_umap\"],\n",
" var_cols=[\"top_highly_variable\"],\n",
" optimize_X=True,\n",
" )\n",
" adata.write_zarr(zarr_filepath, chunks=[adata.shape[0], VAR_CHUNK_SIZE])"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -101,7 +124,7 @@
"source": [
"vc = VitessceConfig(schema_version=\"1.0.15\", name='Habib et al', description='COVID-19 Healthy Donor Brain')\n",
"dataset = vc.add_dataset(name='Brain').add_object(AnnDataWrapper(\n",
" adata,\n",
" adata_path=zarr_filepath,\n",
" obs_embedding_paths=[\"obsm/X_umap\"],\n",
" obs_embedding_names=[\"UMAP\"],\n",
" obs_set_paths=[\"obs/CellType\"],\n",
Expand Down
31 changes: 27 additions & 4 deletions docs/notebooks/data_export_s3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"import boto3\n",
"import json\n",
"from urllib.parse import quote_plus\n",
"from os.path import join\n",
"from os.path import join, isfile, isdir\n",
"from urllib.request import urlretrieve\n",
"from anndata import read_h5ad\n",
"import scanpy as sc\n",
Expand All @@ -46,6 +46,10 @@
" Component as cm,\n",
" CoordinationType as ct,\n",
" AnnDataWrapper,\n",
")\n",
"from vitessce.data_utils import (\n",
" optimize_adata,\n",
" VAR_CHUNK_SIZE,\n",
")"
]
},
Expand All @@ -64,9 +68,10 @@
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(\"data\", exist_ok=True)\n",
"adata_filepath = join(\"data\", \"habib17.processed.h5ad\")\n",
"urlretrieve('https://covid19.cog.sanger.ac.uk/habib17.processed.h5ad', adata_filepath)\n",
"if not isfile(adata_filepath):\n",
" os.makedirs(\"data\", exist_ok=True)\n",
" urlretrieve('https://covid19.cog.sanger.ac.uk/habib17.processed.h5ad', adata_filepath)\n",
"\n",
"adata = read_h5ad(adata_filepath)\n",
"top_dispersion = adata.var[\"dispersions_norm\"][\n",
Expand All @@ -80,6 +85,24 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"zarr_filepath = join(\"data\", \"habib17.processed.zarr\")\n",
"if not isdir(zarr_filepath):\n",
" adata = optimize_adata(\n",
" adata,\n",
" obs_cols=[\"CellType\"],\n",
" obsm_keys=[\"X_umap\"],\n",
" var_cols=[\"top_highly_variable\"],\n",
" optimize_X=True,\n",
" )\n",
" adata.write_zarr(zarr_filepath, chunks=[adata.shape[0], VAR_CHUNK_SIZE])"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -102,7 +125,7 @@
"source": [
"vc = VitessceConfig(schema_version=\"1.0.15\", name='Habib et al', description='COVID-19 Healthy Donor Brain')\n",
"dataset = vc.add_dataset(name='Brain').add_object(AnnDataWrapper(\n",
" adata,\n",
" adata_path=zarr_filepath,\n",
" obs_embedding_paths=[\"obsm/X_umap\"],\n",
" obs_embedding_names=[\"UMAP\"],\n",
" obs_set_paths=[\"obs/CellType\"],\n",
Expand Down
Loading

0 comments on commit a17e8cc

Please sign in to comment.