Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: cuvs acceleration for gpu k-means #2816

Merged
merged 33 commits into main from jack/cuvs-accel
Sep 23, 2024
Merged
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit. Hold Shift + click to select a range.
893088e
Accelerate k-means with cuvs
jacketsj Aug 31, 2024
9907154
Accelerate k-means with cuvs
jacketsj Sep 4, 2024
2e9a064
Clean up cuvs code, add it as a dependency
jacketsj Aug 31, 2024
d3a9583
Add time logging, and clean up some param choices
jacketsj Aug 31, 2024
0795607
Change use_cuvs to default=False and use logging.info for logging
jacketsj Aug 31, 2024
d2f1b36
Remove 'time_init'
jacketsj Aug 31, 2024
498fe00
Autoformat
jacketsj Sep 4, 2024
95674fc
Revert batch size
jacketsj Aug 31, 2024
0825198
Increase itopk_size over time, up to a reasonable limit
jacketsj Aug 31, 2024
4412cb9
Adjust import methodology
jacketsj Aug 31, 2024
7d66605
Simplify imports
jacketsj Aug 31, 2024
5d59807
Run autoformatter
jacketsj Aug 31, 2024
a71879c
More formatting
jacketsj Aug 31, 2024
2318b0b
Remove f strings in logging statements
jacketsj Aug 31, 2024
e74aa83
Split line
jacketsj Aug 31, 2024
f9b1a8c
Update based on linter
jacketsj Aug 31, 2024
a33cdaf
Run correct version of autoformatter
jacketsj Aug 31, 2024
109f844
subimports for cuvs and pylibraft
jacketsj Aug 31, 2024
33c8678
Add cuvs and pylibraft to full dependencies as a temporary measure
jacketsj Aug 31, 2024
5ebf485
Sort import block
jacketsj Aug 31, 2024
7919477
Clean up commented code
jacketsj Sep 4, 2024
58c5c98
Warnings -> Errors
jacketsj Sep 5, 2024
9cc06af
Move modified kmeans module to cuvs/kmeans.py
jacketsj Sep 5, 2024
4c5eb55
Setup multiple optional cuvs dependencies for different python versions
jacketsj Sep 5, 2024
a6647b1
Fix imports, use cagra and device_ndarray directly
jacketsj Sep 6, 2024
549ad27
Add missing cuvs module init file
jacketsj Sep 6, 2024
d035431
Integrate cuvs kmeans into training/assignments for ivf
jacketsj Sep 6, 2024
75744c3
Run linter
jacketsj Sep 6, 2024
15c243a
Move import check to top of kmeans.py
jacketsj Sep 6, 2024
28259aa
Seemingly finally fix optional submodule dependencies
jacketsj Sep 7, 2024
16e81c3
Run ruff fixes
jacketsj Sep 7, 2024
a6f129f
Add missing license header
jacketsj Sep 7, 2024
48ce370
Merge branch 'main' into jack/cuvs-accel
jacketsj Sep 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Next commit
Integrate cuvs kmeans into training/assignments for ivf
jacketsj committed Sep 12, 2024
commit d035431262ee646c09d00cb6c5ea81835dcfb4b5
8 changes: 5 additions & 3 deletions python/python/lance/cuvs/kmeans.py
Original file line number Diff line number Diff line change
@@ -6,9 +6,11 @@

from lance.dependencies import numpy as np
from lance.dependencies import torch
from lance.dependencies import cagra
from lance.dependencies import device_ndarray
#from lance.dependencies import cagra
#from lance.dependencies import device_ndarray
from lance.torch.kmeans import KMeans as KMeansTorch
from pylibraft.common import device_ndarray
from cuvs.neighbors import cagra

__all__ = ["KMeans"]

@@ -70,7 +72,7 @@ def __init__(
device=device,
)

if self.device.type != "cuda":
if self.device.type != "cuda" or not torch.cuda.is_available():
raise ValueError("KMeans::__init__: cuda is not enabled/available")

self.itopk_size = itopk_size
4 changes: 2 additions & 2 deletions python/python/lance/dependencies.py
Original file line number Diff line number Diff line change
@@ -168,7 +168,7 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]:
import numpy
import pandas
import polars
import pylibraft.device_ndarray
import pylibraft.common.device_ndarray
import ray
import tensorflow
import torch
@@ -179,7 +179,7 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]:
polars, _POLARS_AVAILABLE = _lazy_import("polars")
torch, _TORCH_AVAILABLE = _lazy_import("torch")
cagra, _CAGRA_AVAILABLE = _lazy_import("cuvs.neighbors.cagra")
device_ndarray, _DEVICE_NDARRAY_AVAILABLE = _lazy_import("pylibraft.device_ndarray")
device_ndarray, _DEVICE_NDARRAY_AVAILABLE = _lazy_import("pylibraft.common.device_ndarray")
datasets, _HUGGING_FACE_AVAILABLE = _lazy_import("datasets")
tensorflow, _TENSORFLOW_AVAILABLE = _lazy_import("tensorflow")
ray, _RAY_AVAILABLE = _lazy_import("ray")
35 changes: 26 additions & 9 deletions python/python/lance/vector.py
Original file line number Diff line number Diff line change
@@ -139,7 +139,11 @@ def train_ivf_centroids_on_accelerator(
) -> (np.ndarray, str):
"""Use accelerator (GPU or MPS) to train kmeans."""
if isinstance(accelerator, str) and (
not (CUDA_REGEX.match(accelerator) or accelerator == "mps")
not (
CUDA_REGEX.match(accelerator)
or accelerator == "mps"
or accelerator == "cuvs"
)
):
raise ValueError(
"Train ivf centroids on accelerator: "
@@ -168,14 +172,27 @@ def train_ivf_centroids_on_accelerator(
cache=True,
)

logging.info("Training IVF partitions using GPU(%s)", accelerator)
kmeans = KMeans(
k,
max_iters=max_iters,
metric=metric_type,
device=accelerator,
centroids=init_centroids,
)
if accelerator == "cuvs":
logging.info("Training IVF partitions using cuVS+GPU")
print("Training IVF partitions using cuVS+GPU")
from lance.cuvs.kmeans import KMeans as KMeansCuVS

kmeans = KMeansCuVS(
k,
max_iters=max_iters,
metric=metric_type,
device="cuda",
centroids=init_centroids,
)
else:
logging.info("Training IVF partitions using GPU(%s)", accelerator)
kmeans = KMeans(
k,
max_iters=max_iters,
metric=metric_type,
device=accelerator,
centroids=init_centroids,
)
kmeans.fit(ds)

centroids = kmeans.centroids.cpu().numpy()