Skip to content

Commit

Permalink
min components (#642)
Browse files Browse the repository at this point in the history
  • Loading branch information
Ben Epstein authored Jun 1, 2023
1 parent 1fd112b commit 595cd16
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 4 deletions.
2 changes: 1 addition & 1 deletion dataquality/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
dataquality.get_insights()
"""

__version__ = "v0.8.46"
__version__ = "v0.8.47"

import sys
from typing import Any, List, Optional
Expand Down
3 changes: 2 additions & 1 deletion dataquality/utils/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ def get_pca_embeddings(embs: np.ndarray) -> np.ndarray:
"""
import cuml

pca = cuml.IncrementalPCA(n_components=PCA_N_COMPONENTS, batch_size=PCA_CHUNK_SIZE)
n_components = min(PCA_N_COMPONENTS, *embs.shape)
pca = cuml.IncrementalPCA(n_components=n_components, batch_size=PCA_CHUNK_SIZE)
return pca.fit_transform(embs)


Expand Down
4 changes: 2 additions & 2 deletions dataquality/utils/vaex.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,8 @@ def get_output_df(
def add_pca_to_df(df: DataFrame, chunk_size: int = PCA_CHUNK_SIZE) -> DataFrame:
"""Adds the 'emb_pca' to the dataframe"""
df_copy = df.copy()
# sklearn breaks if the dataset has fewer samples than PCA_N_COMPONENTS
n_components = min(PCA_N_COMPONENTS, len(df_copy))
# n_components must be <= num samples and <= emb dimension
n_components = min(PCA_N_COMPONENTS, *df["emb"].shape)
pca = IncrementalPCA(n_components=n_components)
for i1, i2, chunk in df_copy.evaluate_iterator("emb", chunk_size=chunk_size):
pca.partial_fit(chunk)
Expand Down

0 comments on commit 595cd16

Please sign in to comment.