min components (#642)

rungalileo · Jun 1, 2023 · 595cd16 · 595cd16
1 parent 1fd112b
commit 595cd16
Show file tree

Hide file tree

Showing 3 changed files with 5 additions and 4 deletions.
diff --git a/dataquality/__init__.py b/dataquality/__init__.py
@@ -30,7 +30,7 @@
         dataquality.get_insights()
 """
 
-__version__ = "v0.8.46"
+__version__ = "v0.8.47"
 
 import sys
 from typing import Any, List, Optional

diff --git a/dataquality/utils/cuda.py b/dataquality/utils/cuda.py
@@ -20,7 +20,8 @@ def get_pca_embeddings(embs: np.ndarray) -> np.ndarray:
     """
     import cuml
 
-    pca = cuml.IncrementalPCA(n_components=PCA_N_COMPONENTS, batch_size=PCA_CHUNK_SIZE)
+    n_components = min(PCA_N_COMPONENTS, *embs.shape)
+    pca = cuml.IncrementalPCA(n_components=n_components, batch_size=PCA_CHUNK_SIZE)
     return pca.fit_transform(embs)
 
 

diff --git a/dataquality/utils/vaex.py b/dataquality/utils/vaex.py
@@ -224,8 +224,8 @@ def get_output_df(
 def add_pca_to_df(df: DataFrame, chunk_size: int = PCA_CHUNK_SIZE) -> DataFrame:
     """Adds the 'emb_pca' to the dataframe"""
     df_copy = df.copy()
-    # sklearn breaks if a the dataset has less samples than PCA_N_COMPONENTS
-    n_components = min(PCA_N_COMPONENTS, len(df_copy))
+    # n_components must be <= num samples and <= emb dimension
+    n_components = min(PCA_N_COMPONENTS, *df["emb"].shape)
     pca = IncrementalPCA(n_components=n_components)
     for i1, i2, chunk in df_copy.evaluate_iterator("emb", chunk_size=chunk_size):
         pca.partial_fit(chunk)