diff --git a/404.html b/404.html index 99c132e..34b6aca 100644 --- a/404.html +++ b/404.html @@ -146,7 +146,7 @@
  • - + Finetuners
  • @@ -254,7 +254,7 @@
  • - + Finetuners
  • diff --git a/API/external/index.html b/API/external/index.html index 1329dca..77d8e78 100644 --- a/API/external/index.html +++ b/API/external/index.html @@ -155,7 +155,7 @@
  • - + Finetuners
  • @@ -265,7 +265,7 @@
  • - + Finetuners
  • @@ -414,6 +414,13 @@ OpenAIEncoder + + +
  • + + AzureOpenAIEncoder + +
  • @@ -501,7 +508,7 @@

    OpenAIEncoder

    Encoder that can numerically encode sentences.

Note that this is an external embedding provider. If their API breaks, so will this component. We also assume that you've already imported openai upfront and run this command:

    -

    This encoder will require the OPENAI_ORG and OPENAI_KEY environment variables to be set. +

    This encoder will require the OPENAI_API_KEY (optionally OPENAI_ORG_ID and OPENAI_PROJECT_ID) environment variable to be set. If you have it defined in your .env file, you can use python-dotenv to load it.

    You also need to install the openai library beforehand.

    python -m pip install openai
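For example, a minimal setup could look like the sketch below; the key value is a placeholder and the `.env` file name is just the python-dotenv convention.

```python
# contents of .env (placeholder value, never commit a real key)
# OPENAI_API_KEY=sk-your-key-here

from dotenv import load_dotenv
from embetter.external import OpenAIEncoder

load_dotenv()  # loads OPENAI_API_KEY from .env into the environment

enc = OpenAIEncoder(model="text-embedding-ada-002", batch_size=25)
X = enc.transform(["a first sentence", "a second sentence"])
```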
    @@ -549,13 +556,13 @@ 

    OpenAIEncoder

    Usage:

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.external import OpenAIEncoder
    -from dotenv import load_dotenv
    +from embetter.grab import ColumnGrabber
    +from embetter.external import OpenAIEncoder
    +from dotenv import load_dotenv
     
     load_dotenv()  # take environment variables from .env.
     
    @@ -586,8 +593,7 @@ 

    OpenAIEncoder

    Source code in embetter/external/_openai.py -
    @@ -658,14 +664,14 @@ 

    OpenAIEncoder

class OpenAIEncoder(EmbetterBase):
         """
         Encoder that can numerically encode sentences.
     
         Note that this is an **external** embedding provider. If their API breaks, so will this component.
    We also assume that you've already imported openai upfront and run this command:
     
    -    This encoder will require the `OPENAI_ORG` and `OPENAI_KEY` environment variables to be set.
    +    This encoder will require the `OPENAI_API_KEY` (optionally `OPENAI_ORG_ID` and `OPENAI_PROJECT_ID`) environment variable to be set.
         If you have it defined in your `.env` file, you can use python-dotenv to load it.
     
         You also need to install the `openai` library beforehand.
    @@ -717,19 +723,18 @@ 

    OpenAIEncoder

    ``` """ - def __init__(self, model="text-embedding-ada-002", batch_size=25): + def __init__(self, model="text-embedding-ada-002", batch_size=25): # You must run this first! - openai.organization = os.getenv("OPENAI_ORG") - openai.api_key = os.getenv("OPENAI_KEY") + self.client = OpenAI() self.model = model self.batch_size = batch_size - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the text into a numeric representation.""" result = [] for b in _batch(X, self.batch_size): - resp = openai.Embedding.create(input=b, model=self.model) # fmt: off - result.extend([_["embedding"] for _ in resp["data"]]) + resp = self.client.embeddings.create(input=b, model=self.model) # fmt: off + result.extend([_.embedding for _ in resp.data]) return np.array(result)
    @@ -748,6 +753,257 @@

    OpenAIEncoder

    +

    AzureOpenAIEncoder

    +

    + Bases: OpenAIEncoder


    Encoder that can numerically encode sentences.

    +

    Note that this is an external embedding provider. If their API breaks, so will this component.

    +

To use this encoder you must provide credentials. Please provide one of the api_key, azure_ad_token, azure_ad_token_provider arguments, or the AZURE_OPENAI_API_KEY or AZURE_OPENAI_AD_TOKEN environment variables. +You must provide one of the base_url or azure_endpoint arguments, or the AZURE_OPENAI_ENDPOINT environment variable. +Furthermore, you must provide either the api_version argument or the OPENAI_API_VERSION environment variable.

    +

If you have your environment variables defined in your .env file, you can use python-dotenv to load them.

    +

    You also need to install the openai library beforehand.

    +
    python -m pip install openai
    +

    Parameters:

model: name of model. (required)
batch_size: Batch size to send to AzureOpenAI. (required)
    +

    Usage:

    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
    +
    +from embetter.grab import ColumnGrabber
    +from embetter.external import AzureOpenAIEncoder
    +from dotenv import load_dotenv
    +
    +load_dotenv()  # take environment variables from .env.
    +
    +# Let's suppose this is the input dataframe
    +dataf = pd.DataFrame({
    +    "text": ["positive sentiment", "super negative"],
    +    "label_col": ["pos", "neg"]
    +})
    +
    +# This pipeline grabs the `text` column from a dataframe
    +# which then get fed into OpenAI's endpoint
    +text_emb_pipeline = make_pipeline(
    +    ColumnGrabber("text"),
    +    AzureOpenAIEncoder()
    +)
    +X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
    +
    +# This pipeline can also be trained to make predictions, using
    +# the embedded features.
    +text_clf_pipeline = make_pipeline(
    +    text_emb_pipeline,
    +    LogisticRegression()
    +)
    +
    +# Prediction example
    +text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    +
    + +
    + Source code in embetter/external/_openai.py +
    class AzureOpenAIEncoder(OpenAIEncoder):
    +    """
    +    Encoder that can numerically encode sentences.
    +
    +    Note that this is an *external* embedding provider. If their API breaks, so will this component.
    +
+    To use this encoder you must provide credentials. Please provide one of the `api_key`, `azure_ad_token`, `azure_ad_token_provider` arguments, or the `AZURE_OPENAI_API_KEY` or `AZURE_OPENAI_AD_TOKEN` environment variables.
+    You must provide one of the `base_url` or `azure_endpoint` arguments, or the `AZURE_OPENAI_ENDPOINT` environment variable.
+    Furthermore, you must provide either the `api_version` argument or the `OPENAI_API_VERSION` environment variable.
+
+    If you have your environment variables defined in your `.env` file, you can use python-dotenv to load them.
    +
    +    You also need to install the `openai` library beforehand.
    +
    +    ```
    +    python -m pip install openai
    +    ```
    +
    +    Arguments:
    +        model: name of model.
    +        batch_size: Batch size to send to AzureOpenAI.
    +
    +    *Usage*:
    +
    +    ```python
    +    import pandas as pd
    +    from sklearn.pipeline import make_pipeline
    +    from sklearn.linear_model import LogisticRegression
    +
    +    from embetter.grab import ColumnGrabber
    +    from embetter.external import AzureOpenAIEncoder
    +    from dotenv import load_dotenv
    +
    +    load_dotenv()  # take environment variables from .env.
    +
    +    # Let's suppose this is the input dataframe
    +    dataf = pd.DataFrame({
    +        "text": ["positive sentiment", "super negative"],
    +        "label_col": ["pos", "neg"]
    +    })
    +
    +    # This pipeline grabs the `text` column from a dataframe
    +    # which then get fed into OpenAI's endpoint
    +    text_emb_pipeline = make_pipeline(
    +        ColumnGrabber("text"),
    +        AzureOpenAIEncoder()
    +    )
    +    X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
    +
    +    # This pipeline can also be trained to make predictions, using
    +    # the embedded features.
    +    text_clf_pipeline = make_pipeline(
    +        text_emb_pipeline,
    +        LogisticRegression()
    +    )
    +
    +    # Prediction example
    +    text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
    +    ```
    +    """
    +
+    def __init__(self, model="text-embedding-ada-002", batch_size=25, **kwargs):
    +        self.model = model
    +        self.batch_size = batch_size
    +        self.client = AzureOpenAI(**kwargs)
    +
    +
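As a rough sketch of how the keyword arguments flow through to the underlying `AzureOpenAI` client, you could instantiate the encoder with explicit credentials as below. The endpoint, key, and API version values are placeholders, and the `model` is assumed to match an embeddings deployment in your Azure resource.

```python
import os
from embetter.external import AzureOpenAIEncoder

# All concrete values below are placeholders for illustration.
enc = AzureOpenAIEncoder(
    model="text-embedding-ada-002",                      # your embeddings deployment name
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version="2023-05-15",
)
X = enc.transform(["positive sentiment", "super negative"])
```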
    @@ -798,16 +1054,29 @@

    OpenAIEncoder

'large'
batch_size: Batch size to send to Cohere. (default: 10)

    Usage:

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.external import CohereEncoder
    -from dotenv import load_dotenv
    +from embetter.grab import ColumnGrabber
    +from embetter.external import CohereEncoder
    +from dotenv import load_dotenv
     
     load_dotenv()  # take environment variables from .env.
     
    @@ -838,8 +1107,7 @@ 

    OpenAIEncoder

    Source code in embetter/external/_cohere.py -
    @@ -907,7 +1175,10 @@ 

    OpenAIEncoder

class CohereEncoder(EmbetterBase):
         """
         Encoder that can numerically encode sentences.
     
    @@ -924,6 +1195,7 @@ 

    OpenAIEncoder

    Arguments: model: name of model, can be "small" or "large" + batch_size: Batch size to send to Cohere. **Usage**: @@ -964,16 +1236,17 @@

    OpenAIEncoder

    ``` """ - def __init__(self, model="large"): - from cohere import Client + def __init__(self, model="large", batch_size=10): + from cohere import Client self.client = Client(os.getenv("COHERE_KEY")) self.model = model + self.batch_size = batch_size - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the text into a numeric representation.""" result = [] - for b in _batch(X, 10): + for b in _batch(X, self.batch_size): response = self.client.embed(b) result.extend(response.embeddings) return np.array(result) diff --git a/API/finetune/index.html b/API/finetune/index.html index eb9e40b..d291c20 100644 --- a/API/finetune/index.html +++ b/API/finetune/index.html @@ -155,7 +155,7 @@
  • - + Finetuners
  • @@ -265,7 +265,7 @@
  • - + Finetuners
  • @@ -626,7 +626,16 @@

    FeedForwardTuner

class FeedForwardTuner(BaseEstimator, TransformerMixin):
         """
         Create a feed forward model to finetune the embeddings towards a class.
     
    @@ -636,17 +645,20 @@ 

    FeedForwardTuner

    learning_rate: The learning rate of the feed forward model """ - def __init__(self, hidden_dim=50, n_epochs=500, learning_rate=0.01) -> None: + def __init__( + self, hidden_dim=50, n_epochs=500, learning_rate=0.01, batch_size=32 + ) -> None: self.hidden_dim = hidden_dim self.n_epochs = n_epochs self.learning_rate = learning_rate + self.batch_size = batch_size self.label_enc = LabelEncoder() - def fit(self, X, y): + def fit(self, X, y): """Fits the finetuner.""" return self.partial_fit(X, y, classes=np.unique(y)) - def partial_fit(self, X, y, classes=None): + def partial_fit(self, X, y, classes=None): """Fits the finetuner using the partial_fit API.""" if not hasattr(self, "_classes"): if classes is None: @@ -667,16 +679,22 @@

    FeedForwardTuner

    torch_X = torch.from_numpy(X).detach().float() torch_y = torch.from_numpy(self.label_enc.transform(y)).detach() + dataset = torch.utils.data.TensorDataset(torch_X, torch_y) + dataloader = torch.utils.data.DataLoader( + dataset, batch_size=self.batch_size, shuffle=True + ) + for _ in range(self.n_epochs): - self._optimizer.zero_grad() - out = self._model(torch_X) - loss = self._criterion(out, torch_y) - loss.backward() - self._optimizer.step() + for batch_X, batch_y in dataloader: + self._optimizer.zero_grad() + out = self._model(batch_X) + loss = self._criterion(out, batch_y) + loss.backward() + self._optimizer.step() return self - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the data according to the sklearn api by using the hidden layer.""" Xt = torch.from_numpy(X).float().detach() return self._model.embed(Xt).detach().numpy() @@ -840,7 +858,7 @@
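Putting the new `batch_size` argument together with the rest of the API, a small sketch with made-up numpy data might look like this:

```python
import numpy as np
from embetter.finetune import FeedForwardTuner

# Hypothetical toy data: 100 embeddings of dimension 384 with binary labels.
X = np.random.randn(100, 384).astype(np.float32)
y = np.random.randint(0, 2, size=100)

tuner = FeedForwardTuner(hidden_dim=50, n_epochs=50, learning_rate=0.01, batch_size=32)
X_tuned = tuner.fit(X, y).transform(X)  # embeddings taken from the hidden layer
```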

    FeedForwardTuner

class ContrastiveTuner(BaseEstimator, TransformerMixin):
         """
         Run a contrastive network to finetune the embeddings towards a class.
     
    @@ -851,7 +869,7 @@ 

    FeedForwardTuner

    learning_rate: learning rate of the contrastive network """ - def __init__(self, hidden_dim=50, n_neg=3, epochs=20, learning_rate=0.001) -> None: + def __init__(self, hidden_dim=50, n_neg=3, epochs=20, learning_rate=0.001) -> None: self.learner = ContrastiveLearner( shape_out=hidden_dim, batch_size=256, @@ -863,11 +881,11 @@

    FeedForwardTuner

    self.epochs = epochs self.learning_rate = learning_rate - def fit(self, X, y): + def fit(self, X, y): """Fits the finetuner.""" return self.partial_fit(X, y, classes=np.unique(y)) - def generate_batch(self, X_torch, y): + def generate_batch(self, X_torch, y): """Generate a batch of pytorch pairs used for finetuning""" pairs = generate_pairs_batch(y, n_neg=self.n_neg) X1 = torch.zeros(len(pairs), X_torch.shape[1]) @@ -878,7 +896,7 @@

    FeedForwardTuner

    X2[i] = X_torch[pair.i2] return X1, X2, labels - def partial_fit(self, X, y, classes=None): + def partial_fit(self, X, y, classes=None): """Fits the finetuner using the partial_fit API.""" if not hasattr(self, "_classes"): if classes is None: @@ -893,7 +911,7 @@

    FeedForwardTuner

    return self - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the data according to the sklearn api by using the hidden layer.""" return self.learner.transform(X)
    @@ -1001,14 +1019,14 @@

    FeedForwardTuner

    Usage:

    -
    from sentence_transformers import SentenceTransformer
    -from embetter.finetune import ContrastiveLearner
    -import random
    +
    from sentence_transformers import SentenceTransformer
    +from embetter.finetune import ContrastiveLearner
    +import random
     
     sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
     learner = SbertLearner(sent_tfm)
     
    -def sample_generator(examples, n_neg=3):
    +def sample_generator(examples, n_neg=3):
         # A generator that assumes examples to be a dictionary of the shape
         # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
         # this is typically a function that's very custom to your use-case though
    @@ -1146,7 +1164,13 @@ 

    FeedForwardTuner

class ContrastiveLearner:
         """
         A learner model that can finetune on pairs of data on top of numeric embeddings.
     
    @@ -1202,7 +1226,7 @@ 

    FeedForwardTuner

After a learner is done training it can be used inside of a scikit-learn pipeline as you normally would. """ - def __init__( + def __init__( self, shape_out: int = 300, batch_size: int = 16, @@ -1215,7 +1239,7 @@

    FeedForwardTuner

    self.epochs = epochs self.shape_out = shape_out - def fit(self, X1, X2, y): + def fit(self, X1, X2, y): """Finetune an Sbert model based on similarities between two sets of texts.""" self.network_ = ContrastiveNetwork( shape_in=X1.shape[1], hidden_dim=self.shape_out @@ -1227,29 +1251,35 @@

    FeedForwardTuner

    X2_torch = torch.from_numpy(X2).detach().float() y_torch = torch.from_numpy(np.array(y)).detach().float() + dataset = torch.utils.data.TensorDataset(X1_torch, X2_torch, y_torch) + dataloader = torch.utils.data.DataLoader( + dataset, batch_size=self.batch_size, shuffle=True + ) + for _ in range(self.epochs): # loop over the dataset multiple times - # zero the parameter gradients - optimizer.zero_grad() - - # forward + backward + optimize - cos_sim = self.network_(X1_torch, X2_torch) - loss = criterion(cos_sim, y_torch) - loss.backward() - optimizer.step() + for batch_X1, batch_X2, batch_y in dataloader: + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + cos_sim = self.network_(batch_X1, batch_X2) + loss = criterion(cos_sim, batch_y) + loss.backward() + optimizer.step() return self - def transform(self, X, y=None): + def transform(self, X, y=None): """Encode a single batch of inputs.""" X_torch = torch.from_numpy(X).detach().float() return self.network_.embed(X_torch).detach().numpy() - def predict(self, X1, X2): + def predict(self, X1, X2): """Predicts the cosine similarity.""" emb1 = self.transform(X1) emb2 = self.transform(X2) return np.array(CosineSimilarity()(emb1, emb2)) - def to_disk(self, path): + def to_disk(self, path): """Save the finetuned Sbert model.""" self.sent_tfm.save(path=path)
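Reading the source above, a compact usage sketch with made-up numeric pairs could look like this; the shapes and labels are arbitrary:

```python
import numpy as np
from embetter.finetune import ContrastiveLearner

# Hypothetical pairs of 300-dim embeddings with a 0/1 "similar?" label per pair.
X1 = np.random.randn(50, 300).astype(np.float32)
X2 = np.random.randn(50, 300).astype(np.float32)
y = np.random.randint(0, 2, size=50)

learner = ContrastiveLearner(shape_out=128, batch_size=16, epochs=20)
learner.fit(X1, X2, y)
emb = learner.transform(X1)  # finetuned embeddings for one side of the pairs
```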
    @@ -1359,14 +1389,14 @@

    FeedForwardTuner

    Usage:

    -
    from sentence_transformers import SentenceTransformer
    -from embetter.finetune import SbertLearner
    -import random
    +
    from sentence_transformers import SentenceTransformer
    +from embetter.finetune import SbertLearner
    +import random
     
     sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
     learner = SbertLearner(sent_tfm)
     
    -def sample_generator(examples, n_neg=3):
    +def sample_generator(examples, n_neg=3):
         # A generator that assumes examples to be a dictionary of the shape
         # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
         # this is typically a function that's very custom to your use-case though
    @@ -1494,7 +1524,7 @@ 

    FeedForwardTuner

class SbertLearner:
         """
         A learner model that can finetune on pairs of data that leverages SBERT under the hood.
     
    @@ -1550,7 +1580,7 @@ 

    FeedForwardTuner

After a learner is done training it can be used inside of a scikit-learn pipeline as you normally would. """ - def __init__( + def __init__( self, sent_tfm: SentenceTransformer, batch_size: int = 16, @@ -1562,7 +1592,7 @@

    FeedForwardTuner

    self.epochs = epochs self.warmup_steps = warmup_steps - def fit(self, X1, X2, y): + def fit(self, X1, X2, y): """Finetune an Sbert model based on similarities between two sets of texts.""" train_examples = [ InputExample(texts=[x1, x2], label=float(lab)) @@ -1577,17 +1607,17 @@

    FeedForwardTuner

    ) return self - def transform(self, X, y=None): + def transform(self, X, y=None): """Encode a single batch of Sbert inputs (usually texts).""" return self.sent_tfm.encode(X) - def predict(self, X1, X2): + def predict(self, X1, X2): """Predicts the cosine similarity.""" emb1 = self.transform(X1) emb2 = self.transform(X2) return np.array(CosineSimilarity(dim=1)(emb1, emb2)) - def to_disk(self, path): + def to_disk(self, path): """Save the finetuned Sbert model.""" self.sent_tfm.save(path=path)
    diff --git a/API/grab/index.html b/API/grab/index.html index d45648c..17d911d 100644 --- a/API/grab/index.html +++ b/API/grab/index.html @@ -151,7 +151,7 @@
  • - + Finetuners
  • @@ -259,7 +259,7 @@
  • - + Finetuners
  • @@ -483,8 +483,8 @@

    ColumnGrabber

    Usage

In essence, the ColumnGrabber really just selects a single column.

    -
    import pandas as pd
    -from embetter.grab import ColumnGrabber
    +
    import pandas as pd
    +from embetter.grab import ColumnGrabber
     
 # Let's say we start with a csv file with filepaths
     data = {"filepaths":  ["tests/data/thiscatdoesnotexist.jpeg"]}
    @@ -494,11 +494,11 @@ 

    ColumnGrabber

    ColumnGrabber("filepaths").fit_transform(df)

    But the most common way to use the ColumnGrabber is part of a pipeline.

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
     
    -from embetter.grab import ColumnGrabber
    -from embetter.vision import ImageLoader, ColorHistogramEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.vision import ImageLoader, ColorHistogramEncoder
     
 # Let's say we start with a csv file with filepaths
     data = {"filepaths":  ["tests/data/thiscatdoesnotexist.jpeg"]}
    @@ -583,7 +583,7 @@ 

    ColumnGrabber

class ColumnGrabber(EmbetterBase):
         """
         Component that can grab a pandas column as a list.
     
    @@ -639,10 +639,10 @@ 

    ColumnGrabber

    ``` """ - def __init__(self, colname: str) -> None: + def __init__(self, colname: str) -> None: self.colname = colname - def transform(self, X, y=None): + def transform(self, X, y=None): """ Takes a column from pandas and returns it as a list. """ @@ -683,7 +683,7 @@

def transform(self, X, y=None):
         """
         Takes a column from pandas and returns it as a list.
         """
    @@ -730,16 +730,16 @@ 

class KeyGrabber:
         """
         Effectively the same thing as the ColumnGrabber, except this is
         meant to work on generators of dictionaries instead of dataframes.
         """
     
    -    def __init__(self, colname: str) -> None:
    +    def __init__(self, colname: str) -> None:
             self.colname = colname
     
    -    def transform(self, X, y=None):
    +    def transform(self, X, y=None):
             """
             Takes a column from pandas and returns it as a list.
             """
    @@ -784,7 +784,7 @@ 

def transform(self, X, y=None):
         """
         Takes a column from pandas and returns it as a list.
         """
    diff --git a/API/model/index.html b/API/model/index.html
    index ed346db..964dfd2 100644
    --- a/API/model/index.html
    +++ b/API/model/index.html
    @@ -153,7 +153,7 @@
     
     
       
  • - + Finetuners
  • @@ -263,7 +263,7 @@
  • - + Finetuners
  • @@ -536,8 +536,8 @@

    DifferenceClassifier

    Usage:

    -
    from embetter.model import DifferenceClassifier
    -from embetter.text import SentenceEncoder
    +
    from embetter.model import DifferenceClassifier
    +from embetter.text import SentenceEncoder
     
     mod = DifferenceClassifier(enc=SentenceEncoder())
     
    @@ -615,7 +615,7 @@ 

    DifferenceClassifier

class DifferenceClassifier:
         """
         Classifier for similarity using encoders under the hood.
     
    @@ -652,25 +652,25 @@ 

    DifferenceClassifier

    ``` """ - def __init__(self, enc: TransformerMixin, clf_head: ClassifierMixin = None): + def __init__(self, enc: TransformerMixin, clf_head: ClassifierMixin = None): self.enc = enc self.clf_head = ( LogisticRegression(class_weight="balanced") if not clf_head else clf_head ) - def _calc_feats(self, X1, X2): + def _calc_feats(self, X1, X2): enc1 = self.enc.transform(X1) enc2 = self.enc.transform(X2) return np.abs(enc1 - enc2) - def fit(self, X1, X2, y): + def fit(self, X1, X2, y): self.clf_head.fit(self._calc_feats(X1, X2), y) return self - def predict(self, X1, X2): + def predict(self, X1, X2): return self.clf_head.predict(self._calc_feats(X1, X2)) - def predict_proba(self, X1, X2): + def predict_proba(self, X1, X2): return self.clf_head.predict_proba(self._calc_feats(X1, X2))
    diff --git a/API/multimodal/index.html b/API/multimodal/index.html index 2af4b10..f20a804 100644 --- a/API/multimodal/index.html +++ b/API/multimodal/index.html @@ -155,7 +155,7 @@
  • - + Finetuners
  • @@ -265,7 +265,7 @@
  • - + Finetuners
  • @@ -526,7 +526,7 @@

    ClipEncoder

    -

    manually override cpu/gpu device, tries to grab gpu automatically when available

    +

    manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available

    @@ -613,7 +613,12 @@

    ClipEncoder

class ClipEncoder(EmbetterBase):
         """
    Clip model that can encode text and images.
     
    @@ -621,7 +626,7 @@ 

    ClipEncoder

Arguments: name: name of model, see available options - device: manually override cpu/gpu device, tries to grab gpu automatically when available + device: manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available quantize: turns on quantization num_threads: number of threads for pytorch to use, only affects when device=cpu @@ -633,11 +638,16 @@

    ClipEncoder

    - `clip-ViT-B-32-multilingual-v1` """ - def __init__( + def __init__( self, name="clip-ViT-B-32", device=None, quantize=False, num_threads=None ): if not device: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if torch.cuda.is_available(): + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") self.name = name self.device = device self.tfm = SBERT(name, device=self.device) @@ -649,7 +659,7 @@

    ClipEncoder

    if self.device.type == "cpu": torch.set_num_threads(num_threads) - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the text into a numeric representation.""" # Convert pd.Series objects to encode compatable if isinstance(X, pd.Series): diff --git a/API/text/index.html b/API/text/index.html index 9fb45e3..dee6401 100644 --- a/API/text/index.html +++ b/API/text/index.html @@ -155,7 +155,7 @@
  • - + Finetuners
  • @@ -265,7 +265,7 @@
  • - + Finetuners
  • @@ -375,8 +375,8 @@
  • - - MatrouskaEncoder + + MatryoshkaEncoder
  • @@ -574,7 +574,7 @@

    SentenceEncoder

    -

    manually override cpu/gpu device, tries to grab gpu automatically when available

    +

    manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available

    @@ -627,12 +627,12 @@

    SentenceEncoder

You can find more options, and information, on the sentence-transformers docs page.

    Usage:

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.text import SentenceEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.text import SentenceEncoder
     
     # Let's suppose this is the input dataframe
     dataf = pd.DataFrame({
    @@ -661,99 +661,104 @@ 

    SentenceEncoder

    Source code in embetter/text/_sbert.py -
class SentenceEncoder(EmbetterBase):
         """
         Encoder that can numerically encode sentences.
     
         Arguments:
             name: name of model, see available options
    -        device: manually override cpu/gpu device, tries to grab gpu automatically when available
    +        device: manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available
             quantize: turns on quantization
        num_threads: number of threads for pytorch to use, only affects when device=cpu
     
    @@ -811,11 +816,16 @@ 

    SentenceEncoder

    ``` """ - def __init__( + def __init__( self, name="all-MiniLM-L6-v2", device=None, quantize=False, num_threads=None ): if not device: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if torch.cuda.is_available(): + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") self.name = name self.device = device self.tfm = SBERT(name, device=self.device) @@ -827,7 +837,7 @@

    SentenceEncoder

    if self.device.type == "cpu": torch.set_num_threads(num_threads) - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the text into a numeric representation.""" # Convert pd.Series objects to encode compatable if isinstance(X, pd.Series): @@ -855,7 +865,7 @@

    SentenceEncoder

    -

    MatrouskaEncoder

    +

    MatryoshkaEncoder

    @@ -867,18 +877,18 @@

    SentenceEncoder

    Encoder that can numerically encode sentences.

    This function, which looks like a class, offers a shorthand way to fetch pretrained -Matrouska embeddings. +Matryoshka embeddings. Under the hood it just returns a SentenceEncoder object, but the default name points -to a pretrained Matrouska model.

    +to a pretrained Matryoshka model.

    These embeddings are more flexible in the sense that you can more easily reduce the dimensions without losing as much information. The aforementioned docs give more details

    Usage:

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.text import SentenceEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.text import SentenceEncoder
     
     # Let's suppose this is the input dataframe
     dataf = pd.DataFrame({
    @@ -890,7 +900,7 @@ 

    SentenceEncoder

    # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2. text_emb_pipeline = make_pipeline( ColumnGrabber("text"), - MatrouskaEncoder() + MatryoshkaEncoder() ) X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) @@ -907,23 +917,7 @@
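Because Matryoshka-style models are trained so that the leading dimensions carry most of the signal, a common follow-up is to simply slice the output. A small sketch of that idea, assuming `MatryoshkaEncoder` is exported from `embetter.text` alongside `SentenceEncoder`; the 256 below is an arbitrary choice:

```python
from embetter.text import MatryoshkaEncoder

enc = MatryoshkaEncoder()
X = enc.transform(["positive sentiment", "super negative"])

# Keep only the first 256 dimensions as a cheaper representation.
X_small = X[:, :256]
```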

    SentenceEncoder

    Source code in embetter/text/_sbert.py -
    @@ -954,14 +948,30 @@ 

    SentenceEncoder

    def MatrouskaEncoder(name="tomaarsen/mpnet-base-nli-matryoshka", **kwargs):
    def MatryoshkaEncoder(name="tomaarsen/mpnet-base-nli-matryoshka", **kwargs):
         """
         Encoder that can numerically encode sentences.
     
         This function, which looks like a class, offers a shorthand way to fetch pretrained
    -    [Matrouska embeddings](https://www.sbert.net/examples/training/matryoshka/README.html).
    +    [Matryoshka embeddings](https://www.sbert.net/examples/training/matryoshka/README.html).
         Under the hood it just returns a `SentenceEncoder` object, but the default name points
    -    to a pretrained Matrouska model.
    +    to a pretrained Matryoshka model.
     
         These embeddings are more flexible in the sense that you can more easily reduce the
         dimensions without losing as much information. The aforementioned docs give more details
    @@ -986,7 +996,7 @@ 

    SentenceEncoder

    # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2. text_emb_pipeline = make_pipeline( ColumnGrabber("text"), - MatrouskaEncoder() + MatryoshkaEncoder() ) X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) @@ -1049,7 +1059,7 @@

    SentenceEncoder

    This function can be used to load a model that's saved with featherbed_textrepr.

    Usage:

    You can leverage the multiple backends from keras-core by setting the KERAS_BACKEND environment variable.

    -
    from embetter.text import learn_lite_text_embeddings, LiteTextEncoder
    +
    from embetter.text import learn_lite_text_embeddings, LiteTextEncoder
     
     learn_lite_text_embeddings(generator_of_strings, path="folder/embeddings.skops")
     
    @@ -1081,7 +1091,7 @@ 

    SentenceEncoder

def LiteTextEncoder(path):
         """
         Function that looks like class so that it fits the API.
     
    @@ -1192,19 +1202,19 @@ 

    SentenceEncoder

    The pre-trained model names that you could use can be found here.

    Usage:

    You can leverage the multiple backends from keras-core by setting the KERAS_BACKEND environment variable.

    -
    import os
    +
    import os
     # Pick the right setting
     os.environ["KERAS_BACKEND"] = "jax"
     os.environ["KERAS_BACKEND"] = "torch"
     os.environ["KERAS_BACKEND"] = "tensorflow"
     

    Once this is set, the following code will automatically use the right backend.

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.text import SentenceEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.text import SentenceEncoder
     
     # Let's suppose this is the input dataframe
     dataf = pd.DataFrame({
    @@ -1308,7 +1318,7 @@ 

    SentenceEncoder

class KerasNLPEncoder(EmbetterBase):
         """
         Encoder that can numerically encode sentences.
     
    @@ -1368,12 +1378,12 @@ 

    SentenceEncoder

    ``` """ - def __init__(self, name="bert_tiny_en_uncased"): + def __init__(self, name="bert_tiny_en_uncased"): self.name = name self.backbone = keras_nlp.models.BertBackbone.from_preset(name) self.preprocessor = keras_nlp.models.BertPreprocessor.from_preset(name) - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the text into a numeric representation.""" if isinstance(X, pd.Series): X = X.to_numpy() @@ -1418,12 +1428,12 @@

    SentenceEncoder

    Usage

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.text import spaCyEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.text import spaCyEncoder
     
     # Let's suppose this is the input dataframe
     dataf = pd.DataFrame({
    @@ -1524,7 +1534,7 @@ 

    SentenceEncoder

class spaCyEncoder(EmbetterBase):
         """
         **Usage**
     
    @@ -1562,7 +1572,7 @@ 

    SentenceEncoder

    ``` """ - def __init__(self, nlp: Union[str, Language], agg: str = "base"): + def __init__(self, nlp: Union[str, Language], agg: str = "base"): if isinstance(nlp, str): self.nlp = spacy.load(nlp, disable=["ner", "tagger", "parser"]) elif isinstance(nlp, Language): @@ -1571,18 +1581,18 @@

    SentenceEncoder

    raise ValueError("`nlp` must be `str` or spaCy-language object.") self.agg = agg - def fit(self, X, y=None): + def fit(self, X, y=None): """No-op. Merely checks for object inputs per sklearn standard.""" # Scikit-learn also expects this in the `.fit()` command. self._check_inputs(X) return self - def _check_inputs(self, X): + def _check_inputs(self, X): options = ["mean", "max", "both", "base"] if self.agg not in options: raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.") - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the phrase text into a numeric representation.""" self._check_inputs(X) docs = self.nlp.pipe(X) @@ -1663,12 +1673,12 @@

    SentenceEncoder

    Usage

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.text import Sense2VecEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.text import Sense2VecEncoder
     
     # Let's suppose this is the input dataframe
     dataf = pd.DataFrame({
    @@ -1734,7 +1744,7 @@ 

    SentenceEncoder

class Sense2VecEncoder(BaseEstimator):
         """
         Create a [Sense2Vec encoder](https://github.com/explosion/sense2vec), meant to
         help when encoding phrases as opposed to sentences.
    @@ -1768,18 +1778,18 @@ 

    SentenceEncoder

    ``` """ - def __init__(self, path: str): + def __init__(self, path: str): self.path = path self.s2v = Sense2Vec().from_disk(self.path) self.shape = self.s2v["duck|NOUN"].shape - def _to_vector(self, text): + def _to_vector(self, text): sense = self.s2v.get_best_sense(text) if not sense: return np.zeros(shape=self.shape) return self.s2v[sense] - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the phrase text into a numeric representation.""" return np.array([self._to_vector(x) for x in X])
    @@ -1911,12 +1921,12 @@

    SentenceEncoder

    Usage

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.text import BytePairEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.text import BytePairEncoder
     
     # Let's suppose this is the input dataframe
     dataf = pd.DataFrame({
    @@ -2039,7 +2049,7 @@ 

    SentenceEncoder

class BytePairEncoder(EmbetterBase):
         """
         This language represents token-free pre-trained subword embeddings. Originally created by
         Benjamin Heinzerling and Michael Strube.
    @@ -2095,7 +2105,7 @@ 

    SentenceEncoder

    ``` """ - def __init__( + def __init__( self, lang: str, vs: int = 1000, @@ -2112,18 +2122,18 @@

    SentenceEncoder

    cache_dir = Path.home() / Path(".cache/bpemb") self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir) - def fit(self, X, y=None): + def fit(self, X, y=None): """No-op. Merely checks for object inputs per sklearn standard.""" # Scikit-learn also expects this in the `.fit()` command. self._check_inputs(X) return self - def _check_inputs(self, X): + def _check_inputs(self, X): options = ["mean", "max", "both"] if self.agg not in options: raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.") - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the phrase text into a numeric representation.""" self._check_inputs(X) if self.agg == "mean": @@ -2257,12 +2267,12 @@

    SentenceEncoder

  • glove-twitter-200
  • Usage

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.text import Word2VecEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.text import Word2VecEncoder
     
     # Let's suppose this is the input dataframe
     dataf = pd.DataFrame({
    @@ -2428,7 +2438,7 @@ 

    SentenceEncoder

class GensimEncoder(EmbetterBase):
         """
         Encodes text using a static word embedding model. The component uses gensim's default tokenizer.
     
    @@ -2487,7 +2497,7 @@ 

    SentenceEncoder

    ``` """ - def __init__( + def __init__( self, model: Union[str, Word2Vec, KeyedVectors] = "word2vec-google-news-300", agg: Literal["mean", "max", "both"] = "mean", @@ -2525,18 +2535,18 @@

    SentenceEncoder

    else self.keyed_vectors.vector_size * 2 ) - def fit(self, X, y=None): + def fit(self, X, y=None): """No-op. Merely checks for object inputs per sklearn standard.""" # Scikit-learn also expects this in the `.fit()` command. self._check_inputs(X) return self - def _check_inputs(self, X): + def _check_inputs(self, X): options = ["mean", "max", "both"] if self.agg not in options: raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.") - def _tokenize(self, X) -> List[List[int]]: + def _tokenize(self, X) -> List[List[int]]: token_indices = [] for text in X: tokens = tokenize(text, deacc=self.deacc, lowercase=self.lowercase) @@ -2548,7 +2558,7 @@

    SentenceEncoder

    token_indices.append(indices) return token_indices - def transform(self, X, y=None): + def transform(self, X, y=None): """Transforms the phrase text into a numeric representation using word embeddings.""" self._check_inputs(X) tokens = self._tokenize(X) diff --git a/API/utils/index.html b/API/utils/index.html index 9d4c411..bb5e2b6 100644 --- a/API/utils/index.html +++ b/API/utils/index.html @@ -151,7 +151,7 @@
  • - + Finetuners
  • @@ -259,7 +259,7 @@
  • - + Finetuners
  • @@ -497,8 +497,8 @@

    cached

    Usage: -

    from embetter.text import SentenceEncoder
    -from embetter.utils import cached
    +
    from embetter.text import SentenceEncoder
    +from embetter.utils import cached
     
     encoder = cached("sentence-enc", SentenceEncoder('all-MiniLM-L6-v2'))
     
    @@ -511,7 +511,7 @@ 

    cached

    encoder.transform(examples)

    Note that you're also able to fetch the precalculated embeddings directly via:

    -
    from diskcache import Cache
    +
    from diskcache import Cache
     
     # Make sure that you use the same name as in `cached`
     cache = Cache("sentence-enc")
    @@ -581,7 +581,7 @@ 

    cached

def cached(name: str, pipeline: BaseEstimator):
         """
         Uses a [diskcache](https://grantjenks.com/docs/diskcache/tutorial.html) in
         an attempt to fetch precalculated embeddings from disk instead of inferring them.
    @@ -625,8 +625,8 @@ 

    cached

    """ cache = Cache(name) - def run_cached(method: Callable): - def wrapped(X, y=None): + def run_cached(method: Callable): + def wrapped(X, y=None): results = {i: cache[x] if x in cache else "TODO" for i, x in enumerate(X)} text_todo = [X[i] for i, x in results.items() if str(x) == "TODO"] i_todo = [i for i, x in results.items() if str(x) == "TODO"] @@ -716,7 +716,7 @@

    cached

def batched(iterable: Iterable, n: int = 64):
         """
         Takes an iterable and turns it into a batched iterable.
     
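For instance, assuming `batched` is exported from `embetter.utils` the same way `cached` is, a quick sketch:

```python
from embetter.utils import batched

for batch in batched(range(10), n=4):
    print(list(batch))  # [0, 1, 2, 3] then [4, 5, 6, 7] then [8, 9]
```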
    @@ -888,7 +888,7 @@ 

    cached

def calc_distances(
         inputs,
         anchors,
         pipeline,
    diff --git a/API/vision/index.html b/API/vision/index.html
    index 0b9f923..cc4ce7d 100644
    --- a/API/vision/index.html
    +++ b/API/vision/index.html
    @@ -155,7 +155,7 @@
     
     
       
  • - + Finetuners
  • @@ -265,7 +265,7 @@
  • - + Finetuners
  • @@ -552,17 +552,17 @@

    ImageLoader

    Usage

    You can use the ImageLoader in standalone fashion.

    -
    from embetter.vision import ImageLoader
    +
    from embetter.vision import ImageLoader
     
     filepath = "tests/data/thiscatdoesnotexist.jpeg"
     ImageLoader(convert="RGB").fit_transform([filepath])
     

    But it's more common to see it part of a pipeline.

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
     
    -from embetter.grab import ColumnGrabber
    -from embetter.vision import ImageLoader, ColorHistogramEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.vision import ImageLoader, ColorHistogramEncoder
     
 # Let's say we start with a csv file with filepaths
     data = {"filepaths":  ["tests/data/thiscatdoesnotexist.jpeg"]}
    @@ -648,7 +648,7 @@ 

    ImageLoader

class ImageLoader(EmbetterBase):
         """
         Component that can turn filepaths into a list of PIL.Image objects.
     
    @@ -693,11 +693,11 @@ 

    ImageLoader

    """ - def __init__(self, convert: str = "RGB", out: str = "pil") -> None: + def __init__(self, convert: str = "RGB", out: str = "pil") -> None: self.convert = convert self.out = out - def fit(self, X, y=None): + def fit(self, X, y=None): """ Not actual "fitting" happens in this method, but it does check the input arguments per sklearn convention. @@ -708,7 +708,7 @@

    ImageLoader

    ) return self - def transform(self, X, y=None): + def transform(self, X, y=None): """ Turn a file path into numpy array containing pixel values. """ @@ -780,11 +780,11 @@

    ImageLoader

    Usage:

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
     
    -from embetter.grab import ColumnGrabber
    -from embetter.vision import ImageLoader, ColorHistogramEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.vision import ImageLoader, ColorHistogramEncoder
     
 # Let's say we start with a csv file with filepaths
     data = {"filepaths":  ["tests/data/thiscatdoesnotexist.jpeg"]}
    @@ -864,7 +864,7 @@ 

    ImageLoader

class ColorHistogramEncoder(EmbetterBase):
         """
         Encoder that generates an embedding based on the color histogram of the image.
     
    @@ -897,10 +897,10 @@ 

    ImageLoader

    ``` """ - def __init__(self, n_buckets=256): + def __init__(self, n_buckets=256): self.n_buckets = n_buckets - def transform(self, X, y=None): + def transform(self, X, y=None): """ Takes a sequence of `PIL.Image` and returns a numpy array representing a color histogram for each. @@ -1004,11 +1004,11 @@

    ImageLoader

    Usage:

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline
     
    -from embetter.grab import ColumnGrabber
    -from embetter.vision import ImageLoader, TimmEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.vision import ImageLoader, TimmEncoder
     
 # Let's say we start with a csv file with filepaths
     data = {"filepaths":  ["tests/data/thiscatdoesnotexist.jpeg"]}
    @@ -1079,7 +1079,7 @@ 

    ImageLoader

class TimmEncoder(EmbetterBase):
         """
         Use a pretrained vision model from TorchVision to generate embeddings. Embeddings
    are provided via the lovely `timm` library.
    @@ -1116,7 +1116,7 @@ 

    ImageLoader

    ``` """ - def __init__(self, name="mobilenetv3_large_100", encode_predictions=False): + def __init__(self, name="mobilenetv3_large_100", encode_predictions=False): self.name = name self.encode_predictions = encode_predictions self.model = timm.create_model(name, pretrained=True, num_classes=0) @@ -1125,7 +1125,7 @@

    ImageLoader

    self.config = resolve_data_config({}, model=self.model) self.transform_img = create_transform(**self.config) - def transform(self, X, y=None): + def transform(self, X, y=None): """ Transforms grabbed images into numeric representations. """ diff --git a/applications/index.html b/applications/index.html index 70f5461..1e7d3bb 100644 --- a/applications/index.html +++ b/applications/index.html @@ -11,7 +11,7 @@ - + @@ -155,7 +155,7 @@
  • - + Finetuners
  • @@ -265,7 +265,7 @@
  • - + Finetuners
  • @@ -514,8 +514,8 @@

    Cache

    This is why this library offers an integration with diskcache. That way, you can infer the embeddings once and store them to disk for later.

    Here's an example of how you might run that.

    -
    from embetter.text import SentenceEncoder
    -from embetter.utils import cached
    +
    from embetter.text import SentenceEncoder
    +from embetter.utils import cached
     
     encoder = cached("sentence-enc", SentenceEncoder('all-MiniLM-L6-v2'))
     
    @@ -528,7 +528,7 @@ 

    Cache

    encoder.transform(examples)

    Note that you're also able to fetch the precalculated embeddings directly via:

    -
    from diskcache import Cache
    +
    from diskcache import Cache
     
     # Make sure that you use the same name as in `cached`
     cache = Cache("sentence-enc")
    @@ -543,10 +543,10 @@ 

    Lite Embeddings

    from scikit-learn followed by TruncatedSVD. The TfidfVectorizer even allows you to specify analyzer=char with ngram_range = (3,4) to encode subwords, which even contributes to robustness against spelling errors if that's a concern.

The main thing that's cool about this approach is that the representations can still be very reasonable for a lot of applications and train very quickly. Here's a quick demo:

    -
    import srsly
    -from umap import UMAP
    -from cluestar import plot_text
    -from embetter.text import learn_lite_doc_embeddings
    +
    import srsly
    +from umap import UMAP
    +from cluestar import plot_text
    +from embetter.text import learn_lite_doc_embeddings
     
     # Train embeddings 
     texts = [ex['text'] for ex in srsly.read_jsonl("datasets/new-dataset.jsonl")]
    @@ -573,7 +573,7 @@ 

    Lite Embeddings

    Here's what this chart looks like. Note that you can click and drag to explore!

    Let's now consider what a similar chart might look like that uses Sentence Transformers.

    -
    from embetter.text import SentenceEncoder
    +
    from embetter.text import SentenceEncoder
     
     sent_enc = SentenceEncoder()
     X_orig = sent_enc.transform(texts) # this takes ~13.5s 
    @@ -586,9 +586,9 @@ 

    Lite Embeddings

    These "litetext" embeddings do overfit on the same words being used. But they are much faster and still give a reasonable representation for a lot of use-cases. Also not that you don't have to use our utilities here, you can just create the same pipeline via:

    -
    from sklearn.decomposition import TruncatedSVD
    -from sklearn.feature_extraction.text import TfidfVectorizer
    -from sklearn.pipeline import make_pipeline
    +
    from sklearn.decomposition import TruncatedSVD
    +from sklearn.feature_extraction.text import TfidfVectorizer
    +from sklearn.pipeline import make_pipeline
     
     enc = make_pipeline(
         TfidfVectorizer(),
    @@ -603,8 +603,8 @@ 

    Difference Models

To help investigate this, this library offers a DifferenceClassifier utility.

    Here's how you might use it.

    -
    from embetter.model import DifferenceClassifier
    -from embetter.text import SentenceEncoder
    +
    from embetter.model import DifferenceClassifier
    +from embetter.text import SentenceEncoder
     
     mod = DifferenceClassifier(enc=SentenceEncoder())
     
    @@ -644,9 +644,9 @@ 

    Speedup with Modal

SentenceEncoder as well as ClipEncoder should both benefit. These components will also automatically detect when a GPU is available.

    The code below gives an example.

    -
    import time
    -import h5py
    -import modal
    +
    import time
    +import h5py
    +import modal
     
     
     stub = modal.Stub("example-get-started")
    @@ -658,13 +658,13 @@ 

    Speedup with Modal

    # This is the function that actually runs the embedding, # notice that there's a GPU attached. @stub.function(image=image, gpu="any") -def create(data): - from embetter.text import SentenceEncoder +def create(data): + from embetter.text import SentenceEncoder return SentenceEncoder().transform(data) @stub.local_entrypoint() -def main(): +def main(): tic = time.time() # You'd need to write your own function to read in the texts diff --git a/finetuners.removed-md b/finetuners.removed-md new file mode 100644 index 0000000..f0d053a --- /dev/null +++ b/finetuners.removed-md @@ -0,0 +1,362 @@ +Embetter also supports tools to finetune the embedded space. This can be useful when you're trying to steer the embedding towards a task that you're interested in, which can make [bulk labelling](https://github.com/koaning/bulk/) much easier. This guide will give an example of this. + +## Feeding Forward + +In general, this library is able to generate embeddings. + +
    + +
    Thing goes in. Vector goes out.
    +
    + + +But the embeddings could eventually be the input of a neural network. So let's draw that. + +
    + +
    Thing goes in. Vector goes out. Then a feed forward network.
    +
+ +In this diagram, the network has an input layer of size `n`, which is provided by one of our embedding models. Next it has a hidden layer of size `k` and an output node. To make the drawing easier we've only drawn a single node as output, but the argument will also work for any number of classes. + +Let's now suppose that we train this model on a small set of labelled data. Then we'll have a gradient update that can update all the weights in this network. + +
    + +
    The network has a gradient signal.
    +
+ +Here's the main trick: after we're done training, we don't output the predictions from the neural network! Instead, we might use the hidden layer as the new embedding. + +
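To make the trick concrete, here is a minimal PyTorch sketch of the idea (an illustration, not the library's actual implementation): train a small feed-forward head on the labels, then keep the hidden activations as the new embedding.

```python
import torch
import torch.nn as nn

# Illustrative sizes: embedding of size n -> hidden layer of size k -> class logits.
n, k, n_classes = 384, 50, 2
hidden = nn.Linear(n, k)
output = nn.Linear(k, n_classes)

X = torch.randn(100, n)                  # pretend these are precomputed embeddings
y = torch.randint(0, n_classes, (100,))  # and these are our labels

opt = torch.optim.Adam(list(hidden.parameters()) + list(output.parameters()), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
for _ in range(100):
    opt.zero_grad()
    h = torch.relu(hidden(X))
    loss = loss_fn(output(h), y)
    loss.backward()
    opt.step()

# After training we discard the output layer and keep the hidden activations.
new_embedding = torch.relu(hidden(X)).detach().numpy()
```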
    + +
    Notice how this layer "combines" the embedding and the label?
    +
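
To make the trick above concrete, here is a minimal, self-contained sketch in plain PyTorch. It is not the library's `FeedForwardTuner` implementation, and the shapes and data are made up; it only illustrates training a small head and then keeping the hidden activations as the new embedding.

```python
import torch
import torch.nn as nn

X = torch.randn(200, 384)        # stand-in for precomputed sentence embeddings (size n=384)
y = torch.randint(0, 2, (200,))  # stand-in for binary labels

hidden = nn.Sequential(nn.Linear(384, 200), nn.ReLU())  # input layer n -> hidden layer k
head = nn.Linear(200, 2)                                # hidden layer k -> output classes

opt = torch.optim.Adam(list(hidden.parameters()) + list(head.parameters()), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

for _ in range(500):
    opt.zero_grad()
    loss = loss_fn(head(hidden(X)), y)  # ordinary supervised training
    loss.backward()
    opt.step()

# After training we throw away `head` and keep the hidden activations
# as the new, label-aware embedding.
X_new = hidden(X).detach().numpy()
```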
    + +The thinking here is that this embedding will blend the information from the embedding, which hopefully is general, with the label that we're interested in, which is specific to our problem. Having such a blended embedding can be very useful for bulk labelling purposes, but if we pick our hyperparams right, we might even have an embedding that's a better fit for modelling. + +There are many methods that we might use for finetuning and we've just explained the method used in the `FeedForwardTuner` component. + +### Demo + +Let's demonstrate this effect with a demo. We will use the imdb dataset, hosted on Huggingface, for our example here. This dataset contains movie reviews and the task is to predict if these are negative or positive reviews. + +```python +from datasets import load_dataset +imdb = load_dataset("imdb") +``` + +Next we'll prepare our data. We'll assume that we have 200 annotated examples. Let's call this our "train" set. We will encode this data with a sentence encoder. + +```python +from embetter.text import SentenceEncoder + +# Load up a sentence encoder. +enc = SentenceEncoder() + +# Assume we have 200 labels +n_train = 200 + +# Grab 200 examples and encode them +df_train = imdb['train'].to_pandas().sample(frac=1, random_state=32) +X_train = enc.transform(df_train['text'].to_list()[:n_train]) +y_train = df_train['label'][:n_train].values +``` + +Next we will also prepare a seperate set which we'll use to evaluate. This set is much larger, but we'll still pluck a subset to make the compute time shorter. + +```python +# Let's grab 2000 examples for our "test" set +n_test = 2000 + +# Grab 2000 examples and encode them +df_test = imdb['test'].to_pandas().sample(frac=1, random_state=42) +X_test = enc.transform(df_test['text'].to_list()[:n_test]) +y_test = df_test['label'][:n_test].values +``` + +Next we'll load our finetuner. + +```python +from embetter.finetune import FeedForwardTuner + +# Create a network with some settings. You can totally change these. +tuner = FeedForwardTuner(n_epochs=500, learning_rate=0.01, hidden_dim=200) + +# Learn from our small training data +tuner.fit(X_train, y_train) +``` + +Given that we have a tuner trained, we can now apply it to our larger test set. + +```python +# Note that it's all skearn compatible +X_test_tfm = tuner.transform(X_test) +``` + +### Evaluation + +We just created `X_test_tfm`, which is a finetuned variant of `X_test`. To help +explain how the embedded space changed we'll make a PCA chart for both. + +```python +from sklearn.decomposition import PCA +from matplotlib import pylab as plt + +X_orig = PCA().fit_transform(X_test) +X_finetuned = PCA().fit_transform(X_test_tfm) +``` + +Let's now show the difference. + +```python +# First chart +plt.scatter(X_orig[:, 0] , X_orig[:, 1], c=y_test, s=10) +plt.title("PCA of original embedding space") +``` + +
    + +
    + +Notice how the two classes (positive/negative) are all mixed up when we look at the PCA plot of the embeddings. Let's now see what happens when we apply finetuning. + +```python +# Second chart +plt.scatter(X_finetuned[:, 0] , X_finetuned[:, 1], c=y_test, s=10) +plt.title("PCA of fine-tuned embedding space") +``` + +
    + +
+
+The classes seem to separate much better! That's good news if you'd like to make selections for bulk labelling. It should be much easier to select the class that you're interested in, or to select from a region where there is plenty of doubt.
+
+### Hyperparams
+
+It deserves mentioning that the effect on the PCA-space depends a lot on the chosen hyperparameters of the `FeedForwardTuner`.
+
+```python
+tuner = FeedForwardTuner(n_epochs=500, learning_rate=0.01, hidden_dim=10)
+```
+
+If we decrease the hidden dimensions, for example, then we end up with a space that looks like this:
+
+<figure>
    + +
+
+
+You might want to play around with the settings, but it deserves to be said that you can also overfit on the few examples that you have in `X_train`.
+
+### Extra Details
+
+In scikit-learn terms, a tuner is a "transformer"-component. That means that it can be used in a pipeline too!
+
+```python
+from sklearn.pipeline import make_pipeline
+
+# Grab a few examples
+X = df_test['text'].to_list()[:50]
+y = df_test['label'].to_list()[:50]
+
+# Let's build a pipeline!
+pipe = make_pipeline(
+    SentenceEncoder(),
+    FeedForwardTuner(n_epochs=500, learning_rate=0.01, hidden_dim=10),
+    PCA()
+)
+
+# The fine-tuning component can use the labels in `y`.
+pipe.fit(X, y)
+
+# Apply all the trained steps!
+pipe.transform(X)
+```
+
+Feel free to mix and match as you see fit. Also note that the tuner components in this library support the `partial_fit` API in case you want to train on a stream of small batches.
+
+## Contrastive Methods
+
+There is more than one way to finetune though. Instead of using a feed forward architecture, you can also opt
+for a contrastive approach. In this approach two items are compared with each other. The idea here is that similarity on pairs can also be the basis on which to finetune towards a goal.
+
+<figure>
    + +
+
+This approach works by generating pairs of original embeddings. Some pairs will be positive, meaning they are embeddings of examples that belong to the same class. Others will be negatively sampled, meaning they don't share the same class. The embeddings then get re-embedded with an extra embedding layer on top, which is determined by these pairs.
+
+<figure>
    + +
+
+Note that in general this extra embedding layer is the same for both items. In other words: these embeddings share the same weights. A minimal sketch of this shared-head idea follows the figure below.
+
+<figure>
    + +
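
As a rough illustration of the shared-weights idea, here is a minimal sketch in plain PyTorch. It is not the internals of `ContrastiveLearner`; the data is random and the loss choice (cosine embedding loss) is just one reasonable option for pair-based training.

```python
import torch
import torch.nn as nn

X1 = torch.randn(512, 384)                # embeddings of the first item in each pair
X2 = torch.randn(512, 384)                # embeddings of the second item
y = torch.randint(0, 2, (512,)).float()   # 1.0 = similar pair, 0.0 = dissimilar

head = nn.Linear(384, 384)                # one head, applied to *both* sides of the pair
opt = torch.optim.Adam(head.parameters(), lr=0.002)
loss_fn = nn.CosineEmbeddingLoss()        # expects targets in {1, -1}

for _ in range(50):
    opt.zero_grad()
    loss = loss_fn(head(X1), head(X2), y * 2 - 1)  # map {0, 1} labels to {-1, 1}
    loss.backward()
    opt.step()

# The trained head can now map any original embedding into the new space.
X_new = head(X1).detach().numpy()
```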
    + +When you're done training such a system, you can re-use this trained embedding head to map the original embedding to a new space. The thinking is that this will lead to a better embedding. + +
    + +
    + +The benefit of this approach, compared to the feed forward one, is that you're flexible with how you generate pairs of examples. Are two examples part of the same label in a classification problem? Sure, that might be used. Doing something unsupervised and want two sentences from the same paragraph to be declared similar? Why not? Got image embeddings that you want to glue to text? You can really go nuts here, and this library will provide some tools to make it easy to bootstrap an approach using this technique. + +### Demo + +As a demonstration of this technique, we'll use data found in the `datasets` folder of this repository. + +```python +import srsly +import itertools as it +from pathlib import Path + +examples = list(it.chain(srsly.read_jsonl(p) for p in Path("datasets"))) +``` + +This `examples` list contains examples that look like this: + +```json +{'text': 'Our code and dataset is available here.', 'cats': {'new-dataset': 1, 'dev-research': 0}} +``` + +The interesting thing in this dataset is that there are nested labels. For some examples we'll have all labels, but for others we may only have a subset. + +```python +labels = set() +for ex in examples: + for cat in ex['cats'].keys(): + if cat not in labels: + labels = labels.union([cat]) +assert labels == {'data-quality', 'dev-research', 'new-dataset'} +``` + +But from this we can generate pairs of examples that can be declared similar/dissimilar. + +```python +import random + +def sample_generator(examples, labels, n_neg=3): + for label in labels: + if label == "new-dataset": + pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1] + neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0] + for ex in pos_examples: + sample = random.choice(pos_examples) + yield (ex['text'], sample['text'], 1.0) + for n in range(n_neg): + sample = random.choice(neg_examples) + yield (ex['text'], sample['text'], 0.0) + +learn_examples = list(sample_generator(examples, labels, n_neg=3)) +texts1, texts2, similar = zip(*learn_examples) +``` + +Here's what the `texts1`, `text2` and `similar` lists might include as an example. + +| Sentence A | Sentence B | Similar | +|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------|---------| +| Our code and dataset is available here. | We release the resulting corpus and our analysis pipeline for future research. | 1 | +| Our code and dataset is available here. | In this paper, we introduce the predicted intent labels to calibrate answer labels in a self-training paradigm. | 0 | + + +It's these kinds of pairs that we can try to learn from. So let's do this with a `ContrastiveLearner` by finetuning the embeddings provided to us from a `SentenceEncoder`. To do that, we'll first need to generate the data in a format that it can used. + +```python +import numpy as np +from embetter.text import SentenceEncoder +from embetter.finetune import ContrastiveLearner + +# Generate numeric representations for the pairs +sentence_encoder = SentenceEncoder('all-MiniLM-L6-v2') +X1, X2 = sentence_encoder.transform(texts1), sentence_encoder.transform(texts2) + +# This is a good habbit, numpy arrays are nicer to work with than tuples here +y = np.array(similar) +``` + +With the data ready, we can train. 
+ +```python +from embetter.finetune import ContrastiveLearner + +learner = ContrastiveLearner(epochs=50, batch_size=256, learning_rate=0.002, shape_out=384) +learner.fit(X1, X2, y) +``` + +Note that `learner` types of finetuners accept two data inputs in `.fit(X1, X2, y)`-method. This is not what the scikit-learn API would allow in a pipeline, but it is a format that allows you to be flexible. + +In this case the fine-tuning will be done quickly and we can generate new embeddings. + +```python +texts = [ex['text'] for ex in examples if 'new-dataset' in ex['cats']] +labels = np.array([ex['cats']['new-dataset'] for ex in examples if 'new-dataset' in ex['cats']]) + +X_texts = sentence_encoder.transform(texts) +X_texts_tfm = learner.transform(X_texts) +``` + +For fun, we can also see if these new embeddings give us more predictive power. + +```python +from sklearn.linear_model import LogisticRegression + +def calc_performance(X_in, y_in, name): + mod = LogisticRegression(class_weight="balanced").fit(X_in, y_in) + acc = np.mean(mod.predict(X_in) == y_in) + print(f"{name} got {acc=}") + +calc_performance(X_texts, labels, "original embeddings") +calc_performance(X_texts_tfm, labels, "finetuned embeddings") + +# original embeddings got acc=0.8624434389140272 +# finetuned embeddings got acc=0.9180995475113122 +``` + +This isn't a proper benchmark, we're measuring the train set after all, but it does comfirm that the embeddings differ. If you're finetuning your own embeddings you should always think hard about how you'd like to evaluate this. + +### More learners + +This library also provides a learning that directly integrates with `sentence-transformers`. Training these is typically slower, because it involves finetuning an entire BERT pipeline but may provide solid results. One downside of this approach is that you'll have a learner than cannot accept general arrays. It must provide inputs that sentence-transformers can deal with, which it typically text. + +```python +from embetter.finetune import SbertLearner +from sentence_transformers import SentenceTransformer + +# Load in a sentence transformer manually +sent_tfm = SentenceTransformer('all-MiniLM-L6-v2') + +# Pass it to the SbertLearner and train +sbert_learn = SbertLearner(sent_tfm=sent_tfm) +sbert_learn.fit(texts1, texts2, labels) + +# Once training is done, it can be used to encode embeddings +# Note that we input `texts`, not `X_texts`! +X_texts_sbert = sbert_learn.transform(texts) + +# You can now save the new model which is stored in the original variable +# the `SbertLearner` object directly operates on it +sent_tfm.to_disk(...) +``` + +### `Tuner`s vs. `Learner`s + +One downside of the `learner` objects is that they cannot be used in a scikit-learn pipeline during the `.fit()`-step because they have an incompatible API. To mitigate these, this library sometimes offers a "`Tuner`"-variant which has a "`Learner`"-variant of a method internally. Under the hood, a "tuner" will use a "learner" to make sure the finetuning works, but it won't be as flexible when it comes to training. + +```python +from embetter.finetune import ContrastiveTuner +from embetter.text import SentenceEncoder +from sklearn.pipeline import make_pipeline + +# Notice that we're using `tuner` here, not `learner`! +pipe = make_pipeline(SentenceEncoder(), Contrastivetuner()) +pipe.fit(X, y).predict(X) +``` + +### Performance + +This library favors ease of use over optimal performance, but it's certainly possible that the performance can be improved. 
If you have a clever suggestion, feel free to discuss it by opening [an issue](https://github.com/koaning/embetter/issues). diff --git a/finetuners/index.html b/finetuners/index.html deleted file mode 100644 index 3d87b8f..0000000 --- a/finetuners/index.html +++ /dev/null @@ -1,886 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - Finetuners - Embetter Docs - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Finetuners

    - -

    Embetter also supports tools to finetune the embedded space. This can be useful when you're trying to steer the embedding towards a task that you're interested in, which can make bulk labelling much easier. This guide will give an example of this.

    -

    Feeding Forward

    -

    In general, this library is able to generate embeddings.

    -
    - -
    Thing goes in. Vector goes out.
    -
    - -

    But the embeddings could eventually be the input of a neural network. So let's draw that.

    -
    - -
    Thing goes in. Vector goes out. Then a feed forward network.
    -
    - -

    In this diagram, the network has an input layer of size n, which is provided by one of our embedding models. Next it has a hidden layer of size k and an output node. To make the drawing easier we've only drawn a single node as output, but the argument also works for any number of classes.

    -

    Let's now suppose that we train this model on a small set of labelled data. Then we'll have a gradient update that can update all the weights in this network.

    -
    - -
    The network has a gradient signal.
    -
    - -

    Here's the main trick: after we're done training, we don't output the predictions from the neural network! Instead, we might use the hidden layer as the new embedding.

    -
    - -
    Notice how this layer "combines" the embedding and the label?
    -
    - -

    The thinking here is that this embedding will blend the information from the embedding, which hopefully is general, with the label that we're interested in, which is specific to our problem. Having such a blended embedding can be very useful for bulk labelling purposes, but if we pick our hyperparams right, we might even have an embedding that's a better fit for modelling.

    -

    There are many methods that we might use for finetuning and we've just explained the method used in the FeedForwardTuner component.

    -

    Demo

    -

    Let's demonstrate this effect with a demo. We will use the imdb dataset, hosted on Huggingface, for our example here. This dataset contains movie reviews and the task is to predict if these are negative or positive reviews.

    -
    from datasets import load_dataset
    -imdb = load_dataset("imdb")
    -
    -

    Next we'll prepare our data. We'll assume that we have 200 annotated examples. Let's call this our "train" set. We will encode this data with a sentence encoder.

    -
    from embetter.text import SentenceEncoder
    -
    -# Load up a sentence encoder.
    -enc = SentenceEncoder()
    -
    -# Assume we have 200 labels 
    -n_train = 200
    -
    -# Grab 200 examples and encode them
    -df_train = imdb['train'].to_pandas().sample(frac=1, random_state=32)
    -X_train = enc.transform(df_train['text'].to_list()[:n_train])
    -y_train = df_train['label'][:n_train].values
    -
    -

    Next we will also prepare a separate set which we'll use to evaluate. This set is much larger, but we'll still pluck a subset to make the compute time shorter.

    -
    # Let's grab 2000 examples for our "test" set 
    -n_test = 2000
    -
    -# Grab 2000 examples and encode them
    -df_test = imdb['test'].to_pandas().sample(frac=1, random_state=42)
    -X_test = enc.transform(df_test['text'].to_list()[:n_test])
    -y_test = df_test['label'][:n_test].values
    -
    -

    Next we'll load our finetuner.

    -
    from embetter.finetune import FeedForwardTuner 
    -
    -# Create a network with some settings. You can totally change these. 
    -tuner = FeedForwardTuner(n_epochs=500, learning_rate=0.01, hidden_dim=200)
    -
    -# Learn from our small training data
    -tuner.fit(X_train, y_train)
    -
    -

    Given that we have a tuner trained, we can now apply it to our larger test set.

    -
    -# Note that it's all sklearn compatible 
    -X_test_tfm = tuner.transform(X_test)
    -
    -

    Evaluation

    -

    We just created X_test_tfm, which is a finetuned variant of X_test. To help -explain how the embedded space changed we'll make a PCA chart for both.

    -
    from sklearn.decomposition import PCA
    -from matplotlib import pylab as plt 
    -
    -X_orig = PCA().fit_transform(X_test)
    -X_finetuned = PCA().fit_transform(X_test_tfm)
    -
    -

    Let's now show the difference.

    -
    # First chart 
    -plt.scatter(X_orig[:, 0] , X_orig[:, 1], c=y_test, s=10)
    -plt.title("PCA of original embedding space")
    -
    -
    - -
    - -

    Notice how the two classes (positive/negative) are all mixed up when we look at the PCA plot of the embeddings. Let's now see what happens when we apply finetuning.

    -
    # Second chart
    -plt.scatter(X_finetuned[:, 0] , X_finetuned[:, 1], c=y_test, s=10)
    -plt.title("PCA of fine-tuned embedding space")
    -
    -
    - -
    - -

    The classes seem to separate much better! That's good news if you'd like to make selections for bulk labelling. It should be much easier to select the class that you're interested in, or to select from a region where there is plenty of doubt.

    -

    Hyperparams

    -

    It deserves mentioning that the effect on the PCA-space does depend a lot on the chosen hyperparameters of the FeedForwardTuner.

    -
    tuner = FeedForwardTuner(n_epochs=500, learning_rate=0.01, hidden_dim=10)
    -
    -

    If we decrease the hidden dimensions for example then we end up with a space that looks like this:

    -
    - -
    - -

    You might want to play around with the settings, but it deserves to be said that you can also overfit on the few examples that you have in X_train.

    -

    Extra Details

    -

    In scikit-learn terms, a tuner is a "transformer"-component. That means that it can be used in a pipeline too!

    -
    from sklearn.pipeline import make_pipeline 
    -
    -# Grab a few examples
    -X = df_test['text'].to_list()[:50]
    -y = df_test['label'].to_list()[:50]
    -
    -# Let's build a pipeline!
    -pipe = make_pipeline(
    -    SentenceEncoder(),
    -    FeedForwardTuner(n_epochs=500, learning_rate=0.01, hidden_dim=10),
    -    PCA()
    -)
    -
    -# The fine-tuning component can use `y_train`.
    -pipe.fit(X, y)
    -
    -# Apply all the trained steps! 
    -pipe.transform(X)
    -
    -

    Feel free to mix and match as you see fit. Also note that the tuner components in this library support the partial_fit API in case you want to train on a stream of small batches.
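
    As a rough sketch of what that stream could look like: the snippet below assumes that partial_fit(X, y) mirrors fit(X, y) on the tuner; the exact signature (for instance whether a classes= argument is needed on the first call) should be checked against the tuner's docstring.

```python
import numpy as np
from embetter.text import SentenceEncoder
from embetter.finetune import FeedForwardTuner

enc = SentenceEncoder()
tuner = FeedForwardTuner(n_epochs=50, learning_rate=0.01, hidden_dim=200)

# Pretend these batches arrive one at a time from some stream.
stream = [
    (["an amazing movie", "terrible acting"], [1, 0]),
    (["would watch again", "fell asleep halfway"], [1, 0]),
]

for texts, labels in stream:
    X_batch = enc.transform(texts)
    tuner.partial_fit(X_batch, np.array(labels))  # assumed signature, see docstring

X_new = tuner.transform(enc.transform(["a fresh review to embed"]))
```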

    -

    Contrastive Methods

    -

    There is more than one way to finetune though. Instead of using a feed forward architecture, you can also opt for a contrastive approach. In this approach two items are compared with each other. The idea here is that similarity on pairs can also be the basis on which to finetune towards a goal.

    -
    - -
    - -

    This approach works by generating pairs of original embeddings. Some pairs will be positive, meaning they are embeddings of examples that belong to the same class. Others will be negatively sampled, meaning they don't share the same class. The embeddings then get re-embedded with an extra embedding layer on top, which is determined by these pairs.

    -
    - -
    - -

    Note that in general this extra embedding layer is the same for both items. In other words: these embeddings share the same weights.

    -
    - -
    - -

    When you're done training such a system, you can re-use this trained embedding head to map the original embedding to a new space. The thinking is that this will lead to a better embedding.

    -
    - -
    - -

    The benefit of this approach, compared to the feed forward one, is that you're flexible with how you generate pairs of examples. Are two examples part of the same label in a classification problem? Sure, that might be used. Doing something unsupervised and want two sentences from the same paragraph to be declared similar? Why not? Got image embeddings that you want to glue to text? You can really go nuts here, and this library will provide some tools to make it easy to bootstrap an approach using this technique.

    -

    Demo

    -

    As a demonstration of this technique, we'll use data found in the datasets folder of this repository.

    -
    import srsly 
    -import itertools as it 
    -from pathlib import Path
    -
    -examples = list(it.chain.from_iterable(srsly.read_jsonl(p) for p in Path("datasets").glob("*.jsonl")))  # assumes .jsonl files in datasets/
    -
    -

    This examples list contains examples that look like this:

    -
    {'text': 'Our code and dataset is available here.', 'cats': {'new-dataset': 1, 'dev-research': 0}}
    -
    -

    The interesting thing in this dataset is that there are nested labels. For some examples we'll have all labels, but for others we may only have a subset.

    -
    labels = set()
    -for ex in examples:
    -    for cat in ex['cats'].keys():
    -        if cat not in labels:
    -            labels = labels.union([cat])
    -assert labels == {'data-quality', 'dev-research', 'new-dataset'}
    -
    -

    But from this we can generate pairs of examples that can be declared similar/dissimilar.

    -
    import random 
    -
    -def sample_generator(examples, labels, n_neg=3):
    -    for label in labels:
    -        if label == "new-dataset":
    -            pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
    -            neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
    -            for ex in pos_examples:
    -                sample = random.choice(pos_examples)
    -                yield (ex['text'], sample['text'], 1.0)
    -                for n in range(n_neg):
    -                    sample = random.choice(neg_examples)
    -                    yield (ex['text'], sample['text'], 0.0)
    -
    -learn_examples = list(sample_generator(examples, labels, n_neg=3))
    -texts1, texts2, similar = zip(*learn_examples)
    -
    -

    Here's what the texts1, texts2 and similar lists might include as an example.

    - - - - - - - - - - - - - - - - - - - - -
    Sentence ASentence BSimilar
    Our code and dataset is available here.We release the resulting corpus and our analysis pipeline for future research.1
    Our code and dataset is available here.In this paper, we introduce the predicted intent labels to calibrate answer labels in a self-training paradigm.0
    -

    It's these kinds of pairs that we can try to learn from. So let's do this with a ContrastiveLearner by finetuning the embeddings provided to us by a SentenceEncoder. To do that, we'll first need to generate the data in a format that it can use.

    -
    import numpy as np 
    -from embetter.text import SentenceEncoder
    -from embetter.finetune import ContrastiveLearner
    -
    -# Generate numeric representations for the pairs
    -sentence_encoder = SentenceEncoder('all-MiniLM-L6-v2')
    -X1, X2 = sentence_encoder.transform(texts1), sentence_encoder.transform(texts2)
    -
    -# This is a good habit, numpy arrays are nicer to work with than tuples here
    -y = np.array(similar)
    -
    -

    With the data ready, we can train.

    -
    from embetter.finetune import ContrastiveLearner
    -
    -learner = ContrastiveLearner(epochs=50, batch_size=256, learning_rate=0.002, shape_out=384)
    -learner.fit(X1, X2, y)
    -
    -

    Note that learner types of finetuners accept two data inputs in the .fit(X1, X2, y)-method. This is not what the scikit-learn API would allow in a pipeline, but it is a format that allows you to be flexible.

    -

    In this case the fine-tuning will be done quickly and we can generate new embeddings.

    -
    texts = [ex['text'] for ex in examples if 'new-dataset' in ex['cats']]
    -labels = np.array([ex['cats']['new-dataset'] for ex in examples if 'new-dataset' in ex['cats']])
    -
    -X_texts = sentence_encoder.transform(texts)
    -X_texts_tfm = learner.transform(X_texts)
    -
    -

    For fun, we can also see if these new embeddings give us more predictive power.

    -
    from sklearn.linear_model import LogisticRegression
    -
    -def calc_performance(X_in, y_in, name):
    -    mod = LogisticRegression(class_weight="balanced").fit(X_in, y_in)
    -    acc = np.mean(mod.predict(X_in) == y_in)
    -    print(f"{name} got {acc=}")
    -
    -calc_performance(X_texts, labels, "original embeddings")
    -calc_performance(X_texts_tfm, labels, "finetuned embeddings")
    -
    -# original embeddings got acc=0.8624434389140272
    -# finetuned embeddings got acc=0.9180995475113122
    -
    -

    This isn't a proper benchmark, we're measuring the train set after all, but it does confirm that the embeddings differ. If you're finetuning your own embeddings, you should always think hard about how you'd like to evaluate this.
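
    One way to make that check a little more honest is to score a held-out split instead of the training rows. The sketch below reuses X_texts, X_texts_tfm and labels from the snippets above; note that it's still optimistic, because the contrastive pairs were sampled from these same texts.

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def holdout_accuracy(X_in, y_in, name):
    # Score on a held-out 25% split rather than on the rows used for fitting.
    X_tr, X_te, y_tr, y_te = train_test_split(X_in, y_in, stratify=y_in, random_state=42)
    mod = LogisticRegression(class_weight="balanced", max_iter=1000).fit(X_tr, y_tr)
    print(f"{name}: {mod.score(X_te, y_te):.3f}")

holdout_accuracy(X_texts, labels, "original embeddings")
holdout_accuracy(X_texts_tfm, labels, "finetuned embeddings")
```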

    -

    More learners

    -

    This library also provides a learner that directly integrates with sentence-transformers. Training these is typically slower, because it involves finetuning an entire BERT pipeline, but it may provide solid results. One downside of this approach is that you'll have a learner that cannot accept general arrays. It must be given inputs that sentence-transformers can deal with, which is typically text.

    -
    from embetter.finetune import SbertLearner
    -from sentence_transformers import SentenceTransformer
    -
    -# Load in a sentence transformer manually
    -sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
    -
    -# Pass it to the SbertLearner and train
    -sbert_learn = SbertLearner(sent_tfm=sent_tfm)
    -sbert_learn.fit(texts1, texts2, labels)
    -
    -# Once training is done, it can be used to encode embeddings
    -# Note that we input `texts`, not `X_texts`!
    -X_texts_sbert = sbert_learn.transform(texts)
    -
    -# You can now save the new model which is stored in the original variable
    -# the `SbertLearner` object directly operates on it
    -sent_tfm.save(...)
    -
    -

    Tuners vs. Learners

    -

    One downside of the learner objects is that they cannot be used in a scikit-learn pipeline during the .fit()-step because they have an incompatible API. To mitigate this, this library sometimes offers a "Tuner"-variant that wraps the corresponding "Learner" internally. Under the hood, a "tuner" will use a "learner" to make sure the finetuning works, but it won't be as flexible when it comes to training.

    -
    from embetter.finetune import ContrastiveTuner
    -from embetter.text import SentenceEncoder
    -from sklearn.pipeline import make_pipeline
    -
    -# Notice that we're using `tuner` here, not `learner`!
    -pipe = make_pipeline(SentenceEncoder(), ContrastiveTuner())
    -pipe.fit(X, y).predict(X)
    -
    -

    Performance

    -

    This library favors ease of use over optimal performance, but it's certainly possible that the performance can be improved. If you have a clever suggestion, feel free to discuss it by opening an issue.

    - - - - - - - - - \ No newline at end of file diff --git a/index.html b/index.html index d164c3b..f5dcff6 100644 --- a/index.html +++ b/index.html @@ -12,7 +12,7 @@ - + @@ -155,7 +155,7 @@
  • - + Finetuners
  • @@ -332,7 +332,7 @@
  • - + Finetuners
  • @@ -531,34 +531,34 @@

    Install

    API Design

    This is what's being implemented now.

    # Helpers to grab text or image from pandas column.
    -from embetter.grab import ColumnGrabber
    +from embetter.grab import ColumnGrabber
     
     # Representations/Helpers for computer vision
    -from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder
    +from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder
     
     # Representations for text
    -from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder
    +from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder
     
     # Representations from multi-modal models
    -from embetter.multi import ClipEncoder
    +from embetter.multi import ClipEncoder
     
     # Finetuning components 
    -from embetter.finetune import FeedForwardTuner, ContrastiveTuner, ContrastiveLearner, SbertLearner
    +from embetter.finetune import FeedForwardTuner, ContrastiveTuner, ContrastiveLearner, SbertLearner
     
     # External embedding providers, typically needs an API key
    -from embetter.external import CohereEncoder, OpenAIEncoder
    +from embetter.external import CohereEncoder, OpenAIEncoder
     

    All of these components are scikit-learn compatible, which means that you can apply them as you would normally in a scikit-learn pipeline. Just be aware that these components are stateless. They won't require training as these are all pretrained tools.

    Text Example

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline 
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline 
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.text import SentenceEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.text import SentenceEncoder
     
     # This pipeline grabs the `text` column from a dataframe
     # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2.
    @@ -583,13 +583,13 @@ 

    Text Example

    Image Example

    The goal of the API is to allow pipelines like this:

    -
    import pandas as pd
    -from sklearn.pipeline import make_pipeline 
    -from sklearn.linear_model import LogisticRegression
    +
    import pandas as pd
    +from sklearn.pipeline import make_pipeline 
    +from sklearn.linear_model import LogisticRegression
     
    -from embetter.grab import ColumnGrabber
    -from embetter.vision import ImageLoader
    -from embetter.multi import ClipEncoder
    +from embetter.grab import ColumnGrabber
    +from embetter.vision import ImageLoader
    +from embetter.multi import ClipEncoder
     
     # This pipeline grabs the `img_path` column from a dataframe
     # then it grabs the image paths and turns them into `PIL.Image` objects
    diff --git a/sitemap.xml.gz b/sitemap.xml.gz
    index d0391fe..2b0207c 100644
    Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ