From 50b5cce9f44ce35bd701016802d84828b26722b3 Mon Sep 17 00:00:00 2001
From: goldpulpy
Date: Thu, 10 Oct 2024 22:59:16 +0300
Subject: [PATCH] Function documentation has been updated

---
 pysentence_similarity/_model.py    | 33 +++++++++---
 pysentence_similarity/_splitter.py | 38 +++++++++++---
 pysentence_similarity/_storage.py  | 47 +++++++++++++++--
 pysentence_similarity/compute.py   | 82 +++++++++++++++++++++++++++++-
 pysentence_similarity/pooling.py   | 32 ++++++++++--
 5 files changed, 210 insertions(+), 22 deletions(-)

diff --git a/pysentence_similarity/_model.py b/pysentence_similarity/_model.py
index 8cad450..e9230cc 100644
--- a/pysentence_similarity/_model.py
+++ b/pysentence_similarity/_model.py
@@ -3,6 +3,7 @@
 import os
 import time
 import logging
+import copy
 from typing import List, Union, Callable
 
 import onnxruntime as ort
@@ -37,13 +38,23 @@ def __init__(
         """
         Initialize the sentence similarity task model.
 
+        This constructor initializes the necessary components to load a model
+        for sentence similarity tasks, including the model, the tokenizer,
+        and the device configuration.
+
         :param model: The name of the model to be used.
         :type model: str
-        :param dtype: The dtype of the model ('fp32', 'fp16', 'int8').
+        :param dtype: The data type of the model. Options include 'fp32' for
+        32-bit floating point, 'fp16' for 16-bit floating point, and 'int8' for
+        8-bit integer. Default is 'fp32'.
         :type dtype: str
-        :param cache_dir: Directory to cache the model and tokenizer.
+        :param cache_dir: The directory where the model and tokenizer should
+        be cached. If not provided, a default cache directory based on the
+        package name is used.
         :type cache_dir: str
-        :param device: Device to use for inference ('cuda', 'cpu').
+        :param device: The device to use for inference. Options include 'cuda'
+        for GPU acceleration and 'cpu' for running on the CPU.
+        Default is 'cpu'.
         :type device: str
         :raises ValueError: If the model or tokenizer cannot be loaded.
         """
@@ -56,8 +67,8 @@ def __init__(
 
         try:
             self._providers = self._get_providers()
-            self._tokenizer = self._load_tokenizer()
             self._session = self._load_model()
+            self._tokenizer = self._load_tokenizer()
         except Exception as err:
             logger.error("Error initializing model: %s", err)
             raise
@@ -77,7 +88,11 @@ def encode(
         pooling_function: Callable = mean_pooling,
         progress_bar: bool = False
     ) -> Union[np.ndarray, List[np.ndarray]]:
-        """Convert a single sentence to an embedding vector.
+        """Convert a single sentence or a list of sentences to embedding
+        vectors.
+
+        This method takes one or more sentences as input and converts them
+        into embedding vectors using a specified pooling function.
 
         :param sentences: Sentence or list of sentences to convert.
         :type sentences: Union[str, List[str]]
@@ -320,8 +335,17 @@ def __repr__(self) -> str:
         """Return a string representation of the Model object."""
         return self.__str__()
 
-    def __copy__(self):
+    def __copy__(self) -> "Model":
         """Create a shallow copy of the Model object."""
         new_instance = self.__class__.__new__(self.__class__)
         new_instance.__dict__.update(self.__dict__)
         return new_instance
+
+    def __deepcopy__(self, memo) -> "Model":
+        """Create a deep copy of the Model object."""
+        new_instance = self.__class__.__new__(self.__class__)
+        # Register the copy in memo before copying attributes; calling
+        # copy.deepcopy(self, memo) here would recurse without end.
+        memo[id(self)] = new_instance
+        new_instance.__dict__.update(copy.deepcopy(self.__dict__, memo))
+        return new_instance
diff --git a/pysentence_similarity/_splitter.py b/pysentence_similarity/_splitter.py
index a19694d..52c9d81 100644
--- a/pysentence_similarity/_splitter.py
+++ b/pysentence_similarity/_splitter.py
@@ -15,8 +15,6 @@
 class Splitter:
     """
     A class to split text into sentences.
- Supports splitting by periods, exclamation marks, question marks, and - newline characters. """ def __init__( @@ -25,7 +23,10 @@ def __init__( preserve_markers: bool = False, ) -> None: """ - Initializes the Splitter object. + Initializes the Splitter object, which is used to split a given text + based on specific characters or markers. This class allows flexible + splitting based on one or more characters and provides the option to + preserve these markers in the split result. :param markers_to_split: A string or list of characters (e.g., punctuation marks) used to split the text. Default is a newline @@ -51,7 +52,13 @@ def split_from_text( text: str, ) -> List[str]: """ - Splits the given text into sentences based on punctuation and newlines. + Splits the given text into sentences based on specified punctuation and + newlines. + + This method uses regular expressions to identify splitting points in + the input text. It can preserve split markers (such as punctuation) + based on the `preserve_markers` attribute set during initialization. + :param text: The input text to split. :type text: str @@ -93,8 +100,12 @@ def split_from_file( file_path: str, ) -> List[str]: """ - Splits the contents of a txt file into sentences based on punctuation - and newlines. + Splits the contents of a text file into sentences based on specified + punctuation and newlines. + + This method reads the entire content of the specified text file and + utilizes the `split_from_text` method to split the content into + sentences. It expects the file to be encoded in UTF-8. :param file_path: The path to the file to split. :type file_path: str @@ -127,6 +138,10 @@ def split_from_url( Fetches the content from a URL, removes HTML tags, and splits the cleaned text into sentences. + This method retrieves the content from the provided URL, removes all + HTML tags, and splits the remaining plain text into sentences based on + the specified split markers. + :param url: The URL of the webpage to split. :type url: str :param timeout: The number of seconds to wait for the request to @@ -166,6 +181,11 @@ def split_from_csv( Reads a CSV file and splits the text from specified columns into sentences. + This method reads the contents of a CSV file, extracts text from the + specified columns, and then splits the text into sentences based on + the markers defined in the `Splitter` object. It can handle multiple + columns and combines the results into a single list of sentences. + :param file_path: The path to the CSV file to read. :type file_path: str :param column_names: A list of column names to extract text from. @@ -234,6 +254,12 @@ def split_from_json(self, file_path: str, keys: List[str]) -> List[str]: """ Reads a JSON file and splits text from specified keys into sentences. + This method processes a JSON file by extracting text values from + specified keys. The extracted text is then split into sentences based + on the markers defined in the `Splitter` object. It can handle nested + JSON structures and recursively extract values from deeply nested + objects. + :param file_path: The path to the JSON file to read. :type file_path: str :param keys: A list of keys to extract text from. 
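A quick illustration of the splitting API documented above (an illustrative sketch, not part of the patch; it assumes Splitter is exported from the package root, and the parameter names follow the docstrings in this diff):

    from pysentence_similarity import Splitter

    # Split on sentence-ending punctuation; with preserve_markers=True the
    # split markers are kept in the result instead of being discarded.
    splitter = Splitter(markers_to_split=[".", "!", "?"], preserve_markers=True)
    sentences = splitter.split_from_text("First sentence. Second one! A third?")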
diff --git a/pysentence_similarity/_storage.py b/pysentence_similarity/_storage.py
index c8dbf96..6c6c35f 100644
--- a/pysentence_similarity/_storage.py
+++ b/pysentence_similarity/_storage.py
@@ -1,6 +1,7 @@
 """Class to store embeddings in memory."""
 import logging
-from typing import List, Optional, Union
+import copy
+from typing import List, Optional, Union, Tuple
 
 import h5py
 import numpy as np
@@ -25,6 +26,10 @@ def __init__(
         """
         Initialize the storage class.
 
+        This constructor initializes an instance of the storage class,
+        optionally populated with a list of sentences and their
+        corresponding embeddings.
+
         :param sentences: List of sentences.
         :type sentences: List[str], optional
         :param embeddings: List of embeddings.
@@ -41,7 +46,9 @@ def save(self, filename: str) -> None:
         """
         Save the embeddings and sentences to a file.
 
-        Save the embeddings and sentences to a file.
+        This method saves the embeddings and sentences into an HDF5 file
+        using the h5py library. It validates the data before saving to
+        ensure that it is in the correct format.
 
         :param filename: The name of the file to save the embeddings to.
         :type filename: str
@@ -68,6 +75,10 @@ def load(filename: str) -> "Storage":
         Factory method to load the embeddings and sentences from a file and
         return a new Storage instance.
 
+        This method reads the embeddings and sentences from an HDF5 file
+        using the h5py library. It constructs and returns a new instance of
+        the Storage class with the loaded data.
+
         :param filename: The name of the file to load the embeddings from.
         :type filename: str
         :return: A new instance of Storage class populated with the loaded
@@ -99,7 +110,11 @@ def add(
         filename: str = None
     ) -> None:
         """
-        Add a new sentences and embeddings to the storage.
+        Add new sentences and embeddings to the storage.
+
+        This method appends new sentences and their corresponding embeddings
+        to the internal storage. If specified, it can also save the updated
+        data to a file.
 
         :param sentence: The sentence to add.
         :type sentence: Union[str, List[str]]
@@ -140,6 +155,10 @@ def remove_by_index(self, index: int) -> None:
         """
         Remove the sentence and embedding at the specified index.
 
+        This method removes a sentence and its corresponding embedding
+        from the storage based on the provided index. If the index is
+        out of range, it raises an IndexError.
+
         :param index: Index of the item to remove.
         :type index: int
         :raises IndexError: If the index is out of bounds.
@@ -157,6 +176,10 @@ def remove_by_sentence(self, sentence: str) -> None:
         """
         Remove the sentence and its corresponding embedding by sentence.
 
+        This method searches for a specific sentence in the storage and
+        removes it along with its corresponding embedding. If the sentence
+        is not found, it raises a ValueError.
+
         :param sentence: The sentence to remove.
         :type sentence: str
         :raises ValueError: If the sentence is not found in the storage.
@@ -173,6 +196,8 @@ def get_sentences(self) -> List[str]:
         """
         Get the list of sentences.
 
+        This method returns the sentences currently held in the storage.
+
         :return: The list of sentences.
         :rtype: List[str]
         """
@@ -182,6 +207,10 @@ def get_embedding_by_sentence(self, sentence: str) -> np.ndarray:
         """
         Get the embedding for the specified sentence.
 
+        This method retrieves the stored embedding corresponding to the given
+        sentence.
+
+        :param sentence: The sentence to get the embedding for.
         :type sentence: str
         :return: The embedding for the specified sentence.
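The storage API documented in the hunks above can be exercised end to end as in the following sketch (illustrative, not part of the patch; it assumes Storage is importable from the package root and that add() takes a sentence and its embedding as the first two arguments, per the docstrings):

    import numpy as np
    from pysentence_similarity import Storage

    # Build a storage from parallel lists of sentences and embeddings
    # (toy 3-dimensional vectors for brevity).
    storage = Storage(
        sentences=["Hello world."],
        embeddings=[np.array([0.1, 0.2, 0.3], dtype=np.float32)],
    )

    # Persist to HDF5 via h5py, then reload through the factory method.
    storage.save("embeddings.h5")
    restored = Storage.load("embeddings.h5")

    # Append another sentence/embedding pair and query by sentence.
    restored.add("Another sentence.", np.array([0.3, 0.2, 0.1], dtype=np.float32))
    vector = restored.get_embedding_by_sentence("Another sentence.")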
@@ -199,6 +228,8 @@ def get_embeddings(self) -> List[np.ndarray]:
         """
         Get the list of embeddings.
 
+        This method retrieves all stored embeddings.
+
         :return: The list of embeddings.
         :rtype: List[np.ndarray]
         """
@@ -252,11 +283,20 @@ def __copy__(self):
         new_instance.__dict__.update(self.__dict__)
         return new_instance
 
+    def __deepcopy__(self, memo) -> "Storage":
+        """Create a deep copy of the Storage object."""
+        new_instance = self.__class__.__new__(self.__class__)
+        # Register the copy in memo before copying attributes; calling
+        # copy.deepcopy(self, memo) here would recurse without end.
+        memo[id(self)] = new_instance
+        new_instance.__dict__.update(copy.deepcopy(self.__dict__, memo))
+        return new_instance
+
     def __len__(self) -> int:
         """Return the number of sentences."""
         return len(self._sentences)
 
-    def __getitem__(self, index: int) -> List[Union[str, np.ndarray]]:
+    def __getitem__(self, index: int) -> Tuple[str, np.ndarray]:
         """
         Get the sentence and embedding at the specified index.
 
@@ -266,7 +303,7 @@
         :raises IndexError: If the index is out of bounds.
         """
         try:
-            return [self._sentences[index], self._embeddings[index]]
+            return self._sentences[index], self._embeddings[index]
         except IndexError as e:
             logger.error("Index out of range: %s", e)
             raise
diff --git a/pysentence_similarity/compute.py b/pysentence_similarity/compute.py
index 13c5d2c..32b3859 100644
--- a/pysentence_similarity/compute.py
+++ b/pysentence_similarity/compute.py
@@ -10,7 +10,16 @@ def cosine(
     embedding_1: np.ndarray,
     embedding_2: np.ndarray
 ) -> float:
-    """Compute cosine similarity between two embedding vectors.
+    """
+    Compute cosine similarity between two embedding vectors.
+
+    Cosine similarity is a measure of similarity between two non-zero vectors
+    of an inner product space that measures the cosine of the angle between
+    them.
+    It is defined as the dot product of the vectors divided by the product of
+    their magnitudes (norms). The value ranges from -1 to 1, where 1 indicates
+    that the vectors point in the same direction, 0 indicates orthogonality,
+    and -1 indicates opposite directions.
 
     :param embedding_1: First embedding vector.
     :type embedding_1: np.ndarray
@@ -35,6 +44,14 @@ def euclidean(
 ) -> float:
     """Compute Euclidean distance between two embedding vectors.
 
+    The Euclidean distance is a measure of the straight-line distance between
+    two points in Euclidean space. It is calculated as the square root of the
+    sum of the squared differences between corresponding elements of the
+    vectors.
+    This distance metric is commonly used in various machine learning and
+    data analysis tasks to quantify similarity or dissimilarity between data
+    points.
+
     :param embedding_1: First embedding vector.
     :type embedding_1: np.ndarray
     :param embedding_2: Second embedding vector.
@@ -56,6 +73,15 @@ def manhattan(
 ) -> float:
     """Compute Manhattan distance between two embedding vectors.
 
+    The Manhattan distance, also known as L1 distance or city block distance,
+    measures the distance between two points in a grid-based system by
+    calculating the sum of the absolute differences of their coordinates.
+    It is defined as the sum of the absolute differences between corresponding
+    elements of the vectors.
+
+    This distance metric is useful in various machine learning applications and
+    optimization problems.
+
     :param embedding_1: First embedding vector.
     :type embedding_1: np.ndarray
     :param embedding_2: Second embedding vector.
@@ -77,6 +103,14 @@ def jaccard(
 ) -> float:
     """Compute Jaccard similarity between two embedding vectors.
 
+    The Jaccard similarity coefficient measures the similarity between two sets
+    by comparing the size of their intersection to the size of their union.
+ For two embedding vectors, the Jaccard similarity is calculated as the + sum of the minimum values (intersection) divided by the sum of the maximum + values (union) for corresponding elements of the vectors. This metric + is particularly useful in applications such as clustering and information + retrieval where the similarity between sets is of interest. + :param embedding_1: First embedding vector. :type embedding_1: np.ndarray :param embedding_2: Second embedding vector. @@ -100,6 +134,13 @@ def pearson( ) -> float: """Compute Pearson correlation between two embedding vectors. + The Pearson correlation coefficient measures the linear correlation + between two variables, ranging from -1 to 1. A coefficient of 1 indicates + a perfect positive linear relationship, 0 indicates no linear correlation, + and -1 indicates a perfect negative linear relationship. This metric is + commonly used in statistics to determine the strength and direction of a + linear relationship between two data sets. + :param embedding_1: First embedding vector. :type embedding_1: np.ndarray :param embedding_2: Second embedding vector. @@ -122,6 +163,16 @@ def minkowski( ) -> float: """Compute Minkowski distance between two embedding vectors. + The Minkowski distance is a generalization of both the Euclidean and + Manhattan distances, defined as the p-th root of the sum of the absolute + differences of the coordinates raised to the p-th power. + The Minkowski distance becomes: + - Euclidean distance when p = 2 + - Manhattan distance when p = 1 + + The parameter p controls the 'order' of the distance metric. A higher value + of p emphasizes larger differences between dimensions. + :param embedding_1: First embedding vector. :type embedding_1: np.ndarray :param embedding_2: Second embedding vector. @@ -145,6 +196,13 @@ def hamming( ) -> float: """Compute Hamming distance between two embedding vectors. + The Hamming distance measures the proportion of positions at which + the corresponding elements of two vectors are different. It is + commonly used for comparing binary strings or categorical data + and is defined as the number of differing elements divided by the + total number of elements. This distance metric is particularly + useful in error detection and correction codes. + :param embedding_1: First embedding vector. :type embedding_1: np.ndarray :param embedding_2: Second embedding vector. @@ -171,6 +229,12 @@ def kl_divergence( """Compute Kullback-Leibler divergence between two probability distributions. + The Kullback-Leibler (KL) divergence is a measure of how one probability + distribution diverges from a second, expected probability distribution. + It quantifies the information lost when one distribution is used to + approximate another. The KL divergence is always non-negative and is + zero if and only if the two distributions are identical. + :param embedding_1: First probability distribution. :param embedding_2: Second probability distribution. :return: KL divergence. @@ -195,6 +259,13 @@ def chebyshev( ) -> float: """Compute Chebyshev distance between two embedding vectors. + The Chebyshev distance, also known as the maximum metric, + measures the maximum absolute difference between the components + of two vectors. It is defined as the greatest of the absolute + differences along any coordinate dimension. This distance + metric is particularly useful in scenarios where you want to + focus on the largest difference between dimensions. + :param embedding_1: First embedding vector. 
     :type embedding_1: np.ndarray
     :param embedding_2: Second embedding vector.
@@ -216,9 +287,16 @@ def bregman(
     f=np.square,
     grad_f=lambda x: 2 * x
 ) -> float:
-    """Compute Bregman divergence between two embedding vectors using a convex
+    """Compute Bregman divergence between two embedding vectors using a convex
     function.
 
+    Bregman divergence is a generalization of various distance measures
+    based on a convex function. It quantifies the difference between
+    two points in terms of the convex function and its gradient. Bregman
+    divergence is non-negative and equals zero only when the two points
+    are the same. This metric is useful in various applications,
+    including optimization and information theory.
+
     :param embedding_1: First embedding vector.
     :param embedding_2: Second embedding vector.
     :param f: Convex function to compute divergence (default is square
diff --git a/pysentence_similarity/pooling.py b/pysentence_similarity/pooling.py
index 8cfd276..e5b0a9f 100644
--- a/pysentence_similarity/pooling.py
+++ b/pysentence_similarity/pooling.py
@@ -8,7 +8,19 @@ def max_pooling(
     attention_mask: List[int]
 ) -> np.ndarray:
     """
-    Perform max pooling on token embeddings.
+    Perform max pooling on token embeddings, using an attention mask to ignore
+    padding tokens.
+
+    This function takes in token embeddings (e.g., from a transformer model's
+    output) and an attention mask and applies a max pooling operation across
+    the token embeddings for each sentence. The attention mask ensures that
+    padding tokens (which have a mask value of 0) are ignored in the pooling
+    operation.
+
+    Max pooling selects, for each embedding dimension, the maximum value
+    across the tokens after the embeddings are multiplied by the attention
+    mask. This results in a pooled embedding for the entire input sentence.
+
 
     :param model_output: Model output (token embeddings).
     :type model_output: np.ndarray
@@ -28,7 +40,14 @@ def mean_pooling(
     attention_mask: List[int]
 ) -> np.ndarray:
     """
-    Perform mean pooling on token embeddings.
+    Perform mean pooling on token embeddings, using an attention mask to ignore
+    padding tokens.
+
+    This function computes the mean (average) of the token embeddings for each
+    sentence, ignoring the padding tokens by using an attention mask. The
+    attention mask helps in weighting the valid tokens during pooling and
+    ensures that the padding tokens (marked as 0 in the mask) are excluded from
+    the average computation.
 
     :param model_output: Model output (token embeddings).
     :type model_output: np.ndarray
@@ -51,7 +70,14 @@ def min_pooling(
     attention_mask: List[int]
 ) -> np.ndarray:
     """
-    Perform min pooling on token embeddings.
+    Perform min pooling on token embeddings, using an attention mask to ignore
+    padding tokens.
+
+    This function computes the minimum of the token embeddings for each
+    sentence, while ignoring padding tokens by utilizing an attention mask. The
+    attention mask ensures that tokens marked as padding (with a value of 0)
+    are not considered in the min pooling operation, effectively allowing the
+    computation to focus only on valid tokens.
 
     :param model_output: Model output (token embeddings).
     :type model_output: np.ndarray
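To make the mask-aware pooling semantics described in these docstrings concrete, here is a small self-contained NumPy sketch (illustrative only: it mirrors the documented behavior rather than calling the library, and it uses the common negative-infinity trick for max pooling, whereas the patched docstring describes multiplying by the mask):

    import numpy as np

    # Token embeddings for one sentence: 4 tokens x 3 dimensions.
    # The last token is padding (attention mask value 0).
    token_embeddings = np.array([
        [1.0, 2.0, 3.0],
        [3.0, 2.0, 1.0],
        [2.0, 2.0, 2.0],
        [9.0, 9.0, 9.0],  # padding row; must not influence the result
    ])
    attention_mask = np.array([1, 1, 1, 0])
    mask = attention_mask[:, None].astype(float)  # shape (tokens, 1)

    # Mean pooling: zero out padded rows, sum over tokens, divide by the
    # number of valid tokens (clamped to avoid division by zero).
    summed = (token_embeddings * mask).sum(axis=0)
    mean_pooled = summed / np.clip(mask.sum(), 1e-9, None)   # [2.0, 2.0, 2.0]

    # Max pooling: push padded rows to -inf so that, for each embedding
    # dimension, the maximum is taken over valid tokens only.
    masked = np.where(mask.astype(bool), token_embeddings, -np.inf)
    max_pooled = masked.max(axis=0)                          # [3.0, 2.0, 3.0]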