From 50b5cce9f44ce35bd701016802d84828b26722b3 Mon Sep 17 00:00:00 2001
From: goldpulpy
Date: Thu, 10 Oct 2024 22:59:16 +0300
Subject: [PATCH] Function documentation has been updated

---
 pysentence_similarity/_model.py    | 33 +++++++++---
 pysentence_similarity/_splitter.py | 38 +++++++++++---
 pysentence_similarity/_storage.py  | 47 +++++++++++++++--
 pysentence_similarity/compute.py   | 82 +++++++++++++++++++++++++++++-
 pysentence_similarity/pooling.py   | 32 ++++++++++--
 5 files changed, 210 insertions(+), 22 deletions(-)

diff --git a/pysentence_similarity/_model.py b/pysentence_similarity/_model.py
index 8cad450..e9230cc 100644
--- a/pysentence_similarity/_model.py
+++ b/pysentence_similarity/_model.py
@@ -3,6 +3,7 @@
 import os
 import time
 import logging
+import copy
 from typing import List, Union, Callable
 
 import onnxruntime as ort
@@ -37,13 +38,23 @@ def __init__(
         """
         Initialize the sentence similarity task model.
 
+        This constructor initializes the necessary components to load a model
+        for sentence similarity tasks, including the model, the tokenizer,
+        and the device configuration.
+
         :param model: The name of the model to be used.
         :type model: str
-        :param dtype: The dtype of the model ('fp32', 'fp16', 'int8').
+        :param dtype: The data type of the model. Options include 'fp32' for
+        32-bit floating point, 'fp16' for 16-bit floating point, and 'int8' for
+        8-bit integer. Default is 'fp32'.
         :type dtype: str
-        :param cache_dir: Directory to cache the model and tokenizer.
+        :param cache_dir: The directory where the model and tokenizer should
+        be cached. If not provided, a default cache directory based on the
+        package name is used.
         :type cache_dir: str
-        :param device: Device to use for inference ('cuda', 'cpu').
+        :param device: The device to use for inference. Options include 'cuda'
+        for GPU acceleration and 'cpu' for running on the CPU.
+        Default is 'cpu'.
         :type device: str
         :raises ValueError: If the model or tokenizer cannot be loaded.
         """
@@ -56,8 +67,8 @@ def __init__(
 
         try:
             self._providers = self._get_providers()
-            self._tokenizer = self._load_tokenizer()
             self._session = self._load_model()
+            self._tokenizer = self._load_tokenizer()
         except Exception as err:
             logger.error("Error initializing model: %s", err)
             raise
@@ -77,7 +88,11 @@ def encode(
         pooling_function: Callable = mean_pooling,
         progress_bar: bool = False
     ) -> Union[np.ndarray, List[np.ndarray]]:
-        """Convert a single sentence to an embedding vector.
+        """Convert a single sentence or a list of sentences to embedding
+        vectors.
+
+        This method takes one or more sentences as input and converts them
+        into embedding vectors using a specified pooling function.
 
         :param sentences: Sentence or list of sentences to convert.
         :type sentences: Union[str, List[str]]
@@ -320,8 +335,17 @@ def __repr__(self) -> str:
         """Return a string representation of the Model object."""
         return self.__str__()
 
-    def __copy__(self):
+    def __copy__(self) -> "Model":
         """Create a shallow copy of the Model object."""
         new_instance = self.__class__.__new__(self.__class__)
         new_instance.__dict__.update(self.__dict__)
         return new_instance
+
+    def __deepcopy__(self, memo) -> "Model":
+        """Create a deep copy of the Model object."""
+        new_instance = self.__class__.__new__(self.__class__)
+        # Register the copy in memo before copying attributes; calling
+        # copy.deepcopy(self, memo) here would recurse without end.
+        memo[id(self)] = new_instance
+        new_instance.__dict__.update(copy.deepcopy(self.__dict__, memo))
+        return new_instance
diff --git a/pysentence_similarity/_splitter.py b/pysentence_similarity/_splitter.py
index a19694d..52c9d81 100644
--- a/pysentence_similarity/_splitter.py
+++ b/pysentence_similarity/_splitter.py
@@ -15,8 +15,6 @@
 class Splitter:
     """
     A class to split text into sentences.
- Supports splitting by periods, exclamation marks, question marks, and - newline characters. """ def __init__( @@ -25,7 +23,10 @@ def __init__( preserve_markers: bool = False, ) -> None: """ - Initializes the Splitter object. + Initializes the Splitter object, which is used to split a given text + based on specific characters or markers. This class allows flexible + splitting based on one or more characters and provides the option to + preserve these markers in the split result. :param markers_to_split: A string or list of characters (e.g., punctuation marks) used to split the text. Default is a newline @@ -51,7 +52,13 @@ def split_from_text( text: str, ) -> List[str]: """ - Splits the given text into sentences based on punctuation and newlines. + Splits the given text into sentences based on specified punctuation and + newlines. + + This method uses regular expressions to identify splitting points in + the input text. It can preserve split markers (such as punctuation) + based on the `preserve_markers` attribute set during initialization. + :param text: The input text to split. :type text: str @@ -93,8 +100,12 @@ def split_from_file( file_path: str, ) -> List[str]: """ - Splits the contents of a txt file into sentences based on punctuation - and newlines. + Splits the contents of a text file into sentences based on specified + punctuation and newlines. + + This method reads the entire content of the specified text file and + utilizes the `split_from_text` method to split the content into + sentences. It expects the file to be encoded in UTF-8. :param file_path: The path to the file to split. :type file_path: str @@ -127,6 +138,10 @@ def split_from_url( Fetches the content from a URL, removes HTML tags, and splits the cleaned text into sentences. + This method retrieves the content from the provided URL, removes all + HTML tags, and splits the remaining plain text into sentences based on + the specified split markers. + :param url: The URL of the webpage to split. :type url: str :param timeout: The number of seconds to wait for the request to @@ -166,6 +181,11 @@ def split_from_csv( Reads a CSV file and splits the text from specified columns into sentences. + This method reads the contents of a CSV file, extracts text from the + specified columns, and then splits the text into sentences based on + the markers defined in the `Splitter` object. It can handle multiple + columns and combines the results into a single list of sentences. + :param file_path: The path to the CSV file to read. :type file_path: str :param column_names: A list of column names to extract text from. @@ -234,6 +254,12 @@ def split_from_json(self, file_path: str, keys: List[str]) -> List[str]: """ Reads a JSON file and splits text from specified keys into sentences. + This method processes a JSON file by extracting text values from + specified keys. The extracted text is then split into sentences based + on the markers defined in the `Splitter` object. It can handle nested + JSON structures and recursively extract values from deeply nested + objects. + :param file_path: The path to the JSON file to read. :type file_path: str :param keys: A list of keys to extract text from. 
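A quick illustration of the splitting API documented above (an illustrative sketch, not part of the patch; it assumes Splitter is exported from the package root, and the parameter names follow the docstrings in this diff):

    from pysentence_similarity import Splitter

    # Split on sentence-ending punctuation; with preserve_markers=True the
    # split markers are kept in the result instead of being discarded.
    splitter = Splitter(markers_to_split=[".", "!", "?"], preserve_markers=True)
    sentences = splitter.split_from_text("First sentence. Second one! A third?")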
diff --git a/pysentence_similarity/_storage.py b/pysentence_similarity/_storage.py
index c8dbf96..6c6c35f 100644
--- a/pysentence_similarity/_storage.py
+++ b/pysentence_similarity/_storage.py
@@ -1,6 +1,7 @@
 """Class to store embeddings in memory."""
 import logging
-from typing import List, Optional, Union
+import copy
+from typing import List, Optional, Union, Tuple
 
 import h5py
 import numpy as np
@@ -25,6 +26,10 @@ def __init__(
         """
         Initialize the storage class.
 
+        This constructor initializes an instance of the storage class,
+        optionally populated with a list of sentences and their
+        corresponding embeddings.
+
         :param sentences: List of sentences.
         :type sentences: List[str], optional
         :param embeddings: List of embeddings.
@@ -41,7 +46,9 @@ def save(self, filename: str) -> None:
         """
         Save the embeddings and sentences to a file.
 
-        Save the embeddings and sentences to a file.
+        This method saves the embeddings and sentences into an HDF5 file
+        using the h5py library. It validates the data before saving to
+        ensure that it is in the correct format.
 
         :param filename: The name of the file to save the embeddings to.
         :type filename: str
@@ -68,6 +75,10 @@ def load(filename: str) -> "Storage":
         Factory method to load the embeddings and sentences from a file and
         return a new Storage instance.
 
+        This method reads the embeddings and sentences from an HDF5 file
+        using the h5py library. It constructs and returns a new instance of
+        the Storage class with the loaded data.
+
         :param filename: The name of the file to load the embeddings from.
         :type filename: str
         :return: A new instance of Storage class populated with the loaded
@@ -99,7 +110,11 @@ def add(
         filename: str = None
     ) -> None:
         """
-        Add a new sentences and embeddings to the storage.
+        Add new sentences and embeddings to the storage.
+
+        This method appends new sentences and their corresponding embeddings
+        to the internal storage. If specified, it can also save the updated
+        data to a file.
 
         :param sentence: The sentence to add.
         :type sentence: Union[str, List[str]]
@@ -140,6 +155,10 @@ def remove_by_index(self, index: int) -> None:
         """
         Remove the sentence and embedding at the specified index.
 
+        This method removes a sentence and its corresponding embedding
+        from the storage based on the provided index. If the index is
+        out of range, it raises an IndexError.
+
         :param index: Index of the item to remove.
         :type index: int
         :raises IndexError: If the index is out of bounds.
@@ -157,6 +176,10 @@ def remove_by_sentence(self, sentence: str) -> None:
         """
         Remove the sentence and its corresponding embedding by sentence.
 
+        This method searches for a specific sentence in the storage and
+        removes it along with its corresponding embedding. If the sentence
+        is not found, it raises a ValueError.
+
         :param sentence: The sentence to remove.
         :type sentence: str
         :raises ValueError: If the sentence is not found in the storage.
@@ -173,6 +196,8 @@ def get_sentences(self) -> List[str]:
         """
         Get the list of sentences.
 
+        This method returns the sentences currently held in the storage.
+
         :return: The list of sentences.
         :rtype: List[str]
         """
@@ -182,6 +207,10 @@ def get_embedding_by_sentence(self, sentence: str) -> np.ndarray:
         """
         Get the embedding for the specified sentence.
 
+        This method retrieves the stored embedding corresponding to the given
+        sentence.
+
+        :param sentence: The sentence to get the embedding for.
         :type sentence: str
         :return: The embedding for the specified sentence.
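The storage API documented in the hunks above can be exercised end to end as in the following sketch (illustrative, not part of the patch; it assumes Storage is importable from the package root and that add() takes a sentence and its embedding as the first two arguments, per the docstrings):

    import numpy as np
    from pysentence_similarity import Storage

    # Build a storage from parallel lists of sentences and embeddings
    # (toy 3-dimensional vectors for brevity).
    storage = Storage(
        sentences=["Hello world."],
        embeddings=[np.array([0.1, 0.2, 0.3], dtype=np.float32)],
    )

    # Persist to HDF5 via h5py, then reload through the factory method.
    storage.save("embeddings.h5")
    restored = Storage.load("embeddings.h5")

    # Append another sentence/embedding pair and query by sentence.
    restored.add("Another sentence.", np.array([0.3, 0.2, 0.1], dtype=np.float32))
    vector = restored.get_embedding_by_sentence("Another sentence.")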
@@ -199,6 +228,8 @@ def get_embeddings(self) -> List[np.ndarray]:
         """
         Get the list of embeddings.
 
+        This method retrieves all stored embeddings.
+
         :return: The list of embeddings.
         :rtype: List[np.ndarray]
         """
@@ -252,11 +283,20 @@ def __copy__(self):
         new_instance.__dict__.update(self.__dict__)
         return new_instance
 
+    def __deepcopy__(self, memo) -> "Storage":
+        """Create a deep copy of the Storage object."""
+        new_instance = self.__class__.__new__(self.__class__)
+        # Register the copy in memo before copying attributes; calling
+        # copy.deepcopy(self, memo) here would recurse without end.
+        memo[id(self)] = new_instance
+        new_instance.__dict__.update(copy.deepcopy(self.__dict__, memo))
+        return new_instance
+
     def __len__(self) -> int:
         """Return the number of sentences."""
         return len(self._sentences)
 
-    def __getitem__(self, index: int) -> List[Union[str, np.ndarray]]:
+    def __getitem__(self, index: int) -> Tuple[str, np.ndarray]:
         """
         Get the sentence and embedding at the specified index.
 
@@ -266,7 +303,7 @@
         :raises IndexError: If the index is out of bounds.
         """
         try:
-            return [self._sentences[index], self._embeddings[index]]
+            return self._sentences[index], self._embeddings[index]
         except IndexError as e:
             logger.error("Index out of range: %s", e)
             raise
diff --git a/pysentence_similarity/compute.py b/pysentence_similarity/compute.py
index 13c5d2c..32b3859 100644
--- a/pysentence_similarity/compute.py
+++ b/pysentence_similarity/compute.py
@@ -10,7 +10,16 @@ def cosine(
     embedding_1: np.ndarray,
     embedding_2: np.ndarray
 ) -> float:
-    """Compute cosine similarity between two embedding vectors.
+    """
+    Compute cosine similarity between two embedding vectors.
+
+    Cosine similarity is a measure of similarity between two non-zero vectors
+    of an inner product space that measures the cosine of the angle between
+    them.
+    It is defined as the dot product of the vectors divided by the product of
+    their magnitudes (norms). The value ranges from -1 to 1, where 1 indicates
+    that the vectors point in the same direction, 0 indicates orthogonality,
+    and -1 indicates opposite directions.
 
     :param embedding_1: First embedding vector.
     :type embedding_1: np.ndarray
@@ -35,6 +44,14 @@ def euclidean(
 ) -> float:
     """Compute Euclidean distance between two embedding vectors.
 
+    The Euclidean distance is a measure of the straight-line distance between
+    two points in Euclidean space. It is calculated as the square root of the
+    sum of the squared differences between corresponding elements of the
+    vectors.
+    This distance metric is commonly used in various machine learning and
+    data analysis tasks to quantify similarity or dissimilarity between data
+    points.
+
     :param embedding_1: First embedding vector.
     :type embedding_1: np.ndarray
     :param embedding_2: Second embedding vector.
@@ -56,6 +73,15 @@ def manhattan(
 ) -> float:
     """Compute Manhattan distance between two embedding vectors.
 
+    The Manhattan distance, also known as L1 distance or city block distance,
+    measures the distance between two points in a grid-based system by
+    calculating the sum of the absolute differences of their coordinates.
+    It is defined as the sum of the absolute differences between corresponding
+    elements of the vectors.
+
+    This distance metric is useful in various machine learning applications and
+    optimization problems.
+
     :param embedding_1: First embedding vector.
     :type embedding_1: np.ndarray
     :param embedding_2: Second embedding vector.
@@ -77,6 +103,14 @@ def jaccard(
 ) -> float:
     """Compute Jaccard similarity between two embedding vectors.
 
+    The Jaccard similarity coefficient measures the similarity between two sets
+    by comparing the size of their intersection to the size of their union.
+ For two embedding vectors, the Jaccard similarity is calculated as the + sum of the minimum values (intersection) divided by the sum of the maximum + values (union) for corresponding elements of the vectors. This metric + is particularly useful in applications such as clustering and information + retrieval where the similarity between sets is of interest. + :param embedding_1: First embedding vector. :type embedding_1: np.ndarray :param embedding_2: Second embedding vector. @@ -100,6 +134,13 @@ def pearson( ) -> float: """Compute Pearson correlation between two embedding vectors. + The Pearson correlation coefficient measures the linear correlation + between two variables, ranging from -1 to 1. A coefficient of 1 indicates + a perfect positive linear relationship, 0 indicates no linear correlation, + and -1 indicates a perfect negative linear relationship. This metric is + commonly used in statistics to determine the strength and direction of a + linear relationship between two data sets. + :param embedding_1: First embedding vector. :type embedding_1: np.ndarray :param embedding_2: Second embedding vector. @@ -122,6 +163,16 @@ def minkowski( ) -> float: """Compute Minkowski distance between two embedding vectors. + The Minkowski distance is a generalization of both the Euclidean and + Manhattan distances, defined as the p-th root of the sum of the absolute + differences of the coordinates raised to the p-th power. + The Minkowski distance becomes: + - Euclidean distance when p = 2 + - Manhattan distance when p = 1 + + The parameter p controls the 'order' of the distance metric. A higher value + of p emphasizes larger differences between dimensions. + :param embedding_1: First embedding vector. :type embedding_1: np.ndarray :param embedding_2: Second embedding vector. @@ -145,6 +196,13 @@ def hamming( ) -> float: """Compute Hamming distance between two embedding vectors. + The Hamming distance measures the proportion of positions at which + the corresponding elements of two vectors are different. It is + commonly used for comparing binary strings or categorical data + and is defined as the number of differing elements divided by the + total number of elements. This distance metric is particularly + useful in error detection and correction codes. + :param embedding_1: First embedding vector. :type embedding_1: np.ndarray :param embedding_2: Second embedding vector. @@ -171,6 +229,12 @@ def kl_divergence( """Compute Kullback-Leibler divergence between two probability distributions. + The Kullback-Leibler (KL) divergence is a measure of how one probability + distribution diverges from a second, expected probability distribution. + It quantifies the information lost when one distribution is used to + approximate another. The KL divergence is always non-negative and is + zero if and only if the two distributions are identical. + :param embedding_1: First probability distribution. :param embedding_2: Second probability distribution. :return: KL divergence. @@ -195,6 +259,13 @@ def chebyshev( ) -> float: """Compute Chebyshev distance between two embedding vectors. + The Chebyshev distance, also known as the maximum metric, + measures the maximum absolute difference between the components + of two vectors. It is defined as the greatest of the absolute + differences along any coordinate dimension. This distance + metric is particularly useful in scenarios where you want to + focus on the largest difference between dimensions. + :param embedding_1: First embedding vector. 
     :type embedding_1: np.ndarray
     :param embedding_2: Second embedding vector.
@@ -216,9 +287,16 @@ def bregman(
     f=np.square,
     grad_f=lambda x: 2 * x
 ) -> float:
-    """Compute Bregman divergence between two embedding vectors using a convex
+    """Compute Bregman divergence between two embedding vectors using a convex
     function.
 
+    Bregman divergence is a generalization of various distance measures
+    based on a convex function. It quantifies the difference between
+    two points in terms of the convex function and its gradient. Bregman
+    divergence is non-negative and equals zero only when the two points
+    are the same. This metric is useful in various applications,
+    including optimization and information theory.
+
     :param embedding_1: First embedding vector.
     :param embedding_2: Second embedding vector.
     :param f: Convex function to compute divergence (default is square
diff --git a/pysentence_similarity/pooling.py b/pysentence_similarity/pooling.py
index 8cfd276..e5b0a9f 100644
--- a/pysentence_similarity/pooling.py
+++ b/pysentence_similarity/pooling.py
@@ -8,7 +8,19 @@ def max_pooling(
     attention_mask: List[int]
 ) -> np.ndarray:
     """
-    Perform max pooling on token embeddings.
+    Perform max pooling on token embeddings, using an attention mask to ignore
+    padding tokens.
+
+    This function takes in token embeddings (e.g., from a transformer model's
+    output) and an attention mask and applies a max pooling operation across
+    the token embeddings for each sentence. The attention mask ensures that
+    padding tokens (which have a mask value of 0) are ignored in the pooling
+    operation.
+
+    Max pooling selects, for each embedding dimension, the maximum value
+    across the tokens after the embeddings are multiplied by the attention
+    mask. This results in a pooled embedding for the entire input sentence.
+
 
     :param model_output: Model output (token embeddings).
     :type model_output: np.ndarray
@@ -28,7 +40,14 @@ def mean_pooling(
     attention_mask: List[int]
 ) -> np.ndarray:
     """
-    Perform mean pooling on token embeddings.
+    Perform mean pooling on token embeddings, using an attention mask to ignore
+    padding tokens.
+
+    This function computes the mean (average) of the token embeddings for each
+    sentence, ignoring the padding tokens by using an attention mask. The
+    attention mask helps in weighting the valid tokens during pooling and
+    ensures that the padding tokens (marked as 0 in the mask) are excluded from
+    the average computation.
 
     :param model_output: Model output (token embeddings).
     :type model_output: np.ndarray
@@ -51,7 +70,14 @@ def min_pooling(
     attention_mask: List[int]
 ) -> np.ndarray:
     """
-    Perform min pooling on token embeddings.
+    Perform min pooling on token embeddings, using an attention mask to ignore
+    padding tokens.
+
+    This function computes the minimum of the token embeddings for each
+    sentence, while ignoring padding tokens by utilizing an attention mask. The
+    attention mask ensures that tokens marked as padding (with a value of 0)
+    are not considered in the min pooling operation, effectively allowing the
+    computation to focus only on valid tokens.
 
     :param model_output: Model output (token embeddings).
     :type model_output: np.ndarray
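To make the mask-aware pooling semantics described in these docstrings concrete, here is a small self-contained NumPy sketch (illustrative only: it mirrors the documented behavior rather than calling the library, and it uses the common negative-infinity trick for max pooling, whereas the patched docstring describes multiplying by the mask):

    import numpy as np

    # Token embeddings for one sentence: 4 tokens x 3 dimensions.
    # The last token is padding (attention mask value 0).
    token_embeddings = np.array([
        [1.0, 2.0, 3.0],
        [3.0, 2.0, 1.0],
        [2.0, 2.0, 2.0],
        [9.0, 9.0, 9.0],  # padding row; must not influence the result
    ])
    attention_mask = np.array([1, 1, 1, 0])
    mask = attention_mask[:, None].astype(float)  # shape (tokens, 1)

    # Mean pooling: zero out padded rows, sum over tokens, divide by the
    # number of valid tokens (clamped to avoid division by zero).
    summed = (token_embeddings * mask).sum(axis=0)
    mean_pooled = summed / np.clip(mask.sum(), 1e-9, None)   # [2.0, 2.0, 2.0]

    # Max pooling: push padded rows to -inf so that, for each embedding
    # dimension, the maximum is taken over valid tokens only.
    masked = np.where(mask.astype(bool), token_embeddings, -np.inf)
    max_pooled = masked.max(axis=0)                          # [3.0, 2.0, 3.0]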