Merge pull request #163 from citadel-ai/drop-38
Drop Python 3.8 support
liwii authored Oct 28, 2024
2 parents 1c39619 + 0a79fc7 commit 4be82f5
Showing 54 changed files with 782 additions and 564 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/pip_install_matrix.yml
@@ -17,17 +17,15 @@ jobs:
fail-fast: false # Continue running jobs even if another fails
matrix:
# We specify Python versions as strings so 3.10 doesn't become 3.1
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
os: [ubuntu-latest, windows-latest, macos-14]
# "en", "de", and "" are equivalent
# "all" is tested by pytest.yml
language: ["en", "ja", "zh"]

exclude:
# GitHub Actions doesn't support Python 3.8 and 3.9 on M1 macOS yet:
# GitHub Actions doesn't support Python 3.9 on M1 macOS yet:
# https://github.com/actions/setup-python/issues/696
- python-version: "3.8"
os: macos-14
- python-version: "3.9"
os: macos-14
# TODO: Figure out how to install MeCab on Windows to install
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -7,7 +7,7 @@ version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
python: "3.9"

sphinx:
configuration: docs/conf.py
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -14,7 +14,7 @@ pip install --upgrade pip
pip install langcheck[all]
```

LangCheck works with Python 3.8 or higher.
LangCheck works with Python 3.9 or higher.

:::{note}
Model files are lazily downloaded the first time you run a metric function. For example, the first time you run the ``langcheck.metrics.sentiment()`` function, LangCheck will automatically download the Twitter-roBERTa-base model.
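
The note above mentions that model files are downloaded lazily on first use. A minimal sketch of what that looks like in practice, assuming `langcheck[all]` is installed on Python 3.9 or higher (the exact model files fetched may vary by LangCheck version):

```python
# Minimal sketch: the first metric call triggers the model download.
import langcheck

generated_outputs = [
    "Black cat the",
    "The black cat is sitting",
    "The big black cat is sitting on the fence",
]

# On first use, langcheck.metrics.sentiment() downloads its underlying model
# (the Twitter-roBERTa-base model mentioned in the note above), then returns
# one sentiment score per output.
sentiment_values = langcheck.metrics.sentiment(generated_outputs)
print(sentiment_values)
```
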
2 changes: 1 addition & 1 deletion docs/tutorial_langcheckchat.md
@@ -69,7 +69,7 @@ Here’s the response from the LLM:
>
> pip install langcheck
>
> Please note that LangCheck requires Python 3.8 or higher to work properly.
> Please note that LangCheck requires Python 3.9 or higher to work properly.
We can also see the sources that were retrieved from the index. By default, the top 2 most relevant source nodes are returned, which is what we see in `response.source_nodes`.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -29,11 +29,11 @@ dependencies = [
'tomli; python_version < "3.11"',
'tokenizers >= 0.13.2; python_version >= "3.11"', # See https://github.com/citadel-ai/langcheck/pull/45
'torch >= 2',
'transformers >= 4.6, < 4.46',
'transformers >= 4.6',
'tabulate >= 0.9.0', # For model manager print table
'omegaconf >= 2.3.0' # For model manager print table
]
requires-python = ">=3.8"
requires-python = ">=3.9"

[project.optional-dependencies]
de = [] # No extra dependencies needed for German
4 changes: 1 addition & 3 deletions src/langcheck/metrics/de/_tokenizers.py
@@ -1,5 +1,3 @@
from typing import List

from nltk.stem.cistem import Cistem
from nltk.tokenize import word_tokenize
from rouge_score.tokenizers import Tokenizer as BaseTokenizer
@@ -16,7 +14,7 @@ def __init__(self, stemmer=False):
if stemmer:
self.stemmer = Cistem()

def tokenize(self, text: str) -> List[str]:
def tokenize(self, text: str) -> list[str]:
if self.stemmer:
# use only the stem part of the word
text, _ = self.stemmer.segment(text)
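
The `List[str]` → `list[str]` rewrite above is exactly what the version bump enables: PEP 585 built-in generics require Python 3.9 at runtime, so the `typing.List` import can be dropped once 3.8 support goes away. A standalone illustration (not repository code):

```python
# PEP 585: on Python 3.9+, built-in collections are subscriptable directly,
# so typing.List / typing.Dict imports are no longer needed for annotations.

def tokenize(text: str) -> list[str]:
    """Split text on whitespace (a toy stand-in for the real tokenizer)."""
    return text.split()


counts: dict[str, int] = {}
for token in tokenize("the black cat"):
    counts[token] = counts.get(token, 0) + 1

print(counts)  # {'the': 1, 'black': 1, 'cat': 1}
```
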
30 changes: 14 additions & 16 deletions src/langcheck/metrics/de/reference_based_text_quality.py
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import List, Optional

from rouge_score import rouge_scorer

from langcheck.metrics.de._tokenizers import DeTokenizer
@@ -19,9 +17,9 @@


def semantic_similarity(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
) -> MetricValue[float]:
"""Calculates the semantic similarities between the generated outputs and
@@ -85,9 +83,9 @@ def semantic_similarity(


def rouge1(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the F1 metrics of the ROUGE-1 scores between the generated
outputs and the reference outputs. It evaluates the overlap of unigrams
@@ -127,9 +125,9 @@ def rouge1(


def rouge2(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the F1 metrics of the ROUGE-2 scores between the generated
outputs and the reference outputs. It evaluates the overlap of bigrams
@@ -169,9 +167,9 @@ def rouge2(


def rougeL(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the F1 metrics of the ROUGE-L scores between the generated
outputs and the reference outputs. It evaluates the longest common
@@ -221,8 +219,8 @@ def rougeL(


def _rouge(
generated_outputs: List[str], reference_outputs: List[str], rouge_type: str
) -> List[float]:
generated_outputs: list[str], reference_outputs: list[str], rouge_type: str
) -> list[float]:
"""Helper function for computing the rouge1, rouge2, and rougeL metrics.
This uses Google Research's implementation of ROUGE:
https://github.com/google-research/google-research/tree/master/rouge
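
For context on the German reference-based metrics whose signatures change above, here is a usage sketch based only on the signatures shown in this diff; the German inputs are invented examples, not repository data:

```python
# Sketch of calling the German ROUGE metrics after the annotation change.
# Both arguments accept either a single string or a list[str], as the new
# `list[str] | str` signatures indicate.
from langcheck.metrics.de.reference_based_text_quality import rouge1, rougeL

generated_outputs = ["Die Katze sitzt auf der Matte."]
reference_outputs = ["Eine Katze sitzt auf der Matte."]

rouge1_scores = rouge1(generated_outputs, reference_outputs)
rougel_scores = rougeL(generated_outputs, reference_outputs)

# Both return MetricValue[float] objects holding one F1 score per output pair.
print(rouge1_scores)
print(rougel_scores)
```
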
40 changes: 19 additions & 21 deletions src/langcheck/metrics/de/reference_free_text_quality.py
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import List, Optional

from langcheck.metrics.de._translation import Translate
from langcheck.metrics.de.reference_based_text_quality import (
semantic_similarity,
@@ -30,11 +28,11 @@


def sentiment(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
local_overflow_strategy: str = "truncate",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the sentiment scores of generated outputs. This metric takes
on float values between [0, 1], where 0 is negative sentiment and 1 is
positive sentiment. (NOTE: when using an EvalClient, the sentiment scores
@@ -112,8 +110,8 @@ def sentiment(


def _sentiment_local(
generated_outputs: List[str], overflow_strategy: str
) -> List[Optional[float]]:
generated_outputs: list[str], overflow_strategy: str
) -> list[float | None]:
"""Calculates the sentiment scores of generated outputs using the
twitter-xlm-roberta-base-sentiment-finetunned model. This metric takes on
float values between [0, 1], where 0 is negative sentiment and 1 is positive
@@ -142,10 +140,10 @@ def _sentiment_local(


def fluency(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the fluency scores of generated outputs. This metric takes on
float values between [0, 1], where 0 is low fluency and 1 is high fluency.
(NOTE: when using an EvalClient, the fluency scores are either 0.0
@@ -220,11 +218,11 @@ def fluency(


def toxicity(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
local_overflow_strategy: str = "truncate",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the toxicity scores of generated outputs. This metric takes on
float values between [0, 1], where 0 is low toxicity and 1 is high toxicity.
(NOTE: when using an EvalClient, the toxicity scores are in steps of
@@ -301,8 +299,8 @@ def toxicity(


def _toxicity_local(
generated_outputs: List[str], overflow_strategy: str
) -> List[Optional[float]]:
generated_outputs: list[str], overflow_strategy: str
) -> list[float | None]:
"""Calculates the toxicity scores of generated outputs using the Detoxify
model. This metric takes on float values between [0, 1], where 0 is low
toxicity and 1 is high toxicity.
@@ -324,8 +322,8 @@ def _toxicity_local(


def flesch_kincaid_grade(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the readability of generated outputs using the Flesch-Kincaid.
It is the same as in English (but higher):
@@ -338,8 +336,8 @@ def flesch_kincaid_grade(


def flesch_reading_ease(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the readability of generated outputs using the Flesch Reading
Ease Score. This metric takes on float values between (-∞, 121.22], but
@@ -387,8 +385,8 @@ def flesch_reading_ease(


def ai_disclaimer_similarity(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
ai_disclaimer_phrase: str = (
"Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein."
),
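
One detail worth noting about the `Optional[float]` → `float | None` rewrites in this file: PEP 604 union syntax is only a runtime feature from Python 3.10, but because the module keeps `from __future__ import annotations`, annotations are never evaluated and the new spelling is valid on Python 3.9. A small sketch of that distinction (not repository code):

```python
# With the future import, annotations are stored as strings and not evaluated,
# so `float | None` in an annotation is accepted on Python 3.9 even though
# `float | None` as a plain runtime expression needs Python 3.10+.
from __future__ import annotations


def first_score(scores: list[float | None]) -> float | None:
    """Return the first score that is not None, or None if all are missing."""
    for score in scores:
        if score is not None:
            return score
    return None


print(first_score([None, 0.25, 0.9]))  # 0.25
```
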
14 changes: 6 additions & 8 deletions src/langcheck/metrics/de/source_based_text_quality.py
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import List, Optional

from langcheck.metrics.de._translation import Translate
from langcheck.metrics.en.source_based_text_quality import (
factual_consistency as en_factual_consistency,
@@ -20,11 +18,11 @@


def factual_consistency(
generated_outputs: List[str] | str,
sources: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
sources: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the factual consistency between the generated outputs and
the sources. This metric takes on float values between [0, 1], where 0
means that the output is not at all consistent with the source text, and 1
@@ -123,8 +121,8 @@ def factual_consistency(


def context_relevance(
sources: List[str] | str, prompts: List[str] | str, eval_model: EvalClient
) -> MetricValue[Optional[float]]:
sources: list[str] | str, prompts: list[str] | str, eval_model: EvalClient
) -> MetricValue[float | None]:
"""Calculates the relevance of the sources to the prompts. This metric takes
on float values between [0, 1], where 0 means that the source text is not at
all relevant to the prompt, and 1 means that the source text is fully
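
A usage sketch for the factual consistency metric shown above, based on the signature in this diff; the German inputs are invented examples, and the default local evaluation model is assumed:

```python
# Sketch of the German factual consistency metric with the default local model.
# Scores fall in [0, 1]; 1 means the output is fully consistent with the source.
from langcheck.metrics.de.source_based_text_quality import factual_consistency

sources = ["Die Hauptstadt von Frankreich ist Paris."]
generated_outputs = ["Paris ist die Hauptstadt von Frankreich."]

# eval_model defaults to "local"; an EvalClient can be passed instead for
# LLM-based evaluation, per the `str | EvalClient` annotation above.
consistency = factual_consistency(generated_outputs, sources)
print(consistency)  # MetricValue[float | None], one score per output
```
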
24 changes: 12 additions & 12 deletions src/langcheck/metrics/en/pairwise_text_quality.py
@@ -2,7 +2,7 @@

import math
import random
from typing import List, Optional, cast
from typing import cast

from langcheck.metrics._pairwise_text_quality_utils import (
compute_pairwise_comparison_metric_values_with_consistency,
@@ -16,13 +16,13 @@


def simulated_annotators(
prompt_params: List[dict[str, str | None]],
prompt_params: list[dict[str, str | None]],
eval_model: EvalClient,
preference_data_path: str = "en/confidence_estimating/preference_data_examples.jsonl",
k: int = 5,
n: int = 5,
seed: int | None = None,
) -> List[float | None]:
) -> list[float | None]:
"""Compute a confidence score for the pairwise comparison metric based on
the method Simulated Annotators proposed in the paper "Trust or Escalate:
LLM Judges with Provable Guarantees for Human Agreement"
@@ -73,7 +73,7 @@ def simulated_annotators(
prompts.append(prompt_template.render(prompt_param))

# Get the response and top five logprobs of the first token
responses: List[Optional[TextResponseWithLogProbs]] = (
responses: list[TextResponseWithLogProbs | None] = (
eval_model.get_text_responses_with_log_likelihood(
prompts, top_logprobs=5
)
@@ -83,7 +83,7 @@
if response:
response = cast(TextResponseWithLogProbs, response)
top_five_first_token_logprobs = cast(
List[TokenLogProb], response["response_logprobs"][0]
list[TokenLogProb], response["response_logprobs"][0]
)
# Extract logprobs for tokens 'A' and 'B'
logprobs_dict = {
@@ -110,20 +110,20 @@


def pairwise_comparison(
generated_outputs_a: List[str] | str,
generated_outputs_b: List[str] | str,
prompts: List[str] | str,
sources_a: Optional[List[str] | str] = None,
sources_b: Optional[List[str] | str] = None,
reference_outputs: Optional[List[str] | str] = None,
generated_outputs_a: list[str] | str,
generated_outputs_b: list[str] | str,
prompts: list[str] | str,
sources_a: list[str] | str | None = None,
sources_b: list[str] | str | None = None,
reference_outputs: list[str] | str | None = None,
enforce_consistency: bool = True,
calculated_confidence: bool = False,
preference_data_path: str = "en/confidence_estimating/preference_data_examples.jsonl",
k: int = 5,
n: int = 5,
seed: int | None = None,
eval_model: EvalClient | None = None,
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the pairwise comparison metric. This metric takes on float
values of either 0.0 (Response A is better), 0.5 (Tie), or 1.0 (Response B
is better). The score may also be `None` if it could not be computed.
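
Finally, a sketch of calling `pairwise_comparison` with its updated signature. The `OpenAIEvalClient` import below is an assumption for illustration (check `langcheck.metrics.eval_clients` in your installed version); any `EvalClient` implementation should slot in the same way:

```python
# Sketch only: the EvalClient class name and import path are assumed.
from langcheck.metrics.en.pairwise_text_quality import pairwise_comparison
from langcheck.metrics.eval_clients import OpenAIEvalClient  # assumed name

eval_client = OpenAIEvalClient()  # assumed to read the API key from the environment

scores = pairwise_comparison(
    generated_outputs_a=["Tokyo is the capital of Japan."],
    generated_outputs_b=["I believe the capital of Japan is Kyoto."],
    prompts=["What is the capital of Japan?"],
    eval_model=eval_client,
    enforce_consistency=True,  # default; uses the consistency helper imported in the diff above
)
# Each score is 0.0 (Response A is better), 0.5 (Tie), 1.0 (Response B is
# better), or None if a score could not be computed.
print(scores)
```
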