diff --git a/.github/workflows/pip_install_matrix.yml b/.github/workflows/pip_install_matrix.yml index 5468eb60..debf8ca4 100644 --- a/.github/workflows/pip_install_matrix.yml +++ b/.github/workflows/pip_install_matrix.yml @@ -17,17 +17,15 @@ jobs: fail-fast: false # Continue running jobs even if another fails matrix: # We specify Python versions as strings so 3.10 doesn't become 3.1 - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] os: [ubuntu-latest, windows-latest, macos-14] # "en", "de", and "" are equivalent # "all" is tested by pytest.yml language: ["en", "ja", "zh"] exclude: - # GitHub Actions doesn't support Python 3.8 and 3.9 on M1 macOS yet: + # GitHub Actions doesn't support Python 3.9 on M1 macOS yet: # https://github.com/actions/setup-python/issues/696 - - python-version: "3.8" - os: macos-14 - python-version: "3.9" os: macos-14 # TODO: Figure out how to install MeCab on Windows to install diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 9aa05816..5790595d 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,7 +7,7 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.8" + python: "3.9" sphinx: configuration: docs/conf.py diff --git a/docs/installation.md b/docs/installation.md index 22e880a8..b72d5400 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -14,7 +14,7 @@ pip install --upgrade pip pip install langcheck[all] ``` -LangCheck works with Python 3.8 or higher. +LangCheck works with Python 3.9 or higher. :::{note} Model files are lazily downloaded the first time you run a metric function. For example, the first time you run the ``langcheck.metrics.sentiment()`` function, LangCheck will automatically download the Twitter-roBERTa-base model. diff --git a/docs/tutorial_langcheckchat.md b/docs/tutorial_langcheckchat.md index ccab624a..fb257cc8 100644 --- a/docs/tutorial_langcheckchat.md +++ b/docs/tutorial_langcheckchat.md @@ -69,7 +69,7 @@ Here’s the response from the LLM: > > pip install langcheck > -> Please note that LangCheck requires Python 3.8 or higher to work properly. +> Please note that LangCheck requires Python 3.9 or higher to work properly. We can also see the sources that were retrieved from the index. By default, the top 2 most relevant source nodes are returned, which is what we see in `response.source_nodes`. 
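The installation docs above note that model files are downloaded lazily the first time a metric runs. As a quick illustration of what that first call looks like on the new Python 3.9+ floor (a usage sketch only, assuming `langcheck` is installed locally):

```python
# Usage sketch only: the first call downloads the Twitter-roBERTa-base model,
# as described in docs/installation.md; later calls reuse the cached files.
import langcheck

generated_outputs = [
    "Sorry, I cannot answer your question.",
    "Happy to help with that!",
]

# Returns a MetricValue with one sentiment score in [0, 1] per output.
result = langcheck.metrics.sentiment(generated_outputs)
print(result.to_df())
```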
diff --git a/pyproject.toml b/pyproject.toml index cf7a48bc..3ec923a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,11 +29,11 @@ dependencies = [ 'tomli; python_version < "3.11"', 'tokenizers >= 0.13.2; python_version >= "3.11"', # See https://github.com/citadel-ai/langcheck/pull/45 'torch >= 2', - 'transformers >= 4.6, < 4.46', + 'transformers >= 4.6', 'tabulate >= 0.9.0', # For model manager print table 'omegaconf >= 2.3.0' # For model manager print table ] -requires-python = ">=3.8" +requires-python = ">=3.9" [project.optional-dependencies] de = [] # No extra dependencies needed for German diff --git a/src/langcheck/metrics/de/_tokenizers.py b/src/langcheck/metrics/de/_tokenizers.py index cee6d96a..1dbbb675 100644 --- a/src/langcheck/metrics/de/_tokenizers.py +++ b/src/langcheck/metrics/de/_tokenizers.py @@ -1,5 +1,3 @@ -from typing import List - from nltk.stem.cistem import Cistem from nltk.tokenize import word_tokenize from rouge_score.tokenizers import Tokenizer as BaseTokenizer @@ -16,7 +14,7 @@ def __init__(self, stemmer=False): if stemmer: self.stemmer = Cistem() - def tokenize(self, text: str) -> List[str]: + def tokenize(self, text: str) -> list[str]: if self.stemmer: # use only the stem part of the word text, _ = self.stemmer.segment(text) diff --git a/src/langcheck/metrics/de/reference_based_text_quality.py b/src/langcheck/metrics/de/reference_based_text_quality.py index a27225db..51f0c104 100644 --- a/src/langcheck/metrics/de/reference_based_text_quality.py +++ b/src/langcheck/metrics/de/reference_based_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - from rouge_score import rouge_scorer from langcheck.metrics.de._tokenizers import DeTokenizer @@ -19,9 +17,9 @@ def semantic_similarity( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", ) -> MetricValue[float]: """Calculates the semantic similarities between the generated outputs and @@ -85,9 +83,9 @@ def semantic_similarity( def rouge1( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-1 scores between the generated outputs and the reference outputs. It evaluates the overlap of unigrams @@ -127,9 +125,9 @@ def rouge1( def rouge2( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-2 scores between the generated outputs and the reference outputs. It evaluates the overlap of bigrams @@ -169,9 +167,9 @@ def rouge2( def rougeL( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-L scores between the generated outputs and the reference outputs. 
It evaluates the longest common @@ -221,8 +219,8 @@ def rougeL( def _rouge( - generated_outputs: List[str], reference_outputs: List[str], rouge_type: str -) -> List[float]: + generated_outputs: list[str], reference_outputs: list[str], rouge_type: str +) -> list[float]: """Helper function for computing the rouge1, rouge2, and rougeL metrics. This uses Google Research's implementation of ROUGE: https://github.com/google-research/google-research/tree/master/rouge diff --git a/src/langcheck/metrics/de/reference_free_text_quality.py b/src/langcheck/metrics/de/reference_free_text_quality.py index 070e7db8..30a60afb 100644 --- a/src/langcheck/metrics/de/reference_free_text_quality.py +++ b/src/langcheck/metrics/de/reference_free_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - from langcheck.metrics.de._translation import Translate from langcheck.metrics.de.reference_based_text_quality import ( semantic_similarity, @@ -30,11 +28,11 @@ def sentiment( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", local_overflow_strategy: str = "truncate", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the sentiment scores of generated outputs. This metric takes on float values between [0, 1], where 0 is negative sentiment and 1 is positive sentiment. (NOTE: when using an EvalClient, the sentiment scores @@ -112,8 +110,8 @@ def sentiment( def _sentiment_local( - generated_outputs: List[str], overflow_strategy: str -) -> List[Optional[float]]: + generated_outputs: list[str], overflow_strategy: str +) -> list[float | None]: """Calculates the sentiment scores of generated outputs using the twitter-xlm-roberta-base-sentiment-finetunned model. This metric takes on float values between [0, 1], where 0 is negative sentiment and 1 is positive @@ -142,10 +140,10 @@ def _sentiment_local( def fluency( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the fluency scores of generated outputs. This metric takes on float values between [0, 1], where 0 is low fluency and 1 is high fluency. (NOTE: when using an EvalClient, the fluency scores are either 0.0 @@ -220,11 +218,11 @@ def fluency( def toxicity( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", local_overflow_strategy: str = "truncate", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the toxicity scores of generated outputs. This metric takes on float values between [0, 1], where 0 is low toxicity and 1 is high toxicity. (NOTE: when using an EvalClient, the toxicity scores are in steps of @@ -301,8 +299,8 @@ def toxicity( def _toxicity_local( - generated_outputs: List[str], overflow_strategy: str -) -> List[Optional[float]]: + generated_outputs: list[str], overflow_strategy: str +) -> list[float | None]: """Calculates the toxicity scores of generated outputs using the Detoxify model. This metric takes on float values between [0, 1], where 0 is low toxicity and 1 is high toxicity. 
@@ -324,8 +322,8 @@ def _toxicity_local( def flesch_kincaid_grade( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the readability of generated outputs using the Flesch-Kincaid. It is the same as in English (but higher): @@ -338,8 +336,8 @@ def flesch_kincaid_grade( def flesch_reading_ease( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the readability of generated outputs using the Flesch Reading Ease Score. This metric takes on float values between (-∞, 121.22], but @@ -387,8 +385,8 @@ def flesch_reading_ease( def ai_disclaimer_similarity( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ai_disclaimer_phrase: str = ( "Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein." ), diff --git a/src/langcheck/metrics/de/source_based_text_quality.py b/src/langcheck/metrics/de/source_based_text_quality.py index 91ef5407..56be205b 100644 --- a/src/langcheck/metrics/de/source_based_text_quality.py +++ b/src/langcheck/metrics/de/source_based_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - from langcheck.metrics.de._translation import Translate from langcheck.metrics.en.source_based_text_quality import ( factual_consistency as en_factual_consistency, @@ -20,11 +18,11 @@ def factual_consistency( - generated_outputs: List[str] | str, - sources: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + sources: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the factual consistency between the generated outputs and the sources. This metric takes on float values between [0, 1], where 0 means that the output is not at all consistent with the source text, and 1 @@ -123,8 +121,8 @@ def factual_consistency( def context_relevance( - sources: List[str] | str, prompts: List[str] | str, eval_model: EvalClient -) -> MetricValue[Optional[float]]: + sources: list[str] | str, prompts: list[str] | str, eval_model: EvalClient +) -> MetricValue[float | None]: """Calculates the relevance of the sources to the prompts. 
This metric takes on float values between [0, 1], where 0 means that the source text is not at all relevant to the prompt, and 1 means that the source text is fully diff --git a/src/langcheck/metrics/en/pairwise_text_quality.py b/src/langcheck/metrics/en/pairwise_text_quality.py index 23263a38..07482507 100644 --- a/src/langcheck/metrics/en/pairwise_text_quality.py +++ b/src/langcheck/metrics/en/pairwise_text_quality.py @@ -2,7 +2,7 @@ import math import random -from typing import List, Optional, cast +from typing import cast from langcheck.metrics._pairwise_text_quality_utils import ( compute_pairwise_comparison_metric_values_with_consistency, @@ -16,13 +16,13 @@ def simulated_annotators( - prompt_params: List[dict[str, str | None]], + prompt_params: list[dict[str, str | None]], eval_model: EvalClient, preference_data_path: str = "en/confidence_estimating/preference_data_examples.jsonl", k: int = 5, n: int = 5, seed: int | None = None, -) -> List[float | None]: +) -> list[float | None]: """Compute a confidence score for the pairwise comparison metric based on the method Simulated Annotators proposed in the paper "Trust or Escalate: LLM Judges with Provable Guarantees for Human Agreement" @@ -73,7 +73,7 @@ def simulated_annotators( prompts.append(prompt_template.render(prompt_param)) # Get the response and top five logprobs of the first token - responses: List[Optional[TextResponseWithLogProbs]] = ( + responses: list[TextResponseWithLogProbs | None] = ( eval_model.get_text_responses_with_log_likelihood( prompts, top_logprobs=5 ) @@ -83,7 +83,7 @@ def simulated_annotators( if response: response = cast(TextResponseWithLogProbs, response) top_five_first_token_logprobs = cast( - List[TokenLogProb], response["response_logprobs"][0] + list[TokenLogProb], response["response_logprobs"][0] ) # Extract logprobs for tokens 'A' and 'B' logprobs_dict = { @@ -110,12 +110,12 @@ def simulated_annotators( def pairwise_comparison( - generated_outputs_a: List[str] | str, - generated_outputs_b: List[str] | str, - prompts: List[str] | str, - sources_a: Optional[List[str] | str] = None, - sources_b: Optional[List[str] | str] = None, - reference_outputs: Optional[List[str] | str] = None, + generated_outputs_a: list[str] | str, + generated_outputs_b: list[str] | str, + prompts: list[str] | str, + sources_a: list[str] | str | None = None, + sources_b: list[str] | str | None = None, + reference_outputs: list[str] | str | None = None, enforce_consistency: bool = True, calculated_confidence: bool = False, preference_data_path: str = "en/confidence_estimating/preference_data_examples.jsonl", @@ -123,7 +123,7 @@ def pairwise_comparison( n: int = 5, seed: int | None = None, eval_model: EvalClient | None = None, -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the pairwise comparison metric. This metric takes on float values of either 0.0 (Response A is better), 0.5 (Tie), or 1.0 (Response B is better). The score may also be `None` if it could not be computed. 
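The signature changes in this file (and the modules below) all follow the same pattern enabled by the new 3.9 floor: PEP 585 builtin generics replace `typing.List`, and `X | None` replaces `Optional[X]` in annotations, which these modules can already use on 3.9 because they start with `from __future__ import annotations`. A condensed before/after sketch (illustrative only, not code from the repo):

```python
from __future__ import annotations  # annotations become strings, so `|` is fine on 3.9

# Before (Python 3.8 compatible):
#   from typing import List, Optional
#   def pairwise_comparison(prompts: List[str], sources_a: Optional[List[str]] = None): ...

# After (Python 3.9+ style used throughout this PR):
def pairwise_comparison_example(
    generated_outputs_a: list[str] | str,
    generated_outputs_b: list[str] | str,
    prompts: list[str] | str,
    sources_a: list[str] | str | None = None,
) -> None:
    """Same annotation style as the updated signatures; body intentionally empty."""
```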
diff --git a/src/langcheck/metrics/en/reference_based_text_quality.py b/src/langcheck/metrics/en/reference_based_text_quality.py index 578f62cc..cc0e9680 100644 --- a/src/langcheck/metrics/en/reference_based_text_quality.py +++ b/src/langcheck/metrics/en/reference_based_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - from rouge_score import rouge_scorer from langcheck.metrics.eval_clients import EvalClient @@ -19,11 +17,11 @@ def answer_correctness( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: List[str] | str, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str, eval_model: EvalClient, -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the correctness of the generated outputs. This metric takes on float values of either 0.0 (Incorrect), 0.5 (Partially Correct), or 1.0 (Correct). The score may also be `None` if it could not be computed. @@ -61,9 +59,9 @@ def answer_correctness( def semantic_similarity( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", ) -> MetricValue[float]: """Calculates the semantic similarities between the generated outputs and @@ -126,9 +124,9 @@ def semantic_similarity( def rouge1( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-1 scores between the generated outputs and the reference outputs. It evaluates the overlap of unigrams @@ -168,9 +166,9 @@ def rouge1( def rouge2( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-2 scores between the generated outputs and the reference outputs. It evaluates the overlap of bigrams @@ -210,9 +208,9 @@ def rouge2( def rougeL( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-L scores between the generated outputs and the reference outputs. It evaluates the longest common @@ -262,8 +260,8 @@ def rougeL( def _rouge( - generated_outputs: List[str], reference_outputs: List[str], rouge_type: str -) -> List[float]: + generated_outputs: list[str], reference_outputs: list[str], rouge_type: str +) -> list[float]: """Helper function for computing the rouge1, rouge2, and rougeL metrics. 
This uses Google Research's implementation of ROUGE: https://github.com/google-research/google-research/tree/master/rouge diff --git a/src/langcheck/metrics/en/reference_free_text_quality.py b/src/langcheck/metrics/en/reference_free_text_quality.py index 43636091..efbcef86 100644 --- a/src/langcheck/metrics/en/reference_free_text_quality.py +++ b/src/langcheck/metrics/en/reference_free_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - from langcheck.metrics.en.reference_based_text_quality import ( semantic_similarity, ) @@ -22,11 +20,11 @@ def sentiment( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", local_overflow_strategy: str = "truncate", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the sentiment scores of generated outputs. This metric takes on float values between [0, 1], where 0 is negative sentiment and 1 is positive sentiment. (NOTE: when using an EvalClient, the sentiment scores @@ -101,8 +99,8 @@ def sentiment( def _sentiment_local( - generated_outputs: List[str], overflow_strategy: str -) -> List[Optional[float]]: + generated_outputs: list[str], overflow_strategy: str +) -> list[float | None]: """Calculates the sentiment scores of generated outputs using the Twitter-roBERTa-base model. This metric takes on float values between [0, 1], where 0 is negative sentiment and 1 is positive sentiment. @@ -131,11 +129,11 @@ def _sentiment_local( def fluency( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", local_overflow_strategy: str = "truncate", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the fluency scores of generated outputs. This metric takes on float values between [0, 1], where 0 is low fluency and 1 is high fluency. (NOTE: when using an EvalClient, the fluency scores are either 0.0 @@ -210,8 +208,8 @@ def fluency( def _fluency_local( - generated_outputs: List[str], overflow_strategy: str -) -> List[Optional[float]]: + generated_outputs: list[str], overflow_strategy: str +) -> list[float | None]: """Calculates the fluency scores of generated outputs using the Parrot fluency model. This metric takes on float values between [0, 1], where 0 is low fluency and 1 is high fluency. @@ -238,12 +236,12 @@ def _fluency_local( def toxicity( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", local_overflow_strategy: str = "truncate", eval_prompt_version: str = "v2", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the toxicity scores of generated outputs. This metric takes on float values between [0, 1], where 0 is low toxicity and 1 is high toxicity. (NOTE: when using an EvalClient, the toxicity scores are either 0.0 @@ -335,8 +333,8 @@ def toxicity( def _toxicity_local( - generated_outputs: List[str], overflow_strategy: str -) -> List[Optional[float]]: + generated_outputs: list[str], overflow_strategy: str +) -> list[float | None]: """Calculates the toxicity scores of generated outputs using the Detoxify model. 
This metric takes on float values between [0, 1], where 0 is low toxicity and 1 is high toxicity. @@ -356,8 +354,8 @@ def _toxicity_local( def flesch_reading_ease( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the readability of generated outputs using the Flesch Reading Ease Score. This metric takes on float values between (-∞, 121.22], but @@ -403,8 +401,8 @@ def flesch_reading_ease( def flesch_kincaid_grade( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the readability of generated outputs using the Flesch-Kincaid Grade Level metric. This metric takes on float values between [-3.40, ∞), @@ -451,8 +449,8 @@ def flesch_kincaid_grade( def ai_disclaimer_similarity( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ai_disclaimer_phrase: str = ( "I don't have personal opinions, emotions, or consciousness." ), diff --git a/src/langcheck/metrics/en/source_based_text_quality.py b/src/langcheck/metrics/en/source_based_text_quality.py index 7f87f798..efe171b3 100644 --- a/src/langcheck/metrics/en/source_based_text_quality.py +++ b/src/langcheck/metrics/en/source_based_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - import nltk import torch import torch.nn as nn @@ -26,11 +24,11 @@ def factual_consistency( - generated_outputs: List[str] | str, - sources: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + sources: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the factual consistency between the generated outputs and the sources. This metric takes on float values between [0, 1], where 0 means that the output is not at all consistent with the source text, and 1 @@ -104,8 +102,8 @@ def factual_consistency( def _factual_consistency_local( - generated_outputs: List[str], sources: List[str] -) -> List[float]: + generated_outputs: list[str], sources: list[str] +) -> list[float]: """Calculates the factual consistency between each generated sentence and its corresponding source text. The factual consistency score for one generated output is computed as the average of the per-sentence @@ -226,8 +224,8 @@ def _factual_consistency_local( def context_relevance( - sources: List[str] | str, prompts: List[str] | str, eval_model: EvalClient -) -> MetricValue[Optional[float]]: + sources: list[str] | str, prompts: list[str] | str, eval_model: EvalClient +) -> MetricValue[float | None]: """Calculates the relevance of the sources to the prompts. 
This metric takes on float values between [0, 1], where 0 means that the source text is not at all relevant to the prompt, and 1 means that the source text is fully diff --git a/src/langcheck/metrics/eval_clients/_anthropic.py b/src/langcheck/metrics/eval_clients/_anthropic.py index dd28fd9a..e8989740 100644 --- a/src/langcheck/metrics/eval_clients/_anthropic.py +++ b/src/langcheck/metrics/eval_clients/_anthropic.py @@ -1,7 +1,8 @@ from __future__ import annotations import asyncio -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any from anthropic import Anthropic, AsyncAnthropic diff --git a/src/langcheck/metrics/eval_clients/_base.py b/src/langcheck/metrics/eval_clients/_base.py index e2075dae..9e1ac635 100644 --- a/src/langcheck/metrics/eval_clients/_base.py +++ b/src/langcheck/metrics/eval_clients/_base.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import Dict, Iterable, List, Optional, Union +from collections.abc import Iterable +from typing import Union from jinja2 import Template @@ -10,9 +11,9 @@ from ..prompts._utils import get_template from ..scorer._base import BaseSimilarityScorer -TokenLogProb = Dict[str, Union[str, float]] -TopKLogProbs = List[List[TokenLogProb]] -TextResponseWithLogProbs = Dict[str, Union[str, List[TopKLogProbs]]] +TokenLogProb = dict[str, Union[str, float]] +TopKLogProbs = list[list[TokenLogProb]] +TextResponseWithLogProbs = dict[str, Union[str, list[TopKLogProbs]]] class EvalClient: @@ -71,7 +72,7 @@ def get_text_responses_with_log_likelihood( top_logprobs: int | None = None, *, tqdm_description: str | None = None, - ) -> List[Optional[TextResponseWithLogProbs]]: + ) -> list[TextResponseWithLogProbs | None]: """The function that gets responses with log likelihood to the given prompt texts. Each concrete subclass needs to define the concrete implementation of this function to enable text scoring. 
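The eval-client modules now import `Iterable` from `collections.abc` rather than `typing`: since Python 3.9 (PEP 585) the `typing` aliases for the ABCs are deprecated, and the ABCs themselves are subscriptable. A minimal sketch of the new import style (`join_prompts` is a hypothetical helper, not code from the repo):

```python
# Illustrative only: the import style the eval clients switch to.
from collections.abc import Iterable


def join_prompts(prompts: Iterable[str]) -> str:
    # collections.abc.Iterable is subscriptable at runtime from Python 3.9 on.
    return "\n".join(prompts)


print(join_prompts(["first prompt", "second prompt"]))
```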
diff --git a/src/langcheck/metrics/eval_clients/_gemini.py b/src/langcheck/metrics/eval_clients/_gemini.py index e066c086..f5196b07 100644 --- a/src/langcheck/metrics/eval_clients/_gemini.py +++ b/src/langcheck/metrics/eval_clients/_gemini.py @@ -1,7 +1,8 @@ from __future__ import annotations import os -from typing import Any, Iterable +from collections.abc import Iterable +from typing import Any import google.ai.generativelanguage as glm import google.generativeai as genai diff --git a/src/langcheck/metrics/eval_clients/_llama.py b/src/langcheck/metrics/eval_clients/_llama.py index c26afa2e..a4e2b1d9 100644 --- a/src/langcheck/metrics/eval_clients/_llama.py +++ b/src/langcheck/metrics/eval_clients/_llama.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable from transformers import AutoTokenizer from vllm import LLM, SamplingParams diff --git a/src/langcheck/metrics/eval_clients/_openai.py b/src/langcheck/metrics/eval_clients/_openai.py index 2a044ddd..50c75c03 100644 --- a/src/langcheck/metrics/eval_clients/_openai.py +++ b/src/langcheck/metrics/eval_clients/_openai.py @@ -3,7 +3,8 @@ import asyncio import json import os -from typing import Any, Iterable, List, Optional +from collections.abc import Iterable +from typing import Any import torch from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI @@ -143,7 +144,7 @@ def get_text_responses_with_log_likelihood( top_logprobs: int | None = None, *, tqdm_description: str | None = None, - ) -> List[Optional[TextResponseWithLogProbs]]: + ) -> list[TextResponseWithLogProbs | None]: """The function that gets responses with log likelihood to the given prompt texts. Each concrete subclass needs to define the concrete implementation of this function to enable text scoring. diff --git a/src/langcheck/metrics/eval_clients/_prometheus.py b/src/langcheck/metrics/eval_clients/_prometheus.py index 62691975..c51ee9bb 100644 --- a/src/langcheck/metrics/eval_clients/_prometheus.py +++ b/src/langcheck/metrics/eval_clients/_prometheus.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Iterable +from collections.abc import Iterable from jinja2 import Template from transformers import AutoTokenizer diff --git a/src/langcheck/metrics/ja/pairwise_text_quality.py b/src/langcheck/metrics/ja/pairwise_text_quality.py index 22d6d1e2..cf6ab7e8 100644 --- a/src/langcheck/metrics/ja/pairwise_text_quality.py +++ b/src/langcheck/metrics/ja/pairwise_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - from langcheck.metrics._pairwise_text_quality_utils import ( compute_pairwise_comparison_metric_values_with_consistency, ) @@ -11,15 +9,15 @@ def pairwise_comparison( - generated_outputs_a: List[str] | str, - generated_outputs_b: List[str] | str, - prompts: List[str] | str, - sources_a: Optional[List[str] | str] = None, - sources_b: Optional[List[str] | str] = None, - reference_outputs: Optional[List[str] | str] = None, + generated_outputs_a: list[str] | str, + generated_outputs_b: list[str] | str, + prompts: list[str] | str, + sources_a: list[str] | str | None = None, + sources_b: list[str] | str | None = None, + reference_outputs: list[str] | str | None = None, enforce_consistency: bool = True, eval_model: EvalClient | None = None, -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the pairwise comparison metric. 
This metric takes on float values of either 0.0 (Response A is better), 0.5 (Tie), or 1.0 (Response B is better). The score may also be `None` if it could not be computed. diff --git a/src/langcheck/metrics/ja/reference_based_text_quality.py b/src/langcheck/metrics/ja/reference_based_text_quality.py index 4b8f7419..a8027875 100644 --- a/src/langcheck/metrics/ja/reference_based_text_quality.py +++ b/src/langcheck/metrics/ja/reference_based_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - from rouge_score import rouge_scorer from rouge_score.tokenizers import Tokenizer @@ -21,11 +19,11 @@ def answer_correctness( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: List[str] | str, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str, eval_model: EvalClient, -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the correctness of the generated outputs. This metric takes on float values of either 0.0 (Incorrect), 0.5 (Partially Correct), or 1.0 (Correct). The score may also be `None` if it could not be computed. @@ -63,9 +61,9 @@ def answer_correctness( def semantic_similarity( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", ) -> MetricValue[float]: """Calculates the semantic similarities between the generated outputs and @@ -130,11 +128,11 @@ def semantic_similarity( def rouge1( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, *, - tokenizer: Optional[Tokenizer] = None, + tokenizer: Tokenizer | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-1 scores between the generated (single tokens) between the generated outputs and the reference outputs. @@ -176,11 +174,11 @@ def rouge1( def rouge2( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, *, - tokenizer: Optional[Tokenizer] = None, + tokenizer: Tokenizer | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-2 scores between the generated outputs and the reference outputs. It evaluates the overlap of bigrams @@ -222,11 +220,11 @@ def rouge2( def rougeL( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, *, - tokenizer: Optional[Tokenizer] = None, + tokenizer: Tokenizer | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-L scores between the generated outputs and the reference outputs. 
It evaluates the longest common @@ -278,12 +276,12 @@ def rougeL( def _rouge( - generated_outputs: List[str], - reference_outputs: List[str], + generated_outputs: list[str], + reference_outputs: list[str], rouge_type: str, *, - tokenizer: Optional[Tokenizer] = None, -) -> List[float]: + tokenizer: Tokenizer | None = None, +) -> list[float]: """Helper function for computing the rouge1, rouge2, and rougeL metrics. This uses Google Research's implementation of ROUGE: https://github.com/google-research/google-research/tree/master/rouge diff --git a/src/langcheck/metrics/ja/reference_free_text_quality.py b/src/langcheck/metrics/ja/reference_free_text_quality.py index c8b0efb3..ec6d6565 100644 --- a/src/langcheck/metrics/ja/reference_free_text_quality.py +++ b/src/langcheck/metrics/ja/reference_free_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - import regex as re from langcheck.metrics.eval_clients import EvalClient @@ -19,11 +17,11 @@ def sentiment( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", local_overflow_strategy: str = "truncate", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the sentiment scores of generated outputs. This metric takes on float values between [0, 1], where 0 is negative sentiment and 1 is positive sentiment. (NOTE: when using an EvalClient, the sentiment scores @@ -101,8 +99,8 @@ def sentiment( def _sentiment_local( - generated_outputs: List[str], overflow_strategy: str -) -> List[Optional[float]]: + generated_outputs: list[str], overflow_strategy: str +) -> list[float | None]: """Calculates the sentiment scores of generated outputs using the Twitter-roBERTa-base-sentiment-multilingual model. This metric takes on float values between [0, 1], where 0 is negative sentiment and 1 is positive @@ -132,12 +130,12 @@ def _sentiment_local( def toxicity( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", local_overflow_strategy: str = "truncate", eval_prompt_version: str = "v2", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the toxicity scores of generated outputs. This metric takes on float values between [0, 1], where 0 is low toxicity and 1 is high toxicity. (NOTE: when using an EvalClient, the toxicity scores are either 0.0 @@ -235,8 +233,8 @@ def toxicity( def _toxicity_local( - generated_outputs: List[str], overflow_strategy: str -) -> List[Optional[float]]: + generated_outputs: list[str], overflow_strategy: str +) -> list[float | None]: """Calculates the toxicity scores of generated outputs using a fine-tuned model from `line-corporation/line-distilbert-base-japanese`. This metric takes on float values between [0, 1], where 0 is low toxicity and 1 is high @@ -265,11 +263,11 @@ def _toxicity_local( def fluency( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", local_overflow_strategy: str = "truncate", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the fluency scores of generated outputs. 
This metric takes on float values between [0, 1], where 0 is low fluency and 1 is high fluency. (NOTE: when using an EvalClient, the fluency scores are either 0.0 @@ -350,8 +348,8 @@ def fluency( def _fluency_local( - generated_outputs: List[str], overflow_strategy: str -) -> List[Optional[float]]: + generated_outputs: list[str], overflow_strategy: str +) -> list[float | None]: """Calculates the fluency scores of generated outputs using a fine-tuned model from `line-corporation/line-distilbert-base-japanese`. This metric takes on float values between [0, 1], where 0 is low fluency and 1 is high @@ -380,8 +378,8 @@ def _fluency_local( def tateishi_ono_yamada_reading_ease( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the readability of generated Japanese outputs using the reading ease score introduced in "日本文の読みやすさの評価式 (A Computer @@ -423,7 +421,7 @@ def tateishi_ono_yamada_reading_ease( delimiters_re = r"[、|。|!|?|!|?|「|」|,|,|.|.|…|『|』]" # Aux function to compute the average length of strings in the list - def _mean_str_length(ls: List[str]) -> float: + def _mean_str_length(ls: list[str]) -> float: if len(ls) == 0: return 0 lens = [len(el) for el in ls] diff --git a/src/langcheck/metrics/ja/source_based_text_quality.py b/src/langcheck/metrics/ja/source_based_text_quality.py index b9e9ee93..708ac66a 100644 --- a/src/langcheck/metrics/ja/source_based_text_quality.py +++ b/src/langcheck/metrics/ja/source_based_text_quality.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import List, Optional, cast +from typing import cast from transformers.pipelines import pipeline from transformers.pipelines.base import Pipeline @@ -23,11 +23,11 @@ def factual_consistency( - generated_outputs: List[str] | str, - sources: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + sources: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the factual consistency between the generated outputs and the sources. This metric takes on float values between [0, 1], where 0 means that the output is not at all consistent with the source text, and 1 @@ -106,8 +106,8 @@ def factual_consistency( def _factual_consistency_local( - generated_outputs: List[str], sources: List[str] -) -> List[float]: + generated_outputs: list[str], sources: list[str] +) -> list[float]: """Calculates the factual consistency between each generated sentence and its corresponding source text. The factual consistency score for one generated output is computed as the average of the per-sentence @@ -177,13 +177,13 @@ def _factual_consistency_local( generated_outputs=en_generated_outputs, sources=en_source ).metric_values - # Local factual consistency scores are of type List[float] + # Local factual consistency scores are of type list[float] return factual_consistency_scores # type: ignore def context_relevance( - sources: List[str] | str, prompts: List[str] | str, eval_model: EvalClient -) -> MetricValue[Optional[float]]: + sources: list[str] | str, prompts: list[str] | str, eval_model: EvalClient +) -> MetricValue[float | None]: """Calculates the relevance of the sources to the prompts. 
This metric takes on float values between [0, 1], where 0 means that the source text is not at all relevant to the prompt, and 1 means that the source text is fully diff --git a/src/langcheck/metrics/metric_inputs.py b/src/langcheck/metrics/metric_inputs.py index c7e0d8ac..98b0d335 100644 --- a/src/langcheck/metrics/metric_inputs.py +++ b/src/langcheck/metrics/metric_inputs.py @@ -1,11 +1,12 @@ from __future__ import annotations -from typing import List, Union +from typing import Union import pandas as pd from jinja2 import Environment, meta -IndividualInputType = Union[str, List[str], None] +# You need "Union" to declare a type in Python < 3.10 +IndividualInputType = Union[str, list[str], None] def _map_pairwise_input_to_list( diff --git a/src/langcheck/metrics/metric_value.py b/src/langcheck/metrics/metric_value.py index 4bf45bdd..d07e0b17 100644 --- a/src/langcheck/metrics/metric_value.py +++ b/src/langcheck/metrics/metric_value.py @@ -4,7 +4,7 @@ import warnings from dataclasses import dataclass, fields from statistics import mean -from typing import Generic, List, Optional, TypeVar +from typing import Generic, TypeVar, Union import pandas as pd @@ -12,7 +12,9 @@ # Metrics take on float or integer values # Some metrics may return `None` values when the score fails to be computed -NumericType = TypeVar("NumericType", float, int, Optional[float], Optional[int]) +NumericType = TypeVar( + "NumericType", float, int, Union[float, None], Union[int, None] +) @dataclass @@ -20,14 +22,14 @@ class MetricValue(Generic[NumericType]): """A rich object that is the output of any langcheck.metrics function.""" metric_name: str - metric_values: List[NumericType] + metric_values: list[NumericType] # Input of the metrics such as prompts, generated outputs... etc metric_inputs: MetricInputs # An explanation can be None if the metric could not be computed - explanations: Optional[List[Optional[str]]] - language: Optional[str] + explanations: list[str | None] | None + language: str | None def to_df(self) -> pd.DataFrame: """Returns a DataFrame of metric values for each data point.""" @@ -235,7 +237,7 @@ def pass_rate(self) -> float: return self._pass_rate @property - def threshold_results(self) -> List[bool]: + def threshold_results(self) -> list[bool]: """Returns a list of booleans indicating whether each data point passes the threshold. """ diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index fcf75dd4..c8b8e1b0 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from __future__ import annotations from sentence_transformers import SentenceTransformer from transformers.models.auto.modeling_auto import ( @@ -11,10 +11,11 @@ def load_sentence_transformers( - model_name: str, - model_revision: Optional[str] = None, - tokenizer_name: Optional[str] = None, - tokenizer_revision: Optional[str] = None) -> SentenceTransformer: + model_name: str, + model_revision: str | None = None, + tokenizer_name: str | None = None, + tokenizer_revision: str | None = None, +) -> SentenceTransformer: """ Loads a SentenceTransformer model. 
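As the retained comment in `metric_inputs.py` says, `Union` is still required where the type expression is evaluated at runtime: module-level type aliases such as `IndividualInputType`, and the `TypeVar` constraints in `metric_value.py`. `from __future__ import annotations` only defers evaluation of annotations, so the bare `X | Y` syntax in those positions would need Python 3.10. A small sketch of the distinction (the `as_list` helper is hypothetical):

```python
from __future__ import annotations  # affects annotations only, not ordinary expressions

from typing import TypeVar, Union

# Evaluated at import time: writing `str | list[str] | None` here raises
# TypeError on Python 3.9, so Union stays (same as metric_inputs.py).
IndividualInputType = Union[str, list[str], None]

# TypeVar constraints are also evaluated at runtime, hence Union in metric_value.py.
NumericType = TypeVar("NumericType", float, int, Union[float, None], Union[int, None])


def as_list(value: IndividualInputType) -> list[str] | None:
    # Inside annotations the `|` syntax is fine on 3.9 thanks to the future import.
    if value is None:
        return None
    return [value] if isinstance(value, str) else list(value)
```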
@@ -44,10 +45,10 @@ def load_sentence_transformers( def load_auto_model_for_text_classification( model_name: str, - model_revision: Optional[str] = None, - tokenizer_name: Optional[str] = None, - tokenizer_revision: Optional[str] = None -) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: + model_revision: str | None = None, + tokenizer_name: str | None = None, + tokenizer_revision: str | None = None, +) -> tuple[AutoTokenizer, AutoModelForSequenceClassification]: """ Loads a sequence classification model and its tokenizer. @@ -67,20 +68,21 @@ def load_auto_model_for_text_classification( # There are "Some weights are not used warning" for some models, but we # ignore it because that is intended. with _handle_logging_level(): - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, - trust_remote_code=True, - revision=tokenizer_revision) + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, trust_remote_code=True, revision=tokenizer_revision + ) model = AutoModelForSequenceClassification.from_pretrained( - model_name, revision=model_revision) + model_name, revision=model_revision + ) return tokenizer, model # type: ignore def load_auto_model_for_seq2seq( model_name: str, - model_revision: Optional[str] = None, - tokenizer_name: Optional[str] = None, - tokenizer_revision: Optional[str] = None -) -> Tuple[AutoTokenizer, AutoModelForSeq2SeqLM]: + model_revision: str | None = None, + tokenizer_name: str | None = None, + tokenizer_revision: str | None = None, +) -> tuple[AutoTokenizer, AutoModelForSeq2SeqLM]: """ Loads a sequence-to-sequence model and its tokenizer. @@ -97,11 +99,13 @@ def load_auto_model_for_seq2seq( """ if tokenizer_name is None: tokenizer_name = model_name - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, - revision=tokenizer_revision) + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, revision=tokenizer_revision + ) # There are "Some weights are not used warning" for some models, but we # ignore it because that is intended. 
with _handle_logging_level(): - model = AutoModelForSeq2SeqLM.from_pretrained(model_name, - revision=model_revision) + model = AutoModelForSeq2SeqLM.from_pretrained( + model_name, revision=model_revision + ) return tokenizer, model # type: ignore diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index 5995b164..b95e0439 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import os from copy import deepcopy from functools import lru_cache -from typing import Optional, Tuple, Union import pandas as pd import requests @@ -36,7 +37,7 @@ VALID_LANGUAGE = ["zh", "en", "ja", "de"] -def check_model_availability(model_name: str, revision: Optional[str]) -> bool: +def check_model_availability(model_name: str, revision: str | None) -> bool: # TODO: add local cached model availability check for offline environment if revision is None or revision == "": url = f"https://huggingface.co/api/models/{model_name}" @@ -88,11 +89,11 @@ def __load_config(self, path: str) -> None: @lru_cache def fetch_model( self, language: str, metric: str - ) -> Union[ - Tuple[AutoTokenizer, AutoModelForSequenceClassification], - Tuple[AutoTokenizer, AutoModelForSeq2SeqLM], - SentenceTransformer, - ]: + ) -> ( + tuple[AutoTokenizer, AutoModelForSequenceClassification] + | tuple[AutoTokenizer, AutoModelForSeq2SeqLM] + | SentenceTransformer + ): """ Return the model (and if applicable, the tokenizer) used for the given metric and language. diff --git a/src/langcheck/metrics/prompts/_utils.py b/src/langcheck/metrics/prompts/_utils.py index c263eff8..3a323b3c 100644 --- a/src/langcheck/metrics/prompts/_utils.py +++ b/src/langcheck/metrics/prompts/_utils.py @@ -28,7 +28,7 @@ def load_few_shot_examples(relative_path: str) -> list[dict[str, str]]: relative_path (str): The relative path of the JSONL file. Returns: - List[str]: The few-shot examples. + list[str]: The few-shot examples. """ cwd = Path(__file__).parent with open(cwd / relative_path) as f: diff --git a/src/langcheck/metrics/reference_based_text_quality.py b/src/langcheck/metrics/reference_based_text_quality.py index 88cb5276..1c6c640e 100644 --- a/src/langcheck/metrics/reference_based_text_quality.py +++ b/src/langcheck/metrics/reference_based_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - from langcheck.metrics.metric_inputs import ( get_metric_inputs_with_required_lists, ) @@ -12,9 +10,9 @@ def exact_match( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if the generated outputs exact matches with the reference outputs. This metric takes on binary 0 or 1 values. 
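For reference, a quick usage sketch of the `exact_match` metric whose signature changes just above (assuming the usual top-level re-export in `langcheck.metrics`; the printed values are indicative):

```python
from langcheck.metrics import exact_match

result = exact_match(
    generated_outputs=["Tokyo", "Osaka"],
    reference_outputs=["Tokyo", "Kyoto"],
)
print(result.metric_values)  # binary scores, e.g. [1, 0]
```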
diff --git a/src/langcheck/metrics/scorer/_base.py b/src/langcheck/metrics/scorer/_base.py index a1f22482..67ccc47f 100644 --- a/src/langcheck/metrics/scorer/_base.py +++ b/src/langcheck/metrics/scorer/_base.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Generic, Optional, TypeVar +from typing import Generic, TypeVar import torch from sentence_transformers import util @@ -27,7 +27,7 @@ def _tokenize(self, inputs: list[str]) -> _TokensType: """ raise NotImplementedError - def _score_tokens(self, tokens: _TokensType) -> list[Optional[float]]: + def _score_tokens(self, tokens: _TokensType) -> list[float | None]: """Score the tokens. The returned list should have the same length as the tokens. Each element in the list should be the score of the token. """ @@ -42,14 +42,14 @@ def _slice_tokens( """ raise NotImplementedError - def score(self, inputs: list[str]) -> list[Optional[float]]: + def score(self, inputs: list[str]) -> list[float | None]: """Score the inputs. Basically subclasses should not override this.""" tokens = self._tokenize(inputs) input_length = len(inputs) - scores: list[Optional[float]] = [] + scores: list[float | None] = [] for i in tqdm_wrapper( range(0, input_length, self.batch_size), total=(input_length + self.batch_size - 1) // self.batch_size, diff --git a/src/langcheck/metrics/scorer/detoxify_models.py b/src/langcheck/metrics/scorer/detoxify_models.py index e20292f1..49ae199e 100644 --- a/src/langcheck/metrics/scorer/detoxify_models.py +++ b/src/langcheck/metrics/scorer/detoxify_models.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional, Tuple, Union - import torch from transformers import ( BatchEncoding, @@ -28,10 +26,10 @@ def load_checkpoint( device: str, lang: str -) -> Tuple[ - Union[BertForSequenceClassification, XLMRobertaForSequenceClassification], - Union[BertTokenizer, XLMRobertaTokenizer], - List[str], +) -> tuple[ + BertForSequenceClassification | XLMRobertaForSequenceClassification, + BertTokenizer | XLMRobertaTokenizer, + list[str], ]: checkpoint_url = _checkpoints[lang] class_model_type, tokenizer_type = _model_types[lang] @@ -72,7 +70,7 @@ def __init__( device: str = "cpu", lang: str = "en", overflow_strategy: str = "truncate", - max_input_length: Optional[int] = None, + max_input_length: int | None = None, ): """ Initialize the scorer with the provided configs. @@ -96,7 +94,7 @@ def __init__( max_input_length or self.tokenizer.model_max_length ) - def _tokenize(self, inputs: list[str]) -> Tuple[BatchEncoding, list[bool]]: + def _tokenize(self, inputs: list[str]) -> tuple[BatchEncoding, list[bool]]: """Tokenize the inputs. It also does the validation on the token length, and return the results as a list of boolean values. If the validation mode is 'raise', it raises an error when the token length is invalid. 
@@ -143,10 +141,10 @@ def _validate_inputs(self, inputs: list[str]) -> list[bool]: def _slice_tokens( self, - tokens: Tuple[BatchEncoding, list[bool]], + tokens: tuple[BatchEncoding, list[bool]], start_idx: int, end_idx: int, - ) -> Tuple[BatchEncoding, list[bool]]: + ) -> tuple[BatchEncoding, list[bool]]: input_tokens, validation_results = tokens return ( @@ -158,8 +156,8 @@ def _slice_tokens( ) def _score_tokens( - self, tokens: Tuple[BatchEncoding, list[bool]] - ) -> list[Optional[float]]: + self, tokens: tuple[BatchEncoding, list[bool]] + ) -> list[float | None]: input_tokens, validation_results = tokens out = self.model(**input_tokens)[0] scores = torch.sigmoid(out).cpu().detach().numpy() diff --git a/src/langcheck/metrics/scorer/hf_models.py b/src/langcheck/metrics/scorer/hf_models.py index 20cfb1f2..599fcdd3 100644 --- a/src/langcheck/metrics/scorer/hf_models.py +++ b/src/langcheck/metrics/scorer/hf_models.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Optional, Tuple - import torch from transformers import BatchEncoding @@ -19,7 +17,7 @@ def __init__( metric, class_weights, overflow_strategy: str = "truncate", - max_input_length: Optional[int] = None, + max_input_length: int | None = None, ): """ Initialize the scorer with the provided configs. @@ -49,7 +47,7 @@ def __init__( else: self.max_input_length = self.model.config.max_position_embeddings # type: ignore - def _tokenize(self, inputs: list[str]) -> Tuple[BatchEncoding, list[bool]]: + def _tokenize(self, inputs: list[str]) -> tuple[BatchEncoding, list[bool]]: """Tokenize the inputs. It also does the validation on the token length, and return the results as a list of boolean values. If the validation mode is 'raise', it raises an error when the token length is invalid. 
@@ -95,13 +93,13 @@ def _validate_inputs(self, inputs: list[str]) -> list[bool]: return validation_results def _score_tokens( - self, tokens: Tuple[BatchEncoding, list[bool]] - ) -> list[Optional[float]]: + self, tokens: tuple[BatchEncoding, list[bool]] + ) -> list[float | None]: """Return the prediction results as scores.""" input_tokens, validation_results = tokens with torch.no_grad(): logits: torch.Tensor = self.model(**input_tokens).logits # type: ignore - scores: list[Optional[float]] = self._logits_to_scores(logits) # type: ignore + scores: list[float | None] = self._logits_to_scores(logits) # type: ignore for i, validation_result in enumerate(validation_results): if not validation_result: @@ -111,10 +109,10 @@ def _score_tokens( def _slice_tokens( self, - tokens: Tuple[BatchEncoding, list[bool]], + tokens: tuple[BatchEncoding, list[bool]], start_idx: int, end_idx: int, - ) -> Tuple[BatchEncoding, list[bool]]: + ) -> tuple[BatchEncoding, list[bool]]: input_tokens, validation_results = tokens return ( diff --git a/src/langcheck/metrics/text_structure.py b/src/langcheck/metrics/text_structure.py index 3315490d..a659efc7 100644 --- a/src/langcheck/metrics/text_structure.py +++ b/src/langcheck/metrics/text_structure.py @@ -2,7 +2,7 @@ import json import re -from typing import Callable, Container, Iterable, List, Optional +from collections.abc import Callable, Container, Iterable from langcheck.metrics.metric_inputs import ( get_metric_inputs_with_required_lists, @@ -12,9 +12,9 @@ def is_int( - generated_outputs: List[str] | str, + generated_outputs: list[str] | str, domain: Iterable[int] | Container[int] | None = None, - prompts: Optional[List[str] | str] = None, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if generated outputs can be parsed as integers, optionally within a domain of integers like `range(1, 11)` or `{1, 3, 5}`. This metric takes @@ -57,10 +57,10 @@ def is_int( def is_float( - generated_outputs: List[str] | str, - min: Optional[float] = None, - max: Optional[float] = None, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + min: float | None = None, + max: float | None = None, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if generated outputs can be parsed as floating point numbers, optionally within a min/max range. This metric takes on binary 0 or 1 @@ -109,8 +109,8 @@ def is_float( def is_json_object( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if generated outputs can be parsed as JSON objects. This metric takes on binary 0 or 1 values. @@ -151,8 +151,8 @@ def is_json_object( def is_json_array( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if generated outputs can be parsed as JSON arrays. This metric takes on binary 0 or 1 values. @@ -193,9 +193,9 @@ def is_json_array( def matches_regex( - generated_outputs: List[str] | str, + generated_outputs: list[str] | str, regex: str, - prompts: Optional[List[str] | str] = None, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if generated outputs fully match a given regular expression. This metric takes on binary 0 or 1 values. 
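The structure checks in `text_structure.py` keep their behaviour; only the annotations change. A short usage sketch of two of them (again assuming the top-level re-exports; printed values are indicative):

```python
from langcheck.metrics import is_float, is_int

# Integers, optionally restricted to a domain such as range(1, 11) or {1, 3, 5}.
print(is_int(["7", "12", "three"], domain=range(1, 11)).metric_values)  # e.g. [1, 0, 0]

# Floats, optionally restricted to a min/max range.
print(is_float(["3.5", "-0.2"], min=0.0).metric_values)  # e.g. [1, 0]
```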
@@ -233,9 +233,9 @@ def matches_regex( def contains_regex( - generated_outputs: List[str] | str, + generated_outputs: list[str] | str, regex: str, - prompts: Optional[List[str] | str] = None, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if generated outputs partially contain a given regular expression. This metric takes on binary 0 or 1 values. @@ -273,10 +273,10 @@ def contains_regex( def contains_all_strings( - generated_outputs: List[str] | str, - strings: List[str], + generated_outputs: list[str] | str, + strings: list[str], case_sensitive: bool = False, - prompts: Optional[List[str] | str] = None, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if generated outputs contain all strings in of a given list. This metric takes on binary 0 or 1 values. @@ -323,10 +323,10 @@ def contains_all_strings( def contains_any_strings( - generated_outputs: List[str] | str, - strings: List[str], + generated_outputs: list[str] | str, + strings: list[str], case_sensitive: bool = False, - prompts: Optional[List[str] | str] = None, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if generated outputs contain any strings in a given list. This metric takes on binary 0 or 1 values. @@ -374,9 +374,9 @@ def contains_any_strings( def validation_fn( - generated_outputs: List[str] | str, + generated_outputs: list[str] | str, valid_fn: Callable[[str], bool], - prompts: Optional[List[str] | str] = None, + prompts: list[str] | str | None = None, ) -> MetricValue[int]: """Checks if generated outputs are valid according to an arbitrary function. This metric takes on binary 0 or 1 values. diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 2498af09..dbae15b9 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - from rouge_score import rouge_scorer from rouge_score.tokenizers import Tokenizer @@ -19,9 +17,9 @@ def semantic_similarity( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", ) -> MetricValue[float]: """ @@ -91,11 +89,11 @@ def semantic_similarity( def rouge1( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, *, - tokenizer: Optional[Tokenizer] = None, + tokenizer: Tokenizer | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-1 scores between the generated (single tokens) between the generated outputs and the reference outputs. @@ -136,11 +134,11 @@ def rouge1( def rouge2( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, *, - tokenizer: Optional[Tokenizer] = None, + tokenizer: Tokenizer | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-2 scores between the generated outputs and the reference outputs. 
It evaluates the overlap of bigrams @@ -182,11 +180,11 @@ def rouge2( def rougeL( - generated_outputs: List[str] | str, - reference_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + reference_outputs: list[str] | str, + prompts: list[str] | str | None = None, *, - tokenizer: Optional[Tokenizer] = None, + tokenizer: Tokenizer | None = None, ) -> MetricValue[float]: """Calculates the F1 metrics of the ROUGE-L scores between the generated outputs and the reference outputs. It evaluates the longest common @@ -238,12 +236,12 @@ def rougeL( def _rouge( - generated_outputs: List[str], - reference_outputs: List[str], + generated_outputs: list[str], + reference_outputs: list[str], rouge_type: str, *, - tokenizer: Optional[Tokenizer] = None, -) -> List[float]: + tokenizer: Tokenizer | None = None, +) -> list[float]: """Helper function for computing the rouge1, rouge2, and rougeL metrics. This uses Google Research's implementation of ROUGE: https://github.com/google-research/google-research/tree/master/rouge diff --git a/src/langcheck/metrics/zh/reference_free_text_quality.py b/src/langcheck/metrics/zh/reference_free_text_quality.py index e74ee6d3..ed0bd504 100644 --- a/src/langcheck/metrics/zh/reference_free_text_quality.py +++ b/src/langcheck/metrics/zh/reference_free_text_quality.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import List, Optional - import hanlp from transformers.pipelines import pipeline @@ -21,10 +19,10 @@ def sentiment( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the sentiment scores of generated outputs. This metric takes on float values between [0, 1], where 0 is negative sentiment and 1 is positive sentiment. (NOTE: when using an EvalClient, the sentiment scores @@ -100,11 +98,11 @@ def sentiment( def toxicity( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", eval_prompt_version: str = "v2", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the toxicity scores of generated outputs. This metric takes on float values between [0, 1], where 0 is low toxicity and 1 is high toxicity. (NOTE: when using an EvalClient, the toxicity scores are in steps of @@ -167,7 +165,7 @@ def toxicity( ) -def _toxicity_local(generated_outputs: List[str]) -> List[float]: +def _toxicity_local(generated_outputs: list[str]) -> list[float]: """Calculates the toxicity scores of generated outputs using a fine-tuned model from `alibaba-pai/pai-bert-base-zh-llm-risk-detection`. This metric takes on float values between [0, 1], where 0 is low toxicity and 1 is high @@ -183,7 +181,7 @@ def _toxicity_local(generated_outputs: List[str]) -> List[float]: A list of scores """ # this pipeline output predict probability for each text on each label. 
- # the output format is List[List[Dict(str)]] + # the output format is list[list[dict(str)]] from langcheck.metrics.model_manager import manager tokenizer, model = manager.fetch_model(language="zh", metric="toxicity") @@ -210,8 +208,8 @@ def _toxicity_local(generated_outputs: List[str]) -> List[float]: def xuyaochen_report_readability( - generated_outputs: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + prompts: list[str] | str | None = None, ) -> MetricValue[float]: """Calculates the readability scores of generated outputs introduced in "中文年报可读性"(Chinese annual report readability). This metric calculates @@ -262,27 +260,27 @@ def xuyaochen_report_readability( # List[List[List[POS]]] output_pos = list(map(pos_pipeline, generated_outputs)) - def count_tokens(sent_tokens: List[str]) -> int: + def count_tokens(sent_tokens: list[str]) -> int: count = sum([ not hanlp.utils.string_util.ispunct(token) for token in # type: ignore[reportGeneralTypeIssues] sent_tokens ]) return count - def count_postags(sent_poses: List[str]) -> int: + def count_postags(sent_poses: list[str]) -> int: # AD: adverb, CC: coordinating conjunction, # CS: subordinating conjunction count = sum([pos in ["AD", "CC", "CS"] for pos in sent_poses]) return count - def calc_r1(content: List[List[str]]) -> float: + def calc_r1(content: list[list[str]]) -> float: token_count_by_sentence = list(map(count_tokens, content)) if len(token_count_by_sentence) == 0: return 0 else: return sum(token_count_by_sentence) / len(token_count_by_sentence) - def calc_r2(content: List[List[str]]) -> float: + def calc_r2(content: list[list[str]]) -> float: pos_count_by_sentence = list(map(count_postags, content)) if len(pos_count_by_sentence) == 0: return 0 diff --git a/src/langcheck/metrics/zh/source_based_text_quality.py b/src/langcheck/metrics/zh/source_based_text_quality.py index edb84beb..27ae8ba4 100644 --- a/src/langcheck/metrics/zh/source_based_text_quality.py +++ b/src/langcheck/metrics/zh/source_based_text_quality.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import List, Optional, cast +from typing import cast from transformers.pipelines import pipeline @@ -17,11 +17,11 @@ def factual_consistency( - generated_outputs: List[str] | str, - sources: List[str] | str, - prompts: Optional[List[str] | str] = None, + generated_outputs: list[str] | str, + sources: list[str] | str, + prompts: list[str] | str | None = None, eval_model: str | EvalClient = "local", -) -> MetricValue[Optional[float]]: +) -> MetricValue[float | None]: """Calculates the factual consistency between the generated outputs and the sources. 
This metric takes on float values between [0, 1], where 0 means that the output is not at all consistent with the source text, and 1 diff --git a/src/langcheck/plot/_scatter.py b/src/langcheck/plot/_scatter.py index eb37e8ad..5721325f 100644 --- a/src/langcheck/plot/_scatter.py +++ b/src/langcheck/plot/_scatter.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import math import textwrap from copy import deepcopy -from typing import Optional, Union import plotly.express as px from dash import Dash, Input, Output, dcc, html @@ -14,7 +15,7 @@ def scatter( metric_value: MetricValue, - other_metric_value: Optional[MetricValue] = None, + other_metric_value: MetricValue | None = None, jupyter_mode: str = "inline", ) -> None: """Shows an interactive scatter plot of all data points in an @@ -422,7 +423,7 @@ def update_figure( # Unfortunately it's not possible to make "index" show up at the top of # the tooltip like _scatter_one_metric_value() since Plotly always # displays the x and y values at the top.) - hover_data: dict[str, Union[bool, Index]] = { + hover_data: dict[str, bool | Index] = { col: True for col in filtered_df.columns } hover_data["index"] = filtered_df.index diff --git a/src/langcheck/plot/_utils.py b/src/langcheck/plot/_utils.py index 726bcd05..1ad7db31 100644 --- a/src/langcheck/plot/_utils.py +++ b/src/langcheck/plot/_utils.py @@ -1,5 +1,6 @@ +from __future__ import annotations + from enum import Enum -from typing import Union from plotly.graph_objects import Figure @@ -9,8 +10,9 @@ class Axis(Enum): horizontal = 1 -def _plot_threshold(fig: Figure, threshold_op: str, - threshold: Union[float, int], direction: Axis): +def _plot_threshold( + fig: Figure, threshold_op: str, threshold: float | int, direction: Axis +): """Draw a dashed line on the target figure at the specified threshold value along either the horizontal or vertical axis. 
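The plot modules above switch from typing.Union to PEP 604 unions even though the PR only requires Python 3.9; this works because each file also gains from __future__ import annotations (PEP 563). A minimal sketch of the mechanism follows; the function below is a hypothetical stand-in, not the library's API:

from __future__ import annotations  # annotations are stored as strings, never evaluated

def draw_threshold(threshold: float | int, label: str | None = None) -> None:
    # The PEP 604 unions above import cleanly on Python 3.9 because they are
    # only ever stored as strings. Evaluating `float | int` at runtime (e.g. in
    # an isinstance() check) would still raise TypeError before Python 3.10.
    ...

# Built-in generics such as list[str] (PEP 585) already work natively on 3.9,
# which is why plain container annotations do not depend on the __future__ import.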
@@ -23,15 +25,19 @@ def _plot_threshold(fig: Figure, threshold_op: str, """ threshold_text = f"{threshold_op} {threshold}" if direction == Axis.horizontal: # Draw a horizontal line - fig.add_hline(y=threshold, - line_width=3, - line_dash="dash", - annotation_text=threshold_text, - annotation_position="right") + fig.add_hline( + y=threshold, + line_width=3, + line_dash="dash", + annotation_text=threshold_text, + annotation_position="right", + ) elif direction == Axis.vertical: # Draw a vertical line - fig.add_vline(x=threshold, - line_width=3, - line_dash="dash", - annotation_text=threshold_text, - annotation_position="top") + fig.add_vline( + x=threshold, + line_width=3, + line_dash="dash", + annotation_text=threshold_text, + annotation_position="top", + ) return diff --git a/src/langcheck/utils/progress_bar.py b/src/langcheck/utils/progress_bar.py index bba07e15..a56a1995 100644 --- a/src/langcheck/utils/progress_bar.py +++ b/src/langcheck/utils/progress_bar.py @@ -1,12 +1,17 @@ -from typing import Any, Iterable, Optional +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any from tqdm import tqdm -def tqdm_wrapper(iterable: Iterable[Any], - desc: Optional[str] = None, - total: Optional[int] = None, - unit: str = "it"): +def tqdm_wrapper( + iterable: Iterable[Any], + desc: str | None = None, + total: int | None = None, + unit: str = "it", +): """ Wrapper for tqdm to make it optional """ diff --git a/tests/augment/en/test_change_case.py b/tests/augment/en/test_change_case.py index 127c3ce6..cac143cc 100644 --- a/tests/augment/en/test_change_case.py +++ b/tests/augment/en/test_change_case.py @@ -1,7 +1,6 @@ from __future__ import annotations import random -from typing import List import pytest @@ -15,51 +14,118 @@ # To uppercase, single input ######################################################################## ("Hello, world!", 1, 0.9, "uppercase", ["HELLO, WORLD!"]), - ("Hello, world!", 2, 0.9, "uppercase", - ["HELLO, WORLD!", "HELLO, WORLd!"]), + ( + "Hello, world!", + 2, + 0.9, + "uppercase", + ["HELLO, WORLD!", "HELLO, WORLd!"], + ), (["Hello, world!"], 1, 0.9, "uppercase", ["HELLO, WORLD!"]), - (["Hello, world!" - ], 2, 0.9, "uppercase", ["HELLO, WORLD!", "HELLO, WORLd!"]), + ( + ["Hello, world!"], + 2, + 0.9, + "uppercase", + ["HELLO, WORLD!", "HELLO, WORLd!"], + ), ("Hello, world!", 1, 0.1, "uppercase", ["HEllo, WoRld!"]), - ("Hello, world!", 2, 0.1, "uppercase", - ["HEllo, WoRld!", "Hello, world!"]), + ( + "Hello, world!", + 2, + 0.1, + "uppercase", + ["HEllo, WoRld!", "Hello, world!"], + ), (["Hello, world!"], 1, 0.1, "uppercase", ["HEllo, WoRld!"]), - (["Hello, world!" - ], 2, 0.1, "uppercase", ["HEllo, WoRld!", "Hello, world!"]), + ( + ["Hello, world!"], + 2, + 0.1, + "uppercase", + ["HEllo, WoRld!", "Hello, world!"], + ), ######################################################################## # To lowercase, single input ######################################################################## ("HELLO, world!", 1, 0.9, "lowercase", ["hello, world!"]), - ("HELLO, world!", 2, 0.9, "lowercase", - ["hello, world!", "hello, world!"]), + ( + "HELLO, world!", + 2, + 0.9, + "lowercase", + ["hello, world!", "hello, world!"], + ), (["HELLO, world!"], 1, 0.9, "lowercase", ["hello, world!"]), - (["HELLO, world!" 
- ], 2, 0.9, "lowercase", ["hello, world!", "hello, world!"]), + ( + ["HELLO, world!"], + 2, + 0.9, + "lowercase", + ["hello, world!", "hello, world!"], + ), ("HELLO, world!", 1, 0.1, "lowercase", ["HeLLO, world!"]), - ("HELLO, world!", 2, 0.1, "lowercase", - ["HeLLO, world!", "HELLO, world!"]), + ( + "HELLO, world!", + 2, + 0.1, + "lowercase", + ["HeLLO, world!", "HELLO, world!"], + ), (["HELLO, world!"], 1, 0.1, "lowercase", ["HeLLO, world!"]), - (["HELLO, world!" - ], 2, 0.1, "lowercase", ["HeLLO, world!", "HELLO, world!"]), + ( + ["HELLO, world!"], + 2, + 0.1, + "lowercase", + ["HeLLO, world!", "HELLO, world!"], + ), ######################################################################## # Multiple inputs ######################################################################## - (["HELLO, world!", "I'm hungry" - ], 1, 0.9, "lowercase", ["hello, world!", "i'm hungry"]), - (["HELLO, world!", "I'm hungry"], 2, 0.9, "lowercase", - ["hello, world!", "hello, world!", "i'm hungry", "i'm hungry"]), - (["HELLO, world!", "I'm hungry" - ], 1, 0.1, "uppercase", ["HELLO, WoRld!", "I'm huNgry"]), - (["HELLO, world!", "I'm hungry"], 2, 0.1, "uppercase", - ["HELLO, WoRld!", "HELLO, world!", "I'm hungry", "I'm hUngRy"]) + ( + ["HELLO, world!", "I'm hungry"], + 1, + 0.9, + "lowercase", + ["hello, world!", "i'm hungry"], + ), + ( + ["HELLO, world!", "I'm hungry"], + 2, + 0.9, + "lowercase", + ["hello, world!", "hello, world!", "i'm hungry", "i'm hungry"], + ), + ( + ["HELLO, world!", "I'm hungry"], + 1, + 0.1, + "uppercase", + ["HELLO, WoRld!", "I'm huNgry"], + ), + ( + ["HELLO, world!", "I'm hungry"], + 2, + 0.1, + "uppercase", + ["HELLO, WoRld!", "HELLO, world!", "I'm hungry", "I'm hUngRy"], + ), ], ) -def test_change_case(instances: List[str] | str, num_perturbations: int, - aug_char_p: float, to_case: str, expected: List[str]): +def test_change_case( + instances: list[str] | str, + num_perturbations: int, + aug_char_p: float, + to_case: str, + expected: list[str], +): seed = 42 random.seed(seed) - actual = change_case(instances, - to_case=to_case, - aug_char_p=aug_char_p, - num_perturbations=num_perturbations) + actual = change_case( + instances, + to_case=to_case, + aug_char_p=aug_char_p, + num_perturbations=num_perturbations, + ) assert actual == expected diff --git a/tests/augment/en/test_gender.py b/tests/augment/en/test_gender.py index 70f8d21a..17479e9b 100644 --- a/tests/augment/en/test_gender.py +++ b/tests/augment/en/test_gender.py @@ -1,5 +1,6 @@ +from __future__ import annotations + import random -from typing import List, Optional import pytest @@ -21,33 +22,58 @@ def test_invalid_input(): [ (["He cooks by himself.", "This is his dog.", "I gave him a book."]), (["She cooks by herself.", "This is her dog.", "I gave her a book."]), - ([ - "They cooks by themselves.", "This is their dog.", - "I gave them a book." 
- ]), + ( + [ + "They cooks by themselves.", + "This is their dog.", + "I gave them a book.", + ] + ), + ], +) +@pytest.mark.parametrize( + "to_gender, expected", + [ + ( + None, + [ + "They cooks by themselves.", + "This is their dog.", + "I gave them a book.", + ], + ), + ( + "female", + ["She cooks by herself.", "This is her dog.", "I gave her a book."], + ), + ( + "male", + ["He cooks by himself.", "This is his dog.", "I gave him a book."], + ), + ( + "neutral", + ["Xe cooks by xyrself.", "This is xyr dog.", "I gave xem a book."], + ), + ( + "plural", + [ + "They cooks by themselves.", + "This is their dog.", + "I gave them a book.", + ], + ), ], ) -@pytest.mark.parametrize("to_gender, expected", [ - (None, [ - "They cooks by themselves.", "This is their dog.", "I gave them a book." - ]), - ("female", - ["She cooks by herself.", "This is her dog.", "I gave her a book."]), - ("male", ["He cooks by himself.", "This is his dog.", "I gave him a book." - ]), - ("neutral", - ["Xe cooks by xyrself.", "This is xyr dog.", "I gave xem a book."]), - ("plural", [ - "They cooks by themselves.", "This is their dog.", "I gave them a book." - ]), -]) def test_gender( - texts: List[str], - to_gender: Optional[str], - expected: List[str], + texts: list[str], + to_gender: str | None, + expected: list[str], ): seed = 42 random.seed(seed) - actual = gender(texts) if to_gender is None else gender(texts, - to_gender=to_gender) + actual = ( + gender(texts) + if to_gender is None + else gender(texts, to_gender=to_gender) + ) assert actual == expected diff --git a/tests/augment/en/test_keyboard_typo.py b/tests/augment/en/test_keyboard_typo.py index b740dd3b..8a9cfcfa 100644 --- a/tests/augment/en/test_keyboard_typo.py +++ b/tests/augment/en/test_keyboard_typo.py @@ -1,7 +1,6 @@ from __future__ import annotations import random -from typing import List import pytest @@ -16,12 +15,16 @@ (["Hello, world!"], 1, ["HePlo, wLrld!"]), (["Hello, world!"], 2, ["HePlo, wLrld!", "Helll, Aorld!"]), (["Hello, world!", "I'm hungry"], 1, ["HePlo, wLrld!", "I ' m hungrt"]), - (["Hello, world!", "I'm hungry"], 2, - ["HePlo, wLrld!", "Helll, Aorld!", "I ' m hKngry", "I ' m hungGy"]), + ( + ["Hello, world!", "I'm hungry"], + 2, + ["HePlo, wLrld!", "Helll, Aorld!", "I ' m hKngry", "I ' m hungGy"], + ), ], ) -def test_keyboard_typo(instances: List[str] | str, num_perturbations: int, - expected: List[str]): +def test_keyboard_typo( + instances: list[str] | str, num_perturbations: int, expected: list[str] +): seed = 42 random.seed(seed) actual = keyboard_typo(instances, num_perturbations=num_perturbations) diff --git a/tests/augment/en/test_ocr_typo.py b/tests/augment/en/test_ocr_typo.py index 80676eaf..bc0d876a 100644 --- a/tests/augment/en/test_ocr_typo.py +++ b/tests/augment/en/test_ocr_typo.py @@ -1,7 +1,6 @@ from __future__ import annotations import random -from typing import List import pytest @@ -16,12 +15,16 @@ (["Hello, world!"], 1, ["Hel1u, world!"]), (["Hello, world!"], 2, ["Hel1u, world!", "Hello, w0r1d!"]), (["Hello, world!", "I'm hungry"], 1, ["Hel1u, world!", "I ' m hungry"]), - (["Hello, world!", "I'm hungry"], 2, - ["Hel1u, world!", "Hello, w0r1d!", "1 ' m hongky", "I ' m hun9ky"]), + ( + ["Hello, world!", "I'm hungry"], + 2, + ["Hel1u, world!", "Hello, w0r1d!", "1 ' m hongky", "I ' m hun9ky"], + ), ], ) -def test_ocr_typo(instances: List[str] | str, num_perturbations: int, - expected: List[str]): +def test_ocr_typo( + instances: list[str] | str, num_perturbations: int, expected: list[str] +): seed = 42 
random.seed(seed) actual = ocr_typo(instances, num_perturbations=num_perturbations) diff --git a/tests/augment/en/test_remove_punctuation.py b/tests/augment/en/test_remove_punctuation.py index da2e217f..a1680c57 100644 --- a/tests/augment/en/test_remove_punctuation.py +++ b/tests/augment/en/test_remove_punctuation.py @@ -1,7 +1,6 @@ from __future__ import annotations import random -from typing import List import pytest @@ -15,17 +14,29 @@ ("Hello, world...!?", 2, 0.5, ["Hello, world!?", "Hello, world?"]), (["Hello, world...!?"], 1, 0.5, ["Hello, world!?"]), (["Hello, world...!?"], 2, 0.5, ["Hello, world!?", "Hello, world?"]), - (["Hello, world...!?", "!@#$%^&*()_+,./" - ], 1, 0.5, ["Hello, world!?", "!^()+,/"]), - (["Hello, world...!?", "!@#$%^&*()_+,./"], 2, 0.5, - ["Hello, world!?", "Hello, world?", "#$^&(),", "@#$%^&()_+,."]), + ( + ["Hello, world...!?", "!@#$%^&*()_+,./"], + 1, + 0.5, + ["Hello, world!?", "!^()+,/"], + ), + ( + ["Hello, world...!?", "!@#$%^&*()_+,./"], + 2, + 0.5, + ["Hello, world!?", "Hello, world?", "#$^&(),", "@#$%^&()_+,."], + ), ], ) -def test_remove_punctuation(instances: List[str] | str, num_perturbations: int, - aug_char_p: float, expected: List[str]): +def test_remove_punctuation( + instances: list[str] | str, + num_perturbations: int, + aug_char_p: float, + expected: list[str], +): seed = 42 random.seed(seed) - actual = remove_punctuation(instances, - aug_char_p=aug_char_p, - num_perturbations=num_perturbations) + actual = remove_punctuation( + instances, aug_char_p=aug_char_p, num_perturbations=num_perturbations + ) assert actual == expected diff --git a/tests/augment/en/test_to_full_width.py b/tests/augment/en/test_to_full_width.py index 2de533f7..c81032a8 100644 --- a/tests/augment/en/test_to_full_width.py +++ b/tests/augment/en/test_to_full_width.py @@ -1,7 +1,6 @@ from __future__ import annotations import random -from typing import List import pytest @@ -82,10 +81,10 @@ ], ) def test_to_ful_width( - instances: List[str] | str, + instances: list[str] | str, num_perturbations: int, aug_char_p: float, - expected: List[str], + expected: list[str], ): seed = 42 random.seed(seed) diff --git a/tests/augment/ja/test_conv_kana.py b/tests/augment/ja/test_conv_kana.py index 8da87bd4..50eabf64 100644 --- a/tests/augment/ja/test_conv_kana.py +++ b/tests/augment/ja/test_conv_kana.py @@ -1,7 +1,6 @@ from __future__ import annotations import random -from typing import List import pytest @@ -226,11 +225,11 @@ ], ) def test_change_case( - instances: List[str] | str, + instances: list[str] | str, num_perturbations: int, aug_char_p: float, convert_to: str, - expected: List[str], + expected: list[str], ): seed = 42 random.seed(seed) diff --git a/tests/metrics/de/test_tokenizers.py b/tests/metrics/de/test_tokenizers.py index ef6d5e1c..0762c910 100644 --- a/tests/metrics/de/test_tokenizers.py +++ b/tests/metrics/de/test_tokenizers.py @@ -1,24 +1,48 @@ -from typing import List - import pytest from langcheck.metrics.de import DeTokenizer -@pytest.mark.parametrize("text,expected_tokens", [ - ([ - "Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein.", - [ - "Ich", "habe", "keine", "persönlichen", "Meinungen", ",", - "Emotionen", "oder", "Bewusstsein", "." - ] - ]), - ("Mein Freund. Willkommen in den Karpaten. Ich erwarte dich sehnsüchtig.\n", - [ - "Mein", "Freund", ".", "Willkommen", "in", "den", "Karpaten", ".", - "Ich", "erwarte", "dich", "sehnsüchtig", "." 
- ]), -]) -def test_de_tokenizer(text: str, expected_tokens: List[str]) -> None: +@pytest.mark.parametrize( + "text,expected_tokens", + [ + ( + [ + "Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein.", + [ + "Ich", + "habe", + "keine", + "persönlichen", + "Meinungen", + ",", + "Emotionen", + "oder", + "Bewusstsein", + ".", + ], + ] + ), + ( + "Mein Freund. Willkommen in den Karpaten. Ich erwarte dich sehnsüchtig.\n", + [ + "Mein", + "Freund", + ".", + "Willkommen", + "in", + "den", + "Karpaten", + ".", + "Ich", + "erwarte", + "dich", + "sehnsüchtig", + ".", + ], + ), + ], +) +def test_de_tokenizer(text: str, expected_tokens: list[str]) -> None: tokenizer = DeTokenizer() # type: ignore[reportGeneralTypeIssues] assert tokenizer.tokenize(text) == expected_tokens diff --git a/tests/metrics/de/test_translation.py b/tests/metrics/de/test_translation.py index 948dd698..2d285809 100644 --- a/tests/metrics/de/test_translation.py +++ b/tests/metrics/de/test_translation.py @@ -1,5 +1,3 @@ -from typing import List - import pytest from langcheck.metrics.de import Translate @@ -8,31 +6,45 @@ @pytest.mark.parametrize( "de_text,en_text", [ - ([ - "Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein.", # noqa: E501 - "I have no personal opinions, emotions or consciousness." - ]), - ([ - "Mein Freund. Willkommen in den Karpaten.", - "My friend, welcome to the Carpathians." - ]), - ([ - "Tokio ist die Hauptstadt von Japan.", - "Tokyo is the capital of Japan." - ]), - ]) + ( + [ + "Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein.", # noqa: E501 + "I have no personal opinions, emotions or consciousness.", + ] + ), + ( + [ + "Mein Freund. Willkommen in den Karpaten.", + "My friend, welcome to the Carpathians.", + ] + ), + ( + [ + "Tokio ist die Hauptstadt von Japan.", + "Tokyo is the capital of Japan.", + ] + ), + ], +) def test_translate_de_en(de_text: str, en_text: str) -> None: translation = Translate("Helsinki-NLP/opus-mt-de-en") assert translation(de_text) == en_text -@pytest.mark.parametrize("en_text,de_text", [ - ("I have no personal opinions, emotions or consciousness.", - "Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein."), - ("My Friend. Welcome to the Carpathians. I am anxiously expecting you.", - "Willkommen bei den Karpaten, ich erwarte Sie."), - ("Tokyo is the capital of Japan.", "Tokio ist die Hauptstadt Japans."), -]) -def test_translate_en_de(en_text: str, de_text: List[str]) -> None: +@pytest.mark.parametrize( + "en_text,de_text", + [ + ( + "I have no personal opinions, emotions or consciousness.", + "Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein.", + ), + ( + "My Friend. Welcome to the Carpathians. 
I am anxiously expecting you.", + "Willkommen bei den Karpaten, ich erwarte Sie.", + ), + ("Tokyo is the capital of Japan.", "Tokio ist die Hauptstadt Japans."), + ], +) +def test_translate_en_de(en_text: str, de_text: list[str]) -> None: translation = Translate("Helsinki-NLP/opus-mt-en-de") assert translation(en_text) == de_text diff --git a/tests/metrics/ja/test_reference_based_text_quality.py b/tests/metrics/ja/test_reference_based_text_quality.py index f1b5df84..ce781bad 100644 --- a/tests/metrics/ja/test_reference_based_text_quality.py +++ b/tests/metrics/ja/test_reference_based_text_quality.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import os -from typing import Callable, Optional +from collections.abc import Callable from unittest.mock import Mock, patch import pytest @@ -95,9 +97,9 @@ def test_rouge_identical( generated_outputs: str, reference_outputs: str, rouge_function: Callable[ - [str, str, Optional[_JapaneseTokenizer]], MetricValue[float] + [str, str, _JapaneseTokenizer | None], MetricValue[float] ], - tokenizer: Optional[_JapaneseTokenizer], + tokenizer: _JapaneseTokenizer | None, ) -> None: # All ROUGE scores are 1 if the generated and reference outputs are # identical @@ -125,7 +127,7 @@ def test_rouge_no_overlap( generated_outputs: str, reference_outputs: str, rouge_function: Callable[[str, str], MetricValue[float]], - tokenizer: Optional[_JapaneseTokenizer], + tokenizer: _JapaneseTokenizer | None, ) -> None: # All ROUGE scores are 0 if the generated and reference outputs have no # overlapping words @@ -153,7 +155,7 @@ def test_rouge_some_overlap( generated_outputs: str, reference_outputs: str, rouge_function: Callable[[str, str], MetricValue[float]], - tokenizer: Optional[_JapaneseTokenizer], + tokenizer: _JapaneseTokenizer | None, ) -> None: expected_value = { "rouge1": [0.823529411764706], diff --git a/tests/metrics/ja/test_tokenizers.py b/tests/metrics/ja/test_tokenizers.py index 78172df0..ebaa19a7 100644 --- a/tests/metrics/ja/test_tokenizers.py +++ b/tests/metrics/ja/test_tokenizers.py @@ -1,5 +1,4 @@ import pkgutil -from typing import List import pytest @@ -7,22 +6,36 @@ from langcheck.metrics.ja._tokenizers import _JapaneseTokenizer -@pytest.mark.parametrize("text,expected_tokens", [ - (["頭が赤い魚を食べる猫", ["頭", "が", "赤い", "魚", "を", "食べる", "猫"]]), - ("猫が、マットの上に座った。", ["猫", "が", "マット", "の", "上", "に", "座っ", "た"]), -]) +@pytest.mark.parametrize( + "text,expected_tokens", + [ + ( + [ + "頭が赤い魚を食べる猫", + ["頭", "が", "赤い", "魚", "を", "食べる", "猫"], + ] + ), + ( + "猫が、マットの上に座った。", + ["猫", "が", "マット", "の", "上", "に", "座っ", "た"], + ), + ], +) @pytest.mark.parametrize( "tokenizer", - [JanomeTokenizer, - pytest.param(MeCabTokenizer, marks=pytest.mark.optional)]) -def test_janome_tokenizer(text: str, expected_tokens: List[str], - tokenizer: _JapaneseTokenizer) -> None: + [JanomeTokenizer, pytest.param(MeCabTokenizer, marks=pytest.mark.optional)], +) +def test_janome_tokenizer( + text: str, expected_tokens: list[str], tokenizer: _JapaneseTokenizer +) -> None: tokenizer = tokenizer() # type: ignore[reportGeneralTypeIssues] assert tokenizer.tokenize(text) == expected_tokens -@pytest.mark.skipif(pkgutil.find_loader("MeCab") is not None, - reason="MeCab has already been installed.") +@pytest.mark.skipif( + pkgutil.find_loader("MeCab") is not None, + reason="MeCab has already been installed.", +) def test_handle_mecab_not_found() -> None: with pytest.raises(ModuleNotFoundError): MeCabTokenizer() diff --git a/tests/metrics/test_metric_value.py b/tests/metrics/test_metric_value.py 
index f44dfe06..983ef98f 100644 --- a/tests/metrics/test_metric_value.py +++ b/tests/metrics/test_metric_value.py @@ -1,4 +1,4 @@ -from typing import Optional +from __future__ import annotations import pandas as pd import pytest @@ -69,7 +69,7 @@ def test_optional_metric_values(): }, required_params=["generated_outputs"], ) - metric_value: MetricValue[Optional[float]] = MetricValue( + metric_value: MetricValue[float | None] = MetricValue( metric_name="test", metric_inputs=metric_inputs, explanations=None, diff --git a/tests/metrics/zh/test_reference_based_text_quality.py b/tests/metrics/zh/test_reference_based_text_quality.py index 05c4776f..460a8791 100644 --- a/tests/metrics/zh/test_reference_based_text_quality.py +++ b/tests/metrics/zh/test_reference_based_text_quality.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import os -from typing import Callable, Optional +from collections.abc import Callable from unittest.mock import Mock, patch import pytest @@ -23,81 +25,114 @@ ################################################################################ # Tests ################################################################################ -parametrize_rouge_function = pytest.mark.parametrize("rouge_function", - [rouge1, rouge2, rougeL]) -parametrize_tokenizer = pytest.mark.parametrize("tokenizer", - [None, HanLPTokenizer]) +parametrize_rouge_function = pytest.mark.parametrize( + "rouge_function", [rouge1, rouge2, rougeL] +) +parametrize_tokenizer = pytest.mark.parametrize( + "tokenizer", [None, HanLPTokenizer] +) -@pytest.mark.parametrize("generated_outputs,reference_outputs", - [("宇宙的终极答案是什么?", "宇宙的终极答案是什么。"), - (["宇宙的终极答案是什么。"], ["宇宙的终极答案是什么?"])]) +@pytest.mark.parametrize( + "generated_outputs,reference_outputs", + [ + ("宇宙的终极答案是什么?", "宇宙的终极答案是什么。"), + (["宇宙的终极答案是什么。"], ["宇宙的终极答案是什么?"]), + ], +) @parametrize_rouge_function @parametrize_tokenizer -def test_rouge_identical(generated_outputs: str, reference_outputs: str, - rouge_function: Callable[ - [str, str, Optional[_ChineseTokenizer]], - MetricValue[float]], - tokenizer: Optional[_ChineseTokenizer]) -> None: +def test_rouge_identical( + generated_outputs: str, + reference_outputs: str, + rouge_function: Callable[ + [str, str, _ChineseTokenizer | None], MetricValue[float] + ], + tokenizer: _ChineseTokenizer | None, +) -> None: # All ROUGE scores are 1 if the generated and reference outputs are # identical actual_metric_value = rouge_function( generated_outputs, reference_outputs, tokenizer=tokenizer() # type: ignore[reportGeneralTypeIssues] - if tokenizer else None) - assert actual_metric_value.metric_values == [1.] 
+ if tokenizer + else None, + ) + assert actual_metric_value.metric_values == [1.0] assert actual_metric_value.language == "zh" -@pytest.mark.parametrize("generated_outputs,reference_outputs", - [("这样的姑娘是受不了的。", "您到底有什么事?"), - (["这样的姑娘是受不了的。"], ["您到底有什么事?"])]) +@pytest.mark.parametrize( + "generated_outputs,reference_outputs", + [ + ("这样的姑娘是受不了的。", "您到底有什么事?"), + (["这样的姑娘是受不了的。"], ["您到底有什么事?"]), + ], +) @parametrize_rouge_function @parametrize_tokenizer -def test_rouge_no_overlap(generated_outputs: str, reference_outputs: str, - rouge_function: Callable[[str, str], - MetricValue[float]], - tokenizer: Optional[_ChineseTokenizer]) -> None: +def test_rouge_no_overlap( + generated_outputs: str, + reference_outputs: str, + rouge_function: Callable[[str, str], MetricValue[float]], + tokenizer: _ChineseTokenizer | None, +) -> None: # All ROUGE scores are 0 if the generated and reference outputs have no # overlapping words actual_metric_value = rouge_function( generated_outputs, reference_outputs, tokenizer=tokenizer() # type: ignore[reportGeneralTypeIssues] - if tokenizer else None) - assert actual_metric_value.metric_values == [0.] + if tokenizer + else None, + ) + assert actual_metric_value.metric_values == [0.0] assert actual_metric_value.language == "zh" -@pytest.mark.parametrize("generated_outputs,reference_outputs", - [("床前明月光,下一句是什么?", "床前明月光的下一句是什么?"), - (["床前明月光,下一句是什么?"], ["床前明月光的下一句是什么?"])]) +@pytest.mark.parametrize( + "generated_outputs,reference_outputs", + [ + ("床前明月光,下一句是什么?", "床前明月光的下一句是什么?"), + (["床前明月光,下一句是什么?"], ["床前明月光的下一句是什么?"]), + ], +) @parametrize_rouge_function @parametrize_tokenizer -def test_rouge_some_overlap(generated_outputs: str, reference_outputs: str, - rouge_function: Callable[[str, str], - MetricValue[float]], - tokenizer: Optional[_ChineseTokenizer]) -> None: +def test_rouge_some_overlap( + generated_outputs: str, + reference_outputs: str, + rouge_function: Callable[[str, str], MetricValue[float]], + tokenizer: _ChineseTokenizer | None, +) -> None: expected_value = { "rouge1": [0.941176], "rouge2": [0.8], - "rougeL": [0.941176] + "rougeL": [0.941176], } # The ROUGE-2 score is lower than the ROUGE-1 and ROUGE-L scores actual_metric_value = rouge_function( generated_outputs, reference_outputs, tokenizer=tokenizer() # type: ignore[reportGeneralTypeIssues] - if tokenizer else None) - is_close(actual_metric_value.metric_values, - expected_value[rouge_function.__name__]) + if tokenizer + else None, + ) + is_close( + actual_metric_value.metric_values, + expected_value[rouge_function.__name__], + ) assert actual_metric_value.language == "zh" -@pytest.mark.parametrize("generated_outputs,reference_outputs", - [("那里有一本三体小说。", "那里有一本三体小说。"), - (["那里有一本三体小说。"], ["那里有一本三体小说。"])]) +@pytest.mark.parametrize( + "generated_outputs,reference_outputs", + [ + ("那里有一本三体小说。", "那里有一本三体小说。"), + (["那里有一本三体小说。"], ["那里有一本三体小说。"]), + ], +) def test_semantic_similarity_identical(generated_outputs, reference_outputs): metric_value = semantic_similarity(generated_outputs, reference_outputs) assert 0.99 <= metric_value <= 1 @@ -105,29 +140,44 @@ def test_semantic_similarity_identical(generated_outputs, reference_outputs): @pytest.mark.parametrize( "generated_outputs,reference_outputs", - [("php是世界上最好的语言,学计算机要从娃娃抓起。", "在石家庄,有一支摇滚乐队,他们创作了很多音乐。"), - (["php是世界上最好的语言,学计算机要从娃娃抓起。"], ["在石家庄,有一支摇滚乐队,他们创作了很多音乐。"])]) + [ + ( + "php是世界上最好的语言,学计算机要从娃娃抓起。", + "在石家庄,有一支摇滚乐队,他们创作了很多音乐。", + ), + ( + ["php是世界上最好的语言,学计算机要从娃娃抓起。"], + ["在石家庄,有一支摇滚乐队,他们创作了很多音乐。"], + ), + ], +) def 
test_semantic_similarity_not_similar(generated_outputs, reference_outputs): metric_value = semantic_similarity(generated_outputs, reference_outputs) assert 0.0 <= metric_value <= 0.5 -@pytest.mark.parametrize("generated_outputs,reference_outputs", - [("学习中文很快乐。", "学习中文很快乐。"), - (["学习中文很快乐。"], ["学习中文很快乐。"])]) +@pytest.mark.parametrize( + "generated_outputs,reference_outputs", + [ + ("学习中文很快乐。", "学习中文很快乐。"), + (["学习中文很快乐。"], ["学习中文很快乐。"]), + ], +) def test_semantic_similarity_openai(generated_outputs, reference_outputs): mock_embedding_response = Mock(spec=CreateEmbeddingResponse) mock_embedding_response.data = [Mock(embedding=[0.1, 0.2, 0.3])] # Calling the openai.Embedding.create method requires an OpenAI API key, so # we mock the return value instead - with patch("openai.resources.Embeddings.create", - Mock(return_value=mock_embedding_response)): + with patch( + "openai.resources.Embeddings.create", + Mock(return_value=mock_embedding_response), + ): # Set the necessary env vars for the 'openai' embedding model type os.environ["OPENAI_API_KEY"] = "dummy_key" openai_client = OpenAIEvalClient() - metric_value = semantic_similarity(generated_outputs, - reference_outputs, - eval_model=openai_client) + metric_value = semantic_similarity( + generated_outputs, reference_outputs, eval_model=openai_client + ) # Since the mock embeddings are the same for the generated and reference # outputs, the semantic similarity should be 1. assert 0.99 <= metric_value <= 1 @@ -137,10 +187,11 @@ def test_semantic_similarity_openai(generated_outputs, reference_outputs): os.environ["OPENAI_API_VERSION"] = "dummy_version" os.environ["AZURE_OPENAI_ENDPOINT"] = "dummy_endpoint" azure_openai_client = AzureOpenAIEvalClient( - embedding_model_name="foo bar") - metric_value = semantic_similarity(generated_outputs, - reference_outputs, - eval_model=azure_openai_client) + embedding_model_name="foo bar" + ) + metric_value = semantic_similarity( + generated_outputs, reference_outputs, eval_model=azure_openai_client + ) # Since the mock embeddings are the same for the generated and reference # outputs, the semantic similarity should be 1. 
assert 0.99 <= metric_value <= 1 diff --git a/tests/metrics/zh/test_tokenizers.py b/tests/metrics/zh/test_tokenizers.py index 4e5d309a..79b41e65 100644 --- a/tests/metrics/zh/test_tokenizers.py +++ b/tests/metrics/zh/test_tokenizers.py @@ -1,18 +1,36 @@ -from typing import List - import pytest from langcheck.metrics.zh import HanLPTokenizer from langcheck.metrics.zh._tokenizers import _ChineseTokenizer -@pytest.mark.parametrize("text,expected_tokens", [ - ("吃葡萄不吐葡萄皮。不吃葡萄到吐葡萄皮。", - ["吃", "葡萄", "不", "吐", "葡萄", "皮", "不", "吃", "葡萄", "到", "吐", "葡萄", "皮"]), - ("北京是中国的首都", ["北京", "是", "中国", "的", "首都"]), -]) +@pytest.mark.parametrize( + "text,expected_tokens", + [ + ( + "吃葡萄不吐葡萄皮。不吃葡萄到吐葡萄皮。", + [ + "吃", + "葡萄", + "不", + "吐", + "葡萄", + "皮", + "不", + "吃", + "葡萄", + "到", + "吐", + "葡萄", + "皮", + ], + ), + ("北京是中国的首都", ["北京", "是", "中国", "的", "首都"]), + ], +) @pytest.mark.parametrize("tokenizer", [HanLPTokenizer]) -def test_hanlp_tokenizer(text: str, expected_tokens: List[str], - tokenizer: _ChineseTokenizer) -> None: +def test_hanlp_tokenizer( + text: str, expected_tokens: list[str], tokenizer: _ChineseTokenizer +) -> None: tokenizer = tokenizer() # type: ignore[reportGeneralTypeIssues] assert tokenizer.tokenize(text) == expected_tokens diff --git a/tests/utils.py b/tests/utils.py index 7f9a375e..70759b87 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,7 +1,7 @@ from __future__ import annotations import math -from typing import Iterable, List +from collections.abc import Iterable from langcheck.metrics.eval_clients import EvalClient @@ -20,20 +20,19 @@ def __init__(self, evaluation_result: str | None = None) -> None: self.evaluation_result = evaluation_result def get_text_responses( - self, - prompts: Iterable[str], - *, - tqdm_description: str | None = None) -> list[str | None]: + self, prompts: Iterable[str], *, tqdm_description: str | None = None + ) -> list[str | None]: return [self.evaluation_result] * len(list(prompts)) def get_float_score( - self, - metric_name: str, - language: str, - unstructured_assessment_result: list[str | None], - score_map: dict[str, float], - *, - tqdm_description: str | None = None) -> list[float | None]: + self, + metric_name: str, + language: str, + unstructured_assessment_result: list[str | None], + score_map: dict[str, float], + *, + tqdm_description: str | None = None, + ) -> list[float | None]: eval_results = [] # Assume that the evaluation result is actually structured and it can be # put into the score_map directly @@ -51,13 +50,13 @@ def get_float_score( ################################################################################ -def is_close(a: List, b: List) -> bool: +def is_close(a: list, b: list) -> bool: """Returns True if two lists of numbers are element-wise close.""" assert len(a) == len(b) return all(math.isclose(x, y) for x, y in zip(a, b)) -def lists_are_equal(a: List[str] | str, b: List[str] | str) -> bool: +def lists_are_equal(a: list[str] | str, b: list[str] | str) -> bool: """Returns True if two lists of strings are equal. If either argument is a single string, it's automatically converted to a list. """
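The body of lists_are_equal sits outside this hunk; below is a minimal sketch of the behaviour its docstring describes, written as an assumption rather than the actual implementation:

from __future__ import annotations

def lists_are_equal(a: list[str] | str, b: list[str] | str) -> bool:
    # Sketch only: normalize single strings to one-element lists, then compare.
    if isinstance(a, str):
        a = [a]
    if isinstance(b, str):
        b = [b]
    return a == b

# Under this sketch, lists_are_equal("foo", ["foo"]) and
# lists_are_equal(["a", "b"], ["a", "b"]) both return True.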