Merge pull request #163 from citadel-ai/drop-38
Drop Python 3.8 support
liwii authored Oct 28, 2024
2 parents 1c39619 + 0a79fc7 commit 4be82f5
Showing 54 changed files with 782 additions and 564 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/pip_install_matrix.yml
@@ -17,17 +17,15 @@ jobs:
fail-fast: false # Continue running jobs even if another fails
matrix:
# We specify Python versions as strings so 3.10 doesn't become 3.1
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
os: [ubuntu-latest, windows-latest, macos-14]
# "en", "de", and "" are equivalent
# "all" is tested by pytest.yml
language: ["en", "ja", "zh"]

exclude:
# GitHub Actions doesn't support Python 3.8 and 3.9 on M1 macOS yet:
# GitHub Actions doesn't support Python 3.9 on M1 macOS yet:
# https://github.com/actions/setup-python/issues/696
- python-version: "3.8"
os: macos-14
- python-version: "3.9"
os: macos-14
# TODO: Figure out how to install MeCab on Windows to install
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -7,7 +7,7 @@ version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
python: "3.9"

sphinx:
configuration: docs/conf.py
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -14,7 +14,7 @@ pip install --upgrade pip
pip install langcheck[all]
```

LangCheck works with Python 3.8 or higher.
LangCheck works with Python 3.9 or higher.

:::{note}
Model files are lazily downloaded the first time you run a metric function. For example, the first time you run the ``langcheck.metrics.sentiment()`` function, LangCheck will automatically download the Twitter-roBERTa-base model.
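
The note above mentions that model files are downloaded lazily on first use. A minimal sketch of what that looks like in practice, assuming `langcheck[all]` is installed on Python 3.9 or higher (the exact model files fetched may vary by LangCheck version):

```python
# Minimal sketch: the first metric call triggers the model download.
import langcheck

generated_outputs = [
    "Black cat the",
    "The black cat is sitting",
    "The big black cat is sitting on the fence",
]

# On first use, langcheck.metrics.sentiment() downloads its underlying model
# (the Twitter-roBERTa-base model mentioned in the note above), then returns
# one sentiment score per output.
sentiment_values = langcheck.metrics.sentiment(generated_outputs)
print(sentiment_values)
```
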
2 changes: 1 addition & 1 deletion docs/tutorial_langcheckchat.md
@@ -69,7 +69,7 @@ Here’s the response from the LLM:
>
> pip install langcheck
>
> Please note that LangCheck requires Python 3.8 or higher to work properly.
> Please note that LangCheck requires Python 3.9 or higher to work properly.
We can also see the sources that were retrieved from the index. By default, the top 2 most relevant source nodes are returned, which is what we see in `response.source_nodes`.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -29,11 +29,11 @@ dependencies = [
'tomli; python_version < "3.11"',
'tokenizers >= 0.13.2; python_version >= "3.11"', # See https://github.com/citadel-ai/langcheck/pull/45
'torch >= 2',
'transformers >= 4.6, < 4.46',
'transformers >= 4.6',
'tabulate >= 0.9.0', # For model manager print table
'omegaconf >= 2.3.0' # For model manager print table
]
requires-python = ">=3.8"
requires-python = ">=3.9"

[project.optional-dependencies]
de = [] # No extra dependencies needed for German
4 changes: 1 addition & 3 deletions src/langcheck/metrics/de/_tokenizers.py
@@ -1,5 +1,3 @@
from typing import List

from nltk.stem.cistem import Cistem
from nltk.tokenize import word_tokenize
from rouge_score.tokenizers import Tokenizer as BaseTokenizer
@@ -16,7 +14,7 @@ def __init__(self, stemmer=False):
if stemmer:
self.stemmer = Cistem()

def tokenize(self, text: str) -> List[str]:
def tokenize(self, text: str) -> list[str]:
if self.stemmer:
# use only the stem part of the word
text, _ = self.stemmer.segment(text)
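
The `List[str]` → `list[str]` rewrite above is exactly what the version bump enables: PEP 585 built-in generics require Python 3.9 at runtime, so the `typing.List` import can be dropped once 3.8 support goes away. A standalone illustration (not repository code):

```python
# PEP 585: on Python 3.9+, built-in collections are subscriptable directly,
# so typing.List / typing.Dict imports are no longer needed for annotations.

def tokenize(text: str) -> list[str]:
    """Split text on whitespace (a toy stand-in for the real tokenizer)."""
    return text.split()


counts: dict[str, int] = {}
for token in tokenize("the black cat"):
    counts[token] = counts.get(token, 0) + 1

print(counts)  # {'the': 1, 'black': 1, 'cat': 1}
```
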
30 changes: 14 additions & 16 deletions src/langcheck/metrics/de/reference_based_text_quality.py
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import List, Optional

from rouge_score import rouge_scorer

from langcheck.metrics.de._tokenizers import DeTokenizer
@@ -19,9 +17,9 @@


def semantic_similarity(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
) -> MetricValue[float]:
"""Calculates the semantic similarities between the generated outputs and
@@ -85,9 +83,9 @@ def semantic_similarity(


def rouge1(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the F1 metrics of the ROUGE-1 scores between the generated
outputs and the reference outputs. It evaluates the overlap of unigrams
@@ -127,9 +125,9 @@ def rouge1(


def rouge2(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the F1 metrics of the ROUGE-2 scores between the generated
outputs and the reference outputs. It evaluates the overlap of bigrams
@@ -169,9 +167,9 @@ def rouge2(


def rougeL(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the F1 metrics of the ROUGE-L scores between the generated
outputs and the reference outputs. It evaluates the longest common
@@ -221,8 +219,8 @@ def rougeL(


def _rouge(
generated_outputs: List[str], reference_outputs: List[str], rouge_type: str
) -> List[float]:
generated_outputs: list[str], reference_outputs: list[str], rouge_type: str
) -> list[float]:
"""Helper function for computing the rouge1, rouge2, and rougeL metrics.
This uses Google Research's implementation of ROUGE:
https://github.com/google-research/google-research/tree/master/rouge
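
For context on the German reference-based metrics whose signatures change above, here is a usage sketch based only on the signatures shown in this diff; the German inputs are invented examples, not repository data:

```python
# Sketch of calling the German ROUGE metrics after the annotation change.
# Both arguments accept either a single string or a list[str], as the new
# `list[str] | str` signatures indicate.
from langcheck.metrics.de.reference_based_text_quality import rouge1, rougeL

generated_outputs = ["Die Katze sitzt auf der Matte."]
reference_outputs = ["Eine Katze sitzt auf der Matte."]

rouge1_scores = rouge1(generated_outputs, reference_outputs)
rougel_scores = rougeL(generated_outputs, reference_outputs)

# Both return MetricValue[float] objects holding one F1 score per output pair.
print(rouge1_scores)
print(rougel_scores)
```
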
40 changes: 19 additions & 21 deletions src/langcheck/metrics/de/reference_free_text_quality.py
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import List, Optional

from langcheck.metrics.de._translation import Translate
from langcheck.metrics.de.reference_based_text_quality import (
semantic_similarity,
@@ -30,11 +28,11 @@


def sentiment(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
local_overflow_strategy: str = "truncate",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the sentiment scores of generated outputs. This metric takes
on float values between [0, 1], where 0 is negative sentiment and 1 is
positive sentiment. (NOTE: when using an EvalClient, the sentiment scores
@@ -112,8 +110,8 @@ def sentiment(


def _sentiment_local(
generated_outputs: List[str], overflow_strategy: str
) -> List[Optional[float]]:
generated_outputs: list[str], overflow_strategy: str
) -> list[float | None]:
"""Calculates the sentiment scores of generated outputs using the
twitter-xlm-roberta-base-sentiment-finetunned model. This metric takes on
float values between [0, 1], where 0 is negative sentiment and 1 is positive
@@ -142,10 +140,10 @@ def _sentiment_local(


def fluency(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the fluency scores of generated outputs. This metric takes on
float values between [0, 1], where 0 is low fluency and 1 is high fluency.
(NOTE: when using an EvalClient, the fluency scores are either 0.0
@@ -220,11 +218,11 @@ def fluency(


def toxicity(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
local_overflow_strategy: str = "truncate",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the toxicity scores of generated outputs. This metric takes on
float values between [0, 1], where 0 is low toxicity and 1 is high toxicity.
(NOTE: when using an EvalClient, the toxicity scores are in steps of
@@ -301,8 +299,8 @@ def toxicity(


def _toxicity_local(
generated_outputs: List[str], overflow_strategy: str
) -> List[Optional[float]]:
generated_outputs: list[str], overflow_strategy: str
) -> list[float | None]:
"""Calculates the toxicity scores of generated outputs using the Detoxify
model. This metric takes on float values between [0, 1], where 0 is low
toxicity and 1 is high toxicity.
@@ -324,8 +322,8 @@ def _toxicity_local(


def flesch_kincaid_grade(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the readability of generated outputs using the Flesch-Kincaid.
It is the same as in English (but higher):
@@ -338,8 +336,8 @@ def flesch_kincaid_grade(


def flesch_reading_ease(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the readability of generated outputs using the Flesch Reading
Ease Score. This metric takes on float values between (-∞, 121.22], but
@@ -387,8 +385,8 @@ def flesch_reading_ease(


def ai_disclaimer_similarity(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
ai_disclaimer_phrase: str = (
"Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein."
),
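
One detail worth noting about the `Optional[float]` → `float | None` rewrites in this file: PEP 604 union syntax is only a runtime feature from Python 3.10, but because the module keeps `from __future__ import annotations`, annotations are never evaluated and the new spelling is valid on Python 3.9. A small sketch of that distinction (not repository code):

```python
# With the future import, annotations are stored as strings and not evaluated,
# so `float | None` in an annotation is accepted on Python 3.9 even though
# `float | None` as a plain runtime expression needs Python 3.10+.
from __future__ import annotations


def first_score(scores: list[float | None]) -> float | None:
    """Return the first score that is not None, or None if all are missing."""
    for score in scores:
        if score is not None:
            return score
    return None


print(first_score([None, 0.25, 0.9]))  # 0.25
```
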
14 changes: 6 additions & 8 deletions src/langcheck/metrics/de/source_based_text_quality.py
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import List, Optional

from langcheck.metrics.de._translation import Translate
from langcheck.metrics.en.source_based_text_quality import (
factual_consistency as en_factual_consistency,
@@ -20,11 +18,11 @@


def factual_consistency(
generated_outputs: List[str] | str,
sources: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
sources: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the factual consistency between the generated outputs and
the sources. This metric takes on float values between [0, 1], where 0
means that the output is not at all consistent with the source text, and 1
@@ -123,8 +121,8 @@ def factual_consistency(


def context_relevance(
sources: List[str] | str, prompts: List[str] | str, eval_model: EvalClient
) -> MetricValue[Optional[float]]:
sources: list[str] | str, prompts: list[str] | str, eval_model: EvalClient
) -> MetricValue[float | None]:
"""Calculates the relevance of the sources to the prompts. This metric takes
on float values between [0, 1], where 0 means that the source text is not at
all relevant to the prompt, and 1 means that the source text is fully
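
A usage sketch for the factual consistency metric shown above, based on the signature in this diff; the German inputs are invented examples, and the default local evaluation model is assumed:

```python
# Sketch of the German factual consistency metric with the default local model.
# Scores fall in [0, 1]; 1 means the output is fully consistent with the source.
from langcheck.metrics.de.source_based_text_quality import factual_consistency

sources = ["Die Hauptstadt von Frankreich ist Paris."]
generated_outputs = ["Paris ist die Hauptstadt von Frankreich."]

# eval_model defaults to "local"; an EvalClient can be passed instead for
# LLM-based evaluation, per the `str | EvalClient` annotation above.
consistency = factual_consistency(generated_outputs, sources)
print(consistency)  # MetricValue[float | None], one score per output
```
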
24 changes: 12 additions & 12 deletions src/langcheck/metrics/en/pairwise_text_quality.py
@@ -2,7 +2,7 @@

import math
import random
from typing import List, Optional, cast
from typing import cast

from langcheck.metrics._pairwise_text_quality_utils import (
compute_pairwise_comparison_metric_values_with_consistency,
@@ -16,13 +16,13 @@


def simulated_annotators(
prompt_params: List[dict[str, str | None]],
prompt_params: list[dict[str, str | None]],
eval_model: EvalClient,
preference_data_path: str = "en/confidence_estimating/preference_data_examples.jsonl",
k: int = 5,
n: int = 5,
seed: int | None = None,
) -> List[float | None]:
) -> list[float | None]:
"""Compute a confidence score for the pairwise comparison metric based on
the method Simulated Annotators proposed in the paper "Trust or Escalate:
LLM Judges with Provable Guarantees for Human Agreement"
@@ -73,7 +73,7 @@ def simulated_annotators(
prompts.append(prompt_template.render(prompt_param))

# Get the response and top five logprobs of the first token
responses: List[Optional[TextResponseWithLogProbs]] = (
responses: list[TextResponseWithLogProbs | None] = (
eval_model.get_text_responses_with_log_likelihood(
prompts, top_logprobs=5
)
@@ -83,7 +83,7 @@
if response:
response = cast(TextResponseWithLogProbs, response)
top_five_first_token_logprobs = cast(
List[TokenLogProb], response["response_logprobs"][0]
list[TokenLogProb], response["response_logprobs"][0]
)
# Extract logprobs for tokens 'A' and 'B'
logprobs_dict = {
@@ -110,20 +110,20 @@


def pairwise_comparison(
generated_outputs_a: List[str] | str,
generated_outputs_b: List[str] | str,
prompts: List[str] | str,
sources_a: Optional[List[str] | str] = None,
sources_b: Optional[List[str] | str] = None,
reference_outputs: Optional[List[str] | str] = None,
generated_outputs_a: list[str] | str,
generated_outputs_b: list[str] | str,
prompts: list[str] | str,
sources_a: list[str] | str | None = None,
sources_b: list[str] | str | None = None,
reference_outputs: list[str] | str | None = None,
enforce_consistency: bool = True,
calculated_confidence: bool = False,
preference_data_path: str = "en/confidence_estimating/preference_data_examples.jsonl",
k: int = 5,
n: int = 5,
seed: int | None = None,
eval_model: EvalClient | None = None,
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the pairwise comparison metric. This metric takes on float
values of either 0.0 (Response A is better), 0.5 (Tie), or 1.0 (Response B
is better). The score may also be `None` if it could not be computed.
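
Finally, a sketch of calling `pairwise_comparison` with its updated signature. The `OpenAIEvalClient` import below is an assumption for illustration (check `langcheck.metrics.eval_clients` in your installed version); any `EvalClient` implementation should slot in the same way:

```python
# Sketch only: the EvalClient class name and import path are assumed.
from langcheck.metrics.en.pairwise_text_quality import pairwise_comparison
from langcheck.metrics.eval_clients import OpenAIEvalClient  # assumed name

eval_client = OpenAIEvalClient()  # assumed to read the API key from the environment

scores = pairwise_comparison(
    generated_outputs_a=["Tokyo is the capital of Japan."],
    generated_outputs_b=["I believe the capital of Japan is Kyoto."],
    prompts=["What is the capital of Japan?"],
    eval_model=eval_client,
    enforce_consistency=True,  # default; uses the consistency helper imported in the diff above
)
# Each score is 0.0 (Response A is better), 0.5 (Tie), 1.0 (Response B is
# better), or None if a score could not be computed.
print(scores)
```
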