Drop Python 3.8 support #163

Merged: 5 commits, Oct 28, 2024
6 changes: 2 additions & 4 deletions .github/workflows/pip_install_matrix.yml
@@ -17,17 +17,15 @@ jobs:
fail-fast: false # Continue running jobs even if another fails
matrix:
# We specify Python versions as strings so 3.10 doesn't become 3.1
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
os: [ubuntu-latest, windows-latest, macos-14]
# "en", "de", and "" are equivalent
# "all" is tested by pytest.yml
language: ["en", "ja", "zh"]

exclude:
# GitHub Actions doesn't support Python 3.8 and 3.9 on M1 macOS yet:
# GitHub Actions doesn't support Python 3.9 on M1 macOS yet:
# https://github.com/actions/setup-python/issues/696
- python-version: "3.8"
os: macos-14
- python-version: "3.9"
os: macos-14
# TODO: Figure out how to install MeCab on Windows to install
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -7,7 +7,7 @@ version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
python: "3.9"

sphinx:
configuration: docs/conf.py
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -14,7 +14,7 @@ pip install --upgrade pip
pip install langcheck[all]
```

LangCheck works with Python 3.8 or higher.
LangCheck works with Python 3.9 or higher.

:::{note}
Model files are lazily downloaded the first time you run a metric function. For example, the first time you run the ``langcheck.metrics.sentiment()`` function, LangCheck will automatically download the Twitter-roBERTa-base model.
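As a quick illustration of the lazy model download described in the note above, here is a minimal sketch. It assumes LangCheck is installed via `pip install langcheck[all]` on Python 3.9+ and that the English `langcheck.metrics.sentiment()` takes the generated outputs as its first argument, mirroring the German variant shown later in this diff.

```python
# Minimal sketch: the first call downloads the sentiment model; later calls reuse the cache.
import langcheck

outputs = ["The setup was quick and painless.", "The install kept failing."]

# First invocation triggers the Twitter-roBERTa-base download; subsequent runs stay local.
result = langcheck.metrics.sentiment(outputs)
print(result)  # MetricValue with one score in [0, 1] per output
```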
2 changes: 1 addition & 1 deletion docs/tutorial_langcheckchat.md
@@ -69,7 +69,7 @@ Here’s the response from the LLM:
>
> pip install langcheck
>
> Please note that LangCheck requires Python 3.8 or higher to work properly.
> Please note that LangCheck requires Python 3.9 or higher to work properly.

We can also see the sources that were retrieved from the index. By default, the top 2 most relevant source nodes are returned, which is what we see in `response.source_nodes`.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -29,11 +29,11 @@ dependencies = [
'tomli; python_version < "3.11"',
'tokenizers >= 0.13.2; python_version >= "3.11"', # See https://github.com/citadel-ai/langcheck/pull/45
'torch >= 2',
'transformers >= 4.6, < 4.46',
'transformers >= 4.6',
'tabulate >= 0.9.0', # For model manager print table
'omegaconf >= 2.3.0' # For model manager print table
]
requires-python = ">=3.8"
requires-python = ">=3.9"

[project.optional-dependencies]
de = [] # No extra dependencies needed for German
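For context on the `requires-python = ">=3.9"` bump: once this metadata is published, pip running under Python 3.8 should skip the new release and resolve to the newest version that still allows 3.8. A small sketch of how that marker is evaluated, using the third-party `packaging` library that pip builds on (version strings are illustrative):

```python
# Requires the `packaging` package (pip install packaging).
from packaging.specifiers import SpecifierSet

requires_python = SpecifierSet(">=3.9")
print("3.8.18" in requires_python)  # False -> pip skips this release on Python 3.8
print("3.12.4" in requires_python)  # True
```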
4 changes: 1 addition & 3 deletions src/langcheck/metrics/de/_tokenizers.py
@@ -1,5 +1,3 @@
from typing import List

from nltk.stem.cistem import Cistem
from nltk.tokenize import word_tokenize
from rouge_score.tokenizers import Tokenizer as BaseTokenizer
@@ -16,7 +14,7 @@ def __init__(self, stemmer=False):
if stemmer:
self.stemmer = Cistem()

def tokenize(self, text: str) -> List[str]:
def tokenize(self, text: str) -> list[str]:
if self.stemmer:
# use only the stem part of the word
text, _ = self.stemmer.segment(text)
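The `List[str]` → `list[str]` change above relies on PEP 585 built-in generics, available from Python 3.9 onward, which is why the `from typing import List` import can be dropped once 3.8 support is gone. A minimal sketch of the pattern outside LangCheck:

```python
# PEP 585: built-in collection types are subscriptable on Python 3.9+,
# so annotations like list[str] need no import from typing.
def tokenize(text: str) -> list[str]:
    """Whitespace tokenizer used only to illustrate the annotation style."""
    return text.split()


print(tokenize("Guten Tag zusammen"))  # ['Guten', 'Tag', 'zusammen']
```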
30 changes: 14 additions & 16 deletions src/langcheck/metrics/de/reference_based_text_quality.py
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import List, Optional

from rouge_score import rouge_scorer

from langcheck.metrics.de._tokenizers import DeTokenizer
@@ -19,9 +17,9 @@


def semantic_similarity(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
) -> MetricValue[float]:
"""Calculates the semantic similarities between the generated outputs and
@@ -85,9 +83,9 @@ def semantic_similarity(


def rouge1(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the F1 metrics of the ROUGE-1 scores between the generated
outputs and the reference outputs. It evaluates the overlap of unigrams
@@ -127,9 +125,9 @@ def rouge1(


def rouge2(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the F1 metrics of the ROUGE-2 scores between the generated
outputs and the reference outputs. It evaluates the overlap of bigrams
@@ -169,9 +167,9 @@ def rouge2(


def rougeL(
generated_outputs: List[str] | str,
reference_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
reference_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the F1 metrics of the ROUGE-L scores between the generated
outputs and the reference outputs. It evaluates the longest common
@@ -221,8 +219,8 @@ def rougeL(


def _rouge(
generated_outputs: List[str], reference_outputs: List[str], rouge_type: str
) -> List[float]:
generated_outputs: list[str], reference_outputs: list[str], rouge_type: str
) -> list[float]:
"""Helper function for computing the rouge1, rouge2, and rougeL metrics.
This uses Google Research's implementation of ROUGE:
https://github.com/google-research/google-research/tree/master/rouge
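The `Optional[...]` → `... | None` rewrites in this file (and in the other `de` modules below) use PEP 604 union syntax. On Python 3.9 the `X | Y` form is only accepted inside annotations when they are evaluated lazily, which is why these modules keep `from __future__ import annotations` at the top of the diff. A minimal sketch of the same pattern, with a hypothetical helper for illustration:

```python
# With postponed evaluation of annotations (PEP 563), `str | None` in a
# signature is never evaluated at runtime, so it works on Python 3.9 even
# though the runtime `|` operator for types only arrived in 3.10.
from __future__ import annotations


def first_or_none(items: list[str] | str | None) -> str | None:
    """Return the first item (or character) if there is one, else None."""
    if not items:
        return None
    return items[0]


print(first_or_none(["rouge1", "rouge2"]))  # rouge1
print(first_or_none(None))                  # None
```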
40 changes: 19 additions & 21 deletions src/langcheck/metrics/de/reference_free_text_quality.py
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import List, Optional

from langcheck.metrics.de._translation import Translate
from langcheck.metrics.de.reference_based_text_quality import (
semantic_similarity,
@@ -30,11 +28,11 @@


def sentiment(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
local_overflow_strategy: str = "truncate",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the sentiment scores of generated outputs. This metric takes
on float values between [0, 1], where 0 is negative sentiment and 1 is
positive sentiment. (NOTE: when using an EvalClient, the sentiment scores
@@ -112,8 +110,8 @@ def sentiment(


def _sentiment_local(
generated_outputs: List[str], overflow_strategy: str
) -> List[Optional[float]]:
generated_outputs: list[str], overflow_strategy: str
) -> list[float | None]:
"""Calculates the sentiment scores of generated outputs using the
twitter-xlm-roberta-base-sentiment-finetunned model. This metric takes on
float values between [0, 1], where 0 is negative sentiment and 1 is positive
@@ -142,10 +140,10 @@ def _sentiment_local(


def fluency(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the fluency scores of generated outputs. This metric takes on
float values between [0, 1], where 0 is low fluency and 1 is high fluency.
(NOTE: when using an EvalClient, the fluency scores are either 0.0
@@ -220,11 +218,11 @@ def fluency(


def toxicity(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
local_overflow_strategy: str = "truncate",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the toxicity scores of generated outputs. This metric takes on
float values between [0, 1], where 0 is low toxicity and 1 is high toxicity.
(NOTE: when using an EvalClient, the toxicity scores are in steps of
@@ -301,8 +299,8 @@ def toxicity(


def _toxicity_local(
generated_outputs: List[str], overflow_strategy: str
) -> List[Optional[float]]:
generated_outputs: list[str], overflow_strategy: str
) -> list[float | None]:
"""Calculates the toxicity scores of generated outputs using the Detoxify
model. This metric takes on float values between [0, 1], where 0 is low
toxicity and 1 is high toxicity.
@@ -324,8 +322,8 @@ def _toxicity_local(


def flesch_kincaid_grade(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the readability of generated outputs using the Flesch-Kincaid.
It is the same as in English (but higher):
@@ -338,8 +336,8 @@ def flesch_kincaid_grade(


def flesch_reading_ease(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
) -> MetricValue[float]:
"""Calculates the readability of generated outputs using the Flesch Reading
Ease Score. This metric takes on float values between (-∞, 121.22], but
@@ -387,8 +385,8 @@ def flesch_reading_ease(


def ai_disclaimer_similarity(
generated_outputs: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
prompts: list[str] | str | None = None,
ai_disclaimer_phrase: str = (
"Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein."
),
14 changes: 6 additions & 8 deletions src/langcheck/metrics/de/source_based_text_quality.py
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import List, Optional

from langcheck.metrics.de._translation import Translate
from langcheck.metrics.en.source_based_text_quality import (
factual_consistency as en_factual_consistency,
@@ -20,11 +18,11 @@


def factual_consistency(
generated_outputs: List[str] | str,
sources: List[str] | str,
prompts: Optional[List[str] | str] = None,
generated_outputs: list[str] | str,
sources: list[str] | str,
prompts: list[str] | str | None = None,
eval_model: str | EvalClient = "local",
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the factual consistency between the generated outputs and
the sources. This metric takes on float values between [0, 1], where 0
means that the output is not at all consistent with the source text, and 1
@@ -123,8 +121,8 @@ def factual_consistency(


def context_relevance(
sources: List[str] | str, prompts: List[str] | str, eval_model: EvalClient
) -> MetricValue[Optional[float]]:
sources: list[str] | str, prompts: list[str] | str, eval_model: EvalClient
) -> MetricValue[float | None]:
"""Calculates the relevance of the sources to the prompts. This metric takes
on float values between [0, 1], where 0 means that the source text is not at
all relevant to the prompt, and 1 means that the source text is fully
24 changes: 12 additions & 12 deletions src/langcheck/metrics/en/pairwise_text_quality.py
@@ -2,7 +2,7 @@

import math
import random
from typing import List, Optional, cast
from typing import cast

from langcheck.metrics._pairwise_text_quality_utils import (
compute_pairwise_comparison_metric_values_with_consistency,
@@ -16,13 +16,13 @@


def simulated_annotators(
prompt_params: List[dict[str, str | None]],
prompt_params: list[dict[str, str | None]],
eval_model: EvalClient,
preference_data_path: str = "en/confidence_estimating/preference_data_examples.jsonl",
k: int = 5,
n: int = 5,
seed: int | None = None,
) -> List[float | None]:
) -> list[float | None]:
"""Compute a confidence score for the pairwise comparison metric based on
the method Simulated Annotators proposed in the paper "Trust or Escalate:
LLM Judges with Provable Guarantees for Human Agreement"
@@ -73,7 +73,7 @@ def simulated_annotators(
prompts.append(prompt_template.render(prompt_param))

# Get the response and top five logprobs of the first token
responses: List[Optional[TextResponseWithLogProbs]] = (
responses: list[TextResponseWithLogProbs | None] = (
eval_model.get_text_responses_with_log_likelihood(
prompts, top_logprobs=5
)
@@ -83,7 +83,7 @@
if response:
response = cast(TextResponseWithLogProbs, response)
top_five_first_token_logprobs = cast(
List[TokenLogProb], response["response_logprobs"][0]
list[TokenLogProb], response["response_logprobs"][0]
)
# Extract logprobs for tokens 'A' and 'B'
logprobs_dict = {
@@ -110,20 +110,20 @@


def pairwise_comparison(
generated_outputs_a: List[str] | str,
generated_outputs_b: List[str] | str,
prompts: List[str] | str,
sources_a: Optional[List[str] | str] = None,
sources_b: Optional[List[str] | str] = None,
reference_outputs: Optional[List[str] | str] = None,
generated_outputs_a: list[str] | str,
generated_outputs_b: list[str] | str,
prompts: list[str] | str,
sources_a: list[str] | str | None = None,
sources_b: list[str] | str | None = None,
reference_outputs: list[str] | str | None = None,
enforce_consistency: bool = True,
calculated_confidence: bool = False,
preference_data_path: str = "en/confidence_estimating/preference_data_examples.jsonl",
k: int = 5,
n: int = 5,
seed: int | None = None,
eval_model: EvalClient | None = None,
) -> MetricValue[Optional[float]]:
) -> MetricValue[float | None]:
"""Calculates the pairwise comparison metric. This metric takes on float
values of either 0.0 (Response A is better), 0.5 (Tie), or 1.0 (Response B
is better). The score may also be `None` if it could not be computed.
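For context on the logprob handling in `simulated_annotators` above: once the log probabilities for the "A" and "B" tokens have been pulled out of `response_logprobs`, a natural way to turn them into a preference probability is to renormalize over just those two tokens. This is a hedged sketch of that idea, not LangCheck's exact implementation; the dictionary layout and the helper name are assumptions for illustration only.

```python
from __future__ import annotations

import math


def preference_prob_for_b(logprobs_dict: dict[str, float]) -> float | None:
    """Renormalize the logprobs of the 'A' and 'B' tokens into P(B is better).

    Hypothetical helper: assumes `logprobs_dict` maps token text to logprob,
    e.g. {"A": -0.22, "B": -1.61}, taken from the top logprobs of the first
    response token. Returns None if either token is missing.
    """
    if "A" not in logprobs_dict or "B" not in logprobs_dict:
        return None
    prob_a = math.exp(logprobs_dict["A"])
    prob_b = math.exp(logprobs_dict["B"])
    return prob_b / (prob_a + prob_b)


print(preference_prob_for_b({"A": -0.22, "B": -1.61}))  # ~0.20
```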