From b594d83d581b6d66314cb841a07dea060a02122b Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 18 Dec 2024 13:35:44 -0800 Subject: [PATCH 1/3] Exported https://github.com/Future-House/aviary/pull/148's contents --- src/aviary/core.py | 2 + src/aviary/utils.py | 39 ++++++- .../test_extract_answer[complex].yaml | 109 ++++++++++++++++++ .../test_extract_answer[empty-proposal].yaml | 102 ++++++++++++++++ .../test_extract_answer[gave-two].yaml | 102 ++++++++++++++++ .../test_extract_answer[not in options].yaml | 108 +++++++++++++++++ tests/test_utils.py | 31 ++++- 7 files changed, 491 insertions(+), 2 deletions(-) create mode 100644 tests/cassettes/test_extract_answer[complex].yaml create mode 100644 tests/cassettes/test_extract_answer[empty-proposal].yaml create mode 100644 tests/cassettes/test_extract_answer[gave-two].yaml create mode 100644 tests/cassettes/test_extract_answer[not in options].yaml diff --git a/src/aviary/core.py b/src/aviary/core.py index 587e76b5..6170a7bd 100644 --- a/src/aviary/core.py +++ b/src/aviary/core.py @@ -40,6 +40,7 @@ EvalAnswerMode, encode_image_to_base64, eval_answer, + extract_answer, is_coroutine_callable, partial_format, ) @@ -82,6 +83,7 @@ "encode_image_to_base64", "eval_answer", "eval_answer", + "extract_answer", "fenv", "is_coroutine_callable", "join", diff --git a/src/aviary/utils.py b/src/aviary/utils.py index 35687bd2..2dee1bf1 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -22,7 +22,7 @@ DEFAULT_EVAL_MODEL_NAME = "gpt-4o" -LLM_BOOL_EVAL_CONFIG = { +LLM_BOOL_EVAL_CONFIG: dict[str, Any] = { "prompt": ( "Here is a question, the correct answer to the question, and a proposed answer" " to the question. Please tell me if the proposed answer is correct, given the" @@ -35,6 +35,18 @@ "temperature": 0, } +LLM_EXTRACT_CONFIG = LLM_BOOL_EVAL_CONFIG | { + "prompt": ( + "You are evaluating answers for a test which has fixed options. " + "Repeat back which option the proposed answer matches. " + "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. " + "If the proposed answer is empty, invalid, or ambiguous, " + "return an empty string." + "\n\nOptions:\n{options}" + "\n\nProposed answer: {proposed_answer}" + ) +} + LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | { "prompt": ( "Here is a question, the correct answer to the question, and a rubric for" @@ -175,6 +187,31 @@ async def eval_answer( raise RuntimeError(f"Invalid evaluation mode: {eval_mode}") +async def extract_answer( + proposed_answer: str, options: Sequence[str], llm_eval_config: dict | None = None +) -> str | None: + """Extract the answer matching a proposal from a list of options using an LLM.""" + for option in options: + if proposed_answer.strip().casefold() == option.strip().casefold(): + return option + + default_config = LLM_EXTRACT_CONFIG + config = llm_eval_config or default_config + response_msg = await run_prompt( + prompt=config.get("prompt", default_config["prompt"]).format( + options="\n".join(options), + proposed_answer=proposed_answer, + ), + model=config.get("model", default_config["model"]), + temperature=config.get("temperature", default_config["temperature"]), + ) + answer = response_msg.strip().casefold() # noqa: FURB184 + for option in options: + if answer == option.strip().casefold(): + return option + return None + + _CAPITAL_A_INDEX = ord("A") diff --git a/tests/cassettes/test_extract_answer[complex].yaml b/tests/cassettes/test_extract_answer[complex].yaml new file mode 100644 index 00000000..319c6183 --- /dev/null +++ b/tests/cassettes/test_extract_answer[complex].yaml @@ -0,0 +1,109 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nEconomic factors\nSocial + unrest\nPolitical corruption\n\nProposed answer: Based on the context given, + Serif et al. (2026) claim that the overwhelming cause of regime collapse arises + from economic factors. Yet, most other scholars (Gerald and Robinson for example) + believe the collapse was due to social unrest because of the prolonged epidemic + of 2025. I tend to agree with the majority - although I can see both sides. + Thus my response is that the social unrest was the significant factor in the + collapse of the regime.", "role": "user"}], "model": "gpt-4o", "temperature": + 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "861" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArLc4Oa9E9KtrJ1REyAUOTal8TF8RnbgaKq3x05TZNW + FInFw/3uPb87+xARQqWgOaG8Zp43RsXr8nP/9VCtNKzbGuYvJSab3cdj+rySiw2dBAVud8D9WXXH + sTEKvER9wtwC8xBck2w2Xyyy5SztQIMCVJBVxsdzjNNpOo+nq3i67IU1Sg6O5uQ1IoSQQ3eGiFrA + nuZkOjlXGnCOVUDzoYkQalGFCmXOSeeZ9nQyQo7ag+5SPyGXTJFWW3BXPRbK1rEQUbdK9fXjcKnC + yljcup4P9VJq6erCAnOowwXOo6EdPUaEvHXDtVd5qbHYGF94fAcdDJPl4uRHx3WONO2ZR8/UpSib + 3LArBHgmlbvYDuWM1yBG6bhK1gqJFyC6GPp3mFvep8Glrv5jPwLOwXgQhbEgJL8eeGyzED7bX23D + krvA1H07D01RSl2BNVae3rs0BcvuxUownpQ0OkY/AAAA//8DAOEzla34AgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f42461018c5eb29-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 18 Dec 2024 21:33:52 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "235" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999790" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_366dfd5f505d08facd0f7d10e64a9f5e + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer[empty-proposal].yaml b/tests/cassettes/test_extract_answer[empty-proposal].yaml new file mode 100644 index 00000000..cb61987f --- /dev/null +++ b/tests/cassettes/test_extract_answer[empty-proposal].yaml @@ -0,0 +1,102 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: + ", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "369" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyht0pb2xokLIEDiAkKRa29Sg+O1bBcFqv47cpq+ + 1CJx8WFmZzyz9jphDJSEOQOx5EE0Vqc31Vfb3mXF/fOoeDG3Ol+0D4WZPD79uOoVBlFBiw8UYae6 + EtRYjUGR2dLCIQ8YXYfTvBiPp5M874iGJOooq21IC0pH2ahIs+s0m/TCJSmBHubsLWGMsXV3xohG + Ygtzlg12SIPe8xphvh9iDBzpiAD3XvnATYDBgRRkApou9THssFp5HlOZldY9vtnfo6m2jha+5/d4 + pYzyy9Ih92Sipw9koWM3CWPvXZ/VSUSwjhobykCfaKLhdLi1g8MCD2RfFQIFri9oTsxKiYEr7Y/W + AYKLJcozQ8aAr6SiIyI5qnye5ZL3trYy9X/sD4QQaAPK0jqUSlzs25nH3/XX2H7FXWDw3z5gU1bK + 1OisU9sHrmw5q/iCz6osv4Zkk/wCAAD//wMA7iORIukCAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f424615ca81eb32-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 18 Dec 2024 21:33:53 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "171" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999912" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_de2070d3e02afd584ac618042c22382d + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer[gave-two].yaml b/tests/cassettes/test_extract_answer[gave-two].yaml new file mode 100644 index 00000000..a70e7e68 --- /dev/null +++ b/tests/cassettes/test_extract_answer[gave-two].yaml @@ -0,0 +1,102 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: + A or B", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "375" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4xSTWsCMRS8768I7+wWv7XeChVKoR4KPbSlLDF5u5s2m5cmWVHE/16yWl3RQi85 + zLyZzLxkmzAGSsKMgSh5EJXV6V2+Wq82z+Ut0n3xstBv8+H8cWFK9/r08A2dqKDlJ4rwq7oRVFmN + QZHZ08IhDxhde5PBcDSajAe9hqhIoo6ywoZ0SGm/2x+m3WnaHR+EJSmBHmbsPWGMsW1zxohG4hpm + rNv5RSr0nhcIs+MQY+BIRwS498oHbgJ0TqQgE9A0qduww7z2PKYytdYHfHe8R1NhHS39gT/iuTLK + l5lD7slETx/IQsPuEsY+mj71WUSwjiobskBfaKLhZLC3g9MCT+ShKgQKXF/RnJllEgNX2rfWAYKL + EuWFIWPAa6moRSStypdZrnnvaytT/Mf+RAiBNqDMrEOpxNW+jXn8XX+NHVfcBAa/8QGrLFemQGed + 2j9wbjPZny4Fx8m0D8ku+QEAAP//AwDYqi3B6QIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f42460a68a467f1-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 18 Dec 2024 21:33:51 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "241" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999911" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_83d07d0983e1d4d1995bfa068db503dd + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer[not in options].yaml b/tests/cassettes/test_extract_answer[not in options].yaml new file mode 100644 index 00000000..e48c8e4c --- /dev/null +++ b/tests/cassettes/test_extract_answer[not in options].yaml @@ -0,0 +1,108 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nB\nC\n\nProposed answer: + F", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "367" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "0" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArr5gYl/d9uLIxIMLBQFLn2JXFxfJbtQKuq3x05DU0r + QGLJ8H73Xt5dckwYAyVhzUDUPIjG6vS+/Ni3T5TnlpYPyA/ukT4r2r3kcqKeYRQdtN2hCN+uO0GN + 1RgUmTMWDnnAmJovJtPZbDGfZB1oSKKOtsqGdErpOBtP02yZZvPeWJMS6GHNXhPGGDt2z1jRSNzD + mnUxndKg97xCWF+GGANHOirAvVc+cBNgNEBBJqDpWm9gA9fIYdl6HpuZVuteP13epamyjra+5xe9 + VEb5unDIPZmY6wNZ6OgpYeyt26m9qQnWUWNDEegdTQycr85xMBxxgHnPAgWuB3nRn+E2rJAYuNL+ + 6iQguKhRDs7hfryViq5AcrXyzy6/ZZ/XVqb6T/wAhEAbUBbWoVTidt9hzGH8w/4au5y4Kwz+4AM2 + RalMhc46df7IpS1WJd/yVZlNlpCcki8AAAD//wMAJ6aXP+0CAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f4246051c1ceb26-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 18 Dec 2024 21:33:50 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=in2yLMzYdxfPIHSQDPq17chYgGhrTOolB6HZrJk9Iy8-1734557630-1.0.1.1-LYUU4oNWUKO8gNwjcIktYjnSyIsGLKQGmQKI54P4UxfMJ330MXeZFWWhVoJnP0b1M92ejFaHTWHlz4eHH30gIA; + path=/; expires=Wed, 18-Dec-24 22:03:50 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=El.JjK6nMT19ye2jesHXCIAySeg4BN7pKN7mVnzqSM8-1734557630598-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "253" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999912" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_5992eb433053f3b82b29a5319a96ef7e + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_utils.py b/tests/test_utils.py index 3469f01c..e3e7af1c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,7 @@ import pytest -from aviary.core import eval_answer +from aviary.core import eval_answer, extract_answer from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion from tests.conftest import VCR_DEFAULT_MATCH_ON @@ -39,6 +39,35 @@ async def test_eval_answer( assert await eval_answer(proposed, correct, question, eval_mode) == expected +@pytest.mark.vcr +@pytest.mark.parametrize( + ("proposed_answer", "options", "expected"), + [ + pytest.param("A", ["A", "B", "C"], "A", id="exact-uppercase"), + pytest.param("a", ["A", "B", "C"], "A", id="exact-lowercase"), + pytest.param("F", ["B", "C"], None, id="not in options"), + pytest.param("A or B", ["A", "B", "C"], None, id="gave-two"), + pytest.param( + "Based on the context given, Serif et al. (2026) claim that " + "the overwhelming cause of regime collapse arises from economic factors. " + "Yet, most other scholars (Gerald and Robinson for example) believe the collapse " + "was due to social unrest because of the prolonged epidemic of 2025. I tend to agree " + "with the majority - although I can see both sides. Thus my response " + "is that the social unrest was the significant factor in the collapse of the regime.", + ["Economic factors", "Social unrest", "Political corruption"], + "Social unrest", + id="complex", + ), + pytest.param("", ["A", "B", "C"], None, id="empty-proposal"), + ], +) +@pytest.mark.asyncio +async def test_extract_answer( + proposed_answer: str, options: Sequence[str], expected: str | None +) -> None: + assert await extract_answer(proposed_answer, options) == expected + + @pytest.mark.vcr @pytest.mark.asyncio async def test_eval_llm_config(): From d018adfaaba6622a6c4ef11293a76dff9804e0a9 Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 18 Dec 2024 13:47:38 -0800 Subject: [PATCH 2/3] Moved MultipleChoiceQuestion to now use extract_answer --- src/aviary/utils.py | 64 ++++++------------- ...t-match-and-llm-has-innate-knowledge].yaml | 39 ++++++----- ...nt-match-and-no-llm-innate-knowledge].yaml | 39 ++++++----- ...AEvaluation.test_grade[empty-answer1].yaml | 39 ++++++----- ...AEvaluation.test_grade[empty-answer2].yaml | 39 ++++++----- ...AEvaluation.test_grade[empty-answer3].yaml | 40 ++++++------ ...on.test_grade[matched-correct-option].yaml | 47 ++++++++------ ....test_grade[matched-incorrect-option].yaml | 39 ++++++----- ...n.test_grade[matched-several-options].yaml | 39 ++++++----- ...ion.test_grade[matched-unsure-option].yaml | 40 ++++++------ tests/test_utils.py | 16 ++--- 11 files changed, 205 insertions(+), 236 deletions(-) diff --git a/src/aviary/utils.py b/src/aviary/utils.py index 2dee1bf1..fcbf130f 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -3,10 +3,9 @@ import inspect import io import random -import re import string from ast import literal_eval -from collections.abc import Awaitable, Callable, Sequence +from collections.abc import Sequence from enum import StrEnum from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, cast @@ -217,16 +216,6 @@ async def extract_answer( class MultipleChoiceQuestion(BaseModel): QUESTION_PROMPT_TEMPLATE: ClassVar[str] = "Q: {question}\n\nOptions:\n{options}" - # TODO: combine with above eval_answer and its prompts - EVALUATION_PROMPT_TEMPLATE: ClassVar[str] = ( - "Given the following question and a proposed answer to the question, return the" - " single-letter choice in the question that matches the proposed answer." - " If the proposed answer is blank or an empty string," - " or multiple options are matched, respond with '0'." - "\n\nQuestion: {qa_prompt}" - "\n\nProposed Answer: {qa_answer}" - "\n\nSingle Letter Answer:" - ) DEFAULT_UNSURE_OPTION: ClassVar[str] = ( "Insufficient information to answer this question" ) @@ -317,18 +306,14 @@ def split_options(options: str) -> list[str]: return split_options async def grade( - self, answer: str, prompt_runner: Callable[[str], Awaitable[str]] | None = None - ) -> "tuple[MultipleChoiceEvaluation, str, str]": - if prompt_runner is None: - prompt_runner = run_prompt - eval_prompt = self.EVALUATION_PROMPT_TEMPLATE.format( - qa_prompt=self.question_prompt, qa_answer=answer - ) - raw_evaluation = await prompt_runner(eval_prompt) - evaluation, parsed_answer = MultipleChoiceEvaluation.from_answer( - raw_evaluation, self + self, proposed_answer: str + ) -> "tuple[MultipleChoiceEvaluation, str | None]": + extracted_answer = await extract_answer( + proposed_answer=proposed_answer, options=self.options ) - return evaluation, raw_evaluation, parsed_answer + return MultipleChoiceEvaluation.from_answer( + extracted_answer, self + ), extracted_answer class MultipleChoiceEvaluation(StrEnum): @@ -360,32 +345,19 @@ def calculate_accuracy_precision( @classmethod def from_answer( - cls, answer: str, question: MultipleChoiceQuestion - ) -> "tuple[MultipleChoiceEvaluation, str]": + cls, extracted_answer: str | None, question: MultipleChoiceQuestion + ) -> "MultipleChoiceEvaluation": """Make an evaluation from the input answer and multiple choice question. Returns: - Two-tuple of answer enum and the raw answer extracted from the input answer. + Evaluation corresponding to the parsed answer. """ - # SEE: https://regex101.com/r/vcE9Hb/1 - letter_search = re.search(r"([A-Z])\)?", answer, re.DOTALL) - # Get the letter answer, or fail over to the first non-whitespace char - answer_char = ( - letter_search.group(1) - if letter_search is not None - else answer.split()[0][0].upper() - ) - answer_letter_index = ord(answer_char[0]) - _CAPITAL_A_INDEX - if answer_letter_index < 0 or answer_letter_index > len(question.options): - # The result extracted was not in the options (e.g. '0') - return cls.INCORRECT, answer_char + if extracted_answer is None: + return MultipleChoiceEvaluation.INCORRECT # From here, if we don't match either the ideal or the unsure multiple choice # options then we declare the answer as incorrect. - if ( - question.unsure_answer_index is not None - and answer_letter_index == question.unsure_answer_index - ): - return cls.UNSURE, cast(str, question.unsure_answer) - if answer_letter_index == question.ideal_answer_index: - return cls.CORRECT, question.ideal_answer - return cls.INCORRECT, question.options[answer_letter_index] + if extracted_answer == question.ideal_answer: + return MultipleChoiceEvaluation.CORRECT + if question.unsure_answer and extracted_answer == question.unsure_answer: + return MultipleChoiceEvaluation.UNSURE + return MultipleChoiceEvaluation.INCORRECT diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml index be2df091..c87a0ba3 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: 14\n\nSingle - Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information + to answer this question\ncheesecake\n11\n42\n\nProposed answer: 14", "role": + "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "513" + - "437" content-type: - application/json host: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hVC57goqamd3HpooPQBORVSilGktaNW1gpJoY+Q/15k - u7FDU+jFh/l2xrNr7xNCqJJ0QajY8iAaq9Prav24NO7rNmMKH55ulvd8VaxXHu/ejaOT6MDNK4jw - 47oQ2FgNQaHpsHDAA8TUaX6ZZTnLi3kLGpSgo622Ic0wnbFZlrIiZVe9cYtKgKcL8pwQQsi+fcaK - RsIHXRA2+VEa8J7XQBfHIUKoQx0Vyr1XPnAT6GSAAk0A07ZmY91BtfM81jI7rXv9cHyRxto63Pie - H/VKGeW3pQPu0cRQH9DSlh4SQl7ahXYnHal12NhQBnwDEwOnrOjy6HDCEe1ZwMD12DSfnIkrJQSu - tB9dhAoutiAH63A+vpMKRyAZLf27zLnsbnFl6v/ED0AIsAFkaR1IJU4XHsYcxB/sr7HjkdvC1H/6 - AE1ZKVODs05137iyJc/nspBcTCuaHJJvAAAA//8DAGY5XevsAgAA + H4sIAAAAAAAAA4xSy07DMBC85yusPTcobVJIe6u4wKESIHFCKHKdTWpwvJa9FaCq/46cvtUicfFh + Zmc8s/Y6EQJ0DVMBailZdc6ks+Zrko2cxvljMXvunmavPHkJTA9zk9/DICpo8YGK96obRZ0zyJrs + llYeJWN0Hd7lxXhc5sOyJzqq0URZ6zgtKB1loyLNyjS73QmXpBUGmIq3RAgh1v0ZI9oav2EqssEe + 6TAE2SJMD0NCgCcTEZAh6MDSMgyOpCLLaPvUp7DHZhVkTGVXxuzwzeEeQ63ztAg7/oA32uqwrDzK + QDZ6BiYHPbtJhHjv+6zOIoLz1DmumD7RRsNyvLWD4wKP5K4qMLE0VzRnZlWNLLUJJ+sAJdUS6wtD + IUCuak0nRHJS+TLLNe9tbW3b/9gfCaXQMdaV81hrdbVvbx5/119jhxX3gSH8BMauarRt0Tuvtw/c + uGrSyIWcNFleQrJJfgEAAP//AwA5BypS6QIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fde1cf88cf1b-SJC + - 8f4256d28f9615ff-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:29 GMT + - Wed, 18 Dec 2024 21:45:18 GMT Server: - cloudflare Transfer-Encoding: @@ -79,7 +78,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "363" + - "208" openai-version: - "2020-10-01" strict-transport-security: @@ -91,13 +90,13 @@ interactions: x-ratelimit-remaining-requests: - "9999" x-ratelimit-remaining-tokens: - - "29999874" + - "29999896" x-ratelimit-reset-requests: - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_aff8daa48aa43d3df077f97da6136e5a + - req_7db1d1f6dded4679e43cc12a2183fa21 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml index 38077163..f043c502 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer - is 14004\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer + is 14004", "role": "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "536" + - "459" content-type: - application/json host: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJLT8MwEITv+RWWzw1KS2lCb3BCSDykcuAhFBl7kxocr2Vveajqf0dO - 2yQVIHHJYb6dyewm64QxrhWfMy6XgmTjTHpWPV6fFzf1Jcwebu4WF4tbunKFwvzj/n3GR9GBL68g - ae86ktg4A6TRbrH0IAhi6jg/nk7zLC9OWtCgAhNttaN0iukkm0zTrEizXa5copYQ+Jw9JYwxtm6f - saJV8MnnLBvtlQZCEDXweTfEGPdoosJFCDqQsMRHPZRoCWzbOhvqHqpVELGWXRmz0zfdiwzWzuNL - 2PFOr7TVYVl6EAFtDA2Ejrd0kzD23C60OujIncfGUUn4BjYGjscn2zzen3BAd4yQhBmaZqNf4koF - JLQJg4twKeQSVG/tzydWSuMAJIOlf5b5LXu7uLb1f+J7ICU4AlU6D0rLw4X7MQ/xB/trrDtyW5iH - r0DQlJW2NXjn9fYbV64U+akqlJDjiieb5BsAAP//AwBRMcSQ7AIAAA== + H4sIAAAAAAAAAwAAAP//jJLNTsMwEITveQprzw1K/6DNDakIwYkLqgRFkeNsEhfHNrarUlV9d2S3 + TVJRJC4+7Lcznl17HxECvICUAKupY40W8X25ndktPqzrRb37envdVk/LxfKRvkx4/gwDr1D5Gpk7 + q26YarRAx5U8YmaQOvSuw7vxZDqdjYdJAI0qUHhZpV08UfEoGU3iZBYntydhrThDCyl5jwghZB9O + H1EW+A0pCTah0qC1tEJI2yZCwCjhK0Ct5dZR6WDQQaakQxlSr2AFfWSw3Fjqk8mNEKf6ob1LqEob + ldsTb+sll9zWmUFqlfS+1ikNgR4iQj7CTJuLmKCNarTLnPpE6Q3nw6MddEvs4Jk55ajoaUaDK2ZZ + gY5yYXsrAUZZjUWn7PZHNwVXPRD1Rv6d5Zr3cWwuq//Yd4Ax1A6LTBssOLuct2sz6H/YX23tikNg + sDvrsMlKLis02vDjI5c6m5c0p/MyGc8gOkQ/AAAA//8DAEsANTftAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdc63fbf9e53-SJC + - 8f42569f7d2a2379-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:25 GMT + - Wed, 18 Dec 2024 21:45:10 GMT Server: - cloudflare Transfer-Encoding: @@ -79,7 +78,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "212" + - "240" openai-version: - "2020-10-01" strict-transport-security: @@ -91,13 +90,13 @@ interactions: x-ratelimit-remaining-requests: - "9999" x-ratelimit-remaining-tokens: - - "29999868" + - "29999890" x-ratelimit-reset-requests: - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_afd8c66d84f3b42a8cd2b8a6bf855054 + - req_363f6da2908247ad8c711b11d1593ae7 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml index 057ef1d0..662716da 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: \n\nSingle - Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: ", "role": + "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "517" + - "440" content-type: - application/json host: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFLLTsMwELznKyyfG+S2gaa5FegFJKRKnEAocp1NaurYlr3hoar/jpyG - JBUgcfFhZmc8O/YhIoTKgmaEih1HUVsVr8qnh/XbzfvdI7LN7WzF9vf8Mik31xtYN3QSFGb7CgK/ - VRfC1FYBSqNPtHDAEYLrdDFPkgVbpGlL1KYAFWSVxTgx8YzNkpilMbvqhDsjBXiakeeIEEIO7Rki - 6gI+aEbY5BupwXteAc36IUKoMyoglHsvPXKNdDKQwmgE3aZmY9xB2XgeYulGqQ4/9hcpU1lntr7j - e7yUWvpd7oB7o4OpR2Npyx4jQl7ahZqzjNQ6U1vM0exBB8MpW5786FDhiO04NMjVCJ52LZzb5QUg - l8qPGqGCix0Ug3SojzeFNCMiGi39M8xv3qfFpa7+Yz8QQoBFKHLroJDifOFhzEH4YH+N9SW3gan/ - 9Ah1XkpdgbNOnt64tPmy5Fu+LNk8pdEx+gIAAP//AwDTwVpp7AIAAA== + H4sIAAAAAAAAA4xSy27CMBC85yusPZMqkAApt6rHSr0U9VJVkbE3wa3jtWyj8hD/XjlAAEGlXnyY + 2RnPrL1LGAMlYcZALHkQrdXpU/1TbhfrIn+j+Vxs281Cv7/k9JrN1/gMg6igxReKcFI9CGqtxqDI + HGjhkAeMrsNpXozHZT6cdkRLEnWUNTakBaWjbFSkWZlmk6NwSUqghxn7SBhjbNedMaKRuIYZywYn + pEXveYMw64cYA0c6IsC9Vz5wE2BwJgWZgKZLfQk7rFeex1RmpfUR3/f3aGqso4U/8j1eK6P8snLI + PZno6QNZ6Nh9wthn12d1FRGso9aGKtA3mmhYTg52cF7gmTxWhUCB6zuaK7NKYuBK+4t1gOBiifLG + kDHgK6nogkguKt9mued9qK1M8x/7MyEE2oCysg6lEnf7dubxd/011q+4Cwx+4wO2Va1Mg846dXjg + 2lZ8+ihLycWwhmSf/AIAAP//AwBsWlME6QIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fddcea1d251d-SJC + - 8f4256cce931eb29-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:28 GMT + - Wed, 18 Dec 2024 21:45:17 GMT Server: - cloudflare Transfer-Encoding: @@ -79,7 +78,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "174" + - "217" openai-version: - "2020-10-01" strict-transport-security: @@ -91,13 +90,13 @@ interactions: x-ratelimit-remaining-requests: - "9999" x-ratelimit-remaining-tokens: - - "29999872" + - "29999895" x-ratelimit-reset-requests: - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_4d40eb2c66dfd308a7b75c7cd80c405b + - req_520872e529ccbb680d27a3729fbe637e status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml index a0acce15..483daa67 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: \n\nSingle Letter - Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information + to answer this question\ncheesecake\n11\n42\n\nProposed answer: ", "role": "user"}], + "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "511" + - "435" content-type: - application/json host: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJRT4MwFIXf+RVNn4eBicPx5oszWTJjfNBoDOnaC9SVtmlLMrPsv5sW - HCzOxBceznfP4dwLhwghzBkuEKYNcbTVIr6r3jar1eb5IXtKb7brPL9/3e/a5fxFrOtHPPMOtf0E - 6n5cV1S1WoDjSvaYGiAOfGqaX2dZnuTLJIBWMRDeVmsXZyqeJ/MsTm7jZDEYG8UpWFyg9wghhA7h - 6StKBntcoBATlBasJTXg4jSEEDZKeAUTa7l1RDo8GyFV0oEMrZOpbqDqLPG1ZCfEoB9PLxKq1kZt - 7cBPesUlt01pgFglfah1SuNAjxFCH2Gh7qwj1ka12pVO7UD6wDRZ9Hl4POGEDswpR8TUlM8uxJUM - HOHCTi6CKaENsNE6no90jKsJiCZL/y5zKbtfnMv6P/EjoBS0A1ZqA4zT84XHMQP+B/tr7HTkUBjb - L+ugLSsuazDa8P4bV7ok+ZLdMkLTCkfH6BsAAP//AwBwbnWk7AIAAA== + H4sIAAAAAAAAAwAAAP//jFKxbsIwFNzzFdabSRUg0CRbh7ZDB9Shqqqqioz9Etw6tmU7AoT498oB + QhBU6uLh7t357tm7iBAQHAoCbEU9a4yMH6p1Pp5/uKf59rF5b95e08X4hanFZv3sMhgFhV5+I/Mn + 1R3TjZHohVYHmlmkHoPr+H6azmbZdJx3RKM5yiCrjY9THU+SSRonWZzMj8KVFgwdFOQzIoSQXXeG + iIrjBgqSjE5Ig87RGqHohwgBq2VAgDonnKfKw+hMMq08qi71ELZYtY6GVKqV8ojv+3ukro3VS3fk + e7wSSrhVaZE6rYKn89pAx+4jQr66Pu1FRDBWN8aXXv+gCoZZerCD8wLP5LEqeO2pvKG5MCs5eiqk + G6wDGGUr5FeGhABtudADIhpUvs5yy/tQW6j6P/ZngjE0HnlpLHLBbvbtzMPv+musX3EXGNzWeWzK + SqgarbHi8MCVKfOKLmleJdMMon30CwAA//8DAFd1apnpAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fde81f05ceb1-SJC + - 8f4256d7fd75cf0d-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:30 GMT + - Wed, 18 Dec 2024 21:45:19 GMT Server: - cloudflare Transfer-Encoding: @@ -79,7 +78,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "332" + - "520" openai-version: - "2020-10-01" strict-transport-security: @@ -91,13 +90,13 @@ interactions: x-ratelimit-remaining-requests: - "9999" x-ratelimit-remaining-tokens: - - "29999875" + - "29999896" x-ratelimit-reset-requests: - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_817ca7ae018d7baa48236c7ad4f4f151 + - req_f943511f12de0306ff59cccd017e98f1 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml index d70cc972..46a00d4f 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml @@ -1,14 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What method - was used to demonstrate that the enzyme PafA is stable after incubation with - 4M urea for 14 days?\n\nOptions:\nA) cryo EM\nB) Insufficient information to - answer this question\nC) NMR\nD) x-ray crystallography\nE) circular dichroism\n\nProposed - Answer: \n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\ncryo EM\nInsufficient information + to answer this question\nNMR\nx-ray crystallography\ncircular dichroism\n\nProposed + answer: ", "role": "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -17,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "624" + - "467" content-type: - application/json host: @@ -47,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJLNboMwEITvPIXlM1SEkJJyS9VIlfpz6SmpKuSYhTg1tmVvpEZR3r0y - IUDVVOqFw3w7w+zCMSCEipLmhPItQ94YGS2q9eujmL9NXtaJWKW7wxM+r+TD8rC4ny5p6B16swOO - F9cN142RgEKrM+YWGIJPnWTTNM3i7G7SgkaXIL2tNhilOkriJI3ieRTfdsatFhwczcl7QAghx/bp - K6oSvmhO4vCiNOAcq4Hm/RAh1GrpFcqcEw6ZQhoOkGuFoNrW8Vi3UO0d87XUXspOP/Uvkro2Vm9c - x3u9Ekq4bWGBOa18qENtaEtPASEf7UL7Hx2psboxWKD+BOUDJ9PknEeHE45ox1Ajk2PTNLwSV5SA - TEg3ugjljG+hHKzD+di+FHoEgtHSv8tcyz4vLlT9n/gBcA4GoSyMhVLwnwsPYxb8D/bXWH/ktjB1 - B4fQFJVQNVhjxfkbV6aosvkMNrMqzWhwCr4BAAD//wMANO06tewCAAA= + H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hViz3FxXs3jFkIooZSQQ3spxSjy2lYrS6qkkJaQ/14k + p7ZDUujFhx3N55mVjhEhwDOYE2AldazSIl7kh9lgtZmMt4+r4fPL08NivN5uPvvLw3Kzhp53qN07 + MvfrumOq0gIdV7KWmUHq0FP7k+FoPJ4OB0kQKpWh8LZCu3ik4kEyGMXJNE7uz8ZScYYW5uQ1IoSQ + Y/j6iDLDL5iTgAmTCq2lBcK8OUQIGCX8BKi13DoqHfRakSnpUIbU3bHBfG+pTyX3Qpznp+Y/QhXa + qJ09680855LbMjVIrZKeaZ3SENRTRMhb6LO/iAjaqEq71KkPlB4469c4aBfYiueq4JSj4obnApZm + 6CgXtrMOYJSVmF0BCQG6z7jqCFGn8nWWW+y6NpfFf/CtwBhqh1mqDWac3ewb4P51/XWsWXEIDPbb + OqzSnMsCjTa8vuBcp7Oc7ugsT4ZTiE7RDwAAAP//AwBp6cyD6QIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdedda0ceb36-SJC + - 8f4256dfbb4a7ac7-SJC Connection: - keep-alive Content-Encoding: @@ -66,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:31 GMT + - Wed, 18 Dec 2024 21:45:20 GMT Server: - cloudflare Transfer-Encoding: @@ -80,7 +78,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "259" + - "207" openai-version: - "2020-10-01" strict-transport-security: @@ -92,13 +90,13 @@ interactions: x-ratelimit-remaining-requests: - "9999" x-ratelimit-remaining-tokens: - - "29999845" + - "29999889" x-ratelimit-reset-requests: - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_223a9415a5a19029f86768ffbabf3d6f + - req_06b5243be6b66dfc6827056b2c49f4c4 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml index 7f73abaa..f9092a53 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer - is 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer + is 94107", "role": "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "536" + - "459" content-type: - application/json host: @@ -36,7 +35,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "1" + - "0" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6ggoXlwyyGKcql6Sx+qkLEXcGtsyzZSoyj/vTKQ - QNRU6oXDfDvD7MIpQAhzhjOEaU0cbbSINuXb02Z73L287ndH1ias2C8k3a0OB1Y949A7VPEJ1F1c - D1Q1WoDjSvaYGiAOfGqynKfpMl6u0g40ioHwtkq7KFXRLJ6lUbyK4sVgrBWnYHGG3gOEEDp1T19R - MvjGGYrDi9KAtaQCnF2HEMJGCa9gYi23jkiHwxFSJR3IrvV2qhsoW0t8LdkKMejn64uEqrRRhR34 - VS+55LbODRCrpA+1Tmnc0XOA0Ee3UHvTEWujGu1yp75A+sAkeezz8HjCCR2YU46IqWkR3onLGTjC - hZ1cBFNCa2CjdTwfaRlXExBMlv5d5l52vziX1X/iR0ApaAcs1wYYp7cLj2MG/A/219j1yF1hbI/W - QZOXXFZgtOH9Ny51vi5JQdZlPF/h4Bz8AAAA//8DAKuPA4PsAgAA + H4sIAAAAAAAAAwAAAP//jJLLTsMwEEX3+QrL6wSlL9pkx4KHVLFDdIFQ5DiTxOB4LNtRgar/jpyG + JBVFYuPFnLnXd8Y+BIRQUdCUUF4zxxsto5tyvzHPt1X5uJvva51v758g2d09oNx+tTT0CszfgLsf + 1RXHRktwAtUJcwPMgXedrRfL1WqziJMONFiA9LJKu2iJ0TyeL6N4E8XXvbBGwcHSlLwEhBBy6E4f + URXwQVMShz+VBqxlFdB0aCKEGpS+Qpm1wjqmHA1HyFE5UF3qZDmL11NmoGwt89FUK2VfPw6XSay0 + wdz2fKiXQglbZwaYReWNrUNNO3oMCHnthmrPclJtsNEuc/gOyhsms5MdHbc4wnnPHDomJ5pFeMEs + K8AxIe1kJ5QzXkMxKscFsrYQOAHBZOTfWS55n8YWqvqP/Qg4B+2gyLSBQvDzecc2A/6L/dU2rLgL + TO2nddBkpVAVGG3E6ZVLnSUly1lSxosNDY7BNwAAAP//AwBPnKtg7gIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdc06b78cf26-SJC + - 8f42569a694415a4-SJC Connection: - keep-alive Content-Encoding: @@ -65,9 +64,15 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:24 GMT + - Wed, 18 Dec 2024 21:45:09 GMT Server: - cloudflare + Set-Cookie: + - __cf_bm=VhM7SMRUwaYcBGPmD54eZozV9ZASFwCpD2uUzfcbygQ-1734558309-1.0.1.1-PO0cbnzYww6YCdbMGHLbwDXTjy0s_I50cJaqd7OUcIUnT7C0j_EJ9CBwZm8nRmzrv2FivdnDcss9GtsBOjjypw; + path=/; expires=Wed, 18-Dec-24 22:15:09 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=tUjQ_WfZ7MP0T2BILMqKk0y_GuM5NWbc6M_7vifNjIU-1734558309829-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -79,7 +84,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "291" + - "215" openai-version: - "2020-10-01" strict-transport-security: @@ -91,13 +96,13 @@ interactions: x-ratelimit-remaining-requests: - "9999" x-ratelimit-remaining-tokens: - - "29999868" + - "29999890" x-ratelimit-reset-requests: - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_89e3d88c7f12861d7e774e452300b36d + - req_9990262c4eaef569ab37ffc7cf363319 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml index 45376f48..a14c592f 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer - is 94106\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer + is 94106", "role": "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "536" + - "459" content-type: - application/json host: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA4ySX2uDMBTF3/0UIc912M7WzreVjv1hDPo06BiSJlfNFpMsibBS+t1H1KplHezF - h/O753ju1UOAEOYMpwjTkjhaaRHe5tuX1erpfsv3z8s4ed1sHvL1191+UT9WDk+8Q+0+gLqT64qq - SgtwXMkWUwPEgU+dJtdxnETJct6ASjEQ3lZoF8YqnEWzOIyWYbTojKXiFCxO0VuAEEKH5ukrSgbf - OEXR5KRUYC0pAKf9EELYKOEVTKzl1hHZ1u0gVdKBbFqvx7qBvLbE15K1EJ1+7F8kVKGN2tmO93rO - JbdlZoBYJX2odUrjhh4DhN6bheqzjlgbVWmXOfUJ0gdOp/M2Dw8nHNGOOeWIGJsWkwtxGQNHuLCj - i2BKaAlssA7nIzXjagSC0dK/y1zKbhfnsvhP/AAoBe2AZdoA4/R84WHMgP/B/hrrj9wUxnZvHVRZ - zmUBRhvefuNcZyS5YUtG6DTHwTH4AQAA//8DAK9WW8vsAgAA + H4sIAAAAAAAAAwAAAP//jJLLbsIwEEX3+QrLa1IlECiwo1I/oKioL1WRY08SF8e27KEvxL9XDoGA + SqVuvJgz9/rO2NuIECoFnRPKa4a8sSpelB9TzG4m9eo5KWB5qxfrh9Xj8u7p+754p4OgMMUbcDyo + rrhprAKURu8xd8AQgmt6PcrG4+koTVvQGAEqyCqLcWbiYTLM4mQaJ5NOWBvJwdM5eYkIIWTbniGi + FvBJ5yQZHCoNeM8qoPNjEyHUGRUqlHkvPTKNdNBDbjSCblPPsrS78iCEcuNZiKY3SnX13fEyZSrr + TOE7fqyXUktf5w6YNzoYezSWtnQXEfLaDrU5y0mtM43FHM0adDCcpXs72m+xh8OOoUGmTjSjwQWz + XAAyqfzJTihnvAbRK/sFso2Q5gREJyP/znLJez+21NV/7HvAOVgEkVsHQvLzefs2B+GL/dV2XHEb + mPovj9DkpdQVOOvk/pVLm7PrmZgKxtOSRrvoBwAA//8DAPNQ/XLuAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdcb6d3b645e-SJC + - 8f4256a569fd6809-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:26 GMT + - Wed, 18 Dec 2024 21:45:11 GMT Server: - cloudflare Transfer-Encoding: @@ -79,7 +78,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "282" + - "224" openai-version: - "2020-10-01" strict-transport-security: @@ -91,13 +90,13 @@ interactions: x-ratelimit-remaining-requests: - "9999" x-ratelimit-remaining-tokens: - - "29999868" + - "29999890" x-ratelimit-reset-requests: - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_0f4462cd5dd31fe3e1a9d6847e563042 + - req_ad4574ef78bcc23b8c85835d827e63ec status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml index abe7094e..3cf90e2d 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer - is 94106 or 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer + is 94106 or 94107", "role": "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "545" + - "468" content-type: - application/json host: @@ -46,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFJBS8MwGL33V4ScV+m2sm69CaLTgyjIDhMpWfK1zZYmIUnFMfbfJW3X - dqjgJYf3vvfyvpecAoQwZzhFmJbE0UqL8DbfPt8V+4U8bO6PL0/rTUnWD8XrVr59PkZ44hVqtwfq - LqobqiotwHElW5oaIA686zSZx3ESJcukISrFQHhZoV0Yq3AWzeIwWobRohOWilOwOEXvAUIInZrT - R5QMvnCKoskFqcBaUgBO+yGEsFHCI5hYy60j0uHJQFIlHcgmdTTGDeS1JT6WrIXo8HN/kVCFNmpn - O77Hcy65LTMDxCrpTa1TGjfsOUDoo1movsqItVGVdplTB5DecDpdtX54qHDEdpxTjogRPOtauLbL - GDjChR01gimhJbBBOtRHasbViAhGS/8M85t3uziXxX/sB4JS0A5Ypg0wTq8XHsYM+A/211hfchMY - 26N1UGU5lwUYbXj7xrnOVjnZkVUezZc4OAffAAAA//8DAAUNxI3sAgAA + H4sIAAAAAAAAAwAAAP//jFKxbsIwFNzzFdabSRUIlMBWsYNUVepQVZHjPAdTx7ZsRy1F/HvlEJJU + UKmLh7t357tnnyJCQJSwJsD21LPayPiJf2bfxbZ+3iqeqdfjruCr3eblkLqj2cAkKHRxQOavqgem + ayPRC60uNLNIPQbX6TKdLxZZOl22RK1LlEFWGR/PdTxLZvM4yeLksRPutWDoYE3eIkIIObVniKhK + /II1SSZXpEbnaIWw7ocIAatlQIA6J5ynysNkIJlWHlWbegxb5I2jIZVqpOzwc3+P1JWxunAd3+Nc + KOH2uUXqtAqezmsDLXuOCHlv+zS/IoKxujY+9/oDVTBcLS52MCxwILuq4LWn8o7ml1leoqdCutE6 + gFG2x/LGkBCgTSn0iIhGlW+z3PO+1Baq+o/9QDCGxmOZG4ulYHf7tubhd/011q+4DQzu6DzWOReq + QmusuDwwN/mK04KueJJmEJ2jHwAAAP//AwCZn7Uj6QIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdd6bc71fa2e-SJC + - 8f4256c89c4217de-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:27 GMT + - Wed, 18 Dec 2024 21:45:17 GMT Server: - cloudflare Transfer-Encoding: @@ -79,7 +78,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "249" + - "140" openai-version: - "2020-10-01" strict-transport-security: @@ -91,13 +90,13 @@ interactions: x-ratelimit-remaining-requests: - "9999" x-ratelimit-remaining-tokens: - - "29999865" + - "29999889" x-ratelimit-reset-requests: - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_72f1a2af642dad2884e52d652e775182 + - req_92a0dc630d0d21997ff3aeede540417d status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml index 607cd8a2..84c270eb 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml @@ -1,13 +1,12 @@ interactions: - request: body: - '{"messages": [{"content": "Given the following question and a proposed - answer to the question, return the single-letter choice in the question that - matches the proposed answer. If the proposed answer is blank or an empty string, - or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is - my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer - this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: Insufficient - information\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information + to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: Insufficient + information", "role": "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "541" + - "464" content-type: - application/json host: @@ -46,18 +45,19 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6gIoDy4pS+lPeRWVWpVIWMWcGNsyzZSqyj/vTIQ - IGoq9cJhvp1hduHoIYRZgVOEaU0sbRQPtuXb/u4pP7w0z69kt3+I6221q/fN/SOJNPadQ+afQO3Z - dUNlozhYJkWPqQZiwaUuVnGSrMLVetmBRhbAna1SNkhkEIVREoTrIFwOxloyCgan6N1DCKFj93QV - RQFfOEWhf1YaMIZUgNNxCCGsJXcKJsYwY4mw2J8glcKC6FrfznUNZWuIqyVazgf9NL6Iy0ppmZuB - j3rJBDN1poEYKVyosVLhjp48hD66hdqLjlhp2SibWXkA4QIXi6jPw9MJZ3RgVlrC56bYvxKXFWAJ - 42Z2EUwJraGYrNP5SFswOQPebOnfZa5l94szUf0nfgKUgrJQZEpDwejlwtOYBveD/TU2HrkrjM23 - sdBkJRMVaKVZ/41LlW1KkpNNGcZr7J28HwAAAP//AwBPQ8gX7AIAAA== + H4sIAAAAAAAAAwAAAP//jFLLbtswELzrKwiercJO/JB9cwOk7aHXXopCYMilxIbiMtxV4iLwvxeU + bclBUqAXHmZ2hjOLfS2EkM7InZC6Vay76Mu9fakO62Z/f9duH5+rz+tKfz18WS/2P56+P8tZVuDD + b9B8UX3S2EUP7DCcaJ1AMWTXxeZ2uVpVt4vVQHRowGdZE7lcYnkzv1mW86qcr8/CFp0GkjvxsxBC + iNfhzRGDgYPcifnsgnRApBqQu3FICJnQZ0QqIkesAsvZRGoMDGFI/S1Qb63TDgILFyymTuX4glGo + QC+QBLeOxFMPNNa6/AG2J5VbhN77M34cc3lsYsIHOvMjbl1w1NYJFGHIGYgxyoE9FkL8Gvr3byrJ + mLCLXDM+QsiGVXWyk9PCJ3Jz5hhZ+QnermYfmNUGWDlPV+uTWukWzKScdq164/CKKK4qv8/ykfep + tgvN/9hPhNYQGUwdExin3/adxhLka/zX2LjiIbCkP8TQ1daFBlJM7nQQNtZqszWVUXphZXEs/gIA + AP//AwDr0DopGQMAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdd11ed0175e-SJC + - 8f4256aaff33ce5c-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +65,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:27 GMT + - Wed, 18 Dec 2024 21:45:16 GMT Server: - cloudflare Transfer-Encoding: @@ -79,7 +79,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "196" + - "1096" openai-version: - "2020-10-01" strict-transport-security: @@ -91,13 +91,13 @@ interactions: x-ratelimit-remaining-requests: - "9999" x-ratelimit-remaining-tokens: - - "29999867" + - "29999890" x-ratelimit-reset-requests: - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_9dd0f40823dceb910336a862f0513c68 + - req_d492552c2043885324019334103f08d1 status: code: 200 message: OK diff --git a/tests/test_utils.py b/tests/test_utils.py index e3e7af1c..0962e24c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -137,7 +137,7 @@ def _assert_prompt_is_valid( *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, "the answer is 14004", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="didnt-match-and-no-llm-innate-knowledge", ), pytest.param( @@ -158,35 +158,35 @@ def _assert_prompt_is_valid( *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, "the answer is 94106 or 94107", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="matched-several-options", ), pytest.param( *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, "", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="empty-answer1", ), pytest.param( *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, "14", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="didnt-match-and-llm-has-innate-knowledge", ), pytest.param( *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, "", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="empty-answer2", ), pytest.param( *LITQA2_QUESTION_IDEAL_DISTRACTORS, "", MultipleChoiceEvaluation.INCORRECT, - "0", + None, id="empty-answer3", ), ], @@ -198,7 +198,7 @@ async def test_grade( distractors: str | list[str], actual_answer: str, expected_eval: MultipleChoiceEvaluation, - expected_extracted_answer: str, + expected_extracted_answer: str | None, ) -> None: """Tests that we can create a multiple choice question and evaluate answers.""" mc_question = MultipleChoiceQuestion( @@ -208,7 +208,7 @@ async def test_grade( shuffle_seed=42, # Seed for VCR cassette ) self._assert_prompt_is_valid(mc_question, question, ideal_answer, distractors) - evaluation, _, graded_answer = await mc_question.grade(actual_answer) + evaluation, graded_answer = await mc_question.grade(actual_answer) assert evaluation == expected_eval if evaluation == MultipleChoiceEvaluation.CORRECT: assert graded_answer == ideal_answer From 90dbcf4cbcac959e9d922ea928a3843ba2d947c4 Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 18 Dec 2024 13:48:54 -0800 Subject: [PATCH 3/3] Moved back to gpt-4o-mini --- src/aviary/utils.py | 2 +- ...t-match-and-llm-has-innate-knowledge].yaml | 44 ++++++++++-------- ...nt-match-and-no-llm-innate-knowledge].yaml | 44 ++++++++++-------- ...AEvaluation.test_grade[empty-answer1].yaml | 40 ++++++++-------- ...AEvaluation.test_grade[empty-answer2].yaml | 36 +++++++-------- ...AEvaluation.test_grade[empty-answer3].yaml | 36 +++++++-------- ...on.test_grade[matched-correct-option].yaml | 44 ++++++++---------- ....test_grade[matched-incorrect-option].yaml | 36 +++++++-------- ...n.test_grade[matched-several-options].yaml | 45 ++++++++++-------- ...ion.test_grade[matched-unsure-option].yaml | 38 +++++++-------- .../test_eval_answer[llm basic].yaml | 46 +++++++++---------- tests/cassettes/test_eval_llm_config.yaml | 36 +++++++-------- .../test_extract_answer[complex].yaml | 36 +++++++-------- .../test_extract_answer[empty-proposal].yaml | 44 ++++++++++-------- .../test_extract_answer[gave-two].yaml | 44 ++++++++++-------- .../test_extract_answer[not in options].yaml | 44 +++++++++--------- 16 files changed, 320 insertions(+), 295 deletions(-) diff --git a/src/aviary/utils.py b/src/aviary/utils.py index fcbf130f..b495f18b 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -20,7 +20,7 @@ import numpy as np -DEFAULT_EVAL_MODEL_NAME = "gpt-4o" +DEFAULT_EVAL_MODEL_NAME = "gpt-4o-mini" LLM_BOOL_EVAL_CONFIG: dict[str, Any] = { "prompt": ( "Here is a question, the correct answer to the question, and a proposed answer" diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml index c87a0ba3..617568d2 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml @@ -6,7 +6,7 @@ interactions: ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information to answer this question\ncheesecake\n11\n42\n\nProposed answer: 14", "role": - "user"}], "model": "gpt-4o", "temperature": 0}' + "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "437" + - "442" content-type: - application/json host: @@ -35,7 +35,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "1" + - "0" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -45,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA4xSy07DMBC85yusPTcobVJIe6u4wKESIHFCKHKdTWpwvJa9FaCq/46cvtUicfFh - Zmc8s/Y6EQJ0DVMBailZdc6ks+Zrko2cxvljMXvunmavPHkJTA9zk9/DICpo8YGK96obRZ0zyJrs - llYeJWN0Hd7lxXhc5sOyJzqq0URZ6zgtKB1loyLNyjS73QmXpBUGmIq3RAgh1v0ZI9oav2EqssEe - 6TAE2SJMD0NCgCcTEZAh6MDSMgyOpCLLaPvUp7DHZhVkTGVXxuzwzeEeQ63ztAg7/oA32uqwrDzK - QDZ6BiYHPbtJhHjv+6zOIoLz1DmumD7RRsNyvLWD4wKP5K4qMLE0VzRnZlWNLLUJJ+sAJdUS6wtD - IUCuak0nRHJS+TLLNe9tbW3b/9gfCaXQMdaV81hrdbVvbx5/119jhxX3gSH8BMauarRt0Tuvtw/c - uGrSyIWcNFleQrJJfgEAAP//AwA5BypS6QIAAA== + H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyh9P24V0AMHBBK9gFDk2pvU4NiWveVV9d+R00da + tUhcfJjZGc+svU4YAyVhwkAsOYnK6XRafF7fzuWPeLwZP4v7p5maTwd3GD6yB5pBKyrs4g0F7VVX + wlZOIylrtrTwyAmja3vY7fX7o357VBOVlaijrHSU9mxaKaPSTtbppdkwbY926qVVAgNM2EvCGGPr + +ow5jcQvmLCstUcqDIGXCJPDEGPgrY4I8BBUIG4IWg0prCE0dfRj2GOxCjxGMyutd/jmcI+2pfN2 + EXb8AS+UUWGZe+TBmugZyDqo2U3C2GvdZ3USEZy3laOc7DuaaDjqb+2g2WJD7qoCWeL6gubELJdI + XOlwtA4QXCxRnhkyBnwllT0ikqPK51kueW9rK1P+x74hhEBHKHPnUSpxsW9tHr/YX2OHFdeBIXwH + wiovlCnRO6+2D1y4vDvmvUyMBzyDZJP8AgAA//8DADaBBszuAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f4256d28f9615ff-SJC + - 8f425bb2ac70f953-SJC Connection: - keep-alive Content-Encoding: @@ -64,9 +64,15 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:45:18 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare + Set-Cookie: + - __cf_bm=Z3Wkkk2LQA2GKAPZVirKPYLTJfmm9Luttv26RxPBKro-1734558518-1.0.1.1-4BZR47qupd.QCWRMrfyj_F2lS0fqBEuzxwPZTqYPUxSKwdzL4S_8YWk9ofOPXhFEnkMN6nwgWjBLjAR4nioxiQ; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=B7CeJKL1WXveU2pmeUGy_AFjPsbf25SvdiSN_4fxTXE-1734558518441-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -78,25 +84,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "208" + - "144" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999896" + - "149999896" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_7db1d1f6dded4679e43cc12a2183fa21 + - req_503cd8163bd0d3b634eb723d6874b1da status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml index f043c502..77357e4c 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml @@ -6,7 +6,7 @@ interactions: ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer - is 14004", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + is 14004", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "459" + - "464" content-type: - application/json host: @@ -35,7 +35,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "1" + - "0" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -45,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJLNTsMwEITveQprzw1K/6DNDakIwYkLqgRFkeNsEhfHNrarUlV9d2S3 - TVJRJC4+7Lcznl17HxECvICUAKupY40W8X25ndktPqzrRb37envdVk/LxfKRvkx4/gwDr1D5Gpk7 - q26YarRAx5U8YmaQOvSuw7vxZDqdjYdJAI0qUHhZpV08UfEoGU3iZBYntydhrThDCyl5jwghZB9O - H1EW+A0pCTah0qC1tEJI2yZCwCjhK0Ct5dZR6WDQQaakQxlSr2AFfWSw3Fjqk8mNEKf6ob1LqEob - ldsTb+sll9zWmUFqlfS+1ikNgR4iQj7CTJuLmKCNarTLnPpE6Q3nw6MddEvs4Jk55ajoaUaDK2ZZ - gY5yYXsrAUZZjUWn7PZHNwVXPRD1Rv6d5Zr3cWwuq//Yd4Ax1A6LTBssOLuct2sz6H/YX23tikNg - sDvrsMlKLis02vDjI5c6m5c0p/MyGc8gOkQ/AAAA//8DAEsANTftAgAA + H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kyqB8MoNoR7aAxdKVamqImNvElPHtmyjPhD/XjlQ + AoJKvfgwszOeWXsXEQKCQ06A1dSzxsh4Vn7M71fZMntcjAbfz6uXjTSbh9mCPk2Xc+gFhV5vkPlf + 1R3TjZHohVYHmlmkHoNrOh5kw+FkmE5aotEcZZBVxseZjhuhRNxP+lmcjON0clTXWjB0kJPXiBBC + du0ZciqOn5CTpPeLNOgcrRDy0xAhYLUMCFDnhPNUeeh1JNPKo2qjn8MWy62jIZraSnnE96d7pK6M + 1Wt35E94KZRwdWGROq2Cp/PaQMvuI0Le2j7bi4hgrG6ML7x+RxUMp+nBDrotduSxKnjtqbyhuTAr + OHoqpDtbBzDKauRXhoQA3XKhz4jorPJ1llveh9pCVf+x7wjG0HjkhbHIBbvZtzUPX+yvsdOK28Dg + vpzHpiiFqtAaKw4PXJpiVLI0wTTBNUT76AcAAP//AwBkI2np7gIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f42569f7d2a2379-SJC + - 8f425bb11b702519-SJC Connection: - keep-alive Content-Encoding: @@ -64,9 +64,15 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:45:10 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare + Set-Cookie: + - __cf_bm=6j4w6Jnsg0wGsZf61WcNCvHdr1Vcb6uVLFFhTQQgcv4-1734558518-1.0.1.1-D0vsT8nCM66xiA.Xa6ijXpgeGPM65Iux2KhQqUiD8wToq.VmwT03dnkmELw1qn0GvHJvh8g7H6WkqYzXVgs2Xg; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=LFVOxysXKxTPNQ2KK05aqbBnIRDPc45hskCPkFcOjXA-1734558518178-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -78,25 +84,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "240" + - "131" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999890" + - "149999890" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_363f6da2908247ad8c711b11d1593ae7 + - req_12c5e1cdb8b2ba32b075f04f20194421 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml index 662716da..6865d713 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml @@ -6,7 +6,7 @@ interactions: ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: ", "role": - "user"}], "model": "gpt-4o", "temperature": 0}' + "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "440" + - "445" content-type: - application/json host: @@ -45,18 +45,16 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA4xSy27CMBC85yusPZMqkAApt6rHSr0U9VJVkbE3wa3jtWyj8hD/XjlAAEGlXnyY - 2RnPrL1LGAMlYcZALHkQrdXpU/1TbhfrIn+j+Vxs281Cv7/k9JrN1/gMg6igxReKcFI9CGqtxqDI - HGjhkAeMrsNpXozHZT6cdkRLEnWUNTakBaWjbFSkWZlmk6NwSUqghxn7SBhjbNedMaKRuIYZywYn - pEXveYMw64cYA0c6IsC9Vz5wE2BwJgWZgKZLfQk7rFeex1RmpfUR3/f3aGqso4U/8j1eK6P8snLI - PZno6QNZ6Nh9wthn12d1FRGso9aGKtA3mmhYTg52cF7gmTxWhUCB6zuaK7NKYuBK+4t1gOBiifLG - kDHgK6nogkguKt9mued9qK1M8x/7MyEE2oCysg6lEnf7dubxd/011q+4Cwx+4wO2Va1Mg846dXjg - 2lZ8+ihLycWwhmSf/AIAAP//AwBsWlME6QIAAA== + H4sIAAAAAAAAAwAAAP//jFJdS8MwFH3vrwj3eZV2X46+6RAR0T2JikjJkts2miYhSVEZ+++Srms3 + NsGXPJxzz8k5N9lEhIDgkBFgFfWsNjK+Kr6WN6vb8XL+9LqSxj3Qu2R1ff/8yCfiBUZBodcfyPxe + dcF0bSR6odWOZhapx+CaXk6ms9lili5aotYcZZCVxsdTHddCiXicjKdxchmni05dacHQQUbeIkII + 2bRnyKk4fkNGktEeqdE5WiJk/RAhYLUMCFDnhPNUeRgNJNPKo2qjH8IWi8bREE01Unb4tr9H6tJY + vXYd3+OFUMJVuUXqtAqezmsDLbuNCHlv+zRHEcFYXRufe/2JKhgu5js7GLY4kF1V8NpTeUZzZJZz + 9FRId7AOYJRVyE8MCQHacKEPiOig8mmWc9672kKV/7EfCMbQeOS5scgFO9u3NQ9f7K+xfsVtYHA/ + zmOdF0KVaI0VuwcuTD4vWJpgmuAaom30CwAA//8DAL0A1qzuAgAA headers: - CF-Cache-Status: - - DYNAMIC CF-RAY: - - 8f4256cce931eb29-SJC + - 8f425bb5de5996de-SJC Connection: - keep-alive Content-Encoding: @@ -64,7 +62,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:45:17 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -75,28 +73,30 @@ interactions: - X-Request-ID alt-svc: - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC openai-organization: - future-house-xr4tdh openai-processing-ms: - - "217" + - "233" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999895" + - "149999896" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_520872e529ccbb680d27a3729fbe637e + - req_0c845e0049332bd1fa73fdbe76005ea1 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml index 483daa67..4a0fa4ae 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml @@ -6,7 +6,7 @@ interactions: ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information to answer this question\ncheesecake\n11\n42\n\nProposed answer: ", "role": "user"}], - "model": "gpt-4o", "temperature": 0}' + "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "435" + - "440" content-type: - application/json host: @@ -45,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFKxbsIwFNzzFdabSRUg0CRbh7ZDB9Shqqqqioz9Etw6tmU7AoT498oB - QhBU6uLh7t357tm7iBAQHAoCbEU9a4yMH6p1Pp5/uKf59rF5b95e08X4hanFZv3sMhgFhV5+I/Mn - 1R3TjZHohVYHmlmkHoPr+H6azmbZdJx3RKM5yiCrjY9THU+SSRonWZzMj8KVFgwdFOQzIoSQXXeG - iIrjBgqSjE5Ig87RGqHohwgBq2VAgDonnKfKw+hMMq08qi71ELZYtY6GVKqV8ojv+3ukro3VS3fk - e7wSSrhVaZE6rYKn89pAx+4jQr66Pu1FRDBWN8aXXv+gCoZZerCD8wLP5LEqeO2pvKG5MCs5eiqk - G6wDGGUr5FeGhABtudADIhpUvs5yy/tQW6j6P/ZngjE0HnlpLHLBbvbtzMPv+musX3EXGNzWeWzK - SqgarbHi8MCVKfOKLmleJdMMon30CwAA//8DAFd1apnpAgAA + H4sIAAAAAAAAAwAAAP//jFJda8IwFH3vrwj32Q6tLTrfhjB8FhyyMUpMbttomoQk3Qfifx+ptXXo + YC95OOeek3NucowIAcFhQYBV1LPayPip+Fw+rz5e6221zta4fKENiu1qszkk2z2MgkLv9sj8RfXA + dG0keqHVmWYWqcfgOplN0yybZ5PHlqg1RxlkpfFxquNaKBEn4ySNx7N4Mu/UlRYMHSzIW0QIIcf2 + DDkVxy9YkPHogtToHC0RFv0QIWC1DAhQ54TzVHkYDSTTyqNqo1/DFovG0RBNNVJ2+Km/R+rSWL1z + Hd/jhVDCVblF6rQKns5rAy17igh5b/s0vyKCsbo2Pvf6gCoYztOzHQxbHMiuKnjtqbyj+WWWc/RU + SHe1DmCUVchvDAkB2nChr4joqvJtlnve59pClf+xHwjG0HjkubHIBbvbtzUPX+yvsX7FbWBw385j + nRdClWiNFecHLkzOxwnPppNdOoPoFP0AAAD//wMAMCnsc+4CAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f4256d7fd75cf0d-SJC + - 8f425bb72f9b67dc-SJC Connection: - keep-alive Content-Encoding: @@ -64,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:45:19 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -78,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "520" + - "532" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999896" + - "149999896" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_f943511f12de0306ff59cccd017e98f1 + - req_ed9d0e7998f792094d5aefe723693f28 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml index 46a00d4f..f6e5e085 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml @@ -6,7 +6,7 @@ interactions: ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\ncryo EM\nInsufficient information to answer this question\nNMR\nx-ray crystallography\ncircular dichroism\n\nProposed - answer: ", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + answer: ", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "467" + - "472" content-type: - application/json host: @@ -45,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hViz3FxXs3jFkIooZSQQ3spxSjy2lYrS6qkkJaQ/14k - p7ZDUujFhx3N55mVjhEhwDOYE2AldazSIl7kh9lgtZmMt4+r4fPL08NivN5uPvvLw3Kzhp53qN07 - MvfrumOq0gIdV7KWmUHq0FP7k+FoPJ4OB0kQKpWh8LZCu3ik4kEyGMXJNE7uz8ZScYYW5uQ1IoSQ - Y/j6iDLDL5iTgAmTCq2lBcK8OUQIGCX8BKi13DoqHfRakSnpUIbU3bHBfG+pTyX3Qpznp+Y/QhXa - qJ09680855LbMjVIrZKeaZ3SENRTRMhb6LO/iAjaqEq71KkPlB4469c4aBfYiueq4JSj4obnApZm - 6CgXtrMOYJSVmF0BCQG6z7jqCFGn8nWWW+y6NpfFf/CtwBhqh1mqDWac3ewb4P51/XWsWXEIDPbb - OqzSnMsCjTa8vuBcp7Oc7ugsT4ZTiE7RDwAAAP//AwBp6cyD6QIAAA== + H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kypAKIRbH+pDVU+9VK2qyDibxNSxLXtRSxH/Xjm8 + gqBSLz7M7Ixn1l5FjIEsYMpA1JxEY1V8VX7d3N2/PP7czp9f39QwmTzJ5UNqqb52M+gFhZnNUdBO + dSFMYxWSNHpDC4ecMLj2x8N0NJqM+llLNKZAFWSVpTg1cSO1jAfJII2TcdyfbNW1kQI9TNl7xBhj + q/YMOXWB3zBlSW+HNOg9rxCm+yHGwBkVEODeS09cE/QOpDCaULfRu7DDcuF5iKYXSm3x9f4eZSrr + zMxv+T1eSi19nTvk3ujg6clYaNl1xNhH22dxFBGsM42lnMwn6mCY9Td2cNjigdxWBTLE1RnNkVle + IHGpfGcdILiosTgxZAz4opCmQ0SdyqdZznlvaktd/cf+QAiBlrDIrcNCirN9W/Pwxf4a26+4DQx+ + 6QmbvJS6Qmed3DxwafNhxtNEZJc8gWgd/QIAAP//AwCjNKe67gIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f4256dfbb4a7ac7-SJC + - 8f425bbaab9a236e-SJC Connection: - keep-alive Content-Encoding: @@ -64,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:45:20 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -78,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "207" + - "231" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999889" + - "149999888" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_06b5243be6b66dfc6827056b2c49f4c4 + - req_427dff29f2a632ec0882c27c797f5d5a status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml index f9092a53..f126cb68 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml @@ -6,7 +6,7 @@ interactions: ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer - is 94107", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + is 94107", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "459" + - "464" content-type: - application/json host: @@ -35,7 +35,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "0" + - "1" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -45,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJLLTsMwEEX3+QrL6wSlL9pkx4KHVLFDdIFQ5DiTxOB4LNtRgar/jpyG - JBVFYuPFnLnXd8Y+BIRQUdCUUF4zxxsto5tyvzHPt1X5uJvva51v758g2d09oNx+tTT0CszfgLsf - 1RXHRktwAtUJcwPMgXedrRfL1WqziJMONFiA9LJKu2iJ0TyeL6N4E8XXvbBGwcHSlLwEhBBy6E4f - URXwQVMShz+VBqxlFdB0aCKEGpS+Qpm1wjqmHA1HyFE5UF3qZDmL11NmoGwt89FUK2VfPw6XSay0 - wdz2fKiXQglbZwaYReWNrUNNO3oMCHnthmrPclJtsNEuc/gOyhsms5MdHbc4wnnPHDomJ5pFeMEs - K8AxIe1kJ5QzXkMxKscFsrYQOAHBZOTfWS55n8YWqvqP/Qg4B+2gyLSBQvDzecc2A/6L/dU2rLgL - TO2nddBkpVAVGG3E6ZVLnSUly1lSxosNDY7BNwAAAP//AwBPnKtg7gIAAA== + H4sIAAAAAAAAAwAAAP//jJI/b8MgEMV3fwrEbFf+FznxllSpOlTKVHWoKovA2abFgACrSaN89won + jR01lbow3O/e493BIUAIc4ZLhGlLHO20iJb15/3DelPvN7De7lY6fukfnxl7ylaL5RcOvUJt34G6 + H9UdVZ0W4LiSJ0wNEAfeNSmyfDabz5LFADrFQHhZo12Uq6jjkkdpnOZRXETJ/KxuFadgcYleA4QQ + OgynzykZ7HCJ4vCn0oG1pAFcXpoQwkYJX8HEWm4dkQ6HI6RKOpBD9EWexMWUGah7S3w+2Qtxrh8v + lwnVaKO29swv9ZpLbtvKALFKemPrlMYDPQYIvQ1D9Vc5sTaq065y6gOkN1wkJzs8rnKE6Zk55YiY + aLLwhlnFwBEu7GQnmBLaAhuV4wJJz7iagGAy8u8st7xPY3PZ/Md+BJSCdsAqbYBxej3v2GbA/7O/ + 2i4rHgJju7cOuqrmsgGjDT+9cq0rFqdsliXbvMDBMfgGAAD//wMAitN9t/MCAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f42569a694415a4-SJC + - 8f425bb60cfe17e4-SJC Connection: - keep-alive Content-Encoding: @@ -64,15 +64,9 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:45:09 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare - Set-Cookie: - - __cf_bm=VhM7SMRUwaYcBGPmD54eZozV9ZASFwCpD2uUzfcbygQ-1734558309-1.0.1.1-PO0cbnzYww6YCdbMGHLbwDXTjy0s_I50cJaqd7OUcIUnT7C0j_EJ9CBwZm8nRmzrv2FivdnDcss9GtsBOjjypw; - path=/; expires=Wed, 18-Dec-24 22:15:09 GMT; domain=.api.openai.com; HttpOnly; - Secure; SameSite=None - - _cfuvid=tUjQ_WfZ7MP0T2BILMqKk0y_GuM5NWbc6M_7vifNjIU-1734558309829-0.0.1.1-604800000; - path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -84,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "215" + - "538" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999890" + - "149999891" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_9990262c4eaef569ab37ffc7cf363319 + - req_9bd9d799783ab13ef59ce8e5ca7fd25f status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml index a14c592f..842cfbf0 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml @@ -6,7 +6,7 @@ interactions: ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer - is 94106", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + is 94106", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "459" + - "464" content-type: - application/json host: @@ -45,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJLLbsIwEEX3+QrLa1IlECiwo1I/oKioL1WRY08SF8e27KEvxL9XDoGA - SqVuvJgz9/rO2NuIECoFnRPKa4a8sSpelB9TzG4m9eo5KWB5qxfrh9Xj8u7p+754p4OgMMUbcDyo - rrhprAKURu8xd8AQgmt6PcrG4+koTVvQGAEqyCqLcWbiYTLM4mQaJ5NOWBvJwdM5eYkIIWTbniGi - FvBJ5yQZHCoNeM8qoPNjEyHUGRUqlHkvPTKNdNBDbjSCblPPsrS78iCEcuNZiKY3SnX13fEyZSrr - TOE7fqyXUktf5w6YNzoYezSWtnQXEfLaDrU5y0mtM43FHM0adDCcpXs72m+xh8OOoUGmTjSjwQWz - XAAyqfzJTihnvAbRK/sFso2Q5gREJyP/znLJez+21NV/7HvAOVgEkVsHQvLzefs2B+GL/dV2XHEb - mPovj9DkpdQVOOvk/pVLm7PrmZgKxtOSRrvoBwAA//8DAPNQ/XLuAgAA + H4sIAAAAAAAAA4ySPW/CMBCG9/wKyzOpEggfYauqbgwsnaoqMs4lmDo+y74IWsR/rxw+ElQqdfFw + z72v3zv7GDHGVcmXjMutINlYHT9X+5dXO1mtdEbYzla7w/z7sH5bi30+rfkoKHCzA0lX1ZPExmog + heaMpQNBEFzT+SSbThfTdNGBBkvQQVZbijOMG2VUPE7GWZzM43RxUW9RSfB8yd4jxhg7dmfIaUo4 + 8CVLRtdKA96LGvjy1sQYd6hDhQvvlSdhiI96KNEQmC56nqXJbMgcVK0XIZ9ptb7UT7fLNNbW4cZf + +K1eKaP8tnAgPJpg7Akt7+gpYuyjG6q9y8mtw8ZSQfgJJhjm6dmO96vs4fjCCEnogWYyemBWlEBC + aT/YCZdCbqHslf0CRVsqHIBoMPLvLI+8z2MrU//HvgdSgiUoC+ugVPJ+3r7NQfhnf7XdVtwF5v7L + EzRFpUwNzjp1fuXKFpNcZInMZyLh0Sn6AQAA//8DAL5Pl0/zAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f4256a569fd6809-SJC + - 8f425bb64ed0fa36-SJC Connection: - keep-alive Content-Encoding: @@ -64,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:45:11 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -78,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "224" + - "247" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999890" + - "149999891" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_ad4574ef78bcc23b8c85835d827e63ec + - req_eb9ad02601ae4b1b2b579657ed9a7bef status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml index 3cf90e2d..26df9d56 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml @@ -6,7 +6,8 @@ interactions: ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer - is 94106 or 94107", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + is 94106 or 94107", "role": "user"}], "model": "gpt-4o-mini", "temperature": + 0}' headers: accept: - application/json @@ -15,7 +16,7 @@ interactions: connection: - keep-alive content-length: - - "468" + - "473" content-type: - application/json host: @@ -35,7 +36,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "1" + - "0" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -45,18 +46,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFKxbsIwFNzzFdabSRUIlMBWsYNUVepQVZHjPAdTx7ZsRy1F/HvlEJJU - UKmLh7t357tnnyJCQJSwJsD21LPayPiJf2bfxbZ+3iqeqdfjruCr3eblkLqj2cAkKHRxQOavqgem - ayPRC60uNLNIPQbX6TKdLxZZOl22RK1LlEFWGR/PdTxLZvM4yeLksRPutWDoYE3eIkIIObVniKhK - /II1SSZXpEbnaIWw7ocIAatlQIA6J5ynysNkIJlWHlWbegxb5I2jIZVqpOzwc3+P1JWxunAd3+Nc - KOH2uUXqtAqezmsDLXuOCHlv+zS/IoKxujY+9/oDVTBcLS52MCxwILuq4LWn8o7ml1leoqdCutE6 - gFG2x/LGkBCgTSn0iIhGlW+z3PO+1Baq+o/9QDCGxmOZG4ulYHf7tubhd/011q+4DQzu6DzWOReq - QmusuDwwN/mK04KueJJmEJ2jHwAAAP//AwCZn7Uj6QIAAA== + H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kyoJz3Jrq35ARakqVVVk7E0wdWzLXkQrxL9XDhBA + UKkXH2Z2xjNrbxPGQEmYMhBLTqJxOn2oNk/PzWJFr+KlmM03j/NZ8TbavMt+UVfQiwq7WKGgo+pO + 2MZpJGXNnhYeOWF0zcf9wXA4GeaTlmisRB1ltaN0YNNGGZUWWTFIs3GaTw7qpVUCA0zZR8IYY9v2 + jDmNxG+Ysqx3RBoMgdcI026IMfBWRwR4CCoQNwS9EymsITRt9HPYY7UOPEYza60P+K67R9vaebsI + B77DK2VUWJYeebAmegayDlp2lzD22fZZX0QE523jqCT7hSYa3g/3dnDa4ok8VAWyxPUNzYVZKZG4 + 0uFsHSC4WKK8MmQM+Foqe0YkZ5Wvs9zy3tdWpv6P/YkQAh2hLJ1HqcTNvq15/GJ/jXUrbgND+AmE + TVkpU6N3Xu0fuHLlqBJ5hnmGC0h2yS8AAAD//wMALlTCsO4CAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f4256c89c4217de-SJC + - 8f425bb169ea22f6-SJC Connection: - keep-alive Content-Encoding: @@ -64,9 +65,15 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:45:17 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare + Set-Cookie: + - __cf_bm=d8n1B6AzFA1xougxyBgoPLD0ITgb.iimKMM9kNYr6NA-1734558518-1.0.1.1-c8MRCOD4wNoPcANGb9a6gOWsl6NhHqx911Ktp.RARxFa..7XVR9hKaZVQ2nRa8g.bTL2e2pT7EpsuMaFLlx6Sw; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=DPEKvT7hx6XvGnKxQqNrPq5Y4dSqkyQo4hPKRlWd79E-1734558518261-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -78,25 +85,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "140" + - "168" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999889" + - "149999888" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_92a0dc630d0d21997ff3aeede540417d + - req_becb26d30d1adf2d410f311a4664a6b2 status: code: 200 message: OK diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml index 84c270eb..5b56af9b 100644 --- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml @@ -6,7 +6,7 @@ interactions: ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: Insufficient - information", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + information", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "464" + - "469" content-type: - application/json host: @@ -45,19 +45,19 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFLLbtswELzrKwiercJO/JB9cwOk7aHXXopCYMilxIbiMtxV4iLwvxeU - bclBUqAXHmZ2hjOLfS2EkM7InZC6Vay76Mu9fakO62Z/f9duH5+rz+tKfz18WS/2P56+P8tZVuDD - b9B8UX3S2EUP7DCcaJ1AMWTXxeZ2uVpVt4vVQHRowGdZE7lcYnkzv1mW86qcr8/CFp0GkjvxsxBC - iNfhzRGDgYPcifnsgnRApBqQu3FICJnQZ0QqIkesAsvZRGoMDGFI/S1Qb63TDgILFyymTuX4glGo - QC+QBLeOxFMPNNa6/AG2J5VbhN77M34cc3lsYsIHOvMjbl1w1NYJFGHIGYgxyoE9FkL8Gvr3byrJ - mLCLXDM+QsiGVXWyk9PCJ3Jz5hhZ+QnermYfmNUGWDlPV+uTWukWzKScdq164/CKKK4qv8/ykfep - tgvN/9hPhNYQGUwdExin3/adxhLka/zX2LjiIbCkP8TQ1daFBlJM7nQQNtZqszWVUXphZXEs/gIA - AP//AwDr0DopGQMAAA== + H4sIAAAAAAAAAwAAAP//jFJNj9MwEL3nV1g+NygpLc32tnwcOC2CA0gIRa4zTgZsj9eeaIFV/zty + 2iZdsUhcfHhv3vN7o3kshJDYyb2QelCsXbDlrXl48y58+njv4i3s3n7+cPf6d1g3d1XT0Be5ygo6 + fAfNF9ULTS5YYCR/onUExZBd693LzXbbbOtmIhx1YLOsD1xuqHTosVxX601Z7cq6OasHQg1J7sXX + QgghHqc35/Qd/JR7Ua0uiIOUVA9yPw8JISPZjEiVEiZWnuVqITV5Bj9Ff+/TaAxqBM8CvaHoVO4g + mITy6QGi4AGTuB8hzd0uf4AZk8pV/GjtGT/OuSz1IdIhnfkZN+gxDW0ElcjnDIkpyIk9FkJ8m/qP + TyrJEMkFbpl+gM+GTXOyk8vWF3J35phY2QW+2a6eMWs7YIU2Xa1PaqUH6Bblsms1dkhXRHFV+e8s + z3mfaqPv/8d+IbSGwNC1IUKH+mnfZSxCPsl/jc0rngLL9CsxuNag7yGGiKeDMKF9ZXRdQV3BQRbH + 4g8AAAD//wMAIVEMVh4DAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f4256aaff33ce5c-SJC + - 8f425bb5cdb77ac1-SJC Connection: - keep-alive Content-Encoding: @@ -65,7 +65,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:45:16 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -79,25 +79,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "1096" + - "262" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999890" + - "149999890" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_d492552c2043885324019334103f08d1 + - req_ca5799089a4ca130483ac0a6fa172710 status: code: 200 message: OK diff --git a/tests/cassettes/test_eval_answer[llm basic].yaml b/tests/cassettes/test_eval_answer[llm basic].yaml index 63f1bb18..18f9bfd9 100644 --- a/tests/cassettes/test_eval_answer[llm basic].yaml +++ b/tests/cassettes/test_eval_answer[llm basic].yaml @@ -7,7 +7,7 @@ interactions: other output is permitted.\n\nQuestion: Which of the following is most likely true:\n\nA) Piggie, B) Pigeon, C) Gerald\n\n\nCorrect answer: C\n\nProposed answer: Based on all factors considered, the most compelling answer is Gerald, - C", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + C", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -16,7 +16,7 @@ interactions: connection: - keep-alive content-length: - - "516" + - "521" content-type: - application/json host: @@ -46,18 +46,16 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJI/b8IwEMX3fArLM6kCpPzbGDpQVahSh4pWVWTsS+Li+CzbkaCI7145 - AZKqVOri4X73nt+dfYwIoVLQBaG8ZJ5XRsXL/G09e3zePyX1YbxZrhzuynKTi9WXW7/SQVDg9hO4 - v6juOFZGgZeoW8wtMA/BdTgdp+k0mc5GDahQgAqywvg4xXiUjNI4mcXJ5CwsUXJwdEHeI0IIOTZn - iKgF7OmCJINLpQLnWAF0cW0ihFpUoUKZc9J5pj0ddJCj9qCb1JuHlz6xkNeOhWC6VupcP12vUlgY - i1t35td6LrV0ZWaBOdTB1nk0tKGniJCPZqT6R0pqLFbGZx53oIPhMLlv/Wi3xB49M4+eqb5oMrhh - lwnwTCrX2wnljJcgOmm3QFYLiT0Q9Yb+HeaWdzu41MV/7DvAORgPIjMWhOQ/B+7aLIQv9lfbdclN - YOoOzkOV5VIXYI2V7SvnJpvnbMvmeTKe0egUfQMAAP//AwAWS34s7gIAAA== + H4sIAAAAAAAAA4ySy2rDMBBF9/4KoXVcnMTOa1dKCl00lAZKHxSjSGNbjSwJaUJTSv69yHnYoSl0 + o8WcuVd3RvqOCKFS0BmhvGLIa6vi6+LzZr54vh2tB/Xr0zgTi8Lh/ZY93vHlA+0FhVl9AMej6oqb + 2ipAafQecwcMIbj2x8M0yyZZf9KA2ghQQVZajFMT11LLeJAM0jgZx/3JQV0ZycHTGXmLCCHkuzlD + Ti1gS2ck6R0rNXjPSqCzUxMh1BkVKpR5Lz0yjbTXQm40gm6iv8yXXeKg2HgW0umNUof67nSVMqV1 + ZuUP/FQvpJa+yh0wb3Sw9WgsbeguIuS9GWlzlpJaZ2qLOZo16GDYT7K9H2032aEHhgaZ6opGvQt2 + uQBkUvnOTihnvALRStsFso2QpgOiztC/w1zy3g8udfkf+xZwDhZB5NaBkPx84LbNQfhnf7WdltwE + pv7LI9R5IXUJzjq5f+XC5sMpSxM+HbGERrvoBwAA//8DAJN7IxXzAgAA headers: - CF-Cache-Status: - - DYNAMIC CF-RAY: - - 8f39fdb5cae1158a-SJC + - 8f425bb118049453-SJC Connection: - keep-alive Content-Encoding: @@ -65,14 +63,14 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:22 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=lVkT7i5qloNOJW3VW5kf8Ohm6U080WiPUv6XirXCoFk-1734470782-1.0.1.1-nAgxt2GizSWkF.auEc_j1tv3Erjbd74Lsh9WJmMaZa_E8fpVuEZ8SsBIqLBHICQDV0sfwSjHgP9mTBHQujl_XA; - path=/; expires=Tue, 17-Dec-24 21:56:22 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=shlFi0WrRQqtHm9BFHA8BA_DE3OgD.WLNX_BG0MJ.Uc-1734558518-1.0.1.1-dTPiGPfeRXm4eFyNx5Qhh98ITpHISNJJ15gnJl7VfBbOzj3CoF.H.Mssss_WvoWjPSiaq4ZWwBKCF16.mbMFig; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=YCWb3aZdtzEmsWTuiPgC.gchnL7jvJLEWh9yvJqAiAw-1734470782603-0.0.1.1-604800000; + - _cfuvid=LbfayFWmgFkPH4gOfhOfLicD7koAa3IqwrVpt0Q2uQ0-1734558518270-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -82,28 +80,30 @@ interactions: - X-Request-ID alt-svc: - h3=":443"; ma=86400 + cf-cache-status: + - DYNAMIC openai-organization: - future-house-xr4tdh openai-processing-ms: - - "124" + - "226" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29998" x-ratelimit-remaining-tokens: - - "29999876" + - "149999877" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_84a9ec4746765b74e4d84610ebc880ad + - req_c627f8d13c1969c6fd3a26f94a43a44f status: code: 200 message: OK diff --git a/tests/cassettes/test_eval_llm_config.yaml b/tests/cassettes/test_eval_llm_config.yaml index 383479dc..7268d855 100644 --- a/tests/cassettes/test_eval_llm_config.yaml +++ b/tests/cassettes/test_eval_llm_config.yaml @@ -5,7 +5,7 @@ interactions: question, and a proposed answer to the question. Please tell me if the proposed answer is correct, given the correct answer. ONLY SAY ''YES'' OR ''NO''. No other output is permitted.\n\nQuestion: What is 25 * 10?\n\nCorrect answer: - 250\n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o", "temperature": + 250\n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0.5}' headers: accept: @@ -15,7 +15,7 @@ interactions: connection: - keep-alive content-length: - - "387" + - "392" content-type: - application/json host: @@ -45,18 +45,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJLNasMwEITvfgqhc1ycH+okt0Ja6KUthBzaUowirW2lslZIG0gJefci - x4kT2kIvPsy3M55de58wxrXic8ZlLUg2zqR35dvTLF+8PJgRPhLUrtwszWKVPa92M+CD6MD1BiSd - XDcSG2eANNojlh4EQUwd5uPJJM/y6bgFDSow0VY5SieYjrLRJM2maXbbGWvUEgKfs/eEMcb27TNW - tAp2fM6ywUlpIARRAZ+fhxjjHk1UuAhBBxKW+KCHEi2BbVu/3i8viYdyG0QsZrfGdPrh/CqDlfO4 - Dh0/66W2OtSFBxHQxthA6HhLDwljH+1K26uW3HlsHBWEn2BjYD47xvH+hj0cdoyQhOnlaXeF67BC - AQltwsVFuBSyBtU7+/OJrdJ4AZKLlX92+S37uLa21X/ieyAlOAJVOA9Ky+t9+zEP8Qf7a+x84rYw - D1+BoClKbSvwzuvjNy5dIfKZmiohhyVPDsk3AAAA//8DADQLsKzsAgAA + H4sIAAAAAAAAAwAAAP//jJJfa4MwFMXf/RQhz3Vo/8zWt1H60LExaAdjjCFpctVsMQlJpCul333E + WrVsg734cH73HM+9egwQwpzhFGFaEkcrLcK7fL9cMbp+vF/L/fLpsH3eVGabLDb5w+QFj7xD7T6A + uovrhqpKC3BcyTOmBogDnxonk+lsNp/F8wZUioHwtkK7cKrCiksejqPxNIySMJ637lJxChan6C1A + CKFj8/Q9JYMvnKJodFEqsJYUgNNuCCFslPAKJtZy64h0eNRDqqQD2VR/XW2HxEBeW+LbyVqIVj91 + rxKq0EbtbMs7PeeS2zIzQKySPtY6pXFDTwFC781K9VVLrI2qtMuc+gTpA5PFOQ73h+xh3DKnHBG9 + PG+vcB2WMXCECzu4CKaElsB6Z38+UjOuBiAYrPyzy2/Z57W5LP4T3wNKQTtgmTbAOL3etx8z4P+y + v8a6EzeFsT1YB1WWc1mA0Yafv3Gus9ucxhHEEexwcAq+AQAA//8DAPSiOYXxAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f39fdbac9db643b-SJC + - 8f425bb65e7c6453-SJC Connection: - keep-alive Content-Encoding: @@ -64,7 +64,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 17 Dec 2024 21:26:23 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -78,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "231" + - "229" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999909" + - "149999909" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_6538a77713d1ec9b61a8e15f3cf37377 + - req_1f8e0a7bd96417a061e010db00f50b6f status: code: 200 message: OK diff --git a/tests/cassettes/test_extract_answer[complex].yaml b/tests/cassettes/test_extract_answer[complex].yaml index 319c6183..8b076955 100644 --- a/tests/cassettes/test_extract_answer[complex].yaml +++ b/tests/cassettes/test_extract_answer[complex].yaml @@ -11,7 +11,7 @@ interactions: believe the collapse was due to social unrest because of the prolonged epidemic of 2025. I tend to agree with the majority - although I can see both sides. Thus my response is that the social unrest was the significant factor in the - collapse of the regime.", "role": "user"}], "model": "gpt-4o", "temperature": + collapse of the regime.", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: @@ -21,7 +21,7 @@ interactions: connection: - keep-alive content-length: - - "861" + - "866" content-type: - application/json host: @@ -51,18 +51,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArLc4Oa9E9KtrJ1REyAUOTal8TF8RnbgaKq3x05TZNW - FInFw/3uPb87+xARQqWgOaG8Zp43RsXr8nP/9VCtNKzbGuYvJSab3cdj+rySiw2dBAVud8D9WXXH - sTEKvER9wtwC8xBck2w2Xyyy5SztQIMCVJBVxsdzjNNpOo+nq3i67IU1Sg6O5uQ1IoSQQ3eGiFrA - nuZkOjlXGnCOVUDzoYkQalGFCmXOSeeZ9nQyQo7ag+5SPyGXTJFWW3BXPRbK1rEQUbdK9fXjcKnC - yljcup4P9VJq6erCAnOowwXOo6EdPUaEvHXDtVd5qbHYGF94fAcdDJPl4uRHx3WONO2ZR8/UpSib - 3LArBHgmlbvYDuWM1yBG6bhK1gqJFyC6GPp3mFvep8Glrv5jPwLOwXgQhbEgJL8eeGyzED7bX23D - krvA1H07D01RSl2BNVae3rs0BcvuxUownpQ0OkY/AAAA//8DAOEzla34AgAA + H4sIAAAAAAAAAwAAAP//jJLNasMwEITvfgqhc1zsNE5S30qhhzZQaKE5lGIUaW2rkbVCkklDyLsX + OT92aAq96LDfzmh2pV1ECJWC5oTymnneGBXfl5uHx4VKt7PNy3KdLerpUj8/vavXxtkNHQUFrr6A + +5PqhmNjFHiJ+oC5BeYhuKaz20mWzbP0rgMNClBBVhkfTzBupJbxOBlP4mQWp/OjukbJwdGcfESE + ELLrzpBTC/imOUlGp0oDzrEKaH5uIoRaVKFCmXPSeaY9HfWQo/agu+hvyCVTpNUW3EWPhbJ1LOTU + rVLH+v58qcLKWFy5Iz/XS6mlqwsLzKEOFziPhnZ0HxHy2Q3XXuSlxmJjfOFxDToYptPs4Ef7nfZ0 + fGQePVND0Wx0xa4Q4JlUbrAdyhmvQfTSfpWsFRIHIBoM/TvMNe/D4FJX/7HvAedgPIjCWBCSXw7c + t1kIP+6vtvOSu8DUbZ2HpiilrsAaKw/vXZpiWvI0gTSBFY320Q8AAAD//wMA8VLBff0CAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f42461018c5eb29-SJC + - 8f425bb69fe5250c-SJC Connection: - keep-alive Content-Encoding: @@ -70,7 +70,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:33:52 GMT + - Wed, 18 Dec 2024 21:48:39 GMT Server: - cloudflare Transfer-Encoding: @@ -84,25 +84,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "235" + - "491" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999790" + - "149999790" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_366dfd5f505d08facd0f7d10e64a9f5e + - req_0446ed4c188b77427f33f74f91e0d112 status: code: 200 message: OK diff --git a/tests/cassettes/test_extract_answer[empty-proposal].yaml b/tests/cassettes/test_extract_answer[empty-proposal].yaml index cb61987f..576fc15e 100644 --- a/tests/cassettes/test_extract_answer[empty-proposal].yaml +++ b/tests/cassettes/test_extract_answer[empty-proposal].yaml @@ -5,7 +5,7 @@ interactions: has fixed options. Repeat back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: - ", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + ", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -14,7 +14,7 @@ interactions: connection: - keep-alive content-length: - - "369" + - "374" content-type: - application/json host: @@ -34,7 +34,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "1" + - "0" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -44,18 +44,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyht0pb2xokLIEDiAkKRa29Sg+O1bBcFqv47cpq+ - 1CJx8WFmZzyz9jphDJSEOQOx5EE0Vqc31Vfb3mXF/fOoeDG3Ol+0D4WZPD79uOoVBlFBiw8UYae6 - EtRYjUGR2dLCIQ8YXYfTvBiPp5M874iGJOooq21IC0pH2ahIs+s0m/TCJSmBHubsLWGMsXV3xohG - Ygtzlg12SIPe8xphvh9iDBzpiAD3XvnATYDBgRRkApou9THssFp5HlOZldY9vtnfo6m2jha+5/d4 - pYzyy9Ih92Sipw9koWM3CWPvXZ/VSUSwjhobykCfaKLhdLi1g8MCD2RfFQIFri9oTsxKiYEr7Y/W - AYKLJcozQ8aAr6SiIyI5qnye5ZL3trYy9X/sD4QQaAPK0jqUSlzs25nH3/XX2H7FXWDw3z5gU1bK - 1OisU9sHrmw5q/iCz6osv4Zkk/wCAAD//wMA7iORIukCAAA= + H4sIAAAAAAAAAwAAAP//jFJNS8QwFLz3V4R33kq72/3qTUQ8CSLiRaSkyWsbTZOQpKgs+98l3d22 + y67gJYeZN5OZl+wiQkBwyAmwhnrWGhnfVl939zxzDy+CCfO6KB+TLadl6rbPTyuYBYUuP5D5k+qG + 6dZI9EKrA80sUo/BNV0vsuVys0w3PdFqjjLIauPjTMetUCKeJ/MsTtZxujmqGy0YOsjJW0QIIbv+ + DDkVx2/ISTI7IS06R2uEfBgiBKyWAQHqnHCeKg+zkWRaeVR99ClsseocDdFUJ+UR3w/3SF0bq0t3 + 5Ae8Ekq4prBInVbB03ltoGf3ESHvfZ/uLCIYq1vjC68/UQXDdXqwg3GLI3msCl57Kq9ozswKjp4K + 6SbrAEZZg/zCkBCgHRd6QkSTypdZrnkfagtV/8d+JBhD45EXxiIX7Grf3jx8sb/GhhX3gcH9OI9t + UQlVozVWHB64MsWqYmmCaYIlRPvoFwAA//8DACbc6TvuAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f424615ca81eb32-SJC + - 8f425bb11ee3eb30-SJC Connection: - keep-alive Content-Encoding: @@ -63,9 +63,15 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:33:53 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare + Set-Cookie: + - __cf_bm=3EbR6c_9nmNeI58TWDLCiyFbbzWnxiCAQfgz1Ou5oXQ-1734558518-1.0.1.1-_OVXY1MiEfz9j5Sl02ocx_beYJRhzMj_5kdzhk9Gq_NIORYBNM4OqmSmTCUwNu.EObKQiWZdQdrwqZ84sr8.cQ; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=U.00GXQIFA3gE8IldpDjXxcp1niJXAkehSRhHT85pWs-1734558518279-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -77,25 +83,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "171" + - "249" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29998" x-ratelimit-remaining-tokens: - - "29999912" + - "149999913" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_de2070d3e02afd584ac618042c22382d + - req_63d20bd456f7f2145bc66a3ae269bc1e status: code: 200 message: OK diff --git a/tests/cassettes/test_extract_answer[gave-two].yaml b/tests/cassettes/test_extract_answer[gave-two].yaml index a70e7e68..a529a885 100644 --- a/tests/cassettes/test_extract_answer[gave-two].yaml +++ b/tests/cassettes/test_extract_answer[gave-two].yaml @@ -5,7 +5,7 @@ interactions: has fixed options. Repeat back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: - A or B", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + A or B", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -14,7 +14,7 @@ interactions: connection: - keep-alive content-length: - - "375" + - "380" content-type: - application/json host: @@ -34,7 +34,7 @@ interactions: x-stainless-raw-response: - "true" x-stainless-retry-count: - - "1" + - "0" x-stainless-runtime: - CPython x-stainless-runtime-version: @@ -44,18 +44,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA4xSTWsCMRS8768I7+wWv7XeChVKoR4KPbSlLDF5u5s2m5cmWVHE/16yWl3RQi85 - zLyZzLxkmzAGSsKMgSh5EJXV6V2+Wq82z+Ut0n3xstBv8+H8cWFK9/r08A2dqKDlJ4rwq7oRVFmN - QZHZ08IhDxhde5PBcDSajAe9hqhIoo6ywoZ0SGm/2x+m3WnaHR+EJSmBHmbsPWGMsW1zxohG4hpm - rNv5RSr0nhcIs+MQY+BIRwS498oHbgJ0TqQgE9A0qduww7z2PKYytdYHfHe8R1NhHS39gT/iuTLK - l5lD7slETx/IQsPuEsY+mj71WUSwjiobskBfaKLhZLC3g9MCT+ShKgQKXF/RnJllEgNX2rfWAYKL - EuWFIWPAa6moRSStypdZrnnvaytT/Mf+RAiBNqDMrEOpxNW+jXn8XX+NHVfcBAa/8QGrLFemQGed - 2j9wbjPZny4Fx8m0D8ku+QEAAP//AwDYqi3B6QIAAA== + H4sIAAAAAAAAAwAAAP//jFJNTwIxFLzvr2jemTW7fAhyI8TozUSjiRizKd23S7Xb17QPlRD+u+mC + gAETLz3MvJnOvHadCAG6hLEAtZCsGmfSSfU5vX6YXd2E2YRGt4Mp493qqZp/PD7jPXSiguZvqPhH + daGocQZZk93SyqNkjK75sNcfDEaDfNQSDZVooqx2nPYpbbTVaTfr9tNsmOajnXpBWmGAsXhJhBBi + 3Z4xpy3xC8Yi6/wgDYYga4TxfkgI8GQiAjIEHVhahs6BVGQZbRv9GPZYLYOM0ezSmB2+2d9jqHae + 5mHH7/FKWx0WhUcZyEbPwOSgZTeJEK9tn+WviOA8NY4Lpne00XDY29rBYYsHclcVmFiaM5pfZkWJ + LLUJR+sAJdUCyxNDIUAuS01HRHJU+TTLOe9tbW3r/9gfCKXQMZaF81hqdbZvax6/2F9j+xW3gSGs + AmNTVNrW6J3X2weuXHFZqTzDPMM5JJvkGwAA//8DAOjXCFXuAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f42460a68a467f1-SJC + - 8f425bb11e1069a2-SJC Connection: - keep-alive Content-Encoding: @@ -63,9 +63,15 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:33:51 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare + Set-Cookie: + - __cf_bm=eAk9PjLOP_uC98HrFuiPUxdGbMOD0FndASetRInyC8E-1734558518-1.0.1.1-czBHIlZrAXhRtJiNtQMJ4FNObmpYfP0sPzRSb84VB2iiFfmBNMFsZOSzB8kN5BWGvHDUXsKgWJTphYPTQzM3FA; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=dYXAYAvcpEWoKaqCouzZ9rcGFRQEzhYA4XzFKsQi83I-1734558518200-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -77,25 +83,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "241" + - "171" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29999" x-ratelimit-remaining-tokens: - - "29999911" + - "149999912" x-ratelimit-reset-requests: - - 6ms + - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_83d07d0983e1d4d1995bfa068db503dd + - req_efdaa27fda18e26d87bcadcc80237c76 status: code: 200 message: OK diff --git a/tests/cassettes/test_extract_answer[not in options].yaml b/tests/cassettes/test_extract_answer[not in options].yaml index e48c8e4c..70884d60 100644 --- a/tests/cassettes/test_extract_answer[not in options].yaml +++ b/tests/cassettes/test_extract_answer[not in options].yaml @@ -5,7 +5,7 @@ interactions: has fixed options. Repeat back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return an empty string.\n\nOptions:\nB\nC\n\nProposed answer: - F", "role": "user"}], "model": "gpt-4o", "temperature": 0}' + F", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -14,7 +14,7 @@ interactions: connection: - keep-alive content-length: - - "367" + - "372" content-type: - application/json host: @@ -44,18 +44,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArr5gYl/d9uLIxIMLBQFLn2JXFxfJbtQKuq3x05DU0r - QGLJ8H73Xt5dckwYAyVhzUDUPIjG6vS+/Ni3T5TnlpYPyA/ukT4r2r3kcqKeYRQdtN2hCN+uO0GN - 1RgUmTMWDnnAmJovJtPZbDGfZB1oSKKOtsqGdErpOBtP02yZZvPeWJMS6GHNXhPGGDt2z1jRSNzD - mnUxndKg97xCWF+GGANHOirAvVc+cBNgNEBBJqDpWm9gA9fIYdl6HpuZVuteP13epamyjra+5xe9 - VEb5unDIPZmY6wNZ6OgpYeyt26m9qQnWUWNDEegdTQycr85xMBxxgHnPAgWuB3nRn+E2rJAYuNL+ - 6iQguKhRDs7hfryViq5AcrXyzy6/ZZ/XVqb6T/wAhEAbUBbWoVTidt9hzGH8w/4au5y4Kwz+4AM2 - RalMhc46df7IpS1WJd/yVZlNlpCcki8AAAD//wMAJ6aXP+0CAAA= + H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9NyjpM/QGiAsSEuICEkKR62wSg2Nb9rY8qv47chLa + VC0SFx9mdsYza28jxkAWsGQgak6isSq+Kj9ubt0dyflYyIfy8b7mT5ss3Txff6sJjILCrN5Q0K/q + QpjGKiRpdEcLh5wwuKaLyXQ2y2Zp1hKNKVAFWWUpnpq4kVrG42Q8jZNFnGa9ujZSoIcle4kYY2zb + niGnLvATliwZ/SINes8rhOV+iDFwRgUEuPfSE9cEowMpjCbUbfQh7LBcex6i6bVSPb7b36NMZZ1Z + +Z7f46XU0te5Q+6NDp6ejIWW3UWMvbZ91kcRwTrTWMrJvKMOhvPLzg4OWzyQfVUgQ1yd0RyZ5QUS + l8oP1gGCixqLE0PGgK8LaQZENKh8muWcd1db6uo/9gdCCLSERW4dFlKc7duahy/219h+xW1g8F+e + sMlLqSt01snugUubz0uRJpgmuIJoF/0AAAD//wMAUYws+e4CAAA= headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f4246051c1ceb26-SJC + - 8f425bb11ca22513-SJC Connection: - keep-alive Content-Encoding: @@ -63,14 +63,14 @@ interactions: Content-Type: - application/json Date: - - Wed, 18 Dec 2024 21:33:50 GMT + - Wed, 18 Dec 2024 21:48:38 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=in2yLMzYdxfPIHSQDPq17chYgGhrTOolB6HZrJk9Iy8-1734557630-1.0.1.1-LYUU4oNWUKO8gNwjcIktYjnSyIsGLKQGmQKI54P4UxfMJ330MXeZFWWhVoJnP0b1M92ejFaHTWHlz4eHH30gIA; - path=/; expires=Wed, 18-Dec-24 22:03:50 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=ABTDwd4t79cPLIko1hPlFoZXxUQ6rzPq8jHwq1Xy7XE-1734558518-1.0.1.1-Qqt3v2jz7xPx17Fx0ehWguxbmaMuZk4B3NM4Z1HW2aMmaaTMq2RvfX.y5A9X5qv4xoO0qWDJdyM.E9ahp.RW5A; + path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=El.JjK6nMT19ye2jesHXCIAySeg4BN7pKN7mVnzqSM8-1734557630598-0.0.1.1-604800000; + - _cfuvid=17oj8YL1hlYLaR7o.N8HEjWKDALCyYtBfmHe30jFAG0-1734558518262-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -83,25 +83,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "253" + - "232" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "10000" + - "30000" x-ratelimit-limit-tokens: - - "30000000" + - "150000000" x-ratelimit-remaining-requests: - - "9999" + - "29996" x-ratelimit-remaining-tokens: - - "29999912" + - "149992191" x-ratelimit-reset-requests: - - 6ms + - 7ms x-ratelimit-reset-tokens: - - 0s + - 3ms x-request-id: - - req_5992eb433053f3b82b29a5319a96ef7e + - req_e11e29110308fec0a5310bf18d49c27d status: code: 200 message: OK