From 50769512563befca3472b22722ee4953588bcff4 Mon Sep 17 00:00:00 2001 From: Andrew White Date: Wed, 11 Dec 2024 09:01:14 -0800 Subject: [PATCH 1/4] Added new extract answer feature --- src/aviary/core.py | 3 +- src/aviary/utils.py | 56 +++++++++ .../test_extract_answer_llm[complex].yaml | 109 ++++++++++++++++++ .../test_extract_answer_llm[not exact].yaml | 103 +++++++++++++++++ ...st_extract_answer_llm[not in options].yaml | 109 ++++++++++++++++++ tests/test_utils.py | 29 ++++- 6 files changed, 407 insertions(+), 2 deletions(-) create mode 100644 tests/cassettes/test_extract_answer_llm[complex].yaml create mode 100644 tests/cassettes/test_extract_answer_llm[not exact].yaml create mode 100644 tests/cassettes/test_extract_answer_llm[not in options].yaml diff --git a/src/aviary/core.py b/src/aviary/core.py index 587e76b5..3a4d6690 100644 --- a/src/aviary/core.py +++ b/src/aviary/core.py @@ -40,6 +40,7 @@ EvalAnswerMode, encode_image_to_base64, eval_answer, + extract_answer_llm, is_coroutine_callable, partial_format, ) @@ -81,7 +82,7 @@ "argref_by_name", "encode_image_to_base64", "eval_answer", - "eval_answer", + "extract_answer_llm", "fenv", "is_coroutine_callable", "join", diff --git a/src/aviary/utils.py b/src/aviary/utils.py index 4d4fb3ef..f1332d20 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -22,6 +22,22 @@ "temperature": 0, } +LLM_EXTRACT_CONFIG = { + "prompt": ( + "You are evaluating answers for a test which has fixed options. " + "Here are the fixed options and a proposed answer. " + "Repeat back which option the proposed answer matches. " + "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. " + "If the proposed answer is empty, invalid, or ambiguous, " + "return an empty string." 
+ "\n\nOptions:\n{options}" + "\n\nProposed answer: {proposed_answer}" + ), + "model": "gpt-4o-mini", + "temperature": 0, +} + + LLM_SCORE_EVAL_CONFIG = { "prompt": ( "Here is a question, the correct answer to the question, and a rubric for" @@ -79,6 +95,46 @@ def is_coroutine_callable(obj) -> bool: return False +async def extract_answer_llm( + proposed: str, + options: list[str], +) -> str | None: + """Extract the answer from a proposed answer and a list of options.""" + if not proposed: + return None + for option in options: + if proposed.strip().casefold() == option.casefold().strip(): + return option + + try: + from litellm import acompletion + except ImportError as e: + raise ImportError( + "extract_answer_llm requires the 'llm' extra for 'litellm'. Please:" + " `pip install aviary[llm]`." + ) from e + config = LLM_EXTRACT_CONFIG + prompt = cast(str, config["prompt"]).format( + options="\n".join(options), + proposed_answer=proposed, + ) + + print("prompt", prompt) + response = await acompletion( + model=config["model"], + temperature=config["temperature"], + messages=[{"content": prompt, "role": "user"}], + ) + + extracted = response.choices[0].message.content.strip() + print("here it is", extracted) + for option in options: + if extracted.casefold() == option.casefold().strip(): + return option + + return None + + async def eval_answer( proposed: str, correct: str, diff --git a/tests/cassettes/test_extract_answer_llm[complex].yaml b/tests/cassettes/test_extract_answer_llm[complex].yaml new file mode 100644 index 00000000..7ec60953 --- /dev/null +++ b/tests/cassettes/test_extract_answer_llm[complex].yaml @@ -0,0 +1,109 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Here are the fixed options and a proposed answer. Repeat + back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF + A FIXED OPTION. 
If the proposed answer is empty, invalid, or ambiguous, return + an empty string.\n\nOptions:\nEconomic factors\nSocial unrest\nPolitical corruption\n\nProposed + answer: Based on the context given, Serif et al. (2026) claim that the overwhelming + cause of regime collapse arises from economic factors. Yet, most other scholars + (Gerald and Robinson for example) believe the collapse was due to social unrest + because of the prolonged epidemic of 2025. I tend to agree with the majority + - although I can see both sides. Thus my response is that the social unrest + was the significant factor in the collapse of the regime.", "role": "user"}], + "model": "gpt-4o-mini", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "916" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.2 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.2 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.4 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFLBThsxFLzvV1g+Z1HIBgK5wQkVAVJ7oUJo5bVfdt3Yfpbf2yoF5d+r + dUI2EVTqxYc3b8YzY78XQkhr5FJI3SnWPrryxtw//Pz+m+82T99md+v15vHt/tY+dz5Q8nIyMLD5 + BZo/WGcafXTAFsMO1gkUw6B6vqiq6+pyXk0z4NGAG2ht5HKOpbfBlrPpbF5OF+X51Z7dodVAcile + CiGEeM/n4DMY2MilyFp54oFItSCXhyUhZEI3TKQissQqsJyMoMbAELL1H6itcqIPCehkJ8GqJzX4 + DL1z+/n2cKnDNiZsaI8f5isbLHV1AkUYhguIMcqMbgshXnO4/sSvjAl95JpxDYFyXxc7PTl2OqKz + PcbIyh2TFpMv5GoDrKyjo3akVroDM1LHKlVvLB4BxVHoz2a+0t4Ft6H9H/kR0Boig6ljAmP1aeBx + LcHw4/61dig5G5b0hxh8vbKhhRST3b33KtZNoyp9BYtpI4tt8RcAAP//AwC+Pd9w/QIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f0708028d4067b3-SJC + 
Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 11 Dec 2024 17:00:30 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "219" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999777" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_f1ad9fa4f694c6f3fa43b1e22e47a6e7 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer_llm[not exact].yaml b/tests/cassettes/test_extract_answer_llm[not exact].yaml new file mode 100644 index 00000000..9d9c2500 --- /dev/null +++ b/tests/cassettes/test_extract_answer_llm[not exact].yaml @@ -0,0 +1,103 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Here are the fixed options and a proposed answer. Repeat + back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF + A FIXED OPTION. 
If the proposed answer is empty, invalid, or ambiguous, return + an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: A or B", "role": "user"}], + "model": "gpt-4o-mini", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "430" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.2 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.2 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.4 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFLLasMwELz7K8Se4+LEaV63XBJK6TmlpRhZ2thqZK2QZGgI+fci501S + 6EWHmZ3RzEq7hDFQEmYMRM2DaKxO5/L17X25Eh/L+cpl/VG74P3nxZa29eJlA72ooPIbRTipngQ1 + VmNQZA60cMgDRtf+OM+n+Wg4mHZEQxJ1lFU2pENKG2VUOsgGwzQbp/3JUV2TEuhhxj4TxhjbdWfM + aST+wIxlvRPSoPe8QpidhxgDRzoiwL1XPnAToHchBZmApot+DTtct57HaKbV+ojvz/doqqyj0h/5 + M75WRvm6cMg9mejpA1no2H3C2FfXp72JCNZRY0MRaIMmGk7ygx1ctnghj1UhUOD6gebGrJAYuNL+ + ah0guKhR3hkyBryViq6I5KryfZZH3ofaylT/sb8QQqANKAvrUCrxsG9nHr/YX2PnFXeBwW99wKZY + K1Ohs04dHnhti7LkuZjgOCsh2Se/AAAA//8DAI7MQKjuAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f0707fdccb9ed3b-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 11 Dec 2024 17:00:30 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "187" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + 
x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999898" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_6096d9659ea5fa164eceaca3c0ff9fbb + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer_llm[not in options].yaml b/tests/cassettes/test_extract_answer_llm[not in options].yaml new file mode 100644 index 00000000..5dbd3b77 --- /dev/null +++ b/tests/cassettes/test_extract_answer_llm[not in options].yaml @@ -0,0 +1,109 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Here are the fixed options and a proposed answer. Repeat + back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF + A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return + an empty string.\n\nOptions:\nB\nC\n\nProposed answer: F", "role": "user"}], + "model": "gpt-4o-mini", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "422" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.2 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.2 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "0" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.4 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4xSTU8CMRS8769o3pk1sIvycTPxoDHoDU2M2XTbx26129e0xYCE/266IAsBEy89 + zLyZzrx2kzAGSsKUgah5EI3V6a18nL0+PD+t58N5PtA0n62+X/idzO7t9Rf0ooLKDxThV3UlqLEa + 
gyKzo4VDHjC6DkZ5PslvhtmkJRqSqKOssiEdUtooo9Ksnw3T/igdjPfqmpRAD1P2ljDG2KY9Y04j + cQVT1u/9Ig16zyuE6WGIMXCkIwLce+UDNwF6HSnIBDRt9GPY4WLpeYxmllrv8e3hHk2VdVT6PX/A + F8ooXxcOuScTPX0gCy27TRh7b/ssTyKCddTYUAT6RBMNR5OdHXRb7Mh9VQgUuL6gOTErJAautD9a + BwguapRnhowBX0pFR0RyVPk8yyXvXW1lqv/Yd4QQaAPKwjqUSlzs25rHL/bX2GHFbWDwax+wKRbK + VOisU7sHXtiiLHkuxjjql5Bskx8AAAD//wMAztjv0e4CAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f0707f8196a7ad6-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 11 Dec 2024 17:00:29 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=rcc8uJSoYq4Sbo_GUuI7O04sMAoaQ0vy.vIgnWH.Nv8-1733936429-1.0.1.1-XbksLC1b8zqp8quutgJYFvxYvYPvO82TaSkdFap094o8_1.HBMP.3TwDmpx36yIipRy5bJBnSaDXLegbPaTwEg; + path=/; expires=Wed, 11-Dec-24 17:30:29 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=WkmDGCsakb0rOihw3PqRfbieMakXTIGQdIGD7BKSG5k-1733936429180-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "243" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999901" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_fdcfe4f30d07daf21fc88414f0358901 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_utils.py b/tests/test_utils.py index c13a07a4..b656d874 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,6 @@ import pytest -from aviary.core import eval_answer +from aviary.core import eval_answer, 
 extract_answer_llm @pytest.mark.vcr @@ -38,3 +38,30 @@ async def test_eval_answer(proposed, correct, question, eval_mode, expected): async def test_eval_llm_config(): config = {"temperature": 0.5} assert await eval_answer("250", "250", "What is 25 * 10?", "llm", config) + + +@pytest.mark.vcr +@pytest.mark.parametrize( + ("proposed", "options", "expected"), + [ + pytest.param("A", ["A", "B", "C"], "A", id="exact"), + pytest.param("a", ["A", "B", "C"], "A", id="exact lowercase"), + pytest.param("F", ["B", "C"], None, id="not in options"), + pytest.param("A or B", ["A", "B", "C"], None, id="not exact"), + pytest.param( + "Based on the context given, Serif et al. (2026) claim that " + "the overwhelming cause of regime collapse arises from economic factors. " + "Yet, most other scholars (Gerald and Robinson for example) believe the collapse " + "was due to social unrest because of the prolonged epidemic of 2025. I tend to agree " + "with the majority - although I can see both sides. Thus my response " + "is that the social unrest was the significant factor in the collapse of the regime.", + ["Economic factors", "Social unrest", "Political corruption"], + "Social unrest", + id="complex", + ), + pytest.param("", ["A", "B", "C"], None, id="empty proposed"), + ], +) +@pytest.mark.asyncio +async def test_extract_answer_llm(proposed, options, expected): + assert await extract_answer_llm(proposed, options) == expected From 783b4831ae86ae0b9b70aeb5be4407f320416ad0 Mon Sep 17 00:00:00 2001 From: Andrew White Date: Wed, 11 Dec 2024 09:02:18 -0800 Subject: [PATCH 2/4] Simplified prompts --- src/aviary/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aviary/utils.py b/src/aviary/utils.py index f1332d20..d7a9f269 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -25,7 +25,6 @@ LLM_EXTRACT_CONFIG = { "prompt": ( "You are evaluating answers for a test which has fixed options. " - "Here are the fixed options and a proposed answer. 
" "Repeat back which option the proposed answer matches. " "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. " "If the proposed answer is empty, invalid, or ambiguous, " From 9547134f1530ed9270817977acd463796c4be8d2 Mon Sep 17 00:00:00 2001 From: Andrew White Date: Wed, 11 Dec 2024 09:03:07 -0800 Subject: [PATCH 3/4] Updated casettes --- .../test_extract_answer_llm[complex].yaml | 48 +++++++++---------- .../test_extract_answer_llm[not exact].yaml | 35 +++++++------- ...st_extract_answer_llm[not in options].yaml | 41 ++++++++-------- 3 files changed, 61 insertions(+), 63 deletions(-) diff --git a/tests/cassettes/test_extract_answer_llm[complex].yaml b/tests/cassettes/test_extract_answer_llm[complex].yaml index 7ec60953..24ecd40c 100644 --- a/tests/cassettes/test_extract_answer_llm[complex].yaml +++ b/tests/cassettes/test_extract_answer_llm[complex].yaml @@ -2,17 +2,17 @@ interactions: - request: body: '{"messages": [{"content": "You are evaluating answers for a test which - has fixed options. Here are the fixed options and a proposed answer. Repeat - back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF - A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return - an empty string.\n\nOptions:\nEconomic factors\nSocial unrest\nPolitical corruption\n\nProposed - answer: Based on the context given, Serif et al. (2026) claim that the overwhelming - cause of regime collapse arises from economic factors. Yet, most other scholars - (Gerald and Robinson for example) believe the collapse was due to social unrest - because of the prolonged epidemic of 2025. I tend to agree with the majority - - although I can see both sides. Thus my response is that the social unrest - was the significant factor in the collapse of the regime.", "role": "user"}], - "model": "gpt-4o-mini", "temperature": 0}' + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nEconomic factors\nSocial + unrest\nPolitical corruption\n\nProposed answer: Based on the context given, + Serif et al. (2026) claim that the overwhelming cause of regime collapse arises + from economic factors. Yet, most other scholars (Gerald and Robinson for example) + believe the collapse was due to social unrest because of the prolonged epidemic + of 2025. I tend to agree with the majority - although I can see both sides. + Thus my response is that the social unrest was the significant factor in the + collapse of the regime.", "role": "user"}], "model": "gpt-4o-mini", "temperature": + 0}' headers: accept: - application/json @@ -21,7 +21,7 @@ interactions: connection: - keep-alive content-length: - - "916" + - "866" content-type: - application/json host: @@ -51,18 +51,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFLBThsxFLzvV1g+Z1HIBgK5wQkVAVJ7oUJo5bVfdt3Yfpbf2yoF5d+r - dUI2EVTqxYc3b8YzY78XQkhr5FJI3SnWPrryxtw//Pz+m+82T99md+v15vHt/tY+dz5Q8nIyMLD5 - BZo/WGcafXTAFsMO1gkUw6B6vqiq6+pyXk0z4NGAG2ht5HKOpbfBlrPpbF5OF+X51Z7dodVAcile - CiGEeM/n4DMY2MilyFp54oFItSCXhyUhZEI3TKQissQqsJyMoMbAELL1H6itcqIPCehkJ8GqJzX4 - DL1z+/n2cKnDNiZsaI8f5isbLHV1AkUYhguIMcqMbgshXnO4/sSvjAl95JpxDYFyXxc7PTl2OqKz - PcbIyh2TFpMv5GoDrKyjo3akVroDM1LHKlVvLB4BxVHoz2a+0t4Ft6H9H/kR0Boig6ljAmP1aeBx - LcHw4/61dig5G5b0hxh8vbKhhRST3b33KtZNoyp9BYtpI4tt8RcAAP//AwC+Pd9w/QIAAA== + H4sIAAAAAAAAAwAAAP//jJJBTwIxEIXv+yuanlkDLLDITT140OiBxJgYs+m2w1Lpdpp2ViGE/266 + IAsREy89zDfv9c2024QxrhWfMS6XgmTtTHqjHp59/YQyyLn6vJ28Klo/ipfNF6zu7nkvKrD8AEk/ + qiuJtTNAGu0eSw+CILoO8iy7zibjPGtBjQpMlFWO0hGmtbY6HfaHo7Sfp4PpQb1ELSHwGXtLGGNs + 254xp1Ww5jPW7/1UaghBVMBnxybGuEcTK1yEoAMJS7zXQYmWwLbR5yi1MKyxHsJZj4dFE0TMaRtj + DvXd8VKDlfNYhgM/1hfa6rAsPIiANl4QCB1v6S5h7L0drjnLy53H2lFBuAIbDQeT8d6Pdzvt6PDA + CEmYU1Heu2BXKCChTTjZDpdCLkF10m6VolEaT0ByMvTvMJe894NrW/3HvgNSgiNQhfOgtDwfuGvz + 
EH/cX23HJbeBedgEgrpYaFuBd17v33vhirIUmZxC3i95sku+AQAA//8DAAcy6K79AgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f0708028d4067b3-SJC + - 8f070b7e9be306ad-SJC Connection: - keep-alive Content-Encoding: @@ -70,7 +70,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 11 Dec 2024 17:00:30 GMT + - Wed, 11 Dec 2024 17:02:53 GMT Server: - cloudflare Transfer-Encoding: @@ -84,7 +84,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "219" + - "244" openai-version: - "2020-10-01" strict-transport-security: @@ -96,13 +96,13 @@ interactions: x-ratelimit-remaining-requests: - "29999" x-ratelimit-remaining-tokens: - - "149999777" + - "149999790" x-ratelimit-reset-requests: - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_f1ad9fa4f694c6f3fa43b1e22e47a6e7 + - req_e8f3bb69f3add846e2a40af6c0982db6 status: code: 200 message: OK diff --git a/tests/cassettes/test_extract_answer_llm[not exact].yaml b/tests/cassettes/test_extract_answer_llm[not exact].yaml index 9d9c2500..b0a3efe3 100644 --- a/tests/cassettes/test_extract_answer_llm[not exact].yaml +++ b/tests/cassettes/test_extract_answer_llm[not exact].yaml @@ -2,11 +2,10 @@ interactions: - request: body: '{"messages": [{"content": "You are evaluating answers for a test which - has fixed options. Here are the fixed options and a proposed answer. Repeat - back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF - A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return - an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: A or B", "role": "user"}], - "model": "gpt-4o-mini", "temperature": 0}' + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: + A or B", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +14,7 @@ interactions: connection: - keep-alive content-length: - - "430" + - "380" content-type: - application/json host: @@ -45,18 +44,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFLLasMwELz7K8Se4+LEaV63XBJK6TmlpRhZ2thqZK2QZGgI+fci501S - 6EWHmZ3RzEq7hDFQEmYMRM2DaKxO5/L17X25Eh/L+cpl/VG74P3nxZa29eJlA72ooPIbRTipngQ1 - VmNQZA60cMgDRtf+OM+n+Wg4mHZEQxJ1lFU2pENKG2VUOsgGwzQbp/3JUV2TEuhhxj4TxhjbdWfM - aST+wIxlvRPSoPe8QpidhxgDRzoiwL1XPnAToHchBZmApot+DTtct57HaKbV+ojvz/doqqyj0h/5 - M75WRvm6cMg9mejpA1no2H3C2FfXp72JCNZRY0MRaIMmGk7ygx1ctnghj1UhUOD6gebGrJAYuNL+ - ah0guKhR3hkyBryViq6I5KryfZZH3ofaylT/sb8QQqANKAvrUCrxsG9nHr/YX2PnFXeBwW99wKZY - K1Ohs04dHnhti7LkuZjgOCsh2Se/AAAA//8DAI7MQKjuAgAA + H4sIAAAAAAAAA4xSy07DMBC85yusPTcobVpCe0PigARSOSEkhCLH3iYGxzb2FlGq/jty+krVInHx + YWZnPLP2OmEMlIQZA9FwEq3T6a18mH/OH5ufVRVoNZk+PRfmXovs5e4rz2EQFbZ6R0F71ZWwrdNI + ypotLTxywug6LPJ8ml9PilFHtFaijrLaUTq2aauMSkfZaJxmRTq82akbqwQGmLHXhDHG1t0ZcxqJ + 3zBj2WCPtBgCrxFmhyHGwFsdEeAhqEDcEAyOpLCG0HTR+7DHxTLwGM0std7hm8M92tbO2yrs+AO+ + UEaFpvTIgzXRM5B10LGbhLG3rs/yJCI4b1tHJdkPNNGwyLd2cNzikdxVBbLE9QXNiVkpkbjSobcO + EFw0KM8MGQO+lMr2iKRX+TzLJe9tbWXq/9gfCSHQEcrSeZRKXOzbmccv9tfYYcVdYAirQNiWC2Vq + 9M6r7QMvXFlVPBc3WGQVJJvkFwAA//8DABYKnlruAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f0707fdccb9ed3b-SJC + - 8f070b799eb5679d-SJC Connection: - keep-alive Content-Encoding: @@ -64,7 +63,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 11 Dec 2024 17:00:30 GMT + - Wed, 11 Dec 2024 17:02:52 GMT Server: - cloudflare Transfer-Encoding: @@ -78,7 +77,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "187" + - "193" openai-version: - "2020-10-01" strict-transport-security: @@ -90,13 +89,13 @@ 
interactions: x-ratelimit-remaining-requests: - "29999" x-ratelimit-remaining-tokens: - - "149999898" + - "149999912" x-ratelimit-reset-requests: - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_6096d9659ea5fa164eceaca3c0ff9fbb + - req_24d3312a6ad717a657fe3e693bd24613 status: code: 200 message: OK diff --git a/tests/cassettes/test_extract_answer_llm[not in options].yaml b/tests/cassettes/test_extract_answer_llm[not in options].yaml index 5dbd3b77..e9aa9016 100644 --- a/tests/cassettes/test_extract_answer_llm[not in options].yaml +++ b/tests/cassettes/test_extract_answer_llm[not in options].yaml @@ -2,11 +2,10 @@ interactions: - request: body: '{"messages": [{"content": "You are evaluating answers for a test which - has fixed options. Here are the fixed options and a proposed answer. Repeat - back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF - A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return - an empty string.\n\nOptions:\nB\nC\n\nProposed answer: F", "role": "user"}], - "model": "gpt-4o-mini", "temperature": 0}' + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nB\nC\n\nProposed answer: + F", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +14,7 @@ interactions: connection: - keep-alive content-length: - - "422" + - "372" content-type: - application/json host: @@ -45,18 +44,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA4xSTU8CMRS8769o3pk1sIvycTPxoDHoDU2M2XTbx26129e0xYCE/266IAsBEy89 - zLyZzrx2kzAGSsKUgah5EI3V6a18nL0+PD+t58N5PtA0n62+X/idzO7t9Rf0ooLKDxThV3UlqLEa - gyKzo4VDHjC6DkZ5PslvhtmkJRqSqKOssiEdUtooo9Ksnw3T/igdjPfqmpRAD1P2ljDG2KY9Y04j - cQVT1u/9Ig16zyuE6WGIMXCkIwLce+UDNwF6HSnIBDRt9GPY4WLpeYxmllrv8e3hHk2VdVT6PX/A - F8ooXxcOuScTPX0gCy27TRh7b/ssTyKCddTYUAT6RBMNR5OdHXRb7Mh9VQgUuL6gOTErJAautD9a - BwguapRnhowBX0pFR0RyVPk8yyXvXW1lqv/Yd4QQaAPKwjqUSlzs25rHL/bX2GHFbWDwax+wKRbK - VOisU7sHXtiiLHkuxjjql5Bskx8AAAD//wMAztjv0e4CAAA= + H4sIAAAAAAAAA4xSy2rDMBC8+yvEnuPiPO3kVtLSQik9pT2UYhRp7aiRJSHJkAf59yLbzYOk0IsO + MzujmZX2ESEgOMwIsBX1rDIyvucvbwa3T7vieT5JkvVm/lgv3hcPH8Pdq4FeUOjlNzL/q7pjujIS + vdCqpZlF6jG49tPhcDqcjNN+Q1Saowyy0vh4pONKKBEPksEoTtK4n3XqlRYMHczIZ0QIIfvmDDkV + xw3MSNL7RSp0jpYIs+MQIWC1DAhQ54TzVHnonUimlUfVRD+HLRa1oyGaqqXs8MPxHqlLY/XSdfwR + L4QSbpVbpE6r4Om8NtCwh4iQr6ZPfRERjNWV8bnXa1TBcDJt7eC0xRPZVQWvPZU3NBdmOUdPhXRn + 6wBG2Qr5lSEhQGsu9BkRnVW+znLLu60tVPkf+xPBGBqPPDcWuWA3+zbm4Yv9NXZccRMY3NZ5rPJC + qBKtsaJ94MLkWT9jg2yUpGOIDtEPAAAA//8DAPFLi1buAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f0707f8196a7ad6-SJC + - 8f070b74a8f296d2-SJC Connection: - keep-alive Content-Encoding: @@ -64,14 +63,14 @@ interactions: Content-Type: - application/json Date: - - Wed, 11 Dec 2024 17:00:29 GMT + - Wed, 11 Dec 2024 17:02:51 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=rcc8uJSoYq4Sbo_GUuI7O04sMAoaQ0vy.vIgnWH.Nv8-1733936429-1.0.1.1-XbksLC1b8zqp8quutgJYFvxYvYPvO82TaSkdFap094o8_1.HBMP.3TwDmpx36yIipRy5bJBnSaDXLegbPaTwEg; - path=/; expires=Wed, 11-Dec-24 17:30:29 GMT; 
domain=.api.openai.com; HttpOnly; + - __cf_bm=cPN3yGhPdDhm..__K3xNc3MOLUBlr1oNqkhiHU7tJZ4-1733936571-1.0.1.1-MtqJwEyFDgd8Vni.ynFHGXcJPXLffNE5vA844QFRzo.pM.7HV8F4xT54VCeWBp2zImYM6SMGBKTeaixHDUg9Ng; + path=/; expires=Wed, 11-Dec-24 17:32:51 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=WkmDGCsakb0rOihw3PqRfbieMakXTIGQdIGD7BKSG5k-1733936429180-0.0.1.1-604800000; + - _cfuvid=guVLBTPXR9itj1suHbM9nVSWOWIE5OU2MTh7PlMzNK0-1733936571897-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -84,7 +83,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "243" + - "155" openai-version: - "2020-10-01" strict-transport-security: @@ -96,13 +95,13 @@ interactions: x-ratelimit-remaining-requests: - "29999" x-ratelimit-remaining-tokens: - - "149999901" + - "149999913" x-ratelimit-reset-requests: - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_fdcfe4f30d07daf21fc88414f0358901 + - req_daa6490ff2cc82dbb9cea5befb1d758d status: code: 200 message: OK From d7ef73aa6f7f2cfd0e45f313d7e8b5707b565e6f Mon Sep 17 00:00:00 2001 From: Andrew White Date: Wed, 11 Dec 2024 09:04:05 -0800 Subject: [PATCH 4/4] Removed prints --- src/aviary/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/aviary/utils.py b/src/aviary/utils.py index d7a9f269..46c45935 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -118,7 +118,6 @@ async def extract_answer_llm( proposed_answer=proposed, ) - print("prompt", prompt) response = await acompletion( model=config["model"], temperature=config["temperature"], @@ -126,7 +125,6 @@ async def extract_answer_llm( ) extracted = response.choices[0].message.content.strip() - print("here it is", extracted) for option in options: if extracted.casefold() == option.casefold().strip(): return option