From 50769512563befca3472b22722ee4953588bcff4 Mon Sep 17 00:00:00 2001 From: Andrew White Date: Wed, 11 Dec 2024 09:01:14 -0800 Subject: [PATCH 1/4] Added new extract answer feature --- src/aviary/core.py | 3 +- src/aviary/utils.py | 56 +++++++++ .../test_extract_answer_llm[complex].yaml | 109 ++++++++++++++++++ .../test_extract_answer_llm[not exact].yaml | 103 +++++++++++++++++ ...st_extract_answer_llm[not in options].yaml | 109 ++++++++++++++++++ tests/test_utils.py | 29 ++++- 6 files changed, 407 insertions(+), 2 deletions(-) create mode 100644 tests/cassettes/test_extract_answer_llm[complex].yaml create mode 100644 tests/cassettes/test_extract_answer_llm[not exact].yaml create mode 100644 tests/cassettes/test_extract_answer_llm[not in options].yaml diff --git a/src/aviary/core.py b/src/aviary/core.py index 587e76b5..3a4d6690 100644 --- a/src/aviary/core.py +++ b/src/aviary/core.py @@ -40,6 +40,7 @@ EvalAnswerMode, encode_image_to_base64, eval_answer, + extract_answer_llm, is_coroutine_callable, partial_format, ) @@ -81,7 +82,7 @@ "argref_by_name", "encode_image_to_base64", "eval_answer", - "eval_answer", + "extract_answer_llm", "fenv", "is_coroutine_callable", "join", diff --git a/src/aviary/utils.py b/src/aviary/utils.py index 4d4fb3ef..f1332d20 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -22,6 +22,22 @@ "temperature": 0, } +LLM_EXTRACT_CONFIG = { + "prompt": ( + "You are evaluating answers for a test which has fixed options. " + "Here are the fixed options and a proposed answer. " + "Repeat back which option the proposed answer matches. " + "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. " + "If the proposed answer is empty, invalid, or ambiguous, " + "return an empty string." 
+ "\n\nOptions:\n{options}" + "\n\nProposed answer: {proposed_answer}" + ), + "model": "gpt-4o-mini", + "temperature": 0, +} + + LLM_SCORE_EVAL_CONFIG = { "prompt": ( "Here is a question, the correct answer to the question, and a rubric for" @@ -79,6 +95,46 @@ def is_coroutine_callable(obj) -> bool: return False +async def extract_answer_llm( + proposed: str, + options: list[str], +) -> str | None: + """Extract the answer from a proposed answer and a list of options.""" + if not proposed: + return None + for option in options: + if proposed.strip().casefold() == option.casefold().strip(): + return option + + try: + from litellm import acompletion + except ImportError as e: + raise ImportError( + "extract_answer_llm requires the 'llm' extra for 'litellm'. Please:" + " `pip install aviary[llm]`." + ) from e + config = LLM_EXTRACT_CONFIG + prompt = cast(str, config["prompt"]).format( + options="\n".join(options), + proposed_answer=proposed, + ) + + print("prompt", prompt) + response = await acompletion( + model=config["model"], + temperature=config["temperature"], + messages=[{"content": prompt, "role": "user"}], + ) + + extracted = response.choices[0].message.content.strip() + print("here it is", extracted) + for option in options: + if extracted.casefold() == option.casefold().strip(): + return option + + return None + + async def eval_answer( proposed: str, correct: str, diff --git a/tests/cassettes/test_extract_answer_llm[complex].yaml b/tests/cassettes/test_extract_answer_llm[complex].yaml new file mode 100644 index 00000000..7ec60953 --- /dev/null +++ b/tests/cassettes/test_extract_answer_llm[complex].yaml @@ -0,0 +1,109 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Here are the fixed options and a proposed answer. Repeat + back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF + A FIXED OPTION. 
If the proposed answer is empty, invalid, or ambiguous, return + an empty string.\n\nOptions:\nEconomic factors\nSocial unrest\nPolitical corruption\n\nProposed + answer: Based on the context given, Serif et al. (2026) claim that the overwhelming + cause of regime collapse arises from economic factors. Yet, most other scholars + (Gerald and Robinson for example) believe the collapse was due to social unrest + because of the prolonged epidemic of 2025. I tend to agree with the majority + - although I can see both sides. Thus my response is that the social unrest + was the significant factor in the collapse of the regime.", "role": "user"}], + "model": "gpt-4o-mini", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "916" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.2 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.2 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.4 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFLBThsxFLzvV1g+Z1HIBgK5wQkVAVJ7oUJo5bVfdt3Yfpbf2yoF5d+r + dUI2EVTqxYc3b8YzY78XQkhr5FJI3SnWPrryxtw//Pz+m+82T99md+v15vHt/tY+dz5Q8nIyMLD5 + BZo/WGcafXTAFsMO1gkUw6B6vqiq6+pyXk0z4NGAG2ht5HKOpbfBlrPpbF5OF+X51Z7dodVAcile + CiGEeM/n4DMY2MilyFp54oFItSCXhyUhZEI3TKQissQqsJyMoMbAELL1H6itcqIPCehkJ8GqJzX4 + DL1z+/n2cKnDNiZsaI8f5isbLHV1AkUYhguIMcqMbgshXnO4/sSvjAl95JpxDYFyXxc7PTl2OqKz + PcbIyh2TFpMv5GoDrKyjo3akVroDM1LHKlVvLB4BxVHoz2a+0t4Ft6H9H/kR0Boig6ljAmP1aeBx + LcHw4/61dig5G5b0hxh8vbKhhRST3b33KtZNoyp9BYtpI4tt8RcAAP//AwC+Pd9w/QIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f0708028d4067b3-SJC + 
Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 11 Dec 2024 17:00:30 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "219" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999777" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_f1ad9fa4f694c6f3fa43b1e22e47a6e7 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer_llm[not exact].yaml b/tests/cassettes/test_extract_answer_llm[not exact].yaml new file mode 100644 index 00000000..9d9c2500 --- /dev/null +++ b/tests/cassettes/test_extract_answer_llm[not exact].yaml @@ -0,0 +1,103 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Here are the fixed options and a proposed answer. Repeat + back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF + A FIXED OPTION. 
If the proposed answer is empty, invalid, or ambiguous, return + an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: A or B", "role": "user"}], + "model": "gpt-4o-mini", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "430" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.2 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.2 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.4 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFLLasMwELz7K8Se4+LEaV63XBJK6TmlpRhZ2thqZK2QZGgI+fci501S + 6EWHmZ3RzEq7hDFQEmYMRM2DaKxO5/L17X25Eh/L+cpl/VG74P3nxZa29eJlA72ooPIbRTipngQ1 + VmNQZA60cMgDRtf+OM+n+Wg4mHZEQxJ1lFU2pENKG2VUOsgGwzQbp/3JUV2TEuhhxj4TxhjbdWfM + aST+wIxlvRPSoPe8QpidhxgDRzoiwL1XPnAToHchBZmApot+DTtct57HaKbV+ojvz/doqqyj0h/5 + M75WRvm6cMg9mejpA1no2H3C2FfXp72JCNZRY0MRaIMmGk7ygx1ctnghj1UhUOD6gebGrJAYuNL+ + ah0guKhR3hkyBryViq6I5KryfZZH3ofaylT/sb8QQqANKAvrUCrxsG9nHr/YX2PnFXeBwW99wKZY + K1Ohs04dHnhti7LkuZjgOCsh2Se/AAAA//8DAI7MQKjuAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f0707fdccb9ed3b-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 11 Dec 2024 17:00:30 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "187" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + 
x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999898" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_6096d9659ea5fa164eceaca3c0ff9fbb + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_extract_answer_llm[not in options].yaml b/tests/cassettes/test_extract_answer_llm[not in options].yaml new file mode 100644 index 00000000..5dbd3b77 --- /dev/null +++ b/tests/cassettes/test_extract_answer_llm[not in options].yaml @@ -0,0 +1,109 @@ +interactions: + - request: + body: + '{"messages": [{"content": "You are evaluating answers for a test which + has fixed options. Here are the fixed options and a proposed answer. Repeat + back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF + A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return + an empty string.\n\nOptions:\nB\nC\n\nProposed answer: F", "role": "user"}], + "model": "gpt-4o-mini", "temperature": 0}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "422" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.2 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.2 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "0" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.4 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4xSTU8CMRS8769o3pk1sIvycTPxoDHoDU2M2XTbx26129e0xYCE/266IAsBEy89 + zLyZzrx2kzAGSsKUgah5EI3V6a18nL0+PD+t58N5PtA0n62+X/idzO7t9Rf0ooLKDxThV3UlqLEa + 
gyKzo4VDHjC6DkZ5PslvhtmkJRqSqKOssiEdUtooo9Ksnw3T/igdjPfqmpRAD1P2ljDG2KY9Y04j + cQVT1u/9Ig16zyuE6WGIMXCkIwLce+UDNwF6HSnIBDRt9GPY4WLpeYxmllrv8e3hHk2VdVT6PX/A + F8ooXxcOuScTPX0gCy27TRh7b/ssTyKCddTYUAT6RBMNR5OdHXRb7Mh9VQgUuL6gOTErJAautD9a + BwguapRnhowBX0pFR0RyVPk8yyXvXW1lqv/Yd4QQaAPKwjqUSlzs25rHL/bX2GHFbWDwax+wKRbK + VOisU7sHXtiiLHkuxjjql5Bskx8AAAD//wMAztjv0e4CAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f0707f8196a7ad6-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Wed, 11 Dec 2024 17:00:29 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=rcc8uJSoYq4Sbo_GUuI7O04sMAoaQ0vy.vIgnWH.Nv8-1733936429-1.0.1.1-XbksLC1b8zqp8quutgJYFvxYvYPvO82TaSkdFap094o8_1.HBMP.3TwDmpx36yIipRy5bJBnSaDXLegbPaTwEg; + path=/; expires=Wed, 11-Dec-24 17:30:29 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=WkmDGCsakb0rOihw3PqRfbieMakXTIGQdIGD7BKSG5k-1733936429180-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "243" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "30000" + x-ratelimit-limit-tokens: + - "150000000" + x-ratelimit-remaining-requests: + - "29999" + x-ratelimit-remaining-tokens: + - "149999901" + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_fdcfe4f30d07daf21fc88414f0358901 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_utils.py b/tests/test_utils.py index c13a07a4..b656d874 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,6 @@ import pytest -from aviary.core import eval_answer +from aviary.core import eval_answer, 
 extract_answer_llm @pytest.mark.vcr @@ -38,3 +38,30 @@ async def test_eval_answer(proposed, correct, question, eval_mode, expected): async def test_eval_llm_config(): config = {"temperature": 0.5} assert await eval_answer("250", "250", "What is 25 * 10?", "llm", config) + + +@pytest.mark.vcr +@pytest.mark.parametrize( + ("proposed", "options", "expected"), + [ + pytest.param("A", ["A", "B", "C"], "A", id="exact"), + pytest.param("a", ["A", "B", "C"], "A", id="exact lowercase"), + pytest.param("F", ["B", "C"], None, id="not in options"), + pytest.param("A or B", ["A", "B", "C"], None, id="not exact"), + pytest.param( + "Based on the context given, Serif et al. (2026) claim that " + "the overwhelming cause of regime collapse arises from economic factors. " + "Yet, most other scholars (Gerald and Robinson for example) believe the collapse " + "was due to social unrest because of the prolonged epidemic of 2025. I tend to agree " + "with the majority - although I can see both sides. Thus my response " + "is that the social unrest was the significant factor in the collapse of the regime.", + ["Economic factors", "Social unrest", "Political corruption"], + "Social unrest", + id="complex", + ), + pytest.param("", ["A", "B", "C"], None, id="empty proposed"), + ], +) +@pytest.mark.asyncio +async def test_extract_answer_llm(proposed, options, expected): + assert await extract_answer_llm(proposed, options) == expected From 783b4831ae86ae0b9b70aeb5be4407f320416ad0 Mon Sep 17 00:00:00 2001 From: Andrew White Date: Wed, 11 Dec 2024 09:02:18 -0800 Subject: [PATCH 2/4] Simplified prompts --- src/aviary/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aviary/utils.py b/src/aviary/utils.py index f1332d20..d7a9f269 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -25,7 +25,6 @@ LLM_EXTRACT_CONFIG = { "prompt": ( "You are evaluating answers for a test which has fixed options. " - "Here are the fixed options and a proposed answer. 
" "Repeat back which option the proposed answer matches. " "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. " "If the proposed answer is empty, invalid, or ambiguous, " From 9547134f1530ed9270817977acd463796c4be8d2 Mon Sep 17 00:00:00 2001 From: Andrew White Date: Wed, 11 Dec 2024 09:03:07 -0800 Subject: [PATCH 3/4] Updated casettes --- .../test_extract_answer_llm[complex].yaml | 48 +++++++++---------- .../test_extract_answer_llm[not exact].yaml | 35 +++++++------- ...st_extract_answer_llm[not in options].yaml | 41 ++++++++-------- 3 files changed, 61 insertions(+), 63 deletions(-) diff --git a/tests/cassettes/test_extract_answer_llm[complex].yaml b/tests/cassettes/test_extract_answer_llm[complex].yaml index 7ec60953..24ecd40c 100644 --- a/tests/cassettes/test_extract_answer_llm[complex].yaml +++ b/tests/cassettes/test_extract_answer_llm[complex].yaml @@ -2,17 +2,17 @@ interactions: - request: body: '{"messages": [{"content": "You are evaluating answers for a test which - has fixed options. Here are the fixed options and a proposed answer. Repeat - back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF - A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return - an empty string.\n\nOptions:\nEconomic factors\nSocial unrest\nPolitical corruption\n\nProposed - answer: Based on the context given, Serif et al. (2026) claim that the overwhelming - cause of regime collapse arises from economic factors. Yet, most other scholars - (Gerald and Robinson for example) believe the collapse was due to social unrest - because of the prolonged epidemic of 2025. I tend to agree with the majority - - although I can see both sides. Thus my response is that the social unrest - was the significant factor in the collapse of the regime.", "role": "user"}], - "model": "gpt-4o-mini", "temperature": 0}' + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nEconomic factors\nSocial + unrest\nPolitical corruption\n\nProposed answer: Based on the context given, + Serif et al. (2026) claim that the overwhelming cause of regime collapse arises + from economic factors. Yet, most other scholars (Gerald and Robinson for example) + believe the collapse was due to social unrest because of the prolonged epidemic + of 2025. I tend to agree with the majority - although I can see both sides. + Thus my response is that the social unrest was the significant factor in the + collapse of the regime.", "role": "user"}], "model": "gpt-4o-mini", "temperature": + 0}' headers: accept: - application/json @@ -21,7 +21,7 @@ interactions: connection: - keep-alive content-length: - - "916" + - "866" content-type: - application/json host: @@ -51,18 +51,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFLBThsxFLzvV1g+Z1HIBgK5wQkVAVJ7oUJo5bVfdt3Yfpbf2yoF5d+r - dUI2EVTqxYc3b8YzY78XQkhr5FJI3SnWPrryxtw//Pz+m+82T99md+v15vHt/tY+dz5Q8nIyMLD5 - BZo/WGcafXTAFsMO1gkUw6B6vqiq6+pyXk0z4NGAG2ht5HKOpbfBlrPpbF5OF+X51Z7dodVAcile - CiGEeM/n4DMY2MilyFp54oFItSCXhyUhZEI3TKQissQqsJyMoMbAELL1H6itcqIPCehkJ8GqJzX4 - DL1z+/n2cKnDNiZsaI8f5isbLHV1AkUYhguIMcqMbgshXnO4/sSvjAl95JpxDYFyXxc7PTl2OqKz - PcbIyh2TFpMv5GoDrKyjo3akVroDM1LHKlVvLB4BxVHoz2a+0t4Ft6H9H/kR0Boig6ljAmP1aeBx - LcHw4/61dig5G5b0hxh8vbKhhRST3b33KtZNoyp9BYtpI4tt8RcAAP//AwC+Pd9w/QIAAA== + H4sIAAAAAAAAAwAAAP//jJJBTwIxEIXv+yuanlkDLLDITT140OiBxJgYs+m2w1Lpdpp2ViGE/266 + IAsREy89zDfv9c2024QxrhWfMS6XgmTtTHqjHp59/YQyyLn6vJ28Klo/ipfNF6zu7nkvKrD8AEk/ + qiuJtTNAGu0eSw+CILoO8iy7zibjPGtBjQpMlFWO0hGmtbY6HfaHo7Sfp4PpQb1ELSHwGXtLGGNs + 254xp1Ww5jPW7/1UaghBVMBnxybGuEcTK1yEoAMJS7zXQYmWwLbR5yi1MKyxHsJZj4dFE0TMaRtj + DvXd8VKDlfNYhgM/1hfa6rAsPIiANl4QCB1v6S5h7L0drjnLy53H2lFBuAIbDQeT8d6Pdzvt6PDA + CEmYU1Heu2BXKCChTTjZDpdCLkF10m6VolEaT0ByMvTvMJe894NrW/3HvgNSgiNQhfOgtDwfuGvz + 
EH/cX23HJbeBedgEgrpYaFuBd17v33vhirIUmZxC3i95sku+AQAA//8DAAcy6K79AgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f0708028d4067b3-SJC + - 8f070b7e9be306ad-SJC Connection: - keep-alive Content-Encoding: @@ -70,7 +70,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 11 Dec 2024 17:00:30 GMT + - Wed, 11 Dec 2024 17:02:53 GMT Server: - cloudflare Transfer-Encoding: @@ -84,7 +84,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "219" + - "244" openai-version: - "2020-10-01" strict-transport-security: @@ -96,13 +96,13 @@ interactions: x-ratelimit-remaining-requests: - "29999" x-ratelimit-remaining-tokens: - - "149999777" + - "149999790" x-ratelimit-reset-requests: - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_f1ad9fa4f694c6f3fa43b1e22e47a6e7 + - req_e8f3bb69f3add846e2a40af6c0982db6 status: code: 200 message: OK diff --git a/tests/cassettes/test_extract_answer_llm[not exact].yaml b/tests/cassettes/test_extract_answer_llm[not exact].yaml index 9d9c2500..b0a3efe3 100644 --- a/tests/cassettes/test_extract_answer_llm[not exact].yaml +++ b/tests/cassettes/test_extract_answer_llm[not exact].yaml @@ -2,11 +2,10 @@ interactions: - request: body: '{"messages": [{"content": "You are evaluating answers for a test which - has fixed options. Here are the fixed options and a proposed answer. Repeat - back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF - A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return - an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: A or B", "role": "user"}], - "model": "gpt-4o-mini", "temperature": 0}' + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer: + A or B", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +14,7 @@ interactions: connection: - keep-alive content-length: - - "430" + - "380" content-type: - application/json host: @@ -45,18 +44,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAAwAAAP//jFLLasMwELz7K8Se4+LEaV63XBJK6TmlpRhZ2thqZK2QZGgI+fci501S - 6EWHmZ3RzEq7hDFQEmYMRM2DaKxO5/L17X25Eh/L+cpl/VG74P3nxZa29eJlA72ooPIbRTipngQ1 - VmNQZA60cMgDRtf+OM+n+Wg4mHZEQxJ1lFU2pENKG2VUOsgGwzQbp/3JUV2TEuhhxj4TxhjbdWfM - aST+wIxlvRPSoPe8QpidhxgDRzoiwL1XPnAToHchBZmApot+DTtct57HaKbV+ojvz/doqqyj0h/5 - M75WRvm6cMg9mejpA1no2H3C2FfXp72JCNZRY0MRaIMmGk7ygx1ctnghj1UhUOD6gebGrJAYuNL+ - ah0guKhR3hkyBryViq6I5KryfZZH3ofaylT/sb8QQqANKAvrUCrxsG9nHr/YX2PnFXeBwW99wKZY - K1Ohs04dHnhti7LkuZjgOCsh2Se/AAAA//8DAI7MQKjuAgAA + H4sIAAAAAAAAA4xSy07DMBC85yusPTcobVpCe0PigARSOSEkhCLH3iYGxzb2FlGq/jty+krVInHx + YWZnPLP2OmEMlIQZA9FwEq3T6a18mH/OH5ufVRVoNZk+PRfmXovs5e4rz2EQFbZ6R0F71ZWwrdNI + ypotLTxywug6LPJ8ml9PilFHtFaijrLaUTq2aauMSkfZaJxmRTq82akbqwQGmLHXhDHG1t0ZcxqJ + 3zBj2WCPtBgCrxFmhyHGwFsdEeAhqEDcEAyOpLCG0HTR+7DHxTLwGM0std7hm8M92tbO2yrs+AO+ + UEaFpvTIgzXRM5B10LGbhLG3rs/yJCI4b1tHJdkPNNGwyLd2cNzikdxVBbLE9QXNiVkpkbjSobcO + EFw0KM8MGQO+lMr2iKRX+TzLJe9tbWXq/9gfCSHQEcrSeZRKXOzbmccv9tfYYcVdYAirQNiWC2Vq + 9M6r7QMvXFlVPBc3WGQVJJvkFwAA//8DABYKnlruAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f0707fdccb9ed3b-SJC + - 8f070b799eb5679d-SJC Connection: - keep-alive Content-Encoding: @@ -64,7 +63,7 @@ interactions: Content-Type: - application/json Date: - - Wed, 11 Dec 2024 17:00:30 GMT + - Wed, 11 Dec 2024 17:02:52 GMT Server: - cloudflare Transfer-Encoding: @@ -78,7 +77,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "187" + - "193" openai-version: - "2020-10-01" strict-transport-security: @@ -90,13 +89,13 @@ 
interactions: x-ratelimit-remaining-requests: - "29999" x-ratelimit-remaining-tokens: - - "149999898" + - "149999912" x-ratelimit-reset-requests: - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_6096d9659ea5fa164eceaca3c0ff9fbb + - req_24d3312a6ad717a657fe3e693bd24613 status: code: 200 message: OK diff --git a/tests/cassettes/test_extract_answer_llm[not in options].yaml b/tests/cassettes/test_extract_answer_llm[not in options].yaml index 5dbd3b77..e9aa9016 100644 --- a/tests/cassettes/test_extract_answer_llm[not in options].yaml +++ b/tests/cassettes/test_extract_answer_llm[not in options].yaml @@ -2,11 +2,10 @@ interactions: - request: body: '{"messages": [{"content": "You are evaluating answers for a test which - has fixed options. Here are the fixed options and a proposed answer. Repeat - back which option the proposed answer matches. GIVE ONLY THE VERBATIM TEXT OF - A FIXED OPTION. If the proposed answer is empty, invalid, or ambiguous, return - an empty string.\n\nOptions:\nB\nC\n\nProposed answer: F", "role": "user"}], - "model": "gpt-4o-mini", "temperature": 0}' + has fixed options. Repeat back which option the proposed answer matches. GIVE + ONLY THE VERBATIM TEXT OF A FIXED OPTION. 
If the proposed answer is empty, invalid, + or ambiguous, return an empty string.\n\nOptions:\nB\nC\n\nProposed answer: + F", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' headers: accept: - application/json @@ -15,7 +14,7 @@ interactions: connection: - keep-alive content-length: - - "422" + - "372" content-type: - application/json host: @@ -45,18 +44,18 @@ interactions: response: body: string: !!binary | - H4sIAAAAAAAAA4xSTU8CMRS8769o3pk1sIvycTPxoDHoDU2M2XTbx26129e0xYCE/266IAsBEy89 - zLyZzrx2kzAGSsKUgah5EI3V6a18nL0+PD+t58N5PtA0n62+X/idzO7t9Rf0ooLKDxThV3UlqLEa - gyKzo4VDHjC6DkZ5PslvhtmkJRqSqKOssiEdUtooo9Ksnw3T/igdjPfqmpRAD1P2ljDG2KY9Y04j - cQVT1u/9Ig16zyuE6WGIMXCkIwLce+UDNwF6HSnIBDRt9GPY4WLpeYxmllrv8e3hHk2VdVT6PX/A - F8ooXxcOuScTPX0gCy27TRh7b/ssTyKCddTYUAT6RBMNR5OdHXRb7Mh9VQgUuL6gOTErJAautD9a - BwguapRnhowBX0pFR0RyVPk8yyXvXW1lqv/Yd4QQaAPKwjqUSlzs25rHL/bX2GHFbWDwax+wKRbK - VOisU7sHXtiiLHkuxjjql5Bskx8AAAD//wMAztjv0e4CAAA= + H4sIAAAAAAAAA4xSy2rDMBC8+yvEnuPiPO3kVtLSQik9pT2UYhRp7aiRJSHJkAf59yLbzYOk0IsO + MzujmZX2ESEgOMwIsBX1rDIyvucvbwa3T7vieT5JkvVm/lgv3hcPH8Pdq4FeUOjlNzL/q7pjujIS + vdCqpZlF6jG49tPhcDqcjNN+Q1Saowyy0vh4pONKKBEPksEoTtK4n3XqlRYMHczIZ0QIIfvmDDkV + xw3MSNL7RSp0jpYIs+MQIWC1DAhQ54TzVHnonUimlUfVRD+HLRa1oyGaqqXs8MPxHqlLY/XSdfwR + L4QSbpVbpE6r4Om8NtCwh4iQr6ZPfRERjNWV8bnXa1TBcDJt7eC0xRPZVQWvPZU3NBdmOUdPhXRn + 6wBG2Qr5lSEhQGsu9BkRnVW+znLLu60tVPkf+xPBGBqPPDcWuWA3+zbm4Yv9NXZccRMY3NZ5rPJC + qBKtsaJ94MLkWT9jg2yUpGOIDtEPAAAA//8DAPFLi1buAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8f0707f8196a7ad6-SJC + - 8f070b74a8f296d2-SJC Connection: - keep-alive Content-Encoding: @@ -64,14 +63,14 @@ interactions: Content-Type: - application/json Date: - - Wed, 11 Dec 2024 17:00:29 GMT + - Wed, 11 Dec 2024 17:02:51 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=rcc8uJSoYq4Sbo_GUuI7O04sMAoaQ0vy.vIgnWH.Nv8-1733936429-1.0.1.1-XbksLC1b8zqp8quutgJYFvxYvYPvO82TaSkdFap094o8_1.HBMP.3TwDmpx36yIipRy5bJBnSaDXLegbPaTwEg; - path=/; expires=Wed, 11-Dec-24 17:30:29 GMT; 
domain=.api.openai.com; HttpOnly; + - __cf_bm=cPN3yGhPdDhm..__K3xNc3MOLUBlr1oNqkhiHU7tJZ4-1733936571-1.0.1.1-MtqJwEyFDgd8Vni.ynFHGXcJPXLffNE5vA844QFRzo.pM.7HV8F4xT54VCeWBp2zImYM6SMGBKTeaixHDUg9Ng; + path=/; expires=Wed, 11-Dec-24 17:32:51 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=WkmDGCsakb0rOihw3PqRfbieMakXTIGQdIGD7BKSG5k-1733936429180-0.0.1.1-604800000; + - _cfuvid=guVLBTPXR9itj1suHbM9nVSWOWIE5OU2MTh7PlMzNK0-1733936571897-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -84,7 +83,7 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "243" + - "155" openai-version: - "2020-10-01" strict-transport-security: @@ -96,13 +95,13 @@ interactions: x-ratelimit-remaining-requests: - "29999" x-ratelimit-remaining-tokens: - - "149999901" + - "149999913" x-ratelimit-reset-requests: - 2ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_fdcfe4f30d07daf21fc88414f0358901 + - req_daa6490ff2cc82dbb9cea5befb1d758d status: code: 200 message: OK From d7ef73aa6f7f2cfd0e45f313d7e8b5707b565e6f Mon Sep 17 00:00:00 2001 From: Andrew White Date: Wed, 11 Dec 2024 09:04:05 -0800 Subject: [PATCH 4/4] Removed prints --- src/aviary/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/aviary/utils.py b/src/aviary/utils.py index d7a9f269..46c45935 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -118,7 +118,6 @@ async def extract_answer_llm( proposed_answer=proposed, ) - print("prompt", prompt) response = await acompletion( model=config["model"], temperature=config["temperature"], @@ -126,7 +125,6 @@ async def extract_answer_llm( ) extracted = response.choices[0].message.content.strip() - print("here it is", extracted) for option in options: if extracted.casefold() == option.casefold().strip(): return option