From b594d83d581b6d66314cb841a07dea060a02122b Mon Sep 17 00:00:00 2001
From: James Braza <jamesbraza@gmail.com>
Date: Wed, 18 Dec 2024 13:35:44 -0800
Subject: [PATCH 1/3] Exported
 https://github.com/Future-House/aviary/pull/148's contents

---
 src/aviary/core.py                            |   2 +
 src/aviary/utils.py                           |  39 ++++++-
 .../test_extract_answer[complex].yaml         | 109 ++++++++++++++++++
 .../test_extract_answer[empty-proposal].yaml  | 102 ++++++++++++++++
 .../test_extract_answer[gave-two].yaml        | 102 ++++++++++++++++
 .../test_extract_answer[not in options].yaml  | 108 +++++++++++++++++
 tests/test_utils.py                           |  31 ++++-
 7 files changed, 491 insertions(+), 2 deletions(-)
 create mode 100644 tests/cassettes/test_extract_answer[complex].yaml
 create mode 100644 tests/cassettes/test_extract_answer[empty-proposal].yaml
 create mode 100644 tests/cassettes/test_extract_answer[gave-two].yaml
 create mode 100644 tests/cassettes/test_extract_answer[not in options].yaml

diff --git a/src/aviary/core.py b/src/aviary/core.py
index 587e76b5..6170a7bd 100644
--- a/src/aviary/core.py
+++ b/src/aviary/core.py
@@ -40,6 +40,7 @@
     EvalAnswerMode,
     encode_image_to_base64,
     eval_answer,
+    extract_answer,
     is_coroutine_callable,
     partial_format,
 )
@@ -82,6 +83,7 @@
     "encode_image_to_base64",
     "eval_answer",
     "eval_answer",
+    "extract_answer",
     "fenv",
     "is_coroutine_callable",
     "join",
diff --git a/src/aviary/utils.py b/src/aviary/utils.py
index 35687bd2..2dee1bf1 100644
--- a/src/aviary/utils.py
+++ b/src/aviary/utils.py
@@ -22,7 +22,7 @@
 
 
 DEFAULT_EVAL_MODEL_NAME = "gpt-4o"
-LLM_BOOL_EVAL_CONFIG = {
+LLM_BOOL_EVAL_CONFIG: dict[str, Any] = {
     "prompt": (
         "Here is a question, the correct answer to the question, and a proposed answer"
         " to the question. Please tell me if the proposed answer is correct, given the"
@@ -35,6 +35,18 @@
     "temperature": 0,
 }
 
+# Config for extract_answer: asks the LLM to echo the single matching option
+LLM_EXTRACT_CONFIG = LLM_BOOL_EVAL_CONFIG | {
+    "prompt": (
+        "You are evaluating answers for a test which has fixed options. "
+        "Repeat back which option the proposed answer matches. "
+        "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. "
+        "If the proposed answer is empty, invalid, or ambiguous, "
+        "return an empty string."
+        "\n\nOptions:\n{options}\n\nProposed answer: {proposed_answer}"
+    )
+}
+
 LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | {
     "prompt": (
         "Here is a question, the correct answer to the question, and a rubric for"
@@ -175,6 +187,31 @@ async def eval_answer(
     raise RuntimeError(f"Invalid evaluation mode: {eval_mode}")
 
 
+async def extract_answer(
+    proposed_answer: str, options: Sequence[str], llm_eval_config: dict | None = None
+) -> str | None:
+    """Extract the answer matching a proposal from a list of options using an LLM.
+
+    A proposal equal to an option (ignoring case and outer whitespace) is returned
+    without calling the LLM. llm_eval_config may override the prompt, model, and
+    temperature of LLM_EXTRACT_CONFIG. Returns the matched option, else None.
+    """
+    # Reversed so the first of any options sharing a normalized form wins
+    by_key = {opt.strip().casefold(): opt for opt in reversed(options)}
+    if (exact := by_key.get(proposed_answer.strip().casefold())) is not None:
+        return exact
+    config = LLM_EXTRACT_CONFIG | (llm_eval_config or {})
+    response_msg = await run_prompt(
+        prompt=config["prompt"].format(
+            options="\n".join(options), proposed_answer=proposed_answer
+        ),
+        model=config["model"],
+        temperature=config["temperature"],
+    )
+    # The LLM is told to repeat an option verbatim; anything else is no match
+    return by_key.get(response_msg.strip().casefold())
+
+
 _CAPITAL_A_INDEX = ord("A")
 
 
diff --git a/tests/cassettes/test_extract_answer[complex].yaml b/tests/cassettes/test_extract_answer[complex].yaml
new file mode 100644
index 00000000..319c6183
--- /dev/null
+++ b/tests/cassettes/test_extract_answer[complex].yaml
@@ -0,0 +1,109 @@
+interactions:
+  - request:
+      body:
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\nEconomic factors\nSocial
+        unrest\nPolitical corruption\n\nProposed answer: Based on the context given,
+        Serif et al. (2026) claim that the overwhelming cause of regime collapse arises
+        from economic factors. Yet, most other scholars (Gerald and Robinson for example)
+        believe the collapse was due to social unrest because of the prolonged epidemic
+        of 2025. I tend to agree with the majority - although I can see both sides.
+        Thus my response is that the social unrest was the significant factor in the
+        collapse of the regime.", "role": "user"}], "model": "gpt-4o", "temperature":
+        0}'
+      headers:
+        accept:
+          - application/json
+        accept-encoding:
+          - gzip, deflate
+        connection:
+          - keep-alive
+        content-length:
+          - "861"
+        content-type:
+          - application/json
+        host:
+          - api.openai.com
+        user-agent:
+          - AsyncOpenAI/Python 1.57.4
+        x-stainless-arch:
+          - arm64
+        x-stainless-async:
+          - async:asyncio
+        x-stainless-lang:
+          - python
+        x-stainless-os:
+          - MacOS
+        x-stainless-package-version:
+          - 1.57.4
+        x-stainless-raw-response:
+          - "true"
+        x-stainless-retry-count:
+          - "1"
+        x-stainless-runtime:
+          - CPython
+        x-stainless-runtime-version:
+          - 3.12.7
+      method: POST
+      uri: https://api.openai.com/v1/chat/completions
+    response:
+      body:
+        string: !!binary |
+          H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArLc4Oa9E9KtrJ1REyAUOTal8TF8RnbgaKq3x05TZNW
+          FInFw/3uPb87+xARQqWgOaG8Zp43RsXr8nP/9VCtNKzbGuYvJSab3cdj+rySiw2dBAVud8D9WXXH
+          sTEKvER9wtwC8xBck2w2Xyyy5SztQIMCVJBVxsdzjNNpOo+nq3i67IU1Sg6O5uQ1IoSQQ3eGiFrA
+          nuZkOjlXGnCOVUDzoYkQalGFCmXOSeeZ9nQyQo7ag+5SPyGXTJFWW3BXPRbK1rEQUbdK9fXjcKnC
+          yljcup4P9VJq6erCAnOowwXOo6EdPUaEvHXDtVd5qbHYGF94fAcdDJPl4uRHx3WONO2ZR8/UpSib
+          3LArBHgmlbvYDuWM1yBG6bhK1gqJFyC6GPp3mFvep8Glrv5jPwLOwXgQhbEgJL8eeGyzED7bX23D
+          krvA1H07D01RSl2BNVae3rs0BcvuxUownpQ0OkY/AAAA//8DAOEzla34AgAA
+      headers:
+        CF-Cache-Status:
+          - DYNAMIC
+        CF-RAY:
+          - 8f42461018c5eb29-SJC
+        Connection:
+          - keep-alive
+        Content-Encoding:
+          - gzip
+        Content-Type:
+          - application/json
+        Date:
+          - Wed, 18 Dec 2024 21:33:52 GMT
+        Server:
+          - cloudflare
+        Transfer-Encoding:
+          - chunked
+        X-Content-Type-Options:
+          - nosniff
+        access-control-expose-headers:
+          - X-Request-ID
+        alt-svc:
+          - h3=":443"; ma=86400
+        openai-organization:
+          - future-house-xr4tdh
+        openai-processing-ms:
+          - "235"
+        openai-version:
+          - "2020-10-01"
+        strict-transport-security:
+          - max-age=31536000; includeSubDomains; preload
+        x-ratelimit-limit-requests:
+          - "10000"
+        x-ratelimit-limit-tokens:
+          - "30000000"
+        x-ratelimit-remaining-requests:
+          - "9999"
+        x-ratelimit-remaining-tokens:
+          - "29999790"
+        x-ratelimit-reset-requests:
+          - 6ms
+        x-ratelimit-reset-tokens:
+          - 0s
+        x-request-id:
+          - req_366dfd5f505d08facd0f7d10e64a9f5e
+      status:
+        code: 200
+        message: OK
+version: 1
diff --git a/tests/cassettes/test_extract_answer[empty-proposal].yaml b/tests/cassettes/test_extract_answer[empty-proposal].yaml
new file mode 100644
index 00000000..cb61987f
--- /dev/null
+++ b/tests/cassettes/test_extract_answer[empty-proposal].yaml
@@ -0,0 +1,102 @@
+interactions:
+  - request:
+      body:
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
+        ", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+      headers:
+        accept:
+          - application/json
+        accept-encoding:
+          - gzip, deflate
+        connection:
+          - keep-alive
+        content-length:
+          - "369"
+        content-type:
+          - application/json
+        host:
+          - api.openai.com
+        user-agent:
+          - AsyncOpenAI/Python 1.57.4
+        x-stainless-arch:
+          - arm64
+        x-stainless-async:
+          - async:asyncio
+        x-stainless-lang:
+          - python
+        x-stainless-os:
+          - MacOS
+        x-stainless-package-version:
+          - 1.57.4
+        x-stainless-raw-response:
+          - "true"
+        x-stainless-retry-count:
+          - "1"
+        x-stainless-runtime:
+          - CPython
+        x-stainless-runtime-version:
+          - 3.12.7
+      method: POST
+      uri: https://api.openai.com/v1/chat/completions
+    response:
+      body:
+        string: !!binary |
+          H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyht0pb2xokLIEDiAkKRa29Sg+O1bBcFqv47cpq+
+          1CJx8WFmZzyz9jphDJSEOQOx5EE0Vqc31Vfb3mXF/fOoeDG3Ol+0D4WZPD79uOoVBlFBiw8UYae6
+          EtRYjUGR2dLCIQ8YXYfTvBiPp5M874iGJOooq21IC0pH2ahIs+s0m/TCJSmBHubsLWGMsXV3xohG
+          Ygtzlg12SIPe8xphvh9iDBzpiAD3XvnATYDBgRRkApou9THssFp5HlOZldY9vtnfo6m2jha+5/d4
+          pYzyy9Ih92Sipw9koWM3CWPvXZ/VSUSwjhobykCfaKLhdLi1g8MCD2RfFQIFri9oTsxKiYEr7Y/W
+          AYKLJcozQ8aAr6SiIyI5qnye5ZL3trYy9X/sD4QQaAPK0jqUSlzs25nH3/XX2H7FXWDw3z5gU1bK
+          1OisU9sHrmw5q/iCz6osv4Zkk/wCAAD//wMA7iORIukCAAA=
+      headers:
+        CF-Cache-Status:
+          - DYNAMIC
+        CF-RAY:
+          - 8f424615ca81eb32-SJC
+        Connection:
+          - keep-alive
+        Content-Encoding:
+          - gzip
+        Content-Type:
+          - application/json
+        Date:
+          - Wed, 18 Dec 2024 21:33:53 GMT
+        Server:
+          - cloudflare
+        Transfer-Encoding:
+          - chunked
+        X-Content-Type-Options:
+          - nosniff
+        access-control-expose-headers:
+          - X-Request-ID
+        alt-svc:
+          - h3=":443"; ma=86400
+        openai-organization:
+          - future-house-xr4tdh
+        openai-processing-ms:
+          - "171"
+        openai-version:
+          - "2020-10-01"
+        strict-transport-security:
+          - max-age=31536000; includeSubDomains; preload
+        x-ratelimit-limit-requests:
+          - "10000"
+        x-ratelimit-limit-tokens:
+          - "30000000"
+        x-ratelimit-remaining-requests:
+          - "9999"
+        x-ratelimit-remaining-tokens:
+          - "29999912"
+        x-ratelimit-reset-requests:
+          - 6ms
+        x-ratelimit-reset-tokens:
+          - 0s
+        x-request-id:
+          - req_de2070d3e02afd584ac618042c22382d
+      status:
+        code: 200
+        message: OK
+version: 1
diff --git a/tests/cassettes/test_extract_answer[gave-two].yaml b/tests/cassettes/test_extract_answer[gave-two].yaml
new file mode 100644
index 00000000..a70e7e68
--- /dev/null
+++ b/tests/cassettes/test_extract_answer[gave-two].yaml
@@ -0,0 +1,102 @@
+interactions:
+  - request:
+      body:
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
+        A or B", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+      headers:
+        accept:
+          - application/json
+        accept-encoding:
+          - gzip, deflate
+        connection:
+          - keep-alive
+        content-length:
+          - "375"
+        content-type:
+          - application/json
+        host:
+          - api.openai.com
+        user-agent:
+          - AsyncOpenAI/Python 1.57.4
+        x-stainless-arch:
+          - arm64
+        x-stainless-async:
+          - async:asyncio
+        x-stainless-lang:
+          - python
+        x-stainless-os:
+          - MacOS
+        x-stainless-package-version:
+          - 1.57.4
+        x-stainless-raw-response:
+          - "true"
+        x-stainless-retry-count:
+          - "1"
+        x-stainless-runtime:
+          - CPython
+        x-stainless-runtime-version:
+          - 3.12.7
+      method: POST
+      uri: https://api.openai.com/v1/chat/completions
+    response:
+      body:
+        string: !!binary |
+          H4sIAAAAAAAAA4xSTWsCMRS8768I7+wWv7XeChVKoR4KPbSlLDF5u5s2m5cmWVHE/16yWl3RQi85
+          zLyZzLxkmzAGSsKMgSh5EJXV6V2+Wq82z+Ut0n3xstBv8+H8cWFK9/r08A2dqKDlJ4rwq7oRVFmN
+          QZHZ08IhDxhde5PBcDSajAe9hqhIoo6ywoZ0SGm/2x+m3WnaHR+EJSmBHmbsPWGMsW1zxohG4hpm
+          rNv5RSr0nhcIs+MQY+BIRwS498oHbgJ0TqQgE9A0qduww7z2PKYytdYHfHe8R1NhHS39gT/iuTLK
+          l5lD7slETx/IQsPuEsY+mj71WUSwjiobskBfaKLhZLC3g9MCT+ShKgQKXF/RnJllEgNX2rfWAYKL
+          EuWFIWPAa6moRSStypdZrnnvaytT/Mf+RAiBNqDMrEOpxNW+jXn8XX+NHVfcBAa/8QGrLFemQGed
+          2j9wbjPZny4Fx8m0D8ku+QEAAP//AwDYqi3B6QIAAA==
+      headers:
+        CF-Cache-Status:
+          - DYNAMIC
+        CF-RAY:
+          - 8f42460a68a467f1-SJC
+        Connection:
+          - keep-alive
+        Content-Encoding:
+          - gzip
+        Content-Type:
+          - application/json
+        Date:
+          - Wed, 18 Dec 2024 21:33:51 GMT
+        Server:
+          - cloudflare
+        Transfer-Encoding:
+          - chunked
+        X-Content-Type-Options:
+          - nosniff
+        access-control-expose-headers:
+          - X-Request-ID
+        alt-svc:
+          - h3=":443"; ma=86400
+        openai-organization:
+          - future-house-xr4tdh
+        openai-processing-ms:
+          - "241"
+        openai-version:
+          - "2020-10-01"
+        strict-transport-security:
+          - max-age=31536000; includeSubDomains; preload
+        x-ratelimit-limit-requests:
+          - "10000"
+        x-ratelimit-limit-tokens:
+          - "30000000"
+        x-ratelimit-remaining-requests:
+          - "9999"
+        x-ratelimit-remaining-tokens:
+          - "29999911"
+        x-ratelimit-reset-requests:
+          - 6ms
+        x-ratelimit-reset-tokens:
+          - 0s
+        x-request-id:
+          - req_83d07d0983e1d4d1995bfa068db503dd
+      status:
+        code: 200
+        message: OK
+version: 1
diff --git a/tests/cassettes/test_extract_answer[not in options].yaml b/tests/cassettes/test_extract_answer[not in options].yaml
new file mode 100644
index 00000000..e48c8e4c
--- /dev/null
+++ b/tests/cassettes/test_extract_answer[not in options].yaml	
@@ -0,0 +1,108 @@
+interactions:
+  - request:
+      body:
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\nB\nC\n\nProposed answer:
+        F", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+      headers:
+        accept:
+          - application/json
+        accept-encoding:
+          - gzip, deflate
+        connection:
+          - keep-alive
+        content-length:
+          - "367"
+        content-type:
+          - application/json
+        host:
+          - api.openai.com
+        user-agent:
+          - AsyncOpenAI/Python 1.57.4
+        x-stainless-arch:
+          - arm64
+        x-stainless-async:
+          - async:asyncio
+        x-stainless-lang:
+          - python
+        x-stainless-os:
+          - MacOS
+        x-stainless-package-version:
+          - 1.57.4
+        x-stainless-raw-response:
+          - "true"
+        x-stainless-retry-count:
+          - "0"
+        x-stainless-runtime:
+          - CPython
+        x-stainless-runtime-version:
+          - 3.12.7
+      method: POST
+      uri: https://api.openai.com/v1/chat/completions
+    response:
+      body:
+        string: !!binary |
+          H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArr5gYl/d9uLIxIMLBQFLn2JXFxfJbtQKuq3x05DU0r
+          QGLJ8H73Xt5dckwYAyVhzUDUPIjG6vS+/Ni3T5TnlpYPyA/ukT4r2r3kcqKeYRQdtN2hCN+uO0GN
+          1RgUmTMWDnnAmJovJtPZbDGfZB1oSKKOtsqGdErpOBtP02yZZvPeWJMS6GHNXhPGGDt2z1jRSNzD
+          mnUxndKg97xCWF+GGANHOirAvVc+cBNgNEBBJqDpWm9gA9fIYdl6HpuZVuteP13epamyjra+5xe9
+          VEb5unDIPZmY6wNZ6OgpYeyt26m9qQnWUWNDEegdTQycr85xMBxxgHnPAgWuB3nRn+E2rJAYuNL+
+          6iQguKhRDs7hfryViq5AcrXyzy6/ZZ/XVqb6T/wAhEAbUBbWoVTidt9hzGH8w/4au5y4Kwz+4AM2
+          RalMhc46df7IpS1WJd/yVZlNlpCcki8AAAD//wMAJ6aXP+0CAAA=
+      headers:
+        CF-Cache-Status:
+          - DYNAMIC
+        CF-RAY:
+          - 8f4246051c1ceb26-SJC
+        Connection:
+          - keep-alive
+        Content-Encoding:
+          - gzip
+        Content-Type:
+          - application/json
+        Date:
+          - Wed, 18 Dec 2024 21:33:50 GMT
+        Server:
+          - cloudflare
+        Set-Cookie:
+          - __cf_bm=in2yLMzYdxfPIHSQDPq17chYgGhrTOolB6HZrJk9Iy8-1734557630-1.0.1.1-LYUU4oNWUKO8gNwjcIktYjnSyIsGLKQGmQKI54P4UxfMJ330MXeZFWWhVoJnP0b1M92ejFaHTWHlz4eHH30gIA;
+            path=/; expires=Wed, 18-Dec-24 22:03:50 GMT; domain=.api.openai.com; HttpOnly;
+            Secure; SameSite=None
+          - _cfuvid=El.JjK6nMT19ye2jesHXCIAySeg4BN7pKN7mVnzqSM8-1734557630598-0.0.1.1-604800000;
+            path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
+        Transfer-Encoding:
+          - chunked
+        X-Content-Type-Options:
+          - nosniff
+        access-control-expose-headers:
+          - X-Request-ID
+        alt-svc:
+          - h3=":443"; ma=86400
+        openai-organization:
+          - future-house-xr4tdh
+        openai-processing-ms:
+          - "253"
+        openai-version:
+          - "2020-10-01"
+        strict-transport-security:
+          - max-age=31536000; includeSubDomains; preload
+        x-ratelimit-limit-requests:
+          - "10000"
+        x-ratelimit-limit-tokens:
+          - "30000000"
+        x-ratelimit-remaining-requests:
+          - "9999"
+        x-ratelimit-remaining-tokens:
+          - "29999912"
+        x-ratelimit-reset-requests:
+          - 6ms
+        x-ratelimit-reset-tokens:
+          - 0s
+        x-request-id:
+          - req_5992eb433053f3b82b29a5319a96ef7e
+      status:
+        code: 200
+        message: OK
+version: 1
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 3469f01c..e3e7af1c 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from aviary.core import eval_answer
+from aviary.core import eval_answer, extract_answer
 from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion
 from tests.conftest import VCR_DEFAULT_MATCH_ON
 
@@ -39,6 +39,35 @@ async def test_eval_answer(
     assert await eval_answer(proposed, correct, question, eval_mode) == expected
 
 
+@pytest.mark.vcr
+@pytest.mark.parametrize(
+    ("proposed_answer", "options", "expected"),
+    [
+        pytest.param("A", ["A", "B", "C"], "A", id="exact-uppercase"),
+        pytest.param("a", ["A", "B", "C"], "A", id="exact-lowercase"),
+        pytest.param("F", ["B", "C"], None, id="not in options"),
+        pytest.param("A or B", ["A", "B", "C"], None, id="gave-two"),
+        pytest.param(
+            "Based on the context given, Serif et al. (2026) claim that "
+            "the overwhelming cause of regime collapse arises from economic factors. "
+            "Yet, most other scholars (Gerald and Robinson for example) believe the collapse "
+            "was due to social unrest because of the prolonged epidemic of 2025. I tend to agree "
+            "with the majority - although I can see both sides. Thus my response "
+            "is that the social unrest was the significant factor in the collapse of the regime.",
+            ["Economic factors", "Social unrest", "Political corruption"],
+            "Social unrest",
+            id="complex",
+        ),
+        pytest.param("", ["A", "B", "C"], None, id="empty-proposal"),
+    ],
+)
+@pytest.mark.asyncio
+async def test_extract_answer(
+    proposed_answer: str, options: Sequence[str], expected: str | None
+) -> None:
+    assert await extract_answer(proposed_answer, options) == expected
+
+
 @pytest.mark.vcr
 @pytest.mark.asyncio
 async def test_eval_llm_config():

From d018adfaaba6622a6c4ef11293a76dff9804e0a9 Mon Sep 17 00:00:00 2001
From: James Braza <jamesbraza@gmail.com>
Date: Wed, 18 Dec 2024 13:47:38 -0800
Subject: [PATCH 2/3] Moved MultipleChoiceQuestion to now use extract_answer

---
 src/aviary/utils.py                           | 64 ++++++-------------
 ...t-match-and-llm-has-innate-knowledge].yaml | 39 ++++++-----
 ...nt-match-and-no-llm-innate-knowledge].yaml | 39 ++++++-----
 ...AEvaluation.test_grade[empty-answer1].yaml | 39 ++++++-----
 ...AEvaluation.test_grade[empty-answer2].yaml | 39 ++++++-----
 ...AEvaluation.test_grade[empty-answer3].yaml | 40 ++++++------
 ...on.test_grade[matched-correct-option].yaml | 47 ++++++++------
 ....test_grade[matched-incorrect-option].yaml | 39 ++++++-----
 ...n.test_grade[matched-several-options].yaml | 39 ++++++-----
 ...ion.test_grade[matched-unsure-option].yaml | 40 ++++++------
 tests/test_utils.py                           | 16 ++---
 11 files changed, 205 insertions(+), 236 deletions(-)

diff --git a/src/aviary/utils.py b/src/aviary/utils.py
index 2dee1bf1..fcbf130f 100644
--- a/src/aviary/utils.py
+++ b/src/aviary/utils.py
@@ -3,10 +3,9 @@
 import inspect
 import io
 import random
-import re
 import string
 from ast import literal_eval
-from collections.abc import Awaitable, Callable, Sequence
+from collections.abc import Sequence
 from enum import StrEnum
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, cast
 
@@ -217,16 +216,6 @@ async def extract_answer(
 
 class MultipleChoiceQuestion(BaseModel):
     QUESTION_PROMPT_TEMPLATE: ClassVar[str] = "Q: {question}\n\nOptions:\n{options}"
-    # TODO: combine with above eval_answer and its prompts
-    EVALUATION_PROMPT_TEMPLATE: ClassVar[str] = (
-        "Given the following question and a proposed answer to the question, return the"
-        " single-letter choice in the question that matches the proposed answer."
-        " If the proposed answer is blank or an empty string,"
-        " or multiple options are matched, respond with '0'."
-        "\n\nQuestion: {qa_prompt}"
-        "\n\nProposed Answer: {qa_answer}"
-        "\n\nSingle Letter Answer:"
-    )
     DEFAULT_UNSURE_OPTION: ClassVar[str] = (
         "Insufficient information to answer this question"
     )
@@ -317,18 +306,14 @@ def split_options(options: str) -> list[str]:
         return split_options
 
     async def grade(
-        self, answer: str, prompt_runner: Callable[[str], Awaitable[str]] | None = None
-    ) -> "tuple[MultipleChoiceEvaluation, str, str]":
-        if prompt_runner is None:
-            prompt_runner = run_prompt
-        eval_prompt = self.EVALUATION_PROMPT_TEMPLATE.format(
-            qa_prompt=self.question_prompt, qa_answer=answer
-        )
-        raw_evaluation = await prompt_runner(eval_prompt)
-        evaluation, parsed_answer = MultipleChoiceEvaluation.from_answer(
-            raw_evaluation, self
+        self, proposed_answer: str
+    ) -> "tuple[MultipleChoiceEvaluation, str | None]":
+        extracted_answer = await extract_answer(
+            proposed_answer=proposed_answer, options=self.options
         )
-        return evaluation, raw_evaluation, parsed_answer
+        return MultipleChoiceEvaluation.from_answer(
+            extracted_answer, self
+        ), extracted_answer
 
 
 class MultipleChoiceEvaluation(StrEnum):
@@ -360,32 +345,19 @@ def calculate_accuracy_precision(
 
     @classmethod
     def from_answer(
-        cls, answer: str, question: MultipleChoiceQuestion
-    ) -> "tuple[MultipleChoiceEvaluation, str]":
+        cls, extracted_answer: str | None, question: MultipleChoiceQuestion
+    ) -> "MultipleChoiceEvaluation":
         """Make an evaluation from the input answer and multiple choice question.
 
         Returns:
-            Two-tuple of answer enum and the raw answer extracted from the input answer.
+            Evaluation corresponding to the parsed answer.
         """
-        # SEE: https://regex101.com/r/vcE9Hb/1
-        letter_search = re.search(r"([A-Z])\)?", answer, re.DOTALL)
-        # Get the letter answer, or fail over to the first non-whitespace char
-        answer_char = (
-            letter_search.group(1)
-            if letter_search is not None
-            else answer.split()[0][0].upper()
-        )
-        answer_letter_index = ord(answer_char[0]) - _CAPITAL_A_INDEX
-        if answer_letter_index < 0 or answer_letter_index > len(question.options):
-            # The result extracted was not in the options (e.g. '0')
-            return cls.INCORRECT, answer_char
+        if extracted_answer is None:
+            return MultipleChoiceEvaluation.INCORRECT
         # From here, if we don't match either the ideal or the unsure multiple choice
         # options then we declare the answer as incorrect.
-        if (
-            question.unsure_answer_index is not None
-            and answer_letter_index == question.unsure_answer_index
-        ):
-            return cls.UNSURE, cast(str, question.unsure_answer)
-        if answer_letter_index == question.ideal_answer_index:
-            return cls.CORRECT, question.ideal_answer
-        return cls.INCORRECT, question.options[answer_letter_index]
+        if extracted_answer == question.ideal_answer:
+            return MultipleChoiceEvaluation.CORRECT
+        if question.unsure_answer and extracted_answer == question.unsure_answer:
+            return MultipleChoiceEvaluation.UNSURE
+        return MultipleChoiceEvaluation.INCORRECT
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml
index be2df091..c87a0ba3 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml
@@ -1,13 +1,12 @@
 interactions:
   - request:
       body:
-        '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: 14\n\nSingle
-        Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information
+        to answer this question\ncheesecake\n11\n42\n\nProposed answer: 14", "role":
+        "user"}], "model": "gpt-4o", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -16,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "513"
+          - "437"
         content-type:
           - application/json
         host:
@@ -46,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hVC57goqamd3HpooPQBORVSilGktaNW1gpJoY+Q/15k
-          u7FDU+jFh/l2xrNr7xNCqJJ0QajY8iAaq9Prav24NO7rNmMKH55ulvd8VaxXHu/ejaOT6MDNK4jw
-          47oQ2FgNQaHpsHDAA8TUaX6ZZTnLi3kLGpSgo622Ic0wnbFZlrIiZVe9cYtKgKcL8pwQQsi+fcaK
-          RsIHXRA2+VEa8J7XQBfHIUKoQx0Vyr1XPnAT6GSAAk0A07ZmY91BtfM81jI7rXv9cHyRxto63Pie
-          H/VKGeW3pQPu0cRQH9DSlh4SQl7ahXYnHal12NhQBnwDEwOnrOjy6HDCEe1ZwMD12DSfnIkrJQSu
-          tB9dhAoutiAH63A+vpMKRyAZLf27zLnsbnFl6v/ED0AIsAFkaR1IJU4XHsYcxB/sr7HjkdvC1H/6
-          AE1ZKVODs05137iyJc/nspBcTCuaHJJvAAAA//8DAGY5XevsAgAA
+          H4sIAAAAAAAAA4xSy07DMBC85yusPTcobVJIe6u4wKESIHFCKHKdTWpwvJa9FaCq/46cvtUicfFh
+          Zmc8s/Y6EQJ0DVMBailZdc6ks+Zrko2cxvljMXvunmavPHkJTA9zk9/DICpo8YGK96obRZ0zyJrs
+          llYeJWN0Hd7lxXhc5sOyJzqq0URZ6zgtKB1loyLNyjS73QmXpBUGmIq3RAgh1v0ZI9oav2EqssEe
+          6TAE2SJMD0NCgCcTEZAh6MDSMgyOpCLLaPvUp7DHZhVkTGVXxuzwzeEeQ63ztAg7/oA32uqwrDzK
+          QDZ6BiYHPbtJhHjv+6zOIoLz1DmumD7RRsNyvLWD4wKP5K4qMLE0VzRnZlWNLLUJJ+sAJdUS6wtD
+          IUCuak0nRHJS+TLLNe9tbW3b/9gfCaXQMdaV81hrdbVvbx5/119jhxX3gSH8BMauarRt0Tuvtw/c
+          uGrSyIWcNFleQrJJfgEAAP//AwA5BypS6QIAAA==
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fde1cf88cf1b-SJC
+          - 8f4256d28f9615ff-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:29 GMT
+          - Wed, 18 Dec 2024 21:45:18 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -79,7 +78,7 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "363"
+          - "208"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
@@ -91,13 +90,13 @@ interactions:
         x-ratelimit-remaining-requests:
           - "9999"
         x-ratelimit-remaining-tokens:
-          - "29999874"
+          - "29999896"
         x-ratelimit-reset-requests:
           - 6ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_aff8daa48aa43d3df077f97da6136e5a
+          - req_7db1d1f6dded4679e43cc12a2183fa21
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml
index 38077163..f043c502 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml
@@ -1,13 +1,12 @@
 interactions:
   - request:
       body:
-        '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer
-        is 14004\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
+        to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer
+        is 14004", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -16,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "536"
+          - "459"
         content-type:
           - application/json
         host:
@@ -46,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJJLT8MwEITv+RWWzw1KS2lCb3BCSDykcuAhFBl7kxocr2Vveajqf0dO
-          2yQVIHHJYb6dyewm64QxrhWfMy6XgmTjTHpWPV6fFzf1Jcwebu4WF4tbunKFwvzj/n3GR9GBL68g
-          ae86ktg4A6TRbrH0IAhi6jg/nk7zLC9OWtCgAhNttaN0iukkm0zTrEizXa5copYQ+Jw9JYwxtm6f
-          saJV8MnnLBvtlQZCEDXweTfEGPdoosJFCDqQsMRHPZRoCWzbOhvqHqpVELGWXRmz0zfdiwzWzuNL
-          2PFOr7TVYVl6EAFtDA2Ejrd0kzD23C60OujIncfGUUn4BjYGjscn2zzen3BAd4yQhBmaZqNf4koF
-          JLQJg4twKeQSVG/tzydWSuMAJIOlf5b5LXu7uLb1f+J7ICU4AlU6D0rLw4X7MQ/xB/trrDtyW5iH
-          r0DQlJW2NXjn9fYbV64U+akqlJDjiieb5BsAAP//AwBRMcSQ7AIAAA==
+          H4sIAAAAAAAAAwAAAP//jJLNTsMwEITveQprzw1K/6DNDakIwYkLqgRFkeNsEhfHNrarUlV9d2S3
+          TVJRJC4+7Lcznl17HxECvICUAKupY40W8X25ndktPqzrRb37envdVk/LxfKRvkx4/gwDr1D5Gpk7
+          q26YarRAx5U8YmaQOvSuw7vxZDqdjYdJAI0qUHhZpV08UfEoGU3iZBYntydhrThDCyl5jwghZB9O
+          H1EW+A0pCTah0qC1tEJI2yZCwCjhK0Ct5dZR6WDQQaakQxlSr2AFfWSw3Fjqk8mNEKf6ob1LqEob
+          ldsTb+sll9zWmUFqlfS+1ikNgR4iQj7CTJuLmKCNarTLnPpE6Q3nw6MddEvs4Jk55ajoaUaDK2ZZ
+          gY5yYXsrAUZZjUWn7PZHNwVXPRD1Rv6d5Zr3cWwuq//Yd4Ax1A6LTBssOLuct2sz6H/YX23tikNg
+          sDvrsMlKLis02vDjI5c6m5c0p/MyGc8gOkQ/AAAA//8DAEsANTftAgAA
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fdc63fbf9e53-SJC
+          - 8f42569f7d2a2379-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:25 GMT
+          - Wed, 18 Dec 2024 21:45:10 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -79,7 +78,7 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "212"
+          - "240"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
@@ -91,13 +90,13 @@ interactions:
         x-ratelimit-remaining-requests:
           - "9999"
         x-ratelimit-remaining-tokens:
-          - "29999868"
+          - "29999890"
         x-ratelimit-reset-requests:
           - 6ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_afd8c66d84f3b42a8cd2b8a6bf855054
+          - req_363f6da2908247ad8c711b11d1593ae7
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml
index 057ef1d0..662716da 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml
@@ -1,13 +1,12 @@
 interactions:
   - request:
       body:
-        '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: \n\nSingle
-        Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
+        to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: ", "role":
+        "user"}], "model": "gpt-4o", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -16,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "517"
+          - "440"
         content-type:
           - application/json
         host:
@@ -46,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jFLLTsMwELznKyyfG+S2gaa5FegFJKRKnEAocp1NaurYlr3hoar/jpyG
-          JBUgcfFhZmc8O/YhIoTKgmaEih1HUVsVr8qnh/XbzfvdI7LN7WzF9vf8Mik31xtYN3QSFGb7CgK/
-          VRfC1FYBSqNPtHDAEYLrdDFPkgVbpGlL1KYAFWSVxTgx8YzNkpilMbvqhDsjBXiakeeIEEIO7Rki
-          6gI+aEbY5BupwXteAc36IUKoMyoglHsvPXKNdDKQwmgE3aZmY9xB2XgeYulGqQ4/9hcpU1lntr7j
-          e7yUWvpd7oB7o4OpR2Npyx4jQl7ahZqzjNQ6U1vM0exBB8MpW5786FDhiO04NMjVCJ52LZzb5QUg
-          l8qPGqGCix0Ug3SojzeFNCMiGi39M8xv3qfFpa7+Yz8QQoBFKHLroJDifOFhzEH4YH+N9SW3gan/
-          9Ah1XkpdgbNOnt64tPmy5Fu+LNk8pdEx+gIAAP//AwDTwVpp7AIAAA==
+          H4sIAAAAAAAAA4xSy27CMBC85yusPZMqkAApt6rHSr0U9VJVkbE3wa3jtWyj8hD/XjlAAEGlXnyY
+          2RnPrL1LGAMlYcZALHkQrdXpU/1TbhfrIn+j+Vxs281Cv7/k9JrN1/gMg6igxReKcFI9CGqtxqDI
+          HGjhkAeMrsNpXozHZT6cdkRLEnWUNTakBaWjbFSkWZlmk6NwSUqghxn7SBhjbNedMaKRuIYZywYn
+          pEXveYMw64cYA0c6IsC9Vz5wE2BwJgWZgKZLfQk7rFeex1RmpfUR3/f3aGqso4U/8j1eK6P8snLI
+          PZno6QNZ6Nh9wthn12d1FRGso9aGKtA3mmhYTg52cF7gmTxWhUCB6zuaK7NKYuBK+4t1gOBiifLG
+          kDHgK6nogkguKt9mued9qK1M8x/7MyEE2oCysg6lEnf7dubxd/011q+4Cwx+4wO2Va1Mg846dXjg
+          2lZ8+ihLycWwhmSf/AIAAP//AwBsWlME6QIAAA==
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fddcea1d251d-SJC
+          - 8f4256cce931eb29-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:28 GMT
+          - Wed, 18 Dec 2024 21:45:17 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -79,7 +78,7 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "174"
+          - "217"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
@@ -91,13 +90,13 @@ interactions:
         x-ratelimit-remaining-requests:
           - "9999"
         x-ratelimit-remaining-tokens:
-          - "29999872"
+          - "29999895"
         x-ratelimit-reset-requests:
           - 6ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_4d40eb2c66dfd308a7b75c7cd80c405b
+          - req_520872e529ccbb680d27a3729fbe637e
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml
index a0acce15..483daa67 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml
@@ -1,13 +1,12 @@
 interactions:
   - request:
       body:
-        '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: \n\nSingle Letter
-        Answer:", "role": "user"}], "model": "gpt-4o"}'
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information
+        to answer this question\ncheesecake\n11\n42\n\nProposed answer: ", "role": "user"}],
+        "model": "gpt-4o", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -16,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "511"
+          - "435"
         content-type:
           - application/json
         host:
@@ -46,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJJRT4MwFIXf+RVNn4eBicPx5oszWTJjfNBoDOnaC9SVtmlLMrPsv5sW
-          HCzOxBceznfP4dwLhwghzBkuEKYNcbTVIr6r3jar1eb5IXtKb7brPL9/3e/a5fxFrOtHPPMOtf0E
-          6n5cV1S1WoDjSvaYGiAOfGqaX2dZnuTLJIBWMRDeVmsXZyqeJ/MsTm7jZDEYG8UpWFyg9wghhA7h
-          6StKBntcoBATlBasJTXg4jSEEDZKeAUTa7l1RDo8GyFV0oEMrZOpbqDqLPG1ZCfEoB9PLxKq1kZt
-          7cBPesUlt01pgFglfah1SuNAjxFCH2Gh7qwj1ka12pVO7UD6wDRZ9Hl4POGEDswpR8TUlM8uxJUM
-          HOHCTi6CKaENsNE6no90jKsJiCZL/y5zKbtfnMv6P/EjoBS0A1ZqA4zT84XHMQP+B/tr7HTkUBjb
-          L+ugLSsuazDa8P4bV7ok+ZLdMkLTCkfH6BsAAP//AwBwbnWk7AIAAA==
+          H4sIAAAAAAAAAwAAAP//jFKxbsIwFNzzFdabSRUg0CRbh7ZDB9Shqqqqioz9Etw6tmU7AoT498oB
+          QhBU6uLh7t357tm7iBAQHAoCbEU9a4yMH6p1Pp5/uKf59rF5b95e08X4hanFZv3sMhgFhV5+I/Mn
+          1R3TjZHohVYHmlmkHoPr+H6azmbZdJx3RKM5yiCrjY9THU+SSRonWZzMj8KVFgwdFOQzIoSQXXeG
+          iIrjBgqSjE5Ig87RGqHohwgBq2VAgDonnKfKw+hMMq08qi71ELZYtY6GVKqV8ojv+3ukro3VS3fk
+          e7wSSrhVaZE6rYKn89pAx+4jQr66Pu1FRDBWN8aXXv+gCoZZerCD8wLP5LEqeO2pvKG5MCs5eiqk
+          G6wDGGUr5FeGhABtudADIhpUvs5yy/tQW6j6P/ZngjE0HnlpLHLBbvbtzMPv+musX3EXGNzWeWzK
+          SqgarbHi8MCVKfOKLmleJdMMon30CwAA//8DAFd1apnpAgAA
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fde81f05ceb1-SJC
+          - 8f4256d7fd75cf0d-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:30 GMT
+          - Wed, 18 Dec 2024 21:45:19 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -79,7 +78,7 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "332"
+          - "520"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
@@ -91,13 +90,13 @@ interactions:
         x-ratelimit-remaining-requests:
           - "9999"
         x-ratelimit-remaining-tokens:
-          - "29999875"
+          - "29999896"
         x-ratelimit-reset-requests:
           - 6ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_817ca7ae018d7baa48236c7ad4f4f151
+          - req_f943511f12de0306ff59cccd017e98f1
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml
index d70cc972..46a00d4f 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml
@@ -1,14 +1,12 @@
 interactions:
   - request:
       body:
-        '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What method
-        was used to demonstrate that the enzyme PafA is stable after incubation with
-        4M urea for 14 days?\n\nOptions:\nA) cryo EM\nB) Insufficient information to
-        answer this question\nC) NMR\nD) x-ray crystallography\nE) circular dichroism\n\nProposed
-        Answer: \n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\ncryo EM\nInsufficient information
+        to answer this question\nNMR\nx-ray crystallography\ncircular dichroism\n\nProposed
+        answer: ", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -17,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "624"
+          - "467"
         content-type:
           - application/json
         host:
@@ -47,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJLNboMwEITvPIXlM1SEkJJyS9VIlfpz6SmpKuSYhTg1tmVvpEZR3r0y
-          IUDVVOqFw3w7w+zCMSCEipLmhPItQ94YGS2q9eujmL9NXtaJWKW7wxM+r+TD8rC4ny5p6B16swOO
-          F9cN142RgEKrM+YWGIJPnWTTNM3i7G7SgkaXIL2tNhilOkriJI3ieRTfdsatFhwczcl7QAghx/bp
-          K6oSvmhO4vCiNOAcq4Hm/RAh1GrpFcqcEw6ZQhoOkGuFoNrW8Vi3UO0d87XUXspOP/Uvkro2Vm9c
-          x3u9Ekq4bWGBOa18qENtaEtPASEf7UL7Hx2psboxWKD+BOUDJ9PknEeHE45ox1Ajk2PTNLwSV5SA
-          TEg3ugjljG+hHKzD+di+FHoEgtHSv8tcyz4vLlT9n/gBcA4GoSyMhVLwnwsPYxb8D/bXWH/ktjB1
-          B4fQFJVQNVhjxfkbV6aosvkMNrMqzWhwCr4BAAD//wMANO06tewCAAA=
+          H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hViz3FxXs3jFkIooZSQQ3spxSjy2lYrS6qkkJaQ/14k
+          p7ZDUujFhx3N55mVjhEhwDOYE2AldazSIl7kh9lgtZmMt4+r4fPL08NivN5uPvvLw3Kzhp53qN07
+          MvfrumOq0gIdV7KWmUHq0FP7k+FoPJ4OB0kQKpWh8LZCu3ik4kEyGMXJNE7uz8ZScYYW5uQ1IoSQ
+          Y/j6iDLDL5iTgAmTCq2lBcK8OUQIGCX8BKi13DoqHfRakSnpUIbU3bHBfG+pTyX3Qpznp+Y/QhXa
+          qJ09680855LbMjVIrZKeaZ3SENRTRMhb6LO/iAjaqEq71KkPlB4469c4aBfYiueq4JSj4obnApZm
+          6CgXtrMOYJSVmF0BCQG6z7jqCFGn8nWWW+y6NpfFf/CtwBhqh1mqDWac3ewb4P51/XWsWXEIDPbb
+          OqzSnMsCjTa8vuBcp7Oc7ugsT4ZTiE7RDwAAAP//AwBp6cyD6QIAAA==
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fdedda0ceb36-SJC
+          - 8f4256dfbb4a7ac7-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -66,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:31 GMT
+          - Wed, 18 Dec 2024 21:45:20 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -80,7 +78,7 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "259"
+          - "207"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
@@ -92,13 +90,13 @@ interactions:
         x-ratelimit-remaining-requests:
           - "9999"
         x-ratelimit-remaining-tokens:
-          - "29999845"
+          - "29999889"
         x-ratelimit-reset-requests:
           - 6ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_223a9415a5a19029f86768ffbabf3d6f
+          - req_06b5243be6b66dfc6827056b2c49f4c4
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml
index 7f73abaa..f9092a53 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml
@@ -1,13 +1,12 @@
 interactions:
   - request:
       body:
-        '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer
-        is 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
+        to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer
+        is 94107", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -16,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "536"
+          - "459"
         content-type:
           - application/json
         host:
@@ -36,7 +35,7 @@ interactions:
         x-stainless-raw-response:
           - "true"
         x-stainless-retry-count:
-          - "1"
+          - "0"
         x-stainless-runtime:
           - CPython
         x-stainless-runtime-version:
@@ -46,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6ggoXlwyyGKcql6Sx+qkLEXcGtsyzZSoyj/vTKQ
-          QNRU6oXDfDvD7MIpQAhzhjOEaU0cbbSINuXb02Z73L287ndH1ias2C8k3a0OB1Y949A7VPEJ1F1c
-          D1Q1WoDjSvaYGiAOfGqynKfpMl6u0g40ioHwtkq7KFXRLJ6lUbyK4sVgrBWnYHGG3gOEEDp1T19R
-          MvjGGYrDi9KAtaQCnF2HEMJGCa9gYi23jkiHwxFSJR3IrvV2qhsoW0t8LdkKMejn64uEqrRRhR34
-          VS+55LbODRCrpA+1Tmnc0XOA0Ee3UHvTEWujGu1yp75A+sAkeezz8HjCCR2YU46IqWkR3onLGTjC
-          hZ1cBFNCa2CjdTwfaRlXExBMlv5d5l52vziX1X/iR0ApaAcs1wYYp7cLj2MG/A/219j1yF1hbI/W
-          QZOXXFZgtOH9Ny51vi5JQdZlPF/h4Bz8AAAA//8DAKuPA4PsAgAA
+          H4sIAAAAAAAAAwAAAP//jJLLTsMwEEX3+QrL6wSlL9pkx4KHVLFDdIFQ5DiTxOB4LNtRgar/jpyG
+          JBVFYuPFnLnXd8Y+BIRQUdCUUF4zxxsto5tyvzHPt1X5uJvva51v758g2d09oNx+tTT0CszfgLsf
+          1RXHRktwAtUJcwPMgXedrRfL1WqziJMONFiA9LJKu2iJ0TyeL6N4E8XXvbBGwcHSlLwEhBBy6E4f
+          URXwQVMShz+VBqxlFdB0aCKEGpS+Qpm1wjqmHA1HyFE5UF3qZDmL11NmoGwt89FUK2VfPw6XSay0
+          wdz2fKiXQglbZwaYReWNrUNNO3oMCHnthmrPclJtsNEuc/gOyhsms5MdHbc4wnnPHDomJ5pFeMEs
+          K8AxIe1kJ5QzXkMxKscFsrYQOAHBZOTfWS55n8YWqvqP/Qg4B+2gyLSBQvDzecc2A/6L/dU2rLgL
+          TO2nddBkpVAVGG3E6ZVLnSUly1lSxosNDY7BNwAAAP//AwBPnKtg7gIAAA==
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fdc06b78cf26-SJC
+          - 8f42569a694415a4-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,9 +64,15 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:24 GMT
+          - Wed, 18 Dec 2024 21:45:09 GMT
         Server:
           - cloudflare
+        Set-Cookie:
+          - __cf_bm=VhM7SMRUwaYcBGPmD54eZozV9ZASFwCpD2uUzfcbygQ-1734558309-1.0.1.1-PO0cbnzYww6YCdbMGHLbwDXTjy0s_I50cJaqd7OUcIUnT7C0j_EJ9CBwZm8nRmzrv2FivdnDcss9GtsBOjjypw;
+            path=/; expires=Wed, 18-Dec-24 22:15:09 GMT; domain=.api.openai.com; HttpOnly;
+            Secure; SameSite=None
+          - _cfuvid=tUjQ_WfZ7MP0T2BILMqKk0y_GuM5NWbc6M_7vifNjIU-1734558309829-0.0.1.1-604800000;
+            path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
         Transfer-Encoding:
           - chunked
         X-Content-Type-Options:
@@ -79,7 +84,7 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "291"
+          - "215"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
@@ -91,13 +96,13 @@ interactions:
         x-ratelimit-remaining-requests:
           - "9999"
         x-ratelimit-remaining-tokens:
-          - "29999868"
+          - "29999890"
         x-ratelimit-reset-requests:
           - 6ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_89e3d88c7f12861d7e774e452300b36d
+          - req_9990262c4eaef569ab37ffc7cf363319
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml
index 45376f48..a14c592f 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml
@@ -1,13 +1,12 @@
 interactions:
   - request:
       body:
-        '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer
-        is 94106\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
+        to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer
+        is 94106", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -16,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "536"
+          - "459"
         content-type:
           - application/json
         host:
@@ -46,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAA4ySX2uDMBTF3/0UIc912M7WzreVjv1hDPo06BiSJlfNFpMsibBS+t1H1KplHezF
-          h/O753ju1UOAEOYMpwjTkjhaaRHe5tuX1erpfsv3z8s4ed1sHvL1191+UT9WDk+8Q+0+gLqT64qq
-          SgtwXMkWUwPEgU+dJtdxnETJct6ASjEQ3lZoF8YqnEWzOIyWYbTojKXiFCxO0VuAEEKH5ukrSgbf
-          OEXR5KRUYC0pAKf9EELYKOEVTKzl1hHZ1u0gVdKBbFqvx7qBvLbE15K1EJ1+7F8kVKGN2tmO93rO
-          JbdlZoBYJX2odUrjhh4DhN6bheqzjlgbVWmXOfUJ0gdOp/M2Dw8nHNGOOeWIGJsWkwtxGQNHuLCj
-          i2BKaAlssA7nIzXjagSC0dK/y1zKbhfnsvhP/AAoBe2AZdoA4/R84WHMgP/B/hrrj9wUxnZvHVRZ
-          zmUBRhvefuNcZyS5YUtG6DTHwTH4AQAA//8DAK9WW8vsAgAA
+          H4sIAAAAAAAAAwAAAP//jJLLbsIwEEX3+QrLa1IlECiwo1I/oKioL1WRY08SF8e27KEvxL9XDoGA
+          SqVuvJgz9/rO2NuIECoFnRPKa4a8sSpelB9TzG4m9eo5KWB5qxfrh9Xj8u7p+754p4OgMMUbcDyo
+          rrhprAKURu8xd8AQgmt6PcrG4+koTVvQGAEqyCqLcWbiYTLM4mQaJ5NOWBvJwdM5eYkIIWTbniGi
+          FvBJ5yQZHCoNeM8qoPNjEyHUGRUqlHkvPTKNdNBDbjSCblPPsrS78iCEcuNZiKY3SnX13fEyZSrr
+          TOE7fqyXUktf5w6YNzoYezSWtnQXEfLaDrU5y0mtM43FHM0adDCcpXs72m+xh8OOoUGmTjSjwQWz
+          XAAyqfzJTihnvAbRK/sFso2Q5gREJyP/znLJez+21NV/7HvAOVgEkVsHQvLzefs2B+GL/dV2XHEb
+          mPovj9DkpdQVOOvk/pVLm7PrmZgKxtOSRrvoBwAA//8DAPNQ/XLuAgAA
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fdcb6d3b645e-SJC
+          - 8f4256a569fd6809-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:26 GMT
+          - Wed, 18 Dec 2024 21:45:11 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -79,7 +78,7 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "282"
+          - "224"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
@@ -91,13 +90,13 @@ interactions:
         x-ratelimit-remaining-requests:
           - "9999"
         x-ratelimit-remaining-tokens:
-          - "29999868"
+          - "29999890"
         x-ratelimit-reset-requests:
           - 6ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_0f4462cd5dd31fe3e1a9d6847e563042
+          - req_ad4574ef78bcc23b8c85835d827e63ec
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml
index abe7094e..3cf90e2d 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml
@@ -1,13 +1,12 @@
 interactions:
   - request:
       body:
-        '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer
-        is 94106 or 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
+        to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer
+        is 94106 or 94107", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -16,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "545"
+          - "468"
         content-type:
           - application/json
         host:
@@ -46,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jFJBS8MwGL33V4ScV+m2sm69CaLTgyjIDhMpWfK1zZYmIUnFMfbfJW3X
-          dqjgJYf3vvfyvpecAoQwZzhFmJbE0UqL8DbfPt8V+4U8bO6PL0/rTUnWD8XrVr59PkZ44hVqtwfq
-          LqobqiotwHElW5oaIA686zSZx3ESJcukISrFQHhZoV0Yq3AWzeIwWobRohOWilOwOEXvAUIInZrT
-          R5QMvnCKoskFqcBaUgBO+yGEsFHCI5hYy60j0uHJQFIlHcgmdTTGDeS1JT6WrIXo8HN/kVCFNmpn
-          O77Hcy65LTMDxCrpTa1TGjfsOUDoo1movsqItVGVdplTB5DecDpdtX54qHDEdpxTjogRPOtauLbL
-          GDjChR01gimhJbBBOtRHasbViAhGS/8M85t3uziXxX/sB4JS0A5Ypg0wTq8XHsYM+A/211hfchMY
-          26N1UGU5lwUYbXj7xrnOVjnZkVUezZc4OAffAAAA//8DAAUNxI3sAgAA
+          H4sIAAAAAAAAAwAAAP//jFKxbsIwFNzzFdabSRUIlMBWsYNUVepQVZHjPAdTx7ZsRy1F/HvlEJJU
+          UKmLh7t357tnnyJCQJSwJsD21LPayPiJf2bfxbZ+3iqeqdfjruCr3eblkLqj2cAkKHRxQOavqgem
+          ayPRC60uNLNIPQbX6TKdLxZZOl22RK1LlEFWGR/PdTxLZvM4yeLksRPutWDoYE3eIkIIObVniKhK
+          /II1SSZXpEbnaIWw7ocIAatlQIA6J5ynysNkIJlWHlWbegxb5I2jIZVqpOzwc3+P1JWxunAd3+Nc
+          KOH2uUXqtAqezmsDLXuOCHlv+zS/IoKxujY+9/oDVTBcLS52MCxwILuq4LWn8o7ml1leoqdCutE6
+          gFG2x/LGkBCgTSn0iIhGlW+z3PO+1Baq+o/9QDCGxmOZG4ulYHf7tubhd/011q+4DQzu6DzWOReq
+          QmusuDwwN/mK04KueJJmEJ2jHwAAAP//AwCZn7Uj6QIAAA==
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fdd6bc71fa2e-SJC
+          - 8f4256c89c4217de-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:27 GMT
+          - Wed, 18 Dec 2024 21:45:17 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -79,7 +78,7 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "249"
+          - "140"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
@@ -91,13 +90,13 @@ interactions:
         x-ratelimit-remaining-requests:
           - "9999"
         x-ratelimit-remaining-tokens:
-          - "29999865"
+          - "29999889"
         x-ratelimit-reset-requests:
           - 6ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_72f1a2af642dad2884e52d652e775182
+          - req_92a0dc630d0d21997ff3aeede540417d
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml
index 607cd8a2..84c270eb 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml
@@ -1,13 +1,12 @@
 interactions:
   - request:
       body:
-        '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: Insufficient
-        information\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+        '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
+        to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: Insufficient
+        information", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -16,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "541"
+          - "464"
         content-type:
           - application/json
         host:
@@ -46,18 +45,19 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6gIoDy4pS+lPeRWVWpVIWMWcGNsyzZSqyj/vTIQ
-          IGoq9cJhvp1hduHoIYRZgVOEaU0sbRQPtuXb/u4pP7w0z69kt3+I6221q/fN/SOJNPadQ+afQO3Z
-          dUNlozhYJkWPqQZiwaUuVnGSrMLVetmBRhbAna1SNkhkEIVREoTrIFwOxloyCgan6N1DCKFj93QV
-          RQFfOEWhf1YaMIZUgNNxCCGsJXcKJsYwY4mw2J8glcKC6FrfznUNZWuIqyVazgf9NL6Iy0ppmZuB
-          j3rJBDN1poEYKVyosVLhjp48hD66hdqLjlhp2SibWXkA4QIXi6jPw9MJZ3RgVlrC56bYvxKXFWAJ
-          42Z2EUwJraGYrNP5SFswOQPebOnfZa5l94szUf0nfgKUgrJQZEpDwejlwtOYBveD/TU2HrkrjM23
-          sdBkJRMVaKVZ/41LlW1KkpNNGcZr7J28HwAAAP//AwBPQ8gX7AIAAA==
+          H4sIAAAAAAAAAwAAAP//jFLLbtswELzrKwiercJO/JB9cwOk7aHXXopCYMilxIbiMtxV4iLwvxeU
+          bclBUqAXHmZ2hjOLfS2EkM7InZC6Vay76Mu9fakO62Z/f9duH5+rz+tKfz18WS/2P56+P8tZVuDD
+          b9B8UX3S2EUP7DCcaJ1AMWTXxeZ2uVpVt4vVQHRowGdZE7lcYnkzv1mW86qcr8/CFp0GkjvxsxBC
+          iNfhzRGDgYPcifnsgnRApBqQu3FICJnQZ0QqIkesAsvZRGoMDGFI/S1Qb63TDgILFyymTuX4glGo
+          QC+QBLeOxFMPNNa6/AG2J5VbhN77M34cc3lsYsIHOvMjbl1w1NYJFGHIGYgxyoE9FkL8Gvr3byrJ
+          mLCLXDM+QsiGVXWyk9PCJ3Jz5hhZ+QnermYfmNUGWDlPV+uTWukWzKScdq164/CKKK4qv8/ykfep
+          tgvN/9hPhNYQGUwdExin3/adxhLka/zX2LjiIbCkP8TQ1daFBlJM7nQQNtZqszWVUXphZXEs/gIA
+          AP//AwDr0DopGQMAAA==
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fdd11ed0175e-SJC
+          - 8f4256aaff33ce5c-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,7 +65,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:27 GMT
+          - Wed, 18 Dec 2024 21:45:16 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -79,7 +79,7 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "196"
+          - "1096"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
@@ -91,13 +91,13 @@ interactions:
         x-ratelimit-remaining-requests:
           - "9999"
         x-ratelimit-remaining-tokens:
-          - "29999867"
+          - "29999890"
         x-ratelimit-reset-requests:
           - 6ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_9dd0f40823dceb910336a862f0513c68
+          - req_d492552c2043885324019334103f08d1
       status:
         code: 200
         message: OK
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e3e7af1c..0962e24c 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -137,7 +137,7 @@ def _assert_prompt_is_valid(
                 *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS,
                 "the answer is 14004",
                 MultipleChoiceEvaluation.INCORRECT,
-                "0",
+                None,
                 id="didnt-match-and-no-llm-innate-knowledge",
             ),
             pytest.param(
@@ -158,35 +158,35 @@ def _assert_prompt_is_valid(
                 *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS,
                 "the answer is 94106 or 94107",
                 MultipleChoiceEvaluation.INCORRECT,
-                "0",
+                None,
                 id="matched-several-options",
             ),
             pytest.param(
                 *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS,
                 "",
                 MultipleChoiceEvaluation.INCORRECT,
-                "0",
+                None,
                 id="empty-answer1",
             ),
             pytest.param(
                 *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS,
                 "14",
                 MultipleChoiceEvaluation.INCORRECT,
-                "0",
+                None,
                 id="didnt-match-and-llm-has-innate-knowledge",
             ),
             pytest.param(
                 *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS,
                 "",
                 MultipleChoiceEvaluation.INCORRECT,
-                "0",
+                None,
                 id="empty-answer2",
             ),
             pytest.param(
                 *LITQA2_QUESTION_IDEAL_DISTRACTORS,
                 "",
                 MultipleChoiceEvaluation.INCORRECT,
-                "0",
+                None,
                 id="empty-answer3",
             ),
         ],
@@ -198,7 +198,7 @@ async def test_grade(
         distractors: str | list[str],
         actual_answer: str,
         expected_eval: MultipleChoiceEvaluation,
-        expected_extracted_answer: str,
+        expected_extracted_answer: str | None,
     ) -> None:
         """Tests that we can create a multiple choice question and evaluate answers."""
         mc_question = MultipleChoiceQuestion(
@@ -208,7 +208,7 @@ async def test_grade(
             shuffle_seed=42,  # Seed for VCR cassette
         )
         self._assert_prompt_is_valid(mc_question, question, ideal_answer, distractors)
-        evaluation, _, graded_answer = await mc_question.grade(actual_answer)
+        evaluation, graded_answer = await mc_question.grade(actual_answer)
         assert evaluation == expected_eval
         if evaluation == MultipleChoiceEvaluation.CORRECT:
             assert graded_answer == ideal_answer

From 90dbcf4cbcac959e9d922ea928a3843ba2d947c4 Mon Sep 17 00:00:00 2001
From: James Braza <jamesbraza@gmail.com>
Date: Wed, 18 Dec 2024 13:48:54 -0800
Subject: [PATCH 3/3] Moved back to gpt-4o-mini

---
 src/aviary/utils.py                           |  2 +-
 ...t-match-and-llm-has-innate-knowledge].yaml | 44 ++++++++++--------
 ...nt-match-and-no-llm-innate-knowledge].yaml | 44 ++++++++++--------
 ...AEvaluation.test_grade[empty-answer1].yaml | 40 ++++++++--------
 ...AEvaluation.test_grade[empty-answer2].yaml | 36 +++++++--------
 ...AEvaluation.test_grade[empty-answer3].yaml | 36 +++++++--------
 ...on.test_grade[matched-correct-option].yaml | 44 ++++++++----------
 ....test_grade[matched-incorrect-option].yaml | 36 +++++++--------
 ...n.test_grade[matched-several-options].yaml | 45 ++++++++++--------
 ...ion.test_grade[matched-unsure-option].yaml | 38 +++++++--------
 .../test_eval_answer[llm basic].yaml          | 46 +++++++++----------
 tests/cassettes/test_eval_llm_config.yaml     | 36 +++++++--------
 .../test_extract_answer[complex].yaml         | 36 +++++++--------
 .../test_extract_answer[empty-proposal].yaml  | 44 ++++++++++--------
 .../test_extract_answer[gave-two].yaml        | 44 ++++++++++--------
 .../test_extract_answer[not in options].yaml  | 44 +++++++++---------
 16 files changed, 320 insertions(+), 295 deletions(-)

diff --git a/src/aviary/utils.py b/src/aviary/utils.py
index fcbf130f..b495f18b 100644
--- a/src/aviary/utils.py
+++ b/src/aviary/utils.py
@@ -20,7 +20,7 @@
     import numpy as np
 
 
-DEFAULT_EVAL_MODEL_NAME = "gpt-4o"
+DEFAULT_EVAL_MODEL_NAME = "gpt-4o-mini"
 LLM_BOOL_EVAL_CONFIG: dict[str, Any] = {
     "prompt": (
         "Here is a question, the correct answer to the question, and a proposed answer"
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml
index c87a0ba3..617568d2 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml
@@ -6,7 +6,7 @@ interactions:
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information
         to answer this question\ncheesecake\n11\n42\n\nProposed answer: 14", "role":
-        "user"}], "model": "gpt-4o", "temperature": 0}'
+        "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -15,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "437"
+          - "442"
         content-type:
           - application/json
         host:
@@ -35,7 +35,7 @@ interactions:
         x-stainless-raw-response:
           - "true"
         x-stainless-retry-count:
-          - "1"
+          - "0"
         x-stainless-runtime:
           - CPython
         x-stainless-runtime-version:
@@ -45,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAA4xSy07DMBC85yusPTcobVJIe6u4wKESIHFCKHKdTWpwvJa9FaCq/46cvtUicfFh
-          Zmc8s/Y6EQJ0DVMBailZdc6ks+Zrko2cxvljMXvunmavPHkJTA9zk9/DICpo8YGK96obRZ0zyJrs
-          llYeJWN0Hd7lxXhc5sOyJzqq0URZ6zgtKB1loyLNyjS73QmXpBUGmIq3RAgh1v0ZI9oav2EqssEe
-          6TAE2SJMD0NCgCcTEZAh6MDSMgyOpCLLaPvUp7DHZhVkTGVXxuzwzeEeQ63ztAg7/oA32uqwrDzK
-          QDZ6BiYHPbtJhHjv+6zOIoLz1DmumD7RRsNyvLWD4wKP5K4qMLE0VzRnZlWNLLUJJ+sAJdUS6wtD
-          IUCuak0nRHJS+TLLNe9tbW3b/9gfCaXQMdaV81hrdbVvbx5/119jhxX3gSH8BMauarRt0Tuvtw/c
-          uGrSyIWcNFleQrJJfgEAAP//AwA5BypS6QIAAA==
+          H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyh9P24V0AMHBBK9gFDk2pvU4NiWveVV9d+R00da
+          tUhcfJjZGc+svU4YAyVhwkAsOYnK6XRafF7fzuWPeLwZP4v7p5maTwd3GD6yB5pBKyrs4g0F7VVX
+          wlZOIylrtrTwyAmja3vY7fX7o357VBOVlaijrHSU9mxaKaPSTtbppdkwbY926qVVAgNM2EvCGGPr
+          +ow5jcQvmLCstUcqDIGXCJPDEGPgrY4I8BBUIG4IWg0prCE0dfRj2GOxCjxGMyutd/jmcI+2pfN2
+          EXb8AS+UUWGZe+TBmugZyDqo2U3C2GvdZ3USEZy3laOc7DuaaDjqb+2g2WJD7qoCWeL6gubELJdI
+          XOlwtA4QXCxRnhkyBnwllT0ikqPK51kueW9rK1P+x74hhEBHKHPnUSpxsW9tHr/YX2OHFdeBIXwH
+          wiovlCnRO6+2D1y4vDvmvUyMBzyDZJP8AgAA//8DADaBBszuAgAA
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f4256d28f9615ff-SJC
+          - 8f425bb2ac70f953-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -64,9 +64,15 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:45:18 GMT
+          - Wed, 18 Dec 2024 21:48:38 GMT
         Server:
           - cloudflare
+        Set-Cookie:
+          - __cf_bm=Z3Wkkk2LQA2GKAPZVirKPYLTJfmm9Luttv26RxPBKro-1734558518-1.0.1.1-4BZR47qupd.QCWRMrfyj_F2lS0fqBEuzxwPZTqYPUxSKwdzL4S_8YWk9ofOPXhFEnkMN6nwgWjBLjAR4nioxiQ;
+            path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
+            Secure; SameSite=None
+          - _cfuvid=B7CeJKL1WXveU2pmeUGy_AFjPsbf25SvdiSN_4fxTXE-1734558518441-0.0.1.1-604800000;
+            path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
         Transfer-Encoding:
           - chunked
         X-Content-Type-Options:
@@ -78,25 +84,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "208"
+          - "144"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999896"
+          - "149999896"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_7db1d1f6dded4679e43cc12a2183fa21
+          - req_503cd8163bd0d3b634eb723d6874b1da
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml
index f043c502..77357e4c 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml
@@ -6,7 +6,7 @@ interactions:
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
         to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer
-        is 14004", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        is 14004", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -15,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "459"
+          - "464"
         content-type:
           - application/json
         host:
@@ -35,7 +35,7 @@ interactions:
         x-stainless-raw-response:
           - "true"
         x-stainless-retry-count:
-          - "1"
+          - "0"
         x-stainless-runtime:
           - CPython
         x-stainless-runtime-version:
@@ -45,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJLNTsMwEITveQprzw1K/6DNDakIwYkLqgRFkeNsEhfHNrarUlV9d2S3
-          TVJRJC4+7Lcznl17HxECvICUAKupY40W8X25ndktPqzrRb37envdVk/LxfKRvkx4/gwDr1D5Gpk7
-          q26YarRAx5U8YmaQOvSuw7vxZDqdjYdJAI0qUHhZpV08UfEoGU3iZBYntydhrThDCyl5jwghZB9O
-          H1EW+A0pCTah0qC1tEJI2yZCwCjhK0Ct5dZR6WDQQaakQxlSr2AFfWSw3Fjqk8mNEKf6ob1LqEob
-          ldsTb+sll9zWmUFqlfS+1ikNgR4iQj7CTJuLmKCNarTLnPpE6Q3nw6MddEvs4Jk55ajoaUaDK2ZZ
-          gY5yYXsrAUZZjUWn7PZHNwVXPRD1Rv6d5Zr3cWwuq//Yd4Ax1A6LTBssOLuct2sz6H/YX23tikNg
-          sDvrsMlKLis02vDjI5c6m5c0p/MyGc8gOkQ/AAAA//8DAEsANTftAgAA
+          H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kyqB8MoNoR7aAxdKVamqImNvElPHtmyjPhD/XjlQ
+          AoJKvfgwszOeWXsXEQKCQ06A1dSzxsh4Vn7M71fZMntcjAbfz6uXjTSbh9mCPk2Xc+gFhV5vkPlf
+          1R3TjZHohVYHmlmkHoNrOh5kw+FkmE5aotEcZZBVxseZjhuhRNxP+lmcjON0clTXWjB0kJPXiBBC
+          du0ZciqOn5CTpPeLNOgcrRDy0xAhYLUMCFDnhPNUeeh1JNPKo2qjn8MWy62jIZraSnnE96d7pK6M
+          1Wt35E94KZRwdWGROq2Cp/PaQMvuI0Le2j7bi4hgrG6ML7x+RxUMp+nBDrotduSxKnjtqbyhuTAr
+          OHoqpDtbBzDKauRXhoQA3XKhz4jorPJ1llveh9pCVf+x7wjG0HjkhbHIBbvZtzUPX+yvsdOK28Dg
+          vpzHpiiFqtAaKw4PXJpiVLI0wTTBNUT76AcAAP//AwBkI2np7gIAAA==
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f42569f7d2a2379-SJC
+          - 8f425bb11b702519-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -64,9 +64,15 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:45:10 GMT
+          - Wed, 18 Dec 2024 21:48:38 GMT
         Server:
           - cloudflare
+        Set-Cookie:
+          - __cf_bm=6j4w6Jnsg0wGsZf61WcNCvHdr1Vcb6uVLFFhTQQgcv4-1734558518-1.0.1.1-D0vsT8nCM66xiA.Xa6ijXpgeGPM65Iux2KhQqUiD8wToq.VmwT03dnkmELw1qn0GvHJvh8g7H6WkqYzXVgs2Xg;
+            path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
+            Secure; SameSite=None
+          - _cfuvid=LFVOxysXKxTPNQ2KK05aqbBnIRDPc45hskCPkFcOjXA-1734558518178-0.0.1.1-604800000;
+            path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
         Transfer-Encoding:
           - chunked
         X-Content-Type-Options:
@@ -78,25 +84,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "240"
+          - "131"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999890"
+          - "149999890"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_363f6da2908247ad8c711b11d1593ae7
+          - req_12c5e1cdb8b2ba32b075f04f20194421
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml
index 662716da..6865d713 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml
@@ -6,7 +6,7 @@ interactions:
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
         to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: ", "role":
-        "user"}], "model": "gpt-4o", "temperature": 0}'
+        "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -15,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "440"
+          - "445"
         content-type:
           - application/json
         host:
@@ -45,18 +45,16 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAA4xSy27CMBC85yusPZMqkAApt6rHSr0U9VJVkbE3wa3jtWyj8hD/XjlAAEGlXnyY
-          2RnPrL1LGAMlYcZALHkQrdXpU/1TbhfrIn+j+Vxs281Cv7/k9JrN1/gMg6igxReKcFI9CGqtxqDI
-          HGjhkAeMrsNpXozHZT6cdkRLEnWUNTakBaWjbFSkWZlmk6NwSUqghxn7SBhjbNedMaKRuIYZywYn
-          pEXveYMw64cYA0c6IsC9Vz5wE2BwJgWZgKZLfQk7rFeex1RmpfUR3/f3aGqso4U/8j1eK6P8snLI
-          PZno6QNZ6Nh9wthn12d1FRGso9aGKtA3mmhYTg52cF7gmTxWhUCB6zuaK7NKYuBK+4t1gOBiifLG
-          kDHgK6nogkguKt9mued9qK1M8x/7MyEE2oCysg6lEnf7dubxd/011q+4Cwx+4wO2Va1Mg846dXjg
-          2lZ8+ihLycWwhmSf/AIAAP//AwBsWlME6QIAAA==
+          H4sIAAAAAAAAAwAAAP//jFJdS8MwFH3vrwj3eZV2X46+6RAR0T2JikjJkts2miYhSVEZ+++Srms3
+          NsGXPJxzz8k5N9lEhIDgkBFgFfWsNjK+Kr6WN6vb8XL+9LqSxj3Qu2R1ff/8yCfiBUZBodcfyPxe
+          dcF0bSR6odWOZhapx+CaXk6ms9lili5aotYcZZCVxsdTHddCiXicjKdxchmni05dacHQQUbeIkII
+          2bRnyKk4fkNGktEeqdE5WiJk/RAhYLUMCFDnhPNUeRgNJNPKo2qjH8IWi8bREE01Unb4tr9H6tJY
+          vXYd3+OFUMJVuUXqtAqezmsDLbuNCHlv+zRHEcFYXRufe/2JKhgu5js7GLY4kF1V8NpTeUZzZJZz
+          9FRId7AOYJRVyE8MCQHacKEPiOig8mmWc9672kKV/7EfCMbQeOS5scgFO9u3NQ9f7K+xfsVtYHA/
+          zmOdF0KVaI0VuwcuTD4vWJpgmuAaom30CwAA//8DAL0A1qzuAgAA
       headers:
-        CF-Cache-Status:
-          - DYNAMIC
         CF-RAY:
-          - 8f4256cce931eb29-SJC
+          - 8f425bb5de5996de-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -64,7 +62,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:45:17 GMT
+          - Wed, 18 Dec 2024 21:48:39 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -75,28 +73,30 @@ interactions:
           - X-Request-ID
         alt-svc:
           - h3=":443"; ma=86400
+        cf-cache-status:
+          - DYNAMIC
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "217"
+          - "233"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999895"
+          - "149999896"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_520872e529ccbb680d27a3729fbe637e
+          - req_0c845e0049332bd1fa73fdbe76005ea1
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml
index 483daa67..4a0fa4ae 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml
@@ -6,7 +6,7 @@ interactions:
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information
         to answer this question\ncheesecake\n11\n42\n\nProposed answer: ", "role": "user"}],
-        "model": "gpt-4o", "temperature": 0}'
+        "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -15,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "435"
+          - "440"
         content-type:
           - application/json
         host:
@@ -45,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jFKxbsIwFNzzFdabSRUg0CRbh7ZDB9Shqqqqioz9Etw6tmU7AoT498oB
-          QhBU6uLh7t357tm7iBAQHAoCbEU9a4yMH6p1Pp5/uKf59rF5b95e08X4hanFZv3sMhgFhV5+I/Mn
-          1R3TjZHohVYHmlmkHoPr+H6azmbZdJx3RKM5yiCrjY9THU+SSRonWZzMj8KVFgwdFOQzIoSQXXeG
-          iIrjBgqSjE5Ig87RGqHohwgBq2VAgDonnKfKw+hMMq08qi71ELZYtY6GVKqV8ojv+3ukro3VS3fk
-          e7wSSrhVaZE6rYKn89pAx+4jQr66Pu1FRDBWN8aXXv+gCoZZerCD8wLP5LEqeO2pvKG5MCs5eiqk
-          G6wDGGUr5FeGhABtudADIhpUvs5yy/tQW6j6P/ZngjE0HnlpLHLBbvbtzMPv+musX3EXGNzWeWzK
-          SqgarbHi8MCVKfOKLmleJdMMon30CwAA//8DAFd1apnpAgAA
+          H4sIAAAAAAAAAwAAAP//jFJda8IwFH3vrwj32Q6tLTrfhjB8FhyyMUpMbttomoQk3Qfifx+ptXXo
+          YC95OOeek3NucowIAcFhQYBV1LPayPip+Fw+rz5e6221zta4fKENiu1qszkk2z2MgkLv9sj8RfXA
+          dG0keqHVmWYWqcfgOplN0yybZ5PHlqg1RxlkpfFxquNaKBEn4ySNx7N4Mu/UlRYMHSzIW0QIIcf2
+          DDkVxy9YkPHogtToHC0RFv0QIWC1DAhQ54TzVHkYDSTTyqNqo1/DFovG0RBNNVJ2+Km/R+rSWL1z
+          Hd/jhVDCVblF6rQKns5rAy17igh5b/s0vyKCsbo2Pvf6gCoYztOzHQxbHMiuKnjtqbyj+WWWc/RU
+          SHe1DmCUVchvDAkB2nChr4joqvJtlnve59pClf+xHwjG0HjkubHIBbvbtzUPX+yvsX7FbWBw385j
+          nRdClWiNFecHLkzOxwnPppNdOoPoFP0AAAD//wMAMCnsc+4CAAA=
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f4256d7fd75cf0d-SJC
+          - 8f425bb72f9b67dc-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -64,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:45:19 GMT
+          - Wed, 18 Dec 2024 21:48:39 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -78,25 +78,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "520"
+          - "532"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999896"
+          - "149999896"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_f943511f12de0306ff59cccd017e98f1
+          - req_ed9d0e7998f792094d5aefe723693f28
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml
index 46a00d4f..f6e5e085 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml
@@ -6,7 +6,7 @@ interactions:
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\ncryo EM\nInsufficient information
         to answer this question\nNMR\nx-ray crystallography\ncircular dichroism\n\nProposed
-        answer: ", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        answer: ", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -15,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "467"
+          - "472"
         content-type:
           - application/json
         host:
@@ -45,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hViz3FxXs3jFkIooZSQQ3spxSjy2lYrS6qkkJaQ/14k
-          p7ZDUujFhx3N55mVjhEhwDOYE2AldazSIl7kh9lgtZmMt4+r4fPL08NivN5uPvvLw3Kzhp53qN07
-          MvfrumOq0gIdV7KWmUHq0FP7k+FoPJ4OB0kQKpWh8LZCu3ik4kEyGMXJNE7uz8ZScYYW5uQ1IoSQ
-          Y/j6iDLDL5iTgAmTCq2lBcK8OUQIGCX8BKi13DoqHfRakSnpUIbU3bHBfG+pTyX3Qpznp+Y/QhXa
-          qJ09680855LbMjVIrZKeaZ3SENRTRMhb6LO/iAjaqEq71KkPlB4469c4aBfYiueq4JSj4obnApZm
-          6CgXtrMOYJSVmF0BCQG6z7jqCFGn8nWWW+y6NpfFf/CtwBhqh1mqDWac3ewb4P51/XWsWXEIDPbb
-          OqzSnMsCjTa8vuBcp7Oc7ugsT4ZTiE7RDwAAAP//AwBp6cyD6QIAAA==
+          H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kypAKIRbH+pDVU+9VK2qyDibxNSxLXtRSxH/Xjm8
+          gqBSLz7M7Ixn1l5FjIEsYMpA1JxEY1V8VX7d3N2/PP7czp9f39QwmTzJ5UNqqb52M+gFhZnNUdBO
+          dSFMYxWSNHpDC4ecMLj2x8N0NJqM+llLNKZAFWSVpTg1cSO1jAfJII2TcdyfbNW1kQI9TNl7xBhj
+          q/YMOXWB3zBlSW+HNOg9rxCm+yHGwBkVEODeS09cE/QOpDCaULfRu7DDcuF5iKYXSm3x9f4eZSrr
+          zMxv+T1eSi19nTvk3ujg6clYaNl1xNhH22dxFBGsM42lnMwn6mCY9Td2cNjigdxWBTLE1RnNkVle
+          IHGpfGcdILiosTgxZAz4opCmQ0SdyqdZznlvaktd/cf+QAiBlrDIrcNCirN9W/Pwxf4a26+4DQx+
+          6QmbvJS6Qmed3DxwafNhxtNEZJc8gWgd/QIAAP//AwCjNKe67gIAAA==
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f4256dfbb4a7ac7-SJC
+          - 8f425bbaab9a236e-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -64,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:45:20 GMT
+          - Wed, 18 Dec 2024 21:48:39 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -78,25 +78,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "207"
+          - "231"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999889"
+          - "149999888"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_06b5243be6b66dfc6827056b2c49f4c4
+          - req_427dff29f2a632ec0882c27c797f5d5a
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml
index f9092a53..f126cb68 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml
@@ -6,7 +6,7 @@ interactions:
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
         to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer
-        is 94107", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        is 94107", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -15,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "459"
+          - "464"
         content-type:
           - application/json
         host:
@@ -35,7 +35,7 @@ interactions:
         x-stainless-raw-response:
           - "true"
         x-stainless-retry-count:
-          - "0"
+          - "1"
         x-stainless-runtime:
           - CPython
         x-stainless-runtime-version:
@@ -45,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJLLTsMwEEX3+QrL6wSlL9pkx4KHVLFDdIFQ5DiTxOB4LNtRgar/jpyG
-          JBVFYuPFnLnXd8Y+BIRQUdCUUF4zxxsto5tyvzHPt1X5uJvva51v758g2d09oNx+tTT0CszfgLsf
-          1RXHRktwAtUJcwPMgXedrRfL1WqziJMONFiA9LJKu2iJ0TyeL6N4E8XXvbBGwcHSlLwEhBBy6E4f
-          URXwQVMShz+VBqxlFdB0aCKEGpS+Qpm1wjqmHA1HyFE5UF3qZDmL11NmoGwt89FUK2VfPw6XSay0
-          wdz2fKiXQglbZwaYReWNrUNNO3oMCHnthmrPclJtsNEuc/gOyhsms5MdHbc4wnnPHDomJ5pFeMEs
-          K8AxIe1kJ5QzXkMxKscFsrYQOAHBZOTfWS55n8YWqvqP/Qg4B+2gyLSBQvDzecc2A/6L/dU2rLgL
-          TO2nddBkpVAVGG3E6ZVLnSUly1lSxosNDY7BNwAAAP//AwBPnKtg7gIAAA==
+          H4sIAAAAAAAAAwAAAP//jJI/b8MgEMV3fwrEbFf+FznxllSpOlTKVHWoKovA2abFgACrSaN89won
+          jR01lbow3O/e493BIUAIc4ZLhGlLHO20iJb15/3DelPvN7De7lY6fukfnxl7ylaL5RcOvUJt34G6
+          H9UdVZ0W4LiSJ0wNEAfeNSmyfDabz5LFADrFQHhZo12Uq6jjkkdpnOZRXETJ/KxuFadgcYleA4QQ
+          OgynzykZ7HCJ4vCn0oG1pAFcXpoQwkYJX8HEWm4dkQ6HI6RKOpBD9EWexMWUGah7S3w+2Qtxrh8v
+          lwnVaKO29swv9ZpLbtvKALFKemPrlMYDPQYIvQ1D9Vc5sTaq065y6gOkN1wkJzs8rnKE6Zk55YiY
+          aLLwhlnFwBEu7GQnmBLaAhuV4wJJz7iagGAy8u8st7xPY3PZ/Md+BJSCdsAqbYBxej3v2GbA/7O/
+          2i4rHgJju7cOuqrmsgGjDT+9cq0rFqdsliXbvMDBMfgGAAD//wMAitN9t/MCAAA=
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f42569a694415a4-SJC
+          - 8f425bb60cfe17e4-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -64,15 +64,9 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:45:09 GMT
+          - Wed, 18 Dec 2024 21:48:39 GMT
         Server:
           - cloudflare
-        Set-Cookie:
-          - __cf_bm=VhM7SMRUwaYcBGPmD54eZozV9ZASFwCpD2uUzfcbygQ-1734558309-1.0.1.1-PO0cbnzYww6YCdbMGHLbwDXTjy0s_I50cJaqd7OUcIUnT7C0j_EJ9CBwZm8nRmzrv2FivdnDcss9GtsBOjjypw;
-            path=/; expires=Wed, 18-Dec-24 22:15:09 GMT; domain=.api.openai.com; HttpOnly;
-            Secure; SameSite=None
-          - _cfuvid=tUjQ_WfZ7MP0T2BILMqKk0y_GuM5NWbc6M_7vifNjIU-1734558309829-0.0.1.1-604800000;
-            path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
         Transfer-Encoding:
           - chunked
         X-Content-Type-Options:
@@ -84,25 +78,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "215"
+          - "538"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999890"
+          - "149999891"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_9990262c4eaef569ab37ffc7cf363319
+          - req_9bd9d799783ab13ef59ce8e5ca7fd25f
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml
index a14c592f..842cfbf0 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml
@@ -6,7 +6,7 @@ interactions:
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
         to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer
-        is 94106", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        is 94106", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -15,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "459"
+          - "464"
         content-type:
           - application/json
         host:
@@ -45,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJLLbsIwEEX3+QrLa1IlECiwo1I/oKioL1WRY08SF8e27KEvxL9XDoGA
-          SqVuvJgz9/rO2NuIECoFnRPKa4a8sSpelB9TzG4m9eo5KWB5qxfrh9Xj8u7p+754p4OgMMUbcDyo
-          rrhprAKURu8xd8AQgmt6PcrG4+koTVvQGAEqyCqLcWbiYTLM4mQaJ5NOWBvJwdM5eYkIIWTbniGi
-          FvBJ5yQZHCoNeM8qoPNjEyHUGRUqlHkvPTKNdNBDbjSCblPPsrS78iCEcuNZiKY3SnX13fEyZSrr
-          TOE7fqyXUktf5w6YNzoYezSWtnQXEfLaDrU5y0mtM43FHM0adDCcpXs72m+xh8OOoUGmTjSjwQWz
-          XAAyqfzJTihnvAbRK/sFso2Q5gREJyP/znLJez+21NV/7HvAOVgEkVsHQvLzefs2B+GL/dV2XHEb
-          mPovj9DkpdQVOOvk/pVLm7PrmZgKxtOSRrvoBwAA//8DAPNQ/XLuAgAA
+          H4sIAAAAAAAAA4ySPW/CMBCG9/wKyzOpEggfYauqbgwsnaoqMs4lmDo+y74IWsR/rxw+ElQqdfFw
+          z72v3zv7GDHGVcmXjMutINlYHT9X+5dXO1mtdEbYzla7w/z7sH5bi30+rfkoKHCzA0lX1ZPExmog
+          heaMpQNBEFzT+SSbThfTdNGBBkvQQVZbijOMG2VUPE7GWZzM43RxUW9RSfB8yd4jxhg7dmfIaUo4
+          8CVLRtdKA96LGvjy1sQYd6hDhQvvlSdhiI96KNEQmC56nqXJbMgcVK0XIZ9ptb7UT7fLNNbW4cZf
+          +K1eKaP8tnAgPJpg7Akt7+gpYuyjG6q9y8mtw8ZSQfgJJhjm6dmO96vs4fjCCEnogWYyemBWlEBC
+          aT/YCZdCbqHslf0CRVsqHIBoMPLvLI+8z2MrU//HvgdSgiUoC+ugVPJ+3r7NQfhnf7XdVtwF5v7L
+          EzRFpUwNzjp1fuXKFpNcZInMZyLh0Sn6AQAA//8DAL5Pl0/zAgAA
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f4256a569fd6809-SJC
+          - 8f425bb64ed0fa36-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -64,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:45:11 GMT
+          - Wed, 18 Dec 2024 21:48:39 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -78,25 +78,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "224"
+          - "247"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999890"
+          - "149999891"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_ad4574ef78bcc23b8c85835d827e63ec
+          - req_eb9ad02601ae4b1b2b579657ed9a7bef
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml
index 3cf90e2d..26df9d56 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml
@@ -6,7 +6,8 @@ interactions:
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
         to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer
-        is 94106 or 94107", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        is 94106 or 94107", "role": "user"}], "model": "gpt-4o-mini", "temperature":
+        0}'
       headers:
         accept:
           - application/json
@@ -15,7 +16,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "468"
+          - "473"
         content-type:
           - application/json
         host:
@@ -35,7 +36,7 @@ interactions:
         x-stainless-raw-response:
           - "true"
         x-stainless-retry-count:
-          - "1"
+          - "0"
         x-stainless-runtime:
           - CPython
         x-stainless-runtime-version:
@@ -45,18 +46,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jFKxbsIwFNzzFdabSRUIlMBWsYNUVepQVZHjPAdTx7ZsRy1F/HvlEJJU
-          UKmLh7t357tnnyJCQJSwJsD21LPayPiJf2bfxbZ+3iqeqdfjruCr3eblkLqj2cAkKHRxQOavqgem
-          ayPRC60uNLNIPQbX6TKdLxZZOl22RK1LlEFWGR/PdTxLZvM4yeLksRPutWDoYE3eIkIIObVniKhK
-          /II1SSZXpEbnaIWw7ocIAatlQIA6J5ynysNkIJlWHlWbegxb5I2jIZVqpOzwc3+P1JWxunAd3+Nc
-          KOH2uUXqtAqezmsDLXuOCHlv+zS/IoKxujY+9/oDVTBcLS52MCxwILuq4LWn8o7ml1leoqdCutE6
-          gFG2x/LGkBCgTSn0iIhGlW+z3PO+1Baq+o/9QDCGxmOZG4ulYHf7tubhd/011q+4DQzu6DzWOReq
-          QmusuDwwN/mK04KueJJmEJ2jHwAAAP//AwCZn7Uj6QIAAA==
+          H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kyoJz3Jrq35ARakqVVVk7E0wdWzLXkQrxL9XDhBA
+          UKkXH2Z2xjNrbxPGQEmYMhBLTqJxOn2oNk/PzWJFr+KlmM03j/NZ8TbavMt+UVfQiwq7WKGgo+pO
+          2MZpJGXNnhYeOWF0zcf9wXA4GeaTlmisRB1ltaN0YNNGGZUWWTFIs3GaTw7qpVUCA0zZR8IYY9v2
+          jDmNxG+Ysqx3RBoMgdcI026IMfBWRwR4CCoQNwS9EymsITRt9HPYY7UOPEYza60P+K67R9vaebsI
+          B77DK2VUWJYeebAmegayDlp2lzD22fZZX0QE523jqCT7hSYa3g/3dnDa4ok8VAWyxPUNzYVZKZG4
+          0uFsHSC4WKK8MmQM+Foqe0YkZ5Wvs9zy3tdWpv6P/YkQAh2hLJ1HqcTNvq15/GJ/jXUrbgND+AmE
+          TVkpU6N3Xu0fuHLlqBJ5hnmGC0h2yS8AAAD//wMALlTCsO4CAAA=
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f4256c89c4217de-SJC
+          - 8f425bb169ea22f6-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -64,9 +65,15 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:45:17 GMT
+          - Wed, 18 Dec 2024 21:48:38 GMT
         Server:
           - cloudflare
+        Set-Cookie:
+          - __cf_bm=d8n1B6AzFA1xougxyBgoPLD0ITgb.iimKMM9kNYr6NA-1734558518-1.0.1.1-c8MRCOD4wNoPcANGb9a6gOWsl6NhHqx911Ktp.RARxFa..7XVR9hKaZVQ2nRa8g.bTL2e2pT7EpsuMaFLlx6Sw;
+            path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
+            Secure; SameSite=None
+          - _cfuvid=DPEKvT7hx6XvGnKxQqNrPq5Y4dSqkyQo4hPKRlWd79E-1734558518261-0.0.1.1-604800000;
+            path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
         Transfer-Encoding:
           - chunked
         X-Content-Type-Options:
@@ -78,25 +85,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "140"
+          - "168"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999889"
+          - "149999888"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_92a0dc630d0d21997ff3aeede540417d
+          - req_becb26d30d1adf2d410f311a4664a6b2
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml
index 84c270eb..5b56af9b 100644
--- a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml
+++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml
@@ -6,7 +6,7 @@ interactions:
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
         to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: Insufficient
-        information", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        information", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -15,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "464"
+          - "469"
         content-type:
           - application/json
         host:
@@ -45,19 +45,19 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jFLLbtswELzrKwiercJO/JB9cwOk7aHXXopCYMilxIbiMtxV4iLwvxeU
-          bclBUqAXHmZ2hjOLfS2EkM7InZC6Vay76Mu9fakO62Z/f9duH5+rz+tKfz18WS/2P56+P8tZVuDD
-          b9B8UX3S2EUP7DCcaJ1AMWTXxeZ2uVpVt4vVQHRowGdZE7lcYnkzv1mW86qcr8/CFp0GkjvxsxBC
-          iNfhzRGDgYPcifnsgnRApBqQu3FICJnQZ0QqIkesAsvZRGoMDGFI/S1Qb63TDgILFyymTuX4glGo
-          QC+QBLeOxFMPNNa6/AG2J5VbhN77M34cc3lsYsIHOvMjbl1w1NYJFGHIGYgxyoE9FkL8Gvr3byrJ
-          mLCLXDM+QsiGVXWyk9PCJ3Jz5hhZ+QnermYfmNUGWDlPV+uTWukWzKScdq164/CKKK4qv8/ykfep
-          tgvN/9hPhNYQGUwdExin3/adxhLka/zX2LjiIbCkP8TQ1daFBlJM7nQQNtZqszWVUXphZXEs/gIA
-          AP//AwDr0DopGQMAAA==
+          H4sIAAAAAAAAAwAAAP//jFJNj9MwEL3nV1g+NygpLc32tnwcOC2CA0gIRa4zTgZsj9eeaIFV/zty
+          2iZdsUhcfHhv3vN7o3kshJDYyb2QelCsXbDlrXl48y58+njv4i3s3n7+cPf6d1g3d1XT0Be5ygo6
+          fAfNF9ULTS5YYCR/onUExZBd693LzXbbbOtmIhx1YLOsD1xuqHTosVxX601Z7cq6OasHQg1J7sXX
+          QgghHqc35/Qd/JR7Ua0uiIOUVA9yPw8JISPZjEiVEiZWnuVqITV5Bj9Ff+/TaAxqBM8CvaHoVO4g
+          mITy6QGi4AGTuB8hzd0uf4AZk8pV/GjtGT/OuSz1IdIhnfkZN+gxDW0ElcjnDIkpyIk9FkJ8m/qP
+          TyrJEMkFbpl+gM+GTXOyk8vWF3J35phY2QW+2a6eMWs7YIU2Xa1PaqUH6Bblsms1dkhXRHFV+e8s
+          z3mfaqPv/8d+IbSGwNC1IUKH+mnfZSxCPsl/jc0rngLL9CsxuNag7yGGiKeDMKF9ZXRdQV3BQRbH
+          4g8AAAD//wMAIVEMVh4DAAA=
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f4256aaff33ce5c-SJC
+          - 8f425bb5cdb77ac1-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,7 +65,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:45:16 GMT
+          - Wed, 18 Dec 2024 21:48:39 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -79,25 +79,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "1096"
+          - "262"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999890"
+          - "149999890"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_d492552c2043885324019334103f08d1
+          - req_ca5799089a4ca130483ac0a6fa172710
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/test_eval_answer[llm basic].yaml b/tests/cassettes/test_eval_answer[llm basic].yaml
index 63f1bb18..18f9bfd9 100644
--- a/tests/cassettes/test_eval_answer[llm basic].yaml	
+++ b/tests/cassettes/test_eval_answer[llm basic].yaml	
@@ -7,7 +7,7 @@ interactions:
         other output is permitted.\n\nQuestion: Which of the following is most likely
         true:\n\nA) Piggie, B) Pigeon, C) Gerald\n\n\nCorrect answer: C\n\nProposed
         answer: Based on all factors considered, the most compelling answer is Gerald,
-        C", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        C", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -16,7 +16,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "516"
+          - "521"
         content-type:
           - application/json
         host:
@@ -46,18 +46,16 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJI/b8IwEMX3fArLM6kCpPzbGDpQVahSh4pWVWTsS+Li+CzbkaCI7145
-          AZKqVOri4X73nt+dfYwIoVLQBaG8ZJ5XRsXL/G09e3zePyX1YbxZrhzuynKTi9WXW7/SQVDg9hO4
-          v6juOFZGgZeoW8wtMA/BdTgdp+k0mc5GDahQgAqywvg4xXiUjNI4mcXJ5CwsUXJwdEHeI0IIOTZn
-          iKgF7OmCJINLpQLnWAF0cW0ihFpUoUKZc9J5pj0ddJCj9qCb1JuHlz6xkNeOhWC6VupcP12vUlgY
-          i1t35td6LrV0ZWaBOdTB1nk0tKGniJCPZqT6R0pqLFbGZx53oIPhMLlv/Wi3xB49M4+eqb5oMrhh
-          lwnwTCrX2wnljJcgOmm3QFYLiT0Q9Yb+HeaWdzu41MV/7DvAORgPIjMWhOQ/B+7aLIQv9lfbdclN
-          YOoOzkOV5VIXYI2V7SvnJpvnbMvmeTKe0egUfQMAAP//AwAWS34s7gIAAA==
+          H4sIAAAAAAAAA4ySy2rDMBBF9/4KoXVcnMTOa1dKCl00lAZKHxSjSGNbjSwJaUJTSv69yHnYoSl0
+          o8WcuVd3RvqOCKFS0BmhvGLIa6vi6+LzZr54vh2tB/Xr0zgTi8Lh/ZY93vHlA+0FhVl9AMej6oqb
+          2ipAafQecwcMIbj2x8M0yyZZf9KA2ghQQVZajFMT11LLeJAM0jgZx/3JQV0ZycHTGXmLCCHkuzlD
+          Ti1gS2ck6R0rNXjPSqCzUxMh1BkVKpR5Lz0yjbTXQm40gm6iv8yXXeKg2HgW0umNUof67nSVMqV1
+          ZuUP/FQvpJa+yh0wb3Sw9WgsbeguIuS9GWlzlpJaZ2qLOZo16GDYT7K9H2032aEHhgaZ6opGvQt2
+          uQBkUvnOTihnvALRStsFso2QpgOiztC/w1zy3g8udfkf+xZwDhZB5NaBkPx84LbNQfhnf7WdltwE
+          pv7LI9R5IXUJzjq5f+XC5sMpSxM+HbGERrvoBwAA//8DAJN7IxXzAgAA
       headers:
-        CF-Cache-Status:
-          - DYNAMIC
         CF-RAY:
-          - 8f39fdb5cae1158a-SJC
+          - 8f425bb118049453-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -65,14 +63,14 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:22 GMT
+          - Wed, 18 Dec 2024 21:48:38 GMT
         Server:
           - cloudflare
         Set-Cookie:
-          - __cf_bm=lVkT7i5qloNOJW3VW5kf8Ohm6U080WiPUv6XirXCoFk-1734470782-1.0.1.1-nAgxt2GizSWkF.auEc_j1tv3Erjbd74Lsh9WJmMaZa_E8fpVuEZ8SsBIqLBHICQDV0sfwSjHgP9mTBHQujl_XA;
-            path=/; expires=Tue, 17-Dec-24 21:56:22 GMT; domain=.api.openai.com; HttpOnly;
+          - __cf_bm=shlFi0WrRQqtHm9BFHA8BA_DE3OgD.WLNX_BG0MJ.Uc-1734558518-1.0.1.1-dTPiGPfeRXm4eFyNx5Qhh98ITpHISNJJ15gnJl7VfBbOzj3CoF.H.Mssss_WvoWjPSiaq4ZWwBKCF16.mbMFig;
+            path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
             Secure; SameSite=None
-          - _cfuvid=YCWb3aZdtzEmsWTuiPgC.gchnL7jvJLEWh9yvJqAiAw-1734470782603-0.0.1.1-604800000;
+          - _cfuvid=LbfayFWmgFkPH4gOfhOfLicD7koAa3IqwrVpt0Q2uQ0-1734558518270-0.0.1.1-604800000;
             path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
         Transfer-Encoding:
           - chunked
@@ -82,28 +80,30 @@ interactions:
           - X-Request-ID
         alt-svc:
           - h3=":443"; ma=86400
+        cf-cache-status:
+          - DYNAMIC
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "124"
+          - "226"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29998"
         x-ratelimit-remaining-tokens:
-          - "29999876"
+          - "149999877"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_84a9ec4746765b74e4d84610ebc880ad
+          - req_c627f8d13c1969c6fd3a26f94a43a44f
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/test_eval_llm_config.yaml b/tests/cassettes/test_eval_llm_config.yaml
index 383479dc..7268d855 100644
--- a/tests/cassettes/test_eval_llm_config.yaml
+++ b/tests/cassettes/test_eval_llm_config.yaml
@@ -5,7 +5,7 @@ interactions:
         question, and a proposed answer to the question. Please tell me if the proposed
         answer is correct, given the correct answer. ONLY SAY ''YES'' OR ''NO''. No
         other output is permitted.\n\nQuestion: What is 25 * 10?\n\nCorrect answer:
-        250\n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o", "temperature":
+        250\n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o-mini", "temperature":
         0.5}'
       headers:
         accept:
@@ -15,7 +15,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "387"
+          - "392"
         content-type:
           - application/json
         host:
@@ -45,18 +45,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJLNasMwEITvfgqhc1ycH+okt0Ja6KUthBzaUowirW2lslZIG0gJefci
-          x4kT2kIvPsy3M55de58wxrXic8ZlLUg2zqR35dvTLF+8PJgRPhLUrtwszWKVPa92M+CD6MD1BiSd
-          XDcSG2eANNojlh4EQUwd5uPJJM/y6bgFDSow0VY5SieYjrLRJM2maXbbGWvUEgKfs/eEMcb27TNW
-          tAp2fM6ywUlpIARRAZ+fhxjjHk1UuAhBBxKW+KCHEi2BbVu/3i8viYdyG0QsZrfGdPrh/CqDlfO4
-          Dh0/66W2OtSFBxHQxthA6HhLDwljH+1K26uW3HlsHBWEn2BjYD47xvH+hj0cdoyQhOnlaXeF67BC
-          AQltwsVFuBSyBtU7+/OJrdJ4AZKLlX92+S37uLa21X/ieyAlOAJVOA9Ky+t9+zEP8Qf7a+x84rYw
-          D1+BoClKbSvwzuvjNy5dIfKZmiohhyVPDsk3AAAA//8DADQLsKzsAgAA
+          H4sIAAAAAAAAAwAAAP//jJJfa4MwFMXf/RQhz3Vo/8zWt1H60LExaAdjjCFpctVsMQlJpCul333E
+          WrVsg734cH73HM+9egwQwpzhFGFaEkcrLcK7fL9cMbp+vF/L/fLpsH3eVGabLDb5w+QFj7xD7T6A
+          uovrhqpKC3BcyTOmBogDnxonk+lsNp/F8wZUioHwtkK7cKrCiksejqPxNIySMJ637lJxChan6C1A
+          CKFj8/Q9JYMvnKJodFEqsJYUgNNuCCFslPAKJtZy64h0eNRDqqQD2VR/XW2HxEBeW+LbyVqIVj91
+          rxKq0EbtbMs7PeeS2zIzQKySPtY6pXFDTwFC781K9VVLrI2qtMuc+gTpA5PFOQ73h+xh3DKnHBG9
+          PG+vcB2WMXCECzu4CKaElsB6Z38+UjOuBiAYrPyzy2/Z57W5LP4T3wNKQTtgmTbAOL3etx8z4P+y
+          v8a6EzeFsT1YB1WWc1mA0Yafv3Gus9ucxhHEEexwcAq+AQAA//8DAPSiOYXxAgAA
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f39fdbac9db643b-SJC
+          - 8f425bb65e7c6453-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -64,7 +64,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Tue, 17 Dec 2024 21:26:23 GMT
+          - Wed, 18 Dec 2024 21:48:39 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -78,25 +78,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "231"
+          - "229"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999909"
+          - "149999909"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_6538a77713d1ec9b61a8e15f3cf37377
+          - req_1f8e0a7bd96417a061e010db00f50b6f
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/test_extract_answer[complex].yaml b/tests/cassettes/test_extract_answer[complex].yaml
index 319c6183..8b076955 100644
--- a/tests/cassettes/test_extract_answer[complex].yaml
+++ b/tests/cassettes/test_extract_answer[complex].yaml
@@ -11,7 +11,7 @@ interactions:
         believe the collapse was due to social unrest because of the prolonged epidemic
         of 2025. I tend to agree with the majority - although I can see both sides.
         Thus my response is that the social unrest was the significant factor in the
-        collapse of the regime.", "role": "user"}], "model": "gpt-4o", "temperature":
+        collapse of the regime.", "role": "user"}], "model": "gpt-4o-mini", "temperature":
         0}'
       headers:
         accept:
@@ -21,7 +21,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "861"
+          - "866"
         content-type:
           - application/json
         host:
@@ -51,18 +51,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArLc4Oa9E9KtrJ1REyAUOTal8TF8RnbgaKq3x05TZNW
-          FInFw/3uPb87+xARQqWgOaG8Zp43RsXr8nP/9VCtNKzbGuYvJSab3cdj+rySiw2dBAVud8D9WXXH
-          sTEKvER9wtwC8xBck2w2Xyyy5SztQIMCVJBVxsdzjNNpOo+nq3i67IU1Sg6O5uQ1IoSQQ3eGiFrA
-          nuZkOjlXGnCOVUDzoYkQalGFCmXOSeeZ9nQyQo7ag+5SPyGXTJFWW3BXPRbK1rEQUbdK9fXjcKnC
-          yljcup4P9VJq6erCAnOowwXOo6EdPUaEvHXDtVd5qbHYGF94fAcdDJPl4uRHx3WONO2ZR8/UpSib
-          3LArBHgmlbvYDuWM1yBG6bhK1gqJFyC6GPp3mFvep8Glrv5jPwLOwXgQhbEgJL8eeGyzED7bX23D
-          krvA1H07D01RSl2BNVae3rs0BcvuxUownpQ0OkY/AAAA//8DAOEzla34AgAA
+          H4sIAAAAAAAAAwAAAP//jJLNasMwEITvfgqhc1zsNE5S30qhhzZQaKE5lGIUaW2rkbVCkklDyLsX
+          OT92aAq96LDfzmh2pV1ECJWC5oTymnneGBXfl5uHx4VKt7PNy3KdLerpUj8/vavXxtkNHQUFrr6A
+          +5PqhmNjFHiJ+oC5BeYhuKaz20mWzbP0rgMNClBBVhkfTzBupJbxOBlP4mQWp/OjukbJwdGcfESE
+          ELLrzpBTC/imOUlGp0oDzrEKaH5uIoRaVKFCmXPSeaY9HfWQo/agu+hvyCVTpNUW3EWPhbJ1LOTU
+          rVLH+v58qcLKWFy5Iz/XS6mlqwsLzKEOFziPhnZ0HxHy2Q3XXuSlxmJjfOFxDToYptPs4Ef7nfZ0
+          fGQePVND0Wx0xa4Q4JlUbrAdyhmvQfTSfpWsFRIHIBoM/TvMNe/D4FJX/7HvAedgPIjCWBCSXw7c
+          t1kIP+6vtvOSu8DUbZ2HpiilrsAaKw/vXZpiWvI0gTSBFY320Q8AAAD//wMA8VLBff0CAAA=
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f42461018c5eb29-SJC
+          - 8f425bb69fe5250c-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -70,7 +70,7 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:33:52 GMT
+          - Wed, 18 Dec 2024 21:48:39 GMT
         Server:
           - cloudflare
         Transfer-Encoding:
@@ -84,25 +84,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "235"
+          - "491"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999790"
+          - "149999790"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_366dfd5f505d08facd0f7d10e64a9f5e
+          - req_0446ed4c188b77427f33f74f91e0d112
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/test_extract_answer[empty-proposal].yaml b/tests/cassettes/test_extract_answer[empty-proposal].yaml
index cb61987f..576fc15e 100644
--- a/tests/cassettes/test_extract_answer[empty-proposal].yaml
+++ b/tests/cassettes/test_extract_answer[empty-proposal].yaml
@@ -5,7 +5,7 @@ interactions:
         has fixed options. Repeat back which option the proposed answer matches. GIVE
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
-        ", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        ", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -14,7 +14,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "369"
+          - "374"
         content-type:
           - application/json
         host:
@@ -34,7 +34,7 @@ interactions:
         x-stainless-raw-response:
           - "true"
         x-stainless-retry-count:
-          - "1"
+          - "0"
         x-stainless-runtime:
           - CPython
         x-stainless-runtime-version:
@@ -44,18 +44,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyht0pb2xokLIEDiAkKRa29Sg+O1bBcFqv47cpq+
-          1CJx8WFmZzyz9jphDJSEOQOx5EE0Vqc31Vfb3mXF/fOoeDG3Ol+0D4WZPD79uOoVBlFBiw8UYae6
-          EtRYjUGR2dLCIQ8YXYfTvBiPp5M874iGJOooq21IC0pH2ahIs+s0m/TCJSmBHubsLWGMsXV3xohG
-          Ygtzlg12SIPe8xphvh9iDBzpiAD3XvnATYDBgRRkApou9THssFp5HlOZldY9vtnfo6m2jha+5/d4
-          pYzyy9Ih92Sipw9koWM3CWPvXZ/VSUSwjhobykCfaKLhdLi1g8MCD2RfFQIFri9oTsxKiYEr7Y/W
-          AYKLJcozQ8aAr6SiIyI5qnye5ZL3trYy9X/sD4QQaAPK0jqUSlzs25nH3/XX2H7FXWDw3z5gU1bK
-          1OisU9sHrmw5q/iCz6osv4Zkk/wCAAD//wMA7iORIukCAAA=
+          H4sIAAAAAAAAAwAAAP//jFJNS8QwFLz3V4R33kq72/3qTUQ8CSLiRaSkyWsbTZOQpKgs+98l3d22
+          y67gJYeZN5OZl+wiQkBwyAmwhnrWGhnfVl939zxzDy+CCfO6KB+TLadl6rbPTyuYBYUuP5D5k+qG
+          6dZI9EKrA80sUo/BNV0vsuVys0w3PdFqjjLIauPjTMetUCKeJ/MsTtZxujmqGy0YOsjJW0QIIbv+
+          DDkVx2/ISTI7IS06R2uEfBgiBKyWAQHqnHCeKg+zkWRaeVR99ClsseocDdFUJ+UR3w/3SF0bq0t3
+          5Ae8Ekq4prBInVbB03ltoGf3ESHvfZ/uLCIYq1vjC68/UQXDdXqwg3GLI3msCl57Kq9ozswKjp4K
+          6SbrAEZZg/zCkBCgHRd6QkSTypdZrnkfagtV/8d+JBhD45EXxiIX7Grf3jx8sb/GhhX3gcH9OI9t
+          UQlVozVWHB64MsWqYmmCaYIlRPvoFwAA//8DACbc6TvuAgAA
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f424615ca81eb32-SJC
+          - 8f425bb11ee3eb30-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -63,9 +63,15 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:33:53 GMT
+          - Wed, 18 Dec 2024 21:48:38 GMT
         Server:
           - cloudflare
+        Set-Cookie:
+          - __cf_bm=3EbR6c_9nmNeI58TWDLCiyFbbzWnxiCAQfgz1Ou5oXQ-1734558518-1.0.1.1-_OVXY1MiEfz9j5Sl02ocx_beYJRhzMj_5kdzhk9Gq_NIORYBNM4OqmSmTCUwNu.EObKQiWZdQdrwqZ84sr8.cQ;
+            path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
+            Secure; SameSite=None
+          - _cfuvid=U.00GXQIFA3gE8IldpDjXxcp1niJXAkehSRhHT85pWs-1734558518279-0.0.1.1-604800000;
+            path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
         Transfer-Encoding:
           - chunked
         X-Content-Type-Options:
@@ -77,25 +83,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "171"
+          - "249"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29998"
         x-ratelimit-remaining-tokens:
-          - "29999912"
+          - "149999913"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_de2070d3e02afd584ac618042c22382d
+          - req_63d20bd456f7f2145bc66a3ae269bc1e
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/test_extract_answer[gave-two].yaml b/tests/cassettes/test_extract_answer[gave-two].yaml
index a70e7e68..a529a885 100644
--- a/tests/cassettes/test_extract_answer[gave-two].yaml
+++ b/tests/cassettes/test_extract_answer[gave-two].yaml
@@ -5,7 +5,7 @@ interactions:
         has fixed options. Repeat back which option the proposed answer matches. GIVE
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
-        A or B", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        A or B", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -14,7 +14,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "375"
+          - "380"
         content-type:
           - application/json
         host:
@@ -34,7 +34,7 @@ interactions:
         x-stainless-raw-response:
           - "true"
         x-stainless-retry-count:
-          - "1"
+          - "0"
         x-stainless-runtime:
           - CPython
         x-stainless-runtime-version:
@@ -44,18 +44,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAA4xSTWsCMRS8768I7+wWv7XeChVKoR4KPbSlLDF5u5s2m5cmWVHE/16yWl3RQi85
-          zLyZzLxkmzAGSsKMgSh5EJXV6V2+Wq82z+Ut0n3xstBv8+H8cWFK9/r08A2dqKDlJ4rwq7oRVFmN
-          QZHZ08IhDxhde5PBcDSajAe9hqhIoo6ywoZ0SGm/2x+m3WnaHR+EJSmBHmbsPWGMsW1zxohG4hpm
-          rNv5RSr0nhcIs+MQY+BIRwS498oHbgJ0TqQgE9A0qduww7z2PKYytdYHfHe8R1NhHS39gT/iuTLK
-          l5lD7slETx/IQsPuEsY+mj71WUSwjiobskBfaKLhZLC3g9MCT+ShKgQKXF/RnJllEgNX2rfWAYKL
-          EuWFIWPAa6moRSStypdZrnnvaytT/Mf+RAiBNqDMrEOpxNW+jXn8XX+NHVfcBAa/8QGrLFemQGed
-          2j9wbjPZny4Fx8m0D8ku+QEAAP//AwDYqi3B6QIAAA==
+          H4sIAAAAAAAAAwAAAP//jFJNTwIxFLzvr2jemTW7fAhyI8TozUSjiRizKd23S7Xb17QPlRD+u+mC
+          gAETLz3MvJnOvHadCAG6hLEAtZCsGmfSSfU5vX6YXd2E2YRGt4Mp493qqZp/PD7jPXSiguZvqPhH
+          daGocQZZk93SyqNkjK75sNcfDEaDfNQSDZVooqx2nPYpbbTVaTfr9tNsmOajnXpBWmGAsXhJhBBi
+          3Z4xpy3xC8Yi6/wgDYYga4TxfkgI8GQiAjIEHVhahs6BVGQZbRv9GPZYLYOM0ezSmB2+2d9jqHae
+          5mHH7/FKWx0WhUcZyEbPwOSgZTeJEK9tn+WviOA8NY4Lpne00XDY29rBYYsHclcVmFiaM5pfZkWJ
+          LLUJR+sAJdUCyxNDIUAuS01HRHJU+TTLOe9tbW3r/9gfCKXQMZaF81hqdbZvax6/2F9j+xW3gSGs
+          AmNTVNrW6J3X2weuXHFZqTzDPMM5JJvkGwAA//8DAOjXCFXuAgAA
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f42460a68a467f1-SJC
+          - 8f425bb11e1069a2-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -63,9 +63,15 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:33:51 GMT
+          - Wed, 18 Dec 2024 21:48:38 GMT
         Server:
           - cloudflare
+        Set-Cookie:
+          - __cf_bm=eAk9PjLOP_uC98HrFuiPUxdGbMOD0FndASetRInyC8E-1734558518-1.0.1.1-czBHIlZrAXhRtJiNtQMJ4FNObmpYfP0sPzRSb84VB2iiFfmBNMFsZOSzB8kN5BWGvHDUXsKgWJTphYPTQzM3FA;
+            path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
+            Secure; SameSite=None
+          - _cfuvid=dYXAYAvcpEWoKaqCouzZ9rcGFRQEzhYA4XzFKsQi83I-1734558518200-0.0.1.1-604800000;
+            path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
         Transfer-Encoding:
           - chunked
         X-Content-Type-Options:
@@ -77,25 +83,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "241"
+          - "171"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29999"
         x-ratelimit-remaining-tokens:
-          - "29999911"
+          - "149999912"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 2ms
         x-ratelimit-reset-tokens:
           - 0s
         x-request-id:
-          - req_83d07d0983e1d4d1995bfa068db503dd
+          - req_efdaa27fda18e26d87bcadcc80237c76
       status:
         code: 200
         message: OK
diff --git a/tests/cassettes/test_extract_answer[not in options].yaml b/tests/cassettes/test_extract_answer[not in options].yaml
index e48c8e4c..70884d60 100644
--- a/tests/cassettes/test_extract_answer[not in options].yaml	
+++ b/tests/cassettes/test_extract_answer[not in options].yaml	
@@ -5,7 +5,7 @@ interactions:
         has fixed options. Repeat back which option the proposed answer matches. GIVE
         ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
         or ambiguous, return an empty string.\n\nOptions:\nB\nC\n\nProposed answer:
-        F", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
+        F", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
       headers:
         accept:
           - application/json
@@ -14,7 +14,7 @@ interactions:
         connection:
           - keep-alive
         content-length:
-          - "367"
+          - "372"
         content-type:
           - application/json
         host:
@@ -44,18 +44,18 @@ interactions:
     response:
       body:
         string: !!binary |
-          H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArr5gYl/d9uLIxIMLBQFLn2JXFxfJbtQKuq3x05DU0r
-          QGLJ8H73Xt5dckwYAyVhzUDUPIjG6vS+/Ni3T5TnlpYPyA/ukT4r2r3kcqKeYRQdtN2hCN+uO0GN
-          1RgUmTMWDnnAmJovJtPZbDGfZB1oSKKOtsqGdErpOBtP02yZZvPeWJMS6GHNXhPGGDt2z1jRSNzD
-          mnUxndKg97xCWF+GGANHOirAvVc+cBNgNEBBJqDpWm9gA9fIYdl6HpuZVuteP13epamyjra+5xe9
-          VEb5unDIPZmY6wNZ6OgpYeyt26m9qQnWUWNDEegdTQycr85xMBxxgHnPAgWuB3nRn+E2rJAYuNL+
-          6iQguKhRDs7hfryViq5AcrXyzy6/ZZ/XVqb6T/wAhEAbUBbWoVTidt9hzGH8w/4au5y4Kwz+4AM2
-          RalMhc46df7IpS1WJd/yVZlNlpCcki8AAAD//wMAJ6aXP+0CAAA=
+          H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9NyjpM/QGiAsSEuICEkKR62wSg2Nb9rY8qv47chLa
+          VC0SFx9mdsYza28jxkAWsGQgak6isSq+Kj9ubt0dyflYyIfy8b7mT5ss3Txff6sJjILCrN5Q0K/q
+          QpjGKiRpdEcLh5wwuKaLyXQ2y2Zp1hKNKVAFWWUpnpq4kVrG42Q8jZNFnGa9ujZSoIcle4kYY2zb
+          niGnLvATliwZ/SINes8rhOV+iDFwRgUEuPfSE9cEowMpjCbUbfQh7LBcex6i6bVSPb7b36NMZZ1Z
+          +Z7f46XU0te5Q+6NDp6ejIWW3UWMvbZ91kcRwTrTWMrJvKMOhvPLzg4OWzyQfVUgQ1yd0RyZ5QUS
+          l8oP1gGCixqLE0PGgK8LaQZENKh8muWcd1db6uo/9gdCCLSERW4dFlKc7duahy/219h+xW1g8F+e
+          sMlLqSt01snugUubz0uRJpgmuIJoF/0AAAD//wMAUYws+e4CAAA=
       headers:
         CF-Cache-Status:
           - DYNAMIC
         CF-RAY:
-          - 8f4246051c1ceb26-SJC
+          - 8f425bb11ca22513-SJC
         Connection:
           - keep-alive
         Content-Encoding:
@@ -63,14 +63,14 @@ interactions:
         Content-Type:
           - application/json
         Date:
-          - Wed, 18 Dec 2024 21:33:50 GMT
+          - Wed, 18 Dec 2024 21:48:38 GMT
         Server:
           - cloudflare
         Set-Cookie:
-          - __cf_bm=in2yLMzYdxfPIHSQDPq17chYgGhrTOolB6HZrJk9Iy8-1734557630-1.0.1.1-LYUU4oNWUKO8gNwjcIktYjnSyIsGLKQGmQKI54P4UxfMJ330MXeZFWWhVoJnP0b1M92ejFaHTWHlz4eHH30gIA;
-            path=/; expires=Wed, 18-Dec-24 22:03:50 GMT; domain=.api.openai.com; HttpOnly;
+          - __cf_bm=ABTDwd4t79cPLIko1hPlFoZXxUQ6rzPq8jHwq1Xy7XE-1734558518-1.0.1.1-Qqt3v2jz7xPx17Fx0ehWguxbmaMuZk4B3NM4Z1HW2aMmaaTMq2RvfX.y5A9X5qv4xoO0qWDJdyM.E9ahp.RW5A;
+            path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
             Secure; SameSite=None
-          - _cfuvid=El.JjK6nMT19ye2jesHXCIAySeg4BN7pKN7mVnzqSM8-1734557630598-0.0.1.1-604800000;
+          - _cfuvid=17oj8YL1hlYLaR7o.N8HEjWKDALCyYtBfmHe30jFAG0-1734558518262-0.0.1.1-604800000;
             path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
         Transfer-Encoding:
           - chunked
@@ -83,25 +83,25 @@ interactions:
         openai-organization:
           - future-house-xr4tdh
         openai-processing-ms:
-          - "253"
+          - "232"
         openai-version:
           - "2020-10-01"
         strict-transport-security:
           - max-age=31536000; includeSubDomains; preload
         x-ratelimit-limit-requests:
-          - "10000"
+          - "30000"
         x-ratelimit-limit-tokens:
-          - "30000000"
+          - "150000000"
         x-ratelimit-remaining-requests:
-          - "9999"
+          - "29996"
         x-ratelimit-remaining-tokens:
-          - "29999912"
+          - "149992191"
         x-ratelimit-reset-requests:
-          - 6ms
+          - 7ms
         x-ratelimit-reset-tokens:
-          - 0s
+          - 3ms
         x-request-id:
-          - req_5992eb433053f3b82b29a5319a96ef7e
+          - req_e11e29110308fec0a5310bf18d49c27d
       status:
         code: 200
         message: OK