Added new extract answer feature #148

Closed
wants to merge 5 commits into from
3 changes: 2 additions & 1 deletion src/aviary/core.py
@@ -40,6 +40,7 @@
    EvalAnswerMode,
    encode_image_to_base64,
    eval_answer,
    extract_answer_llm,
    is_coroutine_callable,
    partial_format,
)
@@ -81,7 +82,7 @@
"argref_by_name",
"encode_image_to_base64",
"eval_answer",
"eval_answer",
"extract_answer_llm",
"fenv",
"is_coroutine_callable",
"join",
53 changes: 53 additions & 0 deletions src/aviary/utils.py
@@ -22,6 +22,21 @@
    "temperature": 0,
}

LLM_EXTRACT_CONFIG = {
    "prompt": (
        "You are evaluating answers for a test which has fixed options. "
Collaborator:

Can you add some statement that focuses the LLM on the message history? Otherwise, in paper-qa, we witnessed the LLM using its innate knowledge.

Contributor Author:

There is no message history here (?) Not sure what you mean?

Collaborator:

I guess this function is responsible for both (1) extracting a letter and (2) ensuring it matches a multiple-choice option.

What we saw in paper-qa was that, in the case of an empty string answer, the LLM would pull on its innate knowledge and could still select the correct multiple-choice option.

So for this:

"Can you add some statement that focuses the LLM on the message history?"

I guess what I should have said was: can you add a statement that focuses the LLM on just the proposed: str, and tries to avoid pulling on any innate knowledge?

Contributor Author:

Yeah - I was smart and didn't put the question into these, so there's no way it could get confused and try to answer.

Contributor Author:

We simultaneously posted.

There is no question, so I don't see how it would be possible for it to attempt to answer. I don't know what else I could write to make it clearer in the prompt.

Collaborator:

Ahhh I see, very clever! I follow and you're right.

Do you mind documenting that rationale somewhere in the code? Maybe a docstring in extract_answer_llm?
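
A minimal sketch of how that rationale could read as a docstring in extract_answer_llm (the exact wording below is an assumption, not text from this PR):

async def extract_answer_llm(
    proposed: str,
    options: list[str],
) -> str | None:
    """Return which fixed option the proposed answer matches, or None if none does.

    The prompt deliberately omits the original question, so the LLM has nothing to
    answer from its own knowledge; it can only match the proposed text against the
    listed options.
    """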

"Repeat back which option the proposed answer matches. "
"GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. "
"If the proposed answer is empty, invalid, or ambiguous, "
"return an empty string."
Comment on lines +30 to +31

Collaborator:

Can you upstream some of the tests from https://github.com/Future-House/paper-qa/blob/v5.8.0/tests/test_litqa.py#L117 to here? I think we should also have "multiple options are matched" mentioned somewhere.

It would be nice if paper-qa could just import this function and use it, instead of having its own evaluation LLM prompt.

Contributor Author:

Yup - will do
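
A sketch of the kind of parametrized test that could be upstreamed, including a "multiple options are matched" case. The ids mirror the cassettes added in this PR, but the exact parametrization and the use of pytest-asyncio are assumptions:

import pytest

from aviary.core import extract_answer_llm


@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("proposed", "options", "expected"),
    [
        # Exact (case-insensitive) match short-circuits without calling the LLM
        pytest.param("a", ["A", "B", "C"], "A", id="exact"),
        # Multiple options are matched, so the LLM should return an empty string -> None
        pytest.param("A or B", ["A", "B", "C"], None, id="not exact"),
        # Empty proposed answer never reaches the LLM
        pytest.param("", ["A", "B", "C"], None, id="empty"),
    ],
)
async def test_extract_answer_llm(proposed, options, expected):
    assert await extract_answer_llm(proposed, options) == expected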

"\n\nOptions:\n{options}"
"\n\nProposed answer: {proposed_answer}"
),
"model": "gpt-4o-mini",
"temperature": 0,
}


LLM_SCORE_EVAL_CONFIG = LLM_EVAL_CONFIG | {
    "prompt": (
        "Here is a question, the correct answer to the question, and a rubric for"
@@ -88,6 +103,44 @@ def is_coroutine_callable(obj) -> bool:
    return False


async def extract_answer_llm(
    proposed: str,
    options: list[str],
) -> str | None:
    """Extract the answer from a proposed answer and a list of options."""
    if not proposed:
        return None
    for option in options:
        if proposed.strip().casefold() == option.casefold().strip():
            return option

    try:
        from litellm import acompletion
    except ImportError as e:
        raise ImportError(
"eval_answer requires the 'llm' extra for 'litellm'. Please:"
" `pip install aviary[llm]`."
) from e
Comment on lines +117 to +123

Collaborator:

Can you look at how ToolSelector.__init__ takes acompletion: "Callable[..., Awaitable[ModelResponse]] | None" = None, and apply it as an input arg here?

What this does is let people use local models.

Also, feel free to YAGNI on this one.
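
A sketch of that input-arg approach, mirroring the ToolSelector.__init__ pattern (the Awaitable[Any] type and the fallback import below are assumptions about how it might look, not code from this PR):

from collections.abc import Awaitable, Callable
from typing import Any


async def extract_answer_llm(
    proposed: str,
    options: list[str],
    acompletion: Callable[..., Awaitable[Any]] | None = None,
) -> str | None:
    if acompletion is None:
        # Default to litellm only when no custom completion callable
        # (e.g. a local model client) is supplied.
        from litellm import acompletion
    # ... rest of the body unchanged, calling `await acompletion(...)` as below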

    config = LLM_EXTRACT_CONFIG
    prompt = cast(str, config["prompt"]).format(
        options="\n".join(options),
        proposed_answer=proposed,
    )

    response = await acompletion(
        model=config["model"],
        temperature=config["temperature"],
        messages=[{"content": prompt, "role": "user"}],
    )

    extracted = response.choices[0].message.content.strip()
    for option in options:
        if extracted.casefold() == option.casefold().strip():
            return option

    return None
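
A minimal usage sketch of extract_answer_llm as defined above (the option strings come from the "complex" cassette below; the proposed answer and printed result are illustrative):

import asyncio

from aviary.core import extract_answer_llm

options = ["Economic factors", "Social unrest", "Political corruption"]
match = asyncio.run(
    extract_answer_llm("I believe the regime collapsed due to social unrest.", options)
)
# Expected to be "Social unrest"; None would mean no single option was matched.
print(match)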


async def eval_answer(
    proposed: str,
    correct: str,
109 changes: 109 additions & 0 deletions tests/cassettes/test_extract_answer_llm[complex].yaml
@@ -0,0 +1,109 @@
interactions:
- request:
body:
'{"messages": [{"content": "You are evaluating answers for a test which
has fixed options. Repeat back which option the proposed answer matches. GIVE
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\nEconomic factors\nSocial
unrest\nPolitical corruption\n\nProposed answer: Based on the context given,
Serif et al. (2026) claim that the overwhelming cause of regime collapse arises
from economic factors. Yet, most other scholars (Gerald and Robinson for example)
believe the collapse was due to social unrest because of the prolonged epidemic
of 2025. I tend to agree with the majority - although I can see both sides.
Thus my response is that the social unrest was the significant factor in the
collapse of the regime.", "role": "user"}], "model": "gpt-4o-mini", "temperature":
0}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "866"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.2
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.2
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.4
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jJJBTwIxEIXv+yuanlkDLLDITT140OiBxJgYs+m2w1Lpdpp2ViGE/266
IAsREy89zDfv9c2024QxrhWfMS6XgmTtTHqjHp59/YQyyLn6vJ28Klo/ipfNF6zu7nkvKrD8AEk/
qiuJtTNAGu0eSw+CILoO8iy7zibjPGtBjQpMlFWO0hGmtbY6HfaHo7Sfp4PpQb1ELSHwGXtLGGNs
254xp1Ww5jPW7/1UaghBVMBnxybGuEcTK1yEoAMJS7zXQYmWwLbR5yi1MKyxHsJZj4dFE0TMaRtj
DvXd8VKDlfNYhgM/1hfa6rAsPIiANl4QCB1v6S5h7L0drjnLy53H2lFBuAIbDQeT8d6Pdzvt6PDA
CEmYU1Heu2BXKCChTTjZDpdCLkF10m6VolEaT0ByMvTvMJe894NrW/3HvgNSgiNQhfOgtDwfuGvz
EH/cX23HJbeBedgEgrpYaFuBd17v33vhirIUmZxC3i95sku+AQAA//8DAAcy6K79AgAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f070b7e9be306ad-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 11 Dec 2024 17:02:53 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "244"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "30000"
x-ratelimit-limit-tokens:
- "150000000"
x-ratelimit-remaining-requests:
- "29999"
x-ratelimit-remaining-tokens:
- "149999790"
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_e8f3bb69f3add846e2a40af6c0982db6
status:
code: 200
message: OK
version: 1
102 changes: 102 additions & 0 deletions tests/cassettes/test_extract_answer_llm[not exact].yaml
@@ -0,0 +1,102 @@
interactions:
- request:
body:
'{"messages": [{"content": "You are evaluating answers for a test which
has fixed options. Repeat back which option the proposed answer matches. GIVE
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
A or B", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- "380"
content-type:
- application/json
host:
- api.openai.com
user-agent:
- AsyncOpenAI/Python 1.57.2
x-stainless-arch:
- arm64
x-stainless-async:
- async:asyncio
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.57.2
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.4
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA4xSy07DMBC85yusPTcobVpCe0PigARSOSEkhCLH3iYGxzb2FlGq/jty+krVInHx
YWZnPLP2OmEMlIQZA9FwEq3T6a18mH/OH5ufVRVoNZk+PRfmXovs5e4rz2EQFbZ6R0F71ZWwrdNI
ypotLTxywug6LPJ8ml9PilFHtFaijrLaUTq2aauMSkfZaJxmRTq82akbqwQGmLHXhDHG1t0ZcxqJ
3zBj2WCPtBgCrxFmhyHGwFsdEeAhqEDcEAyOpLCG0HTR+7DHxTLwGM0std7hm8M92tbO2yrs+AO+
UEaFpvTIgzXRM5B10LGbhLG3rs/yJCI4b1tHJdkPNNGwyLd2cNzikdxVBbLE9QXNiVkpkbjSobcO
EFw0KM8MGQO+lMr2iKRX+TzLJe9tbWXq/9gfCSHQEcrSeZRKXOzbmccv9tfYYcVdYAirQNiWC2Vq
9M6r7QMvXFlVPBc3WGQVJJvkFwAA//8DABYKnlruAgAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f070b799eb5679d-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 11 Dec 2024 17:02:52 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "193"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "30000"
x-ratelimit-limit-tokens:
- "150000000"
x-ratelimit-remaining-requests:
- "29999"
x-ratelimit-remaining-tokens:
- "149999912"
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_24d3312a6ad717a657fe3e693bd24613
status:
code: 200
message: OK
version: 1