Moved to extract_answer from #148 and back to gpt-4o-mini #161

Merged
merged 3 commits on Dec 18, 2024
2 changes: 2 additions & 0 deletions src/aviary/core.py
@@ -40,6 +40,7 @@
     EvalAnswerMode,
     encode_image_to_base64,
     eval_answer,
+    extract_answer,
     is_coroutine_callable,
     partial_format,
 )
@@ -82,6 +83,7 @@
     "encode_image_to_base64",
     "eval_answer",
+    "extract_answer",
     "fenv",
     "is_coroutine_callable",
     "join",
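With this change, extract_answer is imported in core.py and re-exported via __all__, so it can be pulled straight from aviary.core. A minimal usage sketch, assuming an installed aviary package (the option strings are invented for illustration; only an exact option match avoids the LLM call, anything else needs an OpenAI key for the gpt-4o-mini evaluator):

import asyncio

from aviary.core import extract_answer

async def main() -> None:
    # Exact (case-insensitive) matches are resolved locally, without calling the LLM.
    match = await extract_answer(proposed_answer="42", options=["-84", "cheesecake", "11", "42"])
    print(match)  # "42"
    # A free-form proposal like "the answer is forty-two" would instead be routed
    # through the prompt defined in LLM_EXTRACT_CONFIG below.

asyncio.run(main())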
105 changes: 57 additions & 48 deletions src/aviary/utils.py
@@ -3,10 +3,9 @@
 import inspect
 import io
 import random
-import re
 import string
 from ast import literal_eval
-from collections.abc import Awaitable, Callable, Sequence
+from collections.abc import Sequence
 from enum import StrEnum
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, cast

@@ -21,8 +20,8 @@
 import numpy as np


-DEFAULT_EVAL_MODEL_NAME = "gpt-4o"
-LLM_BOOL_EVAL_CONFIG = {
+DEFAULT_EVAL_MODEL_NAME = "gpt-4o-mini"
+LLM_BOOL_EVAL_CONFIG: dict[str, Any] = {
     "prompt": (
         "Here is a question, the correct answer to the question, and a proposed answer"
         " to the question. Please tell me if the proposed answer is correct, given the"
@@ -35,6 +34,18 @@
     "temperature": 0,
 }

+LLM_EXTRACT_CONFIG = LLM_BOOL_EVAL_CONFIG | {
+    "prompt": (
+        "You are evaluating answers for a test which has fixed options. "
+        "Repeat back which option the proposed answer matches. "
+        "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. "
+        "If the proposed answer is empty, invalid, or ambiguous, "
+        "return an empty string."
+        "\n\nOptions:\n{options}"
+        "\n\nProposed answer: {proposed_answer}"
+    )
+}
+
 LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | {
     "prompt": (
         "Here is a question, the correct answer to the question, and a rubric for"
@@ -175,21 +186,36 @@ async def eval_answer(
     raise RuntimeError(f"Invalid evaluation mode: {eval_mode}")


+async def extract_answer(
+    proposed_answer: str, options: Sequence[str], llm_eval_config: dict | None = None
+) -> str | None:
+    """Extract the answer matching a proposal from a list of options using an LLM."""
+    for option in options:
+        if proposed_answer.strip().casefold() == option.strip().casefold():
+            return option
+
+    default_config = LLM_EXTRACT_CONFIG
+    config = llm_eval_config or default_config
+    response_msg = await run_prompt(
+        prompt=config.get("prompt", default_config["prompt"]).format(
+            options="\n".join(options),
+            proposed_answer=proposed_answer,
+        ),
+        model=config.get("model", default_config["model"]),
+        temperature=config.get("temperature", default_config["temperature"]),
+    )
+    answer = response_msg.strip().casefold()  # noqa: FURB184
+    for option in options:
+        if answer == option.strip().casefold():
+            return option
+    return None
+
+
 _CAPITAL_A_INDEX = ord("A")


 class MultipleChoiceQuestion(BaseModel):
     QUESTION_PROMPT_TEMPLATE: ClassVar[str] = "Q: {question}\n\nOptions:\n{options}"
-    # TODO: combine with above eval_answer and its prompts
-    EVALUATION_PROMPT_TEMPLATE: ClassVar[str] = (
-        "Given the following question and a proposed answer to the question, return the"
-        " single-letter choice in the question that matches the proposed answer."
-        " If the proposed answer is blank or an empty string,"
-        " or multiple options are matched, respond with '0'."
-        "\n\nQuestion: {qa_prompt}"
-        "\n\nProposed Answer: {qa_answer}"
-        "\n\nSingle Letter Answer:"
-    )
     DEFAULT_UNSURE_OPTION: ClassVar[str] = (
         "Insufficient information to answer this question"
     )
@@ -280,18 +306,14 @@ def split_options(options: str) -> list[str]:
         return split_options

     async def grade(
-        self, answer: str, prompt_runner: Callable[[str], Awaitable[str]] | None = None
-    ) -> "tuple[MultipleChoiceEvaluation, str, str]":
-        if prompt_runner is None:
-            prompt_runner = run_prompt
-        eval_prompt = self.EVALUATION_PROMPT_TEMPLATE.format(
-            qa_prompt=self.question_prompt, qa_answer=answer
-        )
-        raw_evaluation = await prompt_runner(eval_prompt)
Review comment (Collaborator Author): Also thanks to this change we no longer need raw_evaluation

-        evaluation, parsed_answer = MultipleChoiceEvaluation.from_answer(
-            raw_evaluation, self
+        self, proposed_answer: str
+    ) -> "tuple[MultipleChoiceEvaluation, str | None]":
+        extracted_answer = await extract_answer(
+            proposed_answer=proposed_answer, options=self.options
         )
-        return evaluation, raw_evaluation, parsed_answer
+        return MultipleChoiceEvaluation.from_answer(
+            extracted_answer, self
+        ), extracted_answer


 class MultipleChoiceEvaluation(StrEnum):
@@ -323,32 +345,19 @@ def calculate_accuracy_precision(

     @classmethod
     def from_answer(
-        cls, answer: str, question: MultipleChoiceQuestion
-    ) -> "tuple[MultipleChoiceEvaluation, str]":
+        cls, extracted_answer: str | None, question: MultipleChoiceQuestion
+    ) -> "MultipleChoiceEvaluation":
         """Make an evaluation from the input answer and multiple choice question.

         Returns:
-            Two-tuple of answer enum and the raw answer extracted from the input answer.
+            Evaluation corresponding to the parsed answer.
         """
-        # SEE: https://regex101.com/r/vcE9Hb/1
-        letter_search = re.search(r"([A-Z])\)?", answer, re.DOTALL)
-        # Get the letter answer, or fail over to the first non-whitespace char
-        answer_char = (
-            letter_search.group(1)
-            if letter_search is not None
-            else answer.split()[0][0].upper()
-        )
-        answer_letter_index = ord(answer_char[0]) - _CAPITAL_A_INDEX
-        if answer_letter_index < 0 or answer_letter_index > len(question.options):
-            # The result extracted was not in the options (e.g. '0')
-            return cls.INCORRECT, answer_char
+        if extracted_answer is None:
+            return MultipleChoiceEvaluation.INCORRECT
         # From here, if we don't match either the ideal or the unsure multiple choice
         # options then we declare the answer as incorrect.
-        if (
-            question.unsure_answer_index is not None
-            and answer_letter_index == question.unsure_answer_index
-        ):
-            return cls.UNSURE, cast(str, question.unsure_answer)
-        if answer_letter_index == question.ideal_answer_index:
-            return cls.CORRECT, question.ideal_answer
-        return cls.INCORRECT, question.options[answer_letter_index]
+        if extracted_answer == question.ideal_answer:
+            return MultipleChoiceEvaluation.CORRECT
+        if question.unsure_answer and extracted_answer == question.unsure_answer:
+            return MultipleChoiceEvaluation.UNSURE
+        return MultipleChoiceEvaluation.INCORRECT
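Pulling the utils.py changes together: grade now delegates option matching to extract_answer and returns a two-tuple of the evaluation and the extracted option (or None), instead of the old three-tuple that carried raw_evaluation. A rough usage sketch, assuming the MultipleChoiceQuestion constructor takes question, options, and ideal_answer fields (field names inferred from the surrounding diff, not confirmed by it; a non-exact proposal triggers the gpt-4o-mini evaluator, so an OpenAI key is needed at runtime):

import asyncio

from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion

async def demo() -> None:
    mcq = MultipleChoiceQuestion(
        question="What is the meaning of life?",
        options=["-84", "cheesecake", "11", "42"],
        ideal_answer="42",
    )
    # The proposed answer is matched against the verbatim options by extract_answer,
    # then from_answer maps the match onto CORRECT / INCORRECT / UNSURE.
    evaluation, extracted = await mcq.grade("I believe the answer is 42")
    assert isinstance(evaluation, MultipleChoiceEvaluation)
    print(evaluation, extracted)

asyncio.run(demo())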
@@ -1,13 +1,12 @@
 interactions:
 - request:
     body:
-      '{"messages": [{"content": "Given the following question and a proposed
-        answer to the question, return the single-letter choice in the question that
-        matches the proposed answer. If the proposed answer is blank or an empty string,
-        or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
-        the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer
-        this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: 14\n\nSingle
-        Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
+      '{"messages": [{"content": "You are evaluating answers for a test which
+        has fixed options. Repeat back which option the proposed answer matches. GIVE
+        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
+        or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information
+        to answer this question\ncheesecake\n11\n42\n\nProposed answer: 14", "role":
+        "user"}], "model": "gpt-4o-mini", "temperature": 0}'
     headers:
       accept:
       - application/json
@@ -16,7 +15,7 @@ interactions:
       connection:
       - keep-alive
       content-length:
-      - "513"
+      - "442"
       content-type:
       - application/json
       host:
@@ -36,7 +35,7 @@
       x-stainless-raw-response:
       - "true"
       x-stainless-retry-count:
-      - "1"
+      - "0"
       x-stainless-runtime:
       - CPython
       x-stainless-runtime-version:
@@ -46,28 +45,34 @@
   response:
     body:
       string: !!binary |
-        H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hVC57goqamd3HpooPQBORVSilGktaNW1gpJoY+Q/15k
-        u7FDU+jFh/l2xrNr7xNCqJJ0QajY8iAaq9Prav24NO7rNmMKH55ulvd8VaxXHu/ejaOT6MDNK4jw
-        47oQ2FgNQaHpsHDAA8TUaX6ZZTnLi3kLGpSgo622Ic0wnbFZlrIiZVe9cYtKgKcL8pwQQsi+fcaK
-        RsIHXRA2+VEa8J7XQBfHIUKoQx0Vyr1XPnAT6GSAAk0A07ZmY91BtfM81jI7rXv9cHyRxto63Pie
-        H/VKGeW3pQPu0cRQH9DSlh4SQl7ahXYnHal12NhQBnwDEwOnrOjy6HDCEe1ZwMD12DSfnIkrJQSu
-        tB9dhAoutiAH63A+vpMKRyAZLf27zLnsbnFl6v/ED0AIsAFkaR1IJU4XHsYcxB/sr7HjkdvC1H/6
-        AE1ZKVODs05137iyJc/nspBcTCuaHJJvAAAA//8DAGY5XevsAgAA
+        H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyh9P24V0AMHBBK9gFDk2pvU4NiWveVV9d+R00da
+        tUhcfJjZGc+svU4YAyVhwkAsOYnK6XRafF7fzuWPeLwZP4v7p5maTwd3GD6yB5pBKyrs4g0F7VVX
+        wlZOIylrtrTwyAmja3vY7fX7o357VBOVlaijrHSU9mxaKaPSTtbppdkwbY926qVVAgNM2EvCGGPr
+        +ow5jcQvmLCstUcqDIGXCJPDEGPgrY4I8BBUIG4IWg0prCE0dfRj2GOxCjxGMyutd/jmcI+2pfN2
+        EXb8AS+UUWGZe+TBmugZyDqo2U3C2GvdZ3USEZy3laOc7DuaaDjqb+2g2WJD7qoCWeL6gubELJdI
+        XOlwtA4QXCxRnhkyBnwllT0ikqPK51kueW9rK1P+x74hhEBHKHPnUSpxsW9tHr/YX2OHFdeBIXwH
+        wiovlCnRO6+2D1y4vDvmvUyMBzyDZJP8AgAA//8DADaBBszuAgAA
     headers:
       CF-Cache-Status:
       - DYNAMIC
       CF-RAY:
-      - 8f39fde1cf88cf1b-SJC
+      - 8f425bb2ac70f953-SJC
       Connection:
       - keep-alive
       Content-Encoding:
       - gzip
       Content-Type:
       - application/json
       Date:
-      - Tue, 17 Dec 2024 21:26:29 GMT
+      - Wed, 18 Dec 2024 21:48:38 GMT
       Server:
       - cloudflare
+      Set-Cookie:
+      - __cf_bm=Z3Wkkk2LQA2GKAPZVirKPYLTJfmm9Luttv26RxPBKro-1734558518-1.0.1.1-4BZR47qupd.QCWRMrfyj_F2lS0fqBEuzxwPZTqYPUxSKwdzL4S_8YWk9ofOPXhFEnkMN6nwgWjBLjAR4nioxiQ;
+        path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
+        Secure; SameSite=None
+      - _cfuvid=B7CeJKL1WXveU2pmeUGy_AFjPsbf25SvdiSN_4fxTXE-1734558518441-0.0.1.1-604800000;
+        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
       Transfer-Encoding:
       - chunked
       X-Content-Type-Options:
@@ -79,25 +84,25 @@
       openai-organization:
       - future-house-xr4tdh
       openai-processing-ms:
-      - "363"
+      - "144"
      openai-version:
       - "2020-10-01"
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       x-ratelimit-limit-requests:
-      - "10000"
+      - "30000"
       x-ratelimit-limit-tokens:
-      - "30000000"
+      - "150000000"
       x-ratelimit-remaining-requests:
-      - "9999"
+      - "29999"
       x-ratelimit-remaining-tokens:
-      - "29999874"
+      - "149999896"
       x-ratelimit-reset-requests:
-      - 6ms
+      - 2ms
       x-ratelimit-reset-tokens:
       - 0s
       x-request-id:
-      - req_aff8daa48aa43d3df077f97da6136e5a
+      - req_503cd8163bd0d3b634eb723d6874b1da
     status:
       code: 200
       message: OK
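As a sanity check, the new request body recorded in this cassette is just LLM_EXTRACT_CONFIG's prompt formatted with the test's options and the proposed answer "14", sent to gpt-4o-mini at temperature 0. Roughly, as a sketch assuming LLM_EXTRACT_CONFIG is imported from aviary.utils:

from aviary.utils import LLM_EXTRACT_CONFIG

options = [
    "-84",
    "Insufficient information to answer this question",
    "cheesecake",
    "11",
    "42",
]
# Reproduces the "content" field of the recorded request body above.
prompt = LLM_EXTRACT_CONFIG["prompt"].format(
    options="\n".join(options),
    proposed_answer="14",
)
print(prompt)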