aitomatic · TheVinhLuong102 · Oct 9, 2024 · Oct 9, 2024 · Dec 15, 2024
diff --git a/examples/FinanceBench-Lite/.env.template → examples/FinanceBench-AMD/.env.template b/examples/FinanceBench-Lite/.env.template → examples/FinanceBench-AMD/.env.template
diff --git a/examples/FinanceBench-Lite/.gitignore → examples/FinanceBench-AMD/.gitignore b/examples/FinanceBench-Lite/.gitignore → examples/FinanceBench-AMD/.gitignore
diff --git a/examples/FinanceBench-Lite/Makefile → examples/FinanceBench-AMD/Makefile b/examples/FinanceBench-Lite/Makefile → examples/FinanceBench-AMD/Makefile
diff --git a/examples/FinanceBench-Lite/README.md → examples/FinanceBench-AMD/README.md b/examples/FinanceBench-Lite/README.md → examples/FinanceBench-AMD/README.md
@@ -1,9 +1,9 @@
-<!-- markdownlint-disable MD013 MD043 -->
+<!-- markdownlint-disable MD043 -->
 
 # OpenSSA-FinanceBench Lite benchmarking
 
 This is a lite version of the benchmarking of `OpenSSA` performance
-on the `FinanceBench` dataset. We will use 1 question from the dataset to demonstrate the use of `OpenSSA` with `DANA` architecture.
+on the `FinanceBench` dataset. We will use 1 question from the dataset to demonstrate the use of `OpenSSA` with `DANA` architecture. 
 
 ## [`FinanceBench` Dataset](https://github.com/patronus-ai/financebench/blob/main/financebench_sample_150.csv)
 
@@ -19,26 +19,26 @@
 __Solve__ the problem corresponding to the `financebench_id` `00807`:
 __`make dana-solve id=00807`__.
 
-### Question
+
+**Question**: 
 
 `Does 3M have a reasonably healthy liquidity profile based on its quick ratio for Q2 of FY2023? If the quick ratio is not relevant to measure liquidity, please state that and explain why.`
 
-### Knowledge
+**Knowledge**
 
 To solve this question, you can add knowledge related to `liquidity`. See the example below:
 
 - Liquidity Metric Formulas
-  - `(Net) Working Capital` = `(Total) Current Assets` - `(Total) Current Liabilities`
-  - `Working Capital Ratio` = `(Total) Current Assets` / `(Total) Current Liabilities`
+    - `(Net) Working Capital` = `(Total) Current Assets` - `(Total) Current Liabilities`
+    - `Working Capital Ratio` = `(Total) Current Assets` / `(Total) Current Liabilities`
 
 Go to `knowledge-store.txt` to add relevant knowledge yourself and see how it helps the agent to solve this question.
 
-### Program
-
-With the above-provided knowledge, the program we can provide to the agent could be as below:
+**Program**
 
+With the above-provided knowledge, the program we can provide to the agent could be as below: 
 - Goal: To assess liquidity health of a company, calculate `quick ratio`
-  - Task: To calculate `quick ratio`, use this formula
+    - Task: To calculate `quick ratio`, use this formula 
             `Quick Ratio` = (
           (`Cash & Cash Equivalents` +
            `Short-Term Investments or (Current) Marketable Securities` +
@@ -53,6 +53,5 @@
 Go to `program-store.yml` to see the details of the program yourself! You can experiment with different plans to see how they help the agent solve the problem as well.
 
 ## Advancing DANA Agent with Domain Knowledge and Program Store
-
 - To solve the question with added domain knowledge, run `make dana-solve-w-knowledge id=00807`
 - To solve the question with added domain knowledge and program store, run `make dana-solve-w-knowledge-and-prog-store id=00807`
diff --git a/examples/FinanceBench-Lite/dana.py → examples/FinanceBench-AMD/dana.py b/examples/FinanceBench-Lite/dana.py → examples/FinanceBench-AMD/dana.py
@@ -8,7 +8,7 @@
 # pylint: disable=wrong-import-order,wrong-import-position
 from data_and_knowledge import (DocName, FbId, Answer, Doc, FB_ID_COL_NAME, DOC_NAMES_BY_FB_ID, QS_BY_FB_ID,
                                 EXPERT_KNOWLEDGE, EXPERT_PROGRAMS, EXPERT_HTP_COMPANY_KEY, EXPERT_HTP_PERIOD_KEY)
-from util import QAFunc, enable_batch_qa_and_eval, log_qa_and_update_output_file
+from util import QAFunc, log_qa_and_update_output_file
 
 
 @cache
@@ -51,63 +51,55 @@ def get_or_create_adaptations(doc_name: DocName) -> dict[str, str]:
     return {EXPERT_HTP_COMPANY_KEY: (doc := Doc(name=doc_name)).company, EXPERT_HTP_PERIOD_KEY: doc.period}
 
 
-@enable_batch_qa_and_eval(output_name='DANA')
 @log_qa_and_update_output_file(output_name='DANA')
 def solve(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id]).solve(
         problem=QS_BY_FB_ID[fb_id],
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wKnowledge')
 @log_qa_and_update_output_file(output_name='DANA-wKnowledge')
 def solve_with_knowledge(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], expert_knowledge=True).solve(
         problem=QS_BY_FB_ID[fb_id],
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wProgStore')
 @log_qa_and_update_output_file(output_name='DANA-wProgStore')
 def solve_with_program_store(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], expert_programs=True).solve(
         problem=QS_BY_FB_ID[fb_id],
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wKnowledge-wProgStore')
 @log_qa_and_update_output_file(output_name='DANA-wKnowledge-wProgStore')
 def solve_with_knowledge_and_program_store(fb_id: FbId) -> Answer:
     return get_or_create_agent(DOC_NAMES_BY_FB_ID[fb_id], expert_knowledge=True, expert_programs=True).solve(
         problem=QS_BY_FB_ID[fb_id],
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wLlama3')
 @log_qa_and_update_output_file(output_name='DANA-wLlama3')
 def solve_with_llama3(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], use_llama3=True).solve(
         problem=QS_BY_FB_ID[fb_id],
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wKnowledge-wLlama3')
 @log_qa_and_update_output_file(output_name='DANA-wKnowledge-wLlama3')
 def solve_with_knowledge_with_llama3(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], expert_knowledge=True, use_llama3=True).solve(
         problem=QS_BY_FB_ID[fb_id],
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wProgStore-wLlama3')
 @log_qa_and_update_output_file(output_name='DANA-wProgStore-wLlama3')
 def solve_with_program_store_with_llama3(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], expert_programs=True, use_llama3=True).solve(
         problem=QS_BY_FB_ID[fb_id],
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wKnowledge-wProgStore-wLlama3')
 @log_qa_and_update_output_file(output_name='DANA-wKnowledge-wProgStore-wLlama3')
 def solve_with_knowledge_and_program_store_with_llama3(fb_id: FbId) -> Answer:
     return get_or_create_agent(DOC_NAMES_BY_FB_ID[fb_id], expert_knowledge=True, expert_programs=True, use_llama3=True).solve(  # noqa: E501

diff --git a/...s/FinanceBench-Lite/data_and_knowledge.py → ...es/FinanceBench-AMD/data_and_knowledge.py b/...s/FinanceBench-Lite/data_and_knowledge.py → ...es/FinanceBench-AMD/data_and_knowledge.py
diff --git a/examples/FinanceBench-Lite/ground-truths.yml → examples/FinanceBench-AMD/ground-truths.yml b/examples/FinanceBench-Lite/ground-truths.yml → examples/FinanceBench-AMD/ground-truths.yml
diff --git a/...les/FinanceBench-Lite/knowledge-store.txt → ...ples/FinanceBench-AMD/knowledge-store.txt b/...les/FinanceBench-Lite/knowledge-store.txt → ...ples/FinanceBench-AMD/knowledge-store.txt
diff --git a/examples/FinanceBench-Lite/log.py → examples/FinanceBench-AMD/log.py b/examples/FinanceBench-Lite/log.py → examples/FinanceBench-AMD/log.py
diff --git a/examples/FinanceBench-Lite/program-store.yml → examples/FinanceBench-AMD/program-store.yml b/examples/FinanceBench-Lite/program-store.yml → examples/FinanceBench-AMD/program-store.yml
diff --git a/...s/FinanceBench-Lite/rag-ground-truths.yml → ...es/FinanceBench-AMD/rag-ground-truths.yml b/...s/FinanceBench-Lite/rag-ground-truths.yml → ...es/FinanceBench-AMD/rag-ground-truths.yml
diff --git a/examples/FinanceBench-Lite/util.py → examples/FinanceBench-AMD/util.py b/examples/FinanceBench-Lite/util.py → examples/FinanceBench-AMD/util.py
@@ -9,7 +9,6 @@
 from tqdm import tqdm
 
 from data_and_knowledge import FbId, Answer, FB_IDS, DOC_NAMES_BY_FB_ID, QS_BY_FB_ID, OUTPUT_FILE_PATH, get_or_create_output_df  # noqa: E501
-from eval import eval_correctness, eval_all
 from log import switch_log_file
 
 if TYPE_CHECKING:
@@ -19,38 +18,6 @@
 type QAFunc = Callable[[FbId], Answer]
 
 
-@dataclass
-class enable_batch_qa_and_eval:  # noqa: N801
-    output_name: str
-
-    def __call__(self, qa_func: QAFunc) -> QAFunc:
-        @wraps(wrapped=qa_func)
-        def decorated_qa_func(fb_id: FbId) -> Answer | None:
-            if 'all' in fb_id.lower():
-                for _fb_id in tqdm(FB_IDS):
-                    # run inferencing and preliminarily evaluate
-                    eval_correctness(fb_id=_fb_id, answer=qa_func(_fb_id), output_name=self.output_name, human=False)
-
-                # rigorously evaluate again, including human evaluation for difficult cases
-                eval_all(output_name=self.output_name, refresh=True)
-                return None
-
-            if 'from:' in fb_id.lower():
-                for _fb_id in tqdm(FB_IDS[FB_IDS.index(fb_id[5:]):]):
-                    # run inferencing and preliminarily evaluate
-                    eval_correctness(fb_id=_fb_id, answer=qa_func(_fb_id), output_name=self.output_name, human=False)
-
-                # rigorously evaluate again, including human evaluation for difficult cases
-                eval_all(output_name=self.output_name, refresh=True)
-                return None
-
-            # run inferencing and evaluate
-            eval_correctness(fb_id=fb_id, answer=(answer := qa_func(fb_id)), output_name=self.output_name, human=True)
-            return answer
-
-        return decorated_qa_func
-
-
 @dataclass
 class log_qa_and_update_output_file:  # noqa: N801
     output_name: str