Add vLLM backend for open weight model evaluation #34

Open · wants to merge 1 commit into `main`
`agentlite/llm/agent_llms.py` (26 additions, 0 deletions)
```diff
@@ -1,6 +1,8 @@
 from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 from openai import OpenAI
+from huggingface_hub import model_info
+from huggingface_hub.utils._errors import RepositoryNotFoundError
 
 from agentlite.llm.LLMConfig import LLMConfig
 
```
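One caveat worth flagging (not part of the diff): `huggingface_hub.utils._errors` is a private module, and recent `huggingface_hub` releases expose the same exception from a public path. A hedged alternative, assuming a reasonably recent `huggingface_hub`:

```python
# Public import path in newer huggingface_hub releases; the private
# huggingface_hub.utils._errors path used in the diff may move between versions.
from huggingface_hub.utils import RepositoryNotFoundError
```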

```diff
@@ -17,6 +19,14 @@
 ]
 OPENAI_LLM_MODELS = ["text-davinci-003", "text-ada-001"]
 
+def is_huggingface_model(llm_name: str) -> bool:
+    """Check if the model is available on the Hugging Face Hub"""
+    try:
+        model_info(llm_name)
+        return True
+    except RepositoryNotFoundError:
+        return False
+
 
 class BaseLLM:
     def __init__(self, llm_config: LLMConfig) -> None:
```
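For illustration, a minimal sketch of how the helper behaves (not part of the PR). Note that `model_info` makes an HTTP request to the Hub, so the result depends on network access and repo visibility, and exceptions other than `RepositoryNotFoundError` (e.g. connection failures) will propagate:

```python
# Illustrative only: expected routing decision for a Hub repo vs. a closed model name.
print(is_huggingface_model("Salesforce/xLAM-v0.1-r"))  # expected True: repo exists on the Hub
print(is_huggingface_model("gpt-4-0613"))              # expected False: not a Hub repo
```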
```diff
@@ -50,6 +60,20 @@ def run(self, prompt: str):
         )
         return response.choices[0].message.content
 
+class VllmChatModel(BaseLLM):
+    def __init__(self, llm_config: LLMConfig):
+        super().__init__(llm_config)
+        self.client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+    def run(self, prompt: str):
+        response = self.client.chat.completions.create(
+            model=self.llm_name,
+            messages=[
+                {"role": "user", "content": prompt},
+            ],
+        )
+        return response.choices[0].message.content
+
 
 class LangchainLLM(BaseLLM):
     def __init__(self, llm_config: LLMConfig):
```

> **Author comment** on the `base_url` line: This is the default endpoint in vLLM, but I could add it as an env variable if preferred.
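A minimal sketch of the env-variable option the comment mentions; the variable name `VLLM_ENDPOINT` and its default are assumptions, not part of this PR:

```python
import os

from openai import OpenAI

# Hypothetical: read the vLLM endpoint from the environment, falling back
# to vLLM's default OpenAI-compatible address.
base_url = os.environ.get("VLLM_ENDPOINT", "http://localhost:8000/v1")
client = OpenAI(base_url=base_url, api_key="EMPTY")
```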
```diff
@@ -117,6 +141,8 @@ def get_llm_backend(llm_config: LLMConfig):
         return LangchainChatModel(llm_config)
     elif llm_name in OPENAI_LLM_MODELS:
         return LangchainLLM(llm_config)
+    elif is_huggingface_model(llm_name):
+        return VllmChatModel(llm_config)
     else:
         return LangchainLLM(llm_config)
         # TODO: add more llm providers and inference APIs but for now we are using langchainLLM as the default
```
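To see how the new branch is reached end to end, here is a hedged usage sketch. The `LLMConfig` dict keys mirror AgentLite's existing examples and are assumptions here, and running it requires a vLLM server already serving the model on `localhost:8000`:

```python
from agentlite.llm.agent_llms import get_llm_backend
from agentlite.llm.LLMConfig import LLMConfig

# Assumed config shape, mirroring AgentLite's benchmark scripts.
llm_config = LLMConfig({"llm_name": "Salesforce/xLAM-v0.1-r", "temperature": 0.0})
llm = get_llm_backend(llm_config)  # is_huggingface_model() routes this to VllmChatModel
print(llm.run("Say hello."))
```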
`benchmark/README.md` (15 additions, 1 deletion)
it is highly suggested to run webshop in the background with `tmux`.

2. Since AgentLite uses a different Python version, you should create a new environment for AgentLite.
3. Run AgentLite evaluation in this folder with

```
cd webshop
python evaluate_webshop.py --llm gpt-4-0613 --agent_arch act
```

To evaluate a model from the [Hugging Face Hub](https://huggingface.co/models) with a [vLLM](https://github.com/vllm-project/vllm) backend, first spin up the server with:

```shell
vllm serve Salesforce/xLAM-v0.1-r --tensor-parallel-size {NUM_GPUS}
```
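Before launching the evaluation, you can optionally confirm the server is up; this check is not part of the PR, but vLLM's OpenAI-compatible server exposes the standard model-listing route:

```shell
curl http://localhost:8000/v1/models
```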

Then, in a separate terminal run:

```shell
python evaluate_webshop.py --llm Salesforce/xLAM-v0.1-r --agent_arch act
```


## Tool-query
We follow the [AgentBoard](https://github.com/hkust-nlp/AgentBoard) environment to set up the tool-query benchmark, and we design the individual agents via AgentLite with all the corresponding function calls as actions.
You should first get a `data/tool-query` folder, which is a copy of data from AgentBoard with
`benchmark/webshop/evaluate_webshop.py` (1 addition, 1 deletion)
```diff
@@ -88,7 +88,7 @@ def get_runned_ids(file_path):
 args = parser.parse_args()
 rewards = []
 all_task_ids = list(range(0, 251))
-REWARD_LOG_FILE = f"{args.llm}_{args.agent_arch}_results_webshop.csv"
+REWARD_LOG_FILE = f"{args.llm.replace('/', '_')}_{args.agent_arch}_results_webshop.csv"
 runned_ids = get_runned_ids(REWARD_LOG_FILE)
 if runned_ids is None:
     evaluate_ids = all_task_ids
```

> **Author comment** on the `REWARD_LOG_FILE` line: This is needed because Hugging Face model repos are in the form `{org}/{repo_name}`, which causes problems when trying to write the file to disk.
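For illustration, the sanitisation produces filenames like the following (values taken from the examples above):

```python
# Illustrative only: how the reward log filename is derived for a Hub model.
llm, agent_arch = "Salesforce/xLAM-v0.1-r", "act"
print(f"{llm.replace('/', '_')}_{agent_arch}_results_webshop.csv")
# -> Salesforce_xLAM-v0.1-r_act_results_webshop.csv
```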