From 19e9812e280b7f319a4591910de25cf41eec542b Mon Sep 17 00:00:00 2001 From: mondaychen Date: Wed, 15 May 2024 17:11:39 -0400 Subject: [PATCH 1/2] feat: scroll 2/3 of page instead of half for faster browsing --- src/helpers/rpc/domActions.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helpers/rpc/domActions.ts b/src/helpers/rpc/domActions.ts index be2b7ba..218e8d0 100644 --- a/src/helpers/rpc/domActions.ts +++ b/src/helpers/rpc/domActions.ts @@ -231,7 +231,7 @@ export class DomActions { public async scrollUp() { await this.sendCommand("Runtime.evaluate", { expression: - 'window.scrollBy({left: 0, top: -window.innerHeight/2, behavior: "smooth"})', + 'window.scrollBy({left: 0, top: -window.innerHeight/1.5, behavior: "smooth"})', }); await sleep(300); } @@ -239,7 +239,7 @@ export class DomActions { public async scrollDown() { await this.sendCommand("Runtime.evaluate", { expression: - 'window.scrollBy({left: 0, top: window.innerHeight/2, behavior: "smooth"})', + 'window.scrollBy({left: 0, top: window.innerHeight/1.5, behavior: "smooth"})', }); await sleep(300); } From d6984cd1f76bf6f69db7fcfe32987901825ac6f6 Mon Sep 17 00:00:00 2001 From: mondaychen Date: Wed, 15 May 2024 17:27:51 -0400 Subject: [PATCH 2/2] feat: optimize prompt for better results --- src/helpers/vision-agent/determineNextAction.ts | 2 +- src/helpers/vision-agent/tools.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helpers/vision-agent/determineNextAction.ts b/src/helpers/vision-agent/determineNextAction.ts index 092e415..7b8b5de 100644 --- a/src/helpers/vision-agent/determineNextAction.ts +++ b/src/helpers/vision-agent/determineNextAction.ts @@ -15,7 +15,7 @@ You can use the following tools: ${allToolsDescriptions} You will be given a task to perform, and an image. The image will contain two parts: on the left is a clean screenshot of the current page, and on the right is the same screenshot with interactive elements annotated with corresponding label. -You will also be given previous actions that you have taken. You may retry a failed action up to one time. +You will also be given previous actions that you have taken. If something does not work, try find an alternative solution. For example, instead of searching for a specific item that the user requested, perform a general search and apply filters, or simply browse the results page. You will also be given additional information of annotations. This is one example of expected response from you: diff --git a/src/helpers/vision-agent/tools.ts b/src/helpers/vision-agent/tools.ts index 28ca1bc..93a6100 100644 --- a/src/helpers/vision-agent/tools.ts +++ b/src/helpers/vision-agent/tools.ts @@ -52,7 +52,7 @@ export const scrollSchema = z.object({ name: z.literal("scroll"), description: z .literal( - 'Scroll the page to see the other parts. Use "up" or "down" to scroll half the height of the window. Use "top" or "bottom" to quickly scroll to the top or bottom of the page.', + 'Scroll the page to see the other parts. Use "up" or "down" to scroll 2/3 of height of the window. Use "top" or "bottom" to quickly scroll to the top or bottom of the page.', ) .optional(), args: z.object({