Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: supported text-based model #29

Merged
merged 5 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 81 additions & 26 deletions src/helpers/availableActions.ts
Original file line number Diff line number Diff line change
@@ -1,60 +1,109 @@
// TODO: support the old Taxy actions
export const availableActions = [
// TODO: refactor such that it only has one "availableActions"
// which dynamically decides whether it's "label" or "elementId" based on the model type
export const availableActionsVision = [
{
name: 'click',
name: "click",
description:
'Clicks on an element with the text label appears on or associated with it.',
"Clicks on an element with the text label appears on or associated with it.",
args: [
{
name: 'label',
type: 'string',
name: "label",
type: "string",
},
],
},
{
name: 'setValue',
name: "setValue",
description:
'Focuses on and sets the value of an input element. Label can be the text label appears on or associated with it, or the value in it',
"Focuses on and sets the value of an input element. Label can be the text label appears on or associated with it, or the value in it",
args: [
{
name: 'label',
type: 'string',
name: "label",
type: "string",
},
{
name: 'value',
type: 'string',
name: "value",
type: "string",
},
],
},
{
name: 'scroll',
name: "scroll",
description: 'Scroll the page up or down. Value can be "up" or "down"',
args: [
{
name: 'value',
type: 'string',
name: "value",
type: "string",
},
],
},
{
name: 'finish',
description: 'Indicates the task is finished',
name: "finish",
description: "Indicates the task is finished",
args: [],
},
{
name: 'fail',
description: 'Indicates that you are unable to complete the task',
name: "fail",
description: "Indicates that you are unable to complete the task",
args: [],
},
] as const;

/**
 * Action table in which page elements are addressed by an `elementId`
 * argument (the sibling `availableActionsVision` table addresses them by a
 * text `label` instead).
 *
 * Each entry gives the action's `name`, a human-readable `description`, and
 * its ordered `args` (argument name plus a type tag). The list is declared
 * `as const` so the literal names and argument tuples are preserved in the
 * types derived from it (`AvailableAction`, `ActionPayload`).
 */
export const availableActions = [
{
name: "click",
description: "Clicks on an element",
args: [
{
name: "elementId",
type: "string",
},
],
},
{
name: "setValue",
description: "Focuses on and sets the value of an input element",
args: [
{
name: "elementId",
type: "string",
},
{
name: "value",
type: "string",
},
],
},
{
name: "scroll",
description: 'Scroll the page up or down. Value can be "up" or "down"',
args: [
{
name: "value",
type: "string",
},
],
},
{
// Takes no arguments.
name: "finish",
description: "Indicates the task is finished",
args: [],
},
{
// Takes no arguments.
name: "fail",
description: "Indicates that you are unable to complete the task",
args: [],
},
] as const;

// Union of the entry types of `availableActionsVision` (label-based actions).
type AvailableActionVision = (typeof availableActionsVision)[number];
// Union of the entry types of `availableActions` (elementId-based actions).
type AvailableAction = (typeof availableActions)[number];

type ArgsToObject<T extends ReadonlyArray<{ name: string; type: string }>> = {
[K in T[number]['name']]: Extract<
[K in T[number]["name"]]: Extract<
T[number],
{ name: K }
>['type'] extends 'number'
>["type"] extends "number"
? number
: string;
};
Expand All @@ -63,14 +112,20 @@ export type ActionShape<
T extends {
name: string;
args: ReadonlyArray<{ name: string; type: string }>;
}
},
> = {
name: T['name'];
args: ArgsToObject<T['args']>;
name: T["name"];
args: ArgsToObject<T["args"]>;
};

/**
 * Union of the parsed action payloads for the label-based (vision) action
 * table: one `{ name, args }` shape per entry of `AvailableActionVision`.
 *
 * The mapped type must iterate over and extract from `AvailableActionVision`;
 * deriving it from `AvailableAction` would silently yield the
 * `elementId`-based argument shapes instead of the `label`-based ones.
 */
export type ActionPayloadVision = {
  [K in AvailableActionVision["name"]]: ActionShape<
    Extract<AvailableActionVision, { name: K }>
  >;
}[AvailableActionVision["name"]];

export type ActionPayload = {
[K in AvailableAction['name']]: ActionShape<
[K in AvailableAction["name"]]: ActionShape<
Extract<AvailableAction, { name: K }>
>;
}[AvailableAction['name']];
}[AvailableAction["name"]];
81 changes: 51 additions & 30 deletions src/helpers/determineNextAction.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,26 @@
import OpenAI from 'openai';
import { useAppState } from '../state/store';
import { availableActions } from './availableActions';
import { ParsedResponseSuccess } from './parseResponse';
import OpenAI from "openai";
import { useAppState } from "../state/store";
import { availableActions, availableActionsVision } from "./availableActions";
import { ParsedResponseSuccess } from "./parseResponse";

const formattedActions = availableActions
.map((action, i) => {
const args = action.args
.map((arg) => `${arg.name}: ${arg.type}`)
.join(', ');
.join(", ");
return `${i + 1}. ${action.name}(${args}): ${action.description}`;
})
.join('\n');
.join("\n");

// TODO: remove this after refactoring availableActions file.
// Numbered, newline-separated listing of the label-based actions, one per
// line in the form `N. name(arg: type, ...): description`; it is interpolated
// into the vision system prompt.
const formattedActionsVision = availableActionsVision
  .map((tool, index) => {
    const position = index + 1;
    const signature = tool.args
      .map((param) => [param.name, param.type].join(": "))
      .join(", ");
    return (
      position + ". " + tool.name + "(" + signature + "): " + tool.description
    );
  })
  .join("\n");

const systemMessage = `
You are a browser automation assistant.
Expand All @@ -21,21 +31,30 @@ ${formattedActions}

You will be be given a task to perform and the current state of the DOM. You will also be given previous actions that you have taken. You may retry a failed action up to one time.

This is an example of an action:
There are two examples of actions:

Example 1:
{
thought: "I should click the add to cart button",
action: "click(223)"
}

Your response must always be in JSON format and must include "thought" and "action"`;
Example 2:
{
thought: "I should continue to scroll down to find the section",
action: "scroll("down")"
}

Your response must always be in JSON format and must include "thought" and "action".
When finish, use "finish()" in "action" and include a brief summary of the task in "thought".
`;

const visionSystemMessage = `
You are a browser automation assistant.

You can use the following tools:

${formattedActions}
${formattedActionsVision}

Pleaes note for setValue, you can press "enter" by including a line break (\`\\n\`) in the parameter to trigger form submitting.

Expand Down Expand Up @@ -63,11 +82,11 @@ export async function determineNextActionWithVision(
previousActions: ParsedResponseSuccess[],
screenshotData: string,
maxAttempts = 3,
notifyError?: (error: string) => void
notifyError?: (error: string) => void,
): Promise<NextAction> {
const key = useAppState.getState().settings.openAIKey;
if (!key) {
notifyError?.('No OpenAI key found');
notifyError?.("No OpenAI key found");
return null;
}
const model = useAppState.getState().settings.selectedModel;
Expand All @@ -87,18 +106,18 @@ export async function determineNextActionWithVision(
// },
messages: [
{
role: 'system',
role: "system",
content: visionSystemMessage,
},
{
role: 'user',
role: "user",
content: [
{
type: 'text',
type: "text",
text: prompt,
},
{
type: 'image_url',
type: "image_url",
image_url: {
// detail: 'low',
url: screenshotData, // this is already base64 encoded
Expand All @@ -114,13 +133,14 @@ export async function determineNextActionWithVision(
return {
usage: completion.usage,
prompt,
response: completion.choices[0].message?.content?.trim() || '',
response: completion.choices[0].message?.content?.trim() || "",
};
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (error: any) {
// TODO: need to verify the new API error format
console.error('determineNextAction error:');
console.error("determineNextAction error:");
console.error(error);
if (error.includes('server error')) {
if (error.includes("server error")) {
// Problem with the OpenAI API, try again
if (notifyError) {
notifyError(error);
Expand All @@ -132,7 +152,7 @@ export async function determineNextActionWithVision(
}
}
throw new Error(
`Failed to complete query after ${maxAttempts} attempts. Please try again later.`
`Failed to complete query after ${maxAttempts} attempts. Please try again later.`,
);
}

Expand All @@ -141,11 +161,11 @@ export async function determineNextAction(
previousActions: ParsedResponseSuccess[],
simplifiedDOM: string,
maxAttempts = 3,
notifyError?: (error: string) => void
notifyError?: (error: string) => void,
): Promise<NextAction> {
const key = useAppState.getState().settings.openAIKey;
if (!key) {
notifyError?.('No OpenAI key found');
notifyError?.("No OpenAI key found");
return null;
}
const model = useAppState.getState().settings.selectedModel;
Expand All @@ -165,10 +185,10 @@ export async function determineNextAction(
// },
messages: [
{
role: 'system',
role: "system",
content: systemMessage,
},
{ role: 'user', content: prompt },
{ role: "user", content: prompt },
],
max_tokens: 1000,
temperature: 0,
Expand All @@ -177,13 +197,14 @@ export async function determineNextAction(
return {
usage: completion.usage,
prompt,
response: completion.choices[0].message?.content?.trim() || '',
response: completion.choices[0].message?.content?.trim() || "",
};
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (error: any) {
// TODO: need to verify the new API error format
console.error('determineNextAction error:');
console.error("determineNextAction error:");
console.error(error);
if (error.includes('server error')) {
if (error.includes("server error")) {
// Problem with the OpenAI API, try again
if (notifyError) {
notifyError(error);
Expand All @@ -195,21 +216,21 @@ export async function determineNextAction(
}
}
throw new Error(
`Failed to complete query after ${maxAttempts} attempts. Please try again later.`
`Failed to complete query after ${maxAttempts} attempts. Please try again later.`,
);
}

export function formatPrompt(
taskInstructions: string,
previousActions: ParsedResponseSuccess[],
pageContents?: string
pageContents?: string,
) {
let previousActionsString = '';
let previousActionsString = "";

if (previousActions.length > 0) {
const serializedActions = previousActions
.map((action) => `Thought: ${action.thought}\nAction:${action.action}`)
.join('\n\n');
.join("\n\n");
previousActionsString = `You have already taken the following actions: \n${serializedActions}\n\n`;
}

Expand Down
Loading
Loading