Get Started
Examples
Concepts
Resources
Projects
Integrations
copy markdown
Accurately detect buttons, menus, input fields, and other UI elements on a screen to perform computer use tasks like browsing the web, clicking buttons, filling forms, and navigating applications.

OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
// One detected UI element: its kind, visible label, the two corners of its
// bounding box, and whether it can be clicked or typed into.
const UiElementSchema = z.object({
  type: z.string().describe("type of UI element, e.g. button, link, input, dropdown, text, image"),
  label: z.string().describe("the visible text or label of the element"),
  top_left_x: z.number(),
  top_left_y: z.number(),
  bottom_right_x: z.number(),
  bottom_right_y: z.number(),
  interactive: z.boolean().describe("whether the element can be clicked or typed into"),
});

// Structured output requested from the model: a list of detected elements.
const ComputerUseSchema = z.object({ elements: z.array(UiElementSchema) });

// Screenshot to analyze.
const computerUseImageUrl = "https://r2public.jigsawstack.com/interfaze/examples/computer_use.jpg";

// Send the prompt and the screenshot in a single user message and ask for a
// reply that conforms to ComputerUseSchema.
const response = await interfaze.chat.completions.create({
  model: "interfaze-beta",
  messages: [
    {
      role: "user",
      content: [
        { type: "text", text: "Detect all interactive UI elements on this screen" },
        { type: "image_url", image_url: { url: computerUseImageUrl } },
      ],
    },
  ],
  response_format: zodResponseFormat(ComputerUseSchema, "computer_use_schema"),
});

// Schema-shaped answer produced by the model.
console.log(response.choices[0].message.content);

// `precontext` is an extra field on the response that the SDK typings do not
// declare, hence the suppression directive on the next line.
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("GUI Elements:", precontext[0]?.result?.gui_elements);
JSON output
`object` contains the extracted information defined in the schema, including each element's `interactive` flag. `precontext` contains the raw GUI element metadata (element type and bounding boxes).
{
"object": {
"elements": [
{
"type": "button",
"label": "Sign In",
"top_left_x": 1120,
"top_left_y": 18,
"bottom_right_x": 1192,
"bottom_right_y": 44,
"interactive": true
},
{
"type": "input",
"label": "Search",
"top_left_x": 312,
"top_left_y": 12,
"bottom_right_x": 692,
"bottom_right_y": 42,
"interactive": true
},
{
"type": "link",
"label": "Home",
"top_left_x": 72,
"top_left_y": 64,
"bottom_right_x": 116,
"bottom_right_y": 88,
"interactive": true
}
]
},
"response": {
"id": "interfaze-1775001605440",
"object": "chat.completion",
"model": "interfaze-beta",
"usage": {
"prompt_tokens": 3201,
"completion_tokens": 4812,
"total_tokens": 8013
},
"precontext": [
{
"name": "gui_detection",
"result": {
"gui_elements": [
{
"type": "button",
"top_left_x": 1120,
"top_left_y": 18,
"bottom_right_x": 1192,
"bottom_right_y": 44
},
{
"type": "input",
"top_left_x": 312,
"top_left_y": 12,
"bottom_right_x": 692,
"bottom_right_y": 42
},
{
"type": "link",
"top_left_x": 72,
"top_left_y": 64,
"bottom_right_x": 116,
"bottom_right_y": 88
}
]
}
}
]
}
}
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
// One text input found in the form: its title and the two corners of its
// bounding box.
const TextInputSchema = z.object({
  text_input_title: z.string().describe("title of the text input"),
  top_left_x: z.number(),
  top_left_y: z.number(),
  bottom_right_x: z.number(),
  bottom_right_y: z.number(),
});

// Structured output requested from the model: a list of text inputs.
const FormSchema = z.object({ gui_elements: z.array(TextInputSchema) });

// Screenshot of the form to analyze.
const formImageUrl = "https://r2public.jigsawstack.com/interfaze/examples/GUI_form.png";

// Send the prompt and the form screenshot in a single user message and ask
// for a reply that conforms to FormSchema.
const response = await interfaze.chat.completions.create({
  model: "interfaze-beta",
  messages: [
    {
      role: "user",
      content: [
        { type: "text", text: "Text inputs to fill in the form" },
        { type: "image_url", image_url: { url: formImageUrl } },
      ],
    },
  ],
  response_format: zodResponseFormat(FormSchema, "form_schema"),
});

// Schema-shaped answer produced by the model.
console.log(response.choices[0].message.content);

// `precontext` is an extra field on the response that the SDK typings do not
// declare, hence the suppression directive on the next line.
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("GUI Elements:", precontext[0]?.result?.gui_elements);
JSON output
Generated bounding boxes for the mouse to click on.
{
"object": {
"gui_elements": [
{
"text_input_title": "LinkedIn",
"top_left_x": 1192,
"top_left_y": 258,
"bottom_right_x": 1399,
"bottom_right_y": 313
},
{
"text_input_title": "Github (Required only if it's a technical role)",
"top_left_x": 1194,
"top_left_y": 559,
"bottom_right_x": 1396,
"bottom_right_y": 612
},
{
"text_input_title": "Share all the tools, languages and services you're experienced with",
"top_left_x": 1192,
"top_left_y": 1039,
"bottom_right_x": 1398,
"bottom_right_y": 1096
},
{
"text_input_title": "X/Twitter",
"top_left_x": 1192,
"top_left_y": 1337,
"bottom_right_x": 1394,
"bottom_right_y": 1392
},
{
"text_input_title": "Anything else you would like us to know?",
"top_left_x": 1194,
"top_left_y": 1638,
"bottom_right_x": 1391,
"bottom_right_y": 1687
}
]
},
"response": {
"id": "interfaze-1776383791550",
"modelId": "interfaze-beta",
"body": {
"id": "interfaze-1776383791550",
"object": "chat.completion",
"created": 1776383791,
"model": "interfaze-beta",
"usage": {
"prompt_tokens": 4196,
"completion_tokens": 11987,
"total_tokens": 16183
},
"precontext": [
{
"name": "gui_detection",
"result": {
"annotated_image": "https://r2public.jigsawstack.com/interfaze/examples/form_computer_use.png",
"gui_elements": [
{
"type": "icon",
"bounds": {
"top_left": {
"x": 1194,
"y": 1638
},
"top_right": {
"x": 1391,
"y": 1638
},
"bottom_left": {
"x": 1194,
"y": 1687
},
"bottom_right": {
"x": 1391,
"y": 1687
},
"width": 197,
"height": 49
}
},
{
"type": "icon",
"bounds": {
"top_left": {
"x": 1192,
"y": 1039
},
"top_right": {
"x": 1398,
"y": 1039
},
"bottom_left": {
"x": 1192,
"y": 1096
},
"bottom_right": {
"x": 1398,
"y": 1096
},
"width": 206,
"height": 57
}
},
{
"type": "icon",
"bounds": {
"top_left": {
"x": 1194,
"y": 559
},
"top_right": {
"x": 1396,
"y": 559
},
"bottom_left": {
"x": 1194,
"y": 612
},
"bottom_right": {
"x": 1396,
"y": 612
},
"width": 202,
"height": 53
}
},
{
"type": "icon",
"bounds": {
"top_left": {
"x": 1192,
"y": 258
},
"top_right": {
"x": 1399,
"y": 258
},
"bottom_left": {
"x": 1192,
"y": 313
},
"bottom_right": {
"x": 1399,
"y": 313
},
"width": 207,
"height": 55
}
},
{
"type": "icon",
"bounds": {
"top_left": {
"x": 1192,
"y": 1337
},
"top_right": {
"x": 1394,
"y": 1337
},
"bottom_left": {
"x": 1192,
"y": 1392
},
"bottom_right": {
"x": 1394,
"y": 1392
},
"width": 202,
"height": 55
}
}
]
}
}
]
}
},
"finishReason": "stop",
"usage": {
"inputTokens": 4196,
"outputTokens": 11987,
"totalTokens": 16183
}
}
Running GUI detection as a single task with `<task>gui_detection</task>` in the system message makes it cheaper and faster, with a fixed structured output.
Learn more about running a task.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
// Screenshot to analyze.
const detectionImageUrl = "https://r2public.jigsawstack.com/interfaze/examples/computer_use.jpg";

// The system message carries the <task>gui_detection</task> tag so the request
// runs GUI detection as a single task; the user message supplies the prompt
// and the screenshot.
const response = await interfaze.chat.completions.create({
  model: "interfaze-beta",
  messages: [
    { role: "system", content: "<task>gui_detection</task>" },
    {
      role: "user",
      content: [
        { type: "text", text: "Detect all interactive UI elements on this screen" },
        { type: "image_url", image_url: { url: detectionImageUrl } },
      ],
    },
  ],
  // No custom schema is defined for this request: z.any() is passed under the
  // name "empty_schema".
  response_format: zodResponseFormat(z.any(), "empty_schema"),
});
console.log(response.choices[0].message.content);
JSON output
{
"object": {
"name": "gui_detection",
"result": {
"gui_elements": [
{
"type": "button",
"top_left_x": 1120,
"top_left_y": 18,
"bottom_right_x": 1192,
"bottom_right_y": 44
},
{
"type": "input",
"top_left_x": 312,
"top_left_y": 12,
"bottom_right_x": 692,
"bottom_right_y": 42
},
{
"type": "link",
"top_left_x": 72,
"top_left_y": 64,
"bottom_right_x": 116,
"bottom_right_y": 88
},
{
"type": "dropdown",
"top_left_x": 720,
"top_left_y": 64,
"bottom_right_x": 820,
"bottom_right_y": 90
}
]
}
}
}