Get Started
Examples
Concepts
Resources
Projects
Integrations
copy markdown
Detect and identify objects from an input image. It locates bounding boxes, classifies objects, and returns structured metadata describing each detected entity.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
// Builds the four corner-coordinate fields shared by every detected item.
// A function (rather than one shared object) so each item schema gets its own
// fresh z.number() instances, exactly as when the fields are written out inline.
const boundingBoxFields = () => ({
  top_left_x: z.number(),
  top_left_y: z.number(),
  bottom_right_x: z.number(),
  bottom_right_y: z.number(),
});

// One detected object: a described name plus its bounding box.
const detectedObject = z.object({
  name: z.string().describe("describe the object in the image"),
  ...boundingBoxFields(),
});

// One piece of text found in the image plus its bounding box.
const detectedText = z.object({
  text: z.string(),
  ...boundingBoxFields(),
});

/**
 * Structured output requested from the model: a list of detected objects and a
 * list of detected texts, each carrying top-left / bottom-right coordinates.
 */
const DetectionSchema = z.object({
  objects: z.array(detectedObject),
  texts: z.array(detectedText).describe("any alphabetic characters text in the image"),
});
// The user turn is made of two parts: the instruction and the image to inspect.
const textPart = {
  type: "text",
  text: "Get the position of the crane in the image and any text",
};
const imagePart = {
  type: "image_url",
  image_url: {
    url: "https://r2public.jigsawstack.com/interfaze/examples/construction.png",
  },
};

// Send a single user message and request output shaped by DetectionSchema.
const response = await interfaze.chat.completions.create({
  model: "interfaze-beta",
  messages: [{ role: "user", content: [textPart, imagePart] }],
  response_format: zodResponseFormat(DetectionSchema, "detection_schema"),
});

// Content of the first choice's message.
console.log(response.choices[0].message.content);

// `precontext` is an extra field on the response that the SDK typings do not declare.
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("Object Detection Results:", precontext[0]?.result);
Bounding boxes mapped to the image

JSON output
"object" contains the extracted information defined in the schema. "precontext" contains the raw metadata, such as bounding boxes and confidence scores.
{
"object": {
"objects": [
{
"name": "crane",
"top_left_x": 630,
"top_left_y": 139,
"bottom_right_x": 769,
"bottom_right_y": 225
}
],
"texts": [
{
"text": "09-12-2020 Sat 21:26:51",
"top_left_x": 218,
"top_left_y": 88,
"bottom_right_x": 623,
"bottom_right_y": 117
},
{
"text": "WRW tower",
"top_left_x": 1069,
"top_left_y": 720,
"bottom_right_x": 1230,
"bottom_right_y": 751
}
]
},
"response": {
"id": "interfaze-1775001605439",
"modelId": "interfaze-beta",
"body": {
"id": "interfaze-1775001605439",
"object": "chat.completion",
"model": "interfaze-beta",
"usage": {
"prompt_tokens": 4903,
"completion_tokens": 8566,
"total_tokens": 13469
},
"precontext": [
{
"name": "object_detection",
"result": {
"detected_objects": [
{
"bounds": {
"top_left": {
"x": 630,
"y": 139
},
"top_right": {
"x": 769,
"y": 139
},
"bottom_left": {
"x": 630,
"y": 225
},
"bottom_right": {
"x": 769,
"y": 225
},
"width": 139,
"height": 86
},
"label": "crane"
}
],
"gui_elements": [
{
"type": "text",
"bounds": {
"top_left": {
"x": 1140,
"y": 722
},
"top_right": {
"x": 1232,
"y": 722
},
"bottom_left": {
"x": 1140,
"y": 752
},
"bottom_right": {
"x": 1232,
"y": 752
},
"width": 92,
"height": 30
},
"interactivity": false,
"content": "tower"
}
]
}
},
{
"name": "ocr",
"result": {
"extracted_text": "09-12-2020 Sat 21:26:51\nWRW tower",
"sections": [
{
"text": "09-12-2020 Sat 21:26:51\nWRW tower",
"lines": [
{
"text": "09-12-2020 Sat 21:26:51",
"bounds": {
"top_left": {
"x": 218,
"y": 88
},
"top_right": {
"x": 623,
"y": 87
},
"bottom_right": {
"x": 623,
"y": 117
},
"bottom_left": {
"x": 218,
"y": 118
},
"width": 405,
"height": 30
},
"average_confidence": 0.99,
"words": [
{
"text": "09-12-2020",
"bounds": {
"top_left": {
"x": 219,
"y": 90
},
"top_right": {
"x": 392,
"y": 88
},
"bottom_right": {
"x": 391,
"y": 117
},
"bottom_left": {
"x": 219,
"y": 117
},
"width": 172.5,
"height": 28
},
"confidence": 0.99
},
{
"text": "Sat",
"bounds": {
"top_left": {
"x": 413,
"y": 88
},
"top_right": {
"x": 464,
"y": 88
},
"bottom_right": {
"x": 463,
"y": 117
},
"bottom_left": {
"x": 413,
"y": 117
},
"width": 50.5,
"height": 29
},
"confidence": 1
},
{
"text": "21:26:51",
"bounds": {
"top_left": {
"x": 483,
"y": 88
},
"top_right": {
"x": 622,
"y": 87
},
"bottom_right": {
"x": 622,
"y": 118
},
"bottom_left": {
"x": 483,
"y": 117
},
"width": 139,
"height": 30
},
"confidence": 0.99
}
]
},
{
"text": "WRW tower",
"bounds": {
"top_left": {
"x": 1069,
"y": 720
},
"top_right": {
"x": 1230,
"y": 721
},
"bottom_right": {
"x": 1230,
"y": 751
},
"bottom_left": {
"x": 1069,
"y": 750
},
"width": 161,
"height": 30
},
"average_confidence": 0.99,
"words": [
{
"text": "WRW",
"bounds": {
"top_left": {
"x": 1070,
"y": 721
},
"top_right": {
"x": 1120,
"y": 721
},
"bottom_right": {
"x": 1120,
"y": 750
},
"bottom_left": {
"x": 1070,
"y": 751
},
"width": 50,
"height": 29.5
},
"confidence": 0.99
},
{
"text": "tower",
"bounds": {
"top_left": {
"x": 1145,
"y": 722
},
"top_right": {
"x": 1227,
"y": 723
},
"bottom_right": {
"x": 1227,
"y": 750
},
"bottom_left": {
"x": 1145,
"y": 750
},
"width": 82,
"height": 27.5
},
"confidence": 0.99
}
]
}
]
}
],
"width": 1400,
"height": 789
}
}
]
}
},
"finishReason": "stop",
"usage": {
"inputTokens": 4903,
"outputTokens": 8566,
"totalTokens": 13469
}
}
Running object detection as a single task, by putting <task>object_detection</task> in the system message, is cheaper and faster, and returns a fixed, pre-defined structured output.
Learn more about running a task.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
// Single-task mode: the system message carries only the task tag.
const systemMessage = {
  role: "system",
  content: "<task>object_detection</task>",
};

// The user turn pairs the instruction with the image to inspect.
const userMessage = {
  role: "user",
  content: [
    { type: "text", text: "Get the position of the crane in the image and any text" },
    {
      type: "image_url",
      image_url: {
        url: "https://r2public.jigsawstack.com/interfaze/examples/construction.png",
      },
    },
  ],
};

// z.any() is passed as a placeholder schema; no custom output shape is defined here.
const response = await interfaze.chat.completions.create({
  model: "interfaze-beta",
  messages: [systemMessage, userMessage],
  response_format: zodResponseFormat(z.any(), "empty_schema"),
});
console.log(response.choices[0].message.content);
JSON output
{
"object": {
"name": "object_detection",
"result": {
"detected_objects": [
{
"bounds": {
"top_left": { "x": 630, "y": 139 },
"top_right": { "x": 769, "y": 139 },
"bottom_left": { "x": 630, "y": 225 },
"bottom_right": { "x": 769, "y": 225 },
"width": 139,
"height": 86
},
"label": "crane"
}
],
"gui_elements": [
{
"type": "text",
"bounds": {
"top_left": { "x": 1140, "y": 722 },
"top_right": { "x": 1232, "y": 722 },
"bottom_left": { "x": 1140, "y": 752 },
"bottom_right": { "x": 1232, "y": 752 },
"width": 92,
"height": 30
},
"interactivity": false,
"content": "tower"
}
]
}
}
}