Interfaze

Beta

pricing

help

docs

blog

Get Started

Introduction

Examples

Vision

OCR (Image & Document)

Object Detection

GUI Detection

Web

Scraping

Audio

Speech-to-Text (STT)

Speaker Diarization

Translation

Code Sandboxing

Guardrails

Concepts

Precontext

Run Tasks

Structured Outputs

Reasoning

Streaming

Function Calling

Handling Files

Resources

Lowering costs & improving speed

Limits

Security

Supported Languages

FAQs

Projects

Interfaze as tools

Postgres LLM

Integrations

OpenAI SDK

Vercel AI SDK

Langchain SDK

n8n Integration

API Reference

Chat Completion API

Object Detection

copy markdown

Detect and identify objects in an input image. It locates bounding boxes, classifies objects, and returns structured metadata describing each detected entity.

Accurate bounding boxes and segmentation masks for each object
Support for a wide range of industries: construction, retail, healthcare, biology, and more
Computer GUI detection

Real-time object detection of an image

OpenAI SDK

Vercel AI SDK

LangChain SDK

import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";

// Shape of the structured output requested from the model: the objects found
// in the image plus any text, each with the corner coordinates of its box.
const DetectionSchema = z.object({
	// One entry per detected object.
	objects: z.array(
		z.object({
			name: z.string().describe("describe the object in the image"),
			// Box expressed as its top-left and bottom-right corners.
			// NOTE(review): the sample output suggests image pixel units — confirm.
			top_left_x: z.number(),
			top_left_y: z.number(),
			bottom_right_x: z.number(),
			bottom_right_y: z.number(),
		})
	),
	// One entry per piece of text found in the image, using the same corner layout.
	texts: z
		.array(
			z.object({
				text: z.string(),
				top_left_x: z.number(),
				top_left_y: z.number(),
				bottom_right_x: z.number(),
				bottom_right_y: z.number(),
			})
		)
		.describe("any alphabetic characters text in the image"),
});

// Send the instruction and the image together in a single user message and
// request output shaped by DetectionSchema.
const response = await interfaze.chat.completions.create({
	model: "interfaze-beta",
	messages: [
		{
			role: "user",
			content: [
				{ type: "text", text: "Get the position of the crane in the image and any text" },
				{
					type: "image_url",
					image_url: {
						url: "https://r2public.jigsawstack.com/interfaze/examples/construction.png",
					},
				},
			],
		},
	],
	response_format: zodResponseFormat(DetectionSchema, "detection_schema"),
});

// The model's answer for the first choice.
console.log(response.choices[0].message.content);

// precontext is an extra field on the response that the SDK types do not declare.
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
// Guard the array itself as well as its first entry, so a response that carries
// no precontext logs undefined instead of throwing a TypeError.
console.log("Object Detection Results:", precontext?.[0]?.result);

Bounding boxes mapped to the image

JSON output

`object` contains the extracted information defined in the schema, while `precontext` contains the raw metadata, such as bounding boxes and confidence scores.

{
  "object": {
    "objects": [
      {
        "name": "crane",
        "top_left_x": 630,
        "top_left_y": 139,
        "bottom_right_x": 769,
        "bottom_right_y": 225
      }
    ],
    "texts": [
      {
        "text": "09-12-2020 Sat 21:26:51",
        "top_left_x": 218,
        "top_left_y": 88,
        "bottom_right_x": 623,
        "bottom_right_y": 117
      },
      {
        "text": "WRW tower",
        "top_left_x": 1069,
        "top_left_y": 720,
        "bottom_right_x": 1230,
        "bottom_right_y": 751
      }
    ]
  },
  "response": {
    "id": "interfaze-1775001605439",
    "modelId": "interfaze-beta",
    "body": {
      "id": "interfaze-1775001605439",
      "object": "chat.completion",
      "model": "interfaze-beta",
      "usage": {
        "prompt_tokens": 4903,
        "completion_tokens": 8566,
        "total_tokens": 13469
      },
      "precontext": [
        {
          "name": "object_detection",
          "result": {
            "detected_objects": [
              {
                "bounds": {
                  "top_left": {
                    "x": 630,
                    "y": 139
                  },
                  "top_right": {
                    "x": 769,
                    "y": 139
                  },
                  "bottom_left": {
                    "x": 630,
                    "y": 225
                  },
                  "bottom_right": {
                    "x": 769,
                    "y": 225
                  },
                  "width": 139,
                  "height": 86
                },
                "label": "crane"
              }
            ],
            "gui_elements": [
              {
                "type": "text",
                "bounds": {
                  "top_left": {
                    "x": 1140,
                    "y": 722
                  },
                  "top_right": {
                    "x": 1232,
                    "y": 722
                  },
                  "bottom_left": {
                    "x": 1140,
                    "y": 752
                  },
                  "bottom_right": {
                    "x": 1232,
                    "y": 752
                  },
                  "width": 92,
                  "height": 30
                },
                "interactivity": false,
                "content": "tower"
              }
            ]
          }
        },
        {
          "name": "ocr",
          "result": {
            "extracted_text": "09-12-2020 Sat 21:26:51\nWRW tower",
            "sections": [
              {
                "text": "09-12-2020 Sat 21:26:51\nWRW tower",
                "lines": [
                  {
                    "text": "09-12-2020 Sat 21:26:51",
                    "bounds": {
                      "top_left": {
                        "x": 218,
                        "y": 88
                      },
                      "top_right": {
                        "x": 623,
                        "y": 87
                      },
                      "bottom_right": {
                        "x": 623,
                        "y": 117
                      },
                      "bottom_left": {
                        "x": 218,
                        "y": 118
                      },
                      "width": 405,
                      "height": 30
                    },
                    "average_confidence": 0.99,
                    "words": [
                      {
                        "text": "09-12-2020",
                        "bounds": {
                          "top_left": {
                            "x": 219,
                            "y": 90
                          },
                          "top_right": {
                            "x": 392,
                            "y": 88
                          },
                          "bottom_right": {
                            "x": 391,
                            "y": 117
                          },
                          "bottom_left": {
                            "x": 219,
                            "y": 117
                          },
                          "width": 172.5,
                          "height": 28
                        },
                        "confidence": 0.99
                      },
                      {
                        "text": "Sat",
                        "bounds": {
                          "top_left": {
                            "x": 413,
                            "y": 88
                          },
                          "top_right": {
                            "x": 464,
                            "y": 88
                          },
                          "bottom_right": {
                            "x": 463,
                            "y": 117
                          },
                          "bottom_left": {
                            "x": 413,
                            "y": 117
                          },
                          "width": 50.5,
                          "height": 29
                        },
                        "confidence": 1
                      },
                      {
                        "text": "21:26:51",
                        "bounds": {
                          "top_left": {
                            "x": 483,
                            "y": 88
                          },
                          "top_right": {
                            "x": 622,
                            "y": 87
                          },
                          "bottom_right": {
                            "x": 622,
                            "y": 118
                          },
                          "bottom_left": {
                            "x": 483,
                            "y": 117
                          },
                          "width": 139,
                          "height": 30
                        },
                        "confidence": 0.99
                      }
                    ]
                  },
                  {
                    "text": "WRW tower",
                    "bounds": {
                      "top_left": {
                        "x": 1069,
                        "y": 720
                      },
                      "top_right": {
                        "x": 1230,
                        "y": 721
                      },
                      "bottom_right": {
                        "x": 1230,
                        "y": 751
                      },
                      "bottom_left": {
                        "x": 1069,
                        "y": 750
                      },
                      "width": 161,
                      "height": 30
                    },
                    "average_confidence": 0.99,
                    "words": [
                      {
                        "text": "WRW",
                        "bounds": {
                          "top_left": {
                            "x": 1070,
                            "y": 721
                          },
                          "top_right": {
                            "x": 1120,
                            "y": 721
                          },
                          "bottom_right": {
                            "x": 1120,
                            "y": 750
                          },
                          "bottom_left": {
                            "x": 1070,
                            "y": 751
                          },
                          "width": 50,
                          "height": 29.5
                        },
                        "confidence": 0.99
                      },
                      {
                        "text": "tower",
                        "bounds": {
                          "top_left": {
                            "x": 1145,
                            "y": 722
                          },
                          "top_right": {
                            "x": 1227,
                            "y": 723
                          },
                          "bottom_right": {
                            "x": 1227,
                            "y": 750
                          },
                          "bottom_left": {
                            "x": 1145,
                            "y": 750
                          },
                          "width": 82,
                          "height": 27.5
                        },
                        "confidence": 0.99
                      }
                    ]
                  }
                ]
              }
            ],
            "width": 1400,
            "height": 789
          }
        }
      ]
    }
  },
  "finishReason": "stop",
  "usage": {
    "inputTokens": 4903,
    "outputTokens": 8566,
    "totalTokens": 13469
  }
}

Run object detection task with raw output

Running object detection as a single task, by putting `<task>object_detection</task>` in the system message, makes it cheaper and faster and returns a fixed, pre-defined structured output.

Learn more about running a task.

OpenAI SDK

Vercel AI SDK

LangChain SDK

import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";

// The system message carries the <task> tag that runs object detection as a
// single task.
const systemMessage = {
	role: "system",
	content: "<task>object_detection</task>",
};

// The user message pairs the instruction with the image to analyse.
const userMessage = {
	role: "user",
	content: [
		{ type: "text", text: "Get the position of the crane in the image and any text" },
		{
			type: "image_url",
			image_url: {
				url: "https://r2public.jigsawstack.com/interfaze/examples/construction.png",
			},
		},
	],
};

// z.any() is passed as the schema under the name "empty_schema".
// NOTE(review): presumably a placeholder because the task's output shape is
// pre-defined — confirm.
const response = await interfaze.chat.completions.create({
	model: "interfaze-beta",
	messages: [systemMessage, userMessage],
	response_format: zodResponseFormat(z.any(), "empty_schema"),
});

// Print the answer from the first choice.
const { content } = response.choices[0].message;
console.log(content);

JSON output

{
  "object": {
    "name": "object_detection",
    "result": {
      "detected_objects": [
        {
          "bounds": {
            "top_left": { "x": 630, "y": 139 },
            "top_right": { "x": 769, "y": 139 },
            "bottom_left": { "x": 630, "y": 225 },
            "bottom_right": { "x": 769, "y": 225 },
            "width": 139,
            "height": 86
          },
          "label": "crane"
        }
      ],
      "gui_elements": [
        {
          "type": "text",
          "bounds": {
            "top_left": { "x": 1140, "y": 722 },
            "top_right": { "x": 1232, "y": 722 },
            "bottom_left": { "x": 1140, "y": 752 },
            "bottom_right": { "x": 1232, "y": 752 },
            "width": 92,
            "height": 30
          },
          "interactivity": false,
          "content": "tower"
        }
      ]
    }
  }
}

OCR (Image & Document)

GUI Detection