Get Started
Examples
Concepts
Resources
Projects
Integrations
API Reference
copy markdown
Extract text and bounds with confidence scores from dense images and large documents including handwritten text, printed documents, screenshots, and other visual content.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const IDSchema = z.object({
first_name: z.string().describe("First name on the ID"),
last_name: z.string().describe("Last name on the ID"),
dob: z.string().describe("Date of birth on the ID"),
driver_licence_number: z.string().describe("Driver licence number on the ID"),
});
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "user",
content: [
{ type: "text", text: "Extract the details from this ID" },
{
type: "image_url",
image_url: {
url: "https://r2public.jigsawstack.com/interfaze/examples/id.jpg",
},
},
],
},
],
response_format: zodResponseFormat(IDSchema, "id_schema"),
});
console.log(response.choices[0].message.content);
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("OCR Results:", precontext[0]?.result);Bounding boxes mapped to the image
JSON output
object contains the extracted information defined in the schema. precontext contains the raw metadata such as bounding boxes and confidence scores.
Document: https://arxiv.org/pdf/2602.04101
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const ResearchPaperSchema = z.object({
title: z.string().describe("Title of the document"),
summary: z.string().describe("Summary of the document"),
formulas: z
.array(
z.object({
formula: z.string().describe("Formula in the document"),
bounds: z.object({
top_left_x: z.number(),
top_left_y: z.number(),
bottom_right_x: z.number(),
bottom_right_y: z.number(),
}),
})
)
.describe("Formulas in the document"),
});
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "user",
content: [
{ type: "text", text: "Extract the text and information from the document based on the schema." },
{
type: "file",
file: {
filename: "research-paper.pdf",
file_data: "https://arxiv.org/pdf/2602.04101",
},
},
],
},
],
response_format: zodResponseFormat(ResearchPaperSchema, "research_paper_schema"),
});
console.log(response.choices[0].message.content);
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("OCR Results:", precontext[0]?.result);JSON output
{
"object": {
"title": "Interfaze: The Future of AI is built on Task-Specific Small Models",
"summary": "The paper introduces Interfaze, a system designed for modern LLM applications that focuses on building and acting over context rather than relying solely on monolithic models. Interfaze combines a heterogeneous stack of DNNs and small language models (SLMs) for perception and classification across various modalities (object detection, OCR, speech-to-text, text, and image classification). It includes a context-construction layer to crawl, index, and parse external sources (web pages, code, PDFs, diagrams) into a compact structured state, and an action layer with a thin controller. This controller selects which tools to run, compiles distilled context, and feeds it to a configured LLM for the final response.\n\nInterfaze-Beta, the concrete instantiation, achieves competitive or state-of-the-art results on challenging benchmarks by offloading the bulk of computation to smaller, task-specific models and tools. The system demonstrates strong performance across knowledge, reasoning, code, and multimodal tasks, with results like 83.6% on MMLU-Pro, 91.4% on MMLU, 81.3% on GPQA-Diamond, 57.8% on LiveCodeBench v5, 90.0% on AIME-2025, 77.3% on MMMU (val), 91.5% on AI2D, 90.9% on ChartQA, and 90.8% on Common Voice v16. The paper highlights that gains primarily come from the small-model and tool stack and context compilation, rather than relying on a single large general-purpose model. Limitations identified include delays and over-building of context, which are targets for future work.",
"formulas": [
{
"formula": "Χ(τ,ω) = Σα[η] [n – τ] e-jwn,",
"bounds": {
"top_left_x": 204,
"top_left_y": 1304,
"bottom_right_x": 500,
"bottom_right_y": 1332
}
},
{
"formula": "2f,r = log ((M|X(τ,·)|²) f + є)",
"bounds": {
"top_left_x": 215,
"top_left_y": 1445,
"bottom_right_x": 490,
"bottom_right_y": 1472
}
},
{
"formula": "P¢(Yt | Y<t,h1:T) = softmax(Wot + b)",
"bounds": {
"top_left_x": 698,
"top_left_y": 727,
"bottom_right_x": 1040,
"bottom_right_y": 753
}
},
{
"formula": "Φυ: RH×W×3 → RD×K",
"bounds": {
"top_left_x": 248,
"top_left_y": 663,
"bottom_right_x": 455,
"bottom_right_y": 688
}
},
{
"formula": "Ot : P → RD",
"bounds": {
"top_left_x": 296,
"top_left_y": 738,
"bottom_right_x": 407,
"bottom_right_y": 761
}
},
{
"formula": "Sk = σ ´φυ(x)φt(p) T k = 1,...,K,",
"bounds": {
"top_left_x": 179,
"top_left_y": 851,
"bottom_right_x": 300,
"bottom_right_y": 870
}
},
{
"formula": "B(p,x) = {bi = (Xmin, Ymin, Xmax, Ymax)}(i) No.",
"bounds": {
"top_left_x": 156,
"top_left_y": 1066,
"bottom_right_x": 554,
"bottom_right_y": 1099
}
},
{
"formula": "m₁ = S(x, bi) ∈ {0, 1}H×W",
"bounds": {
"top_left_x": 231,
"top_left_y": 1314,
"bottom_right_x": 473,
"bottom_right_y": 1338
}
}
]
},
"finishReason": "stop",
"usage": {
"inputTokens": 1284611,
"outputTokens": 129082,
"totalTokens": 1413693
}
}The output is truncated for this example.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const HandwritingSchema = z.object({
text: z.string().describe("all text in the image"),
});
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "user",
content: [
{ type: "text", text: "Extract text from the image based on the schema and correct the text if it is not correct based on the image." },
{
type: "image_url",
image_url: {
url: "https://r2public.jigsawstack.com/interfaze/examples/handwriting.jpeg",
},
},
],
},
],
response_format: zodResponseFormat(HandwritingSchema, "handwriting_schema"),
});
console.log(response.choices[0].message.content);
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("OCR Results:", precontext[0]?.result);JSON output
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const MultilingualSchema = z.object({
text: z
.string()
.describe("all text in the image without any translations. Return in native language of the image"),
english_text: z.string(),
other_language_text: z.string(),
languages_detected: z.array(z.string()).describe("iso languages detected in the image"),
});
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "user",
content: [
{ type: "text", text: "Extract text from the image based on the schema." },
{
type: "image_url",
image_url: {
url: "https://r2public.jigsawstack.com/interfaze/examples/multilingual.jpeg",
},
},
],
},
],
response_format: zodResponseFormat(MultilingualSchema, "multilingual_schema"),
});
console.log(response.choices[0].message.content);
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("OCR Results:", precontext[0]?.result);JSON output
Run OCR and Object detection on the same image in one request.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const OCRObjectDetectionSchema = z.object({
text: z.string().describe("all text in the image"),
graphic_objects: z
.array(
z.object({
description: z.string(),
top_left_x: z.number(),
top_left_y: z.number(),
bottom_right_x: z.number(),
bottom_right_y: z.number(),
})
)
.describe("graphics objects found in the image"),
});
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "user",
content: [
{ type: "text", text: "Extract the text and graphics from the image based on the schema." },
{
type: "image_url",
image_url: {
url: "https://r2public.jigsawstack.com/interfaze/examples/dense_text_ocr_figures.png",
},
},
],
},
],
response_format: zodResponseFormat(OCRObjectDetectionSchema, "ocr_object_detection_schema"),
});
console.log(response.choices[0].message.content);
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("OCR Results:", precontext[0]?.result);JSON output
Running OCR as a tasks with <task>ocr</task> in the system message make it cheaper and faster with a fixed structured output that's pre-defined.
Learn more about running a task.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "system",
content: "<task>ocr</task>",
},
{
role: "user",
content: [
{ type: "text", text: "Extract all text from this ID" },
{
type: "image_url",
image_url: {
url: "https://r2public.jigsawstack.com/interfaze/examples/id.jpg",
},
},
],
},
],
response_format: zodResponseFormat(z.any(), "empty_schema"),
});
console.log(response.choices[0].message.content);JSON output