Get Started
Examples
Concepts
Resources
Projects
Integrations
API Reference
copy markdown
Transcribe and diarize audio files of multiple speakers and languages at blazing fast speeds.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const STTSchema = z.object({
text: z.string(),
});
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "user",
content: [
{ type: "text", text: "Transcribe the audio file" },
{
type: "file",
file: {
filename: "stt_medical_short.mp4",
file_data: "https://r2public.jigsawstack.com/interfaze/examples/stt_medical_short.mp4",
},
},
],
},
],
response_format: zodResponseFormat(STTSchema, "stt_schema"),
});
console.log(response.choices[0].message.content);
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("STT Results:", precontext?.[0]?.result);
JSON output
{
"object": {
"text": "I just started a round of amoxicillin and I wanted to ask if it was safe to take that with my current spironolactone prescription."
},
"response": {
"id": "interfaze-1775087625228",
"modelId": "interfaze-beta",
"body": {
"id": "interfaze-1775087625228",
"object": "chat.completion",
"model": "interfaze-beta",
"usage": {
"prompt_tokens": 2539,
"completion_tokens": 4845,
"total_tokens": 7384
},
"precontext": [
{
"name": "stt",
"result": {
"text": "I just started a round of amoxicillin and I wanted to ask if it was safe to take that with my current spironolactone prescription.",
"chunks": [
{
"timestamp": [0.28, 4],
"text": "I just started a round of amoxicillin and I wanted to ask"
},
{
"timestamp": [4, 7.72],
"text": "if it was safe to take that with my current spironolactone prescription."
}
]
}
}
]
}
},
"finishReason": "stop",
"usage": {
"inputTokens": 2539,
"outputTokens": 4845,
"totalTokens": 7384
}
}
Running STT as a single task, by putting <task>speech_to_text</task> in the system message, is significantly faster and cheaper, and returns a fixed, pre-defined structured output.
Learn more about running a task.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "system",
content: "<task>speech_to_text</task>",
},
{
role: "user",
content: [
{ type: "text", text: "Transcribe the audio file https://r2public.jigsawstack.com/interfaze/examples/stt_medical_short.mp4" },
],
},
],
response_format: zodResponseFormat(z.any(), "empty_schema"),
});
console.log(response.choices[0].message.content);
Note how the URL is passed in the prompt instead of in the file object. This is another way to pass files to the model, and it gives a marginal speed increase.
JSON output
{
"object": {
"name": "speech_to_text",
"result": {
"text": "I just started a round of amoxicillin and I wanted to ask if it was safe to take that with my current spironolactone prescription.",
"chunks": [
{
"timestamp": [0.28, 4],
"text": "I just started a round of amoxicillin and I wanted to ask"
},
{
"timestamp": [4, 7.72],
"text": "if it was safe to take that with my current spironolactone prescription."
}
]
}
},
"response": {
"id": "interfaze-1775090484657",
"modelId": "interfaze-beta",
"body": {
"id": "interfaze-1775090484657",
"object": "chat.completion",
"model": "interfaze-beta",
"usage": {
"prompt_tokens": 1653,
"completion_tokens": 551,
"total_tokens": 2204
}
}
},
"finishReason": "stop",
"usage": {
"inputTokens": 1653,
"outputTokens": 551,
"totalTokens": 2204
}
}
Translate any audio or text into more than 100 languages while maintaining the original meaning and context.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const STTSchema = z.object({
translated_text: z.string().describe("translated text"),
original_language_code: z.string(),
translated_language_code: z.string(),
});
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "user",
content: "Transcribe the audio file and translate it to chinese https://r2public.jigsawstack.com/interfaze/examples/stt_medical_short.mp4",
},
],
response_format: zodResponseFormat(STTSchema, "stt_schema"),
});
console.log(response.choices[0].message.content);
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("STT Results:", precontext?.[0]?.result);
JSON output
{
"object": {
"translated_text": "我刚开始服用一轮阿莫西林,想问一下把它和我目前的螺内酯处方一起服用是否安全。",
"original_language_code": "en",
"translated_language_code": "zh"
},
"response": {
"id": "interfaze-1775088833045",
"modelId": "interfaze-beta",
"body": {
"id": "interfaze-1775088833045",
"object": "chat.completion",
"model": "interfaze-beta",
"usage": {
"prompt_tokens": 2758,
"completion_tokens": 800,
"total_tokens": 3558
},
"precontext": [
{
"name": "stt",
"result": {
"text": "I just started a round of amoxicillin and I wanted to ask if it was safe to take that with my current spironolactone prescription.",
"chunks": [
{
"timestamp": [0.28, 4],
"text": "I just started a round of amoxicillin and I wanted to ask"
},
{
"timestamp": [4, 7.72],
"text": "if it was safe to take that with my current spironolactone prescription."
}
]
}
},
{
"name": "translate",
"result": {
"translated_text": "我刚开始服用一轮阿莫西林,想问一下把它和我目前的螺内酯处方一起服用是否安全。",
"source_language": "auto-detected",
"target_language": "zh",
"batch_size": 1
}
}
]
}
},
"finishReason": "stop",
"usage": {
"inputTokens": 2758,
"outputTokens": 800,
"totalTokens": 3558
}
}
You can reference the precontext to get the raw results from the model for both the STT and translation steps.
Automatically de-noise and enhance low-quality audio for better transcription.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const STTSchema = z.object({
text: z.string(),
});
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "user",
content: [
{ type: "text", text: "Transcribe the audio file" },
{
type: "file",
file: {
filename: "noisy-audio.mp3",
file_data: "https://r2public.jigsawstack.com/interfaze/examples/noisy-audio.mp3",
},
},
],
},
],
response_format: zodResponseFormat(STTSchema, "stt_schema"),
});
console.log(response.choices[0].message.content);
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("STT Results:", precontext?.[0]?.result);
JSON output
{
"object": {
"text": "You are wearing the target speed steering system. You can click the button on the right and then look at me for a few seconds. Now when you are looking at me, the system will register my voice and enroll it. Now you can take a walk. So now the system has an enrollment of my voice, it can extract, it can focus on only my voice while ignoring all the interfering sounds in the environment. So we introduced this system called click on CD, where we, like suppose we're in a scenario like this where you're trying to hear my voice when someone else is trying to speak, then You can just look at my, you can look at me for a few seconds, get some noisy example of my voice, and then you can separate out or filter out my voice only from everybody else's."
},
"response": {
"id": "interfaze-1775093806362",
"modelId": "interfaze-beta",
"body": {
"id": "interfaze-1775093806362",
"object": "chat.completion",
"model": "interfaze-beta",
"usage": {
"prompt_tokens": 6057,
"completion_tokens": 3861,
"total_tokens": 9918
},
"precontext": [
{
"name": "stt",
"result": {
"text": "You are wearing the target speed steering system. You can click the button on the right and then look at me for a few seconds. Now when you are looking at me, the system will register my voice and enroll it. Now you can take a walk. So now the system has an enrollment of my voice, it can extract, it can focus on only my voice while ignoring all the interfering sounds in the environment. So we introduced this system called click on CD, where we, like suppose we're in a scenario like this where you're trying to hear my voice when someone else is trying to speak, then You can just look at my, you can look at me for a few seconds, get some noisy example of my voice, and then you can separate out or filter out my voice only from everybody else's.",
"chunks": [
{
"timestamp": [0, 4.52],
"text": "You are"
},
{
"timestamp": [4.52, 9.04],
"text": "wearing the target"
},
{
"timestamp": [9.04, 13.56],
"text": "speed steering system."
},
{
"timestamp": [13.56, 17.8],
"text": "You can click the button on the right and then look at me for a few seconds."
},
{
"timestamp": [17.8, 22.8],
"text": "Now when you are looking at me, the system will register my voice and enroll it."
},
{
"timestamp": [22.8, 26.76],
"text": "Now you can take a walk."
},
{
"timestamp": [26.76, 29.91],
"text": "So now the system has an enrollment of my"
},
{
"timestamp": [29.91, 33.06],
"text": "voice, it can extract, it can focus on only"
},
{
"timestamp": [33.06, 38],
"text": "my voice while ignoring all the interfering sounds in the environment."
},
{
"timestamp": [38, 42.29],
"text": "So we introduced this system called click on"
},
{
"timestamp": [42.29, 46.58],
"text": "CD, where we, like suppose we're in a scenario"
},
{
"timestamp": [46.58, 49.59],
"text": "like this where you're trying to hear my"
},
{
"timestamp": [49.59, 52.6],
"text": "voice when someone else is trying to speak, then"
},
{
"timestamp": [52.6, 56.08],
"text": "You can just look at my, you can look at me for a few seconds,"
},
{
"timestamp": [56.08, 58.92],
"text": "get some noisy example of my voice,"
},
{
"timestamp": [58.92, 62.98],
"text": "and then you can separate out or filter out my voice only"
},
{
"timestamp": [62.98, 64.34],
"text": "from everybody else's."
}
]
}
}
]
}
},
"finishReason": "stop",
"usage": {
"inputTokens": 6057,
"outputTokens": 3861,
"totalTokens": 9918
}
}
Get a precise timestamp for every word in the audio by asking the model to transcribe by word. This pairs nicely with run-task mode, using <task>speech_to_text</task> in the system message, for the fastest, cheapest path.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "system",
content: "<task>speech_to_text</task>",
},
{
role: "user",
content: "Transcribe the following audio by word https://r2public.jigsawstack.com/interfaze/examples/stt_call.mp3",
},
],
response_format: zodResponseFormat(z.any(), "empty_schema"),
});
console.log(response.choices[0].message.content);
JSON output
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const STTSchema = z.object({
text: z.string(),
summary: z.string().describe("two line summary of the audio"),
intent: z.string().describe("3 word intent of the audio"),
});
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "user",
content: "Transcribe the audio file and summarize it https://r2public.jigsawstack.com/interfaze/examples/stt_call.mp3",
},
],
response_format: zodResponseFormat(STTSchema, "stt_schema"),
});
console.log(response.choices[0].message.content);
//@ts-expect-error precontext is not typed
const precontext = response.precontext;
console.log("STT Results:", precontext?.[0]?.result);
JSON output
{
"object": {
"text": "Hi, thank you so much for calling Wham. My name is Allie. How can I help you today? Hey, just trying to change the payment info on the website since my sub is about to renew. I was wondering if you could do it on the phone. Yeah, that shouldn't be a problem. Could I have your first and last name, please? Aaron Schertz, S-C-H-E-R-T-Z. All right. Thank you so much, Aaron. Could I also have your phone number, please? 713-899-0745. Excellent. And I'm just going to verify with your security question really quick, if you don't mind. What street did you grow up on? Cypress Avenue. Okay. Excellent. Thank you so much. And I can just go ahead and update that card info for you. So first, what is the card number? 4708. Okay. 1209. Okay. 8732. Uh-huh. 7655. Great. And could I also have your expiration date as well? February 2028. Great. And could I also have your CVC on the back? 482 Okay, thank you And the billing address is still the same? Yep Okay, great Then you're all set Thank you so much for calling in",
"summary": "A customer called to update their payment information over the phone because their subscription is renewing. The agent verified their identity and then processed the updated credit card details.",
"intent": "Update payment information"
},
"response": {
"id": "interfaze-1775100650607",
"modelId": "interfaze-beta",
"body": {
"id": "interfaze-1775100650607",
"object": "chat.completion",
"model": "interfaze-beta",
"usage": {
"prompt_tokens": 8992,
"completion_tokens": 2593,
"total_tokens": 11585
},
"precontext": [
{
"name": "stt",
"result": {
"text": "Hi, thank you so much for calling Wham. My name is Allie. How can I help you today? Hey, just trying to change the payment info on the website since my sub is about to renew. I was wondering if you could do it on the phone. Yeah, that shouldn't be a problem. Could I have your first and last name, please? Aaron Schertz, S-C-H-E-R-T-Z. All right. Thank you so much, Aaron. Could I also have your phone number, please? 713-899-0745. Excellent. And I'm just going to verify with your security question really quick, if you don't mind. What street did you grow up on? Cypress Avenue. Okay. Excellent. Thank you so much. And I can just go ahead and update that card info for you. So first, what is the card number? 4708. Okay. 1209. Okay. 8732. Uh-huh. 7655. Great. And could I also have your expiration date as well? February 2028. Great. And could I also have your CVC on the back? 482 Okay, thank you And the billing address is still the same? Yep Okay, great Then you're all set Thank you so much for calling in",
"chunks": [
{
"timestamp": [0, 5],
"text": "Hi, thank you so much for calling Wham. My name is Allie. How can I help you today?"
},
{
"timestamp": [5, 8.5],
"text": "Hey, just trying to change the payment info on"
},
{
"timestamp": [8.5, 12],
"text": "the website since my sub is about to renew."
},
{
"timestamp": [12, 14],
"text": "I was wondering if you could do it on the phone."
},
{
"timestamp": [14, 17.5],
"text": "Yeah, that shouldn't be a problem. Could"
},
{
"timestamp": [17.5, 21],
"text": "I have your first and last name, please?"
},
{
"timestamp": [21, 26],
"text": "Aaron Schertz, S-C-H-E-R-T-Z."
},
{
"timestamp": [26, 28.88],
"text": "All right. Thank you so much, Aaron."
},
{
"timestamp": [28.88, 31.76],
"text": "Could I also have your phone number, please?"
},
{
"timestamp": [32.58, 36.16],
"text": "713-899-0745."
},
{
"timestamp": [37.62, 40.32],
"text": "Excellent. And I'm just going to verify with"
},
{
"timestamp": [40.32, 43.02],
"text": "your security question really quick, if you don't mind."
},
{
"timestamp": [43.62, 45.32],
"text": "What street did you grow up on?"
},
{
"timestamp": [45.86, 46.86],
"text": "Cypress Avenue."
},
{
"timestamp": [47.98, 51.26],
"text": "Okay. Excellent. Thank you so much. And I can"
},
{
"timestamp": [51.26, 54.54],
"text": "just go ahead and update that card info for you."
},
{
"timestamp": [56, 58.74],
"text": "So first, what is the card number?"
},
{
"timestamp": [61.36, 62.04],
"text": "4708."
},
{
"timestamp": [62.78, 63.48],
"text": "Okay."
},
{
"timestamp": [64.56, 65.24],
"text": "1209."
},
{
"timestamp": [65.62, 66.14],
"text": "Okay."
},
{
"timestamp": [67.22, 67.94],
"text": "8732."
},
{
"timestamp": [68.74, 69.06],
"text": "Uh-huh."
},
{
"timestamp": [70.22, 70.86],
"text": "7655."
},
{
"timestamp": [72.26, 72.94],
"text": "Great."
},
{
"timestamp": [73.24, 76.38],
"text": "And could I also have your expiration date as well?"
},
{
"timestamp": [76.84, 78.42],
"text": "February 2028."
},
{
"timestamp": [79.84, 80.52],
"text": "Great."
},
{
"timestamp": [80.92, 84.32],
"text": "And could I also have your CVC on the back?"
},
{
"timestamp": [84.32, 86.76],
"text": "482"
},
{
"timestamp": [86.76, 89.14],
"text": "Okay, thank you"
},
{
"timestamp": [89.14, 91.88],
"text": "And the billing address is still the same?"
},
{
"timestamp": [92.32, 92.68],
"text": "Yep"
},
{
"timestamp": [92.68, 94.2],
"text": "Okay, great"
},
{
"timestamp": [94.2, 96.66],
"text": "Then you're all set"
},
{
"timestamp": [96.66, 98],
"text": "Thank you so much for calling in"
}
]
}
}
]
}
},
"finishReason": "stop",
"usage": {
"inputTokens": 8992,
"outputTokens": 2593,
"totalTokens": 11585
}
}
To get the best performance with long audio files, use run-task mode with <task>speech_to_text</task> in the system prompt; this activates only the part of the model used for audio.
OpenAI SDK
Vercel AI SDK
LangChain SDK
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
const response = await interfaze.chat.completions.create({
model: "interfaze-beta",
messages: [
{
role: "system",
content: "<task>speech_to_text</task>",
},
{
role: "user",
content: [
{ type: "text", text: "Transcribe the audio file https://r2public.jigsawstack.com/interfaze/examples/stt_long_audio_sample_3.mp3" },
],
},
],
response_format: zodResponseFormat(z.any(), "empty_schema"),
});
console.log(response.choices[0].message.content);
This took 50 seconds to transcribe a 1 hour 35 minute audio file.
JSON output
The output is truncated for this example due to size.
Check out how to perform speaker diarization here.