> ## Documentation Index
> Fetch the complete documentation index at: https://docs.aihubmix.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Create Transcription

> Transcribes audio into the input language.

Returns a transcription object in `json`, `diarized_json`, or `verbose_json`
format, or a stream of transcript events.


## OpenAPI

````yaml /openapi.json post /v1/audio/transcriptions
# Spec header: OpenAPI version plus gateway-level metadata.
openapi: 3.1.0
info:
  title: AIHubMix Gateway API
  version: '2.1.1'
  contact:
    name: AIHubMix Team
  description: |
    Unified multi-protocol gateway. Exposes three native shapes side-by-side:
      - OpenAI (`/v1/chat/completions`, `/v1/responses`, ...)
      - Anthropic (`/v1/messages`)
      - Gemini (`/v1beta/models/{model}:generateContent`)
    Each vendor's native SDK connects directly. Gateway-only fields are
    marked with `x-source.authority: gateway`.
  # Provenance extension: where the pieces of this document come from.
  x-upstream:
    vendorSpecs:
      installedFrom: npm
      packageVersion: '0.1.1'
    # Upstream vendor specs, each recorded with a version and a content hash.
    sources:
      openai/official:
        version: '2026-05-31'
        hash: 'sha256:a203971a0bc0cd3b903b9488e3ac40eaa161a274dfaf172c1da73eee029147a8'
      anthropic/official:
        version: '2026-05-31'
        hash: 'sha256:f9d488f9290d78a081bed62fa0c040476b5d009f3db92c06c8e279e69df73fc9'
    # Spec layers; each entry names its source, role, protocol and authority.
    layers:
      - source: avs://openai/official
        role: L0-base
        protocol: openai
        authority: official
      - source: avs://anthropic/official
        role: L0-base
        protocol: anthropic
        authority: official
      - source: gateway/anthropic-mappings.yml
        role: gateway-other
        protocol: all
        authority: gateway
      - source: gateway/auth.yml
        role: horizontal-auth
        protocol: all
        authority: gateway
      - source: gateway/errors.yml
        role: horizontal-errors
        protocol: all
        authority: gateway
      - source: gateway/openai-gateway.yml
        role: L3-gateway
        protocol: openai
        authority: gateway
      - source: gateway/openai-image-mappings.yml
        role: gateway-other
        protocol: all
        authority: gateway
      - source: gateway/openai-mappings.yml
        role: gateway-other
        protocol: all
        authority: gateway
      - source: gateway/openai-passthrough.yml
        role: L2-passthrough
        protocol: openai
        authority: gateway
# Base URLs for the API; the second entry is the documented fallback host.
servers:
  - description: Production
    url: 'https://aihubmix.com'
  - description: Backup (use when the primary domain is unreachable)
    url: 'https://api.inferera.com'
# Document-level security requirement: the `gatewayBearer` scheme defined
# under components/securitySchemes, with an empty scope list.
security:
  - gatewayBearer: []
# Tag catalogue used to group operations by the vendor protocol they mirror.
tags:
  - name: OpenAI Compatible
    description: >-
      OpenAI-compatible endpoints (chat, completions, embeddings, images,
      audio, videos, moderations).
  - name: Anthropic Compatible
    description: 'Anthropic-native endpoints (messages with Anthropic protocol).'
  - name: Google Vertex AI Compatible
    description: 'Google Gemini / Vertex AI endpoints.'
paths:
  /v1/audio/transcriptions:
    post:
      operationId: createTranscription
      summary: Create Transcription
      tags:
        - OpenAI Compatible
      description: |-
        Transcribes audio into the input language.

        Returns a transcription object in `json`, `diarized_json`, or `verbose_json`
        format, or a stream of transcript events.
      # The request is a multipart form described by CreateTranscriptionRequest.
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/CreateTranscriptionRequest'
      responses:
        # Success: one of three JSON shapes, or a server-sent event stream.
        '200':
          description: OK
          content:
            application/json:
              schema:
                oneOf:
                  - $ref: '#/components/schemas/CreateTranscriptionResponseJson'
                  - $ref: '#/components/schemas/CreateTranscriptionResponseDiarizedJson'
                  - $ref: '#/components/schemas/CreateTranscriptionResponseVerboseJson'
            text/event-stream:
              schema:
                $ref: '#/components/schemas/CreateTranscriptionResponseStreamEvent'
        # Every error status below shares the GatewayError envelope.
        '400':
          description: Invalid request
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/GatewayError'
        '401':
          description: Authentication failed
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/GatewayError'
        '403':
          description: Permission denied
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/GatewayError'
        '404':
          description: Not found
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/GatewayError'
        '429':
          description: Rate limited
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/GatewayError'
        '5XX':
          description: Server / upstream error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/GatewayError'
components:
  schemas:
    CreateTranscriptionRequest:
      # Multipart form fields for the create-transcription operation. Only
      # `file` and `model` are required; fields not listed here are rejected
      # because `additionalProperties` is false.
      type: object
      additionalProperties: false
      properties:
        file:
          description: >
            The audio file object (not file name) to transcribe, in one of these
            formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          type: string
          format: binary
        model:
          description: >
            ID of the model to use. The options are `gpt-4o-transcribe`,
            `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`,
            `whisper-1` (which is powered by our open source Whisper V2 model),
            and `gpt-4o-transcribe-diarize`.
          example: gpt-4o-transcribe
          # Open enum: any string validates; the second alternative only lists
          # the documented model IDs.
          anyOf:
            - type: string
            - type: string
              enum:
                - whisper-1
                - gpt-4o-transcribe
                - gpt-4o-mini-transcribe
                - gpt-4o-mini-transcribe-2025-12-15
                - gpt-4o-transcribe-diarize
        language:
          description: >
            The language of the input audio. Supplying the input language in
            [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
            (e.g. `en`) format will improve accuracy and latency.
          type: string
        prompt:
          description: >
            An optional text to guide the model's style or continue a previous
            audio segment. The
            [prompt](https://developers.openai.com/api/docs/guides/speech-to-text#prompting)
            should match the audio language. This field is not supported when
            using `gpt-4o-transcribe-diarize`.
          type: string
        response_format:
          $ref: '#/components/schemas/AudioResponseFormat'
        temperature:
          description: >
            The sampling temperature, between 0 and 1. Higher values like 0.8
            will make the output more random, while lower values like 0.2 will
            make it more focused and deterministic. If set to 0, the model will
            use [log probability](https://en.wikipedia.org/wiki/Log_probability)
            to automatically increase the temperature until certain thresholds
            are hit.
          type: number
          default: 0
        include:
          description: >
            Additional information to include in the transcription response.

            `logprobs` will return the log probabilities of the tokens in the

            response to understand the model's confidence in the transcription.

            `logprobs` only works with response_format set to `json` and only
            with

            the models `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and
            `gpt-4o-mini-transcribe-2025-12-15`. This field is not supported
            when using `gpt-4o-transcribe-diarize`.
          type: array
          items:
            $ref: '#/components/schemas/TranscriptionInclude'
        timestamp_granularities:
          description: >
            The timestamp granularities to populate for this transcription.
            `response_format` must be set to `verbose_json` to use timestamp
            granularities. Either or both of these options are supported:
            `word`, or `segment`. Note: There is no additional latency for
            segment timestamps, but generating word timestamps incurs additional
            latency.

            This option is not available for `gpt-4o-transcribe-diarize`.
          type: array
          items:
            type: string
            enum:
              - word
              - segment
          default:
            - segment
        stream:
          # Nullable boolean, expressed as boolean-or-null alternatives.
          anyOf:
            - description: >
                If set to true, the model response data will be streamed to the
                client

                as it is generated using [server-sent
                events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).

                See the [Streaming section of the Speech-to-Text
                guide](https://developers.openai.com/api/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)

                for more information.


                Note: Streaming is not supported for the `whisper-1` model and
                will be ignored.
              type: boolean
              default: false
            - type: 'null'
        chunking_strategy:
          # Either the literal `auto`, a VadConfig object, or null.
          anyOf:
            - description: >-
                Controls how the audio is cut into chunks. When set to `"auto"`,
                the server first normalizes loudness and then uses voice
                activity detection (VAD) to choose boundaries. `server_vad`
                object can be provided to tweak VAD detection parameters
                manually. If unset, the audio is transcribed as a single block.
                Required when using `gpt-4o-transcribe-diarize` for inputs
                longer than 30 seconds.
              anyOf:
                - type: string
                  enum:
                    - auto
                  default: auto
                  description: >
                    Automatically set chunking parameters based on the audio.
                    Must be set to `"auto"`.
                - $ref: '#/components/schemas/VadConfig'
            - type: 'null'
        known_speaker_names:
          description: >
            Optional list of speaker names that correspond to the audio samples
            provided in `known_speaker_references[]`. Each entry should be a
            short identifier (for example `customer` or `agent`). Up to 4
            speakers are supported.
          type: array
          maxItems: 4
          items:
            type: string
        known_speaker_references:
          description: >
            Optional list of audio samples (as [data
            URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
            that contain known speaker references matching
            `known_speaker_names[]`. Each sample must be between 2 and 10
            seconds, and can use any of the same input audio formats supported
            by `file`.
          type: array
          maxItems: 4
          items:
            type: string
      required:
        - file
        - model
      x-source:
        from: avs://openai/official
        authority: official
    CreateTranscriptionResponseJson:
      # JSON response shape: the transcript text plus optional log
      # probabilities and usage statistics. Only `text` is required.
      type: object
      description: >-
        Represents a transcription response returned by model, based on the
        provided input.
      properties:
        text:
          type: string
          description: The transcribed text.
        # Optional by omission from `required`; JSON Schema has no `optional`
        # keyword, so none is set here.
        logprobs:
          type: array
          description: >
            The log probabilities of the tokens in the transcription. Only
            returned with the models `gpt-4o-transcribe` and
            `gpt-4o-mini-transcribe` if `logprobs` is added to the `include`
            array.
          items:
            type: object
            properties:
              token:
                type: string
                description: The token in the transcription.
              logprob:
                type: number
                description: The log probability of the token.
              bytes:
                type: array
                items:
                  type: number
                description: The bytes of the token.
        usage:
          description: Token usage statistics for the request.
          oneOf:
            - $ref: '#/components/schemas/TranscriptTextUsageTokens'
              title: Token Usage
            - $ref: '#/components/schemas/TranscriptTextUsageDuration'
              title: Duration Usage
          # Explicit mapping because the `type` values (`tokens`, `duration`)
          # are not the schema names an implicit mapping would look for.
          discriminator:
            propertyName: type
            mapping:
              tokens: '#/components/schemas/TranscriptTextUsageTokens'
              duration: '#/components/schemas/TranscriptTextUsageDuration'
      required:
        - text
      x-source:
        from: avs://openai/official
        authority: official
    CreateTranscriptionResponseDiarizedJson:
      # Diarized response shape: full transcript plus speaker-labelled
      # segments. `usage` is the only optional property.
      type: object
      description: >
        Represents a diarized transcription response returned by the model,
        including the combined transcript and speaker-segment annotations.
      properties:
        task:
          type: string
          description: The type of task that was run. Always `transcribe`.
          enum:
            - transcribe
        duration:
          type: number
          format: double
          description: Duration of the input audio in seconds.
        text:
          type: string
          description: The concatenated transcript text for the entire audio input.
        segments:
          type: array
          description: >-
            Segments of the transcript annotated with timestamps and speaker
            labels.
          items:
            $ref: '#/components/schemas/TranscriptionDiarizedSegment'
        usage:
          description: Token or duration usage statistics for the request.
          oneOf:
            - $ref: '#/components/schemas/TranscriptTextUsageTokens'
              title: Token Usage
            - $ref: '#/components/schemas/TranscriptTextUsageDuration'
              title: Duration Usage
          # Explicit mapping because the `type` values (`tokens`, `duration`)
          # are not the schema names an implicit mapping would look for.
          discriminator:
            propertyName: type
            mapping:
              tokens: '#/components/schemas/TranscriptTextUsageTokens'
              duration: '#/components/schemas/TranscriptTextUsageDuration'
      required:
        - task
        - duration
        - text
        - segments
      x-source:
        from: avs://openai/official
        authority: official
    CreateTranscriptionResponseVerboseJson:
      # Verbose response shape: language, duration and text are required;
      # word and segment timing lists and usage are optional.
      type: object
      description: >-
        Represents a verbose json transcription response returned by model, based
        on the provided input.
      x-source:
        authority: official
        from: avs://openai/official
      required:
        - language
        - duration
        - text
      properties:
        language:
          description: The language of the input audio.
          type: string
        duration:
          description: The duration of the input audio.
          type: number
          format: double
        text:
          description: The transcribed text.
          type: string
        words:
          description: Extracted words and their corresponding timestamps.
          type: array
          items:
            $ref: '#/components/schemas/TranscriptionWord'
        segments:
          description: Segments of the transcribed text and their corresponding details.
          type: array
          items:
            $ref: '#/components/schemas/TranscriptionSegment'
        usage:
          $ref: '#/components/schemas/TranscriptTextUsageDuration'
    CreateTranscriptionResponseStreamEvent:
      # One event of the streamed response; the variant is selected by the
      # event's `type` field.
      anyOf:
        - $ref: '#/components/schemas/TranscriptTextSegmentEvent'
        - $ref: '#/components/schemas/TranscriptTextDeltaEvent'
        - $ref: '#/components/schemas/TranscriptTextDoneEvent'
      # Explicit mapping because the `type` values are `transcript.text.*`
      # strings, not the schema names an implicit mapping would look for.
      discriminator:
        propertyName: type
        mapping:
          transcript.text.segment: '#/components/schemas/TranscriptTextSegmentEvent'
          transcript.text.delta: '#/components/schemas/TranscriptTextDeltaEvent'
          transcript.text.done: '#/components/schemas/TranscriptTextDoneEvent'
      x-source:
        from: avs://openai/official
        authority: official
    GatewayError:
      # Error envelope shared by every non-2XX response of the operation.
      type: object
      description: |
        OpenAI-compatible error envelope. Top-level `error` object carrying
        `message` / `type` / `param` / `code`. `request_id` lives in the
        `X-Request-Id` response header, not in the body. The legacy `upstream`
        field is no longer emitted.
      x-source:
        from: gateway
        authority: gateway
      required:
        - error
      properties:
        error:
          type: object
          required:
            - message
          properties:
            message:
              type: string
              description: Human-readable error message.
            type:
              # OpenAPI 3.1 has no `nullable` keyword; null is declared as one
              # of the permitted types instead.
              type:
                - string
                - 'null'
              description: >
                Error category. OpenAI-compatible values include

                invalid_request_error / authentication_error / permission_error
                /

                not_found_error / rate_limit_error, etc. The upstream value is

                passed through verbatim; the enum is not fixed.
            param:
              type:
                - string
                - 'null'
              description: Name of the offending field, when applicable.
            code:
              # Deliberately untyped: a schema without `type` accepts any JSON
              # value, null included.
              description: |
                Sub-error code. May be a string or integer (the gateway treats
                `code` as an arbitrary type).
    AudioResponseFormat:
      # Closed string enum of output formats; `json` is the default.
      type: string
      default: json
      enum:
        - json
        - text
        - srt
        - verbose_json
        - vtt
        - diarized_json
      description: >
        The format of the output, in one of these options: `json`, `text`,
        `srt`, `verbose_json`, `vtt`, or `diarized_json`. For
        `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, the only supported
        format is `json`. For `gpt-4o-transcribe-diarize`, the supported
        formats are `json`, `text`, and `diarized_json`, with `diarized_json`
        required to receive speaker annotations.
      x-source:
        authority: official
        from: avs://openai/official
    TranscriptionInclude:
      # A single entry of the request's `include` array. No `default` is set:
      # this schema describes one string item, so an array default would not
      # validate against it.
      type: string
      enum:
        - logprobs
      x-source:
        from: avs://openai/official
        authority: official
    VadConfig:
      # Manual chunking configuration. `type` must be `server_vad`; the three
      # tuning fields are optional and carry defaults. Unknown fields are
      # rejected because `additionalProperties` is false.
      type: object
      additionalProperties: false
      required:
        - type
      properties:
        type:
          type: string
          enum:
            - server_vad
          description: >-
            Must be set to `server_vad` to enable manual chunking using server
            side VAD.
        prefix_padding_ms:
          type: integer
          default: 300
          description: |
            Amount of audio to include before the VAD detected speech (in
            milliseconds).
        silence_duration_ms:
          type: integer
          default: 200
          description: |
            Duration of silence to detect speech stop (in milliseconds).
            With shorter values the model will respond more quickly,
            but may jump in on short pauses from the user.
        threshold:
          type: number
          default: 0.5
          description: >
            Sensitivity threshold (0.0 to 1.0) for voice activity detection. A

            higher threshold will require louder audio to activate the model,
            and

            thus might perform better in noisy environments.
      x-source:
        from: avs://openai/official
        authority: official
    TranscriptTextUsageTokens:
      # Token-billed usage variant, tagged by `type: tokens`.
      title: Token Usage
      type: object
      description: Usage statistics for models billed by token usage.
      x-source:
        authority: official
        from: avs://openai/official
      required:
        - type
        - input_tokens
        - output_tokens
        - total_tokens
      properties:
        type:
          description: The type of the usage object. Always `tokens` for this variant.
          type: string
          enum:
            - tokens
        input_tokens:
          description: Number of input tokens billed for this request.
          type: integer
        input_token_details:
          description: Details about the input tokens billed for this request.
          type: object
          properties:
            text_tokens:
              description: Number of text tokens billed for this request.
              type: integer
            audio_tokens:
              description: Number of audio tokens billed for this request.
              type: integer
        output_tokens:
          description: Number of output tokens generated.
          type: integer
        total_tokens:
          description: Total number of tokens used (input + output).
          type: integer
    TranscriptTextUsageDuration:
      # Duration-billed usage variant, tagged by `type: duration`.
      title: Duration Usage
      type: object
      description: Usage statistics for models billed by audio input duration.
      x-source:
        authority: official
        from: avs://openai/official
      required:
        - type
        - seconds
      properties:
        type:
          description: The type of the usage object. Always `duration` for this variant.
          type: string
          enum:
            - duration
        seconds:
          description: Duration of the input audio in seconds.
          type: number
          format: double
    TranscriptionDiarizedSegment:
      # One speaker-labelled segment of a diarized transcript; every property
      # is required.
      type: object
      description: A segment of diarized transcript text with speaker metadata.
      x-source:
        authority: official
        from: avs://openai/official
      required:
        - type
        - id
        - start
        - end
        - text
        - speaker
      properties:
        type:
          description: |
            The type of the segment. Always `transcript.text.segment`.
          type: string
          enum:
            - transcript.text.segment
        id:
          description: Unique identifier for the segment.
          type: string
        start:
          description: Start timestamp of the segment in seconds.
          type: number
          format: double
        end:
          description: End timestamp of the segment in seconds.
          type: number
          format: double
        text:
          description: Transcript text for this segment.
          type: string
        speaker:
          description: >
            Speaker label for this segment. When known speakers are provided, the
            label matches `known_speaker_names[]`. Otherwise speakers are labeled
            sequentially using capital letters (`A`, `B`, ...).
          type: string
    TranscriptionWord:
      # A single word with its start and end times; all three are required.
      type: object
      x-source:
        authority: official
        from: avs://openai/official
      required:
        - word
        - start
        - end
      properties:
        word:
          description: The text content of the word.
          type: string
        start:
          description: Start time of the word in seconds.
          type: number
          format: double
        end:
          description: End time of the word in seconds.
          type: number
          format: double
    TranscriptionSegment:
      # One segment of a verbose transcript with its decoding statistics;
      # every property is required.
      type: object
      x-source:
        authority: official
        from: avs://openai/official
      required:
        - id
        - seek
        - start
        - end
        - text
        - tokens
        - temperature
        - avg_logprob
        - compression_ratio
        - no_speech_prob
      properties:
        id:
          description: Unique identifier of the segment.
          type: integer
        seek:
          description: Seek offset of the segment.
          type: integer
        start:
          description: Start time of the segment in seconds.
          type: number
          format: double
        end:
          description: End time of the segment in seconds.
          type: number
          format: double
        text:
          description: Text content of the segment.
          type: string
        tokens:
          description: Array of token IDs for the text content.
          type: array
          items:
            type: integer
        temperature:
          description: Temperature parameter used for generating the segment.
          type: number
          format: float
        avg_logprob:
          description: >-
            Average logprob of the segment. If the value is lower than -1, consider
            the logprobs failed.
          type: number
          format: float
        compression_ratio:
          description: >-
            Compression ratio of the segment. If the value is greater than 2.4,
            consider the compression failed.
          type: number
          format: float
        no_speech_prob:
          # NOTE(review): a probability cannot exceed 1.0, so the threshold
          # quoted below looks like an upstream typo - confirm against the
          # vendor documentation before changing the text.
          description: >-
            Probability of no speech in the segment. If the value is higher than 1.0
            and the `avg_logprob` is below -1, consider this segment silent.
          type: number
          format: float
    TranscriptTextSegmentEvent:
      # Stream event carrying one completed diarized segment; every property
      # is required.
      type: object
      description: >
        Emitted when a diarized transcription returns a completed segment with
        speaker information. Only emitted when you [create a
        transcription](https://developers.openai.com/api/docs/api-reference/audio/create-transcription)
        with `stream` set to `true` and `response_format` set to
        `diarized_json`.
      x-source:
        authority: official
        from: avs://openai/official
      required:
        - type
        - id
        - start
        - end
        - text
        - speaker
      properties:
        type:
          description: The type of the event. Always `transcript.text.segment`.
          type: string
          enum:
            - transcript.text.segment
        id:
          description: Unique identifier for the segment.
          type: string
        start:
          description: Start timestamp of the segment in seconds.
          type: number
          format: double
        end:
          description: End timestamp of the segment in seconds.
          type: number
          format: double
        text:
          description: Transcript text for this segment.
          type: string
        speaker:
          description: Speaker label for this segment.
          type: string
    TranscriptTextDeltaEvent:
      # Stream event carrying an incremental piece of transcript text. `type`
      # and `delta` are required; `logprobs` and `segment_id` are optional.
      type: object
      description: >-
        Emitted when there is an additional text delta. This is also the first
        event emitted when the transcription starts. Only emitted when you
        [create a
        transcription](https://developers.openai.com/api/docs/api-reference/audio/create-transcription)
        with the `stream` parameter set to `true`.
      properties:
        type:
          type: string
          description: |
            The type of the event. Always `transcript.text.delta`.
          enum:
            - transcript.text.delta
        delta:
          type: string
          description: |
            The text delta that was additionally transcribed.
        logprobs:
          type: array
          description: >
            The log probabilities of the delta. Only included if you [create a
            transcription](https://developers.openai.com/api/docs/api-reference/audio/create-transcription)
            with the `include[]` parameter set to `logprobs`.
          items:
            type: object
            properties:
              token:
                type: string
                description: |
                  The token that was used to generate the log probability.
              logprob:
                type: number
                description: |
                  The log probability of the token.
              bytes:
                type: array
                items:
                  type: integer
                description: |
                  The bytes that were used to generate the log probability.
        segment_id:
          type: string
          description: >
            Identifier of the diarized segment that this delta belongs to. Only
            present when using `gpt-4o-transcribe-diarize`.
      required:
        - type
        - delta
      x-source:
        from: avs://openai/official
        authority: official
    TranscriptTextDoneEvent:
      # Final stream event carrying the complete transcript. `type` and `text`
      # are required; `logprobs` and `usage` are optional.
      type: object
      description: >-
        Emitted when the transcription is complete. Contains the complete
        transcription text. Only emitted when you [create a
        transcription](https://developers.openai.com/api/docs/api-reference/audio/create-transcription)
        with the `stream` parameter set to `true`.
      properties:
        type:
          type: string
          description: |
            The type of the event. Always `transcript.text.done`.
          enum:
            - transcript.text.done
        text:
          type: string
          description: |
            The text that was transcribed.
        logprobs:
          type: array
          description: >
            The log probabilities of the individual tokens in the transcription.
            Only included if you [create a
            transcription](https://developers.openai.com/api/docs/api-reference/audio/create-transcription)
            with the `include[]` parameter set to `logprobs`.
          items:
            type: object
            properties:
              token:
                type: string
                description: |
                  The token that was used to generate the log probability.
              logprob:
                type: number
                description: |
                  The log probability of the token.
              bytes:
                type: array
                items:
                  type: integer
                description: |
                  The bytes that were used to generate the log probability.
        usage:
          $ref: '#/components/schemas/TranscriptTextUsageTokens'
      required:
        - type
        - text
      x-source:
        from: avs://openai/official
        authority: official
  securitySchemes:
    gatewayBearer:
      # HTTP bearer authentication; the token is the gateway-issued API key.
      type: http
      scheme: bearer
      bearerFormat: API Key
      x-source:
        authority: gateway
        from: gateway
      description: |
        Gateway-issued API key, formatted as `sk-gateway-xxxxxxxx`.
        Used by OpenAI-shaped endpoints (/v1/chat/completions, etc.).

````