Azure · jpalvarezl · Aug 24, 2023 · Aug 30, 2023 · Aug 30, 2023 · Aug 30, 2023
@@ -52,6 +52,9 @@ enum ServiceApiVersions {
 
   @useDependency(Azure.Core.Versions.v1_0_Preview_1)
   v2023_08_01_Preview: "2023-08-01-preview",
+
+  @useDependency(Azure.Core.Versions.v1_0_Preview_1)
+  v2023_09_01_Preview: "2023-09-01-preview",
 }
 
 @doc("A specific deployment")

diff --git a/specification/cognitiveservices/OpenAI.Inference/models/transcription.create.tsp b/specification/cognitiveservices/OpenAI.Inference/models/transcription.create.tsp
@@ -0,0 +1,53 @@
+import "@typespec/rest";
+import "@typespec/http";
+
+using TypeSpec.Rest;
+using TypeSpec.Http;
+
+namespace Azure.OpenAI;
+
+@doc("""
+Transcription request.
+Requesting format 'json' will result in only the 'text' field being set.
+For more output data use 'verbose_json'.
+""")
+model AudioTranscriptionsOptions {
+    @doc("The audio file object to transcribe.")
+    // Raw audio content: `bytes` is TypeSpec's binary scalar, whereas `string` would describe text.
+    file: bytes;
+
+    @doc("An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.")
+    prompt?: string;
+
+    @doc("The format of the transcription output, in one of these options: json, text, srt, verbose_json, or vtt.")
+    @projectedName("json", "response_format")
+    responseFormat?: AudioTranscriptionFormat = AudioTranscriptionFormat.json;
+
+    @doc("""
+    The sampling temperature, between 0 and 1.
+    Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+    If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
+    """)
+    temperature?: float32 = 0;
+
+    @doc("The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.")
+    language?: string;
+}
+
+@doc("Transcription response.")
+model AudioTranscription {
+    @doc("Transcription task.")
+    task?: string;
+
+    @doc("Transcribed text.")
+    text: string;
+
+    @doc("Language.")
+    language?: string;
+
+    @doc("Duration, in seconds.")
+    @encode("seconds", float32) duration?: duration; // numeric seconds on the wire, not an ISO 8601 string
+
+    @doc("Segments.")
+    segments?: AudioTranscriptionSegment[];
+}
diff --git a/specification/cognitiveservices/OpenAI.Inference/models/translation.create.tsp b/specification/cognitiveservices/OpenAI.Inference/models/translation.create.tsp
@@ -0,0 +1,52 @@
+import "@typespec/rest";
+import "@typespec/http";
+
+import "./whisper.common.tsp";
+
+using TypeSpec.Rest;
+using TypeSpec.Http;
+
+namespace Azure.OpenAI;
+
+@doc("""
+Translation request.
+Requesting format 'json' will result in only the 'text' field being set.
+For more output data use 'verbose_json'.
+""")
+model AudioTranslationOptions {
+    @doc("The audio file to translate.")
+    // Raw audio content: `bytes` is TypeSpec's binary scalar, whereas `string` would describe text.
+    file: bytes;
+
+    @doc("An optional text to guide the model's style or continue a previous audio segment. The prompt should be in English.")
+    prompt?: string;
+
+    @doc("The format of the translation output, in one of these options: json, text, srt, verbose_json, or vtt.")
+    @projectedName("json", "response_format")
+    responseFormat?: AudioTranscriptionFormat = AudioTranscriptionFormat.json;
+
+    @doc("""
+    The sampling temperature, between 0 and 1.
+    Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+    If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
+    """)
+    temperature?: float32 = 0;
+}
+
+@doc("Translation response.")
+model AudioTranslation {
+    @doc("Translation task.")
+    task?: string;
+
+    @doc("Translated text.")
+    text: string;
+
+    @doc("Language.")
+    language?: string;
+
+    @doc("Duration, in seconds.")
+    @encode("seconds", float32) duration?: duration; // numeric seconds on the wire, not an ISO 8601 string
+
+    @doc("Transcription segments.")
+    segments?: AudioTranscriptionSegment[];
+}
diff --git a/specification/cognitiveservices/OpenAI.Inference/models/whisper.common.tsp b/specification/cognitiveservices/OpenAI.Inference/models/whisper.common.tsp
@@ -0,0 +1,57 @@
+import "@typespec/rest";
+import "@typespec/http";
+
+import "./transcription.create.tsp";
+
+using TypeSpec.Rest;
+using TypeSpec.Http;
+
+namespace Azure.OpenAI;
+
+@doc("Defines the format of the output.")
+enum AudioTranscriptionFormat {
+    @doc("JSON format. The translation/transcription response will only contain 'text'.")
+    json: "json",
+
+    @doc("Text format. The translation/transcription response will be of type text/plain.")
+    text: "text",
+
+    @doc("SRT format. The translation/transcription response will be of type text/plain.")
+    srt: "srt",
+
+    @doc("Verbose JSON format. The translation/transcription response will contain additional information.")
+    verbose_json: "verbose_json",
+
+    @doc("VTT format. The translation/transcription response will be of type text/plain.")
+    vtt: "vtt",
+}
+
+@doc("Transcription segment.")
+model AudioTranscriptionSegment {
+    @doc("Segment identifier.")
+    id?: int32; // the service emits segment ids as integers, not strings
+
+    @doc("Segment start offset, in seconds.")
+    @encode("seconds", float32) start?: duration; // fractional seconds; int32 could not represent them
+
+    @doc("Segment end offset, in seconds.")
+    @encode("seconds", float32) end?: duration; // fractional seconds; int32 could not represent them
+
+    @doc("Segment text.")
+    text?: string;
+
+    @doc("Temperature.")
+    temperature?: float32;
+
+    @doc("Average log probability.")
+    @projectedName("json", "avg_logprob")
+    averageLogProb?: float32;
+
+    @doc("Compression ratio.")
+    @projectedName("json", "compression_ratio")
+    compressionRatio?: float32;
+
+    @doc("Probability of 'no speech'.")
+    @projectedName("json", "no_speech_prob")
+    noSpeechProb?: float32;
+}
@@ -7,6 +7,8 @@ import "./models/completions.create.tsp";
 import "./models/chat.completions.tsp";
 import "./models/embeddings.create.tsp";
 import "./models/images.tsp";
+import "./models/transcription.create.tsp";
+import "./models/translation.create.tsp";
 
 using TypeSpec.Rest;
 using TypeSpec.Http;
@@ -99,3 +101,23 @@ op beginAzureBatchImageGeneration is OaiLongRunningRpcOperation<
   BatchImageGenerationOperationResponse,
   BatchImageGenerationOperationResponse
 >;
+
+@doc("Transcribes audio into the input language.")
+@added(ServiceApiVersions.v2023_09_01_Preview)
+@actionSeparator("/") // ResourceAction already routes under the Deployment resource; @action is only the suffix
+@action("audio/transcriptions") //@convenientAPI(true)
+op getAudioTranscriptions is Azure.Core.ResourceAction<
+  Deployment,
+  AudioTranscriptionsOptions,
+  AudioTranscription
+>;
+
+@doc("Transcribes and translates input audio into English text.")
+@added(ServiceApiVersions.v2023_09_01_Preview)
+@actionSeparator("/") // ResourceAction already routes under the Deployment resource; @action is only the suffix
+@action("audio/translations") //@convenientAPI(true)
+op getAudioTranslations is Azure.Core.ResourceAction<
+  Deployment,
+  AudioTranslationOptions,
+  AudioTranslation
+>;
@@ -102,11 +102,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Creates a completion for the provided prompt, parameters and chosen model.": {
-            "$ref": "./examples/chat_completions.json"
-          }
         }
       }
     },
@@ -153,11 +148,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Creates a completion for the provided prompt, parameters and chosen model.": {
-            "$ref": "./examples/completions.json"
-          }
         }
       }
     },
@@ -204,11 +194,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Return the embeddings for a given prompt.": {
-            "$ref": "./examples/embeddings.json"
-          }
         }
       }
     },
@@ -256,11 +241,6 @@
             }
           }
         },
-        "x-ms-examples": {
-          "Starts the generation of a batch of images from a text caption": {
-            "$ref": "./examples/start_generate_image.json"
-          }
-        },
         "x-ms-long-running-operation": true
       }
     },
@@ -299,11 +279,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Returns the status of the images operation": {
-            "$ref": "./examples/get_image_operation_status.json"
-          }
         }
       }
     }

@@ -102,11 +102,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Creates a completion for the provided prompt, parameters and chosen model.": {
-            "$ref": "./examples/chat_completions.json"
-          }
         }
       }
     },
@@ -153,11 +148,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Creates a completion for the provided prompt, parameters and chosen model.": {
-            "$ref": "./examples/completions.json"
-          }
         }
       }
     },
@@ -204,11 +194,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Return the embeddings for a given prompt.": {
-            "$ref": "./examples/embeddings.json"
-          }
         }
       }
     },
@@ -256,11 +241,6 @@
             }
           }
         },
-        "x-ms-examples": {
-          "Starts the generation of a batch of images from a text caption": {
-            "$ref": "./examples/start_generate_image.json"
-          }
-        },
         "x-ms-long-running-operation": true
       }
     },
@@ -299,11 +279,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Returns the status of the images operation": {
-            "$ref": "./examples/get_image_operation_status.json"
-          }
         }
       }
     }

@@ -102,11 +102,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Creates a completion for the provided prompt, parameters and chosen model.": {
-            "$ref": "./examples/chat_completions.json"
-          }
         }
       }
     },
@@ -153,11 +148,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Creates a completion for the provided prompt, parameters and chosen model.": {
-            "$ref": "./examples/completions.json"
-          }
         }
       }
     },
@@ -204,11 +194,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Return the embeddings for a given prompt.": {
-            "$ref": "./examples/embeddings.json"
-          }
         }
       }
     },
@@ -255,11 +240,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Creates a completion for the provided prompt, parameters and chosen model. Uses Azure OpenAI chat extensions.": {
-            "$ref": "./examples/extensions_chat_completions.json"
-          }
         }
       }
     },
@@ -307,11 +287,6 @@
             }
           }
         },
-        "x-ms-examples": {
-          "Starts the generation of a batch of images from a text caption": {
-            "$ref": "./examples/start_generate_image.json"
-          }
-        },
         "x-ms-long-running-operation": true
       }
     },
@@ -350,11 +325,6 @@
               }
             }
           }
-        },
-        "x-ms-examples": {
-          "Returns the status of the images operation": {
-            "$ref": "./examples/get_image_operation_status.json"
-          }
         }
       }
     }