From 7b11e31b4a4584c384cf92d3f5ada38c560eea9e Mon Sep 17 00:00:00 2001
From: Sun Yuhan <sunyuhan1998@users.noreply.github.com>
Date: Tue, 17 Jun 2025 10:06:03 +0800
Subject: [PATCH] fix: GH-3557, fix the issue where the audio filename was
 lost when calling the OpenAI `/transcriptions` and `/translations`
 endpoints.

Signed-off-by: Sun Yuhan <sunyuhan1998@users.noreply.github.com>
---
 .../openai/OpenAiAudioTranscriptionModel.java |  4 ++-
 .../ai/openai/api/OpenAiAudioApi.java         | 29 +++++++++++++++----
 .../ai/openai/audio/api/OpenAiAudioApiIT.java | 19 +++++++-----
 3 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java
index 4c7bb105648..2773e163c0e 100644
--- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java
+++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java
@@ -167,8 +167,10 @@ OpenAiAudioApi.TranscriptionRequest createRequest(AudioTranscriptionPrompt trans
 			}
 		}
 
+		Resource instructions = transcriptionPrompt.getInstructions();
 		return OpenAiAudioApi.TranscriptionRequest.builder()
-			.file(toBytes(transcriptionPrompt.getInstructions()))
+			.file(toBytes(instructions))
+			.fileName(instructions.getFilename())
 			.responseFormat(options.getResponseFormat())
 			.prompt(options.getPrompt())
 			.temperature(options.getTemperature())
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java
index 1177a98f1d3..44e08cc114d 100644
--- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java
+++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java
@@ -160,7 +160,7 @@ public <T> ResponseEntity<T> createTranscription(TranscriptionRequest requestBod
 
 			@Override
 			public String getFilename() {
-				return "audio.webm";
+				return requestBody.fileName();
 			}
 		});
 		multipartBody.add("model", requestBody.model());
@@ -206,7 +206,7 @@ public <T> ResponseEntity<T> createTranslation(TranslationRequest requestBody, C
 
 			@Override
 			public String getFilename() {
-				return "audio.webm";
+				return requestBody.fileName();
 			}
 		});
 		multipartBody.add("model", requestBody.model());
@@ -496,6 +496,7 @@ public SpeechRequest build() {
 	 * Transcription</a>
 	 *
 	 * @param file The audio file to transcribe. Must be a valid audio file type.
+	 * @param fileName The file name reported for the uploaded audio, e.g. {@code speech.mp3}.
 	 * @param model ID of the model to use. Only whisper-1 is currently available.
 	 * @param language The language of the input audio. Supplying the input language in
 	 * ISO-639-1 format will improve accuracy and latency.
@@ -517,6 +518,7 @@ public SpeechRequest build() {
 	public record TranscriptionRequest(
 	// @formatter:off
 		@JsonProperty("file") byte[] file,
+		@JsonProperty("fileName") String fileName,
 		@JsonProperty("model") String model,
 		@JsonProperty("language") String language,
 		@JsonProperty("prompt") String prompt,
@@ -554,6 +556,8 @@ public static class Builder {
 
 			private byte[] file;
 
+			private String fileName;
+
 			private String model = WhisperModel.WHISPER_1.getValue();
 
 			private String language;
@@ -571,6 +575,11 @@ public Builder file(byte[] file) {
 				return this;
 			}
 
+			public Builder fileName(String fileName) {
+				this.fileName = fileName;
+				return this;
+			}
+
 			public Builder model(String model) {
 				this.model = model;
 				return this;
@@ -603,11 +612,12 @@ public Builder granularityType(GranularityType granularityType) {
 
 			public TranscriptionRequest build() {
 				Assert.notNull(this.file, "file must not be null");
+				Assert.notNull(this.fileName, "fileName must not be null");
 				Assert.hasText(this.model, "model must not be empty");
 				Assert.notNull(this.responseFormat, "response_format must not be null");
 
-				return new TranscriptionRequest(this.file, this.model, this.language, this.prompt, this.responseFormat,
-						this.temperature, this.granularityType);
+				return new TranscriptionRequest(this.file, this.fileName, this.model, this.language, this.prompt,
+						this.responseFormat, this.temperature, this.granularityType);
 			}
 
 		}
@@ -619,6 +629,7 @@ public TranscriptionRequest build() {
 	 *
 	 * @param file The audio file object (not file name) to translate, in one of these
 	 * formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+	 * @param fileName The file name reported for the uploaded audio, e.g. {@code speech.mp3}.
 	 * @param model ID of the model to use. Only whisper-1 is currently available.
 	 * @param prompt An optional text to guide the model's style or continue a previous
 	 * audio segment. The prompt should be in English.
@@ -633,6 +644,7 @@ public TranscriptionRequest build() {
 	public record TranslationRequest(
 	// @formatter:off
 		@JsonProperty("file") byte[] file,
+		@JsonProperty("fileName") String fileName,
 		@JsonProperty("model") String model,
 		@JsonProperty("prompt") String prompt,
 		@JsonProperty("response_format") TranscriptResponseFormat responseFormat,
@@ -647,6 +659,8 @@ public static class Builder {
 
 			private byte[] file;
 
+			private String fileName;
+
 			private String model = WhisperModel.WHISPER_1.getValue();
 
 			private String prompt;
@@ -660,6 +674,11 @@ public Builder file(byte[] file) {
 				return this;
 			}
 
+			public Builder fileName(String fileName) {
+				this.fileName = fileName;
+				return this;
+			}
+
 			public Builder model(String model) {
 				this.model = model;
 				return this;
@@ -685,7 +704,7 @@ public TranslationRequest build() {
 				Assert.hasText(this.model, "model must not be empty");
 				Assert.notNull(this.responseFormat, "response_format must not be null");
 
-				return new TranslationRequest(this.file, this.model, this.prompt, this.responseFormat,
+				return new TranslationRequest(this.file, this.fileName, this.model, this.prompt, this.responseFormat,
 						this.temperature);
 			}
 
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java
index 6c933dec283..6533d15de56 100644
--- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java
+++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java
@@ -63,24 +63,29 @@ void speechTranscriptionAndTranslation() throws IOException {
 		FileCopyUtils.copy(speech, new File("target/speech.mp3"));
 
 		StructuredResponse translation = this.audioApi
-			.createTranslation(
-					TranslationRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(),
-					StructuredResponse.class)
+			.createTranslation(TranslationRequest.builder()
+				.model(WhisperModel.WHISPER_1.getValue())
+				.file(speech)
+				.fileName("speech.mp3")
+				.build(), StructuredResponse.class)
 			.getBody();
 
 		assertThat(translation.text().replaceAll(",", "")).isEqualTo("Hello my name is Chris and I love Spring AI.");
 
 		StructuredResponse transcriptionEnglish = this.audioApi
-			.createTranscription(
-					TranscriptionRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(),
-					StructuredResponse.class)
+			.createTranscription(TranscriptionRequest.builder()
+				.model(WhisperModel.WHISPER_1.getValue())
+				.file(speech)
+				.fileName("speech.mp3")
+				.build(), StructuredResponse.class)
 			.getBody();
 
 		assertThat(transcriptionEnglish.text().replaceAll(",", ""))
 			.isEqualTo("Hello my name is Chris and I love Spring AI.");
 
 		StructuredResponse transcriptionDutch = this.audioApi
-			.createTranscription(TranscriptionRequest.builder().file(speech).language("nl").build(),
+			.createTranscription(
+					TranscriptionRequest.builder().file(speech).fileName("speech.mp3").language("nl").build(),
 					StructuredResponse.class)
 			.getBody();