diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java index 4c7bb105648..2773e163c0e 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java @@ -167,8 +167,10 @@ OpenAiAudioApi.TranscriptionRequest createRequest(AudioTranscriptionPrompt trans } } + Resource instructions = transcriptionPrompt.getInstructions(); return OpenAiAudioApi.TranscriptionRequest.builder() - .file(toBytes(transcriptionPrompt.getInstructions())) + .file(toBytes(instructions)) + .fileName(instructions.getFilename()) .responseFormat(options.getResponseFormat()) .prompt(options.getPrompt()) .temperature(options.getTemperature()) diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java index 1177a98f1d3..44e08cc114d 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java @@ -160,7 +160,7 @@ public ResponseEntity createTranscription(TranscriptionRequest requestBod @Override public String getFilename() { - return "audio.webm"; + return requestBody.fileName(); } }); multipartBody.add("model", requestBody.model()); @@ -206,7 +206,7 @@ public ResponseEntity createTranslation(TranslationRequest requestBody, C @Override public String getFilename() { - return "audio.webm"; + return requestBody.fileName(); } }); multipartBody.add("model", requestBody.model()); @@ -496,6 +496,7 @@ public SpeechRequest build() { * Transcription * * @param file The audio file to transcribe. Must be a valid audio file type. + * @param fileName The audio file name. * @param model ID of the model to use. Only whisper-1 is currently available. * @param language The language of the input audio. Supplying the input language in * ISO-639-1 format will improve accuracy and latency. @@ -517,6 +518,7 @@ public SpeechRequest build() { public record TranscriptionRequest( // @formatter:off @JsonProperty("file") byte[] file, + @JsonProperty("fileName") String fileName, @JsonProperty("model") String model, @JsonProperty("language") String language, @JsonProperty("prompt") String prompt, @@ -554,6 +556,8 @@ public static class Builder { private byte[] file; + private String fileName; + private String model = WhisperModel.WHISPER_1.getValue(); private String language; @@ -571,6 +575,11 @@ public Builder file(byte[] file) { return this; } + public Builder fileName(String fileName) { + this.fileName = fileName; + return this; + } + public Builder model(String model) { this.model = model; return this; @@ -603,11 +612,12 @@ public Builder granularityType(GranularityType granularityType) { public TranscriptionRequest build() { Assert.notNull(this.file, "file must not be null"); + Assert.notNull(this.fileName, "fileName must not be null"); Assert.hasText(this.model, "model must not be empty"); Assert.notNull(this.responseFormat, "response_format must not be null"); - return new TranscriptionRequest(this.file, this.model, this.language, this.prompt, this.responseFormat, - this.temperature, this.granularityType); + return new TranscriptionRequest(this.file, this.fileName, this.model, this.language, this.prompt, + this.responseFormat, this.temperature, this.granularityType); } } @@ -619,6 +629,7 @@ public TranscriptionRequest build() { * * @param file The audio file object (not file name) to translate, in one of these * formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + * @param fileName The audio file name. * @param model ID of the model to use. Only whisper-1 is currently available. * @param prompt An optional text to guide the model's style or continue a previous * audio segment. The prompt should be in English. @@ -633,6 +644,7 @@ public TranscriptionRequest build() { public record TranslationRequest( // @formatter:off @JsonProperty("file") byte[] file, + @JsonProperty("fileName") String fileName, @JsonProperty("model") String model, @JsonProperty("prompt") String prompt, @JsonProperty("response_format") TranscriptResponseFormat responseFormat, @@ -647,6 +659,8 @@ public static class Builder { private byte[] file; + private String fileName; + private String model = WhisperModel.WHISPER_1.getValue(); private String prompt; @@ -660,6 +674,11 @@ public Builder file(byte[] file) { return this; } + public Builder fileName(String fileName) { + this.fileName = fileName; + return this; + } + public Builder model(String model) { this.model = model; return this; @@ -685,7 +704,7 @@ public TranslationRequest build() { Assert.hasText(this.model, "model must not be empty"); Assert.notNull(this.responseFormat, "response_format must not be null"); - return new TranslationRequest(this.file, this.model, this.prompt, this.responseFormat, + return new TranslationRequest(this.file, this.fileName, this.model, this.prompt, this.responseFormat, this.temperature); } diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java index 6c933dec283..6533d15de56 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java @@ -63,24 +63,29 @@ void speechTranscriptionAndTranslation() throws IOException { FileCopyUtils.copy(speech, new File("target/speech.mp3")); StructuredResponse translation = this.audioApi - .createTranslation( - TranslationRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(), - StructuredResponse.class) + .createTranslation(TranslationRequest.builder() + .model(WhisperModel.WHISPER_1.getValue()) + .file(speech) + .fileName("speech.mp3") + .build(), StructuredResponse.class) .getBody(); assertThat(translation.text().replaceAll(",", "")).isEqualTo("Hello my name is Chris and I love Spring AI."); StructuredResponse transcriptionEnglish = this.audioApi - .createTranscription( - TranscriptionRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(), - StructuredResponse.class) + .createTranscription(TranscriptionRequest.builder() + .model(WhisperModel.WHISPER_1.getValue()) + .file(speech) + .fileName("speech.mp3") + .build(), StructuredResponse.class) .getBody(); assertThat(transcriptionEnglish.text().replaceAll(",", "")) .isEqualTo("Hello my name is Chris and I love Spring AI."); StructuredResponse transcriptionDutch = this.audioApi - .createTranscription(TranscriptionRequest.builder().file(speech).language("nl").build(), + .createTranscription( + TranscriptionRequest.builder().file(speech).fileName("speech.mp3").language("nl").build(), StructuredResponse.class) .getBody();