Skip to content

Fix GH-3557: preserve the filename when calling the OpenAI audio transcription and translation APIs. #3558

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,10 @@ OpenAiAudioApi.TranscriptionRequest createRequest(AudioTranscriptionPrompt trans
}
}

Resource instructions = transcriptionPrompt.getInstructions();
return OpenAiAudioApi.TranscriptionRequest.builder()
.file(toBytes(transcriptionPrompt.getInstructions()))
.file(toBytes(instructions))
.fileName(instructions.getFilename())
.responseFormat(options.getResponseFormat())
.prompt(options.getPrompt())
.temperature(options.getTemperature())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ public <T> ResponseEntity<T> createTranscription(TranscriptionRequest requestBod

@Override
public String getFilename() {
return "audio.webm";
return requestBody.fileName();
}
});
multipartBody.add("model", requestBody.model());
Expand Down Expand Up @@ -206,7 +206,7 @@ public <T> ResponseEntity<T> createTranslation(TranslationRequest requestBody, C

@Override
public String getFilename() {
return "audio.webm";
return requestBody.fileName();
}
});
multipartBody.add("model", requestBody.model());
Expand Down Expand Up @@ -496,6 +496,7 @@ public SpeechRequest build() {
* Transcription</a>
*
* @param file The audio file to transcribe. Must be a valid audio file type.
* @param fileName The audio file name.
* @param model ID of the model to use. Only whisper-1 is currently available.
* @param language The language of the input audio. Supplying the input language in
* ISO-639-1 format will improve accuracy and latency.
Expand All @@ -517,6 +518,7 @@ public SpeechRequest build() {
public record TranscriptionRequest(
// @formatter:off
@JsonProperty("file") byte[] file,
@JsonProperty("fileName") String fileName,
@JsonProperty("model") String model,
@JsonProperty("language") String language,
@JsonProperty("prompt") String prompt,
Expand Down Expand Up @@ -554,6 +556,8 @@ public static class Builder {

private byte[] file;

private String fileName;

private String model = WhisperModel.WHISPER_1.getValue();

private String language;
Expand All @@ -571,6 +575,11 @@ public Builder file(byte[] file) {
return this;
}

/**
* Sets the name of the audio file carried by the transcription request.
* <p>
* {@code build()} rejects a {@code null} value ("fileName must not be null"), so
* this must be called before building.
* @param fileName the audio file name, stored as-is on this builder
* @return this builder, for call chaining
*/
public Builder fileName(String fileName) {
this.fileName = fileName;
return this;
}

public Builder model(String model) {
this.model = model;
return this;
Expand Down Expand Up @@ -603,11 +612,12 @@ public Builder granularityType(GranularityType granularityType) {

public TranscriptionRequest build() {
Assert.notNull(this.file, "file must not be null");
Assert.notNull(this.fileName, "fileName must not be null");
Assert.hasText(this.model, "model must not be empty");
Assert.notNull(this.responseFormat, "response_format must not be null");

return new TranscriptionRequest(this.file, this.model, this.language, this.prompt, this.responseFormat,
this.temperature, this.granularityType);
return new TranscriptionRequest(this.file, this.fileName, this.model, this.language, this.prompt,
this.responseFormat, this.temperature, this.granularityType);
}

}
Expand All @@ -619,6 +629,7 @@ public TranscriptionRequest build() {
*
* @param file The audio file object (not file name) to translate, in one of these
* formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
* @param fileName The audio file name.
* @param model ID of the model to use. Only whisper-1 is currently available.
* @param prompt An optional text to guide the model's style or continue a previous
* audio segment. The prompt should be in English.
Expand All @@ -633,6 +644,7 @@ public TranscriptionRequest build() {
public record TranslationRequest(
// @formatter:off
@JsonProperty("file") byte[] file,
@JsonProperty("fileName") String fileName,
@JsonProperty("model") String model,
@JsonProperty("prompt") String prompt,
@JsonProperty("response_format") TranscriptResponseFormat responseFormat,
Expand All @@ -647,6 +659,8 @@ public static class Builder {

private byte[] file;

private String fileName;

private String model = WhisperModel.WHISPER_1.getValue();

private String prompt;
Expand All @@ -660,6 +674,11 @@ public Builder file(byte[] file) {
return this;
}

/**
* Sets the name of the audio file carried by the translation request.
* <p>
* NOTE(review): the visible part of {@code build()} does not check this value for
* {@code null}, whereas {@code TranscriptionRequest.Builder#build()} does; a request
* built without a file name would then report a {@code null} filename from
* {@code getFilename()} in {@code createTranslation} - confirm whether a matching
* assertion is intended.
* @param fileName the audio file name, stored as-is on this builder
* @return this builder, for call chaining
*/
public Builder fileName(String fileName) {
this.fileName = fileName;
return this;
}

public Builder model(String model) {
this.model = model;
return this;
Expand All @@ -685,7 +704,7 @@ public TranslationRequest build() {
Assert.hasText(this.model, "model must not be empty");
Assert.notNull(this.responseFormat, "response_format must not be null");

return new TranslationRequest(this.file, this.model, this.prompt, this.responseFormat,
return new TranslationRequest(this.file, this.fileName, this.model, this.prompt, this.responseFormat,
this.temperature);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,24 +63,29 @@ void speechTranscriptionAndTranslation() throws IOException {
FileCopyUtils.copy(speech, new File("target/speech.mp3"));

StructuredResponse translation = this.audioApi
.createTranslation(
TranslationRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(),
StructuredResponse.class)
.createTranslation(TranslationRequest.builder()
.model(WhisperModel.WHISPER_1.getValue())
.file(speech)
.fileName("speech.mp3")
.build(), StructuredResponse.class)
.getBody();

assertThat(translation.text().replaceAll(",", "")).isEqualTo("Hello my name is Chris and I love Spring AI.");

StructuredResponse transcriptionEnglish = this.audioApi
.createTranscription(
TranscriptionRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(),
StructuredResponse.class)
.createTranscription(TranscriptionRequest.builder()
.model(WhisperModel.WHISPER_1.getValue())
.file(speech)
.fileName("speech.mp3")
.build(), StructuredResponse.class)
.getBody();

assertThat(transcriptionEnglish.text().replaceAll(",", ""))
.isEqualTo("Hello my name is Chris and I love Spring AI.");

StructuredResponse transcriptionDutch = this.audioApi
.createTranscription(TranscriptionRequest.builder().file(speech).language("nl").build(),
.createTranscription(
TranscriptionRequest.builder().file(speech).fileName("speech.mp3").language("nl").build(),
StructuredResponse.class)
.getBody();

Expand Down