Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion core/src/main/java/com/google/adk/agents/RunConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,12 @@ public enum StreamingMode {

public abstract @Nullable AudioTranscriptionConfig outputAudioTranscription();

public abstract @Nullable AudioTranscriptionConfig inputAudioTranscription();

public abstract int maxLlmCalls();

public abstract Builder toBuilder();

public static Builder builder() {
return new AutoValue_RunConfig.Builder()
.setSaveInputBlobsAsArtifacts(false)
Expand All @@ -65,7 +69,8 @@ public static Builder builder(RunConfig runConfig) {
.setMaxLlmCalls(runConfig.maxLlmCalls())
.setResponseModalities(runConfig.responseModalities())
.setSpeechConfig(runConfig.speechConfig())
.setOutputAudioTranscription(runConfig.outputAudioTranscription());
.setOutputAudioTranscription(runConfig.outputAudioTranscription())
.setInputAudioTranscription(runConfig.inputAudioTranscription());
}

/** Builder for {@link RunConfig}. */
Expand All @@ -88,6 +93,10 @@ public abstract static class Builder {
public abstract Builder setOutputAudioTranscription(
AudioTranscriptionConfig outputAudioTranscription);

@CanIgnoreReturnValue
public abstract Builder setInputAudioTranscription(
AudioTranscriptionConfig inputAudioTranscription);

@CanIgnoreReturnValue
public abstract Builder setMaxLlmCalls(int maxLlmCalls);

Expand Down
2 changes: 2 additions & 0 deletions core/src/main/java/com/google/adk/flows/llmflows/Basic.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ public Single<RequestProcessor.RequestProcessingResult> processRequest(
.ifPresent(liveConnectConfigBuilder::speechConfig);
Optional.ofNullable(context.runConfig().outputAudioTranscription())
.ifPresent(liveConnectConfigBuilder::outputAudioTranscription);
Optional.ofNullable(context.runConfig().inputAudioTranscription())
.ifPresent(liveConnectConfigBuilder::inputAudioTranscription);

LlmRequest.Builder builder =
request.toBuilder()
Expand Down
9 changes: 7 additions & 2 deletions core/src/main/java/com/google/adk/runner/Runner.java
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,9 @@ private Single<Session> emitStateDeltaEvent(
private InvocationContext newInvocationContextForLive(
Session session, Optional<LiveRequestQueue> liveRequestQueue, RunConfig runConfig) {
RunConfig.Builder runConfigBuilder = RunConfig.builder(runConfig);
if (!CollectionUtils.isNullOrEmpty(runConfig.responseModalities())
&& liveRequestQueue.isPresent()) {
if (liveRequestQueue.isPresent() && !this.agent.subAgents().isEmpty()) {
// Parity with Python: apply modality defaults and transcription settings
// only for multi-agent live scenarios.
Comment on lines -313 to +371
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jinnigu The inline comment and the code don't seem to align here. The comment says "apply modality defaults", but the change removes the !CollectionUtils.isNullOrEmpty(runConfig.responseModalities()) check... is that intentional? (It may well be; I'm not entirely sure why it was originally written this way, but it seems worth double-checking.) Also, why would we limit transcription to multi-agent live scenarios only? I would personally love to use this even for a trivial, single-LlmAgent use case... you speak to it and get a persistent transcript in your session store, which is very cool! I'd love to use this, e.g., in my (personal) https://docs.enola.dev project - but I don't see why it needs to be limited to work only if !this.agent.subAgents().isEmpty().

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made the change this way because I want adk-java to be equivalent to adk-python (https://github.com/google/adk-python/blob/main/src/google/adk/runners.py#L939-L971). I also agree that we should not limit transcription to multi-agent live scenarios only. I will raise an issue in adk-python to gather feedback and then make PRs to both adk-python and adk-java.

// Default to AUDIO modality if not specified.
if (CollectionUtils.isNullOrEmpty(runConfig.responseModalities())) {
runConfigBuilder.setResponseModalities(
Expand All @@ -380,6 +381,10 @@ private InvocationContext newInvocationContextForLive(
runConfigBuilder.setOutputAudioTranscription(AudioTranscriptionConfig.builder().build());
}
}
// Need input transcription for agent transferring in live mode.
if (runConfig.inputAudioTranscription() == null) {
runConfigBuilder.setInputAudioTranscription(AudioTranscriptionConfig.builder().build());
}
}
return newInvocationContext(
session, /* newMessage= */ Optional.empty(), liveRequestQueue, runConfigBuilder.build());
Expand Down
22 changes: 22 additions & 0 deletions core/src/test/java/com/google/adk/agents/RunConfigTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ public void testBuilderWithVariousValues() {
.setSaveInputBlobsAsArtifacts(true)
.setStreamingMode(RunConfig.StreamingMode.SSE)
.setOutputAudioTranscription(audioTranscriptionConfig)
.setInputAudioTranscription(audioTranscriptionConfig)
.setMaxLlmCalls(10)
.build();

Expand All @@ -33,6 +34,7 @@ public void testBuilderWithVariousValues() {
assertThat(runConfig.saveInputBlobsAsArtifacts()).isTrue();
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.SSE);
assertThat(runConfig.outputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
assertThat(runConfig.inputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
assertThat(runConfig.maxLlmCalls()).isEqualTo(10);
}

Expand All @@ -45,6 +47,7 @@ public void testBuilderDefaults() {
assertThat(runConfig.saveInputBlobsAsArtifacts()).isFalse();
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.NONE);
assertThat(runConfig.outputAudioTranscription()).isNull();
assertThat(runConfig.inputAudioTranscription()).isNull();
assertThat(runConfig.maxLlmCalls()).isEqualTo(500);
}

Expand All @@ -66,6 +69,7 @@ public void testBuilderWithDifferentValues() {
.setSaveInputBlobsAsArtifacts(true)
.setStreamingMode(RunConfig.StreamingMode.BIDI)
.setOutputAudioTranscription(audioTranscriptionConfig)
.setInputAudioTranscription(audioTranscriptionConfig)
.setMaxLlmCalls(20)
.build();

Expand All @@ -74,6 +78,24 @@ public void testBuilderWithDifferentValues() {
assertThat(runConfig.saveInputBlobsAsArtifacts()).isTrue();
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.BIDI);
assertThat(runConfig.outputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
assertThat(runConfig.inputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
assertThat(runConfig.maxLlmCalls()).isEqualTo(20);
}

/**
 * Builds a {@link RunConfig} with only the input audio transcription set (plus BIDI streaming and
 * an AUDIO response modality) and checks that the output transcription stays unset.
 */
@Test
public void testInputAudioTranscriptionOnly() {
  AudioTranscriptionConfig inputOnly = AudioTranscriptionConfig.builder().build();
  RunConfig.Builder configBuilder =
      RunConfig.builder()
          .setStreamingMode(RunConfig.StreamingMode.BIDI)
          .setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
          .setInputAudioTranscription(inputOnly);

  RunConfig built = configBuilder.build();

  // Only the input side was configured, so the output side must remain null.
  assertThat(built.inputAudioTranscription()).isEqualTo(inputOnly);
  assertThat(built.outputAudioTranscription()).isNull();
  // The remaining explicitly-set values are carried through unchanged.
  assertThat(built.streamingMode()).isEqualTo(RunConfig.StreamingMode.BIDI);
  assertThat(built.responseModalities()).containsExactly(new Modality(Modality.Known.AUDIO));
}
}
22 changes: 22 additions & 0 deletions core/src/test/java/com/google/adk/flows/llmflows/BasicTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -220,13 +220,33 @@ public void processRequest_buildsLiveConnectConfigFromRunConfig_outputAudioTrans
assertThat(result.events()).isEmpty();
}

/**
 * A run config that sets only the input audio transcription must surface that value on the
 * request's live connect config, with no response modalities, no speech config and no events.
 */
@Test
public void processRequest_buildsLiveConnectConfigFromRunConfig_inputAudioTranscription() {
  LlmAgent agent = LlmAgent.builder().name("agentWithConfig").model(testLlm).build();
  RunConfig inputTranscriptionOnly =
      RunConfig.builder().setInputAudioTranscription(TEST_AUDIO_TRANSCRIPTION_CONFIG).build();
  InvocationContext invocationContext = createInvocationContext(agent, inputTranscriptionOnly);

  RequestProcessingResult outcome =
      basicProcessor.processRequest(invocationContext, initialRequest).blockingGet();
  LlmRequest processed = outcome.updatedRequest();

  assertThat(processed.liveConnectConfig()).isNotNull();
  // Nothing else was configured, so modalities and speech config stay empty.
  assertThat(processed.liveConnectConfig().responseModalities().get()).isEmpty();
  assertThat(processed.liveConnectConfig().speechConfig()).isEmpty();
  assertThat(processed.liveConnectConfig().inputAudioTranscription())
      .hasValue(TEST_AUDIO_TRANSCRIPTION_CONFIG);
  assertThat(outcome.events()).isEmpty();
}

@Test
public void processRequest_buildsLiveConnectConfigFromRunConfig_allFields() {
RunConfig runConfig =
RunConfig.builder()
.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
.setSpeechConfig(TEST_SPEECH_CONFIG)
.setOutputAudioTranscription(TEST_AUDIO_TRANSCRIPTION_CONFIG)
.setInputAudioTranscription(TEST_AUDIO_TRANSCRIPTION_CONFIG)
.build();
LlmAgent agentWithConfig = LlmAgent.builder().name("agentWithConfig").model(testLlm).build();
InvocationContext contextWithRunConfig = createInvocationContext(agentWithConfig, runConfig);
Expand All @@ -241,6 +261,8 @@ public void processRequest_buildsLiveConnectConfigFromRunConfig_allFields() {
assertThat(updatedRequest.liveConnectConfig().speechConfig()).hasValue(TEST_SPEECH_CONFIG);
assertThat(updatedRequest.liveConnectConfig().outputAudioTranscription())
.hasValue(TEST_AUDIO_TRANSCRIPTION_CONFIG);
assertThat(updatedRequest.liveConnectConfig().inputAudioTranscription())
.hasValue(TEST_AUDIO_TRANSCRIPTION_CONFIG);
assertThat(result.events()).isEmpty();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright 2025 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.adk.runner;

import static com.google.adk.testing.TestUtils.createLlmResponse;
import static com.google.adk.testing.TestUtils.createTestAgentBuilder;
import static com.google.adk.testing.TestUtils.createTestLlm;
import static com.google.common.truth.Truth.assertThat;

import com.google.adk.agents.InvocationContext;
import com.google.adk.agents.LiveRequestQueue;
import com.google.adk.agents.LlmAgent;
import com.google.adk.agents.RunConfig;
import com.google.adk.sessions.Session;
import com.google.adk.testing.TestLlm;
import com.google.common.collect.ImmutableList;
import com.google.genai.types.AudioTranscriptionConfig;
import com.google.genai.types.Content;
import com.google.genai.types.Modality;
import com.google.genai.types.Part;
import java.lang.reflect.Method;
import java.util.Optional;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/**
 * Tests how {@code Runner.newInvocationContextForLive} handles the input audio transcription
 * setting of a {@link RunConfig} when the root agent has a sub-agent.
 *
 * <p>The method under test is looked up with {@code getDeclaredMethod} and made accessible, so it
 * is always invoked reflectively from here.
 */
@RunWith(JUnit4.class)
public final class InputAudioTranscriptionTest {

  /** Builds an in-memory runner whose root agent owns one sub-agent; both share one test LLM. */
  private static Runner newMultiAgentRunner() {
    Content reply = Content.builder().parts(Part.builder().text("response").build()).build();
    TestLlm llm = createTestLlm(createLlmResponse(reply));
    LlmAgent child = createTestAgentBuilder(llm).name("sub_agent").build();
    LlmAgent root =
        createTestAgentBuilder(llm).name("root_agent").subAgents(ImmutableList.of(child)).build();
    return new InMemoryRunner(root, "test", ImmutableList.of());
  }

  /** Starts a run config with an AUDIO response modality and BIDI streaming. */
  private static RunConfig.Builder audioBidiConfig() {
    return RunConfig.builder()
        .setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
        .setStreamingMode(RunConfig.StreamingMode.BIDI);
  }

  /**
   * Creates a session on {@code runner} and reflectively calls {@code
   * newInvocationContextForLive} with that session, a fresh live request queue and {@code
   * runConfig}.
   *
   * @throws Exception if the reflective lookup or the invocation fails
   */
  private static InvocationContext liveContextFor(Runner runner, RunConfig runConfig)
      throws Exception {
    Session session = runner.sessionService().createSession("test", "user").blockingGet();
    Method target =
        Runner.class.getDeclaredMethod(
            "newInvocationContextForLive", Session.class, Optional.class, RunConfig.class);
    target.setAccessible(true);
    Object context = target.invoke(runner, session, Optional.of(new LiveRequestQueue()), runConfig);
    return (InvocationContext) context;
  }

  @Test
  public void newInvocationContextForLive_multiAgent_autoConfiguresInputAudioTranscription()
      throws Exception {
    RunConfig withoutInputTranscription = audioBidiConfig().build();
    // Precondition: the caller did not configure input transcription.
    assertThat(withoutInputTranscription.inputAudioTranscription()).isNull();

    InvocationContext liveContext =
        liveContextFor(newMultiAgentRunner(), withoutInputTranscription);

    assertThat(liveContext.runConfig().inputAudioTranscription()).isNotNull();
  }

  @Test
  public void newInvocationContextForLive_explicitConfig_preservesUserInputAudioTranscription()
      throws Exception {
    AudioTranscriptionConfig userSupplied = AudioTranscriptionConfig.builder().build();
    RunConfig withUserSetting = audioBidiConfig().setInputAudioTranscription(userSupplied).build();

    InvocationContext liveContext = liveContextFor(newMultiAgentRunner(), withUserSetting);

    // Identity, not equality: the caller's own instance must be carried through untouched.
    assertThat(liveContext.runConfig().inputAudioTranscription()).isSameInstanceAs(userSupplied);
  }
}