From 72af644923a86ce6e7b3735d6a53dac1cf132455 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Mon, 23 Jun 2025 12:17:08 +0200 Subject: [PATCH] use strongly typed attributes --- agents/src/attributes.ts | 87 +++++++++++++++++++++++ agents/src/constants.ts | 24 +++++-- agents/src/multimodal/multimodal_agent.ts | 18 +++-- agents/src/voice/agent_session.ts | 2 +- agents/src/voice/events.ts | 1 - agents/src/voice/room_io/_output.ts | 17 +++-- 6 files changed, 128 insertions(+), 21 deletions(-) create mode 100644 agents/src/attributes.ts diff --git a/agents/src/attributes.ts b/agents/src/attributes.ts new file mode 100644 index 00000000..b106e293 --- /dev/null +++ b/agents/src/attributes.ts @@ -0,0 +1,87 @@ +// This file was generated from JSON Schema using quicktype, do not modify it directly. +// The code generation lives at https://github.com/livekit/attribute-definitions +// +// To parse this data: +// +// import { Convert, AgentAttributes, TranscriptionAttributes } from "./file"; +// +// const agentAttributes = Convert.toAgentAttributes(json); +// const transcriptionAttributes = Convert.toTranscriptionAttributes(json); + +export interface AgentAttributes { + 'lk.agent.inputs'?: AgentInput[]; + 'lk.agent.outputs'?: AgentOutput[]; + 'lk.agent.state'?: AgentState; + 'lk.publish_on_behalf'?: string; + [property: string]: any; +} + +export type AgentInput = 'audio' | 'video' | 'text'; + +export type AgentOutput = 'transcription' | 'audio'; + +export type AgentState = 'idle' | 'initializing' | 'listening' | 'thinking' | 'speaking'; + +/** + * Schema for transcription-related attributes + */ +export interface TranscriptionAttributes { + /** + * The segment id of the transcription + */ + 'lk.segment_id'?: string; + /** + * The associated track id of the transcription + */ + 'lk.transcribed_track_id'?: string; + /** + * Whether the transcription is final + */ + 'lk.transcription_final'?: boolean; + [property: string]: any; +} + +// Converts JSON strings to/from your types +export class Convert { + public static toAgentAttributes(attributes: Record): AgentAttributes { + const agentAttributes: AgentAttributes = {}; + for (const key in attributes) { + const value = attributes[key]; + if (value !== undefined) { + agentAttributes[key] = JSON.parse(value); + } + } + return agentAttributes; + } + + public static agentAttributesToRaw(attributes: AgentAttributes): Record { + const rawAttributes: Record = {}; + for (const key in attributes) { + rawAttributes[key] = JSON.stringify(attributes[key]); + } + return rawAttributes; + } + + public static toTranscriptionAttributes( + attributes: Record, + ): TranscriptionAttributes { + const transcriptionAttributes: TranscriptionAttributes = {}; + for (const key in attributes) { + const value = attributes[key]; + if (value !== undefined) { + transcriptionAttributes[key] = JSON.parse(value); + } + } + return transcriptionAttributes; + } + + public static transcriptionAttributesToRaw( + attributes: TranscriptionAttributes, + ): Record { + const rawAttributes: Record = {}; + for (const key in attributes) { + rawAttributes[key] = JSON.stringify(attributes[key]); + } + return rawAttributes; + } +} diff --git a/agents/src/constants.ts b/agents/src/constants.ts index 86ead5b4..0ec43fe9 100644 --- a/agents/src/constants.ts +++ b/agents/src/constants.ts @@ -1,9 +1,21 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -export const ATTRIBUTE_TRANSCRIPTION_TRACK_ID = 'lk.transcribed_track_id'; -export const ATTRIBUTE_TRANSCRIPTION_FINAL = 'lk.transcription_final'; -export const TOPIC_TRANSCRIPTION = 'lk.transcription'; -export const ATTRIBUTE_TRANSCRIPTION_SEGMENT_ID = 'lk.segment_id'; -export const ATTRIBUTE_PUBLISH_ON_BEHALF = 'lk.publish_on_behalf'; -export const TOPIC_CHAT = 'lk.chat'; +import type { AgentAttributes, TranscriptionAttributes } from './attributes.js'; + +// Agent attributes +export const AGENT_STATE_ATTRIBUTE = 'lk.agent.state' as const satisfies keyof AgentAttributes; +export const ATTRIBUTE_PUBLISH_ON_BEHALF = + 'lk.publish_on_behalf' as const satisfies keyof AgentAttributes; + +// Transcription attributes +export const ATTRIBUTE_TRANSCRIPTION_TRACK_ID = + 'lk.transcribed_track_id' as const satisfies keyof TranscriptionAttributes; +export const ATTRIBUTE_TRANSCRIPTION_FINAL = + 'lk.transcription_final' as const satisfies keyof TranscriptionAttributes; +export const ATTRIBUTE_TRANSCRIPTION_SEGMENT_ID = + 'lk.segment_id' as const satisfies keyof TranscriptionAttributes; + +// Topics +export const TOPIC_TRANSCRIPTION = 'lk.transcription' as const; +export const TOPIC_CHAT = 'lk.chat' as const; diff --git a/agents/src/multimodal/multimodal_agent.ts b/agents/src/multimodal/multimodal_agent.ts index edf92905..71387003 100644 --- a/agents/src/multimodal/multimodal_agent.ts +++ b/agents/src/multimodal/multimodal_agent.ts @@ -19,8 +19,14 @@ import { TrackSource, } from '@livekit/rtc-node'; import { EventEmitter } from 'node:events'; +import { + type AgentState, + Convert as ConvertAttributes, + type TranscriptionAttributes, +} from '../attributes.js'; import { AudioByteStream } from '../audio.js'; import { + AGENT_STATE_ATTRIBUTE, ATTRIBUTE_TRANSCRIPTION_FINAL, ATTRIBUTE_TRANSCRIPTION_TRACK_ID, TOPIC_TRANSCRIPTION, @@ -56,9 +62,6 @@ export abstract class RealtimeModel { abstract outFrameSize: number; } -export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking'; -export const AGENT_STATE_ATTRIBUTE = 'lk.agent.state'; - /** @beta */ export class MultimodalAgent extends EventEmitter { model: RealtimeModel; @@ -508,10 +511,10 @@ export class MultimodalAgent extends EventEmitter { const stream = await this.room.localParticipant.streamText({ topic: TOPIC_TRANSCRIPTION, senderIdentity: participantIdentity, - attributes: { + attributes: ConvertAttributes.transcriptionAttributesToRaw({ [ATTRIBUTE_TRANSCRIPTION_TRACK_ID]: trackSid, - [ATTRIBUTE_TRANSCRIPTION_FINAL]: isFinal.toString(), - }, + [ATTRIBUTE_TRANSCRIPTION_FINAL]: isFinal, + } satisfies TranscriptionAttributes), }); await stream.write(text); await stream.close(); @@ -532,7 +535,8 @@ export class MultimodalAgent extends EventEmitter { #setState(state: AgentState) { if (this.room?.isConnected && this.room.localParticipant) { - const currentState = this.room.localParticipant.attributes![AGENT_STATE_ATTRIBUTE]; + const attributes = ConvertAttributes.toAgentAttributes(this.room.localParticipant.attributes); + const currentState = attributes[AGENT_STATE_ATTRIBUTE]; if (currentState !== state) { this.room.localParticipant.setAttributes({ [AGENT_STATE_ATTRIBUTE]: state, diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index b3514bed..2e73cae9 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -5,6 +5,7 @@ import type { AudioFrame, Room } from '@livekit/rtc-node'; import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter'; import { EventEmitter } from 'node:events'; import type { ReadableStream } from 'node:stream/web'; +import type { AgentState } from '../attributes.js'; import type { ChatMessage } from '../llm/chat_context.js'; import { ChatContext } from '../llm/chat_context.js'; import type { LLM } from '../llm/index.js'; @@ -20,7 +21,6 @@ import type { AudioOutput, TextOutput } from './io.js'; import { RoomIO } from './room_io/index.js'; import type { UnknownUserData } from './run_context.js'; -export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking'; export interface VoiceOptions { allowInterruptions: boolean; discardAudioIfUninterruptible: boolean; diff --git a/agents/src/voice/events.ts b/agents/src/voice/events.ts index 4710b5ef..2e3d59c7 100644 --- a/agents/src/voice/events.ts +++ b/agents/src/voice/events.ts @@ -1,5 +1,4 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking'; export type UserState = 'idle' | 'thinking' | 'listening' | 'speaking'; diff --git a/agents/src/voice/room_io/_output.ts b/agents/src/voice/room_io/_output.ts index 71eb3c49..91c9550f 100644 --- a/agents/src/voice/room_io/_output.ts +++ b/agents/src/voice/room_io/_output.ts @@ -17,8 +17,13 @@ import { TrackPublishOptions, TrackSource, } from '@livekit/rtc-node'; +import { + Convert as ConvertAttributes, + type TranscriptionAttributes, +} from 'agents/src/attributes.js'; import { randomUUID } from 'node:crypto'; import { + ATTRIBUTE_TRANSCRIPTION_FINAL, ATTRIBUTE_TRANSCRIPTION_SEGMENT_ID, ATTRIBUTE_TRANSCRIPTION_TRACK_ID, TOPIC_TRANSCRIPTION, @@ -165,7 +170,7 @@ export class ParticipantTranscriptionOutput extends BaseParticipantTranscription this.flushTask = Task.from((controller) => this.flushTaskImpl(currWriter, controller.signal)); } - private async createTextWriter(attributes?: Record): Promise { + private async createTextWriter(attributes?: TranscriptionAttributes): Promise { if (!this.participantIdentity) { throw new Error('participantIdentity not found'); } @@ -176,24 +181,24 @@ export class ParticipantTranscriptionOutput extends BaseParticipantTranscription if (!attributes) { attributes = { - ATTRIBUTE_TRANSCRIPTION_FINAL: 'false', + [ATTRIBUTE_TRANSCRIPTION_FINAL]: false, + [ATTRIBUTE_TRANSCRIPTION_SEGMENT_ID]: this.currentId, }; if (this.trackId) { attributes[ATTRIBUTE_TRANSCRIPTION_TRACK_ID] = this.trackId; } - attributes[ATTRIBUTE_TRANSCRIPTION_SEGMENT_ID] = this.currentId; } return await this.room.localParticipant.streamText({ topic: TOPIC_TRANSCRIPTION, senderIdentity: this.participantIdentity, - attributes, + attributes: ConvertAttributes.transcriptionAttributesToRaw(attributes), }); } private async flushTaskImpl(writer: TextStreamWriter | null, signal: AbortSignal): Promise { - const attributes: Record = { - ATTRIBUTE_TRANSCRIPTION_FINAL: 'true', + const attributes: TranscriptionAttributes = { + [ATTRIBUTE_TRANSCRIPTION_FINAL]: true, }; if (this.trackId) { attributes[ATTRIBUTE_TRANSCRIPTION_TRACK_ID] = this.trackId;