diff --git a/.env.example b/.env.example index a12b24e7b..a29fbd78b 100644 --- a/.env.example +++ b/.env.example @@ -8,3 +8,7 @@ # Only create `agent-starter-react/.env.local` for standalone frontend # development where this repository is launched directly with `pnpm dev`. # In that case, define only the variables needed for that standalone run. +# +# `OBSERVABILITY_ENABLED=1` uses the same unified switch as the backend. When +# enabled by the LexVoice runtime, browser-side probes publish LiveKit data +# packets for the local observability report. diff --git a/.gitignore b/.gitignore index 122bf192a..dcc586172 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,7 @@ yarn-error.log* # typescript *.tsbuildinfo next-env.d.ts + +# generated local VAD runtime assets +/public/onnxruntime-web/ +/public/vad-web/ diff --git a/.prettierignore b/.prettierignore index b16c148c5..791b1d2fd 100644 --- a/.prettierignore +++ b/.prettierignore @@ -4,6 +4,7 @@ node_modules/ pnpm-lock.yaml .next/ .env* - +public/onnxruntime-web/ +public/vad-web/ diff --git a/app-config.ts b/app-config.ts index 8933860ed..60386bbc9 100644 --- a/app-config.ts +++ b/app-config.ts @@ -1,3 +1,10 @@ +import { + type RoleInputDeviceOptions, + normalizeInputSource, + resolveRoleInputDevices, + usesServerRoomInputDevice, +} from './lib/input-device-config'; + export interface VideoTrackConfig { id: string; label: string; @@ -49,6 +56,7 @@ export interface AppConfig { showAudioFilterDebug?: boolean; debugAudio?: boolean; debugVideo?: boolean; + observabilityEnabled?: boolean; // 全局调试配置 enableGlobalDebug?: boolean; // 全局调试开关,控制所有调试信息的显示 @@ -74,16 +82,9 @@ const ROOM_INPUT_VIDEO_TRACK_NAME = 'room_video'; const BROWSER_VIDEO_TRACK_NAME = 'browser_video_track'; -const DEFAULT_ROLE_INPUT_DEVICE = 'xunfei'; -const VALID_INPUT_DEVICES = new Set(['xunfei', 'generic', 'primebot', 'browser']); -const SERVER_ROOM_INPUT_DEVICES = new Set(['xunfei', 'generic']); +export { normalizeInputSource }; -export interface InputDeviceConfigOptions { - inputSource?: string | null; - audioInputDevice?: string | null; - visionInputDevice?: string | null; - outputDevice?: string | null; -} +export type InputDeviceConfigOptions = RoleInputDeviceOptions; export interface InputDeviceConfig { inputSource: string; @@ -98,43 +99,23 @@ export interface InputDeviceConfig { showDefaultCameraPreview: boolean; } -export function normalizeInputSource(inputSource?: string | null) { - const normalized = (inputSource || '').trim().toLowerCase(); - return normalized || 'browser'; -} - -function normalizeRoleInputDevice(inputDevice: string | null | undefined, fallback: string) { - const normalized = (inputDevice || '').trim().toLowerCase(); - if (VALID_INPUT_DEVICES.has(normalized)) { - return normalized; - } - return fallback; -} - -function usesServerRoomInputDevice(inputDevice: string) { - return SERVER_ROOM_INPUT_DEVICES.has(inputDevice); -} - export function resolveInputDeviceConfig({ inputSource, audioInputDevice, visionInputDevice, outputDevice, }: InputDeviceConfigOptions = {}): InputDeviceConfig { - const normalizedInputSource = normalizeInputSource(inputSource); - const isMixedInputSource = normalizedInputSource === 'mixed'; - const baseInputDevice = isMixedInputSource - ? DEFAULT_ROLE_INPUT_DEVICE - : normalizeRoleInputDevice(normalizedInputSource, DEFAULT_ROLE_INPUT_DEVICE); - const resolvedAudioInputDevice = isMixedInputSource - ? normalizeRoleInputDevice(audioInputDevice, baseInputDevice) - : baseInputDevice; - const resolvedVisionInputDevice = isMixedInputSource - ? normalizeRoleInputDevice(visionInputDevice, baseInputDevice) - : baseInputDevice; - const resolvedOutputDevice = isMixedInputSource - ? normalizeRoleInputDevice(outputDevice, baseInputDevice) - : baseInputDevice; + const { + inputSource: normalizedInputSource, + audioInputDevice: resolvedAudioInputDevice, + visionInputDevice: resolvedVisionInputDevice, + outputDevice: resolvedOutputDevice, + } = resolveRoleInputDevices({ + inputSource, + audioInputDevice, + visionInputDevice, + outputDevice, + }); const usesBrowserRawAudioInput = resolvedAudioInputDevice === 'browser'; const usesBrowserRawVideoInput = resolvedVisionInputDevice === 'browser'; const usesBrowserRawMediaInput = usesBrowserRawAudioInput || usesBrowserRawVideoInput; @@ -259,6 +240,7 @@ export const APP_CONFIG_DEFAULTS: AppConfig = { showAudioFilterDebug: process.env.NEXT_PUBLIC_SHOW_AUDIO_DEBUG === 'true' || false, // 是否显示音频过滤调试组件 debugAudio: false, debugVideo: false, + observabilityEnabled: false, // 全局调试配置 enableGlobalDebug: process.env.NEXT_PUBLIC_ENABLE_GLOBAL_DEBUG === 'true' || false, // 全局调试开关 diff --git a/app/api/session/stop/route.ts b/app/api/session/stop/route.ts index d7bbb43a2..add849012 100644 --- a/app/api/session/stop/route.ts +++ b/app/api/session/stop/route.ts @@ -8,7 +8,10 @@ import { deriveSessionIdFromLiveKitRoomName, isValidConnectionRoomId, } from '@/lib/connection-room-id'; -import { resolveLiveKitHttpUrl } from '@/lib/session-stop'; +import { + resolveRoomInputStopUrls as resolveConfiguredRoomInputStopUrls, + resolveLiveKitHttpUrl, +} from '@/lib/session-stop'; import { markRoomSessionStopped, markRoomSessionStopping, @@ -22,11 +25,13 @@ const AGENT_WORKER_READINESS_TIMEOUT_MS = readPositiveIntEnv( 10_000 ); const AGENT_WORKER_LOG_TAIL_BYTES = readPositiveIntEnv('AGENT_WORKER_LOG_TAIL_BYTES', 256 * 1024); +const ROOM_INPUT_STOP_TIMEOUT_MS = readPositiveIntEnv('ROOM_INPUT_STOP_TIMEOUT_MS', 3_000); type StopResult = { target: string; ok: boolean; skipped?: boolean; + fatal?: boolean; status?: number; error?: string; dispatch_ids?: string[]; @@ -134,6 +139,34 @@ function shouldWaitForLocalAgentWorkerReadiness(): boolean { return inputSource !== 'browser' && !(inputSource === 'mixed' && usesBrowserOnlyMixedInput()); } +function shouldStopRoomInput(): boolean { + return shouldWaitForLocalAgentWorkerReadiness(); +} + +function resolveRoomInputStopUrls(): string[] { + if (!shouldStopRoomInput()) { + return []; + } + + return resolveConfiguredRoomInputStopUrls({ + inputSource: readStopInputSource(), + audioInputDevice: readStopRoleDevice( + 'ROOM_AUDIO_INPUT_DEVICE', + 'NEXT_PUBLIC_ROOM_AUDIO_INPUT_DEVICE' + ), + visionInputDevice: readStopRoleDevice( + 'ROOM_VISION_INPUT_DEVICE', + 'NEXT_PUBLIC_ROOM_VISION_INPUT_DEVICE' + ), + roomAudioInputUrl: readStopEnv('ROOM_AUDIO_INPUT_URL'), + roomVisionInputUrl: readStopEnv('ROOM_VISION_INPUT_URL'), + roomInputUrl: readStopEnv('ROOM_INPUT_URL'), + frontdeskInputParticipantUrl: readStopEnv('FRONTDESK_INPUT_PARTICIPANT_URL'), + faceServiceUrl: readStopEnv('FACE_SERVICE_URL'), + genericCameraParticipantUrl: readStopEnv('GENERIC_CAMERA_PARTICIPANT_URL'), + }); +} + function resolveLocalLiveKitServerLogPath(): string { const runLogDir = process.env.LEXVOICE_RUN_LOG_DIR?.trim(); return runLogDir ? path.join(runLogDir, 'server.log') : ''; @@ -272,6 +305,53 @@ async function waitForPendingDispatches(roomName: string, sessionId: string): Pr return { target: 'agent_dispatch_barrier', ok: true }; } +async function postRoomInputStop( + stopUrl: string, + roomName: string, + sessionId: string +): Promise { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), ROOM_INPUT_STOP_TIMEOUT_MS); + try { + const response = await fetch(stopUrl, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ room_name: roomName, session_id: sessionId }), + signal: controller.signal, + }); + + if (response.ok) { + return { target: 'room_input', ok: true, status: response.status }; + } + + return { + target: 'room_input', + ok: false, + fatal: false, + status: response.status, + error: `room-input stop returned HTTP ${response.status}`, + }; + } catch (error) { + return { + target: 'room_input', + ok: false, + fatal: false, + error: error instanceof Error ? error.message : String(error), + }; + } finally { + clearTimeout(timeout); + } +} + +async function stopRoomInput(roomName: string, sessionId: string): Promise { + const stopUrls = resolveRoomInputStopUrls(); + if (stopUrls.length === 0) { + return [{ target: 'room_input', ok: true, skipped: true }]; + } + + return Promise.all(stopUrls.map((stopUrl) => postRoomInputStop(stopUrl, roomName, sessionId))); +} + async function runRemoteSessionCleanup( roomName: string, sessionId: string, @@ -279,9 +359,15 @@ async function runRemoteSessionCleanup( dispatchIds: string[] ): Promise<{ results: StopResult[]; failures: StopResult[] }> { const dispatchBarrierResult = await waitForPendingDispatches(roomName, sessionId); + const roomInputResults = await stopRoomInput(roomName, sessionId); const liveKitRoomResult = await deleteLiveKitRoom(roomName); const agentWorkerReadinessResult = await waitForLocalAgentWorkerReadiness(); - const cleanupResults = [dispatchBarrierResult, liveKitRoomResult, agentWorkerReadinessResult]; + const cleanupResults = [ + dispatchBarrierResult, + ...roomInputResults, + liveKitRoomResult, + agentWorkerReadinessResult, + ]; const results = [ { target: 'session_registry', @@ -291,12 +377,18 @@ async function runRemoteSessionCleanup( dispatchResult, ...cleanupResults, ]; - const failures = results.filter((result) => !result.ok && !result.skipped); + const failures = results.filter( + (result) => !result.ok && !result.skipped && result.fatal !== false + ); + const bestEffortFailures = results.filter( + (result) => !result.ok && !result.skipped && result.fatal === false + ); console.info('agent session remote cleanup completed', { roomName, sessionId, results, failures, + bestEffortFailures, }); markRoomSessionStopped(roomName, sessionId); return { results, failures }; diff --git a/components/app/app.tsx b/components/app/app.tsx index cddca8f94..9253c111a 100644 --- a/components/app/app.tsx +++ b/components/app/app.tsx @@ -22,6 +22,7 @@ export function App({ appConfig }: AppProps) { {}, setVideoEnabled: async () => {}, start: async () => {}, - stop: () => {}, + stop: async () => {}, }; const SessionContext = createContext<{ appConfig: AppConfig; isSessionActive: boolean; startSession: () => Promise; - endSession: () => void; + endSession: () => Promise; getCurrentSessionId: () => string | null; browserSourceClient: BrowserSourceClient; }>({ appConfig: APP_CONFIG_DEFAULTS, isSessionActive: false, startSession: async () => {}, - endSession: () => {}, + endSession: async () => {}, getCurrentSessionId: () => null, browserSourceClient: DEFAULT_BROWSER_SOURCE_CLIENT, }); diff --git a/components/app/tile-layout.tsx b/components/app/tile-layout.tsx index 2601c1899..6fd296e8f 100644 --- a/components/app/tile-layout.tsx +++ b/components/app/tile-layout.tsx @@ -12,6 +12,7 @@ import { APP_CONFIG_DEFAULTS, type VideoTrackConfig } from '@/app-config'; import { useSelectedVideoTrack } from '@/hooks/useSelectedVideoTrack'; import { useSmartVoiceAssistant } from '@/hooks/useSmartVoiceAssistant'; import { cn } from '@/lib/utils'; +import { resolveCameraPreviewTrack } from '@/lib/video-preview-selection'; function debugVideoLog(enabled: boolean | undefined, ...args: unknown[]) { if (enabled) { @@ -183,11 +184,20 @@ export function TileLayout({ videoTrackConfigs, ]); + const selectedTrackType = useMemo(() => { + if (!selectedTrackId) return null; + return videoTrackConfigs.find((config) => config.id === selectedTrackId)?.type ?? null; + }, [selectedTrackId, videoTrackConfigs]); + const canShowDefaultCameraPreview = showDefaultCameraPreview && !isPreviewDisabled; - const cameraTrack = - selectedTrack || - (canShowDefaultCameraPreview && selectedTrackId === null ? configuredCameraTrack : undefined) || - (canShowDefaultCameraPreview ? defaultCameraTrack : undefined); + const cameraTrack = resolveCameraPreviewTrack({ + selectedTrack, + selectedTrackId, + selectedTrackType, + canShowDefaultCameraPreview, + configuredCameraTrack, + defaultCameraTrack, + }); const isCameraEnabled = Boolean(cameraTrack?.publication && !cameraTrack.publication.isMuted); const isScreenShareEnabled = Boolean(screenShareTrack && !screenShareTrack.publication.isMuted); diff --git a/components/livekit/agent-control-bar/agent-control-bar.tsx b/components/livekit/agent-control-bar/agent-control-bar.tsx index 12e45869c..90743cf25 100644 --- a/components/livekit/agent-control-bar/agent-control-bar.tsx +++ b/components/livekit/agent-control-bar/agent-control-bar.tsx @@ -117,8 +117,8 @@ export function AgentControlBar({ const handleDisconnect = useCallback(async () => { const sessionId = getCurrentSessionId() ?? getActiveAgentSession()?.sessionId; - const localDisconnectPromise = Promise.resolve().then(() => { - endSession(); + const localDisconnectPromise = Promise.resolve().then(async () => { + await endSession(); onDisconnect?.(); }); registerAgentSessionLocalCleanup(localDisconnectPromise); diff --git a/components/livekit/filtered-audio-renderer.tsx b/components/livekit/filtered-audio-renderer.tsx index 624ef23b3..704a29aa0 100644 --- a/components/livekit/filtered-audio-renderer.tsx +++ b/components/livekit/filtered-audio-renderer.tsx @@ -1,6 +1,6 @@ 'use client'; -import { useEffect, useRef } from 'react'; +import { useCallback, useEffect, useRef } from 'react'; import { ParticipantEvent, RemoteParticipant, @@ -10,6 +10,15 @@ import { Track, } from 'livekit-client'; import { useRemoteParticipants, useRoomContext } from '@livekit/components-react'; +import { startMediaTrackAudioObserver } from '@/lib/frontend-audio-observer'; +import { + FRONTEND_EVENTS, + OBSERVABILITY_ATTRS, + outputSegmentAttributesFromMarker, + parseBackendObservabilityMarkerPayload, + publishFrontendObservabilityEvent, +} from '@/lib/observability'; +import type { ObservabilityAttribute } from '@/lib/observability'; function debugAudioLog(enabled: boolean | undefined, ...args: unknown[]) { if (enabled) { @@ -32,12 +41,18 @@ type AudioTrackDiagnostics = { type PendingPlayback = { element: HTMLAudioElement; diagnostics: AudioTrackDiagnostics; + mediaStreamTrack: MediaStreamTrack; }; interface FilteredAudioRendererProps { excludeTrackNames?: string[]; volume?: number; debugAudio?: boolean; + observabilityEnabled?: boolean; +} + +function participantSegmentKey(participantIdentity: string) { + return `participant:${participantIdentity}`; } function buildAudioTrackDiagnostics( @@ -78,34 +93,52 @@ function playAudioElement({ debugAudio, elementKey, diagnostics, + mediaStreamTrack, pendingPlayback, + recordPlaybackError, + startPlaybackObserver, trigger, }: { audioElement: HTMLAudioElement; debugAudio?: boolean; elementKey: string; diagnostics: AudioTrackDiagnostics; + mediaStreamTrack: MediaStreamTrack; pendingPlayback: Map; + recordPlaybackError: ( + diagnostics: AudioTrackDiagnostics, + trigger: string, + error: unknown + ) => void; + startPlaybackObserver: ( + elementKey: string, + diagnostics: AudioTrackDiagnostics, + mediaStreamTrack: MediaStreamTrack + ) => void; trigger: string; }) { const playPromise = audioElement.play(); if (playPromise === undefined) { pendingPlayback.delete(elementKey); + startPlaybackObserver(elementKey, diagnostics, mediaStreamTrack); return; } playPromise .then(() => { pendingPlayback.delete(elementKey); + startPlaybackObserver(elementKey, diagnostics, mediaStreamTrack); debugAudioLog(debugAudio, '[FilteredAudioRenderer] 音频播放成功', { trigger, track: diagnostics, }); }) .catch((error: unknown) => { + recordPlaybackError(diagnostics, trigger, error); pendingPlayback.set(elementKey, { element: audioElement, diagnostics, + mediaStreamTrack, }); debugAudioLog(debugAudio, '[FilteredAudioRenderer] 音频播放失败,等待用户手势后重试', { trigger, @@ -129,27 +162,189 @@ export function FilteredAudioRenderer({ excludeTrackNames = [], volume = 1.0, debugAudio, + observabilityEnabled, }: FilteredAudioRendererProps) { const room = useRoomContext(); const participants = useRemoteParticipants(); const audioElementsRef = useRef>(new Map()); const pendingPlaybackRef = useRef>(new Map()); + const playbackObserverStopsRef = useRef void>>(new Map()); + const outputSegmentsRef = useRef>>(new Map()); + const activePlaybackSourcesRef = useRef>(new Map()); + const sharedAudioContextRef = useRef(null); + const observabilityEnabledRef = useRef(false); + const recordFrontendObservabilityRef = useRef< + (name: string, attributes?: Record) => void + >(() => {}); + const recordFrontendObservability = useCallback( + (name: string, attributes?: Record) => { + void publishFrontendObservabilityEvent({ + enabled: !!observabilityEnabled, + room, + name, + attributes, + }).catch((error) => { + console.warn('[frontend-observability] failed to publish event', error); + }); + }, + [observabilityEnabled, room] + ); + observabilityEnabledRef.current = !!observabilityEnabled; + recordFrontendObservabilityRef.current = recordFrontendObservability; + + useEffect(() => { + if (observabilityEnabled) { + return; + } + + playbackObserverStopsRef.current.forEach((stop) => stop()); + playbackObserverStopsRef.current.clear(); + outputSegmentsRef.current.clear(); + void sharedAudioContextRef.current?.close?.().catch(() => undefined); + sharedAudioContextRef.current = null; + }, [observabilityEnabled]); + + useEffect(() => { + if (!room) return; + const outputSegments = outputSegmentsRef.current; + if (!observabilityEnabled) { + outputSegments.clear(); + return; + } + + const onDataReceived = ( + payload: Uint8Array, + participant?: { identity?: string }, + _kind?: unknown, + topic?: string + ) => { + const marker = parseBackendObservabilityMarkerPayload(payload, topic); + if (!marker) { + return; + } + const attributes = outputSegmentAttributesFromMarker(marker); + // Fallback order: canonical backend marker field -> legacy field -> LiveKit sender. + const markerParticipant = String( + marker.attributes[OBSERVABILITY_ATTRS.PARTICIPANT_IDENTITY] || + marker.attributes[OBSERVABILITY_ATTRS.PARTICIPANT_IDENTITY_LEGACY] || + participant?.identity || + '' + ).trim(); + if (!markerParticipant || !attributes[OBSERVABILITY_ATTRS.OUTPUT_SEGMENT_ID]) { + return; + } + outputSegments.set(participantSegmentKey(markerParticipant), attributes); + }; + + room.on(RoomEvent.DataReceived, onDataReceived); + return () => { + room.off(RoomEvent.DataReceived, onDataReceived); + }; + }, [room, observabilityEnabled]); useEffect(() => { if (!room) return; const audioElements = audioElementsRef.current; const pendingPlayback = pendingPlaybackRef.current; + const playbackObserverStops = playbackObserverStopsRef.current; + const outputSegments = outputSegmentsRef.current; + const activePlaybackSources = activePlaybackSourcesRef.current; + const audioElementListenerCleanups = new Map void>(); + const getSharedAudioContext = () => { + if (sharedAudioContextRef.current && sharedAudioContextRef.current.state !== 'closed') { + return sharedAudioContextRef.current; + } + const AudioContextClass = + typeof window === 'undefined' + ? undefined + : window.AudioContext || window.webkitAudioContext; + if (!AudioContextClass) { + return undefined; + } + sharedAudioContextRef.current = new AudioContextClass(); + return sharedAudioContextRef.current; + }; + const activeSegmentAttributes = (participantIdentity: string) => + outputSegments.get(participantSegmentKey(participantIdentity)) ?? {}; + const playbackAttributes = (diagnostics: AudioTrackDiagnostics) => ({ + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_DIRECTION]: 'output', + [OBSERVABILITY_ATTRS.PARTICIPANT_IDENTITY]: diagnostics.participantIdentity, + [OBSERVABILITY_ATTRS.TRACK_NAME]: diagnostics.trackName, + [OBSERVABILITY_ATTRS.TRACK_SID]: diagnostics.trackSid, + [OBSERVABILITY_ATTRS.TRACK_SOURCE]: diagnostics.source, + ...activeSegmentAttributes(diagnostics.participantIdentity), + }); + const stopPlaybackObserver = (elementKey: string) => { + playbackObserverStops.get(elementKey)?.(); + playbackObserverStops.delete(elementKey); + }; + const recordPlaybackError = ( + diagnostics: AudioTrackDiagnostics, + trigger: string, + error: unknown + ) => { + const { name, message } = describePlaybackError(error); + recordFrontendObservabilityRef.current(FRONTEND_EVENTS.REPLY_AUDIO_PLAYBACK_ERROR, { + ...playbackAttributes(diagnostics), + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_REASON]: trigger, + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_ERROR]: `${name}: ${message}`, + }); + }; + const startPlaybackObserver = ( + elementKey: string, + diagnostics: AudioTrackDiagnostics, + mediaStreamTrack: MediaStreamTrack + ) => { + if (!observabilityEnabledRef.current) { + return; + } + stopPlaybackObserver(elementKey); + playbackObserverStops.set( + elementKey, + startMediaTrackAudioObserver({ + mediaStreamTrack, + startEventName: FRONTEND_EVENTS.REPLY_AUDIO_PLAYBACK_STARTED, + endEventName: FRONTEND_EVENTS.REPLY_AUDIO_PLAYBACK_ENDED, + resumeErrorEventName: FRONTEND_EVENTS.REPLY_AUDIO_PLAYBACK_ERROR, + emit: (name, attributes) => recordFrontendObservabilityRef.current(name, attributes), + sharedAudioContext: getSharedAudioContext(), + attributes: () => playbackAttributes(diagnostics), + startThreshold: 0.012, + endThreshold: 0.004, + startDurationMs: 40, + endSilenceMs: 350, + }).stop + ); + }; + const removeAudioElement = (elementKey: string) => { + const audioElement = audioElements.get(elementKey); + if (!audioElement) { + pendingPlayback.delete(elementKey); + activePlaybackSources.delete(elementKey); + stopPlaybackObserver(elementKey); + return; + } + + audioElementListenerCleanups.get(elementKey)?.(); + audioElementListenerCleanups.delete(elementKey); + activePlaybackSources.delete(elementKey); + stopPlaybackObserver(elementKey); + audioElement.pause(); + audioElement.srcObject = null; + audioElement.remove(); + audioElements.delete(elementKey); + pendingPlayback.delete(elementKey); + }; // 清理函数 const cleanup = () => { - audioElements.forEach((element) => { - element.pause(); - element.srcObject = null; - element.remove(); - }); - audioElements.clear(); + Array.from(audioElements.keys()).forEach((elementKey) => removeAudioElement(elementKey)); pendingPlayback.clear(); + activePlaybackSources.clear(); + outputSegments.clear(); + void sharedAudioContextRef.current?.close?.().catch(() => undefined); + sharedAudioContextRef.current = null; }; // 处理音频轨道订阅 @@ -161,16 +356,17 @@ export function FilteredAudioRenderer({ const trackName = publication.trackName || publication.trackSid; const elementKey = `${participantIdentity}-${trackName}`; const diagnostics = buildAudioTrackDiagnostics(publication, participantIdentity); + const mediaStreamTrack = publication.track.mediaStreamTrack; debugAudioLog(debugAudio, '[FilteredAudioRenderer] 收到音频轨道订阅', diagnostics); // 检查是否应该排除此轨道 - const shouldExclude = excludeTrackNames.some( - (excludeName) => - trackName.includes(excludeName) || - trackName === excludeName || - publication.trackSid === excludeName - ); + const shouldExclude = excludeTrackNames.some((excludeName) => { + if (!excludeName) { + return false; + } + return trackName.includes(excludeName) || publication.trackSid === excludeName; + }); if (shouldExclude) { debugAudioLog( @@ -180,14 +376,7 @@ export function FilteredAudioRenderer({ ); // 如果之前有播放这个轨道,现在停止 - const existingElement = audioElements.get(elementKey); - if (existingElement) { - existingElement.pause(); - existingElement.srcObject = null; - existingElement.remove(); - audioElements.delete(elementKey); - pendingPlayback.delete(elementKey); - } + removeAudioElement(elementKey); return; } @@ -196,14 +385,49 @@ export function FilteredAudioRenderer({ // 创建或更新音频元素 let audioElement = audioElements.get(elementKey); if (!audioElement) { - audioElement = document.createElement('audio'); - audioElement.autoplay = true; - audioElement.setAttribute('playsinline', 'true'); - audioElement.dataset.livekitParticipantIdentity = participantIdentity; - audioElement.dataset.livekitTrackName = trackName; - audioElement.volume = volume; - document.body.appendChild(audioElement); - audioElements.set(elementKey, audioElement); + const createdAudioElement = document.createElement('audio'); + createdAudioElement.autoplay = true; + createdAudioElement.setAttribute('playsinline', 'true'); + createdAudioElement.dataset.livekitParticipantIdentity = participantIdentity; + createdAudioElement.dataset.livekitTrackName = trackName; + createdAudioElement.volume = volume; + const handleElementPlaybackStopped = () => { + stopPlaybackObserver(elementKey); + }; + const handleElementPlaybackStarted = () => { + const playbackSource = activePlaybackSources.get(elementKey); + if (!playbackSource || playbackObserverStops.has(elementKey)) { + return; + } + pendingPlayback.delete(elementKey); + startPlaybackObserver( + elementKey, + playbackSource.diagnostics, + playbackSource.mediaStreamTrack + ); + }; + const handleElementPlaybackError = () => { + const playbackSource = activePlaybackSources.get(elementKey); + stopPlaybackObserver(elementKey); + recordPlaybackError( + playbackSource?.diagnostics ?? diagnostics, + 'audio-element-error', + createdAudioElement.error ?? new Error('audio element playback failed') + ); + }; + createdAudioElement.addEventListener('playing', handleElementPlaybackStarted); + createdAudioElement.addEventListener('pause', handleElementPlaybackStopped); + createdAudioElement.addEventListener('ended', handleElementPlaybackStopped); + createdAudioElement.addEventListener('error', handleElementPlaybackError); + audioElementListenerCleanups.set(elementKey, () => { + createdAudioElement.removeEventListener('playing', handleElementPlaybackStarted); + createdAudioElement.removeEventListener('pause', handleElementPlaybackStopped); + createdAudioElement.removeEventListener('ended', handleElementPlaybackStopped); + createdAudioElement.removeEventListener('error', handleElementPlaybackError); + }); + document.body.appendChild(createdAudioElement); + audioElements.set(elementKey, createdAudioElement); + audioElement = createdAudioElement; debugAudioLog( debugAudio, @@ -212,7 +436,12 @@ export function FilteredAudioRenderer({ } // 设置音频流 - const mediaStream = new MediaStream([publication.track.mediaStreamTrack]); + activePlaybackSources.set(elementKey, { + element: audioElement, + diagnostics, + mediaStreamTrack, + }); + const mediaStream = new MediaStream([mediaStreamTrack]); audioElement.srcObject = mediaStream; audioElement.volume = volume; @@ -227,19 +456,25 @@ export function FilteredAudioRenderer({ debugAudio, elementKey, diagnostics, + mediaStreamTrack, pendingPlayback, + recordPlaybackError, + startPlaybackObserver, trigger: 'track-subscribed', }); }; const retryPendingPlayback = (trigger: string) => { - pendingPlayback.forEach(({ element, diagnostics }, elementKey) => { + pendingPlayback.forEach(({ element, diagnostics, mediaStreamTrack }, elementKey) => { playAudioElement({ audioElement: element, debugAudio, elementKey, diagnostics, + mediaStreamTrack, pendingPlayback, + recordPlaybackError, + startPlaybackObserver, trigger, }); }); @@ -276,19 +511,11 @@ export function FilteredAudioRenderer({ const trackName = publication.trackName || publication.trackSid; const elementKey = `${participantIdentity}-${trackName}`; - const audioElement = audioElements.get(elementKey); - if (audioElement) { - audioElement.pause(); - audioElement.srcObject = null; - audioElement.remove(); - audioElements.delete(elementKey); - pendingPlayback.delete(elementKey); - - debugAudioLog( - debugAudio, - `[FilteredAudioRenderer] 停止音频轨道: ${trackName} (参与者: ${participantIdentity})` - ); - } + removeAudioElement(elementKey); + debugAudioLog( + debugAudio, + `[FilteredAudioRenderer] 停止音频轨道: ${trackName} (参与者: ${participantIdentity})` + ); }; const participantListenerCleanups: Array<() => void> = []; @@ -333,18 +560,13 @@ export function FilteredAudioRenderer({ const onParticipantDisconnected = (participant: RemoteParticipant) => { // 清理该参与者的所有音频元素 const keysToRemove: string[] = []; - audioElements.forEach((element, key) => { + audioElements.forEach((_element, key) => { if (key.startsWith(`${participant.identity}-`)) { - element.pause(); - element.srcObject = null; - element.remove(); keysToRemove.push(key); } }); - keysToRemove.forEach((key) => { - audioElements.delete(key); - pendingPlayback.delete(key); - }); + keysToRemove.forEach((key) => removeAudioElement(key)); + outputSegments.delete(participantSegmentKey(participant.identity)); }; room.on(RoomEvent.ParticipantConnected, onParticipantConnected); diff --git a/eslint.config.mjs b/eslint.config.mjs index 55541fdfe..1a9623623 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -11,7 +11,16 @@ const compat = new FlatCompat({ const eslintConfig = [ { - ignores: ['.next/**', 'coverage/**', 'dist/**', 'next-env.d.ts', 'node_modules/**', 'out/**'], + ignores: [ + '.next/**', + 'coverage/**', + 'dist/**', + 'next-env.d.ts', + 'node_modules/**', + 'out/**', + 'public/onnxruntime-web/**', + 'public/vad-web/**', + ], }, ...compat.extends( 'next/core-web-vitals', diff --git a/hooks/useAudioTrackFilter.ts b/hooks/useAudioTrackFilter.ts index 57bee6cf1..dd7388833 100644 --- a/hooks/useAudioTrackFilter.ts +++ b/hooks/useAudioTrackFilter.ts @@ -31,12 +31,12 @@ export function useAudioTrackFilter({ const shouldExcludeTrack = useCallback( (publication: TrackPublication): boolean => { const trackName = publication.trackName || publication.trackSid; - return excludeTrackNames.some( - (excludeName) => - trackName.includes(excludeName) || - trackName === excludeName || - publication.trackSid === excludeName - ); + return excludeTrackNames.some((excludeName) => { + if (!excludeName) { + return false; + } + return trackName.includes(excludeName) || publication.trackSid === excludeName; + }); }, [excludeTrackNames] ); diff --git a/hooks/useBrowserSourceClient.ts b/hooks/useBrowserSourceClient.ts index 9012e3eef..d55f1c869 100644 --- a/hooks/useBrowserSourceClient.ts +++ b/hooks/useBrowserSourceClient.ts @@ -11,12 +11,23 @@ import { createLocalVideoTrack, } from 'livekit-client'; import type { AppConfig } from '@/app-config'; +import { startMediaTrackVadObserver } from '@/lib/frontend-vad-observer'; +import { + FRONTEND_EVENTS, + OBSERVABILITY_ATTRS, + publishFrontendObservabilityEvent, +} from '@/lib/observability'; const BROWSER_AUDIO_TRACK_NAME = 'browser_audio_track'; const BROWSER_VIDEO_TRACK_NAME = 'browser_video_track'; const DEFAULT_BROWSER_MEDIA_STREAM_NAME = 'browser_input'; const BROWSER_VIDEO_DEFAULT_ENABLED = true; const BROWSER_VIDEO_STATS_INTERVAL_MS = 5000; +const BROWSER_AUDIO_CONSTRAINTS: MediaTrackConstraints = { + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true, +}; interface BrowserSourceRuntime { audioTrack: LocalAudioTrack | null; @@ -28,6 +39,7 @@ interface BrowserSourceRuntime { videoStatsStartedAt: number | null; videoStatsTimer: number | null; previousVideoStats: BrowserVideoStatsSnapshot | null; + audioObserverStop: (() => Promise) | null; } interface BrowserVideoStatsSnapshot { @@ -48,7 +60,7 @@ export interface BrowserSourceClient { setAudioEnabled: (enabled: boolean) => Promise; setVideoEnabled: (enabled: boolean) => Promise; start: () => Promise; - stop: () => void; + stop: () => Promise; } interface BrowserSourceClientOptions { @@ -82,6 +94,24 @@ export function useBrowserSourceClient( const [videoTrack, setVideoTrackState] = useState(null); const [audioPending, setAudioPending] = useState(false); const [videoPending, setVideoPending] = useState(false); + const recordFrontendObservability = useCallback( + ( + name: string, + attributes?: Record, + options?: { wallTimeUnixMs?: number } + ) => { + void publishFrontendObservabilityEvent({ + enabled: !!appConfig.observabilityEnabled, + room, + name, + attributes, + wallTimeUnixMs: options?.wallTimeUnixMs, + }).catch((error) => { + console.warn('[frontend-observability] failed to publish event', error); + }); + }, + [appConfig.observabilityEnabled, room] + ); const ensureAudioPublished = useCallback(async () => { const runtime = runtimeRef.current; @@ -89,11 +119,15 @@ export function useBrowserSourceClient( return; } - const audioTrack = await createLocalAudioTrack({ - echoCancellation: true, - noiseSuppression: true, - autoGainControl: true, - }); + const vadAttributes: Record = { + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_DIRECTION]: 'input', + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_PROBE]: 'vad-web', + [OBSERVABILITY_ATTRS.TRACK_NAME]: BROWSER_AUDIO_TRACK_NAME, + [OBSERVABILITY_ATTRS.TRACK_SID]: null, + [OBSERVABILITY_ATTRS.TRACK_STREAM_NAME]: browserMediaStreamName, + }; + const audioTrack = await createDirectBrowserAudioTrack(); + const captureTrack = audioTrack.mediaStreamTrack; audioTrack.mediaStreamTrack.enabled = runtime.audioEnabled; try { @@ -104,11 +138,71 @@ export function useBrowserSourceClient( }); runtime.audioTrack = audioTrack; runtime.audioPublication = publication; + runtime.audioObserverStop = null; + vadAttributes[OBSERVABILITY_ATTRS.TRACK_SID] = publication.trackSid || null; + recordFrontendObservability(FRONTEND_EVENTS.BROWSER_AUDIO_TRACK_PUBLISHED, { + [OBSERVABILITY_ATTRS.TRACK_NAME]: BROWSER_AUDIO_TRACK_NAME, + [OBSERVABILITY_ATTRS.TRACK_SID]: publication.trackSid || null, + [OBSERVABILITY_ATTRS.TRACK_STREAM_NAME]: browserMediaStreamName, + }); + if (appConfig.observabilityEnabled) { + void startMediaTrackVadObserver({ + mediaStreamTrack: captureTrack, + onSpeechStart: (event) => { + recordFrontendObservability( + FRONTEND_EVENTS.BROWSER_AUDIO_VAD_SPEECH_STARTED, + { + ...vadAttributes, + [OBSERVABILITY_ATTRS.VAD_PROVIDER]: event.provider, + [OBSERVABILITY_ATTRS.VAD_MODEL]: event.model, + }, + { wallTimeUnixMs: event.timestampMs } + ); + }, + onSpeechEnd: (event) => { + recordFrontendObservability( + FRONTEND_EVENTS.BROWSER_AUDIO_VAD_SPEECH_ENDED, + { + ...vadAttributes, + [OBSERVABILITY_ATTRS.VAD_PROVIDER]: event.provider, + [OBSERVABILITY_ATTRS.VAD_MODEL]: event.model, + [OBSERVABILITY_ATTRS.VAD_AUDIO_DURATION_MS]: event.audioDurationMs ?? null, + }, + { wallTimeUnixMs: event.timestampMs } + ); + }, + }) + .then((observer) => { + if (runtime.audioTrack === audioTrack) { + runtime.audioObserverStop = observer.stop; + return; + } + observer.stop(); + }) + .catch((error) => { + if (runtime.audioTrack !== audioTrack) { + return; + } + console.warn('[browser-audio] VAD observer unavailable', error); + recordFrontendObservability(FRONTEND_EVENTS.BROWSER_AUDIO_VAD_PROBE_UNAVAILABLE, { + [OBSERVABILITY_ATTRS.TRACK_NAME]: BROWSER_AUDIO_TRACK_NAME, + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_PROBE]: 'vad-web', + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_ERROR]: + error instanceof Error ? error.message : String(error), + }); + }); + } } catch (error) { audioTrack.stop(); throw error; } - }, [audioConfigured, browserMediaStreamName, room]); + }, [ + appConfig.observabilityEnabled, + audioConfigured, + browserMediaStreamName, + recordFrontendObservability, + room, + ]); const ensureVideoPublished = useCallback(async () => { const runtime = runtimeRef.current; @@ -163,14 +257,27 @@ export function useBrowserSourceClient( const unpublishAudio = useCallback( async (runtime: BrowserSourceRuntime) => { const track = runtime.audioTrack; + const publication = runtime.audioPublication; + const stopObservedAudio = runtime.audioObserverStop; runtime.audioTrack = null; runtime.audioPublication = null; + runtime.audioObserverStop = null; if (!track) return; await room.localParticipant.unpublishTrack(track, true).catch(() => undefined); - track.stop(); + try { + await stopObservedAudio?.(); + } catch (error) { + console.warn('[browser-audio] VAD observer stop failed', error); + } finally { + track.stop(); + recordFrontendObservability(FRONTEND_EVENTS.BROWSER_AUDIO_TRACK_UNPUBLISHED, { + [OBSERVABILITY_ATTRS.TRACK_NAME]: BROWSER_AUDIO_TRACK_NAME, + [OBSERVABILITY_ATTRS.TRACK_SID]: publication?.trackSid || null, + }); + } }, - [room] + [recordFrontendObservability, room] ); const unpublishVideo = useCallback( @@ -188,13 +295,12 @@ export function useBrowserSourceClient( [room] ); - const stop = useCallback(() => { + const stop = useCallback(async () => { const runtime = runtimeRef.current; runtimeRef.current = null; if (!runtime) return; - void unpublishAudio(runtime); - void unpublishVideo(runtime); + await Promise.all([unpublishAudio(runtime), unpublishVideo(runtime)]); }, [unpublishAudio, unpublishVideo]); const start = useCallback(async () => { @@ -212,6 +318,7 @@ export function useBrowserSourceClient( videoStatsStartedAt: null, videoStatsTimer: null, previousVideoStats: null, + audioObserverStop: null, }; try { @@ -219,7 +326,7 @@ export function useBrowserSourceClient( await ensureAudioPublished(); } } catch (error) { - stop(); + await stop(); throw error; } @@ -257,14 +364,20 @@ export function useBrowserSourceClient( runtime.audioEnabled = nextEnabled; if (nextEnabled) { if (runtime.audioTrack) { - runtime.audioTrack.mediaStreamTrack.enabled = true; + syncTrackEnabled(runtime.audioTrack, true); await runtime.audioTrack.unmute(); + recordFrontendObservability(FRONTEND_EVENTS.BROWSER_AUDIO_TRACK_UNMUTED, { + [OBSERVABILITY_ATTRS.TRACK_NAME]: BROWSER_AUDIO_TRACK_NAME, + }); } else { await ensureAudioPublished(); } } else if (runtime.audioTrack) { - runtime.audioTrack.mediaStreamTrack.enabled = false; + syncTrackEnabled(runtime.audioTrack, false); await runtime.audioTrack.mute(); + recordFrontendObservability(FRONTEND_EVENTS.BROWSER_AUDIO_TRACK_MUTED, { + [OBSERVABILITY_ATTRS.TRACK_NAME]: BROWSER_AUDIO_TRACK_NAME, + }); } } catch (error) { audioEnabledRef.current = previousEnabled; @@ -286,7 +399,7 @@ export function useBrowserSourceClient( setAudioPending(false); } }, - [audioConfigured, ensureAudioPublished, unpublishAudio] + [audioConfigured, ensureAudioPublished, recordFrontendObservability, unpublishAudio] ); const setVideoEnabled = useCallback( @@ -338,7 +451,11 @@ export function useBrowserSourceClient( [ensureVideoPublished, unpublishVideo, videoConfigured] ); - useEffect(() => stop, [stop]); + useEffect(() => { + return () => { + void stop(); + }; + }, [stop]); return useMemo( (): BrowserSourceClient => ({ @@ -368,6 +485,10 @@ export function useBrowserSourceClient( ); } +async function createDirectBrowserAudioTrack(): Promise { + return createLocalAudioTrack(BROWSER_AUDIO_CONSTRAINTS); +} + function syncTrackEnabled(track: LocalAudioTrack | LocalVideoTrack | null, enabled: boolean) { if (!track) return; diff --git a/hooks/useChatMessages.ts b/hooks/useChatMessages.ts index 6f30fe34d..e9b75f5d0 100644 --- a/hooks/useChatMessages.ts +++ b/hooks/useChatMessages.ts @@ -8,6 +8,7 @@ import { useTranscriptions, } from '@livekit/components-react'; import { type AppConfig } from '@/app-config'; +import { isRenderableChatMessage } from '@/lib/chat-message-filter'; function transcriptionToChatMessage( textStream: TextStreamData, @@ -105,7 +106,10 @@ export function useChatMessages( return chatMessage; }); - const merged: Array = [...transcriptionMessages, ...processedChatMessages]; + const merged: Array = [ + ...transcriptionMessages, + ...processedChatMessages, + ].filter(isRenderableChatMessage); return merged.sort((a, b) => a.timestamp - b.timestamp); }, [transcriptions, chat.chatMessages, room, config]); diff --git a/hooks/useExcludedVideoTracks.ts b/hooks/useExcludedVideoTracks.ts index 6822c01ff..180365ce9 100644 --- a/hooks/useExcludedVideoTracks.ts +++ b/hooks/useExcludedVideoTracks.ts @@ -37,12 +37,12 @@ export function useExcludedVideoTracks({ // 检查轨道是否应该被排除 const shouldExcludeTrack = useMemo(() => { return (trackName: string): boolean => { - return excludedTrackNames.some( - (excludeName) => - trackName === excludeName || - trackName.includes(excludeName) || - excludeName.includes(trackName) - ); + return excludedTrackNames.some((excludeName) => { + if (!excludeName) { + return false; + } + return trackName.includes(excludeName) || excludeName.includes(trackName); + }); }; }, [excludedTrackNames]); diff --git a/hooks/useRoom.ts b/hooks/useRoom.ts index 58d9352a1..d2fc9b066 100644 --- a/hooks/useRoom.ts +++ b/hooks/useRoom.ts @@ -5,6 +5,7 @@ import { toastAlert } from '@/components/livekit/alert-toast'; import { useBrowserSourceClient } from '@/hooks/useBrowserSourceClient'; import { getVoiceSessionId, resetVoiceSessionId } from '@/lib/browser-room-session'; import { readConnectionDetailsResponse } from '@/lib/connection-details-response'; +import { FRONTEND_EVENTS, publishFrontendObservabilityEvent } from '@/lib/observability'; import { waitForRoomDisconnected } from '@/lib/room-disconnect'; import { AgentSessionDispatchCancelledError, @@ -37,6 +38,19 @@ export function useRoom(appConfig: AppConfig) { const browserSourceClient = useBrowserSourceClient(room, appConfig, { onVideoError: handleBrowserVideoError, }); + const recordFrontendObservability = useCallback( + (name: string, attributes?: Record) => { + void publishFrontendObservabilityEvent({ + enabled: !!appConfig.observabilityEnabled, + room, + name, + attributes, + }).catch((error) => { + console.warn('[frontend-observability] failed to publish event', error); + }); + }, + [appConfig.observabilityEnabled, room] + ); useEffect(() => { function onDisconnected() { @@ -120,8 +134,13 @@ export function useRoom(appConfig: AppConfig) { const recoverFromStartError = async (error: unknown) => { const startError = error instanceof Error ? error : new Error(String(error)); - browserSourceClient.stop(); - room.disconnect(); + try { + await browserSourceClient.stop(); + } catch (stopError) { + console.warn('Failed to stop browser source after start failure', stopError); + } finally { + room.disconnect(); + } if (connectedRoomName) { try { await requestAgentSessionStop(dispatchSessionId ?? sessionIdRef.current ?? undefined, { @@ -192,6 +211,7 @@ export function useRoom(appConfig: AppConfig) { if (browserSourceClient.enabled || appConfig.usesServerRoomInput) { const connectionDetails = await tokenSource.fetch({ agentName: appConfig.agentName }); await room.connect(connectionDetails.serverUrl, connectionDetails.participantToken); + recordFrontendObservability(FRONTEND_EVENTS.ROOM_CONNECTED); connectedRoomName = room.name; await startLocalInput(); } else { @@ -199,6 +219,7 @@ export function useRoom(appConfig: AppConfig) { startDefaultMicrophone(), tokenSource.fetch({ agentName: appConfig.agentName }).then(async (connectionDetails) => { await room.connect(connectionDetails.serverUrl, connectionDetails.participantToken); + recordFrontendObservability(FRONTEND_EVENTS.ROOM_CONNECTED); connectedRoomName = room.name; }), ]); @@ -208,14 +229,19 @@ export function useRoom(appConfig: AppConfig) { } catch (error) { await handleStartError(error); } - }, [room, appConfig, tokenSource, browserSourceClient]); - - const endSession = useCallback(() => { - browserSourceClient.stop(); - room.disconnect(); - resetVoiceSessionId(); - sessionIdRef.current = null; - setIsSessionActive(false); + }, [room, appConfig, tokenSource, browserSourceClient, recordFrontendObservability]); + + const endSession = useCallback(async () => { + try { + await browserSourceClient.stop(); + } catch (error) { + console.warn('Failed to stop browser source while ending session', error); + } finally { + room.disconnect(); + resetVoiceSessionId(); + sessionIdRef.current = null; + setIsSessionActive(false); + } }, [browserSourceClient, room]); const getCurrentSessionId = useCallback(() => sessionIdRef.current, []); diff --git a/lib/chat-message-filter.ts b/lib/chat-message-filter.ts new file mode 100644 index 000000000..6822937c0 --- /dev/null +++ b/lib/chat-message-filter.ts @@ -0,0 +1,7 @@ +type ChatMessageLike = { + message?: string; +}; + +export function isRenderableChatMessage(message: ChatMessageLike): boolean { + return typeof message.message === 'string' && message.message.trim().length > 0; +} diff --git a/lib/frontend-audio-observer.ts b/lib/frontend-audio-observer.ts new file mode 100644 index 000000000..4661f508e --- /dev/null +++ b/lib/frontend-audio-observer.ts @@ -0,0 +1,211 @@ +import { OBSERVABILITY_ATTRS, type ObservabilityAttributes } from '@/lib/observability'; + +type AudioActivityReason = 'silence' | 'stop'; + +export type AudioActivityEvent = { + timestampMs: number; + level: number; + reason?: AudioActivityReason; +}; + +export type AudioActivityDetector = { + sample: () => void; + stop: (options?: { emitEnd?: boolean }) => void; + isActive: () => boolean; +}; + +interface AudioActivityDetectorOptions { + readLevel: () => number; + onStart: (event: AudioActivityEvent) => void; + onEnd: (event: AudioActivityEvent) => void; + now?: () => number; + startThreshold?: number; + endThreshold?: number; + startDurationMs?: number; + endSilenceMs?: number; +} + +type MediaTrackAudioObserverAttributes = ObservabilityAttributes | (() => ObservabilityAttributes); + +declare global { + interface Window { + webkitAudioContext?: typeof AudioContext; + } +} + +interface MediaTrackAudioObserverOptions { + mediaStreamTrack: MediaStreamTrack; + startEventName: string; + endEventName: string; + resumeErrorEventName?: string; + emit: (name: string, attributes?: ObservabilityAttributes) => void; + sharedAudioContext?: AudioContext; + attributes?: MediaTrackAudioObserverAttributes; + sampleIntervalMs?: number; + startThreshold?: number; + endThreshold?: number; + startDurationMs?: number; + endSilenceMs?: number; +} + +export function createAudioActivityDetector({ + readLevel, + onStart, + onEnd, + now = () => Date.now(), + startThreshold = 0.015, + endThreshold = 0.006, + startDurationMs = 80, + endSilenceMs = 500, +}: AudioActivityDetectorOptions): AudioActivityDetector { + let active = false; + let stopped = false; + let aboveStartedAt: number | null = null; + let belowStartedAt: number | null = null; + let lastLevel = 0; + + const sample = () => { + if (stopped) return; + + const timestampMs = now(); + const level = Math.max(0, readLevel()); + lastLevel = level; + + if (!active) { + if (level >= startThreshold) { + aboveStartedAt ??= timestampMs; + if (timestampMs - aboveStartedAt >= startDurationMs) { + active = true; + belowStartedAt = null; + onStart({ timestampMs, level }); + } + } else { + aboveStartedAt = null; + } + return; + } + + if (level <= endThreshold) { + belowStartedAt ??= timestampMs; + if (timestampMs - belowStartedAt >= endSilenceMs) { + active = false; + aboveStartedAt = null; + belowStartedAt = null; + onEnd({ timestampMs, level, reason: 'silence' }); + } + } else { + belowStartedAt = null; + } + }; + + const stop = ({ emitEnd = false }: { emitEnd?: boolean } = {}) => { + if (stopped) return; + stopped = true; + if (emitEnd && active) { + active = false; + onEnd({ timestampMs: now(), level: lastLevel, reason: 'stop' }); + } + }; + + return { + sample, + stop, + isActive: () => active, + }; +} + +export function startMediaTrackAudioObserver({ + mediaStreamTrack, + startEventName, + endEventName, + resumeErrorEventName, + emit, + sharedAudioContext, + attributes = {}, + sampleIntervalMs = 50, + startThreshold, + endThreshold, + startDurationMs, + endSilenceMs, +}: MediaTrackAudioObserverOptions): { stop: () => void } { + if (typeof window === 'undefined') { + return { stop: () => {} }; + } + const AudioContextClass = window.AudioContext || window.webkitAudioContext; + if (!sharedAudioContext && !AudioContextClass) { + return { stop: () => {} }; + } + + const ownsAudioContext = !sharedAudioContext; + const audioContext = sharedAudioContext ?? new AudioContextClass(); + const analyser = audioContext.createAnalyser(); + analyser.fftSize = 1024; + + const mediaStream = new MediaStream([mediaStreamTrack]); + const source = audioContext.createMediaStreamSource(mediaStream); + source.connect(analyser); + + const samples = new Float32Array(analyser.fftSize); + const detector = createAudioActivityDetector({ + startThreshold, + endThreshold, + startDurationMs, + endSilenceMs, + readLevel: () => readRmsLevel(analyser, samples), + onStart: (event) => + emit(startEventName, activityAttributes(resolveAttributes(attributes), event)), + onEnd: (event) => emit(endEventName, activityAttributes(resolveAttributes(attributes), event)), + }); + + const intervalId = window.setInterval(() => detector.sample(), sampleIntervalMs); + let stopped = false; + const stop = () => { + if (stopped) return; + stopped = true; + window.clearInterval(intervalId); + mediaStreamTrack.removeEventListener('ended', stop); + detector.stop({ emitEnd: true }); + source.disconnect(); + if (ownsAudioContext) { + void audioContext.close?.().catch(() => undefined); + } + }; + + void audioContext.resume?.().catch((error) => { + console.warn('[frontend-observability] audio observer could not resume AudioContext', error); + if (resumeErrorEventName) { + emit(resumeErrorEventName, { + ...resolveAttributes(attributes), + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_REASON]: 'audio-context-resume-failed', + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_ERROR]: + error instanceof Error ? error.message : String(error), + }); + } + stop(); + }); + + mediaStreamTrack.addEventListener('ended', stop, { once: true }); + + return { stop }; +} + +function readRmsLevel(analyser: AnalyserNode, samples: Float32Array): number { + analyser.getFloatTimeDomainData(samples); + let sumSquares = 0; + for (const sample of samples) { + sumSquares += sample * sample; + } + return Math.sqrt(sumSquares / samples.length); +} + +function activityAttributes(baseAttributes: ObservabilityAttributes, event: AudioActivityEvent) { + return { + ...baseAttributes, + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_LEVEL]: Number(event.level.toFixed(6)), + [OBSERVABILITY_ATTRS.FRONTEND_AUDIO_REASON]: event.reason ?? null, + }; +} + +function resolveAttributes(attributes: MediaTrackAudioObserverAttributes): ObservabilityAttributes { + return typeof attributes === 'function' ? attributes() : attributes; +} diff --git a/lib/frontend-vad-observer.ts b/lib/frontend-vad-observer.ts new file mode 100644 index 000000000..657c115c6 --- /dev/null +++ b/lib/frontend-vad-observer.ts @@ -0,0 +1,121 @@ +export interface BrowserVadEvent { + timestampMs: number; + provider: 'vad-web'; + model: 'silero_vad_v5'; + audioDurationMs?: number; +} + +interface MicVadInstance { + start: () => void | Promise; + pause?: () => void | Promise; + destroy?: () => void | Promise; +} + +interface MicVadOptions { + getStream: () => Promise; + pauseStream: (stream: MediaStream) => Promise; + resumeStream: (stream: MediaStream) => Promise; + onSpeechStart: () => void; + onSpeechEnd: (audio: Float32Array) => void; + onFrameProcessed?: ( + probabilities: { isSpeech: number; notSpeech?: number }, + frame: Float32Array + ) => void; + baseAssetPath: string; + onnxWASMBasePath: string; + model: 'v5'; + startOnLoad: boolean; +} + +type CreateMicVad = (options: MicVadOptions) => Promise; + +interface MediaTrackVadObserverOptions { + mediaStreamTrack: MediaStreamTrack; + onSpeechStart: (event: BrowserVadEvent) => void; + onSpeechEnd: (event: BrowserVadEvent) => void; + createMicVad?: CreateMicVad; + now?: () => number; + baseAssetPath?: string; + onnxWASMBasePath?: string; +} + +const VAD_SAMPLE_RATE = 16_000; +const VAD_MODEL = 'silero_vad_v5'; +const VAD_SPEECH_PROBABILITY_THRESHOLD = 0.5; +const DEFAULT_VAD_ASSET_BASE_PATH = '/vad-web/'; +const DEFAULT_ONNX_WASM_BASE_PATH = '/onnxruntime-web/'; + +export async function startMediaTrackVadObserver({ + mediaStreamTrack, + onSpeechStart, + onSpeechEnd, + createMicVad = createDefaultMicVad, + now = () => Date.now(), + baseAssetPath = DEFAULT_VAD_ASSET_BASE_PATH, + onnxWASMBasePath = DEFAULT_ONNX_WASM_BASE_PATH, +}: MediaTrackVadObserverOptions): Promise<{ stop: () => Promise }> { + if (typeof MediaStream === 'undefined') { + return { stop: async () => {} }; + } + + const mediaStream = new MediaStream([mediaStreamTrack]); + let lastSpeechFrameTimestampMs: number | null = null; + const vad = await createMicVad({ + getStream: async () => mediaStream, + pauseStream: async () => {}, + resumeStream: async () => mediaStream, + baseAssetPath, + onnxWASMBasePath, + model: 'v5', + startOnLoad: false, + onSpeechStart: () => { + lastSpeechFrameTimestampMs = null; + onSpeechStart({ + timestampMs: now(), + provider: 'vad-web', + model: VAD_MODEL, + }); + }, + onFrameProcessed: (probabilities) => { + if (probabilities.isSpeech >= VAD_SPEECH_PROBABILITY_THRESHOLD) { + lastSpeechFrameTimestampMs = now(); + } + }, + onSpeechEnd: (audio) => { + const timestampMs = lastSpeechFrameTimestampMs ?? now(); + lastSpeechFrameTimestampMs = null; + onSpeechEnd({ + timestampMs, + provider: 'vad-web', + model: VAD_MODEL, + audioDurationMs: Math.round((audio.length / VAD_SAMPLE_RATE) * 1000), + }); + }, + }); + let stopped = false; + const stop = async () => { + if (stopped) return; + stopped = true; + mediaStreamTrack.removeEventListener('ended', handleTrackEnded); + try { + await vad.pause?.(); + } finally { + await vad.destroy?.(); + } + }; + const handleTrackEnded = () => { + void stop(); + }; + + await vad.start(); + mediaStreamTrack.addEventListener('ended', handleTrackEnded, { once: true }); + + return { + stop, + }; +} + +async function createDefaultMicVad(options: MicVadOptions): Promise { + const { MicVAD } = await import('@ricky0123/vad-web'); + return MicVAD.new(options); +} diff --git a/lib/input-device-config.ts b/lib/input-device-config.ts new file mode 100644 index 000000000..7807150fc --- /dev/null +++ b/lib/input-device-config.ts @@ -0,0 +1,72 @@ +export const DEFAULT_ROLE_INPUT_DEVICE = 'xunfei'; + +const VALID_INPUT_DEVICE_VALUES = ['xunfei', 'generic', 'primebot', 'browser'] as const; +const SERVER_ROOM_INPUT_DEVICE_VALUES = ['xunfei', 'generic'] as const; + +export const VALID_INPUT_DEVICES: ReadonlySet = new Set(VALID_INPUT_DEVICE_VALUES); +export const SERVER_ROOM_INPUT_DEVICES: ReadonlySet = new Set( + SERVER_ROOM_INPUT_DEVICE_VALUES +); + +export interface RoleInputDeviceOptions { + inputSource?: string | null; + audioInputDevice?: string | null; + visionInputDevice?: string | null; + outputDevice?: string | null; +} + +export interface ResolvedRoleInputDevices { + inputSource: string; + audioInputDevice: string; + visionInputDevice: string; + outputDevice: string; +} + +export function normalizeInputSource(inputSource?: string | null): string { + const normalized = (inputSource || '').trim().toLowerCase(); + return normalized || 'browser'; +} + +export function normalizeRoleInputDevice( + inputDevice: string | null | undefined, + fallback: string +): string { + const normalized = (inputDevice || '').trim().toLowerCase(); + if (VALID_INPUT_DEVICES.has(normalized)) { + return normalized; + } + return fallback; +} + +export function usesServerRoomInputDevice(inputDevice: string): boolean { + return SERVER_ROOM_INPUT_DEVICES.has(inputDevice); +} + +export function resolveRoleInputDevices({ + inputSource, + audioInputDevice, + visionInputDevice, + outputDevice, +}: RoleInputDeviceOptions = {}): ResolvedRoleInputDevices { + const normalizedInputSource = normalizeInputSource(inputSource); + const isMixedInputSource = normalizedInputSource === 'mixed'; + const baseInputDevice = isMixedInputSource + ? DEFAULT_ROLE_INPUT_DEVICE + : normalizeRoleInputDevice(normalizedInputSource, DEFAULT_ROLE_INPUT_DEVICE); + const resolvedAudioInputDevice = isMixedInputSource + ? normalizeRoleInputDevice(audioInputDevice, baseInputDevice) + : baseInputDevice; + const resolvedVisionInputDevice = isMixedInputSource + ? normalizeRoleInputDevice(visionInputDevice, baseInputDevice) + : baseInputDevice; + const resolvedOutputDevice = isMixedInputSource + ? normalizeRoleInputDevice(outputDevice, baseInputDevice) + : baseInputDevice; + + return { + inputSource: normalizedInputSource, + audioInputDevice: resolvedAudioInputDevice, + visionInputDevice: resolvedVisionInputDevice, + outputDevice: resolvedOutputDevice, + }; +} diff --git a/lib/observability.ts b/lib/observability.ts new file mode 100644 index 000000000..3dac10f80 --- /dev/null +++ b/lib/observability.ts @@ -0,0 +1,214 @@ +export const OBSERVABILITY_EVENT_TYPES = { + FRONTEND_EVENT: 'observability.frontend_event', + BACKEND_MARKER: 'observability.backend_marker', +} as const; + +export const FRONTEND_EVENTS = { + ROOM_CONNECTED: 'frontend.room.connected', + BROWSER_AUDIO_TRACK_PUBLISHED: 'frontend.browser_audio.track_published', + BROWSER_AUDIO_TRACK_UNPUBLISHED: 'frontend.browser_audio.track_unpublished', + BROWSER_AUDIO_TRACK_MUTED: 'frontend.browser_audio.track_muted', + BROWSER_AUDIO_TRACK_UNMUTED: 'frontend.browser_audio.track_unmuted', + BROWSER_AUDIO_VAD_SPEECH_STARTED: 'frontend.browser_audio.vad_speech_started', + BROWSER_AUDIO_VAD_SPEECH_ENDED: 'frontend.browser_audio.vad_speech_ended', + BROWSER_AUDIO_VAD_PROBE_UNAVAILABLE: 'frontend.browser_audio.vad_probe_unavailable', + REPLY_AUDIO_PLAYBACK_STARTED: 'frontend.reply_audio.playback_started', + REPLY_AUDIO_PLAYBACK_ENDED: 'frontend.reply_audio.playback_ended', + REPLY_AUDIO_PLAYBACK_ERROR: 'frontend.reply_audio.playback_error', +} as const; + +export const BACKEND_MARKERS = { + OUTPUT_AUDIO_SEGMENT_STARTED: 'backend.output_audio.segment_started', + OUTPUT_AUDIO_SEGMENT_FINISHED: 'backend.output_audio.segment_finished', + OUTPUT_AUDIO_PLAYBACK_FINISHED: 'backend.output_audio.playback_finished', +} as const; + +export const OBSERVABILITY_ATTRS = { + TURN_ID: 'observability.turn_id', + OUTPUT_SEGMENT_ID: 'observability.output_segment_id', + OUTPUT_SEGMENT_INDEX: 'observability.output_segment_index', + OUTPUT_SEGMENT_KIND: 'observability.output_segment_kind', + PARTICIPANT_IDENTITY: 'livekit.participant_identity', + PARTICIPANT_IDENTITY_LEGACY: 'livekit.participant', + FRONTEND_AUDIO_DIRECTION: 'observability.frontend_audio.direction', + FRONTEND_AUDIO_PROBE: 'observability.frontend_audio.probe', + FRONTEND_AUDIO_LEVEL: 'observability.frontend_audio.level', + FRONTEND_AUDIO_REASON: 'observability.frontend_audio.reason', + FRONTEND_AUDIO_ERROR: 'observability.frontend_audio.error', + VAD_PROVIDER: 'observability.vad.provider', + VAD_MODEL: 'observability.vad.model', + VAD_AUDIO_DURATION_MS: 'observability.vad.audio_duration_ms', + TRACK_NAME: 'livekit.track_name', + TRACK_SID: 'livekit.track_sid', + TRACK_SOURCE: 'livekit.track_source', + TRACK_STREAM_NAME: 'livekit.stream_name', +} as const; + +export const FRONTEND_OBSERVABILITY_TOPIC = OBSERVABILITY_EVENT_TYPES.FRONTEND_EVENT; +export const BACKEND_OBSERVABILITY_MARKER_TOPIC = OBSERVABILITY_EVENT_TYPES.BACKEND_MARKER; + +export type ObservabilityAttribute = string | number | boolean | null; +export type ObservabilityAttributes = Record; + +export interface BackendObservabilityMarker { + name: string; + attributes: ObservabilityAttributes; +} + +const MAX_BACKEND_MARKER_NAME_LENGTH = 128; + +type PublishableRoom = { + name?: string; + localParticipant?: { + identity?: string; + publishData?: ( + data: Uint8Array, + options?: { reliable?: boolean; topic?: string } + ) => Promise | void; + }; +}; + +interface PublishFrontendObservabilityEventOptions { + enabled: boolean; + room: PublishableRoom; + name: string; + attributes?: Record; + wallTimeUnixMs?: number; + now?: () => number; + performanceNow?: () => number; +} + +export async function publishFrontendObservabilityEvent({ + enabled, + room, + name, + attributes, + wallTimeUnixMs, + now = () => Date.now(), + performanceNow = defaultPerformanceNow, +}: PublishFrontendObservabilityEventOptions) { + if (!enabled) { + return false; + } + if (typeof room.localParticipant?.publishData !== 'function') { + return false; + } + + const payload = { + schema_version: 1, + type: OBSERVABILITY_EVENT_TYPES.FRONTEND_EVENT, + name, + wall_time_unix_ms: wallTimeUnixMs ?? now(), + performance_now_ms: performanceNow(), + room_name: room.name || undefined, + participant_identity: room.localParticipant?.identity || undefined, + attributes: attributes ?? {}, + }; + await room.localParticipant.publishData(new TextEncoder().encode(JSON.stringify(payload)), { + reliable: true, + topic: FRONTEND_OBSERVABILITY_TOPIC, + }); + return true; +} + +export function parseBackendObservabilityMarkerPayload( + payload: Uint8Array | string, + topic?: string +): BackendObservabilityMarker | null { + if (topic !== BACKEND_OBSERVABILITY_MARKER_TOPIC) { + return null; + } + + let decoded = ''; + try { + decoded = typeof payload === 'string' ? payload : new TextDecoder().decode(payload); + } catch { + return null; + } + + let parsed: unknown; + try { + parsed = JSON.parse(decoded); + } catch { + return null; + } + if (!parsed || typeof parsed !== 'object') { + return null; + } + + const packet = parsed as { + schema_version?: unknown; + type?: unknown; + name?: unknown; + attributes?: unknown; + }; + if (packet.schema_version !== 1 || packet.type !== OBSERVABILITY_EVENT_TYPES.BACKEND_MARKER) { + return null; + } + const name = typeof packet.name === 'string' ? packet.name.trim() : ''; + if (!name.startsWith('backend.')) { + return null; + } + if (name.length > MAX_BACKEND_MARKER_NAME_LENGTH) { + return null; + } + + return { + name, + attributes: sanitizeObservabilityAttributes(packet.attributes), + }; +} + +export function outputSegmentAttributesFromMarker( + marker: BackendObservabilityMarker | null | undefined +): ObservabilityAttributes { + if (!marker) { + return {}; + } + const output: ObservabilityAttributes = {}; + for (const key of [ + OBSERVABILITY_ATTRS.TURN_ID, + OBSERVABILITY_ATTRS.OUTPUT_SEGMENT_ID, + OBSERVABILITY_ATTRS.OUTPUT_SEGMENT_INDEX, + OBSERVABILITY_ATTRS.OUTPUT_SEGMENT_KIND, + ]) { + const value = marker.attributes[key]; + if (value !== undefined) { + output[key] = value; + } + } + return output; +} + +function sanitizeObservabilityAttributes(value: unknown): ObservabilityAttributes { + if (!value || typeof value !== 'object') { + return {}; + } + if (Array.isArray(value)) { + return {}; + } + const output: ObservabilityAttributes = {}; + for (const [key, rawValue] of Object.entries(value as Record)) { + if (!key) { + continue; + } + if ( + rawValue === null || + typeof rawValue === 'string' || + typeof rawValue === 'number' || + typeof rawValue === 'boolean' + ) { + output[key] = rawValue; + } else { + output[key] = String(rawValue); + } + } + return output; +} + +function defaultPerformanceNow() { + if (typeof performance === 'undefined') { + return 0; + } + return performance.now(); +} diff --git a/lib/session-stop.ts b/lib/session-stop.ts index ee410dc85..ee3a23b6d 100644 --- a/lib/session-stop.ts +++ b/lib/session-stop.ts @@ -1,3 +1,24 @@ +import { resolveRoleInputDevices, usesServerRoomInputDevice } from './input-device-config'; + +export type RoomInputControlAction = 'start' | 'stop'; + +export interface ResolveRoomInputStopUrlsOptions { + inputSource?: string | null; + audioInputDevice?: string | null; + visionInputDevice?: string | null; + /** + * Room-input control URLs are configured as base endpoint paths. The + * normalizer intentionally strips query/hash fragments when switching + * between /start and /stop so stop calls do not inherit start-only params. + */ + roomAudioInputUrl?: string | null; + roomVisionInputUrl?: string | null; + roomInputUrl?: string | null; + frontdeskInputParticipantUrl?: string | null; + faceServiceUrl?: string | null; + genericCameraParticipantUrl?: string | null; +} + export function resolveLiveKitHttpUrl(liveKitUrl?: string | null): string | undefined { const normalized = liveKitUrl?.trim(); if (!normalized) { @@ -11,3 +32,93 @@ export function resolveLiveKitHttpUrl(liveKitUrl?: string | null): string | unde } return normalized; } + +function addRoomInputStopUrl(urls: Set, rawUrl?: string | null): void { + const stopUrl = normalizeRoomInputControlUrl(rawUrl || '', 'stop'); + if (stopUrl) { + urls.add(stopUrl); + } +} + +export function normalizeRoomInputControlUrl( + rawUrl: string, + action: RoomInputControlAction +): string { + const value = rawUrl.trim(); + if (!value) { + return ''; + } + + try { + const url = new URL(value); + const pathname = url.pathname.replace(/\/+$/, ''); + const otherAction = action === 'stop' ? 'start' : 'stop'; + if (pathname.endsWith(`/${otherAction}`)) { + url.pathname = `${pathname.slice(0, -1 * (otherAction.length + 1))}/${action}`; + } else if (!pathname.endsWith(`/${action}`)) { + url.pathname = `${pathname}/${action}`; + } + // Control URLs are base endpoint paths; query/hash fragments are not part + // of the room-input stop contract and should not be carried across actions. + url.search = ''; + url.hash = ''; + return url.toString(); + } catch { + const withoutTrailingSlash = value.replace(/\/+$/, ''); + if (withoutTrailingSlash.endsWith(`/${action}`)) { + return withoutTrailingSlash; + } + + const otherAction = action === 'stop' ? 'start' : 'stop'; + if (withoutTrailingSlash.endsWith(`/${otherAction}`)) { + return `${withoutTrailingSlash.slice(0, -1 * (otherAction.length + 1))}/${action}`; + } + return `${withoutTrailingSlash}/${action}`; + } +} + +export function resolveRoomInputStopUrls({ + inputSource, + audioInputDevice, + visionInputDevice, + roomAudioInputUrl, + roomVisionInputUrl, + roomInputUrl, + frontdeskInputParticipantUrl, + faceServiceUrl, + genericCameraParticipantUrl, +}: ResolveRoomInputStopUrlsOptions): string[] { + const { + audioInputDevice: resolvedAudioInputDevice, + visionInputDevice: resolvedVisionInputDevice, + } = resolveRoleInputDevices({ + inputSource, + audioInputDevice, + visionInputDevice, + }); + + const urls = new Set(); + const selectedServerDevices = new Set(); + + if (usesServerRoomInputDevice(resolvedAudioInputDevice)) { + selectedServerDevices.add(resolvedAudioInputDevice); + addRoomInputStopUrl(urls, roomAudioInputUrl || roomInputUrl); + } + if (usesServerRoomInputDevice(resolvedVisionInputDevice)) { + selectedServerDevices.add(resolvedVisionInputDevice); + addRoomInputStopUrl(urls, roomVisionInputUrl || roomInputUrl); + } + if (selectedServerDevices.size === 0) { + return []; + } + + if (selectedServerDevices.has('xunfei')) { + addRoomInputStopUrl(urls, frontdeskInputParticipantUrl); + addRoomInputStopUrl(urls, faceServiceUrl); + } + if (selectedServerDevices.has('generic')) { + addRoomInputStopUrl(urls, genericCameraParticipantUrl); + } + + return [...urls]; +} diff --git a/lib/utils.ts b/lib/utils.ts index 38d87ed4f..660a5512a 100644 --- a/lib/utils.ts +++ b/lib/utils.ts @@ -304,6 +304,10 @@ export function getClientConfigFromEnv(): AppConfig { ['DEBUG_VISION', 'NEXT_PUBLIC_DEBUG_VISION', 'NEXT_PUBLIC_LEXVOICE_DEBUG_VISION'], ['DEBUG_VIDEO', 'NEXT_PUBLIC_DEBUG_VIDEO', 'NEXT_PUBLIC_LEXVOICE_DEBUG_VIDEO'] ), + observabilityEnabled: readBooleanEnv( + APP_CONFIG_DEFAULTS.observabilityEnabled ?? false, + 'OBSERVABILITY_ENABLED' + ), }; } diff --git a/lib/video-preview-selection.ts b/lib/video-preview-selection.ts new file mode 100644 index 000000000..5d31f2967 --- /dev/null +++ b/lib/video-preview-selection.ts @@ -0,0 +1,39 @@ +import type { VideoTrackConfig } from '@/app-config'; + +type PreviewTrackType = VideoTrackConfig['type'] | null | undefined; + +interface ResolveCameraPreviewTrackParams { + selectedTrack: T | null | undefined; + selectedTrackId: string | null | undefined; + selectedTrackType?: PreviewTrackType; + canShowDefaultCameraPreview: boolean; + configuredCameraTrack?: T; + defaultCameraTrack?: T; +} + +export function resolveCameraPreviewTrack({ + selectedTrack, + selectedTrackId, + selectedTrackType, + canShowDefaultCameraPreview, + configuredCameraTrack, + defaultCameraTrack, +}: ResolveCameraPreviewTrackParams): T | undefined { + if (selectedTrack) { + return selectedTrack; + } + + if (!canShowDefaultCameraPreview) { + return undefined; + } + + if (!selectedTrackId) { + return configuredCameraTrack ?? defaultCameraTrack; + } + + if (selectedTrackType === 'system') { + return defaultCameraTrack; + } + + return undefined; +} diff --git a/package.json b/package.json index dc7484700..92b4fdb67 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,9 @@ "start": "next start", "lint": "eslint .", "test": "tsx --test tests/*.test.mjs", + "sync:vad-assets": "node scripts/sync-vad-assets.js", + "postinstall": "node scripts/sync-vad-assets.js", + "prebuild": "node scripts/sync-vad-assets.js", "format": "prettier --write .", "format:check": "prettier --check .", "debug:on": "node scripts/toggle-debug.js on", @@ -21,6 +24,7 @@ "@radix-ui/react-select": "^2.2.5", "@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-toggle": "^1.1.9", + "@ricky0123/vad-web": "^0.0.30", "buffer-image-size": "^0.6.4", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", @@ -31,6 +35,7 @@ "motion": "^12.16.0", "next": "15.5.2", "next-themes": "^0.4.6", + "onnxruntime-web": "^1.27.0", "react": "^19.0.0", "react-dom": "^19.0.0", "sonner": "^2.0.3", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e1c3a0314..e9818bec2 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -26,6 +26,9 @@ importers: '@radix-ui/react-toggle': specifier: ^1.1.9 version: 1.1.10(@types/react-dom@19.1.9(@types/react@19.1.12))(@types/react@19.1.12)(react-dom@19.1.1(react@19.1.1))(react@19.1.1) + '@ricky0123/vad-web': + specifier: ^0.0.30 + version: 0.0.30 buffer-image-size: specifier: ^0.6.4 version: 0.6.4 @@ -56,6 +59,9 @@ importers: next-themes: specifier: ^0.4.6 version: 0.4.6(react-dom@19.1.1(react@19.1.1))(react@19.1.1) + onnxruntime-web: + specifier: ^1.27.0 + version: 1.27.0 react: specifier: ^19.0.0 version: 19.1.1 @@ -674,6 +680,33 @@ packages: resolution: {integrity: sha512-QNqXyfVS2wm9hweSYD2O7F0G06uurj9kZ96TRQE5Y9hU7+tgdZwIkbAKc5Ocy1HxEY2kuDQa6cQ1WRs/O5LFKA==} engines: {node: ^12.20.0 || ^14.18.0 || >=16.0.0} + '@protobufjs/aspromise@1.1.2': + resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} + + '@protobufjs/base64@1.1.2': + resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} + + '@protobufjs/codegen@2.0.5': + resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==} + + '@protobufjs/eventemitter@1.1.1': + resolution: {integrity: sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==} + + '@protobufjs/fetch@1.1.1': + resolution: {integrity: sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==} + + '@protobufjs/float@1.0.2': + resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} + + '@protobufjs/path@1.1.2': + resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} + + '@protobufjs/pool@1.1.0': + resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} + + '@protobufjs/utf8@1.1.1': + resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==} + '@radix-ui/number@1.1.1': resolution: {integrity: sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g==} @@ -939,6 +972,9 @@ packages: '@radix-ui/rect@1.1.1': resolution: {integrity: sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==} + '@ricky0123/vad-web@0.0.30': + resolution: {integrity: sha512-cJyYrh4YeeUBJcbR9Bic/bFDyB9qBkAepvpuWM3vLxnAi7bC3VHzf51UeNdT+OtY4D7MLAgV8iJMc4z41ZnaWg==} + '@rtsao/scc@1.1.0': resolution: {integrity: sha512-zt6OdqaDoOnJ1ZYsCYGt9YmWzDXl4vQdKTyJev62gFhRGKdx7mcT54V9KIjg+d2wi9EXsPvAPKe7i7WjfVWB8g==} @@ -1677,6 +1713,9 @@ packages: resolution: {integrity: sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==} engines: {node: '>=16'} + flatbuffers@25.9.23: + resolution: {integrity: sha512-MI1qs7Lo4Syw0EOzUl0xjs2lsoeqFku44KpngfIduHBYvzm8h2+7K8YMQh1JtVVVrUvhLpNwqVi4DERegUJhPQ==} + flatted@3.3.3: resolution: {integrity: sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==} @@ -1762,6 +1801,9 @@ packages: graphemer@1.4.0: resolution: {integrity: sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==} + guid-typescript@1.0.9: + resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==} + has-bigints@1.1.0: resolution: {integrity: sha512-R3pbpkcIqv2Pm3dUwgjclDRVmWpTJW2DcMzcIhEXEx1oh/CEMObMm3KLmRJOdvhM7o4uQBnwr8pzRK2sJWIqfg==} engines: {node: '>= 0.4'} @@ -2075,6 +2117,9 @@ packages: resolution: {integrity: sha512-HgMmCqIJSAKqo68l0rS2AanEWfkxaZ5wNiEFb5ggm08lDs9Xl2KxBlX3PTcaD2chBM1gXAYf491/M2Rv8Jwayg==} engines: {node: '>= 0.6.0'} + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + loose-envify@1.4.0: resolution: {integrity: sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==} hasBin: true @@ -2221,6 +2266,12 @@ packages: resolution: {integrity: sha512-gXah6aZrcUxjWg2zR2MwouP2eHlCBzdV4pygudehaKXSGW4v2AsRQUK+lwwXhii6KFZcunEnmSUoYp5CXibxtA==} engines: {node: '>= 0.4'} + onnxruntime-common@1.27.0: + resolution: {integrity: sha512-3KxL5wIVqa8Ex08jxSzncm9CMgw8CjOFyOQ7SxvG9o0cVLlhTNKXyIQuTbtX4tGPJEf73OER2xrjt4HJSBL4ow==} + + onnxruntime-web@1.27.0: + resolution: {integrity: sha512-ogDLsqIozHZwifPuN37OproAo0byX6t43/bP8GzeZWBWD6MOGExswFAx3up4NS/vvWBOg2u2PXomDt3rMmdQSg==} + optionator@0.9.4: resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==} engines: {node: '>= 0.8.0'} @@ -2263,6 +2314,9 @@ packages: resolution: {integrity: sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==} engines: {node: '>=12'} + platform@1.3.6: + resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==} + possible-typed-array-names@1.1.0: resolution: {integrity: sha512-/+5VFTchJDoVj3bhoqi6UeymcD00DAwb1nJwamzPvHEszJ4FpF6SNNbUbOS8yI56qHzdV8eK0qEfOSiodkTdxg==} engines: {node: '>= 0.4'} @@ -2352,6 +2406,10 @@ packages: prop-types@15.8.1: resolution: {integrity: sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==} + protobufjs@7.6.4: + resolution: {integrity: sha512-RJJPTTpvFfHcWLkIa2JFWK4XvtSzS0yEWDmunqHXli1h3JlkbcQZXDZdcWxv+JK3Xsl5/UFDPZ0iGm7DAengYw==} + engines: {node: '>=12.0.0'} + punycode@2.3.1: resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==} engines: {node: '>=6'} @@ -3178,6 +3236,26 @@ snapshots: '@pkgr/core@0.2.9': {} + '@protobufjs/aspromise@1.1.2': {} + + '@protobufjs/base64@1.1.2': {} + + '@protobufjs/codegen@2.0.5': {} + + '@protobufjs/eventemitter@1.1.1': {} + + '@protobufjs/fetch@1.1.1': + dependencies: + '@protobufjs/aspromise': 1.1.2 + + '@protobufjs/float@1.0.2': {} + + '@protobufjs/path@1.1.2': {} + + '@protobufjs/pool@1.1.0': {} + + '@protobufjs/utf8@1.1.1': {} + '@radix-ui/number@1.1.1': {} '@radix-ui/primitive@1.1.3': {} @@ -3407,6 +3485,10 @@ snapshots: '@radix-ui/rect@1.1.1': {} + '@ricky0123/vad-web@0.0.30': + dependencies: + onnxruntime-web: 1.27.0 + '@rtsao/scc@1.1.0': {} '@rushstack/eslint-patch@1.11.0': {} @@ -4319,6 +4401,8 @@ snapshots: flatted: 3.3.3 keyv: 4.5.4 + flatbuffers@25.9.23: {} + flatted@3.3.3: {} for-each@0.3.5: @@ -4403,6 +4487,8 @@ snapshots: graphemer@1.4.0: {} + guid-typescript@1.0.9: {} + has-bigints@1.1.0: {} has-flag@4.0.0: {} @@ -4697,6 +4783,8 @@ snapshots: loglevel@1.9.2: {} + long@5.3.2: {} + loose-envify@1.4.0: dependencies: js-tokens: 4.0.0 @@ -4828,6 +4916,17 @@ snapshots: define-properties: 1.2.1 es-object-atoms: 1.1.1 + onnxruntime-common@1.27.0: {} + + onnxruntime-web@1.27.0: + dependencies: + flatbuffers: 25.9.23 + guid-typescript: 1.0.9 + long: 5.3.2 + onnxruntime-common: 1.27.0 + platform: 1.3.6 + protobufjs: 7.6.4 + optionator@0.9.4: dependencies: deep-is: 0.1.4 @@ -4867,6 +4966,8 @@ snapshots: picomatch@4.0.2: {} + platform@1.3.6: {} + possible-typed-array-names@1.1.0: {} postcss@8.4.31: @@ -4901,6 +5002,20 @@ snapshots: object-assign: 4.1.1 react-is: 16.13.1 + protobufjs@7.6.4: + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/base64': 1.1.2 + '@protobufjs/codegen': 2.0.5 + '@protobufjs/eventemitter': 1.1.1 + '@protobufjs/fetch': 1.1.1 + '@protobufjs/float': 1.0.2 + '@protobufjs/path': 1.1.2 + '@protobufjs/pool': 1.1.0 + '@protobufjs/utf8': 1.1.1 + '@types/node': 22.18.1 + long: 5.3.2 + punycode@2.3.1: {} queue-microtask@1.2.3: {} diff --git a/scripts/sync-vad-assets.js b/scripts/sync-vad-assets.js new file mode 100644 index 000000000..aa9ede2cc --- /dev/null +++ b/scripts/sync-vad-assets.js @@ -0,0 +1,58 @@ +#!/usr/bin/env node + +const fs = require('fs'); +const path = require('path'); + +const rootDir = path.resolve(__dirname, '..'); +const publicDir = path.join(rootDir, 'public'); + +function packageDir(packageName) { + try { + return path.dirname(require.resolve(`${packageName}/package.json`)); + } catch { + let currentDir = path.dirname(require.resolve(packageName)); + while (currentDir !== path.dirname(currentDir)) { + if (fs.existsSync(path.join(currentDir, 'package.json'))) { + return currentDir; + } + currentDir = path.dirname(currentDir); + } + throw new Error(`Unable to locate package directory for ${packageName}`); + } +} + +function copyFile(source, destination) { + fs.mkdirSync(path.dirname(destination), { recursive: true }); + fs.copyFileSync(source, destination); +} + +function copyNamedFiles(sourceDir, destinationDir, filenames) { + for (const filename of filenames) { + copyFile(path.join(sourceDir, filename), path.join(destinationDir, filename)); + } +} + +function syncVadWebAssets() { + const sourceDir = path.join(packageDir('@ricky0123/vad-web'), 'dist'); + const destinationDir = path.join(publicDir, 'vad-web'); + + copyNamedFiles(sourceDir, destinationDir, [ + 'silero_vad_v5.onnx', + 'silero_vad_legacy.onnx', + 'vad.worklet.bundle.min.js', + ]); +} + +function syncOnnxRuntimeAssets() { + const sourceDir = path.join(packageDir('onnxruntime-web'), 'dist'); + const destinationDir = path.join(publicDir, 'onnxruntime-web'); + + copyNamedFiles(sourceDir, destinationDir, [ + 'ort-wasm-simd-threaded.mjs', + 'ort-wasm-simd-threaded.wasm', + ]); +} + +syncVadWebAssets(); +syncOnnxRuntimeAssets(); +console.log('Synced local VAD assets into public/.'); diff --git a/tests/chat-message-filter.test.mjs b/tests/chat-message-filter.test.mjs new file mode 100644 index 000000000..9f3074b99 --- /dev/null +++ b/tests/chat-message-filter.test.mjs @@ -0,0 +1,21 @@ +import assert from 'node:assert/strict'; +import { readFile } from 'node:fs/promises'; +import { test } from 'node:test'; + +const { isRenderableChatMessage } = await import('../lib/chat-message-filter.ts'); + +test('chat message filter hides empty transcription messages', () => { + assert.equal(isRenderableChatMessage({ message: '' }), false); + assert.equal(isRenderableChatMessage({ message: ' \n\t ' }), false); +}); + +test('chat message filter keeps normal chat text', () => { + assert.equal(isRenderableChatMessage({ message: '天气怎么样?' }), true); +}); + +test('chat message hook filters empty merged messages before sorting', async () => { + const source = await readFile('hooks/useChatMessages.ts', 'utf8'); + + assert.match(source, /isRenderableChatMessage/); + assert.match(source, /\.filter\(isRenderableChatMessage\)/); +}); diff --git a/tests/frontend-audio-observer.test.mjs b/tests/frontend-audio-observer.test.mjs new file mode 100644 index 000000000..6af552dcf --- /dev/null +++ b/tests/frontend-audio-observer.test.mjs @@ -0,0 +1,282 @@ +import assert from 'node:assert/strict'; +import { test } from 'node:test'; + +const { createAudioActivityDetector, startMediaTrackAudioObserver } = await import( + '../lib/frontend-audio-observer.ts' +); + +test('audio activity detector emits start and end after configured windows', () => { + const events = []; + let now = 0; + const levels = [0.001, 0.004, 0.022, 0.028, 0.018, 0.003, 0.002, 0.001, 0.001]; + const detector = createAudioActivityDetector({ + startThreshold: 0.015, + endThreshold: 0.006, + startDurationMs: 40, + endSilenceMs: 100, + readLevel: () => levels.shift() ?? 0, + now: () => now, + onStart: (event) => events.push(['start', event]), + onEnd: (event) => events.push(['end', event]), + }); + + for (let index = 0; index < 9; index += 1) { + detector.sample(); + now += 50; + } + + assert.equal(events.length, 2); + assert.equal(events[0][0], 'start'); + assert.equal(events[0][1].timestampMs, 150); + assert.equal(events[1][0], 'end'); + assert.equal(events[1][1].timestampMs, 350); +}); + +test('audio activity detector closes an active segment when stopped', () => { + const events = []; + let now = 0; + const detector = createAudioActivityDetector({ + startThreshold: 0.015, + endThreshold: 0.006, + startDurationMs: 0, + endSilenceMs: 100, + readLevel: () => 0.03, + now: () => now, + onStart: (event) => events.push(['start', event]), + onEnd: (event) => events.push(['end', event]), + }); + + detector.sample(); + now = 250; + detector.stop({ emitEnd: true }); + + assert.equal(events.length, 2); + assert.equal(events[0][0], 'start'); + assert.equal(events[1][0], 'end'); + assert.equal(events[1][1].reason, 'stop'); +}); + +test('media track audio observer stop is idempotent', () => { + const originalWindow = globalThis.window; + const originalMediaStream = globalThis.MediaStream; + let disconnectCount = 0; + let closeCount = 0; + const track = { + addEventListener() {}, + removeEventListener() {}, + }; + + class FakeAudioContext { + createAnalyser() { + return { + fftSize: 0, + getFloatTimeDomainData(samples) { + samples.fill(0); + }, + }; + } + + createMediaStreamSource() { + return { + connect() {}, + disconnect() { + disconnectCount += 1; + if (disconnectCount > 1) { + throw new Error('disconnect called twice'); + } + }, + }; + } + + resume() { + return Promise.resolve(); + } + + close() { + closeCount += 1; + return Promise.resolve(); + } + } + + globalThis.MediaStream = class FakeMediaStream { + constructor(tracks) { + this.tracks = tracks; + } + }; + globalThis.window = { + AudioContext: FakeAudioContext, + setInterval, + clearInterval, + }; + + try { + const observer = startMediaTrackAudioObserver({ + mediaStreamTrack: track, + startEventName: 'start', + endEventName: 'end', + emit() {}, + }); + + observer.stop(); + observer.stop(); + + assert.equal(disconnectCount, 1); + assert.equal(closeCount, 1); + } finally { + globalThis.window = originalWindow; + globalThis.MediaStream = originalMediaStream; + } +}); + +test('media track audio observer leaves caller-owned AudioContext open', () => { + const originalWindow = globalThis.window; + const originalMediaStream = globalThis.MediaStream; + let closeCount = 0; + const track = { + addEventListener() {}, + removeEventListener() {}, + }; + + class FakeAudioContext { + createAnalyser() { + return { + fftSize: 0, + getFloatTimeDomainData(samples) { + samples.fill(0); + }, + }; + } + + createMediaStreamSource() { + return { + connect() {}, + disconnect() {}, + }; + } + + resume() { + return Promise.resolve(); + } + + close() { + closeCount += 1; + return Promise.resolve(); + } + } + + globalThis.MediaStream = class FakeMediaStream { + constructor(tracks) { + this.tracks = tracks; + } + }; + globalThis.window = { + AudioContext: FakeAudioContext, + setInterval, + clearInterval, + }; + + try { + const sharedAudioContext = new FakeAudioContext(); + const observer = startMediaTrackAudioObserver({ + mediaStreamTrack: track, + startEventName: 'start', + endEventName: 'end', + emit() {}, + sharedAudioContext, + }); + + observer.stop(); + + assert.equal(closeCount, 0); + } finally { + globalThis.window = originalWindow; + globalThis.MediaStream = originalMediaStream; + } +}); + +test('media track audio observer emits resume failure when configured', async () => { + const originalWindow = globalThis.window; + const originalMediaStream = globalThis.MediaStream; + const originalConsoleWarn = console.warn; + const events = []; + let clearIntervalCount = 0; + let closeCount = 0; + let disconnectCount = 0; + const track = { + addEventListener() {}, + removeEventListener() {}, + }; + + class FakeAudioContext { + createAnalyser() { + return { + fftSize: 0, + getFloatTimeDomainData(samples) { + samples.fill(0); + }, + }; + } + + createMediaStreamSource() { + return { + connect() {}, + disconnect() { + disconnectCount += 1; + }, + }; + } + + resume() { + return Promise.reject(new Error('resume blocked')); + } + + close() { + closeCount += 1; + return Promise.resolve(); + } + } + + globalThis.MediaStream = class FakeMediaStream { + constructor(tracks) { + this.tracks = tracks; + } + }; + globalThis.window = { + AudioContext: FakeAudioContext, + setInterval: () => 123, + clearInterval: () => { + clearIntervalCount += 1; + }, + }; + console.warn = () => {}; + + try { + const observer = startMediaTrackAudioObserver({ + mediaStreamTrack: track, + startEventName: 'start', + endEventName: 'end', + resumeErrorEventName: 'resume-error', + emit: (name, attributes) => events.push([name, attributes]), + attributes: { 'livekit.track_sid': 'TR_A' }, + }); + + await new Promise((resolve) => setTimeout(resolve, 0)); + observer.stop(); + + assert.equal(events.length, 1); + assert.equal(events[0][0], 'resume-error'); + assert.equal(events[0][1]['livekit.track_sid'], 'TR_A'); + assert.equal( + events[0][1]['observability.frontend_audio.reason'], + 'audio-context-resume-failed' + ); + assert.equal(events[0][1]['observability.frontend_audio.error'], 'resume blocked'); + assert.equal(clearIntervalCount, 1); + assert.equal(disconnectCount, 1); + assert.equal(closeCount, 1); + } finally { + console.warn = originalConsoleWarn; + globalThis.window = originalWindow; + globalThis.MediaStream = originalMediaStream; + } +}); diff --git a/tests/frontend-vad-observer.test.mjs b/tests/frontend-vad-observer.test.mjs new file mode 100644 index 000000000..088208d6a --- /dev/null +++ b/tests/frontend-vad-observer.test.mjs @@ -0,0 +1,151 @@ +import assert from 'node:assert/strict'; +import { test } from 'node:test'; + +const { startMediaTrackVadObserver } = await import('../lib/frontend-vad-observer.ts'); + +test('media track vad observer uses the supplied track and emits speech events', async () => { + const originalMediaStream = globalThis.MediaStream; + const events = []; + let sourceTrackStopCount = 0; + let endedListener; + const track = { + id: 'browser-audio-track', + addEventListener(eventName, listener) { + if (eventName === 'ended') endedListener = listener; + }, + removeEventListener(eventName, listener) { + if (eventName === 'ended' && endedListener === listener) endedListener = undefined; + }, + stop() { + sourceTrackStopCount += 1; + }, + }; + let capturedOptions; + let startCount = 0; + let pauseCount = 0; + let destroyCount = 0; + let now = 10_000; + + globalThis.MediaStream = class FakeMediaStream { + constructor(tracks) { + this.tracks = tracks; + } + }; + + try { + const observer = await startMediaTrackVadObserver({ + mediaStreamTrack: track, + now: () => now, + createMicVad: async (options) => { + capturedOptions = options; + return { + start() { + startCount += 1; + }, + pause() { + pauseCount += 1; + }, + destroy() { + destroyCount += 1; + }, + }; + }, + onSpeechStart: (event) => events.push(['start', event]), + onSpeechEnd: (event) => events.push(['end', event]), + }); + + const stream = await capturedOptions.getStream(); + assert.deepEqual(stream.tracks, [track]); + assert.equal(capturedOptions.model, 'v5'); + assert.equal(capturedOptions.startOnLoad, false); + assert.equal(capturedOptions.baseAssetPath, '/vad-web/'); + assert.equal(capturedOptions.onnxWASMBasePath, '/onnxruntime-web/'); + await capturedOptions.pauseStream(stream); + assert.equal(sourceTrackStopCount, 0); + assert.equal(await capturedOptions.resumeStream(stream), stream); + assert.equal(startCount, 1); + + now = 12_345; + capturedOptions.onSpeechStart(); + now = 12_450; + capturedOptions.onFrameProcessed({ isSpeech: 0.2 }, new Float32Array(512)); + now = 12_500; + capturedOptions.onFrameProcessed({ isSpeech: 0.8 }, new Float32Array(512)); + now = 12_700; + capturedOptions.onFrameProcessed({ isSpeech: 0.1 }, new Float32Array(512)); + now = 12_789; + capturedOptions.onSpeechEnd(new Float32Array(1600)); + + assert.deepEqual(events, [ + [ + 'start', + { + timestampMs: 12_345, + provider: 'vad-web', + model: 'silero_vad_v5', + }, + ], + [ + 'end', + { + timestampMs: 12_500, + provider: 'vad-web', + model: 'silero_vad_v5', + audioDurationMs: 100, + }, + ], + ]); + + assert.equal(typeof endedListener, 'function'); + await observer.stop(); + assert.equal(pauseCount, 1); + assert.equal(destroyCount, 1); + assert.equal(endedListener, undefined); + } finally { + globalThis.MediaStream = originalMediaStream; + } +}); + +test('media track vad observer destroys vad even when pause fails', async () => { + const originalMediaStream = globalThis.MediaStream; + let endedListener; + const track = { + addEventListener(eventName, listener) { + if (eventName === 'ended') endedListener = listener; + }, + removeEventListener(eventName, listener) { + if (eventName === 'ended' && endedListener === listener) endedListener = undefined; + }, + }; + let destroyCount = 0; + + globalThis.MediaStream = class FakeMediaStream { + constructor(tracks) { + this.tracks = tracks; + } + }; + + try { + const observer = await startMediaTrackVadObserver({ + mediaStreamTrack: track, + createMicVad: async () => ({ + start() {}, + pause() { + throw new Error('pause failed'); + }, + destroy() { + destroyCount += 1; + }, + }), + onSpeechStart: () => {}, + onSpeechEnd: () => {}, + }); + + await assert.rejects(observer.stop(), /pause failed/); + + assert.equal(destroyCount, 1); + assert.equal(endedListener, undefined); + } finally { + globalThis.MediaStream = originalMediaStream; + } +}); diff --git a/tests/observability.test.mjs b/tests/observability.test.mjs new file mode 100644 index 000000000..28cc69ec3 --- /dev/null +++ b/tests/observability.test.mjs @@ -0,0 +1,355 @@ +import assert from 'node:assert/strict'; +import { readFile } from 'node:fs/promises'; +import { test } from 'node:test'; + +const { + BACKEND_OBSERVABILITY_MARKER_TOPIC, + BACKEND_MARKERS, + FRONTEND_OBSERVABILITY_TOPIC, + FRONTEND_EVENTS, + OBSERVABILITY_ATTRS, + OBSERVABILITY_EVENT_TYPES, + outputSegmentAttributesFromMarker, + parseBackendObservabilityMarkerPayload, + publishFrontendObservabilityEvent, +} = await import('../lib/observability.ts'); + +test('frontend observability does not publish when disabled', async () => { + const calls = []; + const room = { + name: 'voice_assistant_room_a', + localParticipant: { + identity: 'voice_assistant_user_a', + publishData: async (...args) => { + calls.push(args); + }, + }, + }; + + const published = await publishFrontendObservabilityEvent({ + enabled: false, + room, + name: 'frontend.room.connected', + }); + + assert.equal(published, false); + assert.equal(calls.length, 0); +}); + +test('frontend observability exports shared event protocol constants', () => { + assert.equal(OBSERVABILITY_EVENT_TYPES.FRONTEND_EVENT, 'observability.frontend_event'); + assert.equal(OBSERVABILITY_EVENT_TYPES.BACKEND_MARKER, 'observability.backend_marker'); + assert.equal( + FRONTEND_EVENTS.REPLY_AUDIO_PLAYBACK_STARTED, + 'frontend.reply_audio.playback_started' + ); + assert.equal(FRONTEND_EVENTS.REPLY_AUDIO_PLAYBACK_ERROR, 'frontend.reply_audio.playback_error'); + assert.equal( + FRONTEND_EVENTS.BROWSER_AUDIO_VAD_SPEECH_ENDED, + 'frontend.browser_audio.vad_speech_ended' + ); + assert.equal( + BACKEND_MARKERS.OUTPUT_AUDIO_SEGMENT_STARTED, + 'backend.output_audio.segment_started' + ); + assert.equal(OBSERVABILITY_ATTRS.PARTICIPANT_IDENTITY, 'livekit.participant_identity'); + assert.equal(OBSERVABILITY_ATTRS.PARTICIPANT_IDENTITY_LEGACY, 'livekit.participant'); + assert.equal(OBSERVABILITY_ATTRS.OUTPUT_SEGMENT_ID, 'observability.output_segment_id'); + assert.equal(OBSERVABILITY_ATTRS.VAD_PROVIDER, 'observability.vad.provider'); + assert.equal(OBSERVABILITY_ATTRS.VAD_MODEL, 'observability.vad.model'); + assert.equal(OBSERVABILITY_ATTRS.VAD_AUDIO_DURATION_MS, 'observability.vad.audio_duration_ms'); + assert.equal(OBSERVABILITY_ATTRS.TRACK_NAME, 'livekit.track_name'); + assert.equal(OBSERVABILITY_ATTRS.TRACK_SID, 'livekit.track_sid'); + assert.equal(OBSERVABILITY_ATTRS.TRACK_SOURCE, 'livekit.track_source'); + assert.equal(OBSERVABILITY_ATTRS.TRACK_STREAM_NAME, 'livekit.stream_name'); +}); + +test('frontend observability publishes livekit data packet payload', async () => { + const calls = []; + const room = { + name: 'voice_assistant_room_a', + localParticipant: { + identity: 'voice_assistant_user_a', + publishData: async (...args) => { + calls.push(args); + }, + }, + }; + + const published = await publishFrontendObservabilityEvent({ + enabled: true, + room, + name: 'frontend.browser_audio.track_published', + attributes: { + 'livekit.track_name': 'browser_audio_track', + 'livekit.track_sid': 'TR_A', + }, + now: () => 1_779_773_931_123, + performanceNow: () => 123.45, + }); + + assert.equal(published, true); + assert.equal(calls.length, 1); + assert.deepEqual(calls[0][1], { + reliable: true, + topic: FRONTEND_OBSERVABILITY_TOPIC, + }); + + const payload = JSON.parse(new TextDecoder().decode(calls[0][0])); + assert.deepEqual(payload, { + schema_version: 1, + type: 'observability.frontend_event', + name: 'frontend.browser_audio.track_published', + wall_time_unix_ms: 1_779_773_931_123, + performance_now_ms: 123.45, + room_name: 'voice_assistant_room_a', + participant_identity: 'voice_assistant_user_a', + attributes: { + 'livekit.track_name': 'browser_audio_track', + 'livekit.track_sid': 'TR_A', + }, + }); +}); + +test('frontend observability can publish an explicit event wall time', async () => { + const calls = []; + const room = { + localParticipant: { + publishData: async (...args) => { + calls.push(args); + }, + }, + }; + + await publishFrontendObservabilityEvent({ + enabled: true, + room, + name: 'frontend.browser_audio.vad_speech_ended', + wallTimeUnixMs: 1_779_773_930_777, + now: () => 1_779_773_931_123, + performanceNow: () => 456.78, + }); + + const payload = JSON.parse(new TextDecoder().decode(calls[0][0])); + assert.equal(payload.wall_time_unix_ms, 1_779_773_930_777); + assert.equal(payload.performance_now_ms, 456.78); +}); + +test('frontend observability parses backend output segment markers', () => { + const payload = { + schema_version: 1, + type: 'observability.backend_marker', + name: 'backend.output_audio.segment_started', + attributes: { + 'observability.turn_id': 'turn-000001', + 'observability.output_segment_id': 'turn-000001-output-002', + 'observability.output_segment_index': 2, + 'observability.output_segment_kind': 'final', + 'livekit.participant_identity': 'agent', + 'livekit.track_name': 'assistant_audio', + }, + }; + + const marker = parseBackendObservabilityMarkerPayload( + new TextEncoder().encode(JSON.stringify(payload)), + BACKEND_OBSERVABILITY_MARKER_TOPIC + ); + + assert.equal(marker?.name, 'backend.output_audio.segment_started'); + assert.deepEqual(outputSegmentAttributesFromMarker(marker), { + 'observability.turn_id': 'turn-000001', + 'observability.output_segment_id': 'turn-000001-output-002', + 'observability.output_segment_index': 2, + 'observability.output_segment_kind': 'final', + }); +}); + +test('frontend observability rejects oversized backend marker names', () => { + const marker = parseBackendObservabilityMarkerPayload( + JSON.stringify({ + schema_version: 1, + type: 'observability.backend_marker', + name: `backend.${'x'.repeat(122)}`, + attributes: {}, + }), + BACKEND_OBSERVABILITY_MARKER_TOPIC + ); + + assert.equal(marker, null); +}); + +test('frontend observability rejects array backend marker attributes', () => { + const marker = parseBackendObservabilityMarkerPayload( + JSON.stringify({ + schema_version: 1, + type: 'observability.backend_marker', + name: 'backend.output_audio.segment_started', + attributes: ['unexpected'], + }), + BACKEND_OBSERVABILITY_MARKER_TOPIC + ); + + assert.deepEqual(marker?.attributes, {}); +}); + +test('frontend observability rejects backend markers on the wrong topic', () => { + const marker = parseBackendObservabilityMarkerPayload( + JSON.stringify({ + schema_version: 1, + type: 'observability.backend_marker', + name: 'backend.output_audio.segment_started', + attributes: {}, + }), + FRONTEND_OBSERVABILITY_TOPIC + ); + + assert.equal(marker, null); +}); + +test('app routes playback observability through the filtered audio renderer', async () => { + const source = await readFile('components/app/app.tsx', 'utf8'); + + assert.match(source, /FilteredAudioRenderer/); + assert.match(source, /observabilityEnabled=\{appConfig\.observabilityEnabled\}/); + assert.doesNotMatch(source, /RemoteAudioPlaybackObserver/); +}); + +test('track exclusion helpers ignore empty exclude names', async () => { + const sources = await Promise.all( + [ + 'components/livekit/filtered-audio-renderer.tsx', + 'hooks/useAudioTrackFilter.ts', + 'hooks/useExcludedVideoTracks.ts', + ].map(async (path) => [path, await readFile(path, 'utf8')]) + ); + + for (const [path, source] of sources) { + assert.match(source, /if \(!excludeName\) \{\s*return false;\s*\}/, path); + assert.doesNotMatch(source, /trackName === excludeName/, path); + } +}); + +test('filtered audio renderer reports real element playback with backend marker context', async () => { + const source = await readFile('components/livekit/filtered-audio-renderer.tsx', 'utf8'); + + assert.match(source, /audioElement\.play\(\)/); + assert.match( + source, + /playPromise[\s\S]*\.then\(\(\) => \{[\s\S]*startPlaybackObserver\(elementKey, diagnostics, mediaStreamTrack\)/ + ); + assert.match(source, /FRONTEND_EVENTS\.REPLY_AUDIO_PLAYBACK_STARTED/); + assert.match(source, /FRONTEND_EVENTS\.REPLY_AUDIO_PLAYBACK_ENDED/); + assert.match(source, /FRONTEND_EVENTS\.REPLY_AUDIO_PLAYBACK_ERROR/); + assert.match(source, /resumeErrorEventName: FRONTEND_EVENTS\.REPLY_AUDIO_PLAYBACK_ERROR/); + assert.match( + source, + /emit: \(name, attributes\) => recordFrontendObservabilityRef\.current\(name, attributes\)/ + ); + assert.match( + source, + /const startPlaybackObserver = \([\s\S]*if \(!observabilityEnabledRef\.current\) \{[\s\S]*return;[\s\S]*\}[\s\S]*startMediaTrackAudioObserver/ + ); + assert.match(source, /recordFrontendObservabilityRef\.current/); + assert.match(source, /addEventListener\('playing', handleElementPlaybackStarted\)/); + assert.match(source, /activePlaybackSources\.get\(elementKey\)/); + assert.match( + source, + /const handleElementPlaybackError = \(\) => \{[\s\S]*const playbackSource = activePlaybackSources\.get\(elementKey\);[\s\S]*recordPlaybackError\(\s*playbackSource\?\.diagnostics \?\? diagnostics,/ + ); + assert.doesNotMatch(source, /playbackObserverStops\.clear\(\)/); + assert.match(source, /parseBackendObservabilityMarkerPayload/); + assert.match(source, /outputSegmentAttributesFromMarker/); + assert.match( + source, + /if \(!observabilityEnabled\) \{[\s\S]*outputSegments\.clear\(\);[\s\S]*return;[\s\S]*\}[\s\S]*room\.on\(RoomEvent\.DataReceived/ + ); + assert.doesNotMatch( + source, + /room,\s*participants,\s*excludeTrackNames,\s*volume,\s*debugAudio,\s*observabilityEnabled/ + ); + assert.doesNotMatch(source, /startsWith\(prefix\)/); + assert.match(source, /OBSERVABILITY_ATTRS\.PARTICIPANT_IDENTITY/); + assert.match(source, /OBSERVABILITY_ATTRS\.PARTICIPANT_IDENTITY_LEGACY/); + assert.match(source, /OBSERVABILITY_ATTRS\.TRACK_NAME/); + assert.match(source, /OBSERVABILITY_ATTRS\.TRACK_SID/); + assert.match(source, /OBSERVABILITY_ATTRS\.TRACK_SOURCE/); + assert.doesNotMatch(source, /marker\.attributes\['livekit\.participant'\]/); + assert.doesNotMatch(source, /'livekit\.track_name'/); + assert.doesNotMatch(source, /'livekit\.track_sid'/); + assert.doesNotMatch(source, /'livekit\.track_source'/); + assert.match(source, /canonical backend marker field -> legacy field -> LiveKit sender/); +}); + +test('browser source client publishes frontend audio observability events', async () => { + const source = await readFile('hooks/useBrowserSourceClient.ts', 'utf8'); + + assert.match(source, /startMediaTrackVadObserver/); + assert.match(source, /FRONTEND_EVENTS\.BROWSER_AUDIO_TRACK_PUBLISHED/); + assert.match(source, /FRONTEND_EVENTS\.BROWSER_AUDIO_TRACK_UNPUBLISHED/); + assert.match(source, /FRONTEND_EVENTS\.BROWSER_AUDIO_VAD_SPEECH_STARTED/); + assert.match(source, /FRONTEND_EVENTS\.BROWSER_AUDIO_VAD_SPEECH_ENDED/); + assert.doesNotMatch(source, /FRONTEND_EVENTS\.BROWSER_AUDIO_USER_SPEECH_ENDED/); + assert.match(source, /FRONTEND_EVENTS\.BROWSER_AUDIO_VAD_PROBE_UNAVAILABLE/); + assert.match(source, /OBSERVABILITY_ATTRS\.VAD_PROVIDER/); + assert.match(source, /OBSERVABILITY_ATTRS\.VAD_MODEL/); + assert.match(source, /OBSERVABILITY_ATTRS\.VAD_AUDIO_DURATION_MS/); + assert.doesNotMatch(source, /OBSERVABILITY_ATTRS\.FRONTEND_AUDIO_CONFIRMATION_WALL_TIME_UNIX_MS/); + assert.doesNotMatch(source, /confirmationTimestampMs/); + assert.match(source, /OBSERVABILITY_ATTRS\.TRACK_NAME/); + assert.match(source, /OBSERVABILITY_ATTRS\.TRACK_SID/); + assert.match(source, /OBSERVABILITY_ATTRS\.TRACK_STREAM_NAME/); + assert.doesNotMatch(source, /'livekit\.track_name'/); + assert.doesNotMatch(source, /'livekit\.track_sid'/); + assert.doesNotMatch(source, /'livekit\.stream_name'/); + assert.match(source, /'vad-web'/); + assert.doesNotMatch(source, /startMediaTrackTailObserver/); + assert.doesNotMatch(source, /BROWSER_AUDIO_LAST_ACTIVE_FRAME_SENT/); + assert.match(source, /stop:\s*\(\) => Promise/); + assert.match( + source, + /try \{[\s\S]*await stopObservedAudio\?\.\(\);[\s\S]*\} catch \(error\) \{[\s\S]*VAD observer stop failed[\s\S]*\} finally \{[\s\S]*track\.stop\(\);[\s\S]*FRONTEND_EVENTS\.BROWSER_AUDIO_TRACK_UNPUBLISHED/ + ); + assert.doesNotMatch(source, /audioCaptureTrack/); + assert.doesNotMatch(source, /syncBrowserAudioEnabled/); +}); + +test('frontend vad observer defaults to local bundled assets', async () => { + const source = await readFile('lib/frontend-vad-observer.ts', 'utf8'); + + assert.match(source, /DEFAULT_VAD_ASSET_BASE_PATH = '\/vad-web\/'/); + assert.match(source, /DEFAULT_ONNX_WASM_BASE_PATH = '\/onnxruntime-web\/'/); + assert.doesNotMatch(source, /cdn\.jsdelivr/); + assert.match(source, /stop:\s*\(\) => Promise/); + assert.match(source, /await vad\.pause\?\.\(\)/); + assert.match(source, /await vad\.destroy\?\.\(\)/); +}); + +test('package scripts sync local vad assets before install and build', async () => { + const packageJson = JSON.parse(await readFile('package.json', 'utf8')); + + assert.equal(packageJson.scripts['sync:vad-assets'], 'node scripts/sync-vad-assets.js'); + assert.equal(packageJson.scripts.postinstall, 'node scripts/sync-vad-assets.js'); + assert.equal(packageJson.scripts.prebuild, 'node scripts/sync-vad-assets.js'); +}); + +test('frontend audio observer reuses shared observability attribute types', async () => { + const source = await readFile('lib/frontend-audio-observer.ts', 'utf8'); + + assert.match(source, /type ObservabilityAttributes/); + assert.doesNotMatch(source, /type ObservabilityAttribute =/); +}); + +test('room hook publishes room connected frontend observability event', async () => { + const source = await readFile('hooks/useRoom.ts', 'utf8'); + + assert.match(source, /FRONTEND_EVENTS\.ROOM_CONNECTED/); + assert.match(source, /publishFrontendObservabilityEvent/); + assert.match( + source, + /const recoverFromStartError = async[\s\S]*try \{[\s\S]*await browserSourceClient\.stop\(\);[\s\S]*\} catch \(stopError\)[\s\S]*\} finally \{[\s\S]*room\.disconnect\(\);[\s\S]*\}/ + ); + assert.match( + source, + /const endSession = useCallback\(async \(\) => \{[\s\S]*try \{[\s\S]*await browserSourceClient\.stop\(\);[\s\S]*\} catch \(error\)[\s\S]*\} finally \{[\s\S]*room\.disconnect\(\);[\s\S]*resetVoiceSessionId\(\);[\s\S]*setIsSessionActive\(false\);[\s\S]*\}/ + ); +}); diff --git a/tests/project-config.test.mjs b/tests/project-config.test.mjs index fa75dc818..6b06b0448 100644 --- a/tests/project-config.test.mjs +++ b/tests/project-config.test.mjs @@ -1,6 +1,7 @@ import assert from 'node:assert/strict'; import { readFile } from 'node:fs/promises'; import test from 'node:test'; +import { getClientConfigFromEnv } from '../lib/utils.ts'; test('shadcn config points at the imported Tailwind CSS file', async () => { const config = JSON.parse(await readFile('components.json', 'utf8')); @@ -29,3 +30,27 @@ test('avatar filtering excludes the current room video input identity', async () assert.match(source, /room_video_input/); assert.doesNotMatch(source, /room_vision_input/); }); + +test('client config reads frontend observability from OBSERVABILITY_ENABLED only', () => { + const previousObservability = process.env.OBSERVABILITY_ENABLED; + const previousNextPublicObservability = process.env.NEXT_PUBLIC_OBSERVABILITY_ENABLED; + try { + delete process.env.OBSERVABILITY_ENABLED; + process.env.NEXT_PUBLIC_OBSERVABILITY_ENABLED = '1'; + assert.equal(getClientConfigFromEnv().observabilityEnabled, false); + + process.env.OBSERVABILITY_ENABLED = '1'; + assert.equal(getClientConfigFromEnv().observabilityEnabled, true); + } finally { + if (previousObservability === undefined) { + delete process.env.OBSERVABILITY_ENABLED; + } else { + process.env.OBSERVABILITY_ENABLED = previousObservability; + } + if (previousNextPublicObservability === undefined) { + delete process.env.NEXT_PUBLIC_OBSERVABILITY_ENABLED; + } else { + process.env.NEXT_PUBLIC_OBSERVABILITY_ENABLED = previousNextPublicObservability; + } + } +}); diff --git a/tests/session-stop.test.mjs b/tests/session-stop.test.mjs index dc987072f..ec93b453b 100644 --- a/tests/session-stop.test.mjs +++ b/tests/session-stop.test.mjs @@ -1,22 +1,8 @@ import assert from 'node:assert/strict'; import { readFile } from 'node:fs/promises'; import { test } from 'node:test'; -import ts from 'typescript'; import { readAgentWorkerStateFromLog } from '../lib/agent-worker-readiness.ts'; - -async function loadSessionStopModule() { - const source = await readFile(new URL('../lib/session-stop.ts', import.meta.url), 'utf8'); - const { outputText } = ts.transpileModule(source, { - compilerOptions: { - module: ts.ModuleKind.ESNext, - target: ts.ScriptTarget.ES2022, - }, - }); - - return import(`data:text/javascript;base64,${Buffer.from(outputText).toString('base64')}`); -} - -const { resolveLiveKitHttpUrl } = await loadSessionStopModule(); +import { resolveLiveKitHttpUrl, resolveRoomInputStopUrls } from '../lib/session-stop.ts'; test('parses the latest target agent worker state from LiveKit server logs', () => { const source = [ @@ -35,16 +21,57 @@ test('maps livekit websocket URLs to server API URLs', () => { assert.equal(resolveLiveKitHttpUrl('https://livekit.example'), 'https://livekit.example'); }); -test('session stop route does not call the room-input control endpoint', async () => { +test('room input stop URL resolver ignores primebot non-server input', () => { + assert.deepEqual( + resolveRoomInputStopUrls({ + inputSource: 'primebot', + roomInputUrl: 'http://room-input.local/start', + roomAudioInputUrl: 'http://audio.local/start', + roomVisionInputUrl: 'http://vision.local/start', + frontdeskInputParticipantUrl: 'http://xunfei.local/start', + faceServiceUrl: 'http://face.local/start', + genericCameraParticipantUrl: 'http://generic.local/start', + }), + [] + ); +}); + +test('room input stop URL resolver only stops selected mixed server roles', () => { + assert.deepEqual( + resolveRoomInputStopUrls({ + inputSource: 'mixed', + audioInputDevice: 'xunfei', + visionInputDevice: 'browser', + roomAudioInputUrl: 'http://xunfei-audio.local/start', + roomVisionInputUrl: 'http://unused-vision.local/start', + roomInputUrl: 'http://fallback.local/start', + frontdeskInputParticipantUrl: 'http://frontdesk.local/start', + faceServiceUrl: 'http://face.local/start', + genericCameraParticipantUrl: 'http://generic.local/start', + }), + ['http://xunfei-audio.local/stop', 'http://frontdesk.local/stop', 'http://face.local/stop'] + ); +}); + +test('session stop route can call the room-input control endpoint before deleting the room', async () => { const routeSource = await readFile( new URL('../app/api/session/stop/route.ts', import.meta.url), 'utf8' ); - assert.doesNotMatch(routeSource, /process\.env\.ROOM_INPUT_URL/); - assert.doesNotMatch(routeSource, /resolveRoomInputStopUrl/); - assert.doesNotMatch(routeSource, /stopRoomInput/); - assert.doesNotMatch(routeSource, /GENERIC_CAMERA_PARTICIPANT_URL/); + const cleanupSource = routeSource.match(/async function runRemoteSessionCleanup[\s\S]*?\n}/)?.[0]; + + assert.ok(cleanupSource, 'runRemoteSessionCleanup should be defined'); + assert.match(routeSource, /readStopEnv\('ROOM_INPUT_URL'\)/); + assert.match(routeSource, /resolveRoomInputStopUrls/); + assert.match(routeSource, /stopRoomInput/); + assert.match(routeSource, /FRONTDESK_INPUT_PARTICIPANT_URL/); + assert.match(routeSource, /FACE_SERVICE_URL/); + assert.match(routeSource, /GENERIC_CAMERA_PARTICIPANT_URL/); + assert.match( + cleanupSource, + /const roomInputResults = await stopRoomInput\(roomName, sessionId\);[\s\S]*const liveKitRoomResult = await deleteLiveKitRoom\(roomName\);/ + ); }); test('session stop route cancels room session before remote cleanup', async () => { @@ -84,6 +111,7 @@ test('session stop route deletes the LiveKit room after the dispatch barrier', a ); assert.match(routeSource, /await waitForPendingDispatches\(roomName, sessionId\)/); + assert.match(routeSource, /await stopRoomInput\(roomName, sessionId\)/); assert.match(routeSource, /deleteLiveKitRoom\(roomName\)/); }); @@ -105,7 +133,7 @@ test('session stop route waits for local agent worker readiness before finishing assert.match(cleanupSource, /await waitForLocalAgentWorkerReadiness\(\)/); assert.match( cleanupSource, - /const cleanupResults = \[dispatchBarrierResult, liveKitRoomResult, agentWorkerReadinessResult\]/ + /const cleanupResults = \[\s*dispatchBarrierResult,\s*\.\.\.roomInputResults,\s*liveKitRoomResult,\s*agentWorkerReadinessResult,\s*\]/ ); }); @@ -147,6 +175,7 @@ test('session stop route closes the registry even when remote cleanup is partial assert.ok(cleanupSource, 'runRemoteSessionCleanup should be defined'); assert.match(cleanupSource, /const failures = results\.filter/); + assert.match(cleanupSource, /result\.fatal !== false/); assert.match( cleanupSource, /markRoomSessionStopped\(roomName, sessionId\);\s*return \{ results, failures \};/ diff --git a/tests/video-preview-selection.test.mjs b/tests/video-preview-selection.test.mjs new file mode 100644 index 000000000..ffc04d3c6 --- /dev/null +++ b/tests/video-preview-selection.test.mjs @@ -0,0 +1,39 @@ +import assert from 'node:assert/strict'; +import { readFile } from 'node:fs/promises'; +import { test } from 'node:test'; + +const { resolveCameraPreviewTrack } = await import('../lib/video-preview-selection.ts'); + +test('explicit livekit selection does not fall back to the raw camera while the track is missing', () => { + const previewTrack = resolveCameraPreviewTrack({ + selectedTrack: null, + selectedTrackId: 'room_video', + selectedTrackType: 'livekit', + canShowDefaultCameraPreview: true, + configuredCameraTrack: { id: 'room_video' }, + defaultCameraTrack: { id: 'browser_video_track' }, + }); + + assert.equal(previewTrack, undefined); +}); + +test('auto preview may fall back to a configured camera before the user selects a track', () => { + const configuredCameraTrack = { id: 'room_video' }; + const previewTrack = resolveCameraPreviewTrack({ + selectedTrack: null, + selectedTrackId: null, + selectedTrackType: null, + canShowDefaultCameraPreview: true, + configuredCameraTrack, + defaultCameraTrack: { id: 'browser_video_track' }, + }); + + assert.equal(previewTrack, configuredCameraTrack); +}); + +test('tile layout resolves camera preview through the shared helper', async () => { + const source = await readFile('components/app/tile-layout.tsx', 'utf8'); + + assert.match(source, /resolveCameraPreviewTrack/); + assert.match(source, /selectedTrackType/); +});