From 9c2e5ff42056897a774340cab1e45ba5d1781f44 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Tue, 26 May 2026 14:17:54 -0700 Subject: [PATCH 1/2] fix(zoom): iteratively strip tags to close incomplete-sanitization gap --- apps/sim/connectors/zoom/zoom.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/apps/sim/connectors/zoom/zoom.ts b/apps/sim/connectors/zoom/zoom.ts index 8027754892..2beb5d6221 100644 --- a/apps/sim/connectors/zoom/zoom.ts +++ b/apps/sim/connectors/zoom/zoom.ts @@ -152,10 +152,13 @@ function parseVtt(vtt: string): string { if (textParts.length > 0) { const raw = textParts.join(' ') const withSpeakers = raw.replace(/]+)?\s+([^>]+)>([\s\S]*?)<\/v>/g, '$1: $2') - const stripped = withSpeakers - .replace(/<\/?[^>]+>/g, '') - .replace(/\s+/g, ' ') - .trim() + let withoutTags = withSpeakers + let previous: string + do { + previous = withoutTags + withoutTags = withoutTags.replace(/<\/?[^>]+>/g, '') + } while (withoutTags !== previous) + const stripped = withoutTags.replace(/\s+/g, ' ').trim() if (stripped) segments.push(stripped) } } From 79c057ee110e0ea8ab55f031c1ac9fada4a26725 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Tue, 26 May 2026 14:23:21 -0700 Subject: [PATCH 2/2] test(zoom): cover iterative sanitization in transcript parser --- apps/sim/connectors/zoom/zoom.test.ts | 87 +++++++++++++++++++++++++++ apps/sim/connectors/zoom/zoom.ts | 4 +- 2 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 apps/sim/connectors/zoom/zoom.test.ts diff --git a/apps/sim/connectors/zoom/zoom.test.ts b/apps/sim/connectors/zoom/zoom.test.ts new file mode 100644 index 0000000000..040de394fb --- /dev/null +++ b/apps/sim/connectors/zoom/zoom.test.ts @@ -0,0 +1,87 @@ +/** + * @vitest-environment node + */ +import { describe, expect, it } from 'vitest' +import { parseVtt } from '@/connectors/zoom/zoom' + +const HEADER = 'WEBVTT\n\n' + +describe('parseVtt', () => { + it.concurrent('returns empty string for input with no cues', () => { + expect(parseVtt(HEADER)).toBe('') + }) + + it.concurrent('extracts plain spoken text from a single cue', () => { + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nHello world\n` + expect(parseVtt(vtt)).toBe('Hello world') + }) + + it.concurrent('preserves WebVTT voice tags as "Speaker: text"', () => { + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello there\n` + expect(parseVtt(vtt)).toBe('Alice: hello there') + }) + + it.concurrent('preserves voice tags with class suffix', () => { + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nwelcome\n` + expect(parseVtt(vtt)).toBe('Bob: welcome') + }) + + it.concurrent('strips inline formatting tags but keeps text', () => { + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nbold and italic\n` + expect(parseVtt(vtt)).toBe('bold and italic') + }) + + it.concurrent('strips karaoke timestamp tags', () => { + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello <00:00:01.000>world\n` + expect(parseVtt(vtt)).toBe('hello world') + }) + + it.concurrent('strips class spans', () => { + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nSHOUT\n` + expect(parseVtt(vtt)).toBe('SHOUT') + }) + + it.concurrent('skips cue identifier lines before timing', () => { + const vtt = `${HEADER}cue-1\n00:00:00.000 --> 00:00:02.000\nhello\n` + expect(parseVtt(vtt)).toBe('hello') + }) + + it.concurrent('joins multiple cues with newlines', () => { + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nfirst\n\n00:00:02.000 --> 00:00:04.000\nsecond\n` + expect(parseVtt(vtt)).toBe('first\nsecond') + }) + + it.concurrent('collapses repeated whitespace within a cue', () => { + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello world\n` + expect(parseVtt(vtt)).toBe('hello world') + }) + + it.concurrent('iteratively strips overlapping tags that reconstruct after one pass', () => { + const crafted = '<b>injectedb>' + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n` + const result = parseVtt(vtt) + expect(result).not.toMatch(/<\/?[^>]+>/) + expect(result).toContain('injected') + }) + + it.concurrent('iteratively strips nested script-like tag fragments', () => { + const crafted = 'ipt>alert(1)ipt>' + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n` + const result = parseVtt(vtt) + expect(result).not.toMatch(/<\/?[^>]+>/) + expect(result.toLowerCase()).not.toContain('script') + }) + + it.concurrent('sanitizes crafted speaker names that embed tag fragments', () => { + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nEvil>payload\n` + const result = parseVtt(vtt) + expect(result).not.toMatch(/<\/?[^>]+>/) + }) + + it.concurrent('terminates on adversarial deeply-nested input', () => { + const crafted = `${'<'.repeat(50)}b${'>'.repeat(50)}text${'<'.repeat(50)}/b${'>'.repeat(50)}` + const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n` + const result = parseVtt(vtt) + expect(result).not.toMatch(/<\/?[^>]+>/) + }) +}) diff --git a/apps/sim/connectors/zoom/zoom.ts b/apps/sim/connectors/zoom/zoom.ts index 2beb5d6221..240031d1cf 100644 --- a/apps/sim/connectors/zoom/zoom.ts +++ b/apps/sim/connectors/zoom/zoom.ts @@ -120,8 +120,10 @@ function findTranscriptFile(files?: ZoomRecordingFile[]): ZoomRecordingFile | un * Extracts spoken text from a Zoom WebVTT transcript, stripping cue identifiers, * timestamps, and inline markup. Handles both Zoom's `Speaker: text` convention * and standard WebVTT `text` voice tags. + * + * Exported for unit tests; not part of the connector's public surface. */ -function parseVtt(vtt: string): string { +export function parseVtt(vtt: string): string { const lines = vtt.split(/\r?\n/) const segments: string[] = [] let i = 0