Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions apps/sim/connectors/zoom/zoom.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/**
* @vitest-environment node
*/
import { describe, expect, it } from 'vitest'
import { parseVtt } from '@/connectors/zoom/zoom'

/** WebVTT preamble prepended to every fixture: the signature line and a blank separator line. */
const HEADER = 'WEBVTT\n\n'

/**
 * Unit tests for `parseVtt`, which turns a WebVTT transcript into plain spoken text.
 *
 * The suite covers three areas:
 * - basic extraction (empty input, single cue, multiple cues, cue identifiers),
 * - markup handling (voice tags become `Speaker: text`; other inline tags are removed),
 * - hostile input (tag fragments that re-form a tag after a single stripping pass).
 */
describe('parseVtt', () => {
  // Matches any remaining opening or closing tag. Deliberately non-global so it carries
  // no `lastIndex` state between the concurrently running tests that share it.
  const TAG_PATTERN = /<\/?[^>]+>/

  it.concurrent('returns empty string for input with no cues', () => {
    expect(parseVtt(HEADER)).toBe('')
  })

  it.concurrent('extracts plain spoken text from a single cue', () => {
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nHello world\n`
    expect(parseVtt(vtt)).toBe('Hello world')
  })

  it.concurrent('preserves WebVTT voice tags as "Speaker: text"', () => {
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<v Alice>hello there</v>\n`
    expect(parseVtt(vtt)).toBe('Alice: hello there')
  })

  it.concurrent('preserves voice tags with class suffix', () => {
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<v.host Bob>welcome</v>\n`
    expect(parseVtt(vtt)).toBe('Bob: welcome')
  })

  it.concurrent('strips inline formatting tags but keeps text', () => {
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<b>bold</b> and <i>italic</i>\n`
    expect(parseVtt(vtt)).toBe('bold and italic')
  })

  it.concurrent('strips karaoke timestamp tags', () => {
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello <00:00:01.000>world\n`
    expect(parseVtt(vtt)).toBe('hello world')
  })

  it.concurrent('strips class spans', () => {
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<c.loud>SHOUT</c>\n`
    expect(parseVtt(vtt)).toBe('SHOUT')
  })

  it.concurrent('skips cue identifier lines before timing', () => {
    const vtt = `${HEADER}cue-1\n00:00:00.000 --> 00:00:02.000\nhello\n`
    expect(parseVtt(vtt)).toBe('hello')
  })

  it.concurrent('joins multiple cues with newlines', () => {
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nfirst\n\n00:00:02.000 --> 00:00:04.000\nsecond\n`
    expect(parseVtt(vtt)).toBe('first\nsecond')
  })

  it.concurrent('collapses repeated whitespace within a cue', () => {
    // The fixture must contain a real whitespace run (several spaces and a tab); with a
    // single space the input already equals the expected output and the test is vacuous.
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello   \t  world\n`
    expect(parseVtt(vtt)).toBe('hello world')
  })

  it.concurrent('iteratively strips overlapping tags that reconstruct after one pass', () => {
    // Removing the inner `<b>` occurrences leaves `<b>injected</b>`, so a single
    // stripping pass would hand back a freshly formed tag pair.
    const crafted = '<<b>b>injected</<b>b>'
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
    const result = parseVtt(vtt)
    expect(result).not.toMatch(TAG_PATTERN)
    expect(result).toContain('injected')
  })

  it.concurrent('iteratively strips nested script-like tag fragments', () => {
    // Same reconstruction trick, this time re-forming `<script>` / `</script>`.
    const crafted = '<scr<script>ipt>alert(1)</scr</script>ipt>'
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
    const result = parseVtt(vtt)
    expect(result).not.toMatch(TAG_PATTERN)
    expect(result.toLowerCase()).not.toContain('script')
  })

  it.concurrent('sanitizes crafted speaker names that embed tag fragments', () => {
    // Only the absence of leftover markup is asserted; the exact rendering of the
    // malformed speaker name is intentionally left unspecified.
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<v <b>Evil</b>>payload</v>\n`
    const result = parseVtt(vtt)
    expect(result).not.toMatch(TAG_PATTERN)
  })

  it.concurrent('terminates on adversarial deeply-nested input', () => {
    // 50 stacked angle brackets on each side of the tag names; the call has to return
    // (rather than loop forever) and leave no tag behind.
    const crafted = `${'<'.repeat(50)}b${'>'.repeat(50)}text${'<'.repeat(50)}/b${'>'.repeat(50)}`
    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
    const result = parseVtt(vtt)
    expect(result).not.toMatch(TAG_PATTERN)
  })
})
15 changes: 10 additions & 5 deletions apps/sim/connectors/zoom/zoom.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,10 @@ function findTranscriptFile(files?: ZoomRecordingFile[]): ZoomRecordingFile | un
* Extracts spoken text from a Zoom WebVTT transcript, stripping cue identifiers,
* timestamps, and inline markup. Handles both Zoom's `Speaker: text` convention
* and standard WebVTT `<v Speaker>text</v>` voice tags.
*
* Exported for unit tests; not part of the connector's public surface.
*/
function parseVtt(vtt: string): string {
export function parseVtt(vtt: string): string {
const lines = vtt.split(/\r?\n/)
const segments: string[] = []
let i = 0
Expand Down Expand Up @@ -152,10 +154,13 @@ function parseVtt(vtt: string): string {
if (textParts.length > 0) {
const raw = textParts.join(' ')
const withSpeakers = raw.replace(/<v(?:\.[^\s>]+)?\s+([^>]+)>([\s\S]*?)<\/v>/g, '$1: $2')
const stripped = withSpeakers
.replace(/<\/?[^>]+>/g, '')
.replace(/\s+/g, ' ')
.trim()
let withoutTags = withSpeakers
let previous: string
do {
previous = withoutTags
withoutTags = withoutTags.replace(/<\/?[^>]+>/g, '')
} while (withoutTags !== previous)
Comment thread
waleedlatif1 marked this conversation as resolved.
const stripped = withoutTags.replace(/\s+/g, ' ').trim()
if (stripped) segments.push(stripped)
}
}
Expand Down
Loading