From 3f646a6ae17a3236b9cb53ed8b60e8fd86ebc8cb Mon Sep 17 00:00:00 2001 From: John Chrostek Date: Wed, 24 Jun 2026 12:28:07 -0400 Subject: [PATCH 1/6] ci: retry Datadog telemetry queries on 429; widen indexing wait and suite timeouts --- integration-tests/config.ts | 4 +- integration-tests/jest.config.js | 2 +- integration-tests/tests/auth.test.ts | 2 +- .../tests/custom-metrics.test.ts | 2 +- integration-tests/tests/lmi.test.ts | 2 +- integration-tests/tests/on-demand.test.ts | 2 +- integration-tests/tests/otlp.test.ts | 6 +- integration-tests/tests/payload-size.test.ts | 2 +- integration-tests/tests/snapstart.test.ts | 2 +- integration-tests/tests/utils/datadog.ts | 92 ++++++++++++++++--- 10 files changed, 91 insertions(+), 25 deletions(-) diff --git a/integration-tests/config.ts b/integration-tests/config.ts index f0f8ef42f..8cf232715 100644 --- a/integration-tests/config.ts +++ b/integration-tests/config.ts @@ -4,13 +4,13 @@ export const ACCOUNT = process.env.CDK_DEFAULT_ACCOUNT || process.env.AWS_ACCOUN export const REGION = process.env.CDK_DEFAULT_REGION || process.env.AWS_REGION || 'us-east-1'; // Default wait time for Datadog to index logs and traces after Lambda invocation -export const DEFAULT_DATADOG_INDEXING_WAIT_MS = 5 * 60 * 1000; // 5 minutes +export const DEFAULT_DATADOG_INDEXING_WAIT_MS = 10 * 60 * 1000; // 10 minutes // Extended wait time for async invocations (SQS, SNS) - need more time for message processing export const ASYNC_DATADOG_INDEXING_WAIT_MS = 90 * 1000; // 90 seconds // Extended wait time for tests that need more time (e.g., OTLP tests) -export const DATADOG_INDEXING_WAIT_5_MIN_MS = 5 * 60 * 1000; // 5 minutes +export const DATADOG_INDEXING_WAIT_10_MIN_MS = 10 * 60 * 1000; // 10 minutes export function getIdentifier(): string { diff --git a/integration-tests/jest.config.js b/integration-tests/jest.config.js index 2d10a3e21..7bac95655 100644 --- a/integration-tests/jest.config.js +++ b/integration-tests/jest.config.js @@ -10,7 +10,7 @@ module.exports = { '!tests/**/*.d.ts', ], // Increase timeout for integration tests that involve Lambda invocations and waiting for Datadog - testTimeout: 900000, // 15 minutes + testTimeout: 1800000, // 30 minutes verbose: true, // Reporters for test results reporters: [ diff --git a/integration-tests/tests/auth.test.ts b/integration-tests/tests/auth.test.ts index 7f4879a33..8416b0939 100644 --- a/integration-tests/tests/auth.test.ts +++ b/integration-tests/tests/auth.test.ts @@ -27,7 +27,7 @@ describe('Auth Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 1); console.log('All invocations and data fetching completed'); - }, 600000); + }, 1800000); describe('on-demand (node)', () => { it('should invoke Lambda successfully', () => { diff --git a/integration-tests/tests/custom-metrics.test.ts b/integration-tests/tests/custom-metrics.test.ts index d00755bbf..38b1824d8 100644 --- a/integration-tests/tests/custom-metrics.test.ts +++ b/integration-tests/tests/custom-metrics.test.ts @@ -34,7 +34,7 @@ describe("Customer Metrics Exclude Tags Integration Tests", () => { metricsEndTime = Date.now(); console.log("Lambdas invoked and indexing wait complete"); - }, 900000); + }, 1800000); describe("unfiltered function (no DD_LAMBDA_CUSTOMER_METRICS_EXCLUDE_TAGS)", () => { it.each(EXCLUDED_TAGS)( diff --git a/integration-tests/tests/lmi.test.ts b/integration-tests/tests/lmi.test.ts index 36b4f4c55..abe1b3610 100644 --- a/integration-tests/tests/lmi.test.ts +++ b/integration-tests/tests/lmi.test.ts @@ -22,7 +22,7 @@ describe('LMI Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 1); console.log('LMI invocation and data fetching completed'); - }, 600000); + }, 1800000); describe.each(runtimes)('%s Runtime with LMI', (runtime) => { const getResult = () => telemetry[runtime]?.threads[0]?.[0]; diff --git a/integration-tests/tests/on-demand.test.ts b/integration-tests/tests/on-demand.test.ts index ff88f6108..cb59e5cf7 100644 --- a/integration-tests/tests/on-demand.test.ts +++ b/integration-tests/tests/on-demand.test.ts @@ -22,7 +22,7 @@ describe('On-Demand Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 2, 1, 5000); console.log('All invocations and data fetching completed'); - }, 600000); + }, 1800000); describe.each(runtimes)('%s runtime', (runtime) => { const getTelemetry = () => telemetry[runtime]; diff --git a/integration-tests/tests/otlp.test.ts b/integration-tests/tests/otlp.test.ts index fd836af16..1d1a9923e 100644 --- a/integration-tests/tests/otlp.test.ts +++ b/integration-tests/tests/otlp.test.ts @@ -1,6 +1,6 @@ import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default'; import { DatadogTelemetry } from './utils/datadog'; -import { IDENTIFIER, DATADOG_INDEXING_WAIT_5_MIN_MS } from '../config'; +import { IDENTIFIER, DATADOG_INDEXING_WAIT_10_MIN_MS } from '../config'; const runtimes = ['node', 'python', 'java', 'dotnet'] as const; type Runtime = typeof runtimes[number]; @@ -30,10 +30,10 @@ describe('OTLP Integration Tests', () => { console.log('Invoking all OTLP Lambda functions...'); // Invoke all OTLP functions and collect telemetry - telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_5_MIN_MS); + telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_10_MIN_MS); console.log('All OTLP Lambda invocations and data fetching completed'); - }, 700000); + }, 1800000); describe.each(runtimes)('%s Runtime', (runtime) => { const getResult = () => telemetry[runtime]?.threads[0]?.[0]; diff --git a/integration-tests/tests/payload-size.test.ts b/integration-tests/tests/payload-size.test.ts index af81c6b97..b660e24ca 100644 --- a/integration-tests/tests/payload-size.test.ts +++ b/integration-tests/tests/payload-size.test.ts @@ -70,7 +70,7 @@ describe('Payload Size Integration Tests', () => { console.log(`Extension send-error log lines: ${sendErrorMessages.length}`); console.log('Invocation and telemetry collection complete'); - }, 900000); + }, 1800000); // Assert on the FIRST request's trace. Its flush is deferred to a later // invocation (cold-start race), which is why we invoke a few times — but the diff --git a/integration-tests/tests/snapstart.test.ts b/integration-tests/tests/snapstart.test.ts index 0f9aa1e61..f4b70e833 100644 --- a/integration-tests/tests/snapstart.test.ts +++ b/integration-tests/tests/snapstart.test.ts @@ -45,7 +45,7 @@ describe('Snapstart Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 2, 2, 5000); console.log('All Snapstart Lambda invocations and data fetching completed'); - }, 900000); + }, 1800000); describe.each(runtimes)('%s Runtime with SnapStart', (runtime) => { // With concurrency=2, invocations=2: diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts index 3e69dab78..42df90d7e 100644 --- a/integration-tests/tests/utils/datadog.ts +++ b/integration-tests/tests/utils/datadog.ts @@ -54,6 +54,71 @@ function formatDatadogError(error: unknown, query: string): string { return `Error (query: '${query}'): ${String(error)}`; } +// Total time a single request is allowed to spend waiting out 429s before +// giving up. Kept under the Jest `beforeAll` timeout (30 min) — with margin +// for the other requests a suite makes — so a rate-limited suite fails with +// the rich 429 message rather than timing out. +const MAX_RETRY_WAIT_MS = 20 * 60 * 1000; +// Fallback wait when the API returns 429 without a usable retry-after header. +const DEFAULT_RETRY_AFTER_MS = 5000; +// Upper bound on any single backoff so one large header value can't blow the +// whole budget on the first retry. +const MAX_SINGLE_WAIT_MS = 60 * 1000; + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** + * Reads the retry delay (ms) advertised by a 429 response, honoring + * `retry-after` / `x-ratelimit-reset` (both expressed in seconds). Falls back + * to a default when the header is missing or unparseable, and caps a single + * wait so an oversized value can't consume the entire retry budget at once. + */ +function parseRetryAfterMs(error: AxiosError): number { + const headers = error.response?.headers ?? {}; + const raw = headers['x-ratelimit-reset'] ?? headers['retry-after']; + const seconds = raw !== undefined ? Number(raw) : NaN; + const ms = Number.isFinite(seconds) && seconds > 0 ? seconds * 1000 : DEFAULT_RETRY_AFTER_MS; + return Math.min(ms, MAX_SINGLE_WAIT_MS); +} + +/** + * Executes a Datadog API request, retrying with backoff when the API responds + * 429 Too Many Requests. Honors the rate-limit reset header, adds a little + * random jitter so the ~7 suites running in parallel (each sharing one API + * key) don't retry in lockstep, and caps the total time spent waiting. When the + * budget is exhausted the last 429 is rethrown so callers still surface the + * rich rate-limit error message via formatDatadogError. + */ +async function requestWithRetry(fn: () => Promise, query: string): Promise { + let waited = 0; + let attempt = 0; + // eslint-disable-next-line no-constant-condition + while (true) { + try { + return await fn(); + } catch (error: unknown) { + const is429 = error instanceof AxiosError && error.response?.status === 429; + if (!is429) { + throw error; + } + const jitter = Math.floor(Math.random() * 1000); + const wait = parseRetryAfterMs(error as AxiosError) + jitter; + if (waited + wait > MAX_RETRY_WAIT_MS) { + // Out of budget — rethrow so the caller logs the rich 429 message. + throw error; + } + attempt += 1; + waited += wait; + console.warn( + `Datadog API 429 for '${query}'; retrying in ${wait}ms (attempt ${attempt}, total waited ${waited}ms)`, + ); + await sleep(wait); + } + } +} + export interface DatadogTelemetry { threads: InvocationTracesLogs[][]; // [thread][invocation] metrics: EnhancedMetrics; @@ -137,7 +202,7 @@ export async function getTraces( try { console.log(`Searching for traces: ${query}`); - const initialResponse = await datadogClient.post('/api/v2/spans/events/search', { + const initialResponse = await requestWithRetry(() => datadogClient.post('/api/v2/spans/events/search', { data: { type: 'search_request', attributes: { @@ -152,7 +217,7 @@ export async function getTraces( sort: '-timestamp', }, }, - }); + }), query); const initialSpans = initialResponse.data.data || []; console.log(`Found ${initialSpans.length} initial span(s)`); @@ -169,12 +234,13 @@ export async function getTraces( const allSpans: any[] = []; for (const traceId of traceIds) { - const traceResponse = await datadogClient.post('/api/v2/spans/events/search', { + const traceQuery = `trace_id:${traceId}`; + const traceResponse = await requestWithRetry(() => datadogClient.post('/api/v2/spans/events/search', { data: { type: 'search_request', attributes: { filter: { - query: `trace_id:${traceId}`, + query: traceQuery, from: new Date(fromTime).toISOString(), to: new Date(toTime).toISOString(), }, @@ -183,7 +249,7 @@ export async function getTraces( }, }, }, - }); + }), traceQuery); const traceSpans = traceResponse.data.data || []; console.log(`Trace ${traceId}: ${traceSpans.length} spans`); allSpans.push(...traceSpans); @@ -239,7 +305,7 @@ export async function getLogs( try { console.log(`Searching for logs: ${query}`); - const response = await datadogClient.post('/api/v2/logs/events/search', { + const response = await requestWithRetry(() => datadogClient.post('/api/v2/logs/events/search', { filter: { query: query, from: new Date(fromTime).toISOString(), @@ -248,7 +314,7 @@ export async function getLogs( page: { limit: 1000, }, - }); + }), query); const rawLogs = response.data.data || []; console.log(`Found ${rawLogs.length} log(s)`); @@ -309,13 +375,13 @@ export async function getMetricCount( console.log(`Querying metric count: ${query}`); - const response = await datadogClient.get('/api/v1/query', { + const response = await requestWithRetry(() => datadogClient.get('/api/v1/query', { params: { query, from: Math.floor(fromTime / 1000), to: Math.floor(toTime / 1000), }, - }); + }), query); const series = response.data.series || []; if (series.length === 0) { @@ -337,13 +403,13 @@ async function getMetrics( console.log(`Querying metrics: ${query}`); - const response = await datadogClient.get('/api/v1/query', { + const response = await requestWithRetry(() => datadogClient.get('/api/v1/query', { params: { query, from: Math.floor(fromTime / 1000), to: Math.floor(toTime / 1000), }, - }); + }), query); const series = response.data.series || []; console.log(`Found ${series.length} series for ${metricName}`); @@ -375,13 +441,13 @@ export async function hasMetricWithTag( console.log(`Querying metric with tag filter: ${query}`); - const response = await datadogClient.get('/api/v1/query', { + const response = await requestWithRetry(() => datadogClient.get('/api/v1/query', { params: { query, from: Math.floor(fromTime / 1000), to: Math.floor(toTime / 1000), }, - }); + }), query); const series = response.data.series || []; const hasData = series.some((s: any) => Array.isArray(s.pointlist) && s.pointlist.length > 0); From b1c715728a92845d5e0277d30d1e2455ae36cda0 Mon Sep 17 00:00:00 2001 From: John Chrostek Date: Wed, 24 Jun 2026 14:34:00 -0400 Subject: [PATCH 2/6] Trim self-evident comments from retry helper --- integration-tests/tests/utils/datadog.ts | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts index 42df90d7e..4fb20db0b 100644 --- a/integration-tests/tests/utils/datadog.ts +++ b/integration-tests/tests/utils/datadog.ts @@ -54,27 +54,14 @@ function formatDatadogError(error: unknown, query: string): string { return `Error (query: '${query}'): ${String(error)}`; } -// Total time a single request is allowed to spend waiting out 429s before -// giving up. Kept under the Jest `beforeAll` timeout (30 min) — with margin -// for the other requests a suite makes — so a rate-limited suite fails with -// the rich 429 message rather than timing out. const MAX_RETRY_WAIT_MS = 20 * 60 * 1000; -// Fallback wait when the API returns 429 without a usable retry-after header. const DEFAULT_RETRY_AFTER_MS = 5000; -// Upper bound on any single backoff so one large header value can't blow the -// whole budget on the first retry. const MAX_SINGLE_WAIT_MS = 60 * 1000; function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)); } -/** - * Reads the retry delay (ms) advertised by a 429 response, honoring - * `retry-after` / `x-ratelimit-reset` (both expressed in seconds). Falls back - * to a default when the header is missing or unparseable, and caps a single - * wait so an oversized value can't consume the entire retry budget at once. - */ function parseRetryAfterMs(error: AxiosError): number { const headers = error.response?.headers ?? {}; const raw = headers['x-ratelimit-reset'] ?? headers['retry-after']; @@ -84,12 +71,8 @@ function parseRetryAfterMs(error: AxiosError): number { } /** - * Executes a Datadog API request, retrying with backoff when the API responds - * 429 Too Many Requests. Honors the rate-limit reset header, adds a little - * random jitter so the ~7 suites running in parallel (each sharing one API - * key) don't retry in lockstep, and caps the total time spent waiting. When the - * budget is exhausted the last 429 is rethrown so callers still surface the - * rich rate-limit error message via formatDatadogError. + * Executes a Datadog API request, retrying with jittered backoff on HTTP 429 + * until MAX_RETRY_WAIT_MS is exhausted, then rethrowing the last 429. */ async function requestWithRetry(fn: () => Promise, query: string): Promise { let waited = 0; @@ -106,7 +89,6 @@ async function requestWithRetry(fn: () => Promise, query: string): Promise const jitter = Math.floor(Math.random() * 1000); const wait = parseRetryAfterMs(error as AxiosError) + jitter; if (waited + wait > MAX_RETRY_WAIT_MS) { - // Out of budget — rethrow so the caller logs the rich 429 message. throw error; } attempt += 1; From a4a8cecbed5acc6cc14fdf373d3ffc942212dfd8 Mon Sep 17 00:00:00 2001 From: John Chrostek Date: Wed, 24 Jun 2026 15:14:31 -0400 Subject: [PATCH 3/6] Remove remaining helper comments --- integration-tests/tests/utils/datadog.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts index 4fb20db0b..58184c215 100644 --- a/integration-tests/tests/utils/datadog.ts +++ b/integration-tests/tests/utils/datadog.ts @@ -70,10 +70,6 @@ function parseRetryAfterMs(error: AxiosError): number { return Math.min(ms, MAX_SINGLE_WAIT_MS); } -/** - * Executes a Datadog API request, retrying with jittered backoff on HTTP 429 - * until MAX_RETRY_WAIT_MS is exhausted, then rethrowing the last 429. - */ async function requestWithRetry(fn: () => Promise, query: string): Promise { let waited = 0; let attempt = 0; From d103763e391903b945932d4b4bd009868746c33c Mon Sep 17 00:00:00 2001 From: John Chrostek Date: Wed, 24 Jun 2026 15:31:43 -0400 Subject: [PATCH 4/6] Scope PR to 429 retries only; revert indexing-wait and suite-timeout changes --- integration-tests/config.ts | 4 ++-- integration-tests/jest.config.js | 2 +- integration-tests/tests/auth.test.ts | 2 +- integration-tests/tests/custom-metrics.test.ts | 2 +- integration-tests/tests/lmi.test.ts | 2 +- integration-tests/tests/on-demand.test.ts | 2 +- integration-tests/tests/otlp.test.ts | 6 +++--- integration-tests/tests/payload-size.test.ts | 2 +- integration-tests/tests/snapstart.test.ts | 2 +- integration-tests/tests/utils/datadog.ts | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/integration-tests/config.ts b/integration-tests/config.ts index 8cf232715..f0f8ef42f 100644 --- a/integration-tests/config.ts +++ b/integration-tests/config.ts @@ -4,13 +4,13 @@ export const ACCOUNT = process.env.CDK_DEFAULT_ACCOUNT || process.env.AWS_ACCOUN export const REGION = process.env.CDK_DEFAULT_REGION || process.env.AWS_REGION || 'us-east-1'; // Default wait time for Datadog to index logs and traces after Lambda invocation -export const DEFAULT_DATADOG_INDEXING_WAIT_MS = 10 * 60 * 1000; // 10 minutes +export const DEFAULT_DATADOG_INDEXING_WAIT_MS = 5 * 60 * 1000; // 5 minutes // Extended wait time for async invocations (SQS, SNS) - need more time for message processing export const ASYNC_DATADOG_INDEXING_WAIT_MS = 90 * 1000; // 90 seconds // Extended wait time for tests that need more time (e.g., OTLP tests) -export const DATADOG_INDEXING_WAIT_10_MIN_MS = 10 * 60 * 1000; // 10 minutes +export const DATADOG_INDEXING_WAIT_5_MIN_MS = 5 * 60 * 1000; // 5 minutes export function getIdentifier(): string { diff --git a/integration-tests/jest.config.js b/integration-tests/jest.config.js index 7bac95655..2d10a3e21 100644 --- a/integration-tests/jest.config.js +++ b/integration-tests/jest.config.js @@ -10,7 +10,7 @@ module.exports = { '!tests/**/*.d.ts', ], // Increase timeout for integration tests that involve Lambda invocations and waiting for Datadog - testTimeout: 1800000, // 30 minutes + testTimeout: 900000, // 15 minutes verbose: true, // Reporters for test results reporters: [ diff --git a/integration-tests/tests/auth.test.ts b/integration-tests/tests/auth.test.ts index 8416b0939..7f4879a33 100644 --- a/integration-tests/tests/auth.test.ts +++ b/integration-tests/tests/auth.test.ts @@ -27,7 +27,7 @@ describe('Auth Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 1); console.log('All invocations and data fetching completed'); - }, 1800000); + }, 600000); describe('on-demand (node)', () => { it('should invoke Lambda successfully', () => { diff --git a/integration-tests/tests/custom-metrics.test.ts b/integration-tests/tests/custom-metrics.test.ts index 38b1824d8..d00755bbf 100644 --- a/integration-tests/tests/custom-metrics.test.ts +++ b/integration-tests/tests/custom-metrics.test.ts @@ -34,7 +34,7 @@ describe("Customer Metrics Exclude Tags Integration Tests", () => { metricsEndTime = Date.now(); console.log("Lambdas invoked and indexing wait complete"); - }, 1800000); + }, 900000); describe("unfiltered function (no DD_LAMBDA_CUSTOMER_METRICS_EXCLUDE_TAGS)", () => { it.each(EXCLUDED_TAGS)( diff --git a/integration-tests/tests/lmi.test.ts b/integration-tests/tests/lmi.test.ts index abe1b3610..36b4f4c55 100644 --- a/integration-tests/tests/lmi.test.ts +++ b/integration-tests/tests/lmi.test.ts @@ -22,7 +22,7 @@ describe('LMI Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 1); console.log('LMI invocation and data fetching completed'); - }, 1800000); + }, 600000); describe.each(runtimes)('%s Runtime with LMI', (runtime) => { const getResult = () => telemetry[runtime]?.threads[0]?.[0]; diff --git a/integration-tests/tests/on-demand.test.ts b/integration-tests/tests/on-demand.test.ts index cb59e5cf7..ff88f6108 100644 --- a/integration-tests/tests/on-demand.test.ts +++ b/integration-tests/tests/on-demand.test.ts @@ -22,7 +22,7 @@ describe('On-Demand Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 2, 1, 5000); console.log('All invocations and data fetching completed'); - }, 1800000); + }, 600000); describe.each(runtimes)('%s runtime', (runtime) => { const getTelemetry = () => telemetry[runtime]; diff --git a/integration-tests/tests/otlp.test.ts b/integration-tests/tests/otlp.test.ts index 1d1a9923e..fd836af16 100644 --- a/integration-tests/tests/otlp.test.ts +++ b/integration-tests/tests/otlp.test.ts @@ -1,6 +1,6 @@ import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default'; import { DatadogTelemetry } from './utils/datadog'; -import { IDENTIFIER, DATADOG_INDEXING_WAIT_10_MIN_MS } from '../config'; +import { IDENTIFIER, DATADOG_INDEXING_WAIT_5_MIN_MS } from '../config'; const runtimes = ['node', 'python', 'java', 'dotnet'] as const; type Runtime = typeof runtimes[number]; @@ -30,10 +30,10 @@ describe('OTLP Integration Tests', () => { console.log('Invoking all OTLP Lambda functions...'); // Invoke all OTLP functions and collect telemetry - telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_10_MIN_MS); + telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_5_MIN_MS); console.log('All OTLP Lambda invocations and data fetching completed'); - }, 1800000); + }, 700000); describe.each(runtimes)('%s Runtime', (runtime) => { const getResult = () => telemetry[runtime]?.threads[0]?.[0]; diff --git a/integration-tests/tests/payload-size.test.ts b/integration-tests/tests/payload-size.test.ts index b660e24ca..af81c6b97 100644 --- a/integration-tests/tests/payload-size.test.ts +++ b/integration-tests/tests/payload-size.test.ts @@ -70,7 +70,7 @@ describe('Payload Size Integration Tests', () => { console.log(`Extension send-error log lines: ${sendErrorMessages.length}`); console.log('Invocation and telemetry collection complete'); - }, 1800000); + }, 900000); // Assert on the FIRST request's trace. Its flush is deferred to a later // invocation (cold-start race), which is why we invoke a few times — but the diff --git a/integration-tests/tests/snapstart.test.ts b/integration-tests/tests/snapstart.test.ts index f4b70e833..0f9aa1e61 100644 --- a/integration-tests/tests/snapstart.test.ts +++ b/integration-tests/tests/snapstart.test.ts @@ -45,7 +45,7 @@ describe('Snapstart Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 2, 2, 5000); console.log('All Snapstart Lambda invocations and data fetching completed'); - }, 1800000); + }, 900000); describe.each(runtimes)('%s Runtime with SnapStart', (runtime) => { // With concurrency=2, invocations=2: diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts index 58184c215..24506e416 100644 --- a/integration-tests/tests/utils/datadog.ts +++ b/integration-tests/tests/utils/datadog.ts @@ -54,7 +54,7 @@ function formatDatadogError(error: unknown, query: string): string { return `Error (query: '${query}'): ${String(error)}`; } -const MAX_RETRY_WAIT_MS = 20 * 60 * 1000; +const MAX_RETRY_WAIT_MS = 4 * 60 * 1000; const DEFAULT_RETRY_AFTER_MS = 5000; const MAX_SINGLE_WAIT_MS = 60 * 1000; From fecef3191dfd1dbd9afcc24670ef17e0e590153b Mon Sep 17 00:00:00 2001 From: John Chrostek Date: Wed, 24 Jun 2026 16:04:30 -0400 Subject: [PATCH 5/6] Raise suite timeouts to 30 min; widen retry budget to 20 min --- integration-tests/jest.config.js | 2 +- integration-tests/tests/auth.test.ts | 2 +- integration-tests/tests/custom-metrics.test.ts | 2 +- integration-tests/tests/lmi.test.ts | 2 +- integration-tests/tests/on-demand.test.ts | 2 +- integration-tests/tests/otlp.test.ts | 2 +- integration-tests/tests/payload-size.test.ts | 2 +- integration-tests/tests/snapstart.test.ts | 2 +- integration-tests/tests/utils/datadog.ts | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/integration-tests/jest.config.js b/integration-tests/jest.config.js index 2d10a3e21..7bac95655 100644 --- a/integration-tests/jest.config.js +++ b/integration-tests/jest.config.js @@ -10,7 +10,7 @@ module.exports = { '!tests/**/*.d.ts', ], // Increase timeout for integration tests that involve Lambda invocations and waiting for Datadog - testTimeout: 900000, // 15 minutes + testTimeout: 1800000, // 30 minutes verbose: true, // Reporters for test results reporters: [ diff --git a/integration-tests/tests/auth.test.ts b/integration-tests/tests/auth.test.ts index 7f4879a33..8416b0939 100644 --- a/integration-tests/tests/auth.test.ts +++ b/integration-tests/tests/auth.test.ts @@ -27,7 +27,7 @@ describe('Auth Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 1); console.log('All invocations and data fetching completed'); - }, 600000); + }, 1800000); describe('on-demand (node)', () => { it('should invoke Lambda successfully', () => { diff --git a/integration-tests/tests/custom-metrics.test.ts b/integration-tests/tests/custom-metrics.test.ts index d00755bbf..38b1824d8 100644 --- a/integration-tests/tests/custom-metrics.test.ts +++ b/integration-tests/tests/custom-metrics.test.ts @@ -34,7 +34,7 @@ describe("Customer Metrics Exclude Tags Integration Tests", () => { metricsEndTime = Date.now(); console.log("Lambdas invoked and indexing wait complete"); - }, 900000); + }, 1800000); describe("unfiltered function (no DD_LAMBDA_CUSTOMER_METRICS_EXCLUDE_TAGS)", () => { it.each(EXCLUDED_TAGS)( diff --git a/integration-tests/tests/lmi.test.ts b/integration-tests/tests/lmi.test.ts index 36b4f4c55..abe1b3610 100644 --- a/integration-tests/tests/lmi.test.ts +++ b/integration-tests/tests/lmi.test.ts @@ -22,7 +22,7 @@ describe('LMI Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 1); console.log('LMI invocation and data fetching completed'); - }, 600000); + }, 1800000); describe.each(runtimes)('%s Runtime with LMI', (runtime) => { const getResult = () => telemetry[runtime]?.threads[0]?.[0]; diff --git a/integration-tests/tests/on-demand.test.ts b/integration-tests/tests/on-demand.test.ts index ff88f6108..cb59e5cf7 100644 --- a/integration-tests/tests/on-demand.test.ts +++ b/integration-tests/tests/on-demand.test.ts @@ -22,7 +22,7 @@ describe('On-Demand Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 2, 1, 5000); console.log('All invocations and data fetching completed'); - }, 600000); + }, 1800000); describe.each(runtimes)('%s runtime', (runtime) => { const getTelemetry = () => telemetry[runtime]; diff --git a/integration-tests/tests/otlp.test.ts b/integration-tests/tests/otlp.test.ts index fd836af16..c16128f34 100644 --- a/integration-tests/tests/otlp.test.ts +++ b/integration-tests/tests/otlp.test.ts @@ -33,7 +33,7 @@ describe('OTLP Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_5_MIN_MS); console.log('All OTLP Lambda invocations and data fetching completed'); - }, 700000); + }, 1800000); describe.each(runtimes)('%s Runtime', (runtime) => { const getResult = () => telemetry[runtime]?.threads[0]?.[0]; diff --git a/integration-tests/tests/payload-size.test.ts b/integration-tests/tests/payload-size.test.ts index af81c6b97..b660e24ca 100644 --- a/integration-tests/tests/payload-size.test.ts +++ b/integration-tests/tests/payload-size.test.ts @@ -70,7 +70,7 @@ describe('Payload Size Integration Tests', () => { console.log(`Extension send-error log lines: ${sendErrorMessages.length}`); console.log('Invocation and telemetry collection complete'); - }, 900000); + }, 1800000); // Assert on the FIRST request's trace. Its flush is deferred to a later // invocation (cold-start race), which is why we invoke a few times — but the diff --git a/integration-tests/tests/snapstart.test.ts b/integration-tests/tests/snapstart.test.ts index 0f9aa1e61..f4b70e833 100644 --- a/integration-tests/tests/snapstart.test.ts +++ b/integration-tests/tests/snapstart.test.ts @@ -45,7 +45,7 @@ describe('Snapstart Integration Tests', () => { telemetry = await invokeAndCollectTelemetry(functions, 2, 2, 5000); console.log('All Snapstart Lambda invocations and data fetching completed'); - }, 900000); + }, 1800000); describe.each(runtimes)('%s Runtime with SnapStart', (runtime) => { // With concurrency=2, invocations=2: diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts index 24506e416..58184c215 100644 --- a/integration-tests/tests/utils/datadog.ts +++ b/integration-tests/tests/utils/datadog.ts @@ -54,7 +54,7 @@ function formatDatadogError(error: unknown, query: string): string { return `Error (query: '${query}'): ${String(error)}`; } -const MAX_RETRY_WAIT_MS = 4 * 60 * 1000; +const MAX_RETRY_WAIT_MS = 20 * 60 * 1000; const DEFAULT_RETRY_AFTER_MS = 5000; const MAX_SINGLE_WAIT_MS = 60 * 1000; From 92f23360e13ecfaf5e5ce0c430c82c49fe35abe1 Mon Sep 17 00:00:00 2001 From: John Chrostek Date: Wed, 24 Jun 2026 16:08:48 -0400 Subject: [PATCH 6/6] Cap retry wait at 5 minutes --- integration-tests/tests/utils/datadog.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts index 58184c215..f8d2bcb31 100644 --- a/integration-tests/tests/utils/datadog.ts +++ b/integration-tests/tests/utils/datadog.ts @@ -54,7 +54,7 @@ function formatDatadogError(error: unknown, query: string): string { return `Error (query: '${query}'): ${String(error)}`; } -const MAX_RETRY_WAIT_MS = 20 * 60 * 1000; +const MAX_RETRY_WAIT_MS = 5 * 60 * 1000; const DEFAULT_RETRY_AFTER_MS = 5000; const MAX_SINGLE_WAIT_MS = 60 * 1000;