From 3f646a6ae17a3236b9cb53ed8b60e8fd86ebc8cb Mon Sep 17 00:00:00 2001
From: John Chrostek <john.chrostek@datadoghq.com>
Date: Wed, 24 Jun 2026 12:28:07 -0400
Subject: [PATCH 1/6] ci: retry Datadog telemetry queries on 429; widen
 indexing wait and suite timeouts

---
 integration-tests/config.ts                   |  4 +-
 integration-tests/jest.config.js              |  2 +-
 integration-tests/tests/auth.test.ts          |  2 +-
 .../tests/custom-metrics.test.ts              |  2 +-
 integration-tests/tests/lmi.test.ts           |  2 +-
 integration-tests/tests/on-demand.test.ts     |  2 +-
 integration-tests/tests/otlp.test.ts          |  6 +-
 integration-tests/tests/payload-size.test.ts  |  2 +-
 integration-tests/tests/snapstart.test.ts     |  2 +-
 integration-tests/tests/utils/datadog.ts      | 92 ++++++++++++++++---
 10 files changed, 91 insertions(+), 25 deletions(-)

diff --git a/integration-tests/config.ts b/integration-tests/config.ts
index f0f8ef42f..8cf232715 100644
--- a/integration-tests/config.ts
+++ b/integration-tests/config.ts
@@ -4,13 +4,13 @@ export const ACCOUNT = process.env.CDK_DEFAULT_ACCOUNT || process.env.AWS_ACCOUN
 export const REGION = process.env.CDK_DEFAULT_REGION || process.env.AWS_REGION || 'us-east-1';
 
 // Default wait time for Datadog to index logs and traces after Lambda invocation
-export const DEFAULT_DATADOG_INDEXING_WAIT_MS = 5 * 60 * 1000; // 5 minutes
+export const DEFAULT_DATADOG_INDEXING_WAIT_MS = 10 * 60 * 1000; // 10 minutes
 
 // Extended wait time for async invocations (SQS, SNS) - need more time for message processing
 export const ASYNC_DATADOG_INDEXING_WAIT_MS = 90 * 1000; // 90 seconds
 
 // Extended wait time for tests that need more time (e.g., OTLP tests)
-export const DATADOG_INDEXING_WAIT_5_MIN_MS = 5 * 60 * 1000; // 5 minutes
+export const DATADOG_INDEXING_WAIT_10_MIN_MS = 10 * 60 * 1000; // 10 minutes
 
 
 export function getIdentifier(): string {
diff --git a/integration-tests/jest.config.js b/integration-tests/jest.config.js
index 2d10a3e21..7bac95655 100644
--- a/integration-tests/jest.config.js
+++ b/integration-tests/jest.config.js
@@ -10,7 +10,7 @@ module.exports = {
     '!tests/**/*.d.ts',
   ],
   // Increase timeout for integration tests that involve Lambda invocations and waiting for Datadog
-  testTimeout: 900000, // 15 minutes
+  testTimeout: 1800000, // 30 minutes
   verbose: true,
   // Reporters for test results
   reporters: [
diff --git a/integration-tests/tests/auth.test.ts b/integration-tests/tests/auth.test.ts
index 7f4879a33..8416b0939 100644
--- a/integration-tests/tests/auth.test.ts
+++ b/integration-tests/tests/auth.test.ts
@@ -27,7 +27,7 @@ describe('Auth Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 1);
 
     console.log('All invocations and data fetching completed');
-  }, 600000);
+  }, 1800000);
 
   describe('on-demand (node)', () => {
     it('should invoke Lambda successfully', () => {
diff --git a/integration-tests/tests/custom-metrics.test.ts b/integration-tests/tests/custom-metrics.test.ts
index d00755bbf..38b1824d8 100644
--- a/integration-tests/tests/custom-metrics.test.ts
+++ b/integration-tests/tests/custom-metrics.test.ts
@@ -34,7 +34,7 @@ describe("Customer Metrics Exclude Tags Integration Tests", () => {
     metricsEndTime = Date.now();
 
     console.log("Lambdas invoked and indexing wait complete");
-  }, 900000);
+  }, 1800000);
 
   describe("unfiltered function (no DD_LAMBDA_CUSTOMER_METRICS_EXCLUDE_TAGS)", () => {
     it.each(EXCLUDED_TAGS)(
diff --git a/integration-tests/tests/lmi.test.ts b/integration-tests/tests/lmi.test.ts
index 36b4f4c55..abe1b3610 100644
--- a/integration-tests/tests/lmi.test.ts
+++ b/integration-tests/tests/lmi.test.ts
@@ -22,7 +22,7 @@ describe('LMI Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 1);
 
     console.log('LMI invocation and data fetching completed');
-  }, 600000);
+  }, 1800000);
 
   describe.each(runtimes)('%s Runtime with LMI', (runtime) => {
     const getResult = () => telemetry[runtime]?.threads[0]?.[0];
diff --git a/integration-tests/tests/on-demand.test.ts b/integration-tests/tests/on-demand.test.ts
index ff88f6108..cb59e5cf7 100644
--- a/integration-tests/tests/on-demand.test.ts
+++ b/integration-tests/tests/on-demand.test.ts
@@ -22,7 +22,7 @@ describe('On-Demand Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 2, 1, 5000);
 
     console.log('All invocations and data fetching completed');
-  }, 600000);
+  }, 1800000);
 
   describe.each(runtimes)('%s runtime', (runtime) => {
     const getTelemetry = () => telemetry[runtime];
diff --git a/integration-tests/tests/otlp.test.ts b/integration-tests/tests/otlp.test.ts
index fd836af16..1d1a9923e 100644
--- a/integration-tests/tests/otlp.test.ts
+++ b/integration-tests/tests/otlp.test.ts
@@ -1,6 +1,6 @@
 import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default';
 import { DatadogTelemetry } from './utils/datadog';
-import { IDENTIFIER, DATADOG_INDEXING_WAIT_5_MIN_MS } from '../config';
+import { IDENTIFIER, DATADOG_INDEXING_WAIT_10_MIN_MS } from '../config';
 
 const runtimes = ['node', 'python', 'java', 'dotnet'] as const;
 type Runtime = typeof runtimes[number];
@@ -30,10 +30,10 @@ describe('OTLP Integration Tests', () => {
     console.log('Invoking all OTLP Lambda functions...');
 
     // Invoke all OTLP functions and collect telemetry
-    telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_5_MIN_MS);
+    telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_10_MIN_MS);
 
     console.log('All OTLP Lambda invocations and data fetching completed');
-  }, 700000);
+  }, 1800000);
 
   describe.each(runtimes)('%s Runtime', (runtime) => {
     const getResult = () => telemetry[runtime]?.threads[0]?.[0];
diff --git a/integration-tests/tests/payload-size.test.ts b/integration-tests/tests/payload-size.test.ts
index af81c6b97..b660e24ca 100644
--- a/integration-tests/tests/payload-size.test.ts
+++ b/integration-tests/tests/payload-size.test.ts
@@ -70,7 +70,7 @@ describe('Payload Size Integration Tests', () => {
       console.log(`Extension send-error log lines: ${sendErrorMessages.length}`);
 
       console.log('Invocation and telemetry collection complete');
-    }, 900000);
+    }, 1800000);
 
     // Assert on the FIRST request's trace. Its flush is deferred to a later
     // invocation (cold-start race), which is why we invoke a few times — but the
diff --git a/integration-tests/tests/snapstart.test.ts b/integration-tests/tests/snapstart.test.ts
index 0f9aa1e61..f4b70e833 100644
--- a/integration-tests/tests/snapstart.test.ts
+++ b/integration-tests/tests/snapstart.test.ts
@@ -45,7 +45,7 @@ describe('Snapstart Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 2, 2, 5000);
 
     console.log('All Snapstart Lambda invocations and data fetching completed');
-  }, 900000);
+  }, 1800000);
 
   describe.each(runtimes)('%s Runtime with SnapStart', (runtime) => {
     // With concurrency=2, invocations=2:
diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts
index 3e69dab78..42df90d7e 100644
--- a/integration-tests/tests/utils/datadog.ts
+++ b/integration-tests/tests/utils/datadog.ts
@@ -54,6 +54,71 @@ function formatDatadogError(error: unknown, query: string): string {
   return `Error (query: '${query}'): ${String(error)}`;
 }
 
+// Total time a single request is allowed to spend waiting out 429s before
+// giving up. Kept under the Jest `beforeAll` timeout (30 min) — with margin
+// for the other requests a suite makes — so a rate-limited suite fails with
+// the rich 429 message rather than timing out.
+const MAX_RETRY_WAIT_MS = 20 * 60 * 1000;
+// Fallback wait when the API returns 429 without a usable retry-after header.
+const DEFAULT_RETRY_AFTER_MS = 5000;
+// Upper bound on any single backoff so one large header value can't blow the
+// whole budget on the first retry.
+const MAX_SINGLE_WAIT_MS = 60 * 1000;
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/**
+ * Reads the retry delay (ms) advertised by a 429 response, honoring
+ * `retry-after` / `x-ratelimit-reset` (both expressed in seconds). Falls back
+ * to a default when the header is missing or unparseable, and caps a single
+ * wait so an oversized value can't consume the entire retry budget at once.
+ */
+function parseRetryAfterMs(error: AxiosError): number {
+  const headers = error.response?.headers ?? {};
+  const raw = headers['x-ratelimit-reset'] ?? headers['retry-after'];
+  const seconds = raw !== undefined ? Number(raw) : NaN;
+  const ms = Number.isFinite(seconds) && seconds > 0 ? seconds * 1000 : DEFAULT_RETRY_AFTER_MS;
+  return Math.min(ms, MAX_SINGLE_WAIT_MS);
+}
+
+/**
+ * Executes a Datadog API request, retrying with backoff when the API responds
+ * 429 Too Many Requests. Honors the rate-limit reset header, adds a little
+ * random jitter so the ~7 suites running in parallel (each sharing one API
+ * key) don't retry in lockstep, and caps the total time spent waiting. When the
+ * budget is exhausted the last 429 is rethrown so callers still surface the
+ * rich rate-limit error message via formatDatadogError.
+ */
+async function requestWithRetry<T>(fn: () => Promise<T>, query: string): Promise<T> {
+  let waited = 0;
+  let attempt = 0;
+  // eslint-disable-next-line no-constant-condition
+  while (true) {
+    try {
+      return await fn();
+    } catch (error: unknown) {
+      const is429 = error instanceof AxiosError && error.response?.status === 429;
+      if (!is429) {
+        throw error;
+      }
+      const jitter = Math.floor(Math.random() * 1000);
+      const wait = parseRetryAfterMs(error as AxiosError) + jitter;
+      if (waited + wait > MAX_RETRY_WAIT_MS) {
+        // Out of budget — rethrow so the caller logs the rich 429 message.
+        throw error;
+      }
+      attempt += 1;
+      waited += wait;
+      console.warn(
+        `Datadog API 429 for '${query}'; retrying in ${wait}ms (attempt ${attempt}, total waited ${waited}ms)`,
+      );
+      await sleep(wait);
+    }
+  }
+}
+
 export interface DatadogTelemetry {
   threads: InvocationTracesLogs[][];  // [thread][invocation]
   metrics: EnhancedMetrics;
@@ -137,7 +202,7 @@ export async function getTraces(
   try {
     console.log(`Searching for traces: ${query}`);
 
-    const initialResponse = await datadogClient.post('/api/v2/spans/events/search', {
+    const initialResponse = await requestWithRetry(() => datadogClient.post('/api/v2/spans/events/search', {
       data: {
         type: 'search_request',
         attributes: {
@@ -152,7 +217,7 @@ export async function getTraces(
           sort: '-timestamp',
         },
       },
-    });
+    }), query);
 
     const initialSpans = initialResponse.data.data || [];
     console.log(`Found ${initialSpans.length} initial span(s)`);
@@ -169,12 +234,13 @@ export async function getTraces(
 
     const allSpans: any[] = [];
     for (const traceId of traceIds) {
-      const traceResponse = await datadogClient.post('/api/v2/spans/events/search', {
+      const traceQuery = `trace_id:${traceId}`;
+      const traceResponse = await requestWithRetry(() => datadogClient.post('/api/v2/spans/events/search', {
         data: {
           type: 'search_request',
           attributes: {
             filter: {
-              query: `trace_id:${traceId}`,
+              query: traceQuery,
               from: new Date(fromTime).toISOString(),
               to: new Date(toTime).toISOString(),
             },
@@ -183,7 +249,7 @@ export async function getTraces(
             },
           },
         },
-      });
+      }), traceQuery);
       const traceSpans = traceResponse.data.data || [];
       console.log(`Trace ${traceId}: ${traceSpans.length} spans`);
       allSpans.push(...traceSpans);
@@ -239,7 +305,7 @@ export async function getLogs(
   try {
     console.log(`Searching for logs: ${query}`);
 
-    const response = await datadogClient.post('/api/v2/logs/events/search', {
+    const response = await requestWithRetry(() => datadogClient.post('/api/v2/logs/events/search', {
       filter: {
         query: query,
         from: new Date(fromTime).toISOString(),
@@ -248,7 +314,7 @@ export async function getLogs(
       page: {
         limit: 1000,
       },
-    });
+    }), query);
 
     const rawLogs = response.data.data || [];
     console.log(`Found ${rawLogs.length} log(s)`);
@@ -309,13 +375,13 @@ export async function getMetricCount(
 
   console.log(`Querying metric count: ${query}`);
 
-  const response = await datadogClient.get('/api/v1/query', {
+  const response = await requestWithRetry(() => datadogClient.get('/api/v1/query', {
     params: {
       query,
       from: Math.floor(fromTime / 1000),
       to: Math.floor(toTime / 1000),
     },
-  });
+  }), query);
 
   const series = response.data.series || [];
   if (series.length === 0) {
@@ -337,13 +403,13 @@ async function getMetrics(
 
   console.log(`Querying metrics: ${query}`);
 
-  const response = await datadogClient.get('/api/v1/query', {
+  const response = await requestWithRetry(() => datadogClient.get('/api/v1/query', {
     params: {
       query,
       from: Math.floor(fromTime / 1000),
       to: Math.floor(toTime / 1000),
     },
-  });
+  }), query);
 
   const series = response.data.series || [];
   console.log(`Found ${series.length} series for ${metricName}`);
@@ -375,13 +441,13 @@ export async function hasMetricWithTag(
 
   console.log(`Querying metric with tag filter: ${query}`);
 
-  const response = await datadogClient.get('/api/v1/query', {
+  const response = await requestWithRetry(() => datadogClient.get('/api/v1/query', {
     params: {
       query,
       from: Math.floor(fromTime / 1000),
       to: Math.floor(toTime / 1000),
     },
-  });
+  }), query);
 
   const series = response.data.series || [];
   const hasData = series.some((s: any) => Array.isArray(s.pointlist) && s.pointlist.length > 0);

From b1c715728a92845d5e0277d30d1e2455ae36cda0 Mon Sep 17 00:00:00 2001
From: John Chrostek <john.chrostek@datadoghq.com>
Date: Wed, 24 Jun 2026 14:34:00 -0400
Subject: [PATCH 2/6] Trim self-evident comments from retry helper

---
 integration-tests/tests/utils/datadog.ts | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts
index 42df90d7e..4fb20db0b 100644
--- a/integration-tests/tests/utils/datadog.ts
+++ b/integration-tests/tests/utils/datadog.ts
@@ -54,27 +54,14 @@ function formatDatadogError(error: unknown, query: string): string {
   return `Error (query: '${query}'): ${String(error)}`;
 }
 
-// Total time a single request is allowed to spend waiting out 429s before
-// giving up. Kept under the Jest `beforeAll` timeout (30 min) — with margin
-// for the other requests a suite makes — so a rate-limited suite fails with
-// the rich 429 message rather than timing out.
 const MAX_RETRY_WAIT_MS = 20 * 60 * 1000;
-// Fallback wait when the API returns 429 without a usable retry-after header.
 const DEFAULT_RETRY_AFTER_MS = 5000;
-// Upper bound on any single backoff so one large header value can't blow the
-// whole budget on the first retry.
 const MAX_SINGLE_WAIT_MS = 60 * 1000;
 
 function sleep(ms: number): Promise<void> {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
 
-/**
- * Reads the retry delay (ms) advertised by a 429 response, honoring
- * `retry-after` / `x-ratelimit-reset` (both expressed in seconds). Falls back
- * to a default when the header is missing or unparseable, and caps a single
- * wait so an oversized value can't consume the entire retry budget at once.
- */
 function parseRetryAfterMs(error: AxiosError): number {
   const headers = error.response?.headers ?? {};
   const raw = headers['x-ratelimit-reset'] ?? headers['retry-after'];
@@ -84,12 +71,8 @@ function parseRetryAfterMs(error: AxiosError): number {
 }
 
 /**
- * Executes a Datadog API request, retrying with backoff when the API responds
- * 429 Too Many Requests. Honors the rate-limit reset header, adds a little
- * random jitter so the ~7 suites running in parallel (each sharing one API
- * key) don't retry in lockstep, and caps the total time spent waiting. When the
- * budget is exhausted the last 429 is rethrown so callers still surface the
- * rich rate-limit error message via formatDatadogError.
+ * Executes a Datadog API request, retrying with jittered backoff on HTTP 429
+ * until MAX_RETRY_WAIT_MS is exhausted, then rethrowing the last 429.
  */
 async function requestWithRetry<T>(fn: () => Promise<T>, query: string): Promise<T> {
   let waited = 0;
@@ -106,7 +89,6 @@ async function requestWithRetry<T>(fn: () => Promise<T>, query: string): Promise
       const jitter = Math.floor(Math.random() * 1000);
       const wait = parseRetryAfterMs(error as AxiosError) + jitter;
       if (waited + wait > MAX_RETRY_WAIT_MS) {
-        // Out of budget — rethrow so the caller logs the rich 429 message.
         throw error;
       }
       attempt += 1;

From a4a8cecbed5acc6cc14fdf373d3ffc942212dfd8 Mon Sep 17 00:00:00 2001
From: John Chrostek <john.chrostek@datadoghq.com>
Date: Wed, 24 Jun 2026 15:14:31 -0400
Subject: [PATCH 3/6] Remove remaining helper comments

---
 integration-tests/tests/utils/datadog.ts | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts
index 4fb20db0b..58184c215 100644
--- a/integration-tests/tests/utils/datadog.ts
+++ b/integration-tests/tests/utils/datadog.ts
@@ -70,10 +70,6 @@ function parseRetryAfterMs(error: AxiosError): number {
   return Math.min(ms, MAX_SINGLE_WAIT_MS);
 }
 
-/**
- * Executes a Datadog API request, retrying with jittered backoff on HTTP 429
- * until MAX_RETRY_WAIT_MS is exhausted, then rethrowing the last 429.
- */
 async function requestWithRetry<T>(fn: () => Promise<T>, query: string): Promise<T> {
   let waited = 0;
   let attempt = 0;

From d103763e391903b945932d4b4bd009868746c33c Mon Sep 17 00:00:00 2001
From: John Chrostek <john.chrostek@datadoghq.com>
Date: Wed, 24 Jun 2026 15:31:43 -0400
Subject: [PATCH 4/6] Scope PR to 429 retries only; revert indexing-wait and
 suite-timeout changes

Also lowers MAX_RETRY_WAIT_MS from 20 minutes to 4 minutes.
---
 integration-tests/config.ts                    | 4 ++--
 integration-tests/jest.config.js               | 2 +-
 integration-tests/tests/auth.test.ts           | 2 +-
 integration-tests/tests/custom-metrics.test.ts | 2 +-
 integration-tests/tests/lmi.test.ts            | 2 +-
 integration-tests/tests/on-demand.test.ts      | 2 +-
 integration-tests/tests/otlp.test.ts           | 6 +++---
 integration-tests/tests/payload-size.test.ts   | 2 +-
 integration-tests/tests/snapstart.test.ts      | 2 +-
 integration-tests/tests/utils/datadog.ts       | 2 +-
 10 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/integration-tests/config.ts b/integration-tests/config.ts
index 8cf232715..f0f8ef42f 100644
--- a/integration-tests/config.ts
+++ b/integration-tests/config.ts
@@ -4,13 +4,13 @@ export const ACCOUNT = process.env.CDK_DEFAULT_ACCOUNT || process.env.AWS_ACCOUN
 export const REGION = process.env.CDK_DEFAULT_REGION || process.env.AWS_REGION || 'us-east-1';
 
 // Default wait time for Datadog to index logs and traces after Lambda invocation
-export const DEFAULT_DATADOG_INDEXING_WAIT_MS = 10 * 60 * 1000; // 10 minutes
+export const DEFAULT_DATADOG_INDEXING_WAIT_MS = 5 * 60 * 1000; // 5 minutes
 
 // Extended wait time for async invocations (SQS, SNS) - need more time for message processing
 export const ASYNC_DATADOG_INDEXING_WAIT_MS = 90 * 1000; // 90 seconds
 
 // Extended wait time for tests that need more time (e.g., OTLP tests)
-export const DATADOG_INDEXING_WAIT_10_MIN_MS = 10 * 60 * 1000; // 10 minutes
+export const DATADOG_INDEXING_WAIT_5_MIN_MS = 5 * 60 * 1000; // 5 minutes
 
 
 export function getIdentifier(): string {
diff --git a/integration-tests/jest.config.js b/integration-tests/jest.config.js
index 7bac95655..2d10a3e21 100644
--- a/integration-tests/jest.config.js
+++ b/integration-tests/jest.config.js
@@ -10,7 +10,7 @@ module.exports = {
     '!tests/**/*.d.ts',
   ],
   // Increase timeout for integration tests that involve Lambda invocations and waiting for Datadog
-  testTimeout: 1800000, // 30 minutes
+  testTimeout: 900000, // 15 minutes
   verbose: true,
   // Reporters for test results
   reporters: [
diff --git a/integration-tests/tests/auth.test.ts b/integration-tests/tests/auth.test.ts
index 8416b0939..7f4879a33 100644
--- a/integration-tests/tests/auth.test.ts
+++ b/integration-tests/tests/auth.test.ts
@@ -27,7 +27,7 @@ describe('Auth Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 1);
 
     console.log('All invocations and data fetching completed');
-  }, 1800000);
+  }, 600000);
 
   describe('on-demand (node)', () => {
     it('should invoke Lambda successfully', () => {
diff --git a/integration-tests/tests/custom-metrics.test.ts b/integration-tests/tests/custom-metrics.test.ts
index 38b1824d8..d00755bbf 100644
--- a/integration-tests/tests/custom-metrics.test.ts
+++ b/integration-tests/tests/custom-metrics.test.ts
@@ -34,7 +34,7 @@ describe("Customer Metrics Exclude Tags Integration Tests", () => {
     metricsEndTime = Date.now();
 
     console.log("Lambdas invoked and indexing wait complete");
-  }, 1800000);
+  }, 900000);
 
   describe("unfiltered function (no DD_LAMBDA_CUSTOMER_METRICS_EXCLUDE_TAGS)", () => {
     it.each(EXCLUDED_TAGS)(
diff --git a/integration-tests/tests/lmi.test.ts b/integration-tests/tests/lmi.test.ts
index abe1b3610..36b4f4c55 100644
--- a/integration-tests/tests/lmi.test.ts
+++ b/integration-tests/tests/lmi.test.ts
@@ -22,7 +22,7 @@ describe('LMI Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 1);
 
     console.log('LMI invocation and data fetching completed');
-  }, 1800000);
+  }, 600000);
 
   describe.each(runtimes)('%s Runtime with LMI', (runtime) => {
     const getResult = () => telemetry[runtime]?.threads[0]?.[0];
diff --git a/integration-tests/tests/on-demand.test.ts b/integration-tests/tests/on-demand.test.ts
index cb59e5cf7..ff88f6108 100644
--- a/integration-tests/tests/on-demand.test.ts
+++ b/integration-tests/tests/on-demand.test.ts
@@ -22,7 +22,7 @@ describe('On-Demand Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 2, 1, 5000);
 
     console.log('All invocations and data fetching completed');
-  }, 1800000);
+  }, 600000);
 
   describe.each(runtimes)('%s runtime', (runtime) => {
     const getTelemetry = () => telemetry[runtime];
diff --git a/integration-tests/tests/otlp.test.ts b/integration-tests/tests/otlp.test.ts
index 1d1a9923e..fd836af16 100644
--- a/integration-tests/tests/otlp.test.ts
+++ b/integration-tests/tests/otlp.test.ts
@@ -1,6 +1,6 @@
 import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default';
 import { DatadogTelemetry } from './utils/datadog';
-import { IDENTIFIER, DATADOG_INDEXING_WAIT_10_MIN_MS } from '../config';
+import { IDENTIFIER, DATADOG_INDEXING_WAIT_5_MIN_MS } from '../config';
 
 const runtimes = ['node', 'python', 'java', 'dotnet'] as const;
 type Runtime = typeof runtimes[number];
@@ -30,10 +30,10 @@ describe('OTLP Integration Tests', () => {
     console.log('Invoking all OTLP Lambda functions...');
 
     // Invoke all OTLP functions and collect telemetry
-    telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_10_MIN_MS);
+    telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_5_MIN_MS);
 
     console.log('All OTLP Lambda invocations and data fetching completed');
-  }, 1800000);
+  }, 700000);
 
   describe.each(runtimes)('%s Runtime', (runtime) => {
     const getResult = () => telemetry[runtime]?.threads[0]?.[0];
diff --git a/integration-tests/tests/payload-size.test.ts b/integration-tests/tests/payload-size.test.ts
index b660e24ca..af81c6b97 100644
--- a/integration-tests/tests/payload-size.test.ts
+++ b/integration-tests/tests/payload-size.test.ts
@@ -70,7 +70,7 @@ describe('Payload Size Integration Tests', () => {
       console.log(`Extension send-error log lines: ${sendErrorMessages.length}`);
 
       console.log('Invocation and telemetry collection complete');
-    }, 1800000);
+    }, 900000);
 
     // Assert on the FIRST request's trace. Its flush is deferred to a later
     // invocation (cold-start race), which is why we invoke a few times — but the
diff --git a/integration-tests/tests/snapstart.test.ts b/integration-tests/tests/snapstart.test.ts
index f4b70e833..0f9aa1e61 100644
--- a/integration-tests/tests/snapstart.test.ts
+++ b/integration-tests/tests/snapstart.test.ts
@@ -45,7 +45,7 @@ describe('Snapstart Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 2, 2, 5000);
 
     console.log('All Snapstart Lambda invocations and data fetching completed');
-  }, 1800000);
+  }, 900000);
 
   describe.each(runtimes)('%s Runtime with SnapStart', (runtime) => {
     // With concurrency=2, invocations=2:
diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts
index 58184c215..24506e416 100644
--- a/integration-tests/tests/utils/datadog.ts
+++ b/integration-tests/tests/utils/datadog.ts
@@ -54,7 +54,7 @@ function formatDatadogError(error: unknown, query: string): string {
   return `Error (query: '${query}'): ${String(error)}`;
 }
 
-const MAX_RETRY_WAIT_MS = 20 * 60 * 1000;
+const MAX_RETRY_WAIT_MS = 4 * 60 * 1000;
 const DEFAULT_RETRY_AFTER_MS = 5000;
 const MAX_SINGLE_WAIT_MS = 60 * 1000;
 

From fecef3191dfd1dbd9afcc24670ef17e0e590153b Mon Sep 17 00:00:00 2001
From: John Chrostek <john.chrostek@datadoghq.com>
Date: Wed, 24 Jun 2026 16:04:30 -0400
Subject: [PATCH 5/6] Raise suite timeouts to 30 min; widen retry budget to 20
 min

---
 integration-tests/jest.config.js               | 2 +-
 integration-tests/tests/auth.test.ts           | 2 +-
 integration-tests/tests/custom-metrics.test.ts | 2 +-
 integration-tests/tests/lmi.test.ts            | 2 +-
 integration-tests/tests/on-demand.test.ts      | 2 +-
 integration-tests/tests/otlp.test.ts           | 2 +-
 integration-tests/tests/payload-size.test.ts   | 2 +-
 integration-tests/tests/snapstart.test.ts      | 2 +-
 integration-tests/tests/utils/datadog.ts       | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/integration-tests/jest.config.js b/integration-tests/jest.config.js
index 2d10a3e21..7bac95655 100644
--- a/integration-tests/jest.config.js
+++ b/integration-tests/jest.config.js
@@ -10,7 +10,7 @@ module.exports = {
     '!tests/**/*.d.ts',
   ],
   // Increase timeout for integration tests that involve Lambda invocations and waiting for Datadog
-  testTimeout: 900000, // 15 minutes
+  testTimeout: 1800000, // 30 minutes
   verbose: true,
   // Reporters for test results
   reporters: [
diff --git a/integration-tests/tests/auth.test.ts b/integration-tests/tests/auth.test.ts
index 7f4879a33..8416b0939 100644
--- a/integration-tests/tests/auth.test.ts
+++ b/integration-tests/tests/auth.test.ts
@@ -27,7 +27,7 @@ describe('Auth Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 1);
 
     console.log('All invocations and data fetching completed');
-  }, 600000);
+  }, 1800000);
 
   describe('on-demand (node)', () => {
     it('should invoke Lambda successfully', () => {
diff --git a/integration-tests/tests/custom-metrics.test.ts b/integration-tests/tests/custom-metrics.test.ts
index d00755bbf..38b1824d8 100644
--- a/integration-tests/tests/custom-metrics.test.ts
+++ b/integration-tests/tests/custom-metrics.test.ts
@@ -34,7 +34,7 @@ describe("Customer Metrics Exclude Tags Integration Tests", () => {
     metricsEndTime = Date.now();
 
     console.log("Lambdas invoked and indexing wait complete");
-  }, 900000);
+  }, 1800000);
 
   describe("unfiltered function (no DD_LAMBDA_CUSTOMER_METRICS_EXCLUDE_TAGS)", () => {
     it.each(EXCLUDED_TAGS)(
diff --git a/integration-tests/tests/lmi.test.ts b/integration-tests/tests/lmi.test.ts
index 36b4f4c55..abe1b3610 100644
--- a/integration-tests/tests/lmi.test.ts
+++ b/integration-tests/tests/lmi.test.ts
@@ -22,7 +22,7 @@ describe('LMI Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 1);
 
     console.log('LMI invocation and data fetching completed');
-  }, 600000);
+  }, 1800000);
 
   describe.each(runtimes)('%s Runtime with LMI', (runtime) => {
     const getResult = () => telemetry[runtime]?.threads[0]?.[0];
diff --git a/integration-tests/tests/on-demand.test.ts b/integration-tests/tests/on-demand.test.ts
index ff88f6108..cb59e5cf7 100644
--- a/integration-tests/tests/on-demand.test.ts
+++ b/integration-tests/tests/on-demand.test.ts
@@ -22,7 +22,7 @@ describe('On-Demand Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 2, 1, 5000);
 
     console.log('All invocations and data fetching completed');
-  }, 600000);
+  }, 1800000);
 
   describe.each(runtimes)('%s runtime', (runtime) => {
     const getTelemetry = () => telemetry[runtime];
diff --git a/integration-tests/tests/otlp.test.ts b/integration-tests/tests/otlp.test.ts
index fd836af16..c16128f34 100644
--- a/integration-tests/tests/otlp.test.ts
+++ b/integration-tests/tests/otlp.test.ts
@@ -33,7 +33,7 @@ describe('OTLP Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_5_MIN_MS);
 
     console.log('All OTLP Lambda invocations and data fetching completed');
-  }, 700000);
+  }, 1800000);
 
   describe.each(runtimes)('%s Runtime', (runtime) => {
     const getResult = () => telemetry[runtime]?.threads[0]?.[0];
diff --git a/integration-tests/tests/payload-size.test.ts b/integration-tests/tests/payload-size.test.ts
index af81c6b97..b660e24ca 100644
--- a/integration-tests/tests/payload-size.test.ts
+++ b/integration-tests/tests/payload-size.test.ts
@@ -70,7 +70,7 @@ describe('Payload Size Integration Tests', () => {
       console.log(`Extension send-error log lines: ${sendErrorMessages.length}`);
 
       console.log('Invocation and telemetry collection complete');
-    }, 900000);
+    }, 1800000);
 
     // Assert on the FIRST request's trace. Its flush is deferred to a later
     // invocation (cold-start race), which is why we invoke a few times — but the
diff --git a/integration-tests/tests/snapstart.test.ts b/integration-tests/tests/snapstart.test.ts
index 0f9aa1e61..f4b70e833 100644
--- a/integration-tests/tests/snapstart.test.ts
+++ b/integration-tests/tests/snapstart.test.ts
@@ -45,7 +45,7 @@ describe('Snapstart Integration Tests', () => {
     telemetry = await invokeAndCollectTelemetry(functions, 2, 2, 5000);
 
     console.log('All Snapstart Lambda invocations and data fetching completed');
-  }, 900000);
+  }, 1800000);
 
   describe.each(runtimes)('%s Runtime with SnapStart', (runtime) => {
     // With concurrency=2, invocations=2:
diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts
index 24506e416..58184c215 100644
--- a/integration-tests/tests/utils/datadog.ts
+++ b/integration-tests/tests/utils/datadog.ts
@@ -54,7 +54,7 @@ function formatDatadogError(error: unknown, query: string): string {
   return `Error (query: '${query}'): ${String(error)}`;
 }
 
-const MAX_RETRY_WAIT_MS = 4 * 60 * 1000;
+const MAX_RETRY_WAIT_MS = 20 * 60 * 1000;
 const DEFAULT_RETRY_AFTER_MS = 5000;
 const MAX_SINGLE_WAIT_MS = 60 * 1000;
 

From 92f23360e13ecfaf5e5ce0c430c82c49fe35abe1 Mon Sep 17 00:00:00 2001
From: John Chrostek <john.chrostek@datadoghq.com>
Date: Wed, 24 Jun 2026 16:08:48 -0400
Subject: [PATCH 6/6] Cap total retry wait budget at 5 minutes

---
 integration-tests/tests/utils/datadog.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts
index 58184c215..f8d2bcb31 100644
--- a/integration-tests/tests/utils/datadog.ts
+++ b/integration-tests/tests/utils/datadog.ts
@@ -54,7 +54,7 @@ function formatDatadogError(error: unknown, query: string): string {
   return `Error (query: '${query}'): ${String(error)}`;
 }
 
-const MAX_RETRY_WAIT_MS = 20 * 60 * 1000;
+const MAX_RETRY_WAIT_MS = 5 * 60 * 1000;
 const DEFAULT_RETRY_AFTER_MS = 5000;
 const MAX_SINGLE_WAIT_MS = 60 * 1000;