From af5ebdae81934d8ae5a17abc566c4a8f95db99ce Mon Sep 17 00:00:00 2001 From: John Chrostek Date: Wed, 24 Jun 2026 07:22:39 -0400 Subject: [PATCH] ci: unique identifier per integration-suite attempt Integration-suite deploys intermittently failed at 'cdk deploy' (before any test runs) because the test stack was named only after the commit SHA, so every retry of the job reused the same name. When a retry's teardown deleted a function's log group while another attempt was still invoking that function, the Lambda service recreated the group as an unmanaged, never-expire group. It survived 'cdk destroy' and blocked the next attempt, which reused the same name and failed to auto-import it (the construct is RemovalPolicy.DESTROY, not Retain). Include CI_JOB_ID in the identifier so every attempt (including retries) gets unique stack, function, and log group names that never recur; a leftover group can therefore no longer collide with a later deploy. CI_JOB_ID is unique per retry and available in both script and after_script, so teardown still targets the right stack. The identifier also carries a short 'it-' prefix (replacing the longer 'integ-') to keep generated Lambda names under the 64-char limit; the shared integ-auth-delegated-role and capacity provider are unchanged. 
--- .gitlab/templates/pipeline.yaml.tpl | 6 ++--- integration-tests/bin/app.ts | 22 +++++++++---------- integration-tests/config.ts | 6 +++-- integration-tests/tests/auth.test.ts | 5 ++--- .../tests/custom-metrics.test.ts | 5 ++--- integration-tests/tests/lmi-oom.test.ts | 5 ++--- integration-tests/tests/lmi.test.ts | 5 ++--- integration-tests/tests/on-demand.test.ts | 5 ++--- integration-tests/tests/oom.test.ts | 5 ++--- integration-tests/tests/otlp.test.ts | 5 ++--- integration-tests/tests/payload-size.test.ts | 5 ++--- integration-tests/tests/snapstart.test.ts | 5 ++--- 12 files changed, 35 insertions(+), 44 deletions(-) diff --git a/.gitlab/templates/pipeline.yaml.tpl b/.gitlab/templates/pipeline.yaml.tpl index a87bfaa14..776a7c5fa 100644 --- a/.gitlab/templates/pipeline.yaml.tpl +++ b/.gitlab/templates/pipeline.yaml.tpl @@ -626,7 +626,7 @@ integration-suite: - build ruby lambdas - build go lambdas variables: - IDENTIFIER: ${CI_COMMIT_SHORT_SHA} + IDENTIFIER: it-${CI_COMMIT_SHORT_SHA}-${CI_JOB_ID} AWS_DEFAULT_REGION: us-east-1 DD_SITE: datadoghq.com {{ with $environment := (ds "environments").environments.sandbox }} @@ -649,7 +649,7 @@ integration-suite: - export CDK_DEFAULT_ACCOUNT=$(aws sts get-caller-identity --query Account --output text) - export CDK_DEFAULT_REGION=us-east-1 - npm run build - - npx cdk deploy "integ-${IDENTIFIER}-${TEST_SUITE}" --require-approval never --import-existing-resources + - npx cdk deploy "${IDENTIFIER}-${TEST_SUITE}" --require-approval never --import-existing-resources - echo "Running ${TEST_SUITE} integration tests with identifier ${IDENTIFIER}..." - export TEST_SUITE=${TEST_SUITE} - npx jest tests/${TEST_SUITE}.test.ts @@ -658,7 +658,7 @@ integration-suite: - EXTERNAL_ID_NAME={{ $environment.external_id }} ROLE_TO_ASSUME={{ $environment.role_to_assume }} AWS_ACCOUNT={{ $environment.account }} source .gitlab/scripts/get_secrets.sh - echo "Destroying ${TEST_SUITE} CDK stack with identifier ${IDENTIFIER}..." 
- | - STACK_NAME="integ-${IDENTIFIER}-${TEST_SUITE}" + STACK_NAME="${IDENTIFIER}-${TEST_SUITE}" # Check if stack exists STACK_STATUS=$(aws cloudformation describe-stacks \ diff --git a/integration-tests/bin/app.ts b/integration-tests/bin/app.ts index 2bc9cf07b..b34f6b910 100644 --- a/integration-tests/bin/app.ts +++ b/integration-tests/bin/app.ts @@ -11,7 +11,7 @@ import {LmiOom} from '../lib/stacks/lmi-oom'; import {CustomMetrics} from '../lib/stacks/custom-metrics'; import {PayloadSize} from '../lib/stacks/payload-size'; import {AuthRoleStack} from '../lib/auth-role'; -import {ACCOUNT, getIdentifier, REGION} from '../config'; +import {ACCOUNT, IDENTIFIER, REGION} from '../config'; import {CapacityProviderStack} from "../lib/capacity-provider"; const app = new cdk.App(); @@ -21,39 +21,37 @@ const env = { region: REGION, }; -const identifier = getIdentifier(); - // Use the same Lambda Managed Instance Capacity Provider for all LMI functions. // It is slow to create/destroy the related resources. 
new CapacityProviderStack(app, `integ-default-capacity-provider`, {env}); new AuthRoleStack(app, `integ-auth-role`, {env}); const stacks = [ - new OnDemand(app, `integ-${identifier}-on-demand`, { + new OnDemand(app, `${IDENTIFIER}-on-demand`, { env, }), - new Otlp(app, `integ-${identifier}-otlp`, { + new Otlp(app, `${IDENTIFIER}-otlp`, { env, }), - new Snapstart(app, `integ-${identifier}-snapstart`, { + new Snapstart(app, `${IDENTIFIER}-snapstart`, { env, }), - new LambdaManagedInstancesStack(app, `integ-${identifier}-lmi`, { + new LambdaManagedInstancesStack(app, `${IDENTIFIER}-lmi`, { env, }), - new AuthStack(app, `integ-${identifier}-auth`, { + new AuthStack(app, `${IDENTIFIER}-auth`, { env, }), - new Oom(app, `integ-${identifier}-oom`, { + new Oom(app, `${IDENTIFIER}-oom`, { env, }), - new LmiOom(app, `integ-${identifier}-lmi-oom`, { + new LmiOom(app, `${IDENTIFIER}-lmi-oom`, { env, }), - new CustomMetrics(app, `integ-${identifier}-custom-metrics`, { + new CustomMetrics(app, `${IDENTIFIER}-custom-metrics`, { env, }), - new PayloadSize(app, `integ-${identifier}-payload-size`, { + new PayloadSize(app, `${IDENTIFIER}-payload-size`, { env, }), ] diff --git a/integration-tests/config.ts b/integration-tests/config.ts index 3669acf12..f0f8ef42f 100644 --- a/integration-tests/config.ts +++ b/integration-tests/config.ts @@ -22,11 +22,13 @@ export function getIdentifier(): string { const username = os.userInfo().username; const firstName = username.split('.')[0]; if (firstName && firstName.length > 0) { - return firstName; + return `it-${firstName}`; } } catch (error) { console.error('Error getting identifier:', error); } - return 'integration'; + return 'it-integration'; } + +export const IDENTIFIER = getIdentifier(); diff --git a/integration-tests/tests/auth.test.ts b/integration-tests/tests/auth.test.ts index 6a62e0477..7f4879a33 100644 --- a/integration-tests/tests/auth.test.ts +++ b/integration-tests/tests/auth.test.ts @@ -1,10 +1,9 @@ import { 
invokeAndCollectTelemetry, FunctionConfig } from './utils/default'; import { DatadogTelemetry } from './utils/datadog'; import { forceColdStart, publishVersion, waitForSnapStartReady } from './utils/lambda'; -import { getIdentifier } from '../config'; +import { IDENTIFIER } from '../config'; -const identifier = getIdentifier(); -const stackName = `integ-${identifier}-auth`; +const stackName = `${IDENTIFIER}-auth`; describe('Auth Integration Tests', () => { let telemetry: Record; diff --git a/integration-tests/tests/custom-metrics.test.ts b/integration-tests/tests/custom-metrics.test.ts index 5e960a5ea..d00755bbf 100644 --- a/integration-tests/tests/custom-metrics.test.ts +++ b/integration-tests/tests/custom-metrics.test.ts @@ -1,9 +1,8 @@ import { hasMetricWithTag } from "./utils/datadog"; import { forceColdStart, invokeLambda } from "./utils/lambda"; -import { getIdentifier, DEFAULT_DATADOG_INDEXING_WAIT_MS } from "../config"; +import { IDENTIFIER, DEFAULT_DATADOG_INDEXING_WAIT_MS } from "../config"; -const identifier = getIdentifier(); -const stackName = `integ-${identifier}-custom-metrics`; +const stackName = `${IDENTIFIER}-custom-metrics`; const CUSTOM_METRIC_NAME = "custom.exclude_tags_test"; const EXCLUDED_TAGS = ["function_arn", "region"]; diff --git a/integration-tests/tests/lmi-oom.test.ts b/integration-tests/tests/lmi-oom.test.ts index ed43e821d..5b89c344a 100644 --- a/integration-tests/tests/lmi-oom.test.ts +++ b/integration-tests/tests/lmi-oom.test.ts @@ -1,6 +1,6 @@ import { invokeLambda } from './utils/lambda'; import { getMetricCount, OUT_OF_MEMORY_METRIC } from './utils/datadog'; -import { getIdentifier } from '../config'; +import { IDENTIFIER } from '../config'; /** * LMI OOM test. @@ -15,8 +15,7 @@ import { getIdentifier } from '../config'; * (e.g. a future change where `handle_managed_instance_report` surfaces * `Runtime.OutOfMemory` in the synthesized runtime-done). 
*/ -const identifier = getIdentifier(); -const stackName = `integ-${identifier}-lmi-oom`; +const stackName = `${IDENTIFIER}-lmi-oom`; const functionName = `${stackName}-python-lambda`; const INITIAL_WAIT_MS = 90 * 1000; diff --git a/integration-tests/tests/lmi.test.ts b/integration-tests/tests/lmi.test.ts index e2574ed30..36b4f4c55 100644 --- a/integration-tests/tests/lmi.test.ts +++ b/integration-tests/tests/lmi.test.ts @@ -1,12 +1,11 @@ import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default'; import { DatadogTelemetry } from './utils/datadog'; -import { getIdentifier } from '../config'; +import { IDENTIFIER } from '../config'; const runtimes = ['node', 'python', 'java', 'dotnet'] as const; type Runtime = typeof runtimes[number]; -const identifier = getIdentifier(); -const stackName = `integ-${identifier}-lmi`; +const stackName = `${IDENTIFIER}-lmi`; describe('LMI Integration Tests', () => { let telemetry: Record; diff --git a/integration-tests/tests/on-demand.test.ts b/integration-tests/tests/on-demand.test.ts index 8c26d06ab..ff88f6108 100644 --- a/integration-tests/tests/on-demand.test.ts +++ b/integration-tests/tests/on-demand.test.ts @@ -1,13 +1,12 @@ import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default'; import { DatadogTelemetry, DURATION_METRICS } from './utils/datadog'; import { forceColdStart } from './utils/lambda'; -import { getIdentifier } from '../config'; +import { IDENTIFIER } from '../config'; const runtimes = ['node', 'python', 'java', 'dotnet'] as const; type Runtime = typeof runtimes[number]; -const identifier = getIdentifier(); -const stackName = `integ-${identifier}-on-demand`; +const stackName = `${IDENTIFIER}-on-demand`; describe('On-Demand Integration Tests', () => { let telemetry: Record; diff --git a/integration-tests/tests/oom.test.ts b/integration-tests/tests/oom.test.ts index b00b80bff..e1678e5f2 100644 --- a/integration-tests/tests/oom.test.ts +++ b/integration-tests/tests/oom.test.ts @@ -1,6 
+1,6 @@ import { invokeLambda } from './utils/lambda'; import { getMetricCount, OUT_OF_MEMORY_METRIC } from './utils/datadog'; -import { getIdentifier } from '../config'; +import { IDENTIFIER } from '../config'; /** * Cross-runtime OOM test. @@ -25,8 +25,7 @@ import { getIdentifier } from '../config'; * after an initial wait we re-query every 30s until every runtime reports * count>=1 or the overall budget is exhausted. */ -const identifier = getIdentifier(); -const stackName = `integ-${identifier}-oom`; +const stackName = `${IDENTIFIER}-oom`; interface OomCase { runtime: string; diff --git a/integration-tests/tests/otlp.test.ts b/integration-tests/tests/otlp.test.ts index a0c0b0c58..fd836af16 100644 --- a/integration-tests/tests/otlp.test.ts +++ b/integration-tests/tests/otlp.test.ts @@ -1,12 +1,11 @@ import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default'; import { DatadogTelemetry } from './utils/datadog'; -import { getIdentifier, DATADOG_INDEXING_WAIT_5_MIN_MS } from '../config'; +import { IDENTIFIER, DATADOG_INDEXING_WAIT_5_MIN_MS } from '../config'; const runtimes = ['node', 'python', 'java', 'dotnet'] as const; type Runtime = typeof runtimes[number]; -const identifier = getIdentifier(); -const stackName = `integ-${identifier}-otlp`; +const stackName = `${IDENTIFIER}-otlp`; describe('OTLP Integration Tests', () => { let telemetry: Record; diff --git a/integration-tests/tests/payload-size.test.ts b/integration-tests/tests/payload-size.test.ts index 965cba34b..af81c6b97 100644 --- a/integration-tests/tests/payload-size.test.ts +++ b/integration-tests/tests/payload-size.test.ts @@ -2,7 +2,7 @@ import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default'; import { DatadogTelemetry } from './utils/datadog'; import { forceColdStart } from './utils/lambda'; import { filterLogMessages } from './utils/cloudwatch'; -import { getIdentifier } from '../config'; +import { IDENTIFIER } from '../config'; // The enriched payload must be 
large enough to need a high batch cap, yet stay // under the 12 MB cap so it flushes in a single batch without a 413. @@ -12,8 +12,7 @@ const MIN_ENRICHED_BYTES = 10_000_000; const SPAN_COUNT = 400; const PAYLOAD_BYTES = 24_000; -const identifier = getIdentifier(); -const stackName = `integ-${identifier}-payload-size`; +const stackName = `${IDENTIFIER}-payload-size`; describe('Payload Size Integration Tests', () => { diff --git a/integration-tests/tests/snapstart.test.ts b/integration-tests/tests/snapstart.test.ts index 7dbbc7f50..0f9aa1e61 100644 --- a/integration-tests/tests/snapstart.test.ts +++ b/integration-tests/tests/snapstart.test.ts @@ -1,13 +1,12 @@ import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default'; import { DatadogTelemetry } from './utils/datadog'; import { publishVersion, waitForSnapStartReady } from './utils/lambda'; -import { getIdentifier } from '../config'; +import { IDENTIFIER } from '../config'; const runtimes = ['java', 'dotnet'] as const; type Runtime = typeof runtimes[number]; -const identifier = getIdentifier(); -const stackName = `integ-${identifier}-snapstart`; +const stackName = `${IDENTIFIER}-snapstart`; describe('Snapstart Integration Tests', () => { let telemetry: Record;