-
Notifications
You must be signed in to change notification settings - Fork 182
feat(ci): merge-gate job that checks failures against a flaky allowlist #4028
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,152 @@ | ||
| # flaky-jobs.txt — glob patterns for CI jobs EXCLUDED from the merge gate. | ||
|
|
||
| aggregate tested versions | ||
| Amazon_Linux_2023_amd64.SI94:* | ||
| Amazon_Linux_2023_amd64.SID0:* | ||
| Amazon_Linux_2_arm64.SI94:* | ||
| Amazon_Linux_2_arm64.SID0:* | ||
| API unit tests:* | ||
| appsec integration tests (helper-rust):* | ||
| appsec integration tests (ssi):* | ||
| appsec integration tests:* | ||
| ASAN test_c with multiple observers:* | ||
| ASAN test_c:* | ||
| Cargo test | ||
| compile extension windows:* | ||
| compile extension: debug-zts-asan:* | ||
| compile extension: debug:* | ||
| compile tracing extension asan:* | ||
| compile tracing extension:* | ||
| compile tracing sidecar:* | ||
| create-multiarch-lib-injection-image | ||
| Debian_11_amd64.SID0:* | ||
| Debian_12_amd64.SID0:* | ||
| deploy-s3 | ||
| Disabled test_c run:* | ||
| Extension Tea Tests:* | ||
| framework test:* | ||
| generate-lib-init-pinned-tag-values | ||
| helper-rust integration coverage | ||
| installer tests | ||
| K8S_LIB_INJECTION_UDS:* | ||
| linux-php-laravel-realworld-parallel | ||
| linux-php-symfony-realworld-parallel | ||
| Loader test on amd64 libc:* | ||
| macrobenchmarks:* | ||
| min install tests | ||
| package-oci:* | ||
| pecl tests:* | ||
| PHP Language Tests:* | ||
| PHP language tests:* | ||
| php-app.amd64.DOC:* | ||
| php-app.arm64.DOC:* | ||
| php-laravel-realworld-parallel-check-slo-breaches | ||
| php-symfony-realworld-parallel-check-slo-breaches | ||
| php-symfony-realworld-parallel-generate-slos | ||
| php-symfony-realworld-parallel-upload-to-bp-api | ||
| PHP: FrankePHP Demo | ||
| PHP: Laravel Release | ||
| PHP: Shopware Demo | ||
| PHP: Symfony Demo | ||
| profiling tests:* | ||
| randomized tests:* | ||
| RedHat_8_6_arm64.SID0:* | ||
| System Tests:* | ||
| test appsec extension:* | ||
| test early PHP 8.1 | ||
| test_auto_instrumentation:* | ||
| test_composer:* | ||
| test_distributed_tracing:* | ||
| test_extension_ci:* | ||
| test_integrations_amqp2:* | ||
| test_integrations_curl:* | ||
| test_integrations_deferred_loading:* | ||
| test_integrations_elasticsearch8:* | ||
| test_integrations_elasticsearch_latest:* | ||
| test_integrations_frankenphp:* | ||
| test_integrations_googlespanner_latest:* | ||
| test_integrations_guzzle_latest:* | ||
| test_integrations_kafka:* | ||
| test_integrations_memcached:* | ||
| test_integrations_monolog2:* | ||
| test_integrations_mysqli:* | ||
| test_integrations_openai_latest:* | ||
| test_integrations_pcntl:* | ||
| test_integrations_predis_2:* | ||
| test_integrations_sqlsrv:* | ||
| test_integrations_swoole_5:* | ||
| test_metrics:* | ||
| test_opentelemetry_1:* | ||
| test_opentelemetry_beta:* | ||
| test_opentracing_10:* | ||
| test_web_cakephp_28:* | ||
| test_web_cakephp_310:* | ||
| test_web_cakephp_45:* | ||
| test_web_cakephp_latest:* | ||
| test_web_codeigniter_22:* | ||
| test_web_codeigniter_31:* | ||
| test_web_custom:* | ||
| test_web_drupal_101:* | ||
| test_web_drupal_89:* | ||
| test_web_drupal_95:* | ||
| test_web_laminas_mvc_33:* | ||
| test_web_laminas_mvc_latest:* | ||
| test_web_laminas_rest_latest:* | ||
| test_web_laravel_10x:* | ||
| test_web_laravel_11x:* | ||
| test_web_laravel_42:* | ||
| test_web_laravel_57:* | ||
| test_web_laravel_58:* | ||
| test_web_laravel_8x:* | ||
| test_web_laravel_9x:* | ||
| test_web_laravel_latest:* | ||
| test_web_laravel_octane_latest:* | ||
| test_web_lumen_100:* | ||
| test_web_lumen_52:* | ||
| test_web_lumen_56:* | ||
| test_web_lumen_58:* | ||
| test_web_lumen_81:* | ||
| test_web_lumen_90:* | ||
| test_web_magento_23:* | ||
| test_web_magento_24:* | ||
| test_web_nette_24:* | ||
| test_web_nette_31:* | ||
| test_web_nette_latest:* | ||
| test_web_slim_312:* | ||
| test_web_slim_48:* | ||
| test_web_slim_latest:* | ||
| test_web_symfony_23:* | ||
| test_web_symfony_28:* | ||
| test_web_symfony_30:* | ||
| test_web_symfony_33:* | ||
| test_web_symfony_34:* | ||
| test_web_symfony_40:* | ||
| test_web_symfony_42:* | ||
| test_web_symfony_44:* | ||
| test_web_symfony_50:* | ||
| test_web_symfony_51:* | ||
| test_web_symfony_52:* | ||
| test_web_symfony_62:* | ||
| test_web_symfony_73:* | ||
| test_web_symfony_latest:* | ||
| test_web_wordpress_48:* | ||
| test_web_wordpress_55:* | ||
| test_web_wordpress_59:* | ||
| test_web_wordpress_61:* | ||
| test_web_yii_2049:* | ||
| test_web_yii_latest:* | ||
| test_web_zend_1:* | ||
| test_web_zend_1_21:* | ||
| Ubuntu_20_amd64.SID0:* | ||
| Ubuntu_22_arm64.SI94:* | ||
| Ubuntu_22_arm64.SID0:* | ||
| Ubuntu_23_10_arm64.SI94:* | ||
| Ubuntu_23_10_arm64.SID0:* | ||
| Ubuntu_23_10_arm64.SIM:* | ||
| Ubuntu_24_10_amd64.SID0:* | ||
| Ubuntu_24_amd64.SID0:* | ||
| Unit tests:* | ||
| update-latest-versions | ||
| verify windows | ||
| windows test_c:* | ||
| Zend Abstract Interface Tests:* |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
| #!/usr/bin/env bash | ||
| # Merge gate: passes iff every non-flaky job in this pipeline (and its triggered | ||
| # child pipelines) succeeded. It collects failed jobs via the GitLab API and | ||
| # classifies each against the glob patterns in .gitlab/flaky-jobs.txt; a failure | ||
| # matching no pattern is a real regression and fails the gate. See the | ||
| # `merge-gate` job in .gitlab-ci.yml. | ||
| set -uo pipefail | ||
|
|
||
| # Short-lived GitLab API token, same path as `analyze and create pr` | ||
| # (Vault-issued JWT for the 'sdm' audience -> BTI CI API). | ||
#######################################
# Fetch a Vault-issued OIDC token for the given audience and print it.
# Endpoints are tried in order: ${VAULT_ADDR} (if set), then
# https://vault.${DD_DATACENTER} (if set). An endpoint is skipped when the
# request fails or when the response carries no usable `.data.token`
# (empty output, or jq's literal "null" for a missing key), so a bad answer
# from the first endpoint no longer masks a working fallback.
# Globals:   VAULT_ADDR (read, optional), DD_DATACENTER (read, optional)
# Arguments: $1 - OIDC audience (role name in the Vault token path)
# Outputs:   the token on stdout
# Returns:   0 when a token was printed, 1 otherwise
#######################################
_vault_jwt() {
  local audience="$1"
  local base token
  local -a bases=()

  [[ -n "${VAULT_ADDR:-}" ]] && bases+=("${VAULT_ADDR}")
  [[ -n "${DD_DATACENTER:-}" ]] && bases+=("https://vault.${DD_DATACENTER}")

  # ${arr[@]+...} keeps the expansion safe under `set -u` when no endpoint is
  # configured (older bash treats an empty array as unset).
  for base in ${bases[@]+"${bases[@]}"}; do
    token=$(curl -sf -H "X-Vault-Request: true" \
      "${base}/v1/identity/oidc/token/${audience}" \
      | jq -r '.data.token' 2>/dev/null) || continue
    # Validate here rather than trusting the pipeline status: without
    # pipefail a failed curl still yields exit 0 from jq, and a response
    # lacking the key makes `jq -r` print "null".
    if [[ -n "${token}" && "${token}" != "null" ]]; then
      printf '%s\n' "${token}"
      return 0
    fi
  done
  return 1
}
# Exchange the Vault JWT for a GitLab API token. Both credentials are checked
# for emptiness / jq's literal "null": an unusable token would make every later
# API call fail, and the gate must not reach a verdict without API access.
BTI_JWT=$(_vault_jwt sdm) || BTI_JWT=""
if [[ -z "${BTI_JWT}" || "${BTI_JWT}" == "null" ]]; then
  echo "ERROR: could not obtain a BTI JWT" >&2
  exit 1
fi
# `.token // empty` makes jq print nothing (instead of "null") when the
# response has no token field.
GITLAB_TOKEN=$(curl -sf -H "Authorization: Bearer ${BTI_JWT}" \
  "https://bti-ci-api.us1.ddbuild.io/internal/ci/gitlab/token?owner=DataDog&repository=dd-trace-php" \
  | jq -r '.token // empty') || GITLAB_TOKEN=""
if [[ -z "${GITLAB_TOKEN}" ]]; then
  echo "ERROR: could not obtain a GitLab API token" >&2
  exit 1
fi
GITLAB_API="https://gitlab.ddbuild.io/api/v4"
AUTH="PRIVATE-TOKEN: ${GITLAB_TOKEN}"
|
|
||
# Pipelines to inspect: this parent pipeline + every triggered child.
pipelines=("${CI_PIPELINE_ID}")
# Abort on an API error instead of pretending there are no bridges: a silent
# "[]" here would leave every child pipeline uninspected.
bridges=$(curl -sf -H "${AUTH}" \
  "${GITLAB_API}/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges?per_page=100") || {
  echo "ERROR: could not list trigger bridges of pipeline ${CI_PIPELINE_ID}" >&2
  exit 1
}
while read -r child; do
  [ -n "${child}" ] && pipelines+=("${child}")
done < <(jq -r '.[] | select(.downstream_pipeline != null) | .downstream_pipeline.id' <<<"${bridges}")

# Collect the names of all failed jobs across those pipelines.
: > failed_jobs.txt
# A bridge that failed before creating its child pipeline has no downstream
# pipeline to inspect and is not returned by the per-pipeline jobs listing.
# Record it under a prefix that no allowlist pattern starts with, so it is
# classified as a non-flaky failure.
jq -r '.[]
  | select(.status == "failed" and .downstream_pipeline == null)
  | "trigger bridge without child pipeline: \(.name)"' <<<"${bridges}" >> failed_jobs.txt
| for pid in "${pipelines[@]}"; do | ||
| for page in 1 2 3 4 5; do | ||
# Abort on a failed request instead of substituting "[]": an empty list is
# indistinguishable from "no failed jobs" and would let the gate pass without
# having inspected this pipeline.
data=$(curl -g -sf -H "${AUTH}" \
  "${GITLAB_API}/projects/${CI_PROJECT_ID}/pipelines/${pid}/jobs?scope[]=failed&per_page=100&page=${page}") || {
  echo "ERROR: could not list failed jobs of pipeline ${pid} (page ${page})" >&2
  exit 1
}
|
Comment on lines
+42
to
+43
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
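In any run where GitLab returns a non-2xx response here (for example an expired/empty BTI token, permission problem, or API outage), `curl -sf` fails and the `|| echo "[]"` fallback turns the error into an empty job list, so no failures are recorded for that pipeline and the merge gate can pass without having inspected it. Useful? React with 👍 / 👎. |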
||
| echo "${data}" | jq -r '.[] | select(.status == "failed") | .name' >> failed_jobs.txt | ||
| [ "$(echo "${data}" | jq 'length')" -lt 100 ] && break | ||
| done | ||
| done | ||
| sort -u failed_jobs.txt -o failed_jobs.txt | ||
|
|
||
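| # Load flaky globs (one pattern per line; blank lines and #-comment lines are skipped) and classify each failure. ${g} is left unquoted in the [[ == ]] test so it is matched as a glob pattern. | ||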
| mapfile -t GLOBS < <(grep -vE '^[[:space:]]*(#|$)' .gitlab/flaky-jobs.txt) | ||
| echo "Loaded ${#GLOBS[@]} flaky patterns; $(wc -l < failed_jobs.txt) distinct failed job(s)." | ||
| blocking=0 | ||
| while IFS= read -r job; do | ||
| [ -z "${job}" ] && continue | ||
| [ "${job}" = "merge-gate" ] && continue | ||
| ok=0 | ||
| for g in "${GLOBS[@]}"; do | ||
| if [[ "${job}" == $g ]]; then ok=1; break; fi | ||
| done | ||
| if [ "${ok}" -eq 0 ]; then | ||
| echo " ✗ non-flaky failure: ${job}" | ||
| blocking=1 | ||
| else | ||
| echo " ✓ known-flaky: ${job}" | ||
| fi | ||
| done < failed_jobs.txt | ||
|
|
||
| if [ "${blocking}" -ne 0 ]; then | ||
| echo "" | ||
| echo "Merge gate FAILED — a required (non-flaky) job failed. See ✗ lines above." | ||
| exit 1 | ||
| fi | ||
| echo "" | ||
| echo "Merge gate PASSED — no failures, or all failures are known-flaky." | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When a trigger bridge fails before creating its child pipeline (for example invalid generated YAML or a downstream creation/permission error), GitLab exposes a failed bridge with
`downstream_pipeline == null`; this filter drops that bridge, and bridge jobs are not collected by the later `/jobs?scope[]=failed` calls. In that scenario the `merge-gate` status can pass even though an entire child suite never ran, so failed bridges with no downstream pipeline should be counted as non-flaky failures. Useful? React with 👍 / 👎.