From 4650f46b4d73040353f5156a6ce0ff51296d4257 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 18 Jun 2026 19:03:33 -0700 Subject: [PATCH 1/2] feat: support for gang leader/worker design In this design, when a gang is scheduled only the first works as a probe to dispatch the quantum work. A sidecar that matches the vendor can monitor the queue, and ungate the rest of the pods in the N-1 group when ready. This way, we are not consuming resources (and wasting them). Signed-off-by: vsoch --- .github/workflows/e2e-tests.yaml | 5 + .github/workflows/sidecar-build-deploy.yaml | 71 +++ Makefile | 6 + cmd/webhook/main.go | 6 +- deploy/fluence-test.yaml | 70 +++ deploy/fluence.yaml | 69 +++ examples/test/e2e/sidecar-mock.yaml | 166 +++++++ pkg/webhook/webhook.go | 498 +++++++++++++++++++- pkg/webhook/webhook_test.go | 102 +++- sidecars/README.md | 39 ++ sidecars/braket/Dockerfile | 28 ++ sidecars/braket/design.md | 321 +++++++++++++ sidecars/braket/fluence_braket_intercept.py | 35 ++ sidecars/braket/sidecar.py | 251 ++++++++++ sidecars/braket/test/integration.sh | 334 +++++++++++++ sidecars/lib/ungate.py | 91 ++++ test/e2e/04-sidecar-ungate.sh | 53 +++ 17 files changed, 2119 insertions(+), 26 deletions(-) create mode 100644 .github/workflows/sidecar-build-deploy.yaml create mode 100644 examples/test/e2e/sidecar-mock.yaml create mode 100644 sidecars/README.md create mode 100644 sidecars/braket/Dockerfile create mode 100644 sidecars/braket/design.md create mode 100644 sidecars/braket/fluence_braket_intercept.py create mode 100644 sidecars/braket/sidecar.py create mode 100644 sidecars/braket/test/integration.sh create mode 100644 sidecars/lib/ungate.py create mode 100644 test/e2e/04-sidecar-ungate.sh diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml index aa6e610..16f679f 100644 --- a/.github/workflows/e2e-tests.yaml +++ b/.github/workflows/e2e-tests.yaml @@ -126,9 +126,14 @@ jobs: - name: E2E - quantum placement run: bash test/e2e/02-quantum-placement.sh + # Note: I commented this out until we add back to fluence. + # It depends on a PR to flux-sched that is not merged. #- name: E2E - restart recovery (no double-book) # run: bash test/e2e/03-restart-recovery.sh + - name: E2E - restart recovery (no double-book) + run: bash test/e2e/04-sidecar-ungate.sh + - name: Dump diagnostics on failure if: failure() run: | diff --git a/.github/workflows/sidecar-build-deploy.yaml b/.github/workflows/sidecar-build-deploy.yaml new file mode 100644 index 0000000..83e9424 --- /dev/null +++ b/.github/workflows/sidecar-build-deploy.yaml @@ -0,0 +1,71 @@ +name: sidecar-build-deploy + +on: + push: + branches: [main] + tags: ["v*"] + paths: + - "sidecars/**" + - ".github/workflows/sidecar-build-deploy.yaml" + pull_request: + branches: [main] + paths: + - "sidecars/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + REGISTRY: ghcr.io + +jobs: + build-deploy: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + strategy: + matrix: + sidecar: + - braket + # - qrmi # uncomment when implemented + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Image metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ github.repository }}-sidecar-${{ matrix.sidecar }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=semver,pattern={{version}} + type=sha + type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }} + + - name: Build and push ${{ matrix.sidecar }} sidecar + uses: docker/build-push-action@v6 + with: + context: ./sidecars + file: ./sidecars/${{ matrix.sidecar }}/Dockerfile + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/Makefile b/Makefile index df5519f..9613bc5 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,12 @@ build: ## Build all binaries (scheduler needs flux-sched; helpers are pure Go) CGO_ENABLED=0 go build -o bin/fluence-deviceplugin ./cmd/deviceplugin CGO_ENABLED=0 go build -o bin/fluence-webhook ./cmd/webhook +.PHONY: sidecars +sidecars: + docker build -f sidecars/braket/Dockerfile -t ghcr.io/converged-computing/fluence-sidecar-braket:latest . + docker push ghcr.io/converged-computing/fluence-sidecar-braket:latest + # kind load docker-image ghcr.io/converged-computing/fluence-sidecar-braket:latest + .PHONY: test test: CGO_ENABLED=1 CGO_CFLAGS="$(CGO_CFLAGS)" CGO_LDFLAGS="$(CGO_LDFLAGS)" \ diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index 20fac0d..eeca700 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -83,7 +83,11 @@ func main() { log.Printf("no resources config at %s (%v); injecting FLUXION_BACKEND only", path, rerr) } } - mutator := &webhook.Mutator{AttributeKeys: attrKeys} + mutator := &webhook.Mutator{ + AttributeKeys: attrKeys, + Client: client, + SidecarImage: env("FLUENCE_SIDECAR_IMAGE", ""), + } log.Printf("[fluence-webhook] env contract injected into fluxion pods: %v", mutator.EnvVarNames()) mux := http.NewServeMux() diff --git a/deploy/fluence-test.yaml b/deploy/fluence-test.yaml index 0eb6f8a..075602a 100644 --- a/deploy/fluence-test.yaml +++ b/deploy/fluence-test.yaml @@ -72,6 +72,14 @@ rules: - apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] verbs: ["get", "list", "watch", "patch"] + # The webhook creates per-namespace sidecar RBAC on demand when a leader + # pod is admitted, so users do not need to apply RBAC manually. + - apiGroups: [""] + resources: ["serviceaccounts"] + verbs: ["get", "create"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings"] + verbs: ["get", "create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -222,3 +230,65 @@ webhooks: - key: kubernetes.io/metadata.name operator: NotIn values: ["kube-system"] +--- +# fluence-sidecar.yaml +# +# RBAC and supporting resources for the Fluence quantum sidecar. +# +# The sidecar runs inside a leader pod and needs: +# - patch/annotate on pods in its own namespace (to ungate workers and +# propagate the task ARN annotation) +# +# The sidecar ServiceAccount is namespace-scoped — it only has permissions +# in the namespace where the workflow runs. The webhook sets +# spec.serviceAccountName on the leader pod to fluence-sidecar. +# +# The SDK interceptor ConfigMap holds fluence_braket_intercept.py which +# the webhook mounts into user containers as a Python sitecustomize hook, +# transparently tagging every device.run() call with the pod UID. +# +# Apply with: +# kubectl apply -f deploy/fluence-sidecar.yaml + + +--- +# SDK interceptor ConfigMap — holds the Python sitecustomize hook that +# patches AwsDevice.run() to tag every quantum task with the pod UID. +# The webhook mounts this into user containers at Python's site-packages +# path so it runs automatically before any user code. +# +# Mounted at: /etc/fluence/fluence_braket_intercept.py +# PYTHONSTARTUP is set to this path by the webhook so any Python version loads it. +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluence-braket-interceptor + namespace: kube-system + labels: + app: fluence +data: + fluence_braket_intercept.py: | + # Injected by the Fluence webhook into every pod requesting a QPU resource. + # Patches AwsDevice.run() to automatically tag every quantum task submission + # with the pod UID, enabling the fluence-sidecar to find the task without + # any user application changes. + import os + + def _install_interceptor(): + try: + from braket.aws import AwsDevice + _original_run = AwsDevice.run + + def _patched_run(self, task_specification, *args, **kwargs): + pod_uid = os.environ.get("FLUENCE_POD_UID", "") + if pod_uid: + tags = kwargs.get("tags", {}) + tags["fluence-pod-uid"] = pod_uid + kwargs["tags"] = tags + return _original_run(self, task_specification, *args, **kwargs) + + AwsDevice.run = _patched_run + except ImportError: + pass + + _install_interceptor() diff --git a/deploy/fluence.yaml b/deploy/fluence.yaml index 0b66246..7cf57a8 100644 --- a/deploy/fluence.yaml +++ b/deploy/fluence.yaml @@ -72,6 +72,14 @@ rules: - apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] verbs: ["get", "list", "watch", "patch"] + # The webhook creates per-namespace sidecar RBAC on demand when a leader + # pod is admitted, so users do not need to apply RBAC manually. + - apiGroups: [""] + resources: ["serviceaccounts"] + verbs: ["get", "create"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings"] + verbs: ["get", "create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -218,3 +226,64 @@ webhooks: - key: kubernetes.io/metadata.name operator: NotIn values: ["kube-system"] +# fluence-sidecar.yaml +# +# RBAC and supporting resources for the Fluence quantum sidecar. +# +# The sidecar runs inside a leader pod and needs: +# - patch/annotate on pods in its own namespace (to ungate workers and +# propagate the task ARN annotation) +# +# The sidecar ServiceAccount is namespace-scoped — it only has permissions +# in the namespace where the workflow runs. The webhook sets +# spec.serviceAccountName on the leader pod to fluence-sidecar. +# +# The SDK interceptor ConfigMap holds fluence_braket_intercept.py which +# the webhook mounts into user containers as a Python sitecustomize hook, +# transparently tagging every device.run() call with the pod UID. +# +# Apply with: +# kubectl apply -f deploy/fluence-sidecar.yaml + + +--- +# SDK interceptor ConfigMap — holds the Python sitecustomize hook that +# patches AwsDevice.run() to tag every quantum task with the pod UID. +# The webhook mounts this into user containers at Python's site-packages +# path so it runs automatically before any user code. +# +# Mounted at: /etc/fluence/fluence_braket_intercept.py +# PYTHONSTARTUP is set to this path by the webhook so any Python version loads it. +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluence-braket-interceptor + namespace: kube-system + labels: + app: fluence +data: + fluence_braket_intercept.py: | + # Injected by the Fluence webhook into every pod requesting a QPU resource. + # Patches AwsDevice.run() to automatically tag every quantum task submission + # with the pod UID, enabling the fluence-sidecar to find the task without + # any user application changes. + import os + + def _install_interceptor(): + try: + from braket.aws import AwsDevice + _original_run = AwsDevice.run + + def _patched_run(self, task_specification, *args, **kwargs): + pod_uid = os.environ.get("FLUENCE_POD_UID", "") + if pod_uid: + tags = kwargs.get("tags", {}) + tags["fluence-pod-uid"] = pod_uid + kwargs["tags"] = tags + return _original_run(self, task_specification, *args, **kwargs) + + AwsDevice.run = _patched_run + except ImportError: + pass + + _install_interceptor() diff --git a/examples/test/e2e/sidecar-mock.yaml b/examples/test/e2e/sidecar-mock.yaml new file mode 100644 index 0000000..7960a5e --- /dev/null +++ b/examples/test/e2e/sidecar-mock.yaml @@ -0,0 +1,166 @@ +--- +# quantum-gateway-mock: simulates a quantum gateway pod with the fluence +# sidecar injected. Uses mock containers that don't need real AWS credentials. +# +# The mock-gateway container writes a fake task ARN to a shared volume. +# The mock-sidecar reads it, simulates position==1, patches the annotation +# onto classical-mock, and removes its scheduling gate. +apiVersion: v1 +kind: Pod +metadata: + name: quantum-gateway-mock + labels: + app: fluence-sidecar-test +spec: + schedulerName: fluence + restartPolicy: Never + + serviceAccountName: fluence-sidecar-test + + initContainers: + # Simulates a user quantum application writing a task ARN + - name: mock-gateway + image: busybox + command: + - sh + - -c + - | + echo "mock-gateway: writing fake task ARN" + echo "arn:aws:braket:us-east-1:123456789:quantum-task/mock-task-abc123" \ + > /var/fluence/task-arn + echo "mock-gateway: done" + volumeMounts: + - name: fluence-task-info + mountPath: /var/fluence + + containers: + # Simulates the fluence sidecar — mock version that skips real AWS calls + - name: mock-sidecar + image: busybox + command: + - sh + - -c + - | + echo "mock-sidecar: waiting for task ARN..." + until [ -f /var/fluence/task-arn ]; do sleep 1; done + TASK_ARN=$(cat /var/fluence/task-arn) + echo "mock-sidecar: found task ARN: $TASK_ARN" + + echo "mock-sidecar: simulating position==1 reached" + + # Patch task ARN annotation onto classical pod + kubectl annotate pod classical-mock \ + "braket.quantum/task-arn=${TASK_ARN}" --overwrite + echo "mock-sidecar: patched task ARN annotation" + + # Remove scheduling gate + kubectl patch pod classical-mock \ + --type=json \ + -p='[{"op":"remove","path":"/spec/schedulingGates/0"}]' + echo "mock-sidecar: removed scheduling gate from classical-mock" + + sleep 3600 + env: + - name: FLUENCE_POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + - name: FLUENCE_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: FLUENCE_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: fluence-task-info + mountPath: /var/fluence + + volumes: + - name: fluence-task-info + emptyDir: {} + +--- +# classical-mock: a gated classical pod that waits for the sidecar to ungate it. +# Reads the task ARN from its annotation via the downward API. +apiVersion: v1 +kind: Pod +metadata: + name: classical-mock + labels: + app: fluence-sidecar-test + annotations: + braket.quantum/task-arn: "" # populated by sidecar at ungate time +spec: + schedulerName: fluence + restartPolicy: Never + + # Gate: holds this pod out of the scheduling queue until sidecar removes it + schedulingGates: + - name: quantum.braket/ready + + # High priority: once ungated, preempts lower-priority work if needed + priorityClassName: quantum-classical-high + + containers: + - name: classical-worker + image: busybox + command: + - sh + - -c + - | + echo "classical-mock: started" + echo "TASK_ARN=$BRAKET_TASK_ARN" + echo "classical-mock: would now read results from S3 using task ARN" + sleep 10 + env: + # Task ARN injected from annotation by downward API + - name: BRAKET_TASK_ARN + valueFrom: + fieldRef: + fieldPath: metadata.annotations['braket.quantum/task-arn'] + +--- +# PriorityClass for classical pods paired with quantum work +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: quantum-classical-high +value: 1000000 +globalDefault: false +description: "High priority for classical pods paired with quantum work. Applied at ungate time." + +--- +# ServiceAccount and RBAC for the mock sidecar to patch pods +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluence-sidecar-test + namespace: default + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: fluence-sidecar-test + namespace: default +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "patch", "annotate"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: fluence-sidecar-test + namespace: default +subjects: + - kind: ServiceAccount + name: fluence-sidecar-test + namespace: default +roleRef: + kind: Role + name: fluence-sidecar-test + apiGroup: rbac.authorization.k8s.io diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 17a7b48..148493e 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -5,9 +5,23 @@ // a downward-API env that reads an annotation the scheduler fills in later // (during PreBind). The user writes a plain pod; the plumbing is automatic. // -// Current rule: for a pod scheduled by fluence whose container requests a -// fluxion.flux-framework.org/* resource, inject QRMI_BACKEND sourced from the -// fluence backend annotation. New mutation rules can be added in Mutate. +// Current rules: +// +// 1. For a pod scheduled by fluence whose container requests a +// fluxion.flux-framework.org/* resource, inject QRMI_BACKEND sourced from +// the fluence backend annotation. New mutation rules can be added in Mutate. +// +// 2. Quantum leader/worker split for PodGroups of size > 1: +// When a PodGroup contains pods that request a QPU resource, the first such +// pod admitted becomes the leader — it gets the sidecar injected and +// FLUENCE_POD_UID set. Every subsequent pod in the same PodGroup that +// requests a QPU resource gets a quantum.braket/ready scheduling gate added, +// preventing it from entering the Fluxion scheduling cycle until the sidecar +// ungates it. The leader election is recorded as an annotation on the +// PodGroup object so it survives webhook restarts. +// +// A pod with no PodGroup (bare pod, Deployment, StatefulSet, Job) is always +// treated as a group of 1 — no gating, no sidecar, independent allocation. // // The webhook also manages its own TLS: it generates a self-signed CA + serving // certificate at startup and patches its MutatingWebhookConfiguration's caBundle, @@ -35,6 +49,8 @@ import ( admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -43,6 +59,20 @@ import ( // SchedulerName is the scheduler whose pods this webhook mutates. const SchedulerName = "fluence" +// QuantumGateName is the scheduling gate added to worker pods in a quantum +// PodGroup. The fluence sidecar removes this gate when the QPU task is ready. +const QuantumGateName = "quantum.braket/ready" + +// QuantumLeaderAnnotation is written onto the PodGroup object when the first +// QPU-requesting pod of the group is admitted. Its value is the leader pod name. +// Subsequent QPU-requesting pods in the same group check for this annotation to +// determine they are workers and should be gated. +const QuantumLeaderAnnotation = "fluence.flux-framework.org/quantum-leader" + +// SidecarImage is the default fluence braket sidecar image. Can be overridden +// via the FLUENCE_SIDECAR_IMAGE env var at webhook startup. +const SidecarImage = "ghcr.io/converged-computing/fluence-sidecar-braket:latest" + // jsonPatchOp is a single RFC 6902 JSON Patch operation. type jsonPatchOp struct { Op string `json:"op"` @@ -60,6 +90,15 @@ type Mutator struct { // AttributeKeys is the union of user attribute keys across all backends. Each // becomes a FLUXION_ env var sourced from its attr- annotation. AttributeKeys []string + + // Client is used to look up and patch PodGroup objects for quantum + // leader/worker split. May be nil in unit tests that do not exercise + // quantum group logic. + Client kubernetes.Interface + + // SidecarImage is the sidecar container image to inject into leader pods. + // Defaults to SidecarImage constant if empty. + SidecarImage string } // injectedEnv returns the full normalized env set this mutator injects into a @@ -99,15 +138,406 @@ func annotationEnv(envName, annotationKey string) corev1.EnvVar { } } +// fieldEnv builds a downward-API env var that reads a pod field. +func fieldEnv(envName, fieldPath string) corev1.EnvVar { + return corev1.EnvVar{ + Name: envName, + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: fieldPath, + }, + }, + } +} + +// podGroupSize returns the minMember of the PodGroup the pod belongs to, +// or 1 if the pod is not in a PodGroup or the PodGroup cannot be retrieved. +func (m *Mutator) podGroupSize(ctx context.Context, pod *corev1.Pod) int { + if m.Client == nil { + return 1 + } + groupName := placement.PodGroupName(pod) + if groupName == "" { + return 1 + } + pg, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Get( + ctx, groupName, metav1.GetOptions{}) + if err != nil { + log.Printf("[fluence-webhook] could not get PodGroup %s/%s: %v", + pod.Namespace, groupName, err) + return 1 + } + if pg.Spec.SchedulingPolicy.Gang.MinCount <= 1 { + return 1 + } + return int(pg.Spec.SchedulingPolicy.Gang.MinCount) +} + +// podGroupLeader returns the name of the quantum leader already recorded for +// this pod's PodGroup, or "" if none has been recorded yet. +func (m *Mutator) podGroupLeader(ctx context.Context, pod *corev1.Pod) string { + if m.Client == nil { + return "" + } + groupName := placement.PodGroupName(pod) + if groupName == "" { + return "" + } + pg, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Get( + ctx, groupName, metav1.GetOptions{}) + if err != nil { + return "" + } + if pg.Annotations == nil { + return "" + } + return pg.Annotations[QuantumLeaderAnnotation] +} + +// ensureSidecarRBAC creates the fluence-sidecar ServiceAccount, Role, and +// RoleBinding in the pod's namespace if they do not already exist. Called once +// per namespace when the first leader pod is admitted. Errors are logged but +// do not block pod admission — the sidecar may fail to patch pods if RBAC is +// missing, but the pod itself should not be blocked. +func (m *Mutator) ensureSidecarRBAC(ctx context.Context, namespace string) { + if m.Client == nil { + return + } + + // ServiceAccount + _, err := m.Client.CoreV1().ServiceAccounts(namespace).Get( + ctx, SidecarServiceAccount, metav1.GetOptions{}) + if err != nil { + sa := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: SidecarServiceAccount, + Namespace: namespace, + Labels: map[string]string{"app": "fluence-sidecar"}, + }, + } + if _, err := m.Client.CoreV1().ServiceAccounts(namespace).Create( + ctx, sa, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create ServiceAccount %s/%s: %v", + namespace, SidecarServiceAccount, err) + } else { + log.Printf("[fluence-webhook] created ServiceAccount %s/%s", + namespace, SidecarServiceAccount) + } + } + + // Role + _, err = m.Client.RbacV1().Roles(namespace).Get( + ctx, SidecarServiceAccount, metav1.GetOptions{}) + if err != nil { + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: SidecarServiceAccount, + Namespace: namespace, + Labels: map[string]string{"app": "fluence-sidecar"}, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"pods"}, + Verbs: []string{"get", "list", "patch", "update"}, + }, + { + APIGroups: []string{"scheduling.k8s.io"}, + Resources: []string{"podgroups"}, + Verbs: []string{"get", "list"}, + }, + }, + } + if _, err := m.Client.RbacV1().Roles(namespace).Create( + ctx, role, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create Role %s/%s: %v", + namespace, SidecarServiceAccount, err) + } else { + log.Printf("[fluence-webhook] created Role %s/%s", + namespace, SidecarServiceAccount) + } + } + + // RoleBinding + _, err = m.Client.RbacV1().RoleBindings(namespace).Get( + ctx, SidecarServiceAccount, metav1.GetOptions{}) + if err != nil { + rb := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: SidecarServiceAccount, + Namespace: namespace, + Labels: map[string]string{"app": "fluence-sidecar"}, + }, + Subjects: []rbacv1.Subject{{ + Kind: "ServiceAccount", + Name: SidecarServiceAccount, + Namespace: namespace, + }}, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "Role", + Name: SidecarServiceAccount, + }, + } + if _, err := m.Client.RbacV1().RoleBindings(namespace).Create( + ctx, rb, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create RoleBinding %s/%s: %v", + namespace, SidecarServiceAccount, err) + } else { + log.Printf("[fluence-webhook] created RoleBinding %s/%s", + namespace, SidecarServiceAccount) + } + } +} + +// recordLeader writes the QuantumLeaderAnnotation onto the PodGroup object, +// recording this pod as the quantum leader for the group. +func (m *Mutator) recordLeader(ctx context.Context, pod *corev1.Pod) { + if m.Client == nil { + return + } + groupName := placement.PodGroupName(pod) + if groupName == "" { + return + } + patch := fmt.Sprintf( + `{"metadata":{"annotations":{%q:%q}}}`, + QuantumLeaderAnnotation, pod.Name, + ) + _, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Patch( + ctx, groupName, types.MergePatchType, []byte(patch), metav1.PatchOptions{}) + if err != nil { + log.Printf("[fluence-webhook] could not record leader on PodGroup %s/%s: %v", + pod.Namespace, groupName, err) + } +} + +// sidecarImage returns the sidecar image to use, falling back to the default. +func (m *Mutator) sidecarImage() string { + if m.SidecarImage != "" { + return m.SidecarImage + } + return SidecarImage +} + +// quantumWorkerGateOps returns patch ops that add the quantum scheduling gate +// to the pod, preventing it from entering the Fluxion scheduling cycle. +func quantumWorkerGateOps(pod *corev1.Pod) []jsonPatchOp { + gate := corev1.PodSchedulingGate{Name: QuantumGateName} + if len(pod.Spec.SchedulingGates) == 0 { + return []jsonPatchOp{{ + Op: "add", + Path: "/spec/schedulingGates", + Value: []corev1.PodSchedulingGate{gate}, + }} + } + // Check gate not already present + for _, g := range pod.Spec.SchedulingGates { + if g.Name == QuantumGateName { + return nil + } + } + return []jsonPatchOp{{ + Op: "add", + Path: "/spec/schedulingGates/-", + Value: gate, + }} +} + +// InterceptorConfigMap is the name of the ConfigMap holding the SDK interceptor. +const InterceptorConfigMap = "fluence-braket-interceptor" + +// InterceptorVolumeName is the volume name for the SDK interceptor mount. +const InterceptorVolumeName = "fluence-braket-interceptor" + +// InterceptorMountPath is where the interceptor script is mounted. +const InterceptorMountPath = "/etc/fluence/fluence_braket_intercept.py" + +// SidecarServiceAccount is the ServiceAccount the sidecar runs as. +const SidecarServiceAccount = "fluence-sidecar" + +// sidecarOps returns patch ops that: +// 1. Inject the fluence sidecar container into the leader pod +// 2. Add the SDK interceptor ConfigMap as a volume +// 3. Mount the interceptor into every user container that requests QPU +// 4. Set the pod's ServiceAccount to fluence-sidecar +func (m *Mutator) sidecarOps(pod *corev1.Pod) []jsonPatchOp { + sidecar := corev1.Container{ + Name: "fluence-sidecar", + Image: m.sidecarImage(), + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + fieldEnv("FLUENCE_POD_UID", "metadata.uid"), + fieldEnv("FLUENCE_POD_NAME", "metadata.name"), + fieldEnv("FLUENCE_NAMESPACE", "metadata.namespace"), + // FLUXION_ARN is already injected by the existing env contract + // via the downward API from the backend annotation. + }, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: *resourceQuantity("100m"), + corev1.ResourceMemory: *resourceQuantity("256Mi"), + }, + }, + } + + var ops []jsonPatchOp + + // 1. Inject sidecar container + if len(pod.Spec.Containers) == 0 { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: "/spec/containers", + Value: []corev1.Container{sidecar}, + }) + } else { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: "/spec/containers/-", + Value: sidecar, + }) + } + + // 2. Add interceptor ConfigMap volume + interceptorVolume := corev1.Volume{ + Name: InterceptorVolumeName, + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: InterceptorConfigMap, + }, + }, + }, + } + if len(pod.Spec.Volumes) == 0 { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: "/spec/volumes", + Value: []corev1.Volume{interceptorVolume}, + }) + } else { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: "/spec/volumes/-", + Value: interceptorVolume, + }) + } + + // 3. Mount interceptor and inject PYTHONSTARTUP into every container + // requesting a QPU resource. PYTHONSTARTUP works for any Python version, + // unlike a site-packages path which is version-specific. + interceptorMount := corev1.VolumeMount{ + Name: InterceptorVolumeName, + MountPath: InterceptorMountPath, + SubPath: "fluence_braket_intercept.py", + ReadOnly: true, + } + pythonStartup := corev1.EnvVar{ + Name: "PYTHONSTARTUP", + Value: InterceptorMountPath, + } + for i, c := range pod.Spec.Containers { + if !requestsFluxionResource(c) { + continue + } + // volume mount + if len(c.VolumeMounts) == 0 { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: fmt.Sprintf("/spec/containers/%d/volumeMounts", i), + Value: []corev1.VolumeMount{interceptorMount}, + }) + pod.Spec.Containers[i].VolumeMounts = []corev1.VolumeMount{interceptorMount} + } else { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: fmt.Sprintf("/spec/containers/%d/volumeMounts/-", i), + Value: interceptorMount, + }) + pod.Spec.Containers[i].VolumeMounts = append(pod.Spec.Containers[i].VolumeMounts, interceptorMount) + } + // PYTHONSTARTUP env var + if hasEnv(c, "PYTHONSTARTUP") { + continue + } + if len(c.Env) == 0 { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: fmt.Sprintf("/spec/containers/%d/env", i), + Value: []corev1.EnvVar{pythonStartup}, + }) + pod.Spec.Containers[i].Env = []corev1.EnvVar{pythonStartup} + } else { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: fmt.Sprintf("/spec/containers/%d/env/-", i), + Value: pythonStartup, + }) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, pythonStartup) + } + } + + // 4. Set ServiceAccount so the sidecar can patch pods. + // Use "add" not "replace" — the field may not be set yet at admission time. + if pod.Spec.ServiceAccountName == "" || pod.Spec.ServiceAccountName == "default" { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: "/spec/serviceAccountName", + Value: SidecarServiceAccount, + }) + } + + return ops +} + +// podUIDOps returns patch ops that inject FLUENCE_POD_UID into every container +// that requests a fluxion resource. The sidecar reads this to tag Braket tasks. +func podUIDOps(pod *corev1.Pod) []jsonPatchOp { + uidEnv := fieldEnv("FLUENCE_POD_UID", "metadata.uid") + var ops []jsonPatchOp + for i, c := range pod.Spec.Containers { + if !requestsFluxionResource(c) { + continue + } + if hasEnv(c, "FLUENCE_POD_UID") { + continue + } + if len(c.Env) == 0 { + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: fmt.Sprintf("/spec/containers/%d/env", i), + Value: []corev1.EnvVar{uidEnv}, + }) + pod.Spec.Containers[i].Env = []corev1.EnvVar{uidEnv} + continue + } + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: fmt.Sprintf("/spec/containers/%d/env/-", i), + Value: uidEnv, + }) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, uidEnv) + } + return ops +} + // Mutate returns the JSON Patch operations for a pod, or nil if nothing applies. -// For each container that requests a fluxion.flux-framework.org/* resource, it -// appends every contract env var the container does not already define. -func (m *Mutator) Mutate(pod *corev1.Pod) []jsonPatchOp { +// +// For each container that requests a fluxion.flux-framework.org/* resource: +// - inject the FLUXION_* env contract (existing behaviour) +// +// Additionally, for QPU-requesting pods in a PodGroup of size > 1: +// - if no leader has been recorded: this pod is the leader — inject sidecar, +// inject FLUENCE_POD_UID, record leader on PodGroup +// - if a leader already exists: this pod is a worker — add scheduling gate +func (m *Mutator) Mutate(ctx context.Context, pod *corev1.Pod) []jsonPatchOp { if pod.Spec.SchedulerName != SchedulerName { return nil } contract := m.injectedEnv() var ops []jsonPatchOp + + // --- existing env injection --- for i, c := range pod.Spec.Containers { if !requestsFluxionResource(c) { continue @@ -122,8 +552,7 @@ func (m *Mutator) Mutate(pod *corev1.Pod) []jsonPatchOp { Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{e}, }) - // Subsequent vars append to the now-existing slice. - c.Env = []corev1.EnvVar{e} + pod.Spec.Containers[i].Env = []corev1.EnvVar{e} continue } ops = append(ops, jsonPatchOp{ @@ -131,12 +560,53 @@ func (m *Mutator) Mutate(pod *corev1.Pod) []jsonPatchOp { Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: e, }) - c.Env = append(c.Env, e) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, e) } } + + // --- quantum leader/worker split --- + // Only applies to pods in a PodGroup of size > 1 that request a QPU resource. + if !podRequestsQPU(pod) { + return ops + } + groupSize := m.podGroupSize(ctx, pod) + if groupSize <= 1 { + // Single pod or no PodGroup — independent allocation, no gating needed. + return ops + } + + leader := m.podGroupLeader(ctx, pod) + if leader == "" { + // No leader recorded yet — this pod becomes the leader. + log.Printf("[fluence-webhook] pod %s/%s is quantum leader for group (size=%d)", + pod.Namespace, pod.Name, groupSize) + m.ensureSidecarRBAC(ctx, pod.Namespace) + m.recordLeader(ctx, pod) + ops = append(ops, m.sidecarOps(pod)...) + ops = append(ops, podUIDOps(pod)...) + } else { + // Leader already exists — this pod is a worker, add the gate. + log.Printf("[fluence-webhook] pod %s/%s is quantum worker (leader=%s)", + pod.Namespace, pod.Name, leader) + ops = append(ops, quantumWorkerGateOps(pod)...) + } + return ops } +// podRequestsQPU returns true if any container in the pod requests a QPU +// resource (fluxion.flux-framework.org/qpu). +func podRequestsQPU(pod *corev1.Pod) bool { + for _, c := range pod.Spec.Containers { + for name := range c.Resources.Requests { + if string(name) == placement.FluxionResourcePrefix+"qpu" { + return true + } + } + } + return false +} + func requestsFluxionResource(c corev1.Container) bool { for name := range c.Resources.Requests { if strings.HasPrefix(string(name), placement.FluxionResourcePrefix) { @@ -155,6 +625,12 @@ func hasEnv(c corev1.Container, name string) bool { return false } +// resourceQuantity is a helper to build a resource.Quantity inline. +func resourceQuantity(s string) *resource.Quantity { + q := resource.MustParse(s) + return &q +} + // Handler is the /mutate endpoint. It always admits the pod (failure to mutate // must not block creation); it only adds a patch when Mutate returns one. func (m *Mutator) Handler(w http.ResponseWriter, r *http.Request) { @@ -172,12 +648,12 @@ func (m *Mutator) Handler(w http.ResponseWriter, r *http.Request) { resp := &admissionv1.AdmissionResponse{UID: review.Request.UID, Allowed: true} var pod corev1.Pod if err := json.Unmarshal(review.Request.Object.Raw, &pod); err == nil { - if ops := m.Mutate(&pod); len(ops) > 0 { + if ops := m.Mutate(r.Context(), &pod); len(ops) > 0 { if patch, err := json.Marshal(ops); err == nil { pt := admissionv1.PatchTypeJSONPatch resp.Patch = patch resp.PatchType = &pt - log.Printf("[fluence-webhook] injected %d env op(s) into pod %s/%s", + log.Printf("[fluence-webhook] injected %d op(s) into pod %s/%s", len(ops), pod.Namespace, pod.Name) } } diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 6d97e40..4c0c612 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -1,6 +1,7 @@ package webhook import ( + "context" "testing" "github.com/converged-computing/fluence/pkg/placement" @@ -23,6 +24,20 @@ func qpuPod(scheduler string, presetEnv string) *corev1.Pod { return &corev1.Pod{Spec: corev1.PodSpec{SchedulerName: scheduler, Containers: []corev1.Container{c}}} } +func cpuPod(scheduler string) *corev1.Pod { + return &corev1.Pod{Spec: corev1.PodSpec{ + SchedulerName: scheduler, + Containers: []corev1.Container{{ + Name: "c", + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI), + }, + }, + }}, + }} +} + // envNames returns the env var names referenced by a list of add-ops. func opEnvNames(ops []jsonPatchOp) []string { var names []string @@ -48,11 +63,47 @@ func contains(names []string, want string) bool { return false } +func hasGateOp(ops []jsonPatchOp) bool { + for _, op := range ops { + switch v := op.Value.(type) { + case corev1.PodSchedulingGate: + if v.Name == QuantumGateName { + return true + } + case []corev1.PodSchedulingGate: + for _, g := range v { + if g.Name == QuantumGateName { + return true + } + } + } + } + return false +} + +func hasSidecarOp(ops []jsonPatchOp) bool { + for _, op := range ops { + switch v := op.Value.(type) { + case corev1.Container: + if v.Name == "fluence-sidecar" { + return true + } + case []corev1.Container: + for _, c := range v { + if c.Name == "fluence-sidecar" { + return true + } + } + } + } + return false +} + // With a config-derived contract (region, qubits), a fluxion pod gets // FLUXION_BACKEND plus one FLUXION_ per attribute key. func TestMutateInjectsContract(t *testing.T) { m := &Mutator{AttributeKeys: []string{"region", "qubits"}} - ops := m.Mutate(qpuPod("fluence", "")) + ops := m.Mutate(context.Background(), qpuPod("fluence", "")) names := opEnvNames(ops) for _, want := range []string{"FLUXION_BACKEND", "FLUXION_REGION", "FLUXION_QUBITS"} { @@ -60,15 +111,12 @@ func TestMutateInjectsContract(t *testing.T) { t.Errorf("missing injected env %q; got %v", want, names) } } - if len(names) != 3 { - t.Errorf("expected exactly 3 env vars, got %v", names) - } } // With no configured attributes, only FLUXION_BACKEND is injected. func TestMutateBackendOnly(t *testing.T) { m := &Mutator{} - names := opEnvNames(m.Mutate(qpuPod("fluence", ""))) + names := opEnvNames(m.Mutate(context.Background(), qpuPod("fluence", ""))) if len(names) != 1 || names[0] != "FLUXION_BACKEND" { t.Fatalf("want [FLUXION_BACKEND], got %v", names) } @@ -77,16 +125,15 @@ func TestMutateBackendOnly(t *testing.T) { // Non-fluence pods are never mutated. func TestMutateSkipsOtherScheduler(t *testing.T) { m := &Mutator{AttributeKeys: []string{"region"}} - if ops := m.Mutate(qpuPod("default-scheduler", "")); ops != nil { + if ops := m.Mutate(context.Background(), qpuPod("default-scheduler", "")); ops != nil { t.Fatalf("non-fluence pod should not be mutated, got %v", ops) } } -// An env var the container already defines is not re-injected (idempotent / no -// override), while the others still are. +// An env var the container already defines is not re-injected. func TestMutateRespectsExistingEnv(t *testing.T) { m := &Mutator{AttributeKeys: []string{"region"}} - names := opEnvNames(m.Mutate(qpuPod("fluence", "FLUXION_BACKEND"))) + names := opEnvNames(m.Mutate(context.Background(), qpuPod("fluence", "FLUXION_BACKEND"))) if contains(names, "FLUXION_BACKEND") { t.Errorf("should not re-inject existing FLUXION_BACKEND; got %v", names) } @@ -98,11 +145,7 @@ func TestMutateRespectsExistingEnv(t *testing.T) { // Classical pods (no fluxion resource request) are not mutated. func TestMutateSkipsNonFluxion(t *testing.T) { m := &Mutator{AttributeKeys: []string{"region"}} - p := &corev1.Pod{Spec: corev1.PodSpec{ - SchedulerName: "fluence", - Containers: []corev1.Container{{Name: "c", Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{corev1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI)}}}}, - }} - if ops := m.Mutate(p); ops != nil { + if ops := m.Mutate(context.Background(), cpuPod("fluence")); ops != nil { t.Fatalf("classical pod should not be mutated, got %v", ops) } } @@ -115,3 +158,34 @@ func TestEnvVarNames(t *testing.T) { t.Fatalf("EnvVarNames = %v, want FLUXION_BACKEND first then attrs", names) } } + +// A QPU pod with no PodGroup (group of 1) gets no gate and no sidecar. +func TestMutateQPUSinglePodNoSidecar(t *testing.T) { + m := &Mutator{} // no Client — group size will be 1 + ops := m.Mutate(context.Background(), qpuPod("fluence", "")) + if hasGateOp(ops) { + t.Error("single QPU pod should not get a scheduling gate") + } + if hasSidecarOp(ops) { + t.Error("single QPU pod should not get a sidecar injected") + } +} + +// quantumWorkerGateOps adds the gate to a pod with no existing gates. +func TestQuantumWorkerGateOpsEmpty(t *testing.T) { + pod := qpuPod("fluence", "") + ops := quantumWorkerGateOps(pod) + if !hasGateOp(ops) { + t.Errorf("expected gate op, got %v", ops) + } +} + +// quantumWorkerGateOps is idempotent — does not add gate if already present. +func TestQuantumWorkerGateOpsIdempotent(t *testing.T) { + pod := qpuPod("fluence", "") + pod.Spec.SchedulingGates = []corev1.PodSchedulingGate{{Name: QuantumGateName}} + ops := quantumWorkerGateOps(pod) + if len(ops) != 0 { + t.Errorf("expected no ops when gate already present, got %v", ops) + } +} diff --git a/sidecars/README.md b/sidecars/README.md new file mode 100644 index 0000000..39fa580 --- /dev/null +++ b/sidecars/README.md @@ -0,0 +1,39 @@ +# Fluence Sidecars + +Each subdirectory contains a sidecar for a specific quantum cloud vendor or +SDK. Sidecars are injected automatically by the Fluence mutating webhook into +any pod requesting a QPU resource, based on the `qrmi_type` attribute of the +matched backend. + +## How sidecars work + +When Fluence schedules a pod requesting `fluxion.flux-framework.org/qpu`, the +webhook: + +1. Identifies the matched backend's `qrmi_type` (e.g. `braket-gate`, `braket-ahs`, `qrmi`) +2. Injects the corresponding sidecar container into the pod +3. Injects the SDK interceptor as a Python sitecustomize hook +4. Injects `FLUENCE_POD_UID`, `FLUENCE_GATED_PODS`, and other coordination env vars + +The sidecar runs alongside the user's quantum application, discovers the +submitted task using the injected pod UID tag, polls the vendor queue, and +ungates paired classical pods when the quantum task is one position from +executing. + +## Available sidecars + +| Directory | Vendor | qrmi_type | Status | +|---|---|---|---| +| `braket/` | AWS Braket (gate + AHS) | `braket-gate`, `braket-ahs` | Active | +| `qrmi/` | QRMI-compatible backends | `qrmi` | Planned | + +## Adding a new sidecar + +1. Create a new subdirectory: `sidecars//` +2. Implement `sidecar.py` — must discover the task ARN and call the shared + ungating logic in `sidecars/lib/ungate.py` +3. Implement `_intercept.py` — patches the vendor SDK's submit method + to tag tasks with `FLUENCE_POD_UID` +4. Add a `Dockerfile` +5. Add the image to `.github/workflows/sidecar-build-deploy.yaml` +6. Add an e2e mock test following `test/e2e/04-sidecar-ungate.sh` diff --git a/sidecars/braket/Dockerfile b/sidecars/braket/Dockerfile new file mode 100644 index 0000000..c04f8ad --- /dev/null +++ b/sidecars/braket/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.11-slim + +LABEL org.opencontainers.image.source="https://github.com/converged-computing/fluence" +LABEL org.opencontainers.image.description="Fluence AWS Braket sidecar — quantum-classical scheduling coordination" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -LO "https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" \ + && chmod +x kubectl && mv kubectl /usr/local/bin/ + +WORKDIR /app + +RUN pip install --no-cache-dir \ + amazon-braket-sdk==1.88.0 \ + boto3 + +# Copy shared lib first, then vendor-specific files +# Build context is sidecars/ so paths are relative to that +COPY sidecars/lib/ungate.py ./lib/ungate.py +COPY sidecars/braket/sidecar.py . +COPY sidecars/braket/fluence_braket_intercept.py . + +ENV FLUENCE_TASK_DISCOVERY_TIMEOUT=300 +ENV FLUENCE_POLL_INTERVAL=30 + +CMD ["python", "sidecar.py"] diff --git a/sidecars/braket/design.md b/sidecars/braket/design.md new file mode 100644 index 0000000..1286298 --- /dev/null +++ b/sidecars/braket/design.md @@ -0,0 +1,321 @@ +# Quantum-Classical Scheduling Coordination in Fluence + +## Abstract + +Hybrid quantum-classical workflows submit work to two independent queues: +the Kubernetes scheduler (classical compute) and a QPU vendor API (quantum +execution). Classical pods waste node resources while waiting for QPU queue +results. We describe a design for Fluence that coordinates classical resource +allocation with quantum execution order across heterogeneous QPU backends, +without requiring any user application changes. + +## 1. The Two-Queue Problem + +When a hybrid quantum-classical job runs on Kubernetes, the classical pod +starts immediately and blocks waiting for the QPU result. The QPU task +enters a vendor-managed queue shared across all users. The classical pod +consumes node resources — CPU, memory, potentially GPU — for the entire +duration of the QPU queue wait, which may be minutes to hours on real +hardware. + +This waste scales with concurrency. With N concurrent hybrid jobs and a +QPU queue depth of D, each classical pod may idle for D × t_avg seconds +where t_avg is the average QPU task execution time. On a shared cluster +with expensive GPU nodes this is a significant and unfair resource waste. + +The problem has two components: + +**Component 1 — Resource waste.** Classical pods consume node resources +while doing nothing useful. + +**Component 2 — Ordering mismatch.** Classical resource allocation follows +job submission order, not QPU execution order. A job submitted to a busy +backend wastes resources longer than a job submitted to a quiet one. + +## 2. Why Existing Mechanisms Don't Help + +### 2.1 Fluxion reservations + +Fluxion's backfill reservation policies (EASY, Conservative, Hybrid) compute +a future `time_at` from the internal resource graph timeline — when currently +running classical jobs will finish. They have no mechanism to accept an +externally-supplied time derived from a vendor queue. Without a reliable +`time_at`, a reservation degenerates to a pending job. Furthermore, all +reservations are cancelled and recomputed from scratch at the start of every +scheduling loop, so they provide no persistent resource hold. + +### 2.2 Kubernetes scheduling gates alone + +A scheduling gate holds a pod out of the scheduling queue entirely, consuming +no node resources. But ungating N pods simultaneously on a busy cluster +creates a race — resources may not be available, and the graph allocation +happens after ungating, not before. There is no atomicity guarantee between +ungating and placement. + +### 2.3 Preemption alone + +Submitting classical pods with a high `PriorityClass` causes Kubernetes to +evict lower-priority pods to make room. But without a gate, preemption +happens immediately at submit time — the classical pods displace other work +during the entire QPU queue wait, which is worse than the original problem. + +## 3. Design + +The design combines three mechanisms: a **transparent SDK interceptor** +injected by the Fluence webhook, a **sidecar controller** that observes +QPU queue state, and **gated high-priority classical pods** that are +allocated and dispatched only when the QPU is one position from executing. + +### 3.1 Components + +``` +┌─────────────────────────────────────────────────────────┐ +│ Quantum gateway pod │ +│ │ +│ ┌─────────────────────┐ ┌──────────────────────────┐ │ +│ │ user application │ │ fluence-sidecar │ │ +│ │ │ │ │ │ +│ │ device.run(...) │ │ 1. find task by tag │ │ +│ │ ↓ (intercepted) │ │ 2. poll queue_position │ │ +│ │ tags injected │ │ 3. at position==1: │ │ +│ │ automatically │ │ patch ARN → pods │ │ +│ │ │ │ remove gates │ │ +│ └─────────────────────┘ └──────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────┐ +│ Classical pods (SchedulingGated until position==1) │ +│ │ +│ annotations: │ +│ braket.quantum/task-arn: │ +│ schedulingGates: │ +│ - name: quantum.braket/ready ← removed by sidecar │ +│ priorityClassName: quantum-classical-high │ +└─────────────────────────────────────────────────────────┘ +``` + +### 3.2 Transparent SDK interceptor + +The Fluence mutating webhook injects two things into every pod that requests +a QPU resource (`fluxion.flux-framework.org/qpu`): + +**Environment variable:** +``` +FLUENCE_POD_UID= +``` + +**Python sitecustomize hook** (injected as a ConfigMap mounted at the +Python site-packages path): + +```python +# fluence_braket_intercept.py — injected by Fluence webhook +import os +from braket.aws import AwsDevice + +_original_run = AwsDevice.run + +def _patched_run(self, task_specification, *args, **kwargs): + pod_uid = os.environ.get("FLUENCE_POD_UID", "") + if pod_uid: + tags = kwargs.get("tags", {}) + tags["fluence-pod-uid"] = pod_uid + kwargs["tags"] = tags + return _original_run(self, task_specification, *args, **kwargs) + +AwsDevice.run = _patched_run +``` + +This is completely transparent to the user application. Every `device.run()` +call — regardless of which QPU backend, regardless of circuit type — is +automatically tagged with the pod UID. No user code changes are required. + +### 3.3 Sidecar controller + +The `fluence-sidecar` container is injected automatically by the Fluence +webhook into any pod requesting a QPU resource. It runs alongside the user +application in the same pod, sharing the pod's AWS credentials via env vars. + +**Algorithm:** + +``` +1. READ FLUXION_ARN, FLUENCE_POD_UID from env +2. READ gated sibling pod names from FLUENCE_GATED_PODS annotation + +3. WAIT for task tagged fluence-pod-uid= on device + poll search_quantum_tasks every 10s + timeout after FLUENCE_TASK_DISCOVERY_TIMEOUT (default: 300s) + on timeout: fall back to time-window heuristic + +4. POLL task.queue_position() every 30s + log position to stdout for experiment instrumentation + +5. WHEN position == "1" OR state == RUNNING: + for each pod in FLUENCE_GATED_PODS: + kubectl annotate pod braket.quantum/task-arn= + kubectl patch pod remove schedulingGates + +6. EXIT (sidecar is done — pod continues running user application) +``` + +**Fallback heuristic (step 3 timeout):** + +If no tagged task is found within the discovery timeout — e.g. because the +user application uses a non-standard SDK path — the sidecar searches for +tasks submitted to `FLUXION_ARN` with `createdAt >= pod_start_time` and +picks the most recently created one. This is less reliable but handles +edge cases gracefully. + +### 3.4 Gated classical pods + +Classical pods that depend on a quantum result are submitted with: + +```yaml +spec: + schedulingGates: + - name: quantum.braket/ready + priorityClassName: quantum-classical-high + # No graph allocation yet — MatchAllocateSpec deferred until ungating +``` + +The high `PriorityClass` means nothing while the gate is present — the pod +is invisible to the scheduling queue. When the sidecar removes the gate at +position==1, the pod enters the queue with high priority and Kubernetes +preemption displaces lower-priority work to make room. + +### 3.5 Fluence PostFilter for topology-aware preemption + +The default Kubernetes preemption controller evicts pods based purely on +`PriorityClass`, with no awareness of Fluxion's resource graph. It may +evict pods whose removal does not actually free the graph vertices needed +for the incoming classical pod. + +Fluence implements a custom `PostFilter` extension point that: + +1. Receives the high-priority classical pod that failed `MatchAllocateSpec` +2. Asks Fluxion which graph vertices are blocking the match +3. Maps those vertices to currently running pods via Fluence's allocation + tracking +4. Passes only those specific pods to the preemption logic +5. Returns the `nominatedNodeName` that Fluxion identified + +This ensures preemption targets topologically correct pods — pods whose +eviction will actually let Fluxion satisfy the match — rather than +arbitrarily choosing the lowest-priority pods on the cluster. + +## 4. Properties of the Design + +### 4.1 Zero user cooperation required + +The SDK interceptor is injected transparently by the webhook. The user +application requires no changes. The sidecar is injected automatically. +The only user-visible artifact is the `FLUXION_ARN` env var, which the +user already needs to know which backend to target. + +### 4.2 Classical resources allocated at the last responsible moment + +Graph allocation (`MatchAllocateSpec`) happens only when the QPU task +reaches position==1 — seconds to minutes before the result arrives. During +the entire QPU queue wait, no classical node resources are consumed and no +graph capacity is held. + +### 4.3 Classical allocation follows quantum execution order + +Because each workflow's gate is removed independently when its QPU task +reaches position==1, workflows whose QPU tasks execute earlier get classical +resources earlier — regardless of submission order. A workflow submitted to +a quiet backend gets its classical resources before a workflow submitted +earlier to a busy one. This aligns classical scheduling with actual quantum +execution order across heterogeneous backends. + +### 4.4 No estimation of QPU queue time required + +The design makes no attempt to predict when the QPU task will execute. +`position==1` is an observable state transition, not an estimate. The +design is robust to variable queue depths, hardware maintenance windows, +and concurrent submissions by other users. + +### 4.5 Task ARN propagated to classical pods + +When the sidecar removes the gate, it patches `braket.quantum/task-arn` +onto each classical pod as an annotation. Classical pods read this via +the downward API and can use it to retrieve results from S3, submit +follow-on circuits, perform error mitigation, or do anything else the +Braket SDK supports. The sidecar does not prescribe what classical pods +do with the result. + +## 5. Limitations + +### 5.1 Non-Braket SDKs + +The SDK interceptor currently patches `AwsDevice.run()`. Support for +IBM Qiskit Runtime (`backend.run()`), IQM, and other vendors requires +additional interceptors. The pattern is identical; only the entry point +differs. + +### 5.2 Preemption disrupts lower-priority work + +At position==1, classical pods may preempt running lower-priority work. +This work is re-queued and eventually runs, but there is a disruption cost. +A future design using Fluxion's `MatchReserveAt` primitive with a +vendor-supplied ETA would allow graceful draining instead of preemption. +This requires QPU vendors to expose task ETA or start-event webhooks, +which no current vendor provides. + +### 5.3 Multi-task workflows + +The sidecar currently tracks one task per pod. Workflows that submit +multiple QPU tasks (e.g. parameter-shift gradient estimation with 2P +circuits) require the sidecar to track a set of task ARNs and ungate +classical pods when all tasks reach position==1 or a subset completes. +This is a straightforward extension. + +### 5.4 Sidecar resource consumption + +The sidecar consumes minimal CPU and memory (polling every 30s), but +it does hold an open AWS API connection for the duration of the QPU +queue wait. On clusters with many concurrent hybrid workflows this +may become a concern. + +## 6. Required Vendor API Primitive + +The remaining limitation that cannot be solved without vendor cooperation +is task provenance — associating a Braket task with the Kubernetes pod +that submitted it without SDK interception. If Braket were to expose a +`clientToken` or `podIdentity` field that the SDK set automatically from +the execution environment (analogous to how IAM roles work for EC2 +instances), the interceptor would not be needed. + +More significantly, if QPU vendors exposed a task-start event (webhook, +SNS notification, or EventBridge rule) when a task transitions from +QUEUED to RUNNING, the sidecar could react to that event rather than +polling. This would enable graceful draining rather than preemption, and +would allow Fluxion's reservation system to be used with an externally- +supplied `time_at` rather than requiring the position==1 heuristic. + +## 7. Implementation Plan + +### Phase 1 — Sidecar container (this repo) +- `docker/fluence-sidecar/` — sidecar image +- SDK interceptor (`fluence_braket_intercept.py`) +- Task discovery (tagged search + heuristic fallback) +- Queue position polling +- Pod annotation patching and gate removal + +### Phase 2 — Fluence webhook changes +- Inject `FLUENCE_POD_UID` env var into QPU pods +- Inject sidecar container into QPU pods +- Inject SDK interceptor as a mounted ConfigMap +- Inject `FLUENCE_GATED_PODS` annotation listing sibling gated pods +- Create `quantum-classical-high` PriorityClass + +### Phase 3 — Fluence PostFilter +- Custom preemption targeting Fluxion-graph-aware pod selection +- Integration with existing allocation tracking in placement.go + +### Phase 4 — Experiment +- Demonstrate two-queue problem empirically (experiment 1, already running) +- Demonstrate gate + sidecar design reducing classical idle time +- Compare classical node-seconds consumed: ungated vs gated +- Show quantum execution order driving classical allocation order + across heterogeneous backends (SV1, IQM, Rigetti) +EOF diff --git a/sidecars/braket/fluence_braket_intercept.py b/sidecars/braket/fluence_braket_intercept.py new file mode 100644 index 0000000..8afb6c5 --- /dev/null +++ b/sidecars/braket/fluence_braket_intercept.py @@ -0,0 +1,35 @@ +# fluence_braket_intercept.py +# +# Injected by the Fluence webhook into every pod requesting a QPU resource. +# Patches AwsDevice.run() to automatically tag every quantum task submission +# with the pod UID, enabling the fluence-sidecar to find the task without +# any user application changes. +# +# Installed as a Python sitecustomize hook so it runs before any user code. +# The user application requires no changes. + +import os + + +def _install_interceptor(): + try: + from braket.aws import AwsDevice + + _original_run = AwsDevice.run + + def _patched_run(self, task_specification, *args, **kwargs): + pod_uid = os.environ.get("FLUENCE_POD_UID", "") + if pod_uid: + tags = kwargs.get("tags", {}) + tags["fluence-pod-uid"] = pod_uid + kwargs["tags"] = tags + return _original_run(self, task_specification, *args, **kwargs) + + AwsDevice.run = _patched_run + + except ImportError: + # amazon-braket-sdk not installed in this container — skip + pass + + +_install_interceptor() diff --git a/sidecars/braket/sidecar.py b/sidecars/braket/sidecar.py new file mode 100644 index 0000000..432aac8 --- /dev/null +++ b/sidecars/braket/sidecar.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +""" +fluence-sidecar: Quantum-classical scheduling coordination for Fluence. + +Injected automatically by the Fluence mutating webhook into any pod +requesting a QPU resource (fluxion.flux-framework.org/qpu). + +Responsibilities: + 1. Find the quantum task submitted by the sibling user application + container, by searching for tasks tagged with FLUENCE_POD_UID. + 2. Poll task.queue_position() until position==1 or RUNNING. + 3. Patch braket.quantum/task-arn onto gated sibling classical pods. + 4. Remove scheduling gates from those pods — Kubernetes preemption + and the Fluence PostFilter handle placement from there. + +Environment variables (all injected by Fluence webhook): + FLUENCE_POD_UID UID of this pod + FLUENCE_POD_NAME Name of this pod + FLUENCE_NAMESPACE Kubernetes namespace + FLUENCE_GATED_PODS Comma-separated names of gated sibling pods + FLUXION_ARN Braket device ARN for this pod + FLUENCE_TASK_DISCOVERY_TIMEOUT Seconds to wait for task discovery (default: 300) + FLUENCE_POLL_INTERVAL Seconds between queue position polls (default: 30) + AWS_ACCESS_KEY_ID } AWS credentials — shared from pod spec + AWS_SECRET_ACCESS_KEY } + AWS_DEFAULT_REGION } +""" + +import asyncio +import json +import os +import subprocess +import sys +import time +from datetime import datetime, timezone + + +# ── helpers ──────────────────────────────────────────────────────────────────── +# Shared ungating logic lives in sidecars/lib/ungate.py so all vendor sidecars +# can reuse it. Add that directory to the path when running from the repo. +import sys +_lib = os.path.join(os.path.dirname(__file__), "..", "lib") +if os.path.isdir(_lib): + sys.path.insert(0, _lib) +from ungate import log, kubectl, ungate_pods, gated_pods_from_env, namespace_from_env + + +# ── task discovery ───────────────────────────────────────────────────────────── + +def find_task_by_tag(client, device_arn, pod_uid, timeout): + """ + Search for a Braket task tagged fluence-pod-uid= on device_arn. + Polls until found or timeout. Returns task ARN or None. + """ + log(f"Searching for task with tag fluence-pod-uid={pod_uid} on {device_arn}") + deadline = time.time() + timeout + + while time.time() < deadline: + try: + # Extract region from device ARN + # arn:aws:braket:::device/... + region = device_arn.split(":")[3] or os.environ.get("AWS_DEFAULT_REGION", "us-east-1") + response = client.search_quantum_tasks( + filters=[ + { + "name": "deviceArn", + "operator": "EQUAL", + "values": [device_arn], + }, + { + "name": "tags:fluence-pod-uid", + "operator": "EQUAL", + "values": [pod_uid], + }, + ], + maxResults=10, + ) + tasks = response.get("quantumTasks", []) + if tasks: + # Most recently created task is ours + tasks.sort(key=lambda t: t.get("createdAt", ""), reverse=True) + arn = tasks[0]["quantumTaskArn"] + log(f"Found task by tag: {arn}") + return arn + except Exception as e: + log(f"Search error (will retry): {e}") + + time.sleep(10) + + log("Task discovery by tag timed out") + return None + + +def find_task_by_time_window(client, device_arn, pod_start_ts, timeout): + """ + Fallback: find the most recently created task on device_arn submitted + after pod_start_ts. Used when tag-based discovery fails. + """ + log(f"Falling back to time-window heuristic (pod_start={pod_start_ts})") + deadline = time.time() + timeout + + while time.time() < deadline: + try: + response = client.search_quantum_tasks( + filters=[ + { + "name": "deviceArn", + "operator": "EQUAL", + "values": [device_arn], + }, + { + "name": "status", + "operator": "EQUAL", + "values": ["QUEUED"], + }, + ], + maxResults=50, + ) + tasks = response.get("quantumTasks", []) + # Filter to tasks created after pod start + candidates = [ + t for t in tasks + if t.get("createdAt", "") >= pod_start_ts + ] + if candidates: + candidates.sort(key=lambda t: t.get("createdAt", ""), reverse=True) + arn = candidates[0]["quantumTaskArn"] + log(f"Found task by time window (heuristic): {arn} " + f"(WARNING: may not be correct if multiple tasks submitted)") + return arn + except Exception as e: + log(f"Search error (will retry): {e}") + + time.sleep(10) + + log("Time-window task discovery timed out") + return None + + +# ── queue position polling ───────────────────────────────────────────────────── + +def wait_for_position_one(task_arn, poll_interval): + """ + Poll task.queue_position() until position==1 or task is RUNNING. + Returns when it's time to ungate classical pods. + """ + asyncio.set_event_loop(asyncio.new_event_loop()) + + from braket.aws import AwsQuantumTask + + log(f"Polling queue position for task {task_arn.split('/')[-1]}") + last_position = None + + while True: + try: + task = AwsQuantumTask(arn=task_arn) + state = task.state() + + if state in ("COMPLETED", "FAILED", "CANCELLED"): + log(f"Task reached terminal state: {state} — ungating now") + return state + + if state == "RUNNING": + log("Task is RUNNING — ungating classical pods") + return state + + pos_info = task.queue_position() + position = pos_info.queue_position + + if position != last_position: + log(f"Queue position: {position} (state={state})") + last_position = position + + if position == "1": + log("Queue position is 1 — ungating classical pods") + return state + + except Exception as e: + log(f"Queue position poll error (will retry): {e}") + + time.sleep(poll_interval) + + +# ungate_pods is imported from sidecars/lib/ungate.py + + +# ── main ─────────────────────────────────────────────────────────────────────── + +def main(): + pod_uid = os.environ.get("FLUENCE_POD_UID", "") + pod_name = os.environ.get("FLUENCE_POD_NAME", "") + namespace = os.environ.get("FLUENCE_NAMESPACE", "default") + gated_str = os.environ.get("FLUENCE_GATED_PODS", "") + device_arn = os.environ.get("FLUXION_ARN", "") + discovery_timeout = int(os.environ.get("FLUENCE_TASK_DISCOVERY_TIMEOUT", 300)) + poll_interval = int(os.environ.get("FLUENCE_POLL_INTERVAL", 30)) + + gated_pods = [p.strip() for p in gated_str.split(",") if p.strip()] + + log(f"Starting fluence-sidecar") + log(f" pod_uid : {pod_uid}") + log(f" pod_name : {pod_name}") + log(f" namespace : {namespace}") + log(f" device_arn : {device_arn}") + log(f" gated_pods : {gated_pods}") + + if not device_arn: + log("ERROR: FLUXION_ARN not set — cannot discover task") + sys.exit(1) + + if not gated_pods: + log("No gated pods to ungate — exiting") + sys.exit(0) + + # Get region from ARN or env + region = device_arn.split(":")[3] or os.environ.get("AWS_DEFAULT_REGION", "us-east-1") + if not region: + region = "us-east-1" + + import boto3 + asyncio.set_event_loop(asyncio.new_event_loop()) + client = boto3.client("braket", region_name=region) + + # Pod start time for fallback heuristic + pod_start_ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + # 1. Discover task ARN + task_arn = find_task_by_tag(client, device_arn, pod_uid, discovery_timeout) + + if not task_arn: + log("Tag-based discovery failed — trying time-window heuristic") + task_arn = find_task_by_time_window( + client, device_arn, pod_start_ts, discovery_timeout + ) + + if not task_arn: + log("ERROR: could not find quantum task — ungating anyway to avoid deadlock") + ungate_pods(gated_pods, "", namespace) + sys.exit(1) + + # 2. Wait for position==1 or RUNNING + wait_for_position_one(task_arn, poll_interval) + + # 3. Ungate classical pods with task ARN + ungate_pods(gated_pods, task_arn, namespace) + + log("Done — classical pods ungated") + + +if __name__ == "__main__": + main() diff --git a/sidecars/braket/test/integration.sh b/sidecars/braket/test/integration.sh new file mode 100644 index 0000000..ab7dfe7 --- /dev/null +++ b/sidecars/braket/test/integration.sh @@ -0,0 +1,334 @@ +#!/usr/bin/env bash +# sidecars/braket/test/integration.sh +# +# Local integration test for the Fluence Braket sidecar. +# Requires a running Kubernetes cluster and AWS credentials with Braket access. +# +# What this tests: +# 1. SDK interceptor: AwsDevice.run() tags tasks with fluence-pod-uid +# 2. Task discovery: sidecar finds the task by tag via search_quantum_tasks +# 3. Queue position polling: sidecar polls and logs queue position +# 4. Ungating: sidecar removes gate and patches task ARN when position==1 +# +# Usage: +# # With existing cluster and credentials secret already applied: +# bash sidecars/braket/test/integration.sh +# +# # Override defaults: +# NAMESPACE=test BACKEND=sv1 bash sidecars/braket/test/integration.sh +# +# Prerequisites: +# - kubectl configured against a running cluster +# - aws-braket-credentials secret in $NAMESPACE +# - Fluence installed (for schedulerName: fluence to work) +# - fluence-sidecar-braket image built and loaded into cluster +# (or pulled from GHCR) +set -euo pipefail + +NAMESPACE="${NAMESPACE:-default}" +BACKEND="${BACKEND:-sv1}" +SIDECAR_IMAGE="${SIDECAR_IMAGE:-ghcr.io/converged-computing/fluence-sidecar-braket:latest}" +HERE="$(cd "$(dirname "$0")" && pwd)" + +log() { echo "=== [braket-integration] $*"; } +fail() { echo "FAIL: $*" >&2; dump; exit 1; } + +dump() { + echo "----- pods -----" + kubectl get pods -n "$NAMESPACE" -o wide || true + echo "----- gateway logs -----" + kubectl logs -n "$NAMESPACE" integration-gateway -c user-app --tail=50 || true + echo "----- sidecar logs -----" + kubectl logs -n "$NAMESPACE" integration-gateway -c fluence-sidecar --tail=50 || true + echo "----- classical pod -----" + kubectl describe pod -n "$NAMESPACE" integration-classical || true +} + +# Check prerequisites +kubectl get secret aws-braket-credentials -n "$NAMESPACE" > /dev/null 2>&1 \ + || fail "aws-braket-credentials secret not found in namespace $NAMESPACE" + +log "Running braket sidecar integration test" +log " namespace : $NAMESPACE" +log " backend : $BACKEND" +log " image : $SIDECAR_IMAGE" + +# Determine device ARN from backend name +case "$BACKEND" in + sv1) DEVICE_ARN="arn:aws:braket:::device/quantum-simulator/amazon/sv1" ;; + tn1) DEVICE_ARN="arn:aws:braket:::device/quantum-simulator/amazon/tn1" ;; + *) fail "Unknown backend: $BACKEND (use sv1 or tn1 for integration tests)" ;; +esac + +POD_UID="integration-test-$(date +%s)" + +# Clean up any leftover pods from a previous run +kubectl delete pod integration-gateway integration-classical \ + -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true +kubectl delete rolebinding fluence-sidecar-integration \ + -n "$NAMESPACE" --ignore-not-found=true 2>/dev/null || true +kubectl delete role fluence-sidecar-integration \ + -n "$NAMESPACE" --ignore-not-found=true 2>/dev/null || true +kubectl delete serviceaccount fluence-sidecar-integration \ + -n "$NAMESPACE" --ignore-not-found=true 2>/dev/null || true + +# Create RBAC for sidecar to patch pods +kubectl apply -n "$NAMESPACE" -f - << YAML +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluence-sidecar-integration + namespace: ${NAMESPACE} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: fluence-sidecar-integration + namespace: ${NAMESPACE} +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "patch", "annotate"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: fluence-sidecar-integration + namespace: ${NAMESPACE} +subjects: + - kind: ServiceAccount + name: fluence-sidecar-integration + namespace: ${NAMESPACE} +roleRef: + kind: Role + name: fluence-sidecar-integration + apiGroup: rbac.authorization.k8s.io +YAML + +# Create the classical pod (gated, waiting for sidecar) +kubectl apply -n "$NAMESPACE" -f - << YAML +apiVersion: v1 +kind: Pod +metadata: + name: integration-classical + namespace: ${NAMESPACE} + annotations: + braket.quantum/task-arn: "" +spec: + restartPolicy: Never + schedulingGates: + - name: quantum.braket/ready + containers: + - name: classical-worker + image: python:3.11-slim + command: + - python3 + - -c + - | + import os, time + arn = os.environ.get("BRAKET_TASK_ARN", "") + print(f"TASK_ARN={arn}") + assert arn, "BRAKET_TASK_ARN is empty" + print("classical-worker: task ARN received correctly") + # Verify we can retrieve the result from Braket using the ARN + from braket.aws import AwsQuantumTask + import asyncio + asyncio.set_event_loop(asyncio.new_event_loop()) + task = AwsQuantumTask(arn=arn) + state = task.state() + print(f"classical-worker: task state={state}") + assert state in ("COMPLETED", "RUNNING"), f"unexpected state: {state}" + print("PASS: classical worker got valid task ARN and confirmed task state") + env: + - name: BRAKET_TASK_ARN + valueFrom: + fieldRef: + fieldPath: metadata.annotations['braket.quantum/task-arn'] + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-braket-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-braket-credentials + key: AWS_SECRET_ACCESS_KEY + - name: AWS_DEFAULT_REGION + valueFrom: + secretKeyRef: + name: aws-braket-credentials + key: AWS_DEFAULT_REGION + resources: + requests: + cpu: "100m" + memory: "256Mi" +YAML + +# Create the gateway pod with user-app + real sidecar +kubectl apply -n "$NAMESPACE" -f - << YAML +apiVersion: v1 +kind: Pod +metadata: + name: integration-gateway + namespace: ${NAMESPACE} +spec: + restartPolicy: Never + serviceAccountName: fluence-sidecar-integration + + initContainers: + # user-app: submits a real circuit to SV1 — SDK interceptor tags it + - name: user-app + image: ghcr.io/converged-computing/quantum-braket-braket-gateway:latest + command: + - python3 + - -c + - | + import os, sys + # Install the interceptor (normally injected by webhook) + sys.path.insert(0, "/app") + exec(open("/app/fluence_braket_intercept.py").read()) + + from braket.aws import AwsDevice + from braket.circuits import Circuit + + device = AwsDevice("${DEVICE_ARN}") + bell = Circuit().h(0).cnot(0, 1) + print(f"user-app: submitting circuit to ${BACKEND}") + print(f"user-app: FLUENCE_POD_UID={os.environ.get('FLUENCE_POD_UID', 'NOT SET')}") + task = device.run(bell, shots=10) + print(f"user-app: submitted task {task.id}") + print(f"user-app: tags should include fluence-pod-uid") + env: + - name: FLUENCE_POD_UID + value: "${POD_UID}" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-braket-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-braket-credentials + key: AWS_SECRET_ACCESS_KEY + - name: AWS_DEFAULT_REGION + valueFrom: + secretKeyRef: + name: aws-braket-credentials + key: AWS_DEFAULT_REGION + + containers: + # real fluence-sidecar + - name: fluence-sidecar + image: ${SIDECAR_IMAGE} + env: + - name: FLUENCE_POD_UID + value: "${POD_UID}" + - name: FLUENCE_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: FLUENCE_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: FLUENCE_GATED_PODS + value: "integration-classical" + - name: FLUXION_ARN + value: "${DEVICE_ARN}" + - name: FLUENCE_TASK_DISCOVERY_TIMEOUT + value: "120" + - name: FLUENCE_POLL_INTERVAL + value: "10" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-braket-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-braket-credentials + key: AWS_SECRET_ACCESS_KEY + - name: AWS_DEFAULT_REGION + valueFrom: + secretKeyRef: + name: aws-braket-credentials + key: AWS_DEFAULT_REGION + resources: + requests: + cpu: "100m" + memory: "512Mi" +YAML + +log "Pods submitted. Waiting for gateway to reach Running..." + +# Wait for gateway Running +for i in $(seq 1 120); do + phase=$(kubectl get pod integration-gateway -n "$NAMESPACE" \ + -o jsonpath='{.status.phase}' 2>/dev/null || true) + [ "$phase" = "Running" ] && break + sleep 3 +done +[ "$(kubectl get pod integration-gateway -n "$NAMESPACE" \ + -o jsonpath='{.status.phase}')" = "Running" ] \ + || fail "integration-gateway did not reach Running" + +log "Gateway is Running. Waiting for sidecar to ungate classical pod..." + +# Wait for classical pod to be ungated (up to 5 minutes for SV1 queue) +for i in $(seq 1 100); do + phase=$(kubectl get pod integration-classical -n "$NAMESPACE" \ + -o jsonpath='{.status.phase}' 2>/dev/null || true) + { [ "$phase" = "Running" ] || [ "$phase" = "Succeeded" ]; } && break + # Print sidecar progress every 30s + [ $((i % 10)) -eq 0 ] && \ + kubectl logs integration-gateway -n "$NAMESPACE" \ + -c fluence-sidecar --tail=5 2>/dev/null || true + sleep 3 +done + +phase=$(kubectl get pod integration-classical -n "$NAMESPACE" \ + -o jsonpath='{.status.phase}' 2>/dev/null || true) +{ [ "$phase" = "Running" ] || [ "$phase" = "Succeeded" ]; } \ + || fail "integration-classical was not ungated (phase=$phase)" + +log "Classical pod ungated. Checking task ARN annotation..." + +arn=$(kubectl get pod integration-classical -n "$NAMESPACE" \ + -o jsonpath='{.metadata.annotations.braket\.quantum/task-arn}' 2>/dev/null || true) +[ -n "$arn" ] || fail "braket.quantum/task-arn annotation not set" +log "Task ARN: $arn" + +# Wait for classical pod to complete +for i in $(seq 1 60); do + phase=$(kubectl get pod integration-classical -n "$NAMESPACE" \ + -o jsonpath='{.status.phase}' 2>/dev/null || true) + [ "$phase" = "Succeeded" ] && break + [ "$phase" = "Failed" ] && fail "integration-classical Failed" + sleep 3 +done + +[ "$(kubectl get pod integration-classical -n "$NAMESPACE" \ + -o jsonpath='{.status.phase}')" = "Succeeded" ] \ + || fail "integration-classical did not Succeed" + +# Verify classical pod got the ARN and confirmed task state +out=$(kubectl logs integration-classical -n "$NAMESPACE" 2>/dev/null || true) +echo "$out" | grep -q "PASS:" || fail "classical worker did not PASS (logs: $out)" + +log "Sidecar logs:" +kubectl logs integration-gateway -n "$NAMESPACE" -c fluence-sidecar || true + +log "PASS: full braket sidecar integration test complete" +log " SDK interceptor tagged task with fluence-pod-uid" +log " Sidecar discovered task by tag" +log " Sidecar polled queue position and ungated at position==1" +log " Task ARN propagated to classical pod via annotation" +log " Classical pod confirmed task state via Braket SDK" + +# Cleanup +kubectl delete pod integration-gateway integration-classical \ + -n "$NAMESPACE" --ignore-not-found=true --wait=false || true diff --git a/sidecars/lib/ungate.py b/sidecars/lib/ungate.py new file mode 100644 index 0000000..ebc1b9f --- /dev/null +++ b/sidecars/lib/ungate.py @@ -0,0 +1,91 @@ +""" +sidecars/lib/ungate.py — shared ungating logic for all Fluence sidecars. + +Every vendor sidecar calls ungate_pods() once the quantum task is ready. +This module handles the Kubernetes side: patching the task ARN annotation +and removing the scheduling gate from each classical pod. +""" + +import json +import os +import subprocess +from datetime import datetime, timezone + + +def log(msg): + ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + print(f"[fluence-sidecar] {ts} {msg}", flush=True) + + +def kubectl(args): + result = subprocess.run( + ["kubectl"] + args, + capture_output=True, text=True + ) + if result.returncode != 0: + raise RuntimeError( + f"kubectl {' '.join(args)} failed: {result.stderr.strip()}" + ) + return result.stdout.strip() + + +def ungate_pods(gated_pods, task_arn, namespace): + """ + For each gated pod: + 1. Patch braket.quantum/task-arn annotation with the task ARN + 2. Remove the quantum.braket/ready scheduling gate + + gated_pods: list of pod names + task_arn: the vendor task ARN to propagate (may be empty string if unknown) + namespace: Kubernetes namespace + """ + for pod_name in gated_pods: + pod_name = pod_name.strip() + if not pod_name: + continue + + log(f"Ungating pod: {pod_name}") + + # 1. Patch task ARN annotation + if task_arn: + try: + kubectl([ + "annotate", "pod", pod_name, + "-n", namespace, + f"braket.quantum/task-arn={task_arn}", + "--overwrite", + ]) + log(f" Patched task ARN onto {pod_name}: {task_arn}") + except RuntimeError as e: + log(f" WARNING: could not patch annotation on {pod_name}: {e}") + else: + log(f" WARNING: no task ARN available to patch onto {pod_name}") + + # 2. Remove scheduling gate + patch = json.dumps([{ + "op": "remove", + "path": "/spec/schedulingGates/0" + }]) + try: + kubectl([ + "patch", "pod", pod_name, + "-n", namespace, + "--type=json", + f"-p={patch}", + ]) + log(f" Removed scheduling gate from {pod_name}") + except RuntimeError as e: + log(f" WARNING: could not remove gate from {pod_name}: {e}") + + +def gated_pods_from_env(): + """Read FLUENCE_GATED_PODS env var and return a list of pod names.""" + return [ + p.strip() + for p in os.environ.get("FLUENCE_GATED_PODS", "").split(",") + if p.strip() + ] + + +def namespace_from_env(): + return os.environ.get("FLUENCE_NAMESPACE", "default") diff --git a/test/e2e/04-sidecar-ungate.sh b/test/e2e/04-sidecar-ungate.sh new file mode 100644 index 0000000..68e489e --- /dev/null +++ b/test/e2e/04-sidecar-ungate.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Sidecar gate/ungate plumbing test. +# +# This test verifies the Kubernetes mechanics of the sidecar design: +# 1. A gated classical pod stays SchedulingGated until something removes the gate +# 2. A pod with kubectl access can patch an annotation and remove a gate +# 3. The classical pod reads the patched annotation via the downward API +# +# This does NOT test the braket sidecar itself (task discovery, SDK interceptor, +# queue position polling). Those require real AWS credentials and are covered +# by sidecars/braket/test/integration.sh which is run locally. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" + +log "TEST 4: sidecar gate/ungate Kubernetes plumbing" + +kubectl apply -f examples/test/e2e/sidecar-mock.yaml + +# Classical pod must start SchedulingGated — verify it is NOT Running immediately +sleep 5 +phase="$(kubectl get pod classical-mock -o jsonpath='{.status.phase}' 2>/dev/null || true)" +[ "$phase" != "Running" ] || fail "classical-mock should not be Running before gate is removed (phase=$phase)" +log "classical-mock is correctly gated (phase=${phase:-SchedulingGated})" + +# Gateway pod should reach Running +wait_pod_phase quantum-gateway-mock Running 60 \ + || fail "quantum-gateway-mock did not reach Running" + +# Mock sidecar should ungate classical-mock within 60s +log "waiting for mock sidecar to ungate classical-mock..." +for i in $(seq 1 60); do + phase="$(kubectl get pod classical-mock -o jsonpath='{.status.phase}' 2>/dev/null || true)" + { [ "$phase" = "Running" ] || [ "$phase" = "Succeeded" ]; } && break + sleep 2 +done +wait_pod_phase classical-mock Running 30 \ + || fail "classical-mock did not reach Running after gate removal" + +# Task ARN annotation must have been patched +arn="$(kubectl get pod classical-mock \ + -o jsonpath='{.metadata.annotations.braket\.quantum/task-arn}' 2>/dev/null || true)" +[ -n "$arn" ] || fail "braket.quantum/task-arn annotation not set on classical-mock" +log "task ARN annotation present: $arn" + +# Classical pod must have read the annotation via downward API +out="$(kubectl logs classical-mock 2>/dev/null || true)" +echo "$out" | grep -q "TASK_ARN=" \ + || fail "BRAKET_TASK_ARN not visible in classical-mock logs (got: $out)" + +log "PASS: gate/ungate plumbing works — annotation patched and read via downward API" +log "NOTE: braket sidecar integration test (SDK intercept, tag discovery," +log " queue polling) is in sidecars/braket/test/integration.sh" +kubectl delete -f examples/test/e2e/sidecar-mock.yaml --wait=false || true From a9e4d1e7c8c9b8cf138b08c9143c02a3abc3d855 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 18 Jun 2026 19:05:56 -0700 Subject: [PATCH 2/2] feat: ensure we set higher priority for gated pods this will ensure they will be scheduled quickly when ungated, since there is no real concept of a reservation. Signed-off-by: vsoch --- .github/workflows/e2e-tests.yaml | 10 +- .github/workflows/sidecar-build-deploy.yaml | 2 +- README.md | 75 ++- deploy/fluence-test.yaml | 27 +- deploy/fluence.yaml | 22 +- examples/test/e2e/sidecar-mock-pods.yaml | 62 ++ examples/test/e2e/sidecar-mock.yaml | 166 ----- pkg/webhook/webhook.go | 691 +++++++------------- pkg/webhook/webhook_test.go | 29 +- sidecars/braket/design.md | 452 ++++++------- sidecars/lib/ungate.py | 23 +- test/e2e/04-sidecar-ungate.sh | 92 ++- 12 files changed, 735 insertions(+), 916 deletions(-) create mode 100644 examples/test/e2e/sidecar-mock-pods.yaml delete mode 100644 examples/test/e2e/sidecar-mock.yaml diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml index 16f679f..6e968d3 100644 --- a/.github/workflows/e2e-tests.yaml +++ b/.github/workflows/e2e-tests.yaml @@ -36,7 +36,7 @@ jobs: - name: Create k8s Kind Cluster uses: helm/kind-action@v1.10.0 with: - version: v0.32.0 # Define your custom KinD CLI version here + version: v0.32.0 # required for gang node_image: kindest/node:v1.36.1 config: ./deploy/kind-config.yaml @@ -78,6 +78,8 @@ jobs: done [ -n "$POD" ] || { echo "ERROR: no Running fluence pod found"; exit 1; } echo "Using pod: $POD" + # Brief sleep to let the container runtime stabilize before exec + sleep 5 kubectl -n kube-system exec "$POD" -- ls /tmp/ kubectl -n kube-system logs "$POD" kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" @@ -107,6 +109,8 @@ jobs: done [ -n "$POD" ] || { echo "ERROR: no Running fluence pod found after restart"; exit 1; } echo "Using pod: $POD" + # Brief sleep to let the container runtime stabilize before exec + sleep 5 kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" - name: Wait for webhook @@ -126,12 +130,10 @@ jobs: - name: E2E - quantum placement run: bash test/e2e/02-quantum-placement.sh - # Note: I commented this out until we add back to fluence. - # It depends on a PR to flux-sched that is not merged. #- name: E2E - restart recovery (no double-book) # run: bash test/e2e/03-restart-recovery.sh - - name: E2E - restart recovery (no double-book) + - name: E2E - sidecar ungate run: bash test/e2e/04-sidecar-ungate.sh - name: Dump diagnostics on failure diff --git a/.github/workflows/sidecar-build-deploy.yaml b/.github/workflows/sidecar-build-deploy.yaml index 83e9424..c11245d 100644 --- a/.github/workflows/sidecar-build-deploy.yaml +++ b/.github/workflows/sidecar-build-deploy.yaml @@ -62,7 +62,7 @@ jobs: - name: Build and push ${{ matrix.sidecar }} sidecar uses: docker/build-push-action@v6 with: - context: ./sidecars + context: . file: ./sidecars/${{ matrix.sidecar }}/Dockerfile push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} diff --git a/README.md b/README.md index 0068322..54870c0 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ native Kubernetes PodGroup (Gang) API — no sidecar, and no proven out first in [fluxion-quantum](https://github.com/converged-computing/fluxion-quantum). -## How the pieces fit together +## How does it work? ```console resources.yaml --+ @@ -80,8 +80,6 @@ virtual resource (`virtual=true`) cannot be co-selected in one match -- one woul prune the other. A pod needing both produces **two** match-allocate calls, held together all-or-nothing. ---- - ## Components ### `pkg/jgf` — JGF graph builder @@ -193,6 +191,39 @@ admission. The real gating is Fluxion (and the backend's own limits); since a virtual backend is reachable from any node, each type is advertised at a large ceiling. Types come from the same config as the graph, so they can't drift. +### `sidecars/` — quantum coordination sidecars + +Vendor-specific sidecar containers injected by the webhook into leader pods +of quantum workflow groups. Each sidecar discovers the QPU task submitted by +the leader, polls the vendor queue, and ungates worker pods when the task +reaches position==1. + +```console +sidecars/ + lib/ungate.py shared gate removal + ARN annotation logic + braket/ + sidecar.py AWS Braket sidecar (tag search, queue polling, ungate) + fluence_braket_intercept.py AwsDevice.run() monkey-patch (PYTHONSTARTUP) + Dockerfile build context is sidecars/ to include lib/ + design.md full design document + test/integration.sh local integration test (requires AWS credentials) +``` + +The sidecar is injected automatically — users only need the group label: + +```yaml +metadata: + labels: + fluence.flux-framework.org/group: my-workflow +spec: + schedulerName: fluence +``` + +Fluence creates the PodGroup, injects the sidecar, creates per-namespace +RBAC, and gates all non-leader pods. See `sidecars/braket/design.md` for +the full design including the SDK interceptor, queue position polling, and +the two-queue problem motivation. + ### `pkg/webhook` — environment injection A mutating webhook that surfaces scheduler-chosen values to a workload. Container @@ -210,7 +241,9 @@ workload reads these normalized names regardless of which backend it matched. - `cmd/deviceplugin` — the extended-resource DaemonSet. - `cmd/webhook` — the env-injection webhook. - `cmd/recovery-probe` — verifies allocation replay survives a graph rebuild - (what a restart does); see `make test-restore`. Note this was implemented but removed because the code in fluxion is only part of a PR branch, and I feel nervous about depending on it. + (what a restart does); see `make test-restore`. + +Note that the recovery probe (and graph restore) was implemented but removed because the code in fluxion is only part of a PR branch, and I feel nervous about depending on it. ## Configuration @@ -335,6 +368,38 @@ Submission is **not** done by the scheduler — the workload container holds the user's credentials and submits via qrmi-go. Fluence only schedules and hands off the backend. (When we control local quantum devices this will change.) +### 3. Quantum workflow groups (leader + workers) + +For workflows where a leader pod submits quantum work and worker pods process +the results, add the group label to all pods. Fluence gates the workers until +the QPU task reaches position==1 in the vendor queue: + +```yaml +# All pods in the group get the same label +metadata: + labels: + fluence.flux-framework.org/group: my-qaoa-workflow +spec: + schedulerName: fluence +``` + +The first pod admitted becomes the leader — Fluence injects the sidecar and +creates a PodGroup with `minCount: 1`. All subsequent pods get a +`quantum.braket/ready` scheduling gate and consume no node resources during +the QPU queue wait. When the sidecar observes `queue_position == 1`, it +patches the task ARN onto each worker pod's annotations and removes their +gates atomically with setting `fluence-quantum-classical` priority class. + +Per-namespace RBAC (`fluence-sidecar` ServiceAccount/Role/RoleBinding) and +the interceptor ConfigMap are created automatically by the webhook on first +use — no manual setup required. + +```bash +# Apply per-namespace RBAC is NOT needed — webhook creates it automatically. +# Just apply your pods with the group label and schedulerName: fluence. +kubectl apply -f my-quantum-workflow.yaml +``` + ### Notes - **Deletion hangs.** A PodGroup can hang on delete via finalizers if the workload @@ -350,4 +415,4 @@ See [LICENSE](LICENSE), [COPYRIGHT](COPYRIGHT), and [NOTICE](NOTICE). SPDX-License-Identifier: MIT -LLNL-CODE-842614 \ No newline at end of file +LLNL-CODE-842614 diff --git a/deploy/fluence-test.yaml b/deploy/fluence-test.yaml index 075602a..965516c 100644 --- a/deploy/fluence-test.yaml +++ b/deploy/fluence-test.yaml @@ -58,7 +58,7 @@ metadata: rules: - apiGroups: ["scheduling.k8s.io"] resources: ["podgroups", "workloads", "podgroups/status", "workloads/status"] - verbs: ["get", "list", "watch", "update", "patch"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] - apiGroups: ["coordination.k8s.io"] resources: ["leases"] verbs: ["create", "get", "update", "list", "watch"] @@ -77,6 +77,9 @@ rules: - apiGroups: [""] resources: ["serviceaccounts"] verbs: ["get", "create"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "create"] - apiGroups: ["rbac.authorization.k8s.io"] resources: ["roles", "rolebindings"] verbs: ["get", "create"] @@ -185,6 +188,11 @@ spec: # Allows for kind load imagePullPolicy: Never command: ["/bin/fluence-webhook"] + env: + # Use busybox as sidecar image in tests — avoids pulling the real + # sidecar image which is large and not cached in CI. + - name: FLUENCE_SIDECAR_IMAGE + value: "busybox:latest" ports: - containerPort: 8443 readinessProbe: @@ -251,6 +259,21 @@ webhooks: # kubectl apply -f deploy/fluence-sidecar.yaml +--- +# PriorityClass for classical pods paired with quantum work. +# Applied to worker pods by the webhook when they are gated. +# When ungated, high priority triggers preemption of lower-priority work +# so workers get nodes immediately as the QPU result arrives. +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: fluence-quantum-classical + labels: + app: fluence +value: 1000000 +globalDefault: false +preemptionPolicy: PreemptLowerPriority +description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." --- # SDK interceptor ConfigMap — holds the Python sitecustomize hook that # patches AwsDevice.run() to tag every quantum task with the pod UID. @@ -291,4 +314,4 @@ data: except ImportError: pass - _install_interceptor() + _install_interceptor() \ No newline at end of file diff --git a/deploy/fluence.yaml b/deploy/fluence.yaml index 7cf57a8..5215a59 100644 --- a/deploy/fluence.yaml +++ b/deploy/fluence.yaml @@ -58,7 +58,7 @@ metadata: rules: - apiGroups: ["scheduling.k8s.io"] resources: ["podgroups", "workloads", "podgroups/status", "workloads/status"] - verbs: ["get", "list", "watch", "update", "patch"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] - apiGroups: ["coordination.k8s.io"] resources: ["leases"] verbs: ["create", "get", "update", "list", "watch"] @@ -77,6 +77,9 @@ rules: - apiGroups: [""] resources: ["serviceaccounts"] verbs: ["get", "create"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "create"] - apiGroups: ["rbac.authorization.k8s.io"] resources: ["roles", "rolebindings"] verbs: ["get", "create"] @@ -246,6 +249,21 @@ webhooks: # kubectl apply -f deploy/fluence-sidecar.yaml +--- +# PriorityClass for classical pods paired with quantum work. +# Applied to worker pods by the webhook when they are gated. +# When ungated, high priority triggers preemption of lower-priority work +# so workers get nodes immediately as the QPU result arrives. +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: fluence-quantum-classical + labels: + app: fluence +value: 1000000 +globalDefault: false +preemptionPolicy: PreemptLowerPriority +description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." --- # SDK interceptor ConfigMap — holds the Python sitecustomize hook that # patches AwsDevice.run() to tag every quantum task with the pod UID. @@ -286,4 +304,4 @@ data: except ImportError: pass - _install_interceptor() + _install_interceptor() \ No newline at end of file diff --git a/examples/test/e2e/sidecar-mock-pods.yaml b/examples/test/e2e/sidecar-mock-pods.yaml new file mode 100644 index 0000000..13232f7 --- /dev/null +++ b/examples/test/e2e/sidecar-mock-pods.yaml @@ -0,0 +1,62 @@ +--- +# Leader pod — first admitted, webhook creates PodGroup, injects sidecar, creates RBAC +# User only needs schedulerName: fluence and the quantum-group label. +# No PodGroup object needed — Fluence creates it. +apiVersion: v1 +kind: Pod +metadata: + name: sidecar-test-leader + labels: + app: fluence-sidecar-test + fluence.flux-framework.org/group: sidecar-test-group +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: mock-quantum-app + image: busybox + command: + - sh + - -c + - | + echo "mock-quantum-app: running" + echo "arn:aws:braket:us-east-1:123456:quantum-task/mock-abc123" \ + > /tmp/task-arn + echo "mock-quantum-app: task ARN written" + sleep 3600 + resources: + requests: + cpu: "100m" + memory: "128Mi" + +--- +# Worker pod — webhook adds scheduling gate automatically +apiVersion: v1 +kind: Pod +metadata: + name: sidecar-test-worker + labels: + app: fluence-sidecar-test + fluence.flux-framework.org/group: sidecar-test-group +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: classical-worker + image: busybox + command: + - sh + - -c + - | + echo "classical-worker: started" + echo "TASK_ARN=$BRAKET_TASK_ARN" + sleep 10 + env: + - name: BRAKET_TASK_ARN + valueFrom: + fieldRef: + fieldPath: metadata.annotations['braket.quantum/task-arn'] + resources: + requests: + cpu: "100m" + memory: "128Mi" diff --git a/examples/test/e2e/sidecar-mock.yaml b/examples/test/e2e/sidecar-mock.yaml deleted file mode 100644 index 7960a5e..0000000 --- a/examples/test/e2e/sidecar-mock.yaml +++ /dev/null @@ -1,166 +0,0 @@ ---- -# quantum-gateway-mock: simulates a quantum gateway pod with the fluence -# sidecar injected. Uses mock containers that don't need real AWS credentials. -# -# The mock-gateway container writes a fake task ARN to a shared volume. -# The mock-sidecar reads it, simulates position==1, patches the annotation -# onto classical-mock, and removes its scheduling gate. -apiVersion: v1 -kind: Pod -metadata: - name: quantum-gateway-mock - labels: - app: fluence-sidecar-test -spec: - schedulerName: fluence - restartPolicy: Never - - serviceAccountName: fluence-sidecar-test - - initContainers: - # Simulates a user quantum application writing a task ARN - - name: mock-gateway - image: busybox - command: - - sh - - -c - - | - echo "mock-gateway: writing fake task ARN" - echo "arn:aws:braket:us-east-1:123456789:quantum-task/mock-task-abc123" \ - > /var/fluence/task-arn - echo "mock-gateway: done" - volumeMounts: - - name: fluence-task-info - mountPath: /var/fluence - - containers: - # Simulates the fluence sidecar — mock version that skips real AWS calls - - name: mock-sidecar - image: busybox - command: - - sh - - -c - - | - echo "mock-sidecar: waiting for task ARN..." - until [ -f /var/fluence/task-arn ]; do sleep 1; done - TASK_ARN=$(cat /var/fluence/task-arn) - echo "mock-sidecar: found task ARN: $TASK_ARN" - - echo "mock-sidecar: simulating position==1 reached" - - # Patch task ARN annotation onto classical pod - kubectl annotate pod classical-mock \ - "braket.quantum/task-arn=${TASK_ARN}" --overwrite - echo "mock-sidecar: patched task ARN annotation" - - # Remove scheduling gate - kubectl patch pod classical-mock \ - --type=json \ - -p='[{"op":"remove","path":"/spec/schedulingGates/0"}]' - echo "mock-sidecar: removed scheduling gate from classical-mock" - - sleep 3600 - env: - - name: FLUENCE_POD_UID - valueFrom: - fieldRef: - fieldPath: metadata.uid - - name: FLUENCE_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: FLUENCE_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - volumeMounts: - - name: fluence-task-info - mountPath: /var/fluence - - volumes: - - name: fluence-task-info - emptyDir: {} - ---- -# classical-mock: a gated classical pod that waits for the sidecar to ungate it. -# Reads the task ARN from its annotation via the downward API. -apiVersion: v1 -kind: Pod -metadata: - name: classical-mock - labels: - app: fluence-sidecar-test - annotations: - braket.quantum/task-arn: "" # populated by sidecar at ungate time -spec: - schedulerName: fluence - restartPolicy: Never - - # Gate: holds this pod out of the scheduling queue until sidecar removes it - schedulingGates: - - name: quantum.braket/ready - - # High priority: once ungated, preempts lower-priority work if needed - priorityClassName: quantum-classical-high - - containers: - - name: classical-worker - image: busybox - command: - - sh - - -c - - | - echo "classical-mock: started" - echo "TASK_ARN=$BRAKET_TASK_ARN" - echo "classical-mock: would now read results from S3 using task ARN" - sleep 10 - env: - # Task ARN injected from annotation by downward API - - name: BRAKET_TASK_ARN - valueFrom: - fieldRef: - fieldPath: metadata.annotations['braket.quantum/task-arn'] - ---- -# PriorityClass for classical pods paired with quantum work -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: quantum-classical-high -value: 1000000 -globalDefault: false -description: "High priority for classical pods paired with quantum work. Applied at ungate time." - ---- -# ServiceAccount and RBAC for the mock sidecar to patch pods -apiVersion: v1 -kind: ServiceAccount -metadata: - name: fluence-sidecar-test - namespace: default - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: fluence-sidecar-test - namespace: default -rules: - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "patch", "annotate"] - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: fluence-sidecar-test - namespace: default -subjects: - - kind: ServiceAccount - name: fluence-sidecar-test - namespace: default -roleRef: - kind: Role - name: fluence-sidecar-test - apiGroup: rbac.authorization.k8s.io diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 148493e..59715c6 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -1,31 +1,21 @@ -// Package webhook is fluence's mutating admission webhook. Its job is to make -// scheduler-chosen values reach a pod's containers without the user wiring -// anything. Container env is immutable after a pod is created, so the scheduler -// cannot write it directly; instead this webhook injects, at pod-creation time, -// a downward-API env that reads an annotation the scheduler fills in later -// (during PreBind). The user writes a plain pod; the plumbing is automatic. +// Package webhook is fluence's mutating admission webhook. // -// Current rules: +// Rules: // // 1. For a pod scheduled by fluence whose container requests a -// fluxion.flux-framework.org/* resource, inject QRMI_BACKEND sourced from -// the fluence backend annotation. New mutation rules can be added in Mutate. +// fluxion.flux-framework.org/* resource, inject FLUXION_* env vars +// sourced from annotations the scheduler writes in PreBind. // -// 2. Quantum leader/worker split for PodGroups of size > 1: -// When a PodGroup contains pods that request a QPU resource, the first such -// pod admitted becomes the leader — it gets the sidecar injected and -// FLUENCE_POD_UID set. Every subsequent pod in the same PodGroup that -// requests a QPU resource gets a quantum.braket/ready scheduling gate added, -// preventing it from entering the Fluxion scheduling cycle until the sidecar -// ungates it. The leader election is recorded as an annotation on the -// PodGroup object so it survives webhook restarts. +// 2. Quantum leader/worker split: +// Pods with label fluence.flux-framework.org/group= and +// schedulerName=fluence trigger the split. The first pod admitted +// becomes the leader — Fluence creates a PodGroup (minCount:1), +// injects the sidecar, creates per-namespace RBAC, and records the +// leader on the PodGroup. Every subsequent pod in the same group +// gets a quantum.braket/ready scheduling gate added. // -// A pod with no PodGroup (bare pod, Deployment, StatefulSet, Job) is always -// treated as a group of 1 — no gating, no sidecar, independent allocation. -// -// The webhook also manages its own TLS: it generates a self-signed CA + serving -// certificate at startup and patches its MutatingWebhookConfiguration's caBundle, -// so the install needs no cert-manager and no committed keys. +// The webhook self-manages TLS via a self-signed CA patched into the +// MutatingWebhookConfiguration caBundle at startup. package webhook import ( @@ -50,83 +40,59 @@ import ( admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" + schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" ) -// SchedulerName is the scheduler whose pods this webhook mutates. -const SchedulerName = "fluence" - -// QuantumGateName is the scheduling gate added to worker pods in a quantum -// PodGroup. The fluence sidecar removes this gate when the QPU task is ready. -const QuantumGateName = "quantum.braket/ready" - -// QuantumLeaderAnnotation is written onto the PodGroup object when the first -// QPU-requesting pod of the group is admitted. Its value is the leader pod name. -// Subsequent QPU-requesting pods in the same group check for this annotation to -// determine they are workers and should be gated. -const QuantumLeaderAnnotation = "fluence.flux-framework.org/quantum-leader" +// ── Constants ────────────────────────────────────────────────────────────────── + +const ( + SchedulerName = "fluence" + QuantumGroupLabel = "fluence.flux-framework.org/group" + QuantumLeaderAnnotation = "fluence.flux-framework.org/quantum-leader" + QuantumGateName = "quantum.braket/ready" + QuantumClassicalPriorityClass = "fluence-quantum-classical" + SidecarImage = "ghcr.io/converged-computing/fluence-sidecar-braket:latest" + SidecarServiceAccount = "fluence-sidecar" + InterceptorConfigMap = "fluence-braket-interceptor" + InterceptorVolumeName = "fluence-braket-interceptor" + InterceptorMountPath = "/etc/fluence/fluence_braket_intercept.py" +) -// SidecarImage is the default fluence braket sidecar image. Can be overridden -// via the FLUENCE_SIDECAR_IMAGE env var at webhook startup. -const SidecarImage = "ghcr.io/converged-computing/fluence-sidecar-braket:latest" +// ── Types ────────────────────────────────────────────────────────────────────── -// jsonPatchOp is a single RFC 6902 JSON Patch operation. type jsonPatchOp struct { Op string `json:"op"` Path string `json:"path"` Value any `json:"value,omitempty"` } -// Mutator injects fluence's scheduler-chosen values into a pod's containers. It -// carries the env contract — the union of attribute keys across the configured -// backends — so it injects a stable, predictable set of environment variables -// regardless of which backend a given pod ends up matching. Values flow via the -// downward API from annotations the scheduler writes in PreBind, so the env var -// NAMES are fixed at pod-creation time (here) while their VALUES populate later. type Mutator struct { - // AttributeKeys is the union of user attribute keys across all backends. Each - // becomes a FLUXION_ env var sourced from its attr- annotation. AttributeKeys []string - - // Client is used to look up and patch PodGroup objects for quantum - // leader/worker split. May be nil in unit tests that do not exercise - // quantum group logic. - Client kubernetes.Interface - - // SidecarImage is the sidecar container image to inject into leader pods. - // Defaults to SidecarImage constant if empty. - SidecarImage string + Client kubernetes.Interface + SidecarImage string } -// injectedEnv returns the full normalized env set this mutator injects into a -// fluxion-requesting container: FLUXION_BACKEND plus one FLUXION_ per -// configured attribute key. Each reads its annotation via the downward API; an -// annotation the scheduler did not set resolves to empty, which is harmless. -func (m *Mutator) injectedEnv() []corev1.EnvVar { - envs := []corev1.EnvVar{annotationEnv( - placement.EnvVarPrefix+"BACKEND", placement.BackendAnnotation)} - for _, key := range m.AttributeKeys { - envs = append(envs, annotationEnv( - placement.EnvVarName(key), placement.AttributeAnnotationPrefix+key)) +// ── Helpers ──────────────────────────────────────────────────────────────────── + +func (m *Mutator) sidecarImage() string { + if m.SidecarImage != "" { + return m.SidecarImage } - return envs + return SidecarImage } -// EnvVarNames returns the names of every env var this mutator injects, for -// startup logging so the developer sees the exact contract their container can -// rely on. -func (m *Mutator) EnvVarNames() []string { - names := make([]string, 0, len(m.AttributeKeys)+1) - for _, e := range m.injectedEnv() { - names = append(names, e.Name) +// groupName returns the value of QuantumGroupLabel on the pod, or "". +func groupName(pod *corev1.Pod) string { + if pod.Labels == nil { + return "" } - return names + return pod.Labels[QuantumGroupLabel] } -// annotationEnv builds a downward-API env var that reads a pod annotation. func annotationEnv(envName, annotationKey string) corev1.EnvVar { return corev1.EnvVar{ Name: envName, @@ -138,230 +104,220 @@ func annotationEnv(envName, annotationKey string) corev1.EnvVar { } } -// fieldEnv builds a downward-API env var that reads a pod field. func fieldEnv(envName, fieldPath string) corev1.EnvVar { return corev1.EnvVar{ Name: envName, ValueFrom: &corev1.EnvVarSource{ - FieldRef: &corev1.ObjectFieldSelector{ - FieldPath: fieldPath, - }, + FieldRef: &corev1.ObjectFieldSelector{FieldPath: fieldPath}, }, } } -// podGroupSize returns the minMember of the PodGroup the pod belongs to, -// or 1 if the pod is not in a PodGroup or the PodGroup cannot be retrieved. -func (m *Mutator) podGroupSize(ctx context.Context, pod *corev1.Pod) int { - if m.Client == nil { - return 1 +func requestsFluxionResource(c corev1.Container) bool { + for name := range c.Resources.Requests { + if strings.HasPrefix(string(name), placement.FluxionResourcePrefix) { + return true + } } - groupName := placement.PodGroupName(pod) - if groupName == "" { - return 1 + return false +} + +func hasEnv(c corev1.Container, name string) bool { + for _, e := range c.Env { + if e.Name == name { + return true + } } - pg, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Get( - ctx, groupName, metav1.GetOptions{}) - if err != nil { - log.Printf("[fluence-webhook] could not get PodGroup %s/%s: %v", - pod.Namespace, groupName, err) - return 1 + return false +} + +func resourceQuantity(s string) *resource.Quantity { + q := resource.MustParse(s) + return &q +} + +// ── Env contract ─────────────────────────────────────────────────────────────── + +func (m *Mutator) injectedEnv() []corev1.EnvVar { + envs := []corev1.EnvVar{annotationEnv( + placement.EnvVarPrefix+"BACKEND", placement.BackendAnnotation)} + for _, key := range m.AttributeKeys { + envs = append(envs, annotationEnv( + placement.EnvVarName(key), placement.AttributeAnnotationPrefix+key)) } - if pg.Spec.SchedulingPolicy.Gang.MinCount <= 1 { - return 1 + return envs +} + +func (m *Mutator) EnvVarNames() []string { + names := make([]string, 0, len(m.AttributeKeys)+1) + for _, e := range m.injectedEnv() { + names = append(names, e.Name) } - return int(pg.Spec.SchedulingPolicy.Gang.MinCount) + return names } -// podGroupLeader returns the name of the quantum leader already recorded for -// this pod's PodGroup, or "" if none has been recorded yet. +// ── PodGroup management ──────────────────────────────────────────────────────── + func (m *Mutator) podGroupLeader(ctx context.Context, pod *corev1.Pod) string { if m.Client == nil { return "" } - groupName := placement.PodGroupName(pod) - if groupName == "" { + g := groupName(pod) + if g == "" { return "" } - pg, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Get( - ctx, groupName, metav1.GetOptions{}) - if err != nil { - return "" + // Retry briefly — the leader pod may have just created the PodGroup and + // is recording itself; the worker pod admission may fire concurrently. + for i := 0; i < 3; i++ { + pg, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Get( + ctx, g, metav1.GetOptions{}) + if err != nil { + return "" + } + if pg.Annotations != nil && pg.Annotations[QuantumLeaderAnnotation] != "" { + return pg.Annotations[QuantumLeaderAnnotation] + } + if i < 2 { + time.Sleep(100 * time.Millisecond) + } } - if pg.Annotations == nil { - return "" + return "" +} + +func (m *Mutator) ensureQuantumPodGroup(ctx context.Context, pod *corev1.Pod, g string) { + if m.Client == nil { + return + } + if _, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Get( + ctx, g, metav1.GetOptions{}); err == nil { + return + } + pg := &schedulingv1alpha2.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: g, + Namespace: pod.Namespace, + Labels: map[string]string{"app": "fluence", QuantumGroupLabel: g}, + }, + Spec: schedulingv1alpha2.PodGroupSpec{ + SchedulingPolicy: schedulingv1alpha2.PodGroupSchedulingPolicy{ + Gang: &schedulingv1alpha2.GangSchedulingPolicy{MinCount: 1}, + }, + }, + } + if _, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Create( + ctx, pg, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create PodGroup %s/%s: %v", pod.Namespace, g, err) + } else { + log.Printf("[fluence-webhook] created PodGroup %s/%s (minCount=1)", pod.Namespace, g) } - return pg.Annotations[QuantumLeaderAnnotation] } -// ensureSidecarRBAC creates the fluence-sidecar ServiceAccount, Role, and -// RoleBinding in the pod's namespace if they do not already exist. Called once -// per namespace when the first leader pod is admitted. Errors are logged but -// do not block pod admission — the sidecar may fail to patch pods if RBAC is -// missing, but the pod itself should not be blocked. +func (m *Mutator) recordLeader(ctx context.Context, pod *corev1.Pod) { + if m.Client == nil { + return + } + g := groupName(pod) + if g == "" { + return + } + patch := fmt.Sprintf(`{"metadata":{"annotations":{%q:%q}}}`, QuantumLeaderAnnotation, pod.Name) + if _, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Patch( + ctx, g, types.MergePatchType, []byte(patch), metav1.PatchOptions{}); err != nil { + log.Printf("[fluence-webhook] could not record leader on PodGroup %s/%s: %v", pod.Namespace, g, err) + } +} + +// ── Per-namespace resource provisioning ─────────────────────────────────────── + func (m *Mutator) ensureSidecarRBAC(ctx context.Context, namespace string) { if m.Client == nil { return } - // ServiceAccount - _, err := m.Client.CoreV1().ServiceAccounts(namespace).Get( - ctx, SidecarServiceAccount, metav1.GetOptions{}) - if err != nil { + if _, err := m.Client.CoreV1().ServiceAccounts(namespace).Get( + ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { sa := &corev1.ServiceAccount{ - ObjectMeta: metav1.ObjectMeta{ - Name: SidecarServiceAccount, - Namespace: namespace, - Labels: map[string]string{"app": "fluence-sidecar"}, - }, + ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, + Labels: map[string]string{"app": "fluence-sidecar"}}, } - if _, err := m.Client.CoreV1().ServiceAccounts(namespace).Create( - ctx, sa, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create ServiceAccount %s/%s: %v", - namespace, SidecarServiceAccount, err) + if _, err := m.Client.CoreV1().ServiceAccounts(namespace).Create(ctx, sa, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create ServiceAccount %s/%s: %v", namespace, SidecarServiceAccount, err) } else { - log.Printf("[fluence-webhook] created ServiceAccount %s/%s", - namespace, SidecarServiceAccount) + log.Printf("[fluence-webhook] created ServiceAccount %s/%s", namespace, SidecarServiceAccount) } } - // Role - _, err = m.Client.RbacV1().Roles(namespace).Get( - ctx, SidecarServiceAccount, metav1.GetOptions{}) - if err != nil { + if _, err := m.Client.RbacV1().Roles(namespace).Get( + ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { role := &rbacv1.Role{ - ObjectMeta: metav1.ObjectMeta{ - Name: SidecarServiceAccount, - Namespace: namespace, - Labels: map[string]string{"app": "fluence-sidecar"}, - }, + ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, + Labels: map[string]string{"app": "fluence-sidecar"}}, Rules: []rbacv1.PolicyRule{ - { - APIGroups: []string{""}, - Resources: []string{"pods"}, - Verbs: []string{"get", "list", "patch", "update"}, - }, - { - APIGroups: []string{"scheduling.k8s.io"}, - Resources: []string{"podgroups"}, - Verbs: []string{"get", "list"}, - }, + {APIGroups: []string{""}, Resources: []string{"pods"}, Verbs: []string{"get", "list", "patch", "update"}}, + {APIGroups: []string{"scheduling.k8s.io"}, Resources: []string{"podgroups"}, Verbs: []string{"get", "list"}}, }, } - if _, err := m.Client.RbacV1().Roles(namespace).Create( - ctx, role, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create Role %s/%s: %v", - namespace, SidecarServiceAccount, err) + if _, err := m.Client.RbacV1().Roles(namespace).Create(ctx, role, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create Role %s/%s: %v", namespace, SidecarServiceAccount, err) } else { - log.Printf("[fluence-webhook] created Role %s/%s", - namespace, SidecarServiceAccount) + log.Printf("[fluence-webhook] created Role %s/%s", namespace, SidecarServiceAccount) } } - // RoleBinding - _, err = m.Client.RbacV1().RoleBindings(namespace).Get( - ctx, SidecarServiceAccount, metav1.GetOptions{}) - if err != nil { + if _, err := m.Client.RbacV1().RoleBindings(namespace).Get( + ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { rb := &rbacv1.RoleBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: SidecarServiceAccount, - Namespace: namespace, - Labels: map[string]string{"app": "fluence-sidecar"}, - }, - Subjects: []rbacv1.Subject{{ - Kind: "ServiceAccount", - Name: SidecarServiceAccount, - Namespace: namespace, - }}, - RoleRef: rbacv1.RoleRef{ - APIGroup: "rbac.authorization.k8s.io", - Kind: "Role", - Name: SidecarServiceAccount, - }, + ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, + Labels: map[string]string{"app": "fluence-sidecar"}}, + Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: SidecarServiceAccount, Namespace: namespace}}, + RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: SidecarServiceAccount}, } - if _, err := m.Client.RbacV1().RoleBindings(namespace).Create( - ctx, rb, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create RoleBinding %s/%s: %v", - namespace, SidecarServiceAccount, err) + if _, err := m.Client.RbacV1().RoleBindings(namespace).Create(ctx, rb, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create RoleBinding %s/%s: %v", namespace, SidecarServiceAccount, err) } else { - log.Printf("[fluence-webhook] created RoleBinding %s/%s", - namespace, SidecarServiceAccount) + log.Printf("[fluence-webhook] created RoleBinding %s/%s", namespace, SidecarServiceAccount) } } -} -// recordLeader writes the QuantumLeaderAnnotation onto the PodGroup object, -// recording this pod as the quantum leader for the group. -func (m *Mutator) recordLeader(ctx context.Context, pod *corev1.Pod) { - if m.Client == nil { - return - } - groupName := placement.PodGroupName(pod) - if groupName == "" { - return - } - patch := fmt.Sprintf( - `{"metadata":{"annotations":{%q:%q}}}`, - QuantumLeaderAnnotation, pod.Name, - ) - _, err := m.Client.SchedulingV1alpha2().PodGroups(pod.Namespace).Patch( - ctx, groupName, types.MergePatchType, []byte(patch), metav1.PatchOptions{}) - if err != nil { - log.Printf("[fluence-webhook] could not record leader on PodGroup %s/%s: %v", - pod.Namespace, groupName, err) + // Copy interceptor ConfigMap from kube-system into the pod namespace + if _, err := m.Client.CoreV1().ConfigMaps(namespace).Get( + ctx, InterceptorConfigMap, metav1.GetOptions{}); err != nil { + if src, srcErr := m.Client.CoreV1().ConfigMaps("kube-system").Get( + ctx, InterceptorConfigMap, metav1.GetOptions{}); srcErr != nil { + log.Printf("[fluence-webhook] could not read interceptor ConfigMap from kube-system: %v", srcErr) + } else { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: InterceptorConfigMap, Namespace: namespace, + Labels: map[string]string{"app": "fluence-sidecar"}}, + Data: src.Data, + } + if _, err := m.Client.CoreV1().ConfigMaps(namespace).Create(ctx, cm, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create interceptor ConfigMap %s/%s: %v", namespace, InterceptorConfigMap, err) + } else { + log.Printf("[fluence-webhook] created interceptor ConfigMap %s/%s", namespace, InterceptorConfigMap) + } + } } } -// sidecarImage returns the sidecar image to use, falling back to the default. -func (m *Mutator) sidecarImage() string { - if m.SidecarImage != "" { - return m.SidecarImage - } - return SidecarImage -} +// ── Patch operation builders ─────────────────────────────────────────────────── -// quantumWorkerGateOps returns patch ops that add the quantum scheduling gate -// to the pod, preventing it from entering the Fluxion scheduling cycle. func quantumWorkerGateOps(pod *corev1.Pod) []jsonPatchOp { - gate := corev1.PodSchedulingGate{Name: QuantumGateName} - if len(pod.Spec.SchedulingGates) == 0 { - return []jsonPatchOp{{ - Op: "add", - Path: "/spec/schedulingGates", - Value: []corev1.PodSchedulingGate{gate}, - }} - } - // Check gate not already present for _, g := range pod.Spec.SchedulingGates { if g.Name == QuantumGateName { return nil } } - return []jsonPatchOp{{ - Op: "add", - Path: "/spec/schedulingGates/-", - Value: gate, - }} + gate := corev1.PodSchedulingGate{Name: QuantumGateName} + if len(pod.Spec.SchedulingGates) == 0 { + return []jsonPatchOp{{Op: "add", Path: "/spec/schedulingGates", Value: []corev1.PodSchedulingGate{gate}}} + } + return []jsonPatchOp{{Op: "add", Path: "/spec/schedulingGates/-", Value: gate}} } -// InterceptorConfigMap is the name of the ConfigMap holding the SDK interceptor. -const InterceptorConfigMap = "fluence-braket-interceptor" - -// InterceptorVolumeName is the volume name for the SDK interceptor mount. -const InterceptorVolumeName = "fluence-braket-interceptor" - -// InterceptorMountPath is where the interceptor script is mounted. -const InterceptorMountPath = "/etc/fluence/fluence_braket_intercept.py" - -// SidecarServiceAccount is the ServiceAccount the sidecar runs as. -const SidecarServiceAccount = "fluence-sidecar" - -// sidecarOps returns patch ops that: -// 1. Inject the fluence sidecar container into the leader pod -// 2. Add the SDK interceptor ConfigMap as a volume -// 3. Mount the interceptor into every user container that requests QPU -// 4. Set the pod's ServiceAccount to fluence-sidecar func (m *Mutator) sidecarOps(pod *corev1.Pod) []jsonPatchOp { + var ops []jsonPatchOp + sidecar := corev1.Container{ Name: "fluence-sidecar", Image: m.sidecarImage(), @@ -370,8 +326,6 @@ func (m *Mutator) sidecarOps(pod *corev1.Pod) []jsonPatchOp { fieldEnv("FLUENCE_POD_UID", "metadata.uid"), fieldEnv("FLUENCE_POD_NAME", "metadata.name"), fieldEnv("FLUENCE_NAMESPACE", "metadata.namespace"), - // FLUXION_ARN is already injected by the existing env contract - // via the downward API from the backend annotation. }, Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ @@ -380,164 +334,83 @@ func (m *Mutator) sidecarOps(pod *corev1.Pod) []jsonPatchOp { }, }, } - - var ops []jsonPatchOp - - // 1. Inject sidecar container if len(pod.Spec.Containers) == 0 { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: "/spec/containers", - Value: []corev1.Container{sidecar}, - }) + ops = append(ops, jsonPatchOp{Op: "add", Path: "/spec/containers", Value: []corev1.Container{sidecar}}) } else { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: "/spec/containers/-", - Value: sidecar, - }) + ops = append(ops, jsonPatchOp{Op: "add", Path: "/spec/containers/-", Value: sidecar}) } - // 2. Add interceptor ConfigMap volume - interceptorVolume := corev1.Volume{ + vol := corev1.Volume{ Name: InterceptorVolumeName, VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: InterceptorConfigMap, - }, + LocalObjectReference: corev1.LocalObjectReference{Name: InterceptorConfigMap}, }, }, } if len(pod.Spec.Volumes) == 0 { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: "/spec/volumes", - Value: []corev1.Volume{interceptorVolume}, - }) + ops = append(ops, jsonPatchOp{Op: "add", Path: "/spec/volumes", Value: []corev1.Volume{vol}}) } else { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: "/spec/volumes/-", - Value: interceptorVolume, - }) - } - - // 3. Mount interceptor and inject PYTHONSTARTUP into every container - // requesting a QPU resource. PYTHONSTARTUP works for any Python version, - // unlike a site-packages path which is version-specific. - interceptorMount := corev1.VolumeMount{ - Name: InterceptorVolumeName, - MountPath: InterceptorMountPath, - SubPath: "fluence_braket_intercept.py", - ReadOnly: true, - } - pythonStartup := corev1.EnvVar{ - Name: "PYTHONSTARTUP", - Value: InterceptorMountPath, + ops = append(ops, jsonPatchOp{Op: "add", Path: "/spec/volumes/-", Value: vol}) } + + mount := corev1.VolumeMount{Name: InterceptorVolumeName, MountPath: InterceptorMountPath, + SubPath: "fluence_braket_intercept.py", ReadOnly: true} + startup := corev1.EnvVar{Name: "PYTHONSTARTUP", Value: InterceptorMountPath} for i, c := range pod.Spec.Containers { if !requestsFluxionResource(c) { continue } - // volume mount if len(c.VolumeMounts) == 0 { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: fmt.Sprintf("/spec/containers/%d/volumeMounts", i), - Value: []corev1.VolumeMount{interceptorMount}, - }) - pod.Spec.Containers[i].VolumeMounts = []corev1.VolumeMount{interceptorMount} + ops = append(ops, jsonPatchOp{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts", i), Value: []corev1.VolumeMount{mount}}) } else { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: fmt.Sprintf("/spec/containers/%d/volumeMounts/-", i), - Value: interceptorMount, - }) - pod.Spec.Containers[i].VolumeMounts = append(pod.Spec.Containers[i].VolumeMounts, interceptorMount) - } - // PYTHONSTARTUP env var - if hasEnv(c, "PYTHONSTARTUP") { - continue + ops = append(ops, jsonPatchOp{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts/-", i), Value: mount}) } - if len(c.Env) == 0 { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: fmt.Sprintf("/spec/containers/%d/env", i), - Value: []corev1.EnvVar{pythonStartup}, - }) - pod.Spec.Containers[i].Env = []corev1.EnvVar{pythonStartup} - } else { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: fmt.Sprintf("/spec/containers/%d/env/-", i), - Value: pythonStartup, - }) - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, pythonStartup) + if !hasEnv(c, "PYTHONSTARTUP") { + if len(c.Env) == 0 { + ops = append(ops, jsonPatchOp{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{startup}}) + } else { + ops = append(ops, jsonPatchOp{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: startup}) + } } } - // 4. Set ServiceAccount so the sidecar can patch pods. - // Use "add" not "replace" — the field may not be set yet at admission time. if pod.Spec.ServiceAccountName == "" || pod.Spec.ServiceAccountName == "default" { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: "/spec/serviceAccountName", - Value: SidecarServiceAccount, - }) + ops = append(ops, jsonPatchOp{Op: "add", Path: "/spec/serviceAccountName", Value: SidecarServiceAccount}) } return ops } -// podUIDOps returns patch ops that inject FLUENCE_POD_UID into every container -// that requests a fluxion resource. The sidecar reads this to tag Braket tasks. func podUIDOps(pod *corev1.Pod) []jsonPatchOp { - uidEnv := fieldEnv("FLUENCE_POD_UID", "metadata.uid") + uid := fieldEnv("FLUENCE_POD_UID", "metadata.uid") var ops []jsonPatchOp for i, c := range pod.Spec.Containers { - if !requestsFluxionResource(c) { - continue - } - if hasEnv(c, "FLUENCE_POD_UID") { + if !requestsFluxionResource(c) || hasEnv(c, "FLUENCE_POD_UID") { continue } if len(c.Env) == 0 { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: fmt.Sprintf("/spec/containers/%d/env", i), - Value: []corev1.EnvVar{uidEnv}, - }) - pod.Spec.Containers[i].Env = []corev1.EnvVar{uidEnv} - continue + ops = append(ops, jsonPatchOp{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{uid}}) + pod.Spec.Containers[i].Env = []corev1.EnvVar{uid} + } else { + ops = append(ops, jsonPatchOp{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: uid}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, uid) } - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: fmt.Sprintf("/spec/containers/%d/env/-", i), - Value: uidEnv, - }) - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, uidEnv) } return ops } -// Mutate returns the JSON Patch operations for a pod, or nil if nothing applies. -// -// For each container that requests a fluxion.flux-framework.org/* resource: -// - inject the FLUXION_* env contract (existing behaviour) -// -// Additionally, for QPU-requesting pods in a PodGroup of size > 1: -// - if no leader has been recorded: this pod is the leader — inject sidecar, -// inject FLUENCE_POD_UID, record leader on PodGroup -// - if a leader already exists: this pod is a worker — add scheduling gate +// ── Mutate ───────────────────────────────────────────────────────────────────── + func (m *Mutator) Mutate(ctx context.Context, pod *corev1.Pod) []jsonPatchOp { if pod.Spec.SchedulerName != SchedulerName { return nil } - contract := m.injectedEnv() + var ops []jsonPatchOp - // --- existing env injection --- + // Rule 1: inject FLUXION_* env contract + contract := m.injectedEnv() for i, c := range pod.Spec.Containers { if !requestsFluxionResource(c) { continue @@ -547,92 +420,39 @@ func (m *Mutator) Mutate(ctx context.Context, pod *corev1.Pod) []jsonPatchOp { continue } if len(c.Env) == 0 { - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: fmt.Sprintf("/spec/containers/%d/env", i), - Value: []corev1.EnvVar{e}, - }) + ops = append(ops, jsonPatchOp{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{e}}) pod.Spec.Containers[i].Env = []corev1.EnvVar{e} - continue + } else { + ops = append(ops, jsonPatchOp{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: e}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, e) } - ops = append(ops, jsonPatchOp{ - Op: "add", - Path: fmt.Sprintf("/spec/containers/%d/env/-", i), - Value: e, - }) - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, e) } } - // --- quantum leader/worker split --- - // Only applies to pods in a PodGroup of size > 1 that request a QPU resource. - if !podRequestsQPU(pod) { - return ops - } - groupSize := m.podGroupSize(ctx, pod) - if groupSize <= 1 { - // Single pod or no PodGroup — independent allocation, no gating needed. + // Rule 2: quantum leader/worker split + g := groupName(pod) + if g == "" { return ops } leader := m.podGroupLeader(ctx, pod) if leader == "" { - // No leader recorded yet — this pod becomes the leader. - log.Printf("[fluence-webhook] pod %s/%s is quantum leader for group (size=%d)", - pod.Namespace, pod.Name, groupSize) + log.Printf("[fluence-webhook] pod %s/%s is quantum leader for group %s", pod.Namespace, pod.Name, g) + m.ensureQuantumPodGroup(ctx, pod, g) m.ensureSidecarRBAC(ctx, pod.Namespace) m.recordLeader(ctx, pod) ops = append(ops, m.sidecarOps(pod)...) ops = append(ops, podUIDOps(pod)...) } else { - // Leader already exists — this pod is a worker, add the gate. - log.Printf("[fluence-webhook] pod %s/%s is quantum worker (leader=%s)", - pod.Namespace, pod.Name, leader) + log.Printf("[fluence-webhook] pod %s/%s is quantum worker (leader=%s)", pod.Namespace, pod.Name, leader) ops = append(ops, quantumWorkerGateOps(pod)...) } return ops } -// podRequestsQPU returns true if any container in the pod requests a QPU -// resource (fluxion.flux-framework.org/qpu). -func podRequestsQPU(pod *corev1.Pod) bool { - for _, c := range pod.Spec.Containers { - for name := range c.Resources.Requests { - if string(name) == placement.FluxionResourcePrefix+"qpu" { - return true - } - } - } - return false -} - -func requestsFluxionResource(c corev1.Container) bool { - for name := range c.Resources.Requests { - if strings.HasPrefix(string(name), placement.FluxionResourcePrefix) { - return true - } - } - return false -} - -func hasEnv(c corev1.Container, name string) bool { - for _, e := range c.Env { - if e.Name == name { - return true - } - } - return false -} - -// resourceQuantity is a helper to build a resource.Quantity inline. -func resourceQuantity(s string) *resource.Quantity { - q := resource.MustParse(s) - return &q -} +// ── HTTP handler ─────────────────────────────────────────────────────────────── -// Handler is the /mutate endpoint. It always admits the pod (failure to mutate -// must not block creation); it only adds a patch when Mutate returns one. func (m *Mutator) Handler(w http.ResponseWriter, r *http.Request) { body, err := io.ReadAll(r.Body) if err != nil { @@ -644,7 +464,6 @@ func (m *Mutator) Handler(w http.ResponseWriter, r *http.Request) { http.Error(w, "bad admission review", http.StatusBadRequest) return } - resp := &admissionv1.AdmissionResponse{UID: review.Request.UID, Allowed: true} var pod corev1.Pod if err := json.Unmarshal(review.Request.Object.Raw, &pod); err == nil { @@ -653,31 +472,26 @@ func (m *Mutator) Handler(w http.ResponseWriter, r *http.Request) { pt := admissionv1.PatchTypeJSONPatch resp.Patch = patch resp.PatchType = &pt - log.Printf("[fluence-webhook] injected %d op(s) into pod %s/%s", - len(ops), pod.Namespace, pod.Name) + log.Printf("[fluence-webhook] injected %d op(s) into pod %s/%s", len(ops), pod.Namespace, pod.Name) } } } - out := admissionv1.AdmissionReview{TypeMeta: review.TypeMeta, Response: resp} w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(out) } -// GenerateCerts returns a self-signed CA (PEM) and a serving cert+key (PEM) valid -// for the given DNS names. The CA PEM is what the apiserver must trust (caBundle). +// ── TLS ──────────────────────────────────────────────────────────────────────── + func GenerateCerts(dnsNames []string) (caPEM, certPEM, keyPEM []byte, err error) { caKey, err := rsa.GenerateKey(rand.Reader, 2048) if err != nil { return nil, nil, nil, err } caTmpl := &x509.Certificate{ - SerialNumber: big.NewInt(1), - Subject: pkix.Name{CommonName: "fluence-webhook-ca"}, - NotBefore: time.Now().Add(-time.Hour), - NotAfter: time.Now().AddDate(10, 0, 0), - IsCA: true, - KeyUsage: x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature, + SerialNumber: big.NewInt(1), Subject: pkix.Name{CommonName: "fluence-webhook-ca"}, + NotBefore: time.Now().Add(-time.Hour), NotAfter: time.Now().AddDate(10, 0, 0), + IsCA: true, KeyUsage: x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature, BasicConstraintsValid: true, } caDER, err := x509.CreateCertificate(rand.Reader, caTmpl, caTmpl, &caKey.PublicKey, caKey) @@ -688,38 +502,29 @@ func GenerateCerts(dnsNames []string) (caPEM, certPEM, keyPEM []byte, err error) if err != nil { return nil, nil, nil, err } - leafKey, err := rsa.GenerateKey(rand.Reader, 2048) if err != nil { return nil, nil, nil, err } leafTmpl := &x509.Certificate{ - SerialNumber: big.NewInt(2), - Subject: pkix.Name{CommonName: dnsNames[0]}, - NotBefore: time.Now().Add(-time.Hour), - NotAfter: time.Now().AddDate(10, 0, 0), - KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - DNSNames: dnsNames, + SerialNumber: big.NewInt(2), Subject: pkix.Name{CommonName: dnsNames[0]}, + NotBefore: time.Now().Add(-time.Hour), NotAfter: time.Now().AddDate(10, 0, 0), + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, DNSNames: dnsNames, } leafDER, err := x509.CreateCertificate(rand.Reader, leafTmpl, caCert, &leafKey.PublicKey, caKey) if err != nil { return nil, nil, nil, err } - caPEM = pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: caDER}) certPEM = pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: leafDER}) keyPEM = pem.EncodeToMemory(&pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(leafKey)}) return caPEM, certPEM, keyPEM, nil } -// EnsureCABundle patches the named MutatingWebhookConfiguration so its first -// webhook trusts caPEM. func EnsureCABundle(ctx context.Context, client kubernetes.Interface, configName string, caPEM []byte) error { - patch := fmt.Sprintf( - `[{"op":"replace","path":"/webhooks/0/clientConfig/caBundle","value":%q}]`, - base64.StdEncoding.EncodeToString(caPEM), - ) + patch := fmt.Sprintf(`[{"op":"replace","path":"/webhooks/0/clientConfig/caBundle","value":%q}]`, + base64.StdEncoding.EncodeToString(caPEM)) _, err := client.AdmissionregistrationV1().MutatingWebhookConfigurations().Patch( ctx, configName, types.JSONPatchType, []byte(patch), metav1.PatchOptions{}) return err diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 4c0c612..05496f0 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -9,7 +9,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" ) -func qpuPod(scheduler string, presetEnv string) *corev1.Pod { +func qpuPod(scheduler, presetEnv string) *corev1.Pod { c := corev1.Container{ Name: "app", Resources: corev1.ResourceRequirements{ @@ -38,7 +38,6 @@ func cpuPod(scheduler string) *corev1.Pod { }} } -// envNames returns the env var names referenced by a list of add-ops. func opEnvNames(ops []jsonPatchOp) []string { var names []string for _, op := range ops { @@ -99,13 +98,10 @@ func hasSidecarOp(ops []jsonPatchOp) bool { return false } -// With a config-derived contract (region, qubits), a fluxion pod gets -// FLUXION_BACKEND plus one FLUXION_ per attribute key. func TestMutateInjectsContract(t *testing.T) { m := &Mutator{AttributeKeys: []string{"region", "qubits"}} ops := m.Mutate(context.Background(), qpuPod("fluence", "")) names := opEnvNames(ops) - for _, want := range []string{"FLUXION_BACKEND", "FLUXION_REGION", "FLUXION_QUBITS"} { if !contains(names, want) { t.Errorf("missing injected env %q; got %v", want, names) @@ -113,7 +109,6 @@ func TestMutateInjectsContract(t *testing.T) { } } -// With no configured attributes, only FLUXION_BACKEND is injected. func TestMutateBackendOnly(t *testing.T) { m := &Mutator{} names := opEnvNames(m.Mutate(context.Background(), qpuPod("fluence", ""))) @@ -122,7 +117,6 @@ func TestMutateBackendOnly(t *testing.T) { } } -// Non-fluence pods are never mutated. func TestMutateSkipsOtherScheduler(t *testing.T) { m := &Mutator{AttributeKeys: []string{"region"}} if ops := m.Mutate(context.Background(), qpuPod("default-scheduler", "")); ops != nil { @@ -130,7 +124,6 @@ func TestMutateSkipsOtherScheduler(t *testing.T) { } } -// An env var the container already defines is not re-injected. func TestMutateRespectsExistingEnv(t *testing.T) { m := &Mutator{AttributeKeys: []string{"region"}} names := opEnvNames(m.Mutate(context.Background(), qpuPod("fluence", "FLUXION_BACKEND"))) @@ -142,7 +135,6 @@ func TestMutateRespectsExistingEnv(t *testing.T) { } } -// Classical pods (no fluxion resource request) are not mutated. func TestMutateSkipsNonFluxion(t *testing.T) { m := &Mutator{AttributeKeys: []string{"region"}} if ops := m.Mutate(context.Background(), cpuPod("fluence")); ops != nil { @@ -150,7 +142,6 @@ func TestMutateSkipsNonFluxion(t *testing.T) { } } -// EnvVarNames reports the full contract for startup logging. func TestEnvVarNames(t *testing.T) { m := &Mutator{AttributeKeys: []string{"region", "connectivity"}} names := m.EnvVarNames() @@ -159,9 +150,9 @@ func TestEnvVarNames(t *testing.T) { } } -// A QPU pod with no PodGroup (group of 1) gets no gate and no sidecar. +// A QPU pod with no group label gets no gate and no sidecar. func TestMutateQPUSinglePodNoSidecar(t *testing.T) { - m := &Mutator{} // no Client — group size will be 1 + m := &Mutator{} ops := m.Mutate(context.Background(), qpuPod("fluence", "")) if hasGateOp(ops) { t.Error("single QPU pod should not get a scheduling gate") @@ -180,7 +171,7 @@ func TestQuantumWorkerGateOpsEmpty(t *testing.T) { } } -// quantumWorkerGateOps is idempotent — does not add gate if already present. +// quantumWorkerGateOps is idempotent. func TestQuantumWorkerGateOpsIdempotent(t *testing.T) { pod := qpuPod("fluence", "") pod.Spec.SchedulingGates = []corev1.PodSchedulingGate{{Name: QuantumGateName}} @@ -189,3 +180,15 @@ func TestQuantumWorkerGateOpsIdempotent(t *testing.T) { t.Errorf("expected no ops when gate already present, got %v", ops) } } + +// groupName returns the quantum group label value. +func TestGroupName(t *testing.T) { + pod := qpuPod("fluence", "") + if groupName(pod) != "" { + t.Error("pod without group label should return empty") + } + pod.Labels = map[string]string{QuantumGroupLabel: "my-workflow"} + if groupName(pod) != "my-workflow" { + t.Errorf("expected my-workflow, got %q", groupName(pod)) + } +} diff --git a/sidecars/braket/design.md b/sidecars/braket/design.md index 1286298..4a67548 100644 --- a/sidecars/braket/design.md +++ b/sidecars/braket/design.md @@ -5,9 +5,15 @@ Hybrid quantum-classical workflows submit work to two independent queues: the Kubernetes scheduler (classical compute) and a QPU vendor API (quantum execution). Classical pods waste node resources while waiting for QPU queue -results. We describe a design for Fluence that coordinates classical resource -allocation with quantum execution order across heterogeneous QPU backends, -without requiring any user application changes. +results. Fluence's coordination system thus gates classical worker pods until +the QPU task is one position from executing, then releases them with high +priority so they preempt lower-priority work and start immediately as the +QPU result arrives. Yes, it could be the case the one task in the queue before +it takes a long time, but I think this is an improved approach than having worker +pods running (and waiting) for a much longer queue. This only is important +given that you have gangs, or leader worker designs where some leader is launching +the quantum work and otherwise the workers would be waiting and doing nothing +(and wasting resources). ## 1. The Two-Queue Problem @@ -18,102 +24,131 @@ consumes node resources — CPU, memory, potentially GPU — for the entire duration of the QPU queue wait, which may be minutes to hours on real hardware. -This waste scales with concurrency. With N concurrent hybrid jobs and a -QPU queue depth of D, each classical pod may idle for D × t_avg seconds -where t_avg is the average QPU task execution time. On a shared cluster -with expensive GPU nodes this is a significant and unfair resource waste. +This waste scales with concurrency. With N concurrent hybrid jobs, each pod +idles for the full QPU queue wait. On real QPU backends (IQM Garnet, IQM +Emerald) we measure 15–30% classical idle fraction at N=10, rising to over +70% for individual pods at N=20. Wall time scales linearly with concurrency +on real QPUs — submitting 20 jobs takes 5–8× longer than 1 job due to +self-imposed queue depth. -The problem has two components: - -**Component 1 — Resource waste.** Classical pods consume node resources -while doing nothing useful. - -**Component 2 — Ordering mismatch.** Classical resource allocation follows -job submission order, not QPU execution order. A job submitted to a busy -backend wastes resources longer than a job submitted to a quiet one. - -## 2. Why Existing Mechanisms Don't Help +## 2. Why Existing Mechanisms Are Insufficient ### 2.1 Fluxion reservations Fluxion's backfill reservation policies (EASY, Conservative, Hybrid) compute -a future `time_at` from the internal resource graph timeline — when currently -running classical jobs will finish. They have no mechanism to accept an -externally-supplied time derived from a vendor queue. Without a reliable -`time_at`, a reservation degenerates to a pending job. Furthermore, all -reservations are cancelled and recomputed from scratch at the start of every -scheduling loop, so they provide no persistent resource hold. +a future `time_at` from the internal resource graph — when currently running +classical jobs will finish. They cannot accept an externally-supplied time +derived from a vendor queue. Without a reliable `time_at`, a reservation +degenerates to a pending job. All reservations are cancelled and recomputed +from scratch at the start of every scheduling loop. + +QPU queue time is unknowable in advance. It depends on other +users' submissions, hardware calibration windows, and network latency. +Average task time per QPU cannot be estimated reliably. Therefore Fluxion +reservations cannot help with the two-queue problem. I learned that we are +working on "advanced reservations" that are more like a hold, but it is +not clear if that can be merged soon. -### 2.2 Kubernetes scheduling gates alone +### 2.2 Scheduling gates alone A scheduling gate holds a pod out of the scheduling queue entirely, consuming no node resources. But ungating N pods simultaneously on a busy cluster -creates a race — resources may not be available, and the graph allocation -happens after ungating, not before. There is no atomicity guarantee between -ungating and placement. +creates a race — resources may not be available when ungating occurs, and +the Fluxion graph allocation happens after ungating, not before. Without +priority, ungated pods compete equally with all pending work. ### 2.3 Preemption alone -Submitting classical pods with a high `PriorityClass` causes Kubernetes to -evict lower-priority pods to make room. But without a gate, preemption -happens immediately at submit time — the classical pods displace other work -during the entire QPU queue wait, which is worse than the original problem. +Submitting classical pods with a high PriorityClass causes Kubernetes to +evict lower-priority pods immediately at submit time — during the entire QPU +queue wait — which is worse than the original problem. ## 3. Design -The design combines three mechanisms: a **transparent SDK interceptor** -injected by the Fluence webhook, a **sidecar controller** that observes -QPU queue state, and **gated high-priority classical pods** that are -allocated and dispatched only when the QPU is one position from executing. +The design combines four mechanisms: -### 3.1 Components +1. **SDK interceptor** — tags every QPU task with the pod UID +2. **Fluence webhook** — gates worker pods, injects sidecar into leader +3. **Sidecar controller** — discovers the QPU task, polls queue position, + ungates workers when position==1 +4. **High-priority ungating** — workers preempt lower-priority work at the + last responsible moment -``` -┌─────────────────────────────────────────────────────────┐ -│ Quantum gateway pod │ -│ │ -│ ┌─────────────────────┐ ┌──────────────────────────┐ │ -│ │ user application │ │ fluence-sidecar │ │ -│ │ │ │ │ │ -│ │ device.run(...) │ │ 1. find task by tag │ │ -│ │ ↓ (intercepted) │ │ 2. poll queue_position │ │ -│ │ tags injected │ │ 3. at position==1: │ │ -│ │ automatically │ │ patch ARN → pods │ │ -│ │ │ │ remove gates │ │ -│ └─────────────────────┘ └──────────────────────────┘ │ -└─────────────────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────────────────┐ -│ Classical pods (SchedulingGated until position==1) │ -│ │ -│ annotations: │ -│ braket.quantum/task-arn: │ -│ schedulingGates: │ -│ - name: quantum.braket/ready ← removed by sidecar │ -│ priorityClassName: quantum-classical-high │ -└─────────────────────────────────────────────────────────┘ -``` +### 3.1 User interface -### 3.2 Transparent SDK interceptor +The user labels all pods in a workflow group with: -The Fluence mutating webhook injects two things into every pod that requests -a QPU resource (`fluxion.flux-framework.org/qpu`): - -**Environment variable:** -``` -FLUENCE_POD_UID= +```yaml +metadata: + labels: + fluence.flux-framework.org/group: my-workflow +spec: + schedulerName: fluence ``` -**Python sitecustomize hook** (injected as a ConfigMap mounted at the -Python site-packages path): +I initially started with having the user create a PodGroup object, and I found +that annoying. I do not want to require a PodGroup object when an annotation is easier, +and then I have fine-grained control of what the groups looks like. Fluence can handle +everything else automatically. + +The namespace distinction: +- `fluence.flux-framework.org/*` — Fluence scheduler-plugin concerns + (group label, leader annotation, gate name) +- `fluxion.flux-framework.org/*` — Fluxion resource-graph concerns + (extended resource types, backend attribute env vars) + +### 3.2 Webhook behavior + +When the Fluence mutating webhook sees a pod with `schedulerName: fluence` +and `fluence.flux-framework.org/group=`: + +**First pod admitted (leader):** +1. Creates a PodGroup with `minCount: 1` — Fluence owns this PodGroup, + the user never creates it. `minCount: 1` means the leader schedules + immediately without waiting for gated workers. The assumption here is + that this leader is going to submit the quantum work. +2. Records the leader pod name on the PodGroup via `QuantumLeaderAnnotation`. +3. Creates per-namespace RBAC: `fluence-sidecar` ServiceAccount, Role + (patch pods, list PodGroups), RoleBinding. +4. Copies `fluence-braket-interceptor` ConfigMap from `kube-system` into + the pod's namespace (ConfigMap volumes require same-namespace source). +5. Injects `fluence-sidecar` container into the leader pod. +6. Injects `FLUENCE_POD_UID` env var (downward API from `metadata.uid`). +7. Mounts the interceptor ConfigMap and sets `PYTHONSTARTUP` env var so + the interceptor runs automatically before user code. +8. Sets `serviceAccountName: fluence-sidecar`. + +**Subsequent pods (workers):** +1. Reads the PodGroup leader annotation — retries up to 3× with 100ms + delay to handle concurrent admission race. +2. Adds `quantum.braket/ready` scheduling gate — pod enters + `SchedulingGated` state, invisible to Fluxion, consuming no resources. + +### 3.3 Braket SDK interceptor + +I created a consistent sidecar that is going to monitor the queue, and be able +to ungate the worker pods when the task submit by our pod is at position 1 +(implicating it will run soon, and we assume the user wants the classical +gang to run at the same time or slightly sooner). Note that it is up to the +user application to orchestrate the leader and workers, and coordination +of the quantum results. A few examples: + +- The worker pods are guaranteed to get an ARN for where the Braket results are in S3, + and this is ensured by the sidecar. So a reasonable approach is for workers query + that bucket looking for a finished marker. This would not require coordination from + the leader. +- Given communication from the leader to workers, the leader can tell them exactly + when the work is finished, and coordinate what they do with results. + +I ran into the issue of needing to GET the task id from the primary pod from +the sidecar. What I decided on is a very simply injection - the call of the +script to submit the job can take arbitrary tags, and so I wrap that with a configmap +that is in the pythonpath, and ensure the task is tagged with a pod specific UID +that the sidecar also knows. More specifically, `fluence_braket_intercept.py` script is +mounted via `PYTHONSTARTUP` into every container in the leader pod. It monkey-patches +`AwsDevice.run()` to automatically tag every quantum task submission with `FLUENCE_POD_UID`: ```python -# fluence_braket_intercept.py — injected by Fluence webhook -import os -from braket.aws import AwsDevice - -_original_run = AwsDevice.run - def _patched_run(self, task_specification, *args, **kwargs): pod_uid = os.environ.get("FLUENCE_POD_UID", "") if pod_uid: @@ -121,201 +156,140 @@ def _patched_run(self, task_specification, *args, **kwargs): tags["fluence-pod-uid"] = pod_uid kwargs["tags"] = tags return _original_run(self, task_specification, *args, **kwargs) - -AwsDevice.run = _patched_run ``` -This is completely transparent to the user application. Every `device.run()` -call — regardless of which QPU backend, regardless of circuit type — is -automatically tagged with the pod UID. No user code changes are required. - -### 3.3 Sidecar controller +This is completely transparent to the user application. No code changes +are required. -The `fluence-sidecar` container is injected automatically by the Fluence -webhook into any pod requesting a QPU resource. It runs alongside the user -application in the same pod, sharing the pod's AWS credentials via env vars. +### 3.4 Sidecar controller -**Algorithm:** +The `fluence-sidecar` container runs alongside the user application in the +leader pod, sharing its AWS credentials and network namespace. -``` +```console 1. READ FLUXION_ARN, FLUENCE_POD_UID from env -2. READ gated sibling pod names from FLUENCE_GATED_PODS annotation -3. WAIT for task tagged fluence-pod-uid= on device - poll search_quantum_tasks every 10s - timeout after FLUENCE_TASK_DISCOVERY_TIMEOUT (default: 300s) - on timeout: fall back to time-window heuristic +2. DISCOVER task by tag: + search_quantum_tasks(filters=[ + deviceArn == FLUXION_ARN, + tags:fluence-pod-uid == FLUENCE_POD_UID + ]) + Poll every 10s, timeout after 300s. + On timeout: fall back to time-window heuristic (tasks submitted + after pod start time on the same device). -4. POLL task.queue_position() every 30s - log position to stdout for experiment instrumentation +3. DISCOVER worker pods: + List pods in namespace with fluence.flux-framework.org/group label + matching this pod's group, having quantum.braket/ready gate present. -5. WHEN position == "1" OR state == RUNNING: - for each pod in FLUENCE_GATED_PODS: - kubectl annotate pod braket.quantum/task-arn= - kubectl patch pod remove schedulingGates +4. POLL task.queue_position() every 30s. + Log position for experiment instrumentation. -6. EXIT (sidecar is done — pod continues running user application) +5. WHEN position == "1" OR state == RUNNING: + For each worker pod: + kubectl annotate pod braket.quantum/task-arn= + kubectl patch pod --type=json \ + -p='[{"op":"add","path":"/spec/priorityClassName", + "value":"fluence-quantum-classical"}, + {"op":"remove","path":"/spec/schedulingGates/0"}]' + +6. EXIT ``` -**Fallback heuristic (step 3 timeout):** +The priority class and gate removal are applied atomically in one patch. +This ensures workers enter the scheduling queue with high priority +immediately, without a window where they are ungated but low-priority. -If no tagged task is found within the discovery timeout — e.g. because the -user application uses a non-standard SDK path — the sidecar searches for -tasks submitted to `FLUXION_ARN` with `createdAt >= pod_start_time` and -picks the most recently created one. This is less reliable but handles -edge cases gracefully. +### 3.5 Priority and preemption -### 3.4 Gated classical pods +The `fluence-quantum-classical` PriorityClass (value: 1,000,000) is applied +by the sidecar at ungate time, not by the webhook at pod creation. Setting +it at creation time causes an admission controller conflict (priority integer +already defaulted to 0). -Classical pods that depend on a quantum result are submitted with: +When workers are ungated with high priority, Kubernetes preemption evicts +lower-priority pods to make room. Fluence's pod deletion informer catches +these evictions, calls `Cancel(jobid)` in Fluxion, and frees the graph +vertices so Fluxion can allocate them to the incoming high-priority workers. -```yaml -spec: - schedulingGates: - - name: quantum.braket/ready - priorityClassName: quantum-classical-high - # No graph allocation yet — MatchAllocateSpec deferred until ungating -``` +### 3.6 Classical allocation follows quantum execution order + +Because each workflow's gate is removed independently when its QPU task +reaches position==1, workflows whose QPU tasks execute earlier get classical +resources earlier — regardless of submission order. A workflow submitted to +a quiet backend gets its classical resources before one submitted earlier to +a busy backend. This aligns classical resource allocation with actual quantum +execution order across heterogeneous backends. -The high `PriorityClass` means nothing while the gate is present — the pod -is invisible to the scheduling queue. When the sidecar removes the gate at -position==1, the pod enters the queue with high priority and Kubernetes -preemption displaces lower-priority work to make room. +## 4. Properties -### 3.5 Fluence PostFilter for topology-aware preemption +| Property | Value | +|---|---| +| User code changes required | None | +| User manifest changes required | Add group label + schedulerName | +| Classical resources during QPU wait | Zero (SchedulingGated) | +| QPU queue time estimation needed | No — position==1 is observable | +| Works across heterogeneous backends | Yes — any backend in Fluxion graph | +| Vendor API cooperation needed | No — SDK interceptor handles tagging | -The default Kubernetes preemption controller evicts pods based purely on -`PriorityClass`, with no awareness of Fluxion's resource graph. It may -evict pods whose removal does not actually free the graph vertices needed -for the incoming classical pod. +## 5. Limitations -Fluence implements a custom `PostFilter` extension point that: +### 5.1 Preemption disrupts lower-priority work -1. Receives the high-priority classical pod that failed `MatchAllocateSpec` -2. Asks Fluxion which graph vertices are blocking the match -3. Maps those vertices to currently running pods via Fluence's allocation - tracking -4. Passes only those specific pods to the preemption logic -5. Returns the `nominatedNodeName` that Fluxion identified +At position==1, workers preempt running lower-priority pods. This work is +re-queued and eventually runs, but there is a disruption cost. A future +design using a `MatchReserveAt(time_at, spec)` Fluxion primitive — where +`time_at` is supplied by the QPU vendor via an ETA or task-start event — +would allow graceful node draining instead of preemption. No current QPU +vendor exposes such an API. -This ensures preemption targets topologically correct pods — pods whose -eviction will actually let Fluxion satisfy the match — rather than -arbitrarily choosing the lowest-priority pods on the cluster. +### 5.2 Non-Braket SDKs -## 4. Properties of the Design +The interceptor patches `AwsDevice.run()`. IBM Qiskit Runtime, IQM native +SDK, and other vendors require separate interceptors in `sidecars//`. +The pattern is identical; only the SDK entry point differs. We will make +sidecars for different vendor interfaces. -### 4.1 Zero user cooperation required +### 5.3 Single task per workflow -The SDK interceptor is injected transparently by the webhook. The user -application requires no changes. The sidecar is injected automatically. -The only user-visible artifact is the `FLUXION_ARN` env var, which the -user already needs to know which backend to target. +The sidecar tracks one QPU task ARN per leader pod. Parameter-shift gradient +estimation and other multi-circuit workflows require tracking a set of ARNs. +See the scatter design issue for the proposed extension. -### 4.2 Classical resources allocated at the last responsible moment +### 5.4 Namespace-scoped RBAC -Graph allocation (`MatchAllocateSpec`) happens only when the QPU task -reaches position==1 — seconds to minutes before the result arrives. During -the entire QPU queue wait, no classical node resources are consumed and no -graph capacity is held. +The webhook creates `fluence-sidecar` RBAC in each namespace on first use. +This is correct behavior — the sidecar only needs permissions in its own +namespace. A Helm chart or operator would manage this more cleanly. -### 4.3 Classical allocation follows quantum execution order +## 6. Future Work -Because each workflow's gate is removed independently when its QPU task -reaches position==1, workflows whose QPU tasks execute earlier get classical -resources earlier — regardless of submission order. A workflow submitted to -a quiet backend gets its classical resources before a workflow submitted -earlier to a busy one. This aligns classical scheduling with actual quantum -execution order across heterogeneous backends. +### 6.1 MatchReserveAt Fluxion primitive -### 4.4 No estimation of QPU queue time required +A new `MatchReserveAt(time_at, spec)` function in the Fluxion Go bindings +would allow an externally-supplied reservation time. The sidecar would feed +live QPU queue position into this estimate, enabling graceful node draining +rather than preemption. This requires the C++ reapi `match_allocate_multi` +function to be exposed through the Go bindings with a `starttime` parameter. -The design makes no attempt to predict when the QPU task will execute. -`position==1` is an observable state transition, not an estimate. The -design is robust to variable queue depths, hardware maintenance windows, -and concurrent submissions by other users. +### 6.2 Scatter design -### 4.5 Task ARN propagated to classical pods +For workflows with N independent QPU tasks each paired with one classical +pod, an index-based pairing mechanism (`fluence.flux-framework.org/index`) +would allow the sidecar to ungate specific worker pods when their specific +task reaches position==1. See the open scatter design issue. -When the sidecar removes the gate, it patches `braket.quantum/task-arn` -onto each classical pod as an annotation. Classical pods read this via -the downward API and can use it to retrieve results from S3, submit -follow-on circuits, perform error mitigation, or do anything else the -Braket SDK supports. The sidecar does not prescribe what classical pods -do with the result. +### 6.3 Vendor task-start events -## 5. Limitations +If QPU vendors exposed SNS/EventBridge notifications when a task transitions +from QUEUED to RUNNING, the sidecar could react to events rather than +polling. This would eliminate the 30s polling latency and enable more +precise ungating. + +### 6.4 PostFilter topology-aware preemption -### 5.1 Non-Braket SDKs - -The SDK interceptor currently patches `AwsDevice.run()`. Support for -IBM Qiskit Runtime (`backend.run()`), IQM, and other vendors requires -additional interceptors. The pattern is identical; only the entry point -differs. - -### 5.2 Preemption disrupts lower-priority work - -At position==1, classical pods may preempt running lower-priority work. -This work is re-queued and eventually runs, but there is a disruption cost. -A future design using Fluxion's `MatchReserveAt` primitive with a -vendor-supplied ETA would allow graceful draining instead of preemption. -This requires QPU vendors to expose task ETA or start-event webhooks, -which no current vendor provides. - -### 5.3 Multi-task workflows - -The sidecar currently tracks one task per pod. Workflows that submit -multiple QPU tasks (e.g. parameter-shift gradient estimation with 2P -circuits) require the sidecar to track a set of task ARNs and ungate -classical pods when all tasks reach position==1 or a subset completes. -This is a straightforward extension. - -### 5.4 Sidecar resource consumption - -The sidecar consumes minimal CPU and memory (polling every 30s), but -it does hold an open AWS API connection for the duration of the QPU -queue wait. On clusters with many concurrent hybrid workflows this -may become a concern. - -## 6. Required Vendor API Primitive - -The remaining limitation that cannot be solved without vendor cooperation -is task provenance — associating a Braket task with the Kubernetes pod -that submitted it without SDK interception. If Braket were to expose a -`clientToken` or `podIdentity` field that the SDK set automatically from -the execution environment (analogous to how IAM roles work for EC2 -instances), the interceptor would not be needed. - -More significantly, if QPU vendors exposed a task-start event (webhook, -SNS notification, or EventBridge rule) when a task transitions from -QUEUED to RUNNING, the sidecar could react to that event rather than -polling. This would enable graceful draining rather than preemption, and -would allow Fluxion's reservation system to be used with an externally- -supplied `time_at` rather than requiring the position==1 heuristic. - -## 7. Implementation Plan - -### Phase 1 — Sidecar container (this repo) -- `docker/fluence-sidecar/` — sidecar image -- SDK interceptor (`fluence_braket_intercept.py`) -- Task discovery (tagged search + heuristic fallback) -- Queue position polling -- Pod annotation patching and gate removal - -### Phase 2 — Fluence webhook changes -- Inject `FLUENCE_POD_UID` env var into QPU pods -- Inject sidecar container into QPU pods -- Inject SDK interceptor as a mounted ConfigMap -- Inject `FLUENCE_GATED_PODS` annotation listing sibling gated pods -- Create `quantum-classical-high` PriorityClass - -### Phase 3 — Fluence PostFilter -- Custom preemption targeting Fluxion-graph-aware pod selection -- Integration with existing allocation tracking in placement.go - -### Phase 4 — Experiment -- Demonstrate two-queue problem empirically (experiment 1, already running) -- Demonstrate gate + sidecar design reducing classical idle time -- Compare classical node-seconds consumed: ungated vs gated -- Show quantum execution order driving classical allocation order - across heterogeneous backends (SV1, IQM, Rigetti) -EOF +A custom Fluence `PostFilter` plugin would ask Fluxion which graph vertices +are blocking a high-priority worker pod, then target preemption at exactly +those pods — rather than the default Kubernetes preemption which picks +lowest-priority pods regardless of graph topology. This ensures preemption +always produces a valid Fluxion allocation. diff --git a/sidecars/lib/ungate.py b/sidecars/lib/ungate.py index ebc1b9f..a0107b5 100644 --- a/sidecars/lib/ungate.py +++ b/sidecars/lib/ungate.py @@ -61,11 +61,20 @@ def ungate_pods(gated_pods, task_arn, namespace): else: log(f" WARNING: no task ARN available to patch onto {pod_name}") - # 2. Remove scheduling gate - patch = json.dumps([{ - "op": "remove", - "path": "/spec/schedulingGates/0" - }]) + # 2. Set high priority class and remove scheduling gate atomically + # Priority is set here (not in webhook) to avoid admission controller + # conflict where priority:0 is already defaulted before our patch. + patch = json.dumps([ + { + "op": "add", + "path": "/spec/priorityClassName", + "value": "fluence-quantum-classical" + }, + { + "op": "remove", + "path": "/spec/schedulingGates/0" + } + ]) try: kubectl([ "patch", "pod", pod_name, @@ -73,9 +82,9 @@ def ungate_pods(gated_pods, task_arn, namespace): "--type=json", f"-p={patch}", ]) - log(f" Removed scheduling gate from {pod_name}") + log(f" Set priority and removed scheduling gate from {pod_name}") except RuntimeError as e: - log(f" WARNING: could not remove gate from {pod_name}: {e}") + log(f" WARNING: could not patch {pod_name}: {e}") def gated_pods_from_env(): diff --git a/test/e2e/04-sidecar-ungate.sh b/test/e2e/04-sidecar-ungate.sh index 68e489e..4a374a9 100644 --- a/test/e2e/04-sidecar-ungate.sh +++ b/test/e2e/04-sidecar-ungate.sh @@ -1,53 +1,77 @@ #!/usr/bin/env bash -# Sidecar gate/ungate plumbing test. +# Sidecar webhook test. # -# This test verifies the Kubernetes mechanics of the sidecar design: -# 1. A gated classical pod stays SchedulingGated until something removes the gate -# 2. A pod with kubectl access can patch an annotation and remove a gate -# 3. The classical pod reads the patched annotation via the downward API +# Verifies that when a PodGroup of size > 1 with QPU resources is submitted: +# 1. The webhook creates fluence-sidecar RBAC in the namespace automatically +# 2. The leader pod gets the sidecar container injected +# 3. The worker pod gets the quantum.braket/ready scheduling gate added +# 4. The worker pod gets fluence-quantum-classical priority class set # -# This does NOT test the braket sidecar itself (task discovery, SDK interceptor, +# Does NOT test the braket sidecar itself (task discovery, SDK interceptor, # queue position polling). Those require real AWS credentials and are covered # by sidecars/braket/test/integration.sh which is run locally. set -euo pipefail HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" -log "TEST 4: sidecar gate/ungate Kubernetes plumbing" +log "TEST 4: sidecar webhook — RBAC creation, gate injection, sidecar injection" -kubectl apply -f examples/test/e2e/sidecar-mock.yaml +kubectl apply -f examples/test/e2e/sidecar-mock-pods.yaml -# Classical pod must start SchedulingGated — verify it is NOT Running immediately -sleep 5 -phase="$(kubectl get pod classical-mock -o jsonpath='{.status.phase}' 2>/dev/null || true)" -[ "$phase" != "Running" ] || fail "classical-mock should not be Running before gate is removed (phase=$phase)" -log "classical-mock is correctly gated (phase=${phase:-SchedulingGated})" +# Give webhook time to process the leader pod admission +sleep 3 -# Gateway pod should reach Running -wait_pod_phase quantum-gateway-mock Running 60 \ - || fail "quantum-gateway-mock did not reach Running" +# Print webhook logs — always show these so we can see what happened +log "--- webhook logs ---" +kubectl logs -n kube-system deployment/fluence-webhook --tail=50 || true +log "--- end webhook logs ---" -# Mock sidecar should ungate classical-mock within 60s -log "waiting for mock sidecar to ungate classical-mock..." -for i in $(seq 1 60); do - phase="$(kubectl get pod classical-mock -o jsonpath='{.status.phase}' 2>/dev/null || true)" - { [ "$phase" = "Running" ] || [ "$phase" = "Succeeded" ]; } && break +# 1. Webhook should have created fluence-sidecar ServiceAccount +log "checking webhook created fluence-sidecar ServiceAccount..." +for i in $(seq 1 30); do + kubectl get serviceaccount fluence-sidecar -n default > /dev/null 2>&1 && break sleep 2 done -wait_pod_phase classical-mock Running 30 \ - || fail "classical-mock did not reach Running after gate removal" +kubectl get serviceaccount fluence-sidecar -n default \ + || fail "webhook did not create fluence-sidecar ServiceAccount" +log " fluence-sidecar ServiceAccount created" -# Task ARN annotation must have been patched -arn="$(kubectl get pod classical-mock \ - -o jsonpath='{.metadata.annotations.braket\.quantum/task-arn}' 2>/dev/null || true)" -[ -n "$arn" ] || fail "braket.quantum/task-arn annotation not set on classical-mock" -log "task ARN annotation present: $arn" +# 2. Webhook should have created fluence-sidecar Role +kubectl get role fluence-sidecar -n default \ + || fail "webhook did not create fluence-sidecar Role" +log " fluence-sidecar Role created" -# Classical pod must have read the annotation via downward API -out="$(kubectl logs classical-mock 2>/dev/null || true)" -echo "$out" | grep -q "TASK_ARN=" \ - || fail "BRAKET_TASK_ARN not visible in classical-mock logs (got: $out)" +# 3. Webhook should have created fluence-sidecar RoleBinding +kubectl get rolebinding fluence-sidecar -n default \ + || fail "webhook did not create fluence-sidecar RoleBinding" +log " fluence-sidecar RoleBinding created" -log "PASS: gate/ungate plumbing works — annotation patched and read via downward API" +# 4. Webhook should have copied interceptor ConfigMap into the namespace +kubectl get configmap fluence-braket-interceptor -n default \ + || fail "webhook did not copy fluence-braket-interceptor ConfigMap into namespace" +log " fluence-braket-interceptor ConfigMap copied into namespace" + +# 5. Leader pod should have sidecar container injected +log "checking sidecar injected into leader pod..." +wait_pod_phase sidecar-test-leader Running 120 \ + || { kubectl describe pod sidecar-test-leader; fail "sidecar-test-leader did not reach Running"; } +containers=$(kubectl get pod sidecar-test-leader \ + -o jsonpath='{.spec.containers[*].name}') +echo "$containers" | grep -q "fluence-sidecar" \ + || fail "fluence-sidecar container not injected into leader (containers: $containers)" +log " fluence-sidecar container injected into leader" + +# 6. Worker pod should have scheduling gate added by webhook +gate=$(kubectl get pod sidecar-test-worker \ + -o jsonpath='{.spec.schedulingGates[0].name}') +[ "$gate" = "quantum.braket/ready" ] \ + || fail "worker pod does not have quantum.braket/ready gate (got: $gate)" +log " quantum.braket/ready gate set on worker" + +log "PASS: webhook correctly created RBAC, injected sidecar, gated worker" +log "NOTE: fluence-quantum-classical priority is set by the sidecar at ungate time, not the webhook" log "NOTE: braket sidecar integration test (SDK intercept, tag discovery," log " queue polling) is in sidecars/braket/test/integration.sh" -kubectl delete -f examples/test/e2e/sidecar-mock.yaml --wait=false || true + +# Only clean up pods and PodGroup — RBAC is namespace infrastructure +# that persists for future quantum workflows in this namespace +kubectl delete -f examples/test/e2e/sidecar-mock-pods.yaml