diff --git a/clusters/_template/bootstrap-kit/13-bp-catalyst-platform.yaml b/clusters/_template/bootstrap-kit/13-bp-catalyst-platform.yaml index 1c0d53b0..385f605b 100644 --- a/clusters/_template/bootstrap-kit/13-bp-catalyst-platform.yaml +++ b/clusters/_template/bootstrap-kit/13-bp-catalyst-platform.yaml @@ -855,7 +855,19 @@ spec: # tenant_created_wire_test.go (commit 56e04ac8a) keeps guarding # the wire shape. Refs #2133 (NOT Closes — operator walk on # fresh prov required). - version: 1.4.233 + # 1.4.234 — TBD-V56 / #2132 (2026-05-21): bp-self-sovereign-cutover + # state-machine idempotent across catalyst-api Pod restarts. t40 + # (2026-05-21) reproduced the failure mode: mid-cutover Pod + # restart caused the 10-min `egress-block-test` Job to run TWICE + # because the engine never consulted the durable + # `cutover.openova.io/step=` label on existing Jobs from + # the prior process lifetime — it minted a fresh Job under a + # new in-memory runEpoch. Fix in cutover.go consults existing + # Jobs before minting; Complete=True flips + # step..result=success directly; no rerun. Unblocks Step 09 + # (gitea-token-mint), unblocks customer-journey Phase 2 (Sandbox). + # Refs #2132 (NOT Closes — operator walk on fresh prov required). + version: 1.4.234 sourceRef: kind: HelmRepository name: bp-catalyst-platform diff --git a/products/catalyst/bootstrap/api/internal/handler/cutover.go b/products/catalyst/bootstrap/api/internal/handler/cutover.go index 4eb7b84a..3ab745df 100644 --- a/products/catalyst/bootstrap/api/internal/handler/cutover.go +++ b/products/catalyst/bootstrap/api/internal/handler/cutover.go @@ -573,6 +573,129 @@ func cutoverJobName(stepName string, runEpoch int64) string { return fmt.Sprintf("cutover-%s-%d", stepName, runEpoch) } +// cutoverStepLabelKey is the label every cutover Job (created by +// createCutoverJob) carries with value == stepName. The reconcile +// loop in runCutoverStep queries by this label to detect Jobs that +// completed during a prior process lifetime — that's how the +// state-machine becomes idempotent across catalyst-api Pod restarts +// (TBD-V56 / #2132). +const cutoverStepLabelKey = "cutover.openova.io/step" + +// findExistingJobsForStep returns every Job in the cutover namespace +// whose `cutover.openova.io/step` label matches stepName, with the +// most-recently-created Job first. +// +// This is the read-side seam that makes the state-machine idempotent +// across catalyst-api restarts. Pre-fix the engine only ever consulted +// the in-memory runEpoch when checking what Job to attach to — a Pod +// restart lost the epoch and a NEW Job got minted even when the +// previous Job had already reached Complete=True. That's how t40 +// (2026-05-21) ran the 10-minute `egress-block-test` hold TWICE on +// the same Sovereign: Job 1 completed at 06:54:24Z, catalyst-api +// restarted at 07:07Z, the resume path created Job 2 from scratch at +// 07:07:23Z instead of attaching to the already-Complete Job 1. +// +// Post-fix the engine asks "is there already a Job for this step?" +// via this helper BEFORE creating a fresh one. Any Job stamped by +// createCutoverJob carries `cutover.openova.io/step=`, so +// the lookup is exact. +func findExistingJobsForStep(ctx context.Context, deps *cutoverDeps, stepName string) ([]batchv1.Job, error) { + selector := fmt.Sprintf("%s=%s", cutoverStepLabelKey, stepName) + jobs, err := deps.core.BatchV1().Jobs(deps.ns).List(ctx, metav1.ListOptions{ + LabelSelector: selector, + }) + if err != nil { + return nil, fmt.Errorf("list existing Jobs for step %q in %q: %w", stepName, deps.ns, err) + } + // Newest first. The fake clientset rarely populates CreationTimestamp + // but every real Job has one stamped by the apiserver, so the sort + // is meaningful in production. Stable so equal timestamps preserve + // input order. + sort.SliceStable(jobs.Items, func(i, j int) bool { + return jobs.Items[i].CreationTimestamp.After(jobs.Items[j].CreationTimestamp.Time) + }) + return jobs.Items, nil +} + +// findExistingTerminalJobForStep scans existing Jobs for a step and +// returns the first one that reached a terminal condition (Complete +// or Failed). Returns (nil, "", false) if no Job has terminated yet +// — the caller is then free to either attach to a still-running Job +// or create a fresh one. +// +// Prefers Complete over Failed when both exist for the same step, +// because a re-fired step (chart's auto-trigger Job retries after a +// transient failure) is a valid "success eventually" path. If only +// Failed exists, the caller surfaces that as the cutover failure +// — exactly the engine's existing semantics. +func findExistingTerminalJobForStep(ctx context.Context, deps *cutoverDeps, stepName string) (*batchv1.Job, batchv1.JobConditionType, bool) { + jobs, err := findExistingJobsForStep(ctx, deps, stepName) + if err != nil { + return nil, "", false + } + var failedJob *batchv1.Job + for i := range jobs { + j := &jobs[i] + cond, ok := terminalJobCondition(j) + if !ok { + continue + } + if cond == batchv1.JobComplete { + return j, batchv1.JobComplete, true + } + // Hold the first Failed in case no Complete exists. + if failedJob == nil { + failedJob = j + } + } + if failedJob != nil { + return failedJob, batchv1.JobFailed, true + } + return nil, "", false +} + +// findExistingRunningJobForStep returns the first non-terminal Job +// (still in flight) for a step, or nil. The engine attaches its watch +// to a running Job instead of minting a fresh one when this returns +// non-nil — covers the case where a previous process kicked off the +// Job and the apiserver / kubelet are still progressing it after the +// catalyst-api Pod restarted. +func findExistingRunningJobForStep(ctx context.Context, deps *cutoverDeps, stepName string) *batchv1.Job { + jobs, err := findExistingJobsForStep(ctx, deps, stepName) + if err != nil { + return nil + } + for i := range jobs { + j := &jobs[i] + if _, terminal := terminalJobCondition(j); terminal { + continue + } + return j + } + return nil +} + +// jobCompletionTime returns the Job's completion timestamp in RFC3339, +// falling back to the latest condition's LastTransitionTime, then to +// time.Now() if the Job is malformed. The fallbacks are defensive — a +// terminal condition without timing data is a chart bug, but we'd +// rather emit a slightly-imprecise audit row than drop the success +// signal entirely. +func jobCompletionTime(job *batchv1.Job) time.Time { + if job == nil { + return time.Now().UTC() + } + if job.Status.CompletionTime != nil { + return job.Status.CompletionTime.Time.UTC() + } + for _, c := range job.Status.Conditions { + if c.Type == batchv1.JobComplete && c.Status == corev1.ConditionTrue && !c.LastTransitionTime.IsZero() { + return c.LastTransitionTime.Time.UTC() + } + } + return time.Now().UTC() +} + // createCutoverJob creates a fresh Job from the step's PodSpec. // // Per docs/INVIOLABLE-PRINCIPLES.md #3 (Crossplane is the ONLY day-2 @@ -845,12 +968,90 @@ func (h *Handler) runCutover(ctx context.Context, deps *cutoverDeps, steps []cut func (h *Handler) runCutoverStep(ctx context.Context, deps *cutoverDeps, step cutoverStep, runEpoch int64) error { bus := h.cutoverBusFor() + + // ── Idempotency check (TBD-V56 / #2132) ───────────────────────────── + // + // BEFORE writing `step..result=running` and minting a new Job, + // look for a Job from a prior process lifetime that already settled + // the step. This is the seam that survives catalyst-api Pod restarts: + // + // 1. If a Complete=True Job exists with the cutover.openova.io/step + // label, write success to the durable ConfigMap and return + // WITHOUT creating a new Job (no re-running 10-min holds). + // 2. If only a Failed Job exists, surface the failure — the + // engine's halt semantics are unchanged from the pre-fix path. + // 3. If a non-terminal Job exists, attach the watch to that Job + // name instead of minting a new one. + // 4. If no Job exists, mint a fresh Job under the current runEpoch. + // + // For DaemonSet-wait steps this is a no-op (the chart owns the + // DaemonSet lifecycle; we only wait for ready). + if step.mode == cutoverModeJob { + if job, cond, terminal := findExistingTerminalJobForStep(ctx, deps, step.stepName); terminal { + finishedAt := jobCompletionTime(job) + if cond == batchv1.JobComplete { + // Recover the step's startedAt from the durable ConfigMap + // if present, otherwise stamp it with the Job's CreationTimestamp + // as a fallback so audit rows remain populated. + priorStatus, _ := readCutoverStatus(ctx, deps) + startedAt := priorStatus["step."+step.stepName+".startedAt"] + if startedAt == "" { + if !job.CreationTimestamp.IsZero() { + startedAt = job.CreationTimestamp.UTC().Format(time.RFC3339) + } else { + startedAt = finishedAt.Format(time.RFC3339) + } + } + if err := patchCutoverStatus(ctx, deps, map[string]string{ + "step." + step.stepName + ".startedAt": startedAt, + "step." + step.stepName + ".finishedAt": finishedAt.Format(time.RFC3339), + "step." + step.stepName + ".result": "success", + "step." + step.stepName + ".jobName": job.Name, + }); err != nil { + return fmt.Errorf("status patch (resume-success): %w", err) + } + h.publishCutoverEvent(bus, cutoverEvent{ + Time: finishedAt.Format(time.RFC3339), + Phase: cutoverPhaseStepFinished, + Level: "info", + Step: step.stepName, + JobName: job.Name, + Message: fmt.Sprintf("step %s already completed by prior Job %s; advancing without re-running", step.stepName, job.Name), + }) + return nil + } + // cond == batchv1.JobFailed: surface as the step's terminal + // failure — the engine halts at this step, identical to a + // freshly-failed Job. We DO NOT re-create on Failed because + // the chart's PodSpec is idempotent state-ensure logic and + // a second attempt usually fails the same way; operator + // intervention is required. + if err := patchCutoverStatus(ctx, deps, map[string]string{ + "step." + step.stepName + ".finishedAt": finishedAt.Format(time.RFC3339), + "step." + step.stepName + ".result": "failed", + "step." + step.stepName + ".jobName": job.Name, + }); err != nil { + return fmt.Errorf("status patch (resume-failed): %w", err) + } + return fmt.Errorf("Job %s/%s reported Failed condition (carried over from prior cutover attempt)", deps.ns, job.Name) + } + } + startedAt := time.Now().UTC() jobOrDS := "" + var attachToExisting *batchv1.Job switch step.mode { case cutoverModeJob: - jobOrDS = cutoverJobName(step.stepName, runEpoch) + // Prefer attaching to a still-running Job from a prior process + // lifetime over minting a new one. This avoids the t40 failure + // mode where two Jobs ran the SAME 10-min hold back-to-back. + attachToExisting = findExistingRunningJobForStep(ctx, deps, step.stepName) + if attachToExisting != nil { + jobOrDS = attachToExisting.Name + } else { + jobOrDS = cutoverJobName(step.stepName, runEpoch) + } case cutoverModeDaemonSetWait: jobOrDS = step.daemonsetRef } @@ -863,19 +1064,25 @@ func (h *Handler) runCutoverStep(ctx context.Context, deps *cutoverDeps, step cu return fmt.Errorf("status patch (start): %w", err) } + startMsg := fmt.Sprintf("step %s started (%s)", step.stepName, step.mode) + if attachToExisting != nil { + startMsg = fmt.Sprintf("step %s resumed by attaching to in-flight Job %s", step.stepName, attachToExisting.Name) + } h.publishCutoverEvent(bus, cutoverEvent{ Time: startedAt.Format(time.RFC3339), Phase: cutoverPhaseStepStarted, Level: "info", Step: step.stepName, JobName: jobOrDS, - Message: fmt.Sprintf("step %s started (%s)", step.stepName, step.mode), + Message: startMsg, }) switch step.mode { case cutoverModeJob: - if _, err := createCutoverJob(ctx, deps, step, runEpoch); err != nil { - return err + if attachToExisting == nil { + if _, err := createCutoverJob(ctx, deps, step, runEpoch); err != nil { + return err + } } cond, err := watchJobToCompletion(ctx, deps, jobOrDS) if err != nil { diff --git a/products/catalyst/bootstrap/api/internal/handler/cutover_test.go b/products/catalyst/bootstrap/api/internal/handler/cutover_test.go index f774d3cf..bdec3fba 100644 --- a/products/catalyst/bootstrap/api/internal/handler/cutover_test.go +++ b/products/catalyst/bootstrap/api/internal/handler/cutover_test.go @@ -945,3 +945,368 @@ func TestResumeInterruptedCutover_NoOpWhenNeverStarted(t *testing.T) { t.Errorf("resume hook created %d Jobs on never-started cutover, want 0", jobCreates) } } + +// ── TBD-V56 / #2132 — Job-status checkpoint idempotency ───────────────────── +// +// The t40 (2026-05-21) failure mode that motivated TBD-V56: +// +// 1. Pod 1 (catalyst-api) creates `cutover-egress-block-test-1779345819`. +// The Job runs the 10-minute deny-egress hold and reaches Complete=True +// at 06:54:24Z. +// 2. Pod 1 restarts mid-cutover at 07:07Z (operationally, to fix a +// separate cache-sync issue). The success-patch to the status +// ConfigMap never lands. +// 3. Pod 2 boots. ResumeInterruptedCutover reads the ConfigMap; status +// shows step.egress-block-test.result=running. Resume resets it +// to "", spawns runCutover from scratch. +// 4. PRE-FIX: runCutoverStep mints a NEW Job +// `cutover-egress-block-test-1779347242` and runs the 10-min hold +// a SECOND time. Wall-clock waste: 10 minutes per step. +// POST-FIX: runCutoverStep consults findExistingTerminalJobForStep +// BEFORE minting a new Job. Job 1's Complete=True condition is +// observed; the engine writes step..result=success directly +// and advances to the next step without re-running. + +// makeCompletedJobForStep builds a Job in the cutover namespace already +// stamped with a JobComplete=True condition + CompletionTime, labeled +// for step lookup. Simulates a Job that completed during a prior +// catalyst-api process lifetime — the canonical Pod-restart scenario. +func makeCompletedJobForStep(stepName string, completionOffset time.Duration) *batchv1.Job { + completedAt := metav1.NewTime(time.Now().UTC().Add(-completionOffset)) + return &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("cutover-%s-%d", stepName, time.Now().Unix()-int64(completionOffset.Seconds())), + Namespace: cutoverTestNS, + Labels: map[string]string{ + cutoverStepPartOfLabel: cutoverStepPartOfValue, + cutoverStepComponentLabel: "cutover-job", + cutoverStepLabelKey: stepName, + }, + CreationTimestamp: metav1.NewTime(completedAt.Time.Add(-5 * time.Minute)), + }, + Status: batchv1.JobStatus{ + Succeeded: 1, + CompletionTime: &completedAt, + Conditions: []batchv1.JobCondition{{ + Type: batchv1.JobComplete, + Status: corev1.ConditionTrue, + LastTransitionTime: completedAt, + Reason: "Test", + }}, + }, + } +} + +// makeFailedJobForStep mirrors makeCompletedJobForStep but stamps a +// terminal Failed condition. +func makeFailedJobForStep(stepName string) *batchv1.Job { + completedAt := metav1.NewTime(time.Now().UTC().Add(-30 * time.Second)) + return &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("cutover-%s-failed-%d", stepName, time.Now().Unix()), + Namespace: cutoverTestNS, + Labels: map[string]string{ + cutoverStepPartOfLabel: cutoverStepPartOfValue, + cutoverStepComponentLabel: "cutover-job", + cutoverStepLabelKey: stepName, + }, + CreationTimestamp: metav1.NewTime(completedAt.Time.Add(-5 * time.Minute)), + }, + Status: batchv1.JobStatus{ + Failed: 4, + Conditions: []batchv1.JobCondition{{ + Type: batchv1.JobFailed, + Status: corev1.ConditionTrue, + LastTransitionTime: completedAt, + Reason: "BackoffLimitExceeded", + }}, + }, + } +} + +// TestFindExistingTerminalJobForStep_PrefersCompleteOverFailed covers +// the read-side seam in isolation. A Complete Job MUST win over a +// Failed Job when both exist for the same step — a retried step is +// allowed to succeed eventually. +func TestFindExistingTerminalJobForStep_PrefersCompleteOverFailed(t *testing.T) { + objs := []k8sruntime.Object{ + makeFailedJobForStep("egress-block-test"), + makeCompletedJobForStep("egress-block-test", 1*time.Minute), + } + _, client := fakeHandlerWithCutover(t, objs...) + + job, cond, terminal := findExistingTerminalJobForStep(context.Background(), + &cutoverDeps{core: client, ns: cutoverTestNS}, "egress-block-test") + if !terminal { + t.Fatalf("findExistingTerminalJobForStep returned terminal=false; want true") + } + if cond != batchv1.JobComplete { + t.Errorf("cond = %q, want %q (Complete must win over Failed when both exist)", cond, batchv1.JobComplete) + } + if job == nil { + t.Fatalf("job == nil; want the Complete=True Job") + } +} + +// TestRunCutoverStep_SkipsRerunWhenPriorJobComplete is the canonical +// Pod-restart regression test for TBD-V56 / #2132. Status ConfigMap +// shows step result=running; a Complete=True Job for that step +// already exists in the cluster (from the prior process lifetime). +// The engine MUST NOT mint a new Job — the t40 10-minute-hold-ran-twice +// bug. It must flip result=success directly off the prior Job and +// advance. +func TestRunCutoverStep_SkipsRerunWhenPriorJobComplete(t *testing.T) { + preStatus := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cutoverStatusConfigMapName(), + Namespace: cutoverTestNS, + }, + Data: map[string]string{ + "cutoverComplete": "false", + "cutoverStartedAt": "2026-05-21T04:58:25Z", + "totalSteps": "3", + // Step 1 (gitea-mirror) succeeded cleanly in Pod 1. + "step.gitea-mirror.result": "success", + "step.gitea-mirror.startedAt": "2026-05-21T04:58:30Z", + "step.gitea-mirror.finishedAt": "2026-05-21T04:59:00Z", + // Step 2 (egress-block-test) — Pod 1 crashed AFTER the Job + // reached Complete=True but BEFORE the success-patch landed. + "step.egress-block-test.result": "running", + "step.egress-block-test.startedAt": "2026-05-21T06:44:24Z", + "step.egress-block-test.jobName": "cutover-egress-block-test-1779345819", + }, + } + // The completed Job from Pod 1 is still in the cluster (24h TTL). + priorJob := makeCompletedJobForStep("egress-block-test", 13*time.Minute) + objs := []k8sruntime.Object{ + makeCutoverStepCM("cutover-step-01-gitea-mirror", "gitea-mirror", 1, cutoverModeJob, minimalPodSpecYAML, ""), + makeCutoverStepCM("cutover-step-02-egress-block-test", "egress-block-test", 2, cutoverModeJob, minimalPodSpecYAML, ""), + makeCutoverStepCM("cutover-step-03-mirror-resync", "mirror-resync", 3, cutoverModeJob, minimalPodSpecYAML, ""), + preStatus, + priorJob, + } + h, client := fakeHandlerWithCutover(t, objs...) + // Auto-complete any Job that DOES get created so the engine can + // finish — but we ASSERT that egress-block-test is NOT re-created. + installJobReactor(t, client, batchv1.JobComplete) + + // Count Job creates per step. + jobCreates := map[string]int{} + var muJobs sync.Mutex + client.PrependReactor("create", "jobs", func(action clienttesting.Action) (bool, k8sruntime.Object, error) { + ca, ok := action.(clienttesting.CreateAction) + if !ok { + return false, nil, nil + } + job, ok := ca.GetObject().(*batchv1.Job) + if !ok { + return false, nil, nil + } + muJobs.Lock() + jobCreates[job.Labels[cutoverStepLabelKey]]++ + muJobs.Unlock() + return false, nil, nil + }) + + // Fire the on-startup resume path — what cmd/api/main.go calls. + h.ResumeInterruptedCutover(context.Background()) + + // Wait for engine to terminate. + deadline := time.Now().Add(15 * time.Second) + for time.Now().Before(deadline) { + bus := h.cutoverBusFor() + bus.mu.Lock() + running := bus.running + bus.mu.Unlock() + if !running { + break + } + time.Sleep(50 * time.Millisecond) + } + + cm, err := client.CoreV1().ConfigMaps(cutoverTestNS).Get(context.Background(), + cutoverStatusConfigMapName(), metav1.GetOptions{}) + if err != nil { + t.Fatalf("get status ConfigMap: %v", err) + } + if cm.Data["cutoverComplete"] != "true" { + t.Errorf("cutoverComplete = %q, want true; data=%+v", cm.Data["cutoverComplete"], cm.Data) + } + if cm.Data["step.egress-block-test.result"] != "success" { + t.Errorf("step.egress-block-test.result = %q, want success", cm.Data["step.egress-block-test.result"]) + } + if cm.Data["step.egress-block-test.jobName"] != priorJob.Name { + t.Errorf("step.egress-block-test.jobName = %q, want %q (must carry the prior Job's name)", + cm.Data["step.egress-block-test.jobName"], priorJob.Name) + } + + muJobs.Lock() + defer muJobs.Unlock() + if jobCreates["gitea-mirror"] != 0 { + t.Errorf("gitea-mirror Job creates = %d, want 0 (success in prior run)", jobCreates["gitea-mirror"]) + } + // THE CORE ASSERTION — the t40 regression guard. + if jobCreates["egress-block-test"] != 0 { + t.Errorf("egress-block-test Job creates = %d, want 0 — TBD-V56 idempotency violated: a new Job was minted even though the prior Job is Complete=True", jobCreates["egress-block-test"]) + } + if jobCreates["mirror-resync"] != 1 { + t.Errorf("mirror-resync Job creates = %d, want 1 (never ran before)", jobCreates["mirror-resync"]) + } +} + +// TestRunCutoverStep_SurfacesPriorFailedJob — when the only existing +// Job for a step is Failed, the engine MUST NOT re-create it. The +// cutover halts at that step with failedStep + lastError set. Mirrors +// the existing FailsHaltAtFailedStep semantics but exercises the +// Pod-restart code path (the failure happened in a prior process). +func TestRunCutoverStep_SurfacesPriorFailedJob(t *testing.T) { + preStatus := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cutoverStatusConfigMapName(), + Namespace: cutoverTestNS, + }, + Data: map[string]string{ + "cutoverComplete": "false", + "cutoverStartedAt": "2026-05-21T04:58:25Z", + "totalSteps": "2", + "step.gitea-mirror.result": "running", + "step.gitea-mirror.startedAt": "2026-05-21T04:58:30Z", + }, + } + priorJob := makeFailedJobForStep("gitea-mirror") + objs := []k8sruntime.Object{ + makeCutoverStepCM("cutover-step-01-gitea-mirror", "gitea-mirror", 1, cutoverModeJob, minimalPodSpecYAML, ""), + makeCutoverStepCM("cutover-step-02-egress-block-test", "egress-block-test", 2, cutoverModeJob, minimalPodSpecYAML, ""), + preStatus, + priorJob, + } + h, client := fakeHandlerWithCutover(t, objs...) + + jobCreates := map[string]int{} + var muJobs sync.Mutex + client.PrependReactor("create", "jobs", func(action clienttesting.Action) (bool, k8sruntime.Object, error) { + ca, ok := action.(clienttesting.CreateAction) + if !ok { + return false, nil, nil + } + job, ok := ca.GetObject().(*batchv1.Job) + if !ok { + return false, nil, nil + } + muJobs.Lock() + jobCreates[job.Labels[cutoverStepLabelKey]]++ + muJobs.Unlock() + return false, nil, nil + }) + + h.ResumeInterruptedCutover(context.Background()) + + deadline := time.Now().Add(15 * time.Second) + for time.Now().Before(deadline) { + bus := h.cutoverBusFor() + bus.mu.Lock() + running := bus.running + bus.mu.Unlock() + if !running { + break + } + time.Sleep(50 * time.Millisecond) + } + + cm, err := client.CoreV1().ConfigMaps(cutoverTestNS).Get(context.Background(), + cutoverStatusConfigMapName(), metav1.GetOptions{}) + if err != nil { + t.Fatalf("get status ConfigMap: %v", err) + } + if cm.Data["cutoverComplete"] == "true" { + t.Errorf("cutoverComplete = true, want false (prior Failed Job halts the cutover)") + } + if cm.Data["failedStep"] != "gitea-mirror" { + t.Errorf("failedStep = %q, want gitea-mirror", cm.Data["failedStep"]) + } + if cm.Data["step.gitea-mirror.result"] != "failed" { + t.Errorf("step.gitea-mirror.result = %q, want failed", cm.Data["step.gitea-mirror.result"]) + } + muJobs.Lock() + defer muJobs.Unlock() + if jobCreates["gitea-mirror"] != 0 { + t.Errorf("gitea-mirror Job creates = %d, want 0 (prior Failed Job must NOT be re-created)", jobCreates["gitea-mirror"]) + } + if jobCreates["egress-block-test"] != 0 { + t.Errorf("egress-block-test Job creates = %d, want 0 (engine must halt at the failed step)", jobCreates["egress-block-test"]) + } +} + +// TestHandleCutoverStart_IdempotentReusesPriorCompleteJob — the same +// guarantee through the HTTP /start path. Operator/auto-trigger hits +// /start on a Pod that hasn't yet picked up the prior-process state; +// the engine must observe the prior Complete=True Job and NOT re-run. +func TestHandleCutoverStart_IdempotentReusesPriorCompleteJob(t *testing.T) { + preStatus := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cutoverStatusConfigMapName(), + Namespace: cutoverTestNS, + }, + Data: map[string]string{ + "cutoverComplete": "false", + "cutoverStartedAt": "2026-05-21T04:58:25Z", + "totalSteps": "1", + // Empty per-step rows — the orchestrator's perspective on + // boot; the Jobs in the cluster are the source of truth. + }, + } + priorJob := makeCompletedJobForStep("only-step", 1*time.Minute) + objs := []k8sruntime.Object{ + makeCutoverStepCM("cutover-step-01-only-step", "only-step", 1, cutoverModeJob, minimalPodSpecYAML, ""), + preStatus, + priorJob, + } + h, client := fakeHandlerWithCutover(t, objs...) + installJobReactor(t, client, batchv1.JobComplete) + + jobCreates := 0 + var muJobs sync.Mutex + client.PrependReactor("create", "jobs", func(action clienttesting.Action) (bool, k8sruntime.Object, error) { + muJobs.Lock() + jobCreates++ + muJobs.Unlock() + return false, nil, nil + }) + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/api/v1/sovereign/cutover/start", nil) + h.HandleCutoverStart(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want 200; body=%s", rec.Code, rec.Body.String()) + } + + deadline := time.Now().Add(15 * time.Second) + for time.Now().Before(deadline) { + bus := h.cutoverBusFor() + bus.mu.Lock() + running := bus.running + bus.mu.Unlock() + if !running { + break + } + time.Sleep(50 * time.Millisecond) + } + + muJobs.Lock() + defer muJobs.Unlock() + if jobCreates != 0 { + t.Errorf("Job creates = %d, want 0 (prior Complete Job must be reused, not re-minted)", jobCreates) + } + + cm, err := client.CoreV1().ConfigMaps(cutoverTestNS).Get(context.Background(), + cutoverStatusConfigMapName(), metav1.GetOptions{}) + if err != nil { + t.Fatalf("get status ConfigMap: %v", err) + } + if cm.Data["cutoverComplete"] != "true" { + t.Errorf("cutoverComplete = %q, want true", cm.Data["cutoverComplete"]) + } + if cm.Data["step.only-step.result"] != "success" { + t.Errorf("step.only-step.result = %q, want success", cm.Data["step.only-step.result"]) + } +} diff --git a/products/catalyst/chart/Chart.yaml b/products/catalyst/chart/Chart.yaml index 821cb3a8..3727b4aa 100644 --- a/products/catalyst/chart/Chart.yaml +++ b/products/catalyst/chart/Chart.yaml @@ -2258,8 +2258,94 @@ name: bp-catalyst-platform # # Refs #1099 (NOT Closes — operator walk + screenshot is the DoD per # CLAUDE.md §0). -version: 1.4.233 -appVersion: 1.4.232 +version: 1.4.234 +appVersion: 1.4.234 +# 1.4.234 — TBD-V56 / #2132 (2026-05-21): fix bp-self-sovereign-cutover +# state-machine to checkpoint Job-status into the durable ConfigMap on +# every reconcile tick — survives catalyst-api Pod restarts mid-cutover. +# +# Pre-fix t40 (2026-05-21, Sovereign be5b43a8f5033a77): cutover started +# 04:58:25Z. `egress-block-test` Job 1 (`cutover-egress-block-test- +# 1779345819`) reached Complete=True at 06:54:24Z, but catalyst-api +# restarted at 07:07Z (operationally, to fix a separate cache-sync +# issue) BEFORE the success-patch to the status ConfigMap landed. On +# Pod 2, `ResumeInterruptedCutover` reset `step.egress-block-test.result` +# from "running" to "" and spawned `runCutover` from scratch. The +# engine then minted a fresh Job (`cutover-egress-block-test- +# 1779347242`) and ran the 10-minute deny-egress hold A SECOND TIME. +# Wall-clock waste: 10 minutes per step. Worse: cutover stuck at 63% +# for ~3 hours because the success-write never landed — Step 09 +# (gitea-token-mint) never fired → `sme/provisioning` init container +# `wait-for-cutover-token` blocked forever → tenant onboarding +# blocked → Phase 2 (qwen-code Sandbox) unreachable. +# +# Root cause (CLAUDE.md §4.16 multi-layer RCA): `runCutoverStep` +# only ever consulted the in-memory runEpoch when checking what Job +# to attach to. A Pod restart lost the epoch + the engine never +# looked for Jobs from prior process lifetimes via their durable +# `cutover.openova.io/step=` label. +# +# Fix lives in `products/catalyst/bootstrap/api/internal/handler/ +# cutover.go`: +# +# 1. New helper `findExistingTerminalJobForStep` lists every Job +# in the cutover namespace labeled `cutover.openova.io/step= +# ` and returns the first Complete=True (preferring +# Complete over Failed when both exist for a retried step). +# 2. New helper `findExistingRunningJobForStep` returns the first +# non-terminal Job for a step so the engine attaches its watch +# to it instead of minting a fresh Job. +# 3. `runCutoverStep` now does (in order): (a) if a prior Complete +# Job exists, write `step..result=success` + finishedAt +# from the Job's CompletionTime + jobName= +# directly and return — NO new Job created; (b) if a prior +# Failed Job exists, write failed and halt — NO re-create; (c) +# if a non-terminal Job exists, attach watch to it; (d) +# otherwise mint a fresh Job under the current runEpoch. +# 4. `jobCompletionTime` helper extracts the canonical timestamp +# from a Job's `status.completionTime` (with defensive fallbacks +# to condition.LastTransitionTime / time.Now). +# +# Idempotency surface (the founder's TBD-V56 acceptance criteria): +# - Pod-restart mid-cutover (step result=running, Complete Job in +# cluster) → engine flips result=success, no rerun. +# - Pod-restart mid-cutover (step result=running, Failed Job in +# cluster) → engine flips result=failed, no rerun. Halts. +# - POST /cutover/start while another cutover is running → 409 +# (unchanged). +# - POST /cutover/start when cutoverComplete=true → 200 with +# existing state, no engine spawn (unchanged). +# - POST /cutover/start when in-flight on cluster but in-memory +# state empty (HTTP path) → engine consults existing Jobs, skips +# completed steps, attaches to running ones. +# +# Validation: +# - 4 new unit tests in `cutover_test.go`: +# - TestFindExistingTerminalJobForStep_PrefersCompleteOverFailed +# - TestRunCutoverStep_SkipsRerunWhenPriorJobComplete (the t40 +# regression guard — asserts Job creates = 0 for the +# egress-block-test step when a prior Complete Job exists) +# - TestRunCutoverStep_SurfacesPriorFailedJob (Failed Job from +# prior process is surfaced, no rerun) +# - TestHandleCutoverStart_IdempotentReusesPriorCompleteJob +# (HTTP /start path consumes prior Complete Job) +# - All 27 pre-existing cutover tests still pass. +# +# Walk impact: once this PR merges + Flux auto-rolls the new +# catalyst-api image to t40, the next reconcile tick will find Job +# `cutover-egress-block-test-1779347242` Complete=True, write +# step.egress-block-test.result=success to the ConfigMap, and +# advance to step 09 (gitea-token-mint). The `sme/provisioning` +# init container `wait-for-cutover-token` unblocks once Step 09 +# patches the `provisioning-github-token` Secret with the +# `catalyst.openova.io/token-source: self-sovereign-cutover-step-09` +# annotation. Customer journey on `walkdemo40.omani.homes` +# unblocks. ALL future Sovereign provs benefit from restart-safe +# cutover semantics. +# +# Refs #2132 TBD-V56. Per CLAUDE.md §0, this is NOT `Closes` — the +# founder closes after operator-walk-with-screenshot verification. +# # 1.4.232 — fix(sovereign-tls): render Cilium Gateway listener YAML in # the chart (templates/sovereign-tls-vars-cm.yaml) and feed it into the # sovereign-tls Kustomization via Flux postBuild.substituteFrom on