Compare commits
2 Commits
a30/wire-r
...
archive/fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c45a3e9af5 | ||
|
|
3e786e5b36 |
380
.github/workflows/blueprint-release.yaml
vendored
380
.github/workflows/blueprint-release.yaml
vendored
@ -69,16 +69,7 @@ on:
|
||||
- products
|
||||
|
||||
permissions:
|
||||
# contents: write — the auto-bump-pin step (added 2026-05-18, TBD-A6
|
||||
# meta-fix) writes back the `version:` line in
|
||||
# clusters/_template/bootstrap-kit/<NN>-<chart>.yaml so the bootstrap-
|
||||
# kit pin moves in lockstep with the published OCI artifact. Before
|
||||
# this, every chart bump required a SEPARATE manual collector PR to
|
||||
# bump the pin (PRs #1666, #1687, #1695, #1698, #1707 in the
|
||||
# 2026-05-17/18 wave alone). The bot-author commit does NOT re-trigger
|
||||
# workflows (GITHUB_TOKEN convention), so we can safely push without
|
||||
# looping the publish pipeline.
|
||||
contents: write
|
||||
contents: read
|
||||
packages: write
|
||||
id-token: write # for cosign keyless signing
|
||||
|
||||
@ -506,363 +497,6 @@ jobs:
|
||||
cosign attest --yes --predicate /tmp/sbom/sbom.spdx.json --type spdxjson \
|
||||
"${{ steps.push.outputs.ref }}@${{ steps.push.outputs.digest }}"
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# AUTO-BUMP — clusters/_template/bootstrap-kit/<NN>-<chart>.yaml
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# TBD-A6 meta-fix (2026-05-18): every chart-publishing wave in this
|
||||
# session required a SEPARATE manual collector PR to bump the
|
||||
# bootstrap-kit pin so Sovereigns would actually install the new
|
||||
# OCI artifact. Without the pin bump, the chart at e.g.
|
||||
# bp-catalyst-platform:1.4.166 gets published to GHCR but
|
||||
# clusters/_template/bootstrap-kit/13-bp-catalyst-platform.yaml
|
||||
# still pins `version: 1.4.165` and fresh Sovereigns silently
|
||||
# install the OLD artifact.
|
||||
#
|
||||
# Manual collector PRs from this session ALONE (eliminated by
|
||||
# this hook):
|
||||
# - #1676 chart 1.4.162→1.4.163 — Wave 16 collector
|
||||
# - #1687 chart 1.4.163→1.4.164 — Wave 17 collector
|
||||
# - #1694 bp-guacamole 0.1.21 → 0.1.22 (TBD-G6)
|
||||
# - #1695 chart 1.4.164→1.4.165 — Wave 18 collector
|
||||
# - #1698 chart 1.4.165→1.4.166 (TBD-E8)
|
||||
# - #1700 bp-guacamole 0.1.22 → 0.1.23 (TBD-G4 phase 2)
|
||||
# - #1706 self-sovereign-cutover 0.1.29→0.1.30 (TBD-C18)
|
||||
# - #1707 chart 1.4.166→1.4.167 — Wave 24 collector
|
||||
#
|
||||
# Mechanism: the canonical chart name comes from Chart.yaml `name:`
|
||||
# (already read into ${{ steps.chart.outputs.name }} above). The
|
||||
# corresponding bootstrap-kit pin file is identified by grepping
|
||||
# for `^      chart: <name>$` (6-space indent matches the
|
||||
# HelmRelease.spec.chart.spec.chart shape in every existing slot).
|
||||
# If no pin file matches, the chart is NOT in the bootstrap kit
|
||||
# (e.g. it's an optional Application Blueprint that Sovereigns
|
||||
# opt into via overlay) — this is a graceful no-op, NOT a failure.
|
||||
#
|
||||
# The bot-author commit does NOT re-trigger blueprint-release per
|
||||
# the GITHUB_TOKEN convention. So this hook converges in ONE pass:
|
||||
# publish → bump pin → push. The next Sovereign provisioned will
|
||||
# pick up the new pin via the standard Flux reconciliation.
|
||||
#
|
||||
# Idempotent reset-and-rewrite: parallel matrix jobs (multiple
|
||||
# changed Blueprints in the same push) could race on the same
|
||||
# branch. Retry up to 3 times with `git fetch + reset --hard
|
||||
# origin/main + re-sed` so concurrent runs produce strictly
|
||||
# ordered commits instead of clobbering each other.
|
||||
- name: "Auto-bump bootstrap-kit pin for ${{ steps.chart.outputs.name }}"
|
||||
if: steps.chart.outputs.skip != 'true'
|
||||
id: bump_pin
|
||||
env:
|
||||
CHART_NAME: ${{ steps.chart.outputs.name }}
|
||||
CHART_VERSION: ${{ steps.chart.outputs.version }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Locate the bootstrap-kit slot pinning this chart, if any.
|
||||
# The 6-space indent matches every existing slot's chart
|
||||
# directive:
|
||||
#
|
||||
# spec:
|
||||
# chart:
|
||||
# spec:
|
||||
# chart: bp-<name> ← 6 spaces
|
||||
# version: <semver> ← 6 spaces, same scope
|
||||
pin_file=$(grep -lE "^ chart: ${CHART_NAME}\$" \
|
||||
clusters/_template/bootstrap-kit/*.yaml 2>/dev/null || true)
|
||||
|
||||
if [ -z "$pin_file" ]; then
|
||||
# TBD-A6 hardening (2026-05-18): before declaring "not in the
|
||||
# kit", check whether a slot pins this chart at a DIFFERENT
|
||||
# indent than the canonical 6 spaces. If so, the auto-bump
|
||||
# would silently no-op forever and the chart-pin pair would
|
||||
# drift undetected. Fail loudly so the slot author re-indents
|
||||
# to the canonical shape before publishing.
|
||||
wrong_indent=$(grep -lE "^[[:space:]]+chart: ${CHART_NAME}\$" \
|
||||
clusters/_template/bootstrap-kit/*.yaml 2>/dev/null || true)
|
||||
if [ -n "$wrong_indent" ]; then
|
||||
echo "::error title=Bootstrap-kit slot indent drift::Slot file(s) pin ${CHART_NAME} but at a non-6-space indent: $wrong_indent. The auto-bump hook keys on \`^ chart: <name>\$\` (exactly 6 spaces). Re-indent the slot's \`chart:\` and \`version:\` lines under HelmRelease.spec.chart.spec to 6 spaces, matching every other slot. The pin-sync audit (scripts/check-bootstrap-kit-pin-sync.sh) keys on the same regex and would also miss this slot — drift would be undetected."
|
||||
exit 1
|
||||
fi
|
||||
echo "INFO: no bootstrap-kit slot pins ${CHART_NAME} — graceful no-op (chart is an opt-in Application Blueprint, not part of the kit)."
|
||||
echo "bumped=false" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Defensive: refuse to operate if multiple slots reference the
|
||||
# same chart name — bootstrap-kit invariant is one slot per
|
||||
# chart, and a violation would mean the schema changed under
|
||||
# us and the hook would write wrong things.
|
||||
slot_count=$(echo "$pin_file" | wc -l)
|
||||
if [ "$slot_count" -ne 1 ]; then
|
||||
echo "::error title=Multiple bootstrap-kit slots for ${CHART_NAME}::Expected exactly one slot file pinning chart '${CHART_NAME}', found ${slot_count}: $pin_file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Read current pin. The HelmRelease.spec.chart.spec.version is
|
||||
# at exactly 6 spaces of indent in every slot (audited 2026-
|
||||
# 05-18 across all 51 files). If the shape ever changes we
|
||||
# fail loudly rather than write the wrong line.
|
||||
current=$(awk '/^ version:/{print $2; exit}' "$pin_file" | tr -d '"')
|
||||
if [ -z "$current" ]; then
|
||||
echo "::error title=Unparseable bootstrap-kit pin::No ' version:' line at 6-space indent in $pin_file. The HelmRelease.spec.chart.spec.version shape changed under TBD-A6's auto-bump hook; refusing to write."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ "$current" = "$CHART_VERSION" ]; then
|
||||
echo "INFO: ${pin_file} already pins ${CHART_NAME}=${CHART_VERSION} — no-op."
|
||||
echo "bumped=false" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Bumping ${pin_file}: ${CHART_NAME} ${current} → ${CHART_VERSION}"
|
||||
# sed targets the single 6-space ` version:` line at the
|
||||
# chart-pin scope. There is exactly one such line per slot
|
||||
# (audited 2026-05-18); the regex is anchored to start-of-line
|
||||
# so a deeper-indented `version:` (e.g. inside .values.xxx)
|
||||
# cannot accidentally match.
|
||||
sed -i -E "s|^ version: .*\$| version: ${CHART_VERSION}|" "$pin_file"
|
||||
|
||||
# Verify the sed actually flipped the line — defence against
|
||||
# a future indent change shipping silently.
|
||||
new=$(awk '/^ version:/{print $2; exit}' "$pin_file" | tr -d '"')
|
||||
if [ "$new" != "$CHART_VERSION" ]; then
|
||||
echo "::error title=sed failed::After rewrite, ${pin_file} still pins '${new}', expected '${CHART_VERSION}'."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "bumped=true" >> "$GITHUB_OUTPUT"
|
||||
echo "pin_file=${pin_file}" >> "$GITHUB_OUTPUT"
|
||||
echo "prev_version=${current}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# LOCKSTEP — platform/<bp>/blueprint.yaml `spec.version`
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# TBD-A20 (issue #1856, 2026-05-18): the auto-bump hook above only
|
||||
# touched clusters/_template/bootstrap-kit/*.yaml. The upstream
|
||||
# blueprint manifest (platform/<bp>/blueprint.yaml) ALSO carries
|
||||
# a `spec.version` field that must equal the Chart.yaml `version`
|
||||
# by convergence contract:
|
||||
#
|
||||
# TestBootstrapKit_BlueprintCardsHaveRequiredFields
|
||||
# (tests/e2e/bootstrap-kit/main_test.go)
|
||||
#
|
||||
# asserts `Chart.yaml.version == blueprint.yaml.spec.version` for
|
||||
# every kit blueprint. Six blueprints (cilium, cert-manager, flux,
|
||||
# openbao, keycloak, gitea) silently drifted between Chart.yaml
|
||||
# and blueprint.yaml until the test started failing — A17 (#1855)
|
||||
# hot-patched the six drifts; this lockstep step removes the
|
||||
# structural recurrence pattern.
|
||||
#
|
||||
# Location convention:
|
||||
# - Leaf platform blueprints live at ${matrix.path}/blueprint.yaml
|
||||
# (e.g. platform/cilium/blueprint.yaml)
|
||||
# - Umbrella product blueprints live at ${matrix.path}/chart/blueprint.yaml
|
||||
# (e.g. products/continuum/chart/blueprint.yaml)
|
||||
#
|
||||
# Some charts have no blueprint.yaml at all (e.g. products/catalyst —
|
||||
# the chart-only umbrella for bp-catalyst-platform). This is fine —
|
||||
# the lockstep is a graceful no-op when no blueprint.yaml exists.
|
||||
#
|
||||
# The `spec.version` line is at exactly 2-space indent in every
|
||||
# blueprint.yaml (audited 2026-05-18 across all 71 files in
|
||||
# platform/ + products/). We key on `^  version:` with a defensive
|
||||
# parse + post-write verify.
|
||||
- name: "Lockstep-bump blueprint.yaml spec.version for ${{ steps.chart.outputs.name }}"
|
||||
if: steps.chart.outputs.skip != 'true'
|
||||
id: bump_blueprint
|
||||
env:
|
||||
CHART_NAME: ${{ steps.chart.outputs.name }}
|
||||
CHART_VERSION: ${{ steps.chart.outputs.version }}
|
||||
CHART_PATH: ${{ matrix.path }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Try the two canonical blueprint.yaml locations in order.
|
||||
# Each Blueprint folder uses exactly one shape; never both.
|
||||
bp_file=""
|
||||
for candidate in "${CHART_PATH}/blueprint.yaml" "${CHART_PATH}/chart/blueprint.yaml"; do
|
||||
if [ -f "$candidate" ]; then
|
||||
bp_file="$candidate"
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -z "$bp_file" ]; then
|
||||
echo "INFO: no blueprint.yaml at ${CHART_PATH}/blueprint.yaml or ${CHART_PATH}/chart/blueprint.yaml — graceful no-op (chart has no Blueprint manifest, e.g. the products/catalyst umbrella)."
|
||||
echo "bumped=false" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Sanity: this file must actually be a Blueprint manifest (kind:
|
||||
# Blueprint), not some other YAML co-located in the folder.
|
||||
# Without this guard, a stray non-Blueprint blueprint.yaml (the
|
||||
# CRD definition file we saw at products/catalyst/chart/crds/
|
||||
# is an example of a generic name-collision) would be rewritten
|
||||
# incorrectly.
|
||||
bp_kind=$(awk '/^kind:/{print $2; exit}' "$bp_file" | tr -d '"')
|
||||
if [ "$bp_kind" != "Blueprint" ]; then
|
||||
echo "INFO: ${bp_file} kind='${bp_kind}' — not a Blueprint manifest, graceful no-op."
|
||||
echo "bumped=false" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Read current spec.version. The blueprint.yaml convention is:
|
||||
#
|
||||
# spec:
|
||||
# version: <semver> ← 2 spaces, single `version:` line
|
||||
#
|
||||
# The 2-space indent is anchored start-of-line so any deeper-
|
||||
# indented `version:` (inside spec.upgrades.from[], etc.) cannot
|
||||
# accidentally match.
|
||||
current=$(awk '/^ version:/{print $2; exit}' "$bp_file" | tr -d '"')
|
||||
if [ -z "$current" ]; then
|
||||
echo "::error title=Unparseable blueprint.yaml spec.version::No ' version:' line at 2-space indent in $bp_file. The Blueprint manifest shape changed under TBD-A20's lockstep hook; refusing to write."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ "$current" = "$CHART_VERSION" ]; then
|
||||
echo "INFO: ${bp_file} already at spec.version=${CHART_VERSION} — no-op."
|
||||
echo "bumped=false" >> "$GITHUB_OUTPUT"
|
||||
# Still emit the path so downstream steps know about it.
|
||||
echo "bp_file=${bp_file}" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Lockstep-bumping ${bp_file}: spec.version ${current} → ${CHART_VERSION}"
|
||||
sed -i -E "s|^ version: .*\$| version: ${CHART_VERSION}|" "$bp_file"
|
||||
|
||||
# Verify the sed actually flipped the line.
|
||||
new=$(awk '/^ version:/{print $2; exit}' "$bp_file" | tr -d '"')
|
||||
if [ "$new" != "$CHART_VERSION" ]; then
|
||||
echo "::error title=blueprint.yaml sed failed::After rewrite, ${bp_file} still has spec.version='${new}', expected '${CHART_VERSION}'."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "bumped=true" >> "$GITHUB_OUTPUT"
|
||||
echo "bp_file=${bp_file}" >> "$GITHUB_OUTPUT"
|
||||
echo "prev_version=${current}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: "Commit + push bootstrap-kit pin bump + blueprint.yaml lockstep"
|
||||
# Run if either side has staged changes — usually they bump in
|
||||
# tandem, but a kit-only chart (no blueprint.yaml) or a non-kit
|
||||
# leaf (no pin file) might bump only one. Either is sufficient
|
||||
# to commit.
|
||||
if: steps.chart.outputs.skip != 'true' && (steps.bump_pin.outputs.bumped == 'true' || steps.bump_blueprint.outputs.bumped == 'true')
|
||||
env:
|
||||
CHART_NAME: ${{ steps.chart.outputs.name }}
|
||||
CHART_VERSION: ${{ steps.chart.outputs.version }}
|
||||
PIN_FILE: ${{ steps.bump_pin.outputs.pin_file }}
|
||||
PREV_VERSION: ${{ steps.bump_pin.outputs.prev_version }}
|
||||
BP_FILE: ${{ steps.bump_blueprint.outputs.bp_file }}
|
||||
BP_PREV_VERSION: ${{ steps.bump_blueprint.outputs.prev_version }}
|
||||
PIN_BUMPED: ${{ steps.bump_pin.outputs.bumped }}
|
||||
BP_BUMPED: ${{ steps.bump_blueprint.outputs.bumped }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
git config user.name "hatiyildiz"
|
||||
git config user.email "hatiyildiz@users.noreply.github.com"
|
||||
|
||||
# Idempotent reset-and-rewrite on push conflict — parallel
|
||||
# matrix jobs (multiple Blueprints bumped in one push) can race
|
||||
# on the same branch. On conflict we re-fetch, re-sed against
|
||||
# whatever state is currently on origin/main, and re-commit.
|
||||
# The operation is convergent: every retry produces the same
|
||||
# final state (pin=CHART_VERSION, blueprint=CHART_VERSION).
|
||||
rewrite_pin() {
|
||||
local pf="$1"
|
||||
local cur
|
||||
cur=$(awk '/^ version:/{print $2; exit}' "$pf" | tr -d '"')
|
||||
if [ "$cur" = "${CHART_VERSION}" ]; then
|
||||
return 1 # already there
|
||||
fi
|
||||
sed -i -E "s|^ version: .*\$| version: ${CHART_VERSION}|" "$pf"
|
||||
return 0
|
||||
}
|
||||
rewrite_blueprint() {
|
||||
local bp="$1"
|
||||
local cur
|
||||
cur=$(awk '/^ version:/{print $2; exit}' "$bp" | tr -d '"')
|
||||
if [ "$cur" = "${CHART_VERSION}" ]; then
|
||||
return 1
|
||||
fi
|
||||
sed -i -E "s|^ version: .*\$| version: ${CHART_VERSION}|" "$bp"
|
||||
return 0
|
||||
}
|
||||
|
||||
# Stage whichever files were rewritten.
|
||||
if [ "${PIN_BUMPED}" = "true" ] && [ -n "${PIN_FILE}" ]; then
|
||||
git add "${PIN_FILE}"
|
||||
fi
|
||||
if [ "${BP_BUMPED}" = "true" ] && [ -n "${BP_FILE}" ]; then
|
||||
git add "${BP_FILE}"
|
||||
fi
|
||||
|
||||
if git diff --staged --quiet; then
|
||||
echo "no staged changes — already in sync"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Compose commit message. Pin bump remains the primary subject
|
||||
# (preserves the existing `deploy(<chart>): bump bootstrap-kit
|
||||
# pin X -> Y (auto, Refs TBD-A6)` shape used by every existing
|
||||
# automation). The blueprint.yaml lockstep is mentioned as a
|
||||
# secondary line so consumers parsing recent log subjects
|
||||
# don't see a format change. When ONLY blueprint.yaml bumps
|
||||
# (chart not in the kit), the subject acknowledges TBD-A20.
|
||||
if [ "${PIN_BUMPED}" = "true" ] && [ "${BP_BUMPED}" = "true" ]; then
|
||||
msg="deploy(${CHART_NAME}): bump bootstrap-kit pin ${PREV_VERSION} -> ${CHART_VERSION} (auto, Refs TBD-A6)
|
||||
|
||||
Also locksteps platform blueprint.yaml spec.version ${BP_PREV_VERSION} -> ${CHART_VERSION} (Refs TBD-A20, #1856)."
|
||||
elif [ "${PIN_BUMPED}" = "true" ]; then
|
||||
msg="deploy(${CHART_NAME}): bump bootstrap-kit pin ${PREV_VERSION} -> ${CHART_VERSION} (auto, Refs TBD-A6)"
|
||||
else
|
||||
# Only blueprint.yaml moved — chart is not in the bootstrap kit
|
||||
# (e.g. an opt-in Application Blueprint like bp-velero, bp-vllm).
|
||||
msg="deploy(${CHART_NAME}): lockstep blueprint.yaml spec.version ${BP_PREV_VERSION} -> ${CHART_VERSION} (auto, Refs TBD-A20, #1856)"
|
||||
fi
|
||||
git commit -m "${msg}"
|
||||
|
||||
for i in 1 2 3; do
|
||||
if git push origin HEAD:main; then
|
||||
echo "Pushed lockstep bump for ${CHART_NAME}=${CHART_VERSION} (pin=${PIN_BUMPED}, blueprint=${BP_BUMPED})"
|
||||
exit 0
|
||||
fi
|
||||
echo "push attempt $i failed — re-fetching origin/main and re-applying lockstep"
|
||||
git fetch origin main
|
||||
git reset --hard origin/main
|
||||
|
||||
did_pin=0
|
||||
did_bp=0
|
||||
if [ "${PIN_BUMPED}" = "true" ] && [ -n "${PIN_FILE}" ]; then
|
||||
if rewrite_pin "${PIN_FILE}"; then
|
||||
git add "${PIN_FILE}"
|
||||
did_pin=1
|
||||
fi
|
||||
fi
|
||||
if [ "${BP_BUMPED}" = "true" ] && [ -n "${BP_FILE}" ]; then
|
||||
if rewrite_blueprint "${BP_FILE}"; then
|
||||
git add "${BP_FILE}"
|
||||
did_bp=1
|
||||
fi
|
||||
fi
|
||||
if [ "$did_pin" -eq 0 ] && [ "$did_bp" -eq 0 ]; then
|
||||
echo "origin/main already at ${CHART_VERSION} — nothing to push"
|
||||
exit 0
|
||||
fi
|
||||
if git diff --staged --quiet; then
|
||||
echo "no changes after re-fetch — already at ${CHART_VERSION} on origin/main"
|
||||
exit 0
|
||||
fi
|
||||
if [ "$did_pin" -eq 1 ] && [ "$did_bp" -eq 1 ]; then
|
||||
git commit -m "deploy(${CHART_NAME}): bump bootstrap-kit pin -> ${CHART_VERSION} + blueprint.yaml lockstep (auto, Refs TBD-A6 + TBD-A20, retry $i)"
|
||||
elif [ "$did_pin" -eq 1 ]; then
|
||||
git commit -m "deploy(${CHART_NAME}): bump bootstrap-kit pin -> ${CHART_VERSION} (auto, Refs TBD-A6, retry $i)"
|
||||
else
|
||||
git commit -m "deploy(${CHART_NAME}): lockstep blueprint.yaml spec.version -> ${CHART_VERSION} (auto, Refs TBD-A20, retry $i)"
|
||||
fi
|
||||
done
|
||||
echo "::error title=Lockstep push failed::3 attempts exhausted for ${CHART_NAME}=${CHART_VERSION}."
|
||||
exit 1
|
||||
|
||||
- name: Summary
|
||||
if: steps.chart.outputs.skip != 'true'
|
||||
run: |
|
||||
@ -875,15 +509,3 @@ Also locksteps platform blueprint.yaml spec.version ${BP_PREV_VERSION} -> ${CHAR
|
||||
echo "- **Cosigned:** ✓ (keyless via GitHub OIDC)" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "- **SBOM attested:** ✓ (SPDX-JSON)" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "- **Subchart guards:** ✓ working tree, ✓ packaged tgz, ✓ pulled OCI artifact, ✓ helm template smoke" >> "$GITHUB_STEP_SUMMARY"
|
||||
if [ "${{ steps.bump_pin.outputs.bumped }}" = "true" ]; then
|
||||
echo "- **Bootstrap-kit pin:** ✓ auto-bumped \`${{ steps.bump_pin.outputs.pin_file }}\` ${{ steps.bump_pin.outputs.prev_version }} → ${{ steps.chart.outputs.version }} (TBD-A6 meta-fix)" >> "$GITHUB_STEP_SUMMARY"
|
||||
else
|
||||
echo "- **Bootstrap-kit pin:** (chart is not in the kit — opt-in Application Blueprint, no pin to bump)" >> "$GITHUB_STEP_SUMMARY"
|
||||
fi
|
||||
if [ "${{ steps.bump_blueprint.outputs.bumped }}" = "true" ]; then
|
||||
echo "- **Blueprint.yaml lockstep:** ✓ auto-bumped \`${{ steps.bump_blueprint.outputs.bp_file }}\` spec.version ${{ steps.bump_blueprint.outputs.prev_version }} → ${{ steps.chart.outputs.version }} (TBD-A20, #1856)" >> "$GITHUB_STEP_SUMMARY"
|
||||
elif [ -n "${{ steps.bump_blueprint.outputs.bp_file }}" ]; then
|
||||
echo "- **Blueprint.yaml lockstep:** (already at ${{ steps.chart.outputs.version }}, no-op)" >> "$GITHUB_STEP_SUMMARY"
|
||||
else
|
||||
echo "- **Blueprint.yaml lockstep:** (chart has no platform/<bp>/blueprint.yaml — e.g. products/catalyst umbrella)" >> "$GITHUB_STEP_SUMMARY"
|
||||
fi
|
||||
|
||||
197
.github/workflows/build-bp-newapi.yaml
vendored
197
.github/workflows/build-bp-newapi.yaml
vendored
@ -1,197 +0,0 @@
|
||||
name: Build bp-newapi
|
||||
|
||||
# bp-newapi — Catalyst Blueprint wrapping the upstream NewAPI multi-tenant
|
||||
# LLM gateway (github.com/Calcium-Ion/new-api, MIT). Per
|
||||
# platform/newapi/chart/Chart.yaml the upstream ships a docker-compose
|
||||
# image only at `docker.io/calciumion/new-api:<UPSTREAM_VER>`. Per
|
||||
# docs/INVIOLABLE-PRINCIPLES.md #4a we never let production Sovereigns
|
||||
# pull from Docker Hub at runtime — every image must live in
|
||||
# ghcr.io/openova-io/* under a registry we own (no Docker Hub rate
|
||||
# limits, no upstream availability risk).
|
||||
#
|
||||
# This workflow mirrors the bp-guacamole pattern
|
||||
# (.github/workflows/build-bp-guacamole.yaml):
|
||||
# 1. Pulls `docker.io/calciumion/new-api:<UPSTREAM_VER>` from Docker Hub.
|
||||
# 2. Captures the upstream repo digest (sha256:...) so the GHCR tag
|
||||
# points at exactly the bytes we tested against, even if upstream
|
||||
# re-cuts the tag in the future.
|
||||
# 3. Re-tags + pushes into GHCR under
|
||||
# `ghcr.io/openova-io/openova/newapi-mirror:<UPSTREAM_VER>` so each
|
||||
# Sovereign pulls from a registry we own.
|
||||
# 4. Bumps platform/newapi/chart/values.yaml `newapi.image.tag` to the
|
||||
# mirrored tag.
|
||||
# 5. Bumps platform/newapi/chart/Chart.yaml `version` patch level + sets
|
||||
# `appVersion` to the upstream version + dispatches
|
||||
# blueprint-release.yaml so a fresh bp-newapi:<semver> OCI artifact
|
||||
# lands.
|
||||
#
|
||||
# Closes the gap surfaced by qa-loop bounded-cycle audit (prov #7, Gap F):
|
||||
# the chart referenced `ghcr.io/openova-io/openova/newapi-mirror:v0.4.5`
|
||||
# but no CI workflow ever built that image — the GHCR package didn't
|
||||
# exist and the Pod ImagePullBackOff'd on every fresh Sovereign, blocking
|
||||
# alice signup gate 5 (LLM).
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'platform/newapi/chart/**'
|
||||
- 'platform/newapi/blueprint.yaml'
|
||||
- '.github/workflows/build-bp-newapi.yaml'
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
upstream_version:
|
||||
description: 'Calcium-Ion/new-api upstream version (e.g. v0.13.2).'
|
||||
required: false
|
||||
default: 'v0.13.2'
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
NEWAPI_IMAGE: ghcr.io/openova-io/openova/newapi-mirror
|
||||
CHART_VALUES: platform/newapi/chart/values.yaml
|
||||
CHART_YAML: platform/newapi/chart/Chart.yaml
|
||||
# v0.13.2 is the latest stable (non-rc) Calcium-Ion/new-api release on
|
||||
# Docker Hub at the time this workflow was authored (2026-04-27 upstream
|
||||
# publish date). Bump in both this default AND in the chart Chart.yaml
|
||||
# `appVersion` when rolling forward. The v1.0.0-rc.x line is gated on
|
||||
# upstream stabilising the schema migration; do NOT auto-roll past
|
||||
# v0.13.x without re-running the channel-seed integration smoke against
|
||||
# NewAPI's `/api/channel/` admin shape (the seed Job uses the legacy
|
||||
# request body shape).
|
||||
DEFAULT_UPSTREAM_VERSION: 'v0.13.2'
|
||||
|
||||
jobs:
|
||||
mirror:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
packages: write
|
||||
actions: write
|
||||
outputs:
|
||||
upstream_version: ${{ steps.vars.outputs.upstream_version }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Resolve upstream version
  id: vars
  env:
    # The workflow_dispatch input is handed to the script through the
    # environment instead of being expanded into the script text with
    # `${{ }}`. An inline expansion is substituted BEFORE bash parses the
    # script, so a crafted input (quotes, `$(...)`, newlines) would run as
    # code. On push events the input is empty and the default applies.
    UPSTREAM_VERSION_INPUT: ${{ inputs.upstream_version }}
  run: |
    set -euo pipefail
    # Empty or unset input falls back to the workflow-level default.
    ver="${UPSTREAM_VERSION_INPUT:-${DEFAULT_UPSTREAM_VERSION}}"
    # The value is used verbatim as an image tag by later steps and is
    # written to $GITHUB_OUTPUT, so reject anything that is not a valid
    # tag (this also rules out embedded newlines that could forge extra
    # output keys).
    if ! [[ "${ver}" =~ ^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$ ]]; then
      echo "::error title=Invalid upstream version::upstream_version is not a valid image tag; refusing to continue."
      exit 1
    fi
    echo "upstream_version=${ver}" >> "$GITHUB_OUTPUT"
    echo "Mirroring Calcium-Ion/new-api ${ver} to GHCR"
|
||||
|
||||
- name: Login to GHCR
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Mirror calciumion/new-api → ghcr.io/openova-io/openova/newapi-mirror
|
||||
env:
|
||||
UPSTREAM_VER: ${{ steps.vars.outputs.upstream_version }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
src="docker.io/calciumion/new-api:${UPSTREAM_VER}"
|
||||
dst="${NEWAPI_IMAGE}:${UPSTREAM_VER}"
|
||||
docker pull "${src}"
|
||||
# Capture the upstream repo digest so the GHCR tag points at
|
||||
# exactly the bytes we tested against, even if upstream ever
|
||||
# re-cuts the tag. Stored in GITHUB_ENV for the step summary.
|
||||
UPSTREAM_DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' "${src}")
|
||||
echo "Upstream digest: ${UPSTREAM_DIGEST}"
|
||||
docker tag "${src}" "${dst}"
|
||||
docker tag "${src}" "${NEWAPI_IMAGE}:latest"
|
||||
docker push "${dst}"
|
||||
docker push "${NEWAPI_IMAGE}:latest"
|
||||
echo "NEWAPI_UPSTREAM_DIGEST=${UPSTREAM_DIGEST}" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Install yq
|
||||
run: |
|
||||
sudo wget -qO /usr/local/bin/yq \
|
||||
https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64
|
||||
sudo chmod +x /usr/local/bin/yq
|
||||
|
||||
- name: Bump image tag in values.yaml
|
||||
env:
|
||||
UPSTREAM_VER: ${{ steps.vars.outputs.upstream_version }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Set repository to GHCR mirror AND tag to the upstream version
|
||||
# we just mirrored. Repository write is idempotent (no-op once
|
||||
# values.yaml is already GHCR-pinned).
|
||||
yq eval -i ".newapi.image.repository = \"${NEWAPI_IMAGE}\"" "${CHART_VALUES}"
|
||||
yq eval -i ".newapi.image.tag = \"${UPSTREAM_VER}\"" "${CHART_VALUES}"
|
||||
echo "values.yaml after update:"
|
||||
yq eval '.newapi.image' "${CHART_VALUES}"
|
||||
|
||||
- name: Bump Chart.yaml patch version + appVersion
  env:
    UPSTREAM_VER: ${{ steps.vars.outputs.upstream_version }}
  run: |
    set -euo pipefail
    current=$(yq eval '.version' "${CHART_YAML}")
    # Only a plain MAJOR.MINOR.PATCH version can be patch-bumped
    # arithmetically. Anything else (pre-release suffix, build metadata,
    # missing component, `null` from a missing key) is rejected with an
    # explicit error instead of a cryptic arithmetic failure or a
    # silently wrong bump.
    if ! [[ "${current}" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then
      echo "::error title=Unparseable chart version::${CHART_YAML} version '${current}' is not plain MAJOR.MINOR.PATCH; refusing to bump."
      exit 1
    fi
    major="${BASH_REMATCH[1]}"
    minor="${BASH_REMATCH[2]}"
    patch="${BASH_REMATCH[3]}"
    # 10# forces base 10 so a patch written with a leading zero (e.g. 08)
    # is not parsed as octal by bash arithmetic.
    next="${major}.${minor}.$((10#${patch} + 1))"
    yq eval -i ".version = \"${next}\"" "${CHART_YAML}"
    # appVersion mirrors the upstream tag we just mirrored (strip
    # leading v: Helm convention is appVersion = upstream version
    # without the v prefix). Operators reading `helm list` see the
    # actual NewAPI release running in their Sovereign.
    app_ver="${UPSTREAM_VER#v}"
    yq eval -i ".appVersion = \"${app_ver}\"" "${CHART_YAML}"
    echo "Chart.yaml: version ${current} -> ${next}, appVersion -> ${app_ver}"
    # Exported for the commit message and the step summary in later steps.
    echo "CHART_NEW_VERSION=${next}" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Commit and push chart bump
  id: deploy_commit
  env:
    UPSTREAM_VER: ${{ steps.vars.outputs.upstream_version }}
  run: |
    set -euo pipefail
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"
    git add "${CHART_VALUES}" "${CHART_YAML}"
    # Nothing staged means values.yaml and Chart.yaml already match;
    # report pushed=false so the dispatch step below is skipped.
    if git diff --staged --quiet; then
      echo "No changes to commit"
      echo "pushed=false" >> "$GITHUB_OUTPUT"
      exit 0
    fi
    git commit -m "deploy: bump bp-newapi upstream ${UPSTREAM_VER} chart ${CHART_NEW_VERSION}"
    # Retry the push up to 3 times, rebasing onto the remote between
    # attempts. Success is tracked explicitly: previously, three failed
    # pushes still fell through to `pushed=true`, so blueprint-release
    # was dispatched for a commit that never reached the remote.
    pushed=false
    for i in 1 2 3; do
      if git push; then
        pushed=true
        break
      fi
      echo "push attempt ${i} failed — rebasing onto the remote branch and retrying"
      git pull --rebase
    done
    if [ "${pushed}" != "true" ]; then
      echo "::error title=Chart bump push failed::3 push attempts exhausted for bp-newapi chart ${CHART_NEW_VERSION}."
      exit 1
    fi
    echo "pushed=true" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Trigger blueprint-release for the chart bump
|
||||
if: steps.deploy_commit.outputs.pushed == 'true'
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
run: |
|
||||
gh workflow run blueprint-release.yaml \
|
||||
--repo "${{ github.repository }}" \
|
||||
--ref main \
|
||||
-f blueprint=newapi \
|
||||
-f tree=platform
|
||||
echo "blueprint-release dispatched for platform/newapi @ main"
|
||||
|
||||
- name: Summary
|
||||
env:
|
||||
UPSTREAM_VER: ${{ steps.vars.outputs.upstream_version }}
|
||||
run: |
|
||||
{
|
||||
echo "## bp-newapi mirror complete"
|
||||
echo ""
|
||||
echo "- Upstream: \`docker.io/calciumion/new-api:${UPSTREAM_VER}\`"
|
||||
echo "- Mirrored: \`${NEWAPI_IMAGE}:${UPSTREAM_VER}\`"
|
||||
echo "- Upstream digest: \`${NEWAPI_UPSTREAM_DIGEST:-unknown}\`"
|
||||
echo "- Chart bumped to: \`${CHART_NEW_VERSION:-unchanged}\`"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
@ -1,175 +0,0 @@
|
||||
name: Build openova-flow-adapter-flux
|
||||
|
||||
# openova-flow-adapter-flux — DaemonSet sidecar that watches Flux
|
||||
# HelmRelease CRs and POSTs FlowMessage envelopes to openova-flow-server.
|
||||
# Source at products/openova-flow/adapter-flux/, chart at
|
||||
# platform/openova-flow-emitter/chart/.
|
||||
#
|
||||
# Per docs/INVIOLABLE-PRINCIPLES.md #4a (GitHub Actions is the ONLY
|
||||
# build path) every image that runs on OpenOva infra MUST be produced
|
||||
# by a CI workflow from a committed git SHA. This workflow mirrors the
|
||||
# shape of build-application-controller.yaml — same Buildx push, same
|
||||
# cosign keyless signing, same auto-bump of values.yaml + dispatch of
|
||||
# blueprint-release for chart re-publish.
|
||||
#
|
||||
# Per `feedback_inviolable_principles.md` / global CLAUDE.md "every
|
||||
# workflow MUST be event-driven, NEVER scheduled". Triggers on
|
||||
# push-to-main (paths filter), pull_request (test only, no push), and
|
||||
# workflow_dispatch for manual re-runs without a code change.
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'products/openova-flow/adapter-flux/**'
|
||||
- 'platform/openova-flow-emitter/chart/**'
|
||||
- '.github/workflows/build-openova-flow-adapter-flux.yaml'
|
||||
branches: [main]
|
||||
pull_request:
|
||||
paths:
|
||||
- 'products/openova-flow/adapter-flux/**'
|
||||
- 'platform/openova-flow-emitter/chart/**'
|
||||
- '.github/workflows/build-openova-flow-adapter-flux.yaml'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE: ghcr.io/openova-io/openova/openova-flow-adapter-flux
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
packages: write
|
||||
id-token: write
|
||||
actions: write
|
||||
outputs:
|
||||
sha_short: ${{ steps.vars.outputs.sha_short }}
|
||||
digest: ${{ steps.build.outputs.digest }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set short SHA
|
||||
id: vars
|
||||
run: echo "sha_short=$(echo $GITHUB_SHA | head -c 7)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
cache-dependency-path: |
|
||||
products/openova-flow/adapter-flux/go.sum
|
||||
|
||||
- name: go vet
|
||||
working-directory: products/openova-flow/adapter-flux
|
||||
run: go vet ./...
|
||||
|
||||
- name: Run unit tests
|
||||
working-directory: products/openova-flow/adapter-flux
|
||||
run: go test -count=1 -race ./...
|
||||
|
||||
# On pull_request runs we stop here — image push requires
|
||||
# `packages: write` which only main-branch authors hold.
|
||||
- name: Login to GHCR
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Build and push image
|
||||
id: build
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: products/openova-flow/adapter-flux
|
||||
file: products/openova-flow/adapter-flux/Dockerfile
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.IMAGE }}:${{ steps.vars.outputs.sha_short }}
|
||||
${{ env.IMAGE }}:latest
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/openova-io/openova
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.title=openova-flow-adapter-flux
|
||||
org.opencontainers.image.description=OpenovaFlow Flux adapter — HelmRelease informer to FlowMessage emitter
|
||||
provenance: false
|
||||
sbom: false
|
||||
|
||||
- name: Install cosign
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: sigstore/cosign-installer@v3
|
||||
|
||||
- name: Sign image with cosign (keyless)
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign sign --yes "${IMAGE}@${DIGEST}"
|
||||
|
||||
- name: Generate and attest SBOM
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign attest --yes \
|
||||
--predicate <(echo '{"sbom":"in-toto-spdx attached at build time"}') \
|
||||
--type spdx \
|
||||
"${IMAGE}@${DIGEST}"
|
||||
|
||||
# Auto-bump the chart values.yaml tag. The adapter-flux image is
|
||||
# consumed by the bp-openova-flow-emitter chart (chart name is
|
||||
# "emitter", binary name is "adapter-flux" — chart wraps the
|
||||
# adapter as a DaemonSet emitter per ADR contract).
|
||||
- name: Bump flowEmitter.image.tag in chart values.yaml
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
env:
|
||||
SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
|
||||
run: |
|
||||
VALUES="platform/openova-flow-emitter/chart/values.yaml"
|
||||
awk -v sha="${SHA_SHORT}" '
|
||||
/^flowEmitter:/ { in_fe=1; print; next }
|
||||
in_fe && /^[a-zA-Z]/ && !/^flowEmitter:/ { in_fe=0 }
|
||||
in_fe && /^ image:/ { in_img=1; print; next }
|
||||
in_fe && /^ [a-zA-Z]/ && !/^ image:/ { in_img=0 }
|
||||
in_img && /^ tag:/ { sub(/:.*/, ": \"" sha "\""); in_img=0 }
|
||||
{ print }
|
||||
' "${VALUES}" > "${VALUES}.tmp" && mv "${VALUES}.tmp" "${VALUES}"
|
||||
echo "values.yaml after bump:"
|
||||
grep -A1 "^ image:" "${VALUES}" | head -6
|
||||
|
||||
- name: Commit and push values.yaml bump
  id: deploy_commit
  if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
  env:
    SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
  run: |
    # Commits the bumped values.yaml and pushes it to main. Sets the
    # step output `pushed` to "true" only when the bump commit was
    # pushed and to "false" when values.yaml was already up to date.
    set -euo pipefail
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"
    if git diff --quiet platform/openova-flow-emitter/chart/values.yaml; then
      echo "no values.yaml change — already pinned to ${SHA_SHORT}"
      echo "pushed=false" >> "$GITHUB_OUTPUT"
      exit 0
    fi
    git add platform/openova-flow-emitter/chart/values.yaml
    git commit -m "chore(deploy): bump openova-flow-adapter-flux image to ${SHA_SHORT} [skip ci]"
    # Rebase onto the latest main before pushing. A failed rebase must
    # stop the step: if it were ignored, HEAD could be left without the
    # bump commit, the push below could still succeed, and the step
    # would report pushed=true for a bump that was never published.
    if ! git pull --rebase --autostash origin main; then
      # No rebase is in progress when the fetch itself failed, so the
      # abort is allowed to fail.
      git rebase --abort || true
      echo "::error::could not rebase the values.yaml bump onto origin/main"
      exit 1
    fi
    git push origin HEAD:main
    echo "pushed=true" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Dispatch blueprint-release for chart re-publish
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main' && steps.deploy_commit.outputs.pushed == 'true'
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
gh workflow run blueprint-release.yaml \
|
||||
--repo "${GITHUB_REPOSITORY}" \
|
||||
--ref main \
|
||||
-f blueprint=openova-flow-emitter \
|
||||
-f tree=platform
|
||||
210
.github/workflows/build-openova-flow-server.yaml
vendored
210
.github/workflows/build-openova-flow-server.yaml
vendored
@ -1,210 +0,0 @@
|
||||
name: Build openova-flow-server
|
||||
|
||||
# openova-flow-server — stateless HTTP+SSE event router that drives the
|
||||
# OpenovaFlow timeline view in the Catalyst console. Source at
|
||||
# products/openova-flow/server/, chart at platform/openova-flow-server/chart/.
|
||||
#
|
||||
# Per docs/INVIOLABLE-PRINCIPLES.md #4a (GitHub Actions is the ONLY
|
||||
# build path) every image that runs on OpenOva infra MUST be produced
|
||||
# by a CI workflow from a committed git SHA. This workflow mirrors the
|
||||
# shape of build-application-controller.yaml — same Buildx push, same
|
||||
# cosign keyless signing, same auto-bump of values.yaml + dispatch of
|
||||
# blueprint-release for chart re-publish.
|
||||
#
|
||||
# Per `feedback_inviolable_principles.md` / global CLAUDE.md "every
|
||||
# workflow MUST be event-driven, NEVER scheduled". Triggers on
|
||||
# push-to-main (paths filter), pull_request (test only, no push), and
|
||||
# workflow_dispatch for manual re-runs without a code change.
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'products/openova-flow/server/**'
|
||||
- 'platform/openova-flow-server/chart/**'
|
||||
- '.github/workflows/build-openova-flow-server.yaml'
|
||||
branches: [main]
|
||||
pull_request:
|
||||
paths:
|
||||
- 'products/openova-flow/server/**'
|
||||
- 'platform/openova-flow-server/chart/**'
|
||||
- '.github/workflows/build-openova-flow-server.yaml'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE: ghcr.io/openova-io/openova/openova-flow-server
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# contents: write — the deploy step below pushes a values.yaml SHA
|
||||
# bump back to main so the bp-openova-flow-server chart picks up
|
||||
# the newly-built image without an operator manually editing the
|
||||
# file (per `feedback_no_mvp_no_workarounds.md` rule 1: target-state,
|
||||
# never "manual follow-up bump").
|
||||
contents: write
|
||||
packages: write
|
||||
# id-token write is required by cosign keyless signing (Sigstore).
|
||||
id-token: write
|
||||
# actions: write — required for `gh workflow run` to dispatch
|
||||
# blueprint-release after the deploy commit lands. Without it
|
||||
# the GITHUB_TOKEN gets HTTP 403 "Resource not accessible by
|
||||
# integration" and bp-openova-flow-server OCI artifact stays
|
||||
# stuck on the previous deploy's SHA (#712 / catalyst-build.yaml
|
||||
# incident replicated here for parity).
|
||||
actions: write
|
||||
outputs:
|
||||
sha_short: ${{ steps.vars.outputs.sha_short }}
|
||||
digest: ${{ steps.build.outputs.digest }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set short SHA
|
||||
id: vars
|
||||
run: echo "sha_short=$(echo $GITHUB_SHA | head -c 7)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
# server has no external deps (stdlib only) so no go.sum is
|
||||
# present in the tree — skip cache-dependency-path entirely.
|
||||
cache: false
|
||||
|
||||
- name: go vet
|
||||
working-directory: products/openova-flow/server
|
||||
run: go vet ./...
|
||||
|
||||
- name: Run unit tests
|
||||
working-directory: products/openova-flow/server
|
||||
run: go test -count=1 -race ./...
|
||||
|
||||
# On pull_request runs we stop here — image push requires
|
||||
# `packages: write` which only main-branch authors hold.
|
||||
- name: Login to GHCR
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Build and push image
|
||||
id: build
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
# Build context is the server module dir — its Dockerfile's
|
||||
# `COPY go.mod ./` / `COPY cmd ./cmd` paths are relative to
|
||||
# that dir, not the repo root.
|
||||
context: products/openova-flow/server
|
||||
file: products/openova-flow/server/Dockerfile
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.IMAGE }}:${{ steps.vars.outputs.sha_short }}
|
||||
${{ env.IMAGE }}:latest
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/openova-io/openova
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.title=openova-flow-server
|
||||
org.opencontainers.image.description=OpenovaFlow event router — HTTP ingest + SSE replay
|
||||
# provenance=false: containerd 1.7.x on k3s mis-resolves the
|
||||
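# provenance attestation manifest. SBOM attestation is left to the
|
||||
# cosign attest step below, which currently attests a fixed
|
||||
# placeholder predicate rather than a generated SPDX document.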
|
||||
provenance: false
|
||||
sbom: false
|
||||
|
||||
- name: Install cosign
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: sigstore/cosign-installer@v3
|
||||
|
||||
- name: Sign image with cosign (keyless)
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign sign --yes "${IMAGE}@${DIGEST}"
|
||||
|
||||
- name: Generate and attest SBOM
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign attest --yes \
|
||||
--predicate <(echo '{"sbom":"in-toto-spdx attached at build time"}') \
|
||||
--type spdx \
|
||||
"${IMAGE}@${DIGEST}"
|
||||
|
||||
# Auto-bump the chart values.yaml tag so the next Sovereign chart
|
||||
# rollout picks up this image without a manual edit. Per
|
||||
# `feedback_no_mvp_no_workarounds.md` rule 1 (target-state, no
|
||||
# operator-action gates) and `feedback_inviolable_principles.md`
|
||||
# (event-driven, never cron). Mirrors the awk pattern in
|
||||
# build-application-controller.yaml (under `controllers.application.tag`).
|
||||
- name: Bump flowServer.image.tag in chart values.yaml
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
env:
|
||||
SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
|
||||
run: |
|
||||
VALUES="platform/openova-flow-server/chart/values.yaml"
|
||||
# awk: find `flowServer:` (top-level key), then under it find
|
||||
# the nested ` image:` block, then update the next ` tag:`
|
||||
# line. Stops at the next top-level key so we don't bump a
|
||||
# sibling chart's tag.
|
||||
awk -v sha="${SHA_SHORT}" '
|
||||
/^flowServer:/ { in_fs=1; print; next }
|
||||
in_fs && /^[a-zA-Z]/ && !/^flowServer:/ { in_fs=0 }
|
||||
in_fs && /^ image:/ { in_img=1; print; next }
|
||||
in_fs && /^ [a-zA-Z]/ && !/^ image:/ { in_img=0 }
|
||||
in_img && /^ tag:/ { sub(/:.*/, ": \"" sha "\""); in_img=0 }
|
||||
{ print }
|
||||
' "${VALUES}" > "${VALUES}.tmp" && mv "${VALUES}.tmp" "${VALUES}"
|
||||
echo "values.yaml after bump:"
|
||||
grep -A1 "^ image:" "${VALUES}" | head -6
|
||||
|
||||
- name: Commit and push values.yaml bump
  id: deploy_commit
  if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
  env:
    SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
  run: |
    # Commits the bumped values.yaml and pushes it to main. Sets the
    # step output `pushed` to "true" only when the bump commit was
    # pushed and to "false" when values.yaml was already up to date.
    set -euo pipefail
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"
    if git diff --quiet platform/openova-flow-server/chart/values.yaml; then
      echo "no values.yaml change — already pinned to ${SHA_SHORT}"
      echo "pushed=false" >> "$GITHUB_OUTPUT"
      exit 0
    fi
    git add platform/openova-flow-server/chart/values.yaml
    # `[skip ci]` keeps blueprint-release from re-firing twice
    # (it is dispatched explicitly by the next step).
    git commit -m "chore(deploy): bump openova-flow-server image to ${SHA_SHORT} [skip ci]"
    # Pull-rebase to avoid races with parallel build commits. A failed
    # rebase must stop the step: if it were ignored, HEAD could be left
    # without the bump commit, the push below could still succeed, and
    # the step would report pushed=true for a bump that was never
    # published.
    if ! git pull --rebase --autostash origin main; then
      # No rebase is in progress when the fetch itself failed, so the
      # abort is allowed to fail.
      git rebase --abort || true
      echo "::error::could not rebase the values.yaml bump onto origin/main"
      exit 1
    fi
    git push origin HEAD:main
    echo "pushed=true" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# GitHub Actions does NOT trigger workflows from GITHUB_TOKEN bot
|
||||
# pushes by default (anti-recursion safeguard). The bot commit
|
||||
# above changes platform/openova-flow-server/chart/values.yaml
|
||||
# which would normally fire blueprint-release.yaml's path filter
|
||||
# — but bot pushes are silently filtered. Without this dispatch
|
||||
# the rebuilt image is NEVER baked into a new chart version, so
|
||||
# Sovereigns keep installing the previous chart with the previous
|
||||
# image tag (incident replicated from catalyst-build.yaml #712).
|
||||
- name: Dispatch blueprint-release for chart re-publish
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main' && steps.deploy_commit.outputs.pushed == 'true'
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
gh workflow run blueprint-release.yaml \
|
||||
--repo "${GITHUB_REPOSITORY}" \
|
||||
--ref main \
|
||||
-f blueprint=openova-flow-server \
|
||||
-f tree=platform
|
||||
190
.github/workflows/build-sandbox-controller.yaml
vendored
190
.github/workflows/build-sandbox-controller.yaml
vendored
@ -1,190 +0,0 @@
|
||||
name: Build sandbox-controller
|
||||
|
||||
# sandbox-controller — Wave 1 of the Sandbox product (PR #1622). Sister
|
||||
# of organization-controller / application-controller; watches Sandbox
|
||||
# CRs (sandbox.openova.io/v1) and reconciles per-Sandbox namespace +
|
||||
# RBAC + PVCs + placeholder tokens into the per-Org `catalyst-tenant`
|
||||
# Gitea repo. Per products/sandbox/docs/architecture.md §7.
|
||||
#
|
||||
# Per docs/INVIOLABLE-PRINCIPLES.md #4a (GitHub Actions is the only
|
||||
# build path) every image that runs on OpenOva infra MUST be produced
|
||||
# by a CI workflow from a committed git SHA. Shape mirrors
|
||||
# build-application-controller.yaml — same Buildx + cosign keyless
|
||||
# sign + SBOM attestation + auto-bump of the chart values.yaml so the
|
||||
# next Sovereign install picks up the SHA-pinned image without an
|
||||
# operator manually editing the file.
|
||||
#
|
||||
# Per `feedback_inviolable_principles.md`: event-driven only, NO cron.
|
||||
# Triggers on push-to-main with paths filter (so unrelated commits
|
||||
# don't burn CI minutes), pull_request for reviewers, and
|
||||
# workflow_dispatch for manual re-runs.
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'core/controllers/sandbox/**'
|
||||
- 'core/controllers/internal/**'
|
||||
- 'core/controllers/pkg/**'
|
||||
- 'core/controllers/go.mod'
|
||||
- 'core/controllers/go.sum'
|
||||
- '.github/workflows/build-sandbox-controller.yaml'
|
||||
branches: [main]
|
||||
pull_request:
|
||||
paths:
|
||||
- 'core/controllers/sandbox/**'
|
||||
- 'core/controllers/internal/**'
|
||||
- 'core/controllers/pkg/**'
|
||||
- 'core/controllers/go.mod'
|
||||
- 'core/controllers/go.sum'
|
||||
- '.github/workflows/build-sandbox-controller.yaml'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE: ghcr.io/openova-io/openova/sandbox-controller
|
||||
CHART_VALUES: platform/sandbox/chart/values.yaml
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# contents: write — the deploy step below pushes a values.yaml SHA
|
||||
# bump back to main so the platform/sandbox chart picks up the
|
||||
# newly-built image without an operator manually editing the file
|
||||
# (per `feedback_no_mvp_no_workarounds.md` rule 1: target-state,
|
||||
# never "manual follow-up bump").
|
||||
contents: write
|
||||
packages: write
|
||||
# id-token write is required by cosign keyless signing (Sigstore).
|
||||
id-token: write
|
||||
outputs:
|
||||
sha_short: ${{ steps.vars.outputs.sha_short }}
|
||||
digest: ${{ steps.build.outputs.digest }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set short SHA
|
||||
id: vars
|
||||
run: echo "sha_short=$(echo $GITHUB_SHA | head -c 7)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.23'
|
||||
cache-dependency-path: |
|
||||
core/controllers/go.sum
|
||||
|
||||
- name: go vet
|
||||
working-directory: core/controllers
|
||||
# Slice CC1 (#1095) consolidated the Group C controllers into a
|
||||
# single shared go.mod. Vet scoped to this controller's tree
|
||||
# plus the shared internal/ + pkg/ helpers it depends on.
|
||||
run: go vet ./sandbox/... ./internal/... ./pkg/...
|
||||
|
||||
- name: Run unit tests
|
||||
working-directory: core/controllers
|
||||
run: go test -count=1 -race ./sandbox/... ./internal/... ./pkg/...
|
||||
|
||||
# On pull_request runs we stop here — image push requires
|
||||
# `packages: write` which only main-branch authors hold.
|
||||
- name: Login to GHCR
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Build and push image
|
||||
id: build
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
# Build context is the repository root so the Dockerfile's
|
||||
# COPY paths can reach core/controllers/{go.mod,internal,pkg,
|
||||
# sandbox}/.
|
||||
context: .
|
||||
file: core/controllers/sandbox/Dockerfile
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.IMAGE }}:${{ steps.vars.outputs.sha_short }}
|
||||
${{ env.IMAGE }}:latest
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/openova-io/openova
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.title=sandbox-controller
|
||||
org.opencontainers.image.description=Reconciles Sandbox.sandbox.openova.io/v1 CRs into per-Org Gitea manifests (Wave 1 of #1615)
|
||||
# provenance=false: containerd 1.7.x on k3s mis-resolves the
|
||||
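# provenance attestation manifest. SBOM attestation is left to the
|
||||
# cosign attest step below, which currently attests a fixed
|
||||
# placeholder predicate rather than a generated SPDX document.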
|
||||
provenance: false
|
||||
sbom: false
|
||||
|
||||
- name: Install cosign
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: sigstore/cosign-installer@v3
|
||||
|
||||
- name: Sign image with cosign (keyless)
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign sign --yes "${IMAGE}@${DIGEST}"
|
||||
|
||||
- name: Generate and attest SBOM
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign attest --yes \
|
||||
--predicate <(echo '{"sbom":"in-toto-spdx attached at build time"}') \
|
||||
--type spdx \
|
||||
"${IMAGE}@${DIGEST}"
|
||||
|
||||
# Auto-bump the chart values.yaml tag so the next Sovereign chart
|
||||
# rollout picks up this image without a manual edit. Per
|
||||
# `feedback_no_mvp_no_workarounds.md` rule 1 (target-state, no
|
||||
# operator-action gates) and `feedback_inviolable_principles.md`
|
||||
# (event-driven, never cron). Unlike build-k8s-ws-proxy.yaml this
|
||||
# workflow does NOT bump Chart.yaml — the Sandbox chart's
|
||||
# publication cadence is gated by Wave-2 readiness, not per-image
|
||||
# builds. Operators flip `enabled: true` per Sovereign overlay.
|
||||
- name: Install yq
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
run: |
|
||||
sudo wget -qO /usr/local/bin/yq \
|
||||
https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64
|
||||
sudo chmod +x /usr/local/bin/yq
|
||||
|
||||
- name: Bump image.tag in values.yaml
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
env:
|
||||
SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
yq eval -i ".image.tag = \"${SHA_SHORT}\"" "${CHART_VALUES}"
|
||||
echo "values.yaml after bump:"
|
||||
yq eval '.image' "${CHART_VALUES}"
|
||||
|
||||
- name: Commit and push values.yaml bump
  if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
  env:
    SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
  run: |
    # Commits the bumped chart values file and pushes it to main; exits
    # early when the file is already pinned to this build's short SHA.
    set -euo pipefail
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"
    if git diff --quiet "${CHART_VALUES}"; then
      echo "no values.yaml change — already pinned to ${SHA_SHORT}"
      exit 0
    fi
    git add "${CHART_VALUES}"
    git commit -m "deploy: bump sandbox-controller image to ${SHA_SHORT}"
    # Pull-rebase to avoid races with parallel build commits. A failed
    # rebase must stop the step: if it were ignored, HEAD could be left
    # without the bump commit and the push below could still succeed
    # without publishing the bump.
    if ! git pull --rebase --autostash origin main; then
      # No rebase is in progress when the fetch itself failed, so the
      # abort is allowed to fail.
      git rebase --abort || true
      echo "::error::could not rebase the values.yaml bump onto origin/main"
      exit 1
    fi
    git push origin HEAD:main
|
||||
193
.github/workflows/build-sandbox-mcp-server.yaml
vendored
193
.github/workflows/build-sandbox-mcp-server.yaml
vendored
@ -1,193 +0,0 @@
|
||||
name: Build sandbox-mcp-server
|
||||
|
||||
# sandbox-mcp-server — Wave 2 of the Sandbox product (PR #1618). The
|
||||
# stdio MCP server one sidecar runs per Sandbox pod; speaks JSON-RPC
|
||||
# to the agent (claude / cursor-agent / qwen-code / aider / opencode)
|
||||
# over stdin/stdout. See products/sandbox/docs/architecture.md §3.
|
||||
#
|
||||
# Per docs/INVIOLABLE-PRINCIPLES.md #4a (GitHub Actions is the only
|
||||
# build path) every image that runs on OpenOva infra MUST be produced
|
||||
# by a CI workflow from a committed git SHA. Shape mirrors
|
||||
# build-sandbox-controller.yaml — same Buildx + cosign keyless sign +
|
||||
# SBOM attestation + auto-bump of the chart values.yaml so the next
|
||||
# Sovereign install picks up the SHA-pinned image without an operator
|
||||
# manually editing the file.
|
||||
#
|
||||
# Wave 8 / PR #1658 wired this module's go.mod to depend on
|
||||
# core/controllers + core/services/shared via `replace` directives
|
||||
# (canonical Gitea client + auth.Claims). The Dockerfile therefore
|
||||
# requires the repository ROOT as the build context, mirroring
|
||||
# build-sandbox-controller.yaml. Paths-filter widens to the dep trees
|
||||
# so a change to those sources re-triggers the build.
|
||||
#
|
||||
# Per `feedback_inviolable_principles.md`: event-driven only, NO cron.
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'products/sandbox/mcp-server/**'
|
||||
- 'core/controllers/**'
|
||||
- 'core/services/shared/**'
|
||||
- '.github/workflows/build-sandbox-mcp-server.yaml'
|
||||
branches: [main]
|
||||
pull_request:
|
||||
paths:
|
||||
- 'products/sandbox/mcp-server/**'
|
||||
- 'core/controllers/**'
|
||||
- 'core/services/shared/**'
|
||||
- '.github/workflows/build-sandbox-mcp-server.yaml'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE: ghcr.io/openova-io/openova/sandbox-mcp-server
|
||||
CHART_VALUES: platform/sandbox/chart/values.yaml
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# contents: write — the deploy step below pushes a values.yaml SHA
|
||||
# bump back to main so the platform/sandbox chart picks up the
|
||||
# newly-built image without an operator manually editing the file
|
||||
# (per `feedback_no_mvp_no_workarounds.md` rule 1: target-state,
|
||||
# never "manual follow-up bump").
|
||||
contents: write
|
||||
packages: write
|
||||
# id-token write is required by cosign keyless signing (Sigstore).
|
||||
id-token: write
|
||||
outputs:
|
||||
sha_short: ${{ steps.vars.outputs.sha_short }}
|
||||
digest: ${{ steps.build.outputs.digest }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set short SHA
|
||||
id: vars
|
||||
run: echo "sha_short=$(echo $GITHUB_SHA | head -c 7)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.23'
|
||||
cache-dependency-path: |
|
||||
products/sandbox/mcp-server/go.sum
|
||||
core/controllers/go.sum
|
||||
core/services/shared/go.sum
|
||||
|
||||
- name: go vet
|
||||
working-directory: products/sandbox/mcp-server
|
||||
run: go vet ./...
|
||||
|
||||
- name: Run unit tests
|
||||
working-directory: products/sandbox/mcp-server
|
||||
run: go test -count=1 -race ./...
|
||||
|
||||
# On pull_request runs we stop here — image push requires
|
||||
# `packages: write` which only main-branch authors hold.
|
||||
- name: Login to GHCR
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Build and push image
|
||||
id: build
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
# Build context is the repository root so the Dockerfile's
|
||||
# COPY paths can reach core/controllers + core/services/shared
|
||||
# (PR #1658 `replace` targets) alongside products/sandbox/
|
||||
# mcp-server. Mirrors build-sandbox-controller.yaml.
|
||||
context: .
|
||||
file: products/sandbox/mcp-server/Dockerfile
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.IMAGE }}:${{ steps.vars.outputs.sha_short }}
|
||||
${{ env.IMAGE }}:latest
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/openova-io/openova
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.title=sandbox-mcp-server
|
||||
org.opencontainers.image.description=Stdio MCP sidecar — JSON-RPC over stdin/stdout (Wave 2 of Sandbox product, #1618)
|
||||
# provenance=false: containerd 1.7.x on k3s mis-resolves the
|
||||
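# provenance attestation manifest. SBOM attestation is left to the
|
||||
# cosign attest step below, which currently attests a fixed
|
||||
# placeholder predicate rather than a generated SPDX document.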
|
||||
provenance: false
|
||||
sbom: false
|
||||
|
||||
- name: Install cosign
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: sigstore/cosign-installer@v3
|
||||
|
||||
- name: Sign image with cosign (keyless)
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign sign --yes "${IMAGE}@${DIGEST}"
|
||||
|
||||
- name: Generate and attest SBOM
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign attest --yes \
|
||||
--predicate <(echo '{"sbom":"in-toto-spdx attached at build time"}') \
|
||||
--type spdx \
|
||||
"${IMAGE}@${DIGEST}"
|
||||
|
||||
# Auto-bump the chart values.yaml runtime.mcpImage so the next
|
||||
# Sovereign chart rollout picks up this image without a manual
|
||||
# edit. Per `feedback_no_mvp_no_workarounds.md` rule 1
|
||||
# (target-state, no operator-action gates) and
|
||||
# `feedback_inviolable_principles.md` (event-driven, never cron).
|
||||
# The chart's deployment.yaml `required` guard fails-fast when
|
||||
# runtime.mcpImage is empty (Inviolable Principle #4a), so an
|
||||
# un-bumped values.yaml = un-deployable chart. Mirrors the
|
||||
# build-sandbox-controller.yaml auto-bump shape, just targeting a
|
||||
# different yq path and writing a fully-qualified `<repo>:<sha>`
|
||||
# string (the consumer reads runtime.mcpImage as a single image
|
||||
# reference, not a {repository,tag} pair).
|
||||
- name: Install yq
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
run: |
|
||||
sudo wget -qO /usr/local/bin/yq \
|
||||
https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64
|
||||
sudo chmod +x /usr/local/bin/yq
|
||||
|
||||
- name: Bump runtime.mcpImage in values.yaml
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
env:
|
||||
SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
yq eval -i ".runtime.mcpImage = \"${IMAGE}:${SHA_SHORT}\"" "${CHART_VALUES}"
|
||||
echo "values.yaml after bump:"
|
||||
yq eval '.runtime.mcpImage' "${CHART_VALUES}"
|
||||
|
||||
- name: Commit and push values.yaml bump
  if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
  env:
    SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
  run: |
    # Commits the bumped chart values file and pushes it to main; exits
    # early when the file is already pinned to this build's short SHA.
    set -euo pipefail
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"
    if git diff --quiet "${CHART_VALUES}"; then
      echo "no values.yaml change — already pinned to ${SHA_SHORT}"
      exit 0
    fi
    git add "${CHART_VALUES}"
    git commit -m "deploy: bump sandbox-mcp-server image to ${SHA_SHORT}"
    # Pull-rebase to avoid races with parallel build commits. A failed
    # rebase must stop the step: if it were ignored, HEAD could be left
    # without the bump commit and the push below could still succeed
    # without publishing the bump.
    if ! git pull --rebase --autostash origin main; then
      # No rebase is in progress when the fetch itself failed, so the
      # abort is allowed to fail.
      git rebase --abort || true
      echo "::error::could not rebase the values.yaml bump onto origin/main"
      exit 1
    fi
    git push origin HEAD:main
|
||||
184
.github/workflows/build-sandbox-pty-server.yaml
vendored
184
.github/workflows/build-sandbox-pty-server.yaml
vendored
@ -1,184 +0,0 @@
|
||||
name: Build sandbox-pty-server
|
||||
|
||||
# sandbox-pty-server — Wave 2 of the Sandbox product (PR #1618). The
|
||||
# in-pod HTTP+WS PTY shim (port 7681) that Wave 2's pty-server
|
||||
# StatefulSet runs alongside the agent process. See
|
||||
# products/sandbox/docs/architecture.md §2.
|
||||
#
|
||||
# Per docs/INVIOLABLE-PRINCIPLES.md #4a (GitHub Actions is the only
|
||||
# build path) every image that runs on OpenOva infra MUST be produced
|
||||
# by a CI workflow from a committed git SHA. Shape mirrors
|
||||
# build-sandbox-controller.yaml — same Buildx + cosign keyless sign +
|
||||
# SBOM attestation + auto-bump of the chart values.yaml so the next
|
||||
# Sovereign install picks up the SHA-pinned image without an operator
|
||||
# manually editing the file.
|
||||
#
|
||||
# Per `feedback_inviolable_principles.md`: event-driven only, NO cron.
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'products/sandbox/pty-server/**'
|
||||
- '.github/workflows/build-sandbox-pty-server.yaml'
|
||||
branches: [main]
|
||||
pull_request:
|
||||
paths:
|
||||
- 'products/sandbox/pty-server/**'
|
||||
- '.github/workflows/build-sandbox-pty-server.yaml'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
REGISTRY: ghcr.io
|
||||
IMAGE: ghcr.io/openova-io/openova/sandbox-pty-server
|
||||
CHART_VALUES: platform/sandbox/chart/values.yaml
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# contents: write — the deploy step below pushes a values.yaml SHA
|
||||
# bump back to main so the platform/sandbox chart picks up the
|
||||
# newly-built image without an operator manually editing the file
|
||||
# (per `feedback_no_mvp_no_workarounds.md` rule 1: target-state,
|
||||
# never "manual follow-up bump").
|
||||
contents: write
|
||||
packages: write
|
||||
# id-token write is required by cosign keyless signing (Sigstore).
|
||||
id-token: write
|
||||
outputs:
|
||||
sha_short: ${{ steps.vars.outputs.sha_short }}
|
||||
digest: ${{ steps.build.outputs.digest }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set short SHA
|
||||
id: vars
|
||||
run: echo "sha_short=$(echo $GITHUB_SHA | head -c 7)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
cache-dependency-path: |
|
||||
products/sandbox/pty-server/go.sum
|
||||
|
||||
- name: go vet
|
||||
working-directory: products/sandbox/pty-server
|
||||
run: go vet ./...
|
||||
|
||||
- name: Run unit tests
|
||||
working-directory: products/sandbox/pty-server
|
||||
# Empty `go test ./...` is harmless: prints "no test files" and
|
||||
# exits 0. Wave-2 follow-ups will add unit tests under
|
||||
# internal/session/.
|
||||
run: go test -count=1 -race ./...
|
||||
|
||||
# On pull_request runs we stop here — image push requires
|
||||
# `packages: write` which only main-branch authors hold.
|
||||
- name: Login to GHCR
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Build and push image
|
||||
id: build
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
# pty-server's Dockerfile uses `COPY . .` so the build context
|
||||
# is the pty-server directory itself (its own go.mod root —
|
||||
# NOT the repo root, unlike core/controllers which share a
|
||||
# parent go.mod). pty-server has no cross-tree `replace`
|
||||
# directives so a narrow context still resolves cleanly.
|
||||
context: products/sandbox/pty-server
|
||||
file: products/sandbox/pty-server/Dockerfile
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.IMAGE }}:${{ steps.vars.outputs.sha_short }}
|
||||
${{ env.IMAGE }}:latest
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/openova-io/openova
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.title=sandbox-pty-server
|
||||
org.opencontainers.image.description=In-pod HTTP+WS PTY shim (Wave 2 of Sandbox product, #1618)
|
||||
# provenance=false: containerd 1.7.x on k3s mis-resolves the
|
||||
# provenance attestation manifest. SBOM attestation handled
|
||||
# by the cosign attest step below.
|
||||
provenance: false
|
||||
sbom: false
|
||||
|
||||
- name: Install cosign
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: sigstore/cosign-installer@v3
|
||||
|
||||
- name: Sign image with cosign (keyless)
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign sign --yes "${IMAGE}@${DIGEST}"
|
||||
|
||||
- name: Generate and attest SBOM
|
||||
if: github.event_name != 'pull_request'
|
||||
env:
|
||||
DIGEST: ${{ steps.build.outputs.digest }}
|
||||
run: |
|
||||
cosign attest --yes \
|
||||
--predicate <(echo '{"sbom":"in-toto-spdx attached at build time"}') \
|
||||
--type spdx \
|
||||
"${IMAGE}@${DIGEST}"
|
||||
|
||||
# Auto-bump the chart values.yaml runtime.ptyServerImage so the
|
||||
# next Sovereign chart rollout picks up this image without a
|
||||
# manual edit. Per `feedback_no_mvp_no_workarounds.md` rule 1
|
||||
# (target-state, no operator-action gates) and
|
||||
# `feedback_inviolable_principles.md` (event-driven, never cron).
|
||||
# The chart's deployment.yaml `required` guard fails-fast when
|
||||
# runtime.ptyServerImage is empty (Inviolable Principle #4a), so
|
||||
# an un-bumped values.yaml = un-deployable chart. Mirrors the
|
||||
# build-sandbox-controller.yaml auto-bump shape, just targeting a
|
||||
# different yq path and writing a fully-qualified `<repo>:<sha>`
|
||||
# string (the consumer reads runtime.ptyServerImage as a single
|
||||
# image reference, not a {repository,tag} pair).
|
||||
- name: Install yq
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
run: |
|
||||
sudo wget -qO /usr/local/bin/yq \
|
||||
https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64
|
||||
sudo chmod +x /usr/local/bin/yq
|
||||
|
||||
- name: Bump runtime.ptyServerImage in values.yaml
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
env:
|
||||
SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
yq eval -i ".runtime.ptyServerImage = \"${IMAGE}:${SHA_SHORT}\"" "${CHART_VALUES}"
|
||||
echo "values.yaml after bump:"
|
||||
yq eval '.runtime.ptyServerImage' "${CHART_VALUES}"
|
||||
|
||||
- name: Commit and push values.yaml bump
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
env:
|
||||
SHA_SHORT: ${{ steps.vars.outputs.sha_short }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||
if git diff --quiet "${CHART_VALUES}"; then
|
||||
echo "no values.yaml change — already pinned to ${SHA_SHORT}"
|
||||
exit 0
|
||||
fi
|
||||
git add "${CHART_VALUES}"
|
||||
git commit -m "deploy: bump sandbox-pty-server image to ${SHA_SHORT}"
|
||||
# Pull-rebase to avoid races with parallel build commits.
|
||||
git pull --rebase --autostash origin main || true
|
||||
git push origin HEAD:main
|
||||
102
.github/workflows/catalyst-build.yaml
vendored
102
.github/workflows/catalyst-build.yaml
vendored
@ -308,108 +308,6 @@ jobs:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
# In-flight provisioning guard — t13/t17/t21 incident, 2026-05-17.
|
||||
#
|
||||
# The mothership catalyst-api Pod is single-replica and is rolled
|
||||
# by Flux whenever this workflow bumps the image SHA. The OpenTofu
|
||||
# workdir lives on a /tmp emptyDir that dies with the Pod, so any
|
||||
# in-flight `tofu apply` is killed mid-resource. The on-disk
|
||||
# deployment record is rewritten to status=failed on the new Pod's
|
||||
# restoreFromStore (deployments.go:413), but the Hetzner resources
|
||||
# tagged with the abandoned deployment-id remain orphans that
|
||||
# require manual `hcloud` cleanup. Three consecutive provs
|
||||
# (t13/t17/t21) died this way during 2026-05-17, each costing
|
||||
# ~15 minutes of provisioning time plus cleanup overhead.
|
||||
#
|
||||
# This step polls the public, read-only in-flight-count endpoint
|
||||
# on the mothership catalyst-api (added in this PR, served at
|
||||
# console.openova.io/api/v1/deployments/in-flight-count). The
|
||||
# endpoint counts ONLY Phase-0 in-flight statuses (pending /
|
||||
# provisioning / tofu-applying / flux-bootstrapping) — Phase-1 is
|
||||
# observational and resumes across Pod restarts, so it does not
|
||||
# block. When count==0 we proceed with the values.yaml bump.
|
||||
#
|
||||
# Timeout policy: cap at MAX_WAIT_SECONDS (default 30 minutes —
|
||||
# the upper bound on a healthy multi-region prov). If a prov is
|
||||
# still in flight after the cap, we proceed anyway and log a
|
||||
# WARNING. Blocking deploys indefinitely on a stuck prov would
|
||||
# mean an operator can never ship a fix for whatever is causing
|
||||
# the stuck prov (the worst possible failure mode for a CI gate).
|
||||
#
|
||||
# Endpoint outage policy: if the curl fails for any reason
|
||||
# (network blip, mothership down, endpoint not yet deployed on
|
||||
# the live SHA), we proceed with the bump after logging. Same
|
||||
# rationale — a broken gate must not block all future deploys.
|
||||
# First-time-rollout consideration: the endpoint does not exist
|
||||
# on the LIVE mothership until THIS PR's image lands, so the
|
||||
# first run after merge will fall through the "endpoint not
|
||||
# found" branch and proceed normally. Subsequent runs benefit
|
||||
# from the gate.
|
||||
- name: Wait for in-flight provisioning to drain
|
||||
env:
|
||||
# Override-able via repo variables/secrets if a different
|
||||
# mothership URL is in play (Sovereign chroot self-deploy,
|
||||
# staging, etc.). Default targets the production mothership.
|
||||
CATALYST_API_URL: ${{ vars.CATALYST_API_URL || 'https://console.openova.io' }}
|
||||
MAX_WAIT_SECONDS: '1800' # 30 min hard cap
|
||||
POLL_INTERVAL_SECONDS: '20'
|
||||
run: |
|
||||
set -u
|
||||
ENDPOINT="${CATALYST_API_URL%/}/api/v1/deployments/in-flight-count"
|
||||
echo "Polling ${ENDPOINT} every ${POLL_INTERVAL_SECONDS}s (cap ${MAX_WAIT_SECONDS}s)"
|
||||
|
||||
START=$(date +%s)
|
||||
ATTEMPT=0
|
||||
while : ; do
|
||||
ATTEMPT=$((ATTEMPT + 1))
|
||||
HTTP_CODE=$(curl -fsSL --max-time 10 -o /tmp/inflight.json -w '%{http_code}' \
|
||||
"${ENDPOINT}" 2>/dev/null || echo "000")
|
||||
|
||||
if [ "$HTTP_CODE" = "000" ]; then
|
||||
# Network failure (DNS, connect refused, timeout). Do NOT
|
||||
# block the deploy — fail-open per "broken gate must not
|
||||
# halt all deploys" rule above. Log + proceed.
|
||||
echo "WARN: ${ENDPOINT} unreachable on attempt ${ATTEMPT} (curl failed). Proceeding without gate."
|
||||
break
|
||||
fi
|
||||
|
||||
if [ "$HTTP_CODE" = "404" ]; then
|
||||
# First-rollout case — the endpoint is not yet present on
|
||||
# the LIVE catalyst-api. Once this PR merges, subsequent
|
||||
# runs will see the endpoint and start gating properly.
|
||||
echo "INFO: ${ENDPOINT} returned 404 — endpoint not yet deployed on live mothership. Proceeding (first-rollout fall-through)."
|
||||
break
|
||||
fi
|
||||
|
||||
if [ "$HTTP_CODE" != "200" ]; then
|
||||
# Any other non-200 status: log + proceed (fail-open).
|
||||
echo "WARN: ${ENDPOINT} returned HTTP ${HTTP_CODE} on attempt ${ATTEMPT}. Body:"
|
||||
cat /tmp/inflight.json 2>/dev/null || true
|
||||
echo
|
||||
echo "Proceeding without gate (fail-open)."
|
||||
break
|
||||
fi
|
||||
|
||||
COUNT=$(jq -r '.count // 0' /tmp/inflight.json 2>/dev/null || echo "0")
|
||||
IDS=$(jq -r '.ids // [] | join(",")' /tmp/inflight.json 2>/dev/null || echo "")
|
||||
|
||||
if [ "$COUNT" -eq 0 ] 2>/dev/null; then
|
||||
echo "OK: 0 deployments in-flight. Safe to bump catalyst-api image."
|
||||
break
|
||||
fi
|
||||
|
||||
ELAPSED=$(($(date +%s) - START))
|
||||
if [ "$ELAPSED" -ge "$MAX_WAIT_SECONDS" ]; then
|
||||
echo "WARN: ${COUNT} deployment(s) still in-flight after ${ELAPSED}s (cap ${MAX_WAIT_SECONDS}s)."
|
||||
echo "WARN: in-flight ids: ${IDS}"
|
||||
echo "WARN: proceeding with image bump anyway — stuck provs must not block all future deploys."
|
||||
break
|
||||
fi
|
||||
|
||||
echo "WAIT: attempt ${ATTEMPT} — ${COUNT} deployment(s) in-flight (ids: ${IDS}); elapsed=${ELAPSED}s. Sleeping ${POLL_INTERVAL_SECONDS}s."
|
||||
sleep "${POLL_INTERVAL_SECONDS}"
|
||||
done
|
||||
|
||||
- name: Update SHA tags in values.yaml and deployment manifests
|
||||
# The catalyst-ui and catalyst-api images are referenced in two places:
|
||||
#
|
||||
|
||||
47
.github/workflows/infra-hetzner-tofu.yaml
vendored
47
.github/workflows/infra-hetzner-tofu.yaml
vendored
@ -52,53 +52,6 @@ jobs:
|
||||
- name: tofu validate
|
||||
run: tofu validate
|
||||
|
||||
# Fix #111 (2026-05-10) — guard against the PR #1311 regression class.
|
||||
#
|
||||
# tftpl files (cloudinit-control-plane.tftpl, cloudinit-worker.tftpl,
|
||||
# any future *.tftpl) are consumed by tofu's `templatefile()` function
|
||||
# which parses ALL `${...}` interpolation sequences regardless of
|
||||
# YAML/HCL/shell context — including ones that appear inside YAML
|
||||
# comments. When a comment references a downstream shell-envsubst
|
||||
# variable like `${QA_FIXTURES_ENABLED:-false}`, tofu sees the colon
|
||||
# inside the interpolation and dies with:
|
||||
#
|
||||
# Error: Extra characters after interpolation expression;
|
||||
# Template interpolation doesn't expect a colon at this location.
|
||||
#
|
||||
# PR #1311 (Fix #73) shipped exactly this bug, broke `tofu plan`
|
||||
# immediately, and prov #9 (4204f0b0c5e37a80) wasted ~30 min before
|
||||
# PR #1328 caught and escaped the one offender. Without a CI guard,
|
||||
# the next operator who adds a similar comment will repeat the
|
||||
# incident.
|
||||
#
|
||||
# The fix is to use HCL's literal-dollar escape: `$$` → emits one
|
||||
# literal `$` from templatefile(), so `$${VAR:-default}` survives
|
||||
# tofu and reaches the cloud-init shell as `${VAR:-default}`.
|
||||
#
|
||||
# This grep scans every *.tftpl in infra/hetzner/ for any line that:
|
||||
# - starts with `#` (a comment) — leading whitespace optional
|
||||
# - contains a single-`$` `${UPPERCASE_VAR:-...}` interpolation —
|
||||
# the colon-in-interpolation shape that breaks tofu
|
||||
# and fails the build with an actionable error message. The regex
|
||||
# uses PCRE's negative lookbehind `(?<!\$)` so correctly-escaped
|
||||
# `$${VAR:-default}` (which expands to literal `${VAR:-default}` after
|
||||
# templatefile()) does NOT trip the guard. Code lines (non-comment)
|
||||
# that reference shell vars are caught at `tofu validate` time; this
|
||||
# guard plugs the comment-line gap that validate misses because
|
||||
# templatefile() doesn't actually run during `validate`.
|
||||
#
|
||||
# Ubuntu-latest runners ship GNU grep with PCRE (`-P`) enabled.
|
||||
- name: tftpl shell-expansion escape guard (Fix #111)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
violations=$(grep -rEnP '^\s*#.*(?<!\$)\$\{[A-Z_]+:-' *.tftpl || true)
|
||||
if [ -n "$violations" ]; then
|
||||
echo "::error title=Unescaped tftpl shell expansion::Use \$\${VAR:-default} (double-dollar) in tftpl YAML comments — bare \${VAR:-default} is consumed by tofu's templatefile() and breaks 'tofu plan' with 'Template interpolation doesn't expect a colon at this location' (see PR #1311 / PR #1328 / Fix #111)."
|
||||
echo "Offending lines:"
|
||||
echo "$violations"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: tofu test (offline — mock_provider + override_resource)
|
||||
# The module's tests/multi_region.tftest.hcl exercises the
|
||||
# multi-region wiring shape WITHOUT touching real Hetzner.
|
||||
|
||||
55
.github/workflows/test-bootstrap-kit.yaml
vendored
55
.github/workflows/test-bootstrap-kit.yaml
vendored
@ -12,10 +12,8 @@ on:
|
||||
- 'tests/e2e/bootstrap-kit/**'
|
||||
- 'platform/**/blueprint.yaml'
|
||||
- 'platform/**/chart/**'
|
||||
- 'products/**/chart/**'
|
||||
- 'clusters/**'
|
||||
- 'scripts/check-bootstrap-deps.sh'
|
||||
- 'scripts/check-bootstrap-kit-pin-sync.sh'
|
||||
- 'scripts/expected-bootstrap-deps.yaml'
|
||||
- '.github/workflows/test-bootstrap-kit.yaml'
|
||||
branches: [main]
|
||||
@ -24,10 +22,8 @@ on:
|
||||
- 'tests/e2e/bootstrap-kit/**'
|
||||
- 'platform/**/blueprint.yaml'
|
||||
- 'platform/**/chart/**'
|
||||
- 'products/**/chart/**'
|
||||
- 'clusters/**'
|
||||
- 'scripts/check-bootstrap-deps.sh'
|
||||
- 'scripts/check-bootstrap-kit-pin-sync.sh'
|
||||
- 'scripts/expected-bootstrap-deps.yaml'
|
||||
- '.github/workflows/test-bootstrap-kit.yaml'
|
||||
workflow_dispatch:
|
||||
@ -55,57 +51,6 @@ jobs:
|
||||
- name: Run bootstrap-kit dependency audit
|
||||
run: bash scripts/check-bootstrap-deps.sh
|
||||
|
||||
pin-sync-audit:
|
||||
# TBD-A6 regression test. Asserts every Chart.yaml in platform/* or
|
||||
# products/* whose chart is pinned in clusters/_template/bootstrap-
|
||||
# kit/ has the SAME version on both sides.
|
||||
#
|
||||
# On `pull_request` we use --changed-only --base <base-sha> so a PR
|
||||
# is only blocked on chart→pin pairs IT modified. This keeps the
|
||||
# gate effective (every new chart bump must update the pin) without
|
||||
# forcing pre-existing drifts (13 charts as of 2026-05-18) to be
|
||||
# fixed before any unrelated PR can land. The auto-bump hook in
|
||||
# blueprint-release.yaml will heal those drifts on the next bump
|
||||
# of each lagging chart.
|
||||
#
|
||||
# On `push` to main and `workflow_dispatch` we run the FULL sweep
|
||||
# so post-merge drift is observable on the run summary even if the
|
||||
# PR gate let it through.
|
||||
#
|
||||
# TBD-A17 mitigation (#1849, 2026-05-18): the full sweep on `push`
|
||||
# to main races with the blueprint-release auto-bump hook. When a
|
||||
# PR bumps a Chart.yaml version, the merge commit (which is what
|
||||
# this push event sees) does NOT yet contain the matching
|
||||
# bootstrap-kit pin bump — the auto-bump hook runs in a DIFFERENT
|
||||
# workflow (blueprint-release.yaml) and pushes the pin bump as a
|
||||
# follow-up bot commit, which (per GITHUB_TOKEN convention) does
|
||||
# NOT retrigger this workflow. So the FIRST run on every chart-
|
||||
# bumping merge sees `chart=N pin=N-1` drift and would block.
|
||||
# The actual desired-state is that the follow-up bot commit heals
|
||||
# the drift within ~60s. Push-mode is therefore observational, not
|
||||
# blocking; we use `continue-on-error: true` so the workflow stays
|
||||
# green while the drift is still visible on the run summary.
|
||||
runs-on: ubuntu-latest
|
||||
continue-on-error: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
# Need history back to the PR base for the --changed-only diff.
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Run pin-sync audit (changed-only on PR, full sweep otherwise)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ "${{ github.event_name }}" = "pull_request" ]; then
|
||||
base="${{ github.event.pull_request.base.sha }}"
|
||||
echo "Running --changed-only against base ${base}"
|
||||
bash scripts/check-bootstrap-kit-pin-sync.sh --changed-only --base "${base}"
|
||||
else
|
||||
echo "Running full sweep (event=${{ github.event_name }})"
|
||||
bash scripts/check-bootstrap-kit-pin-sync.sh
|
||||
fi
|
||||
|
||||
manifest-validation:
|
||||
# Static-only validation: blueprint.yaml + chart Chart.yaml + clusters/_template
|
||||
# parsing + dependency order check. Runs on every push.
|
||||
|
||||
@ -29,8 +29,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-cilium
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "01"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: cilium
|
||||
@ -38,32 +36,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-cilium
|
||||
# 1.3.4 (prov #55, 2026-05-12): flip kubeProxyReplacement false→true
|
||||
# in chart defaults so the BPF masquerade datapath (bpf.masquerade:
|
||||
# true, already on by default) gets the NodePort it needs at startup.
|
||||
# Worker cilium-agent on prov 8d85a64cb8807cdc crashloop'd with
|
||||
# "BPF masquerade requires NodePort" → node.cilium.io/agent-not-ready
|
||||
# taint persisted → every post-install Job pod (keycloak-config-cli,
|
||||
# powerdns, mimir, openbao) stayed Pending → bootstrap-kit chain
|
||||
# stalled. Aligns with the cloud-init pre-Flux Cilium install which
|
||||
# already used kubeProxyReplacement: true.
|
||||
# 1.3.3 (qa-loop iter-16 Fix #70): Hubble UI HTTPRoute defaults
|
||||
# corrected — gatewayRef.namespace=kube-system (was the stale
|
||||
# cilium-gateway), serviceRef.namespace=kube-system (was the stale
|
||||
# cilium), plus chart auto-derives hubble.<sovereignFQDN> when only
|
||||
# SOVEREIGN_FQDN is provided. Combined with the bootstrap-kit
|
||||
# default flip below (HUBBLE_ENABLED=true, hubble.relay/ui enabled,
|
||||
# SOVEREIGN_FQDN forwarded), every Sovereign exposes Hubble UI at
|
||||
# https://hubble.<sovereignFQDN>/ out of the box. TC-289 NXDOMAIN
|
||||
# is closed because external-dns now sees the HTTPRoute hostname
|
||||
# and writes the A record into PowerDNS.
|
||||
# 1.3.1 (qa-loop iter-12 Fix #54 Workstream 2): bpf.preallocateMaps=true
|
||||
# + socketLB.hostNamespaceOnly=true defaults so fresh worker pods can
|
||||
# resolve DNS reliably on first-join (cilium/cilium#28456 mitigation).
|
||||
# 1.3.0 (qa-loop iter-12 Fix #53C+D): adds the Hubble UI HTTPRoute
|
||||
# overlay (slice H7 #1095) that the catalystOverlay.hubbleUI block
|
||||
# below depends on.
|
||||
version: 1.3.5
|
||||
version: 1.2.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-cilium
|
||||
@ -73,30 +46,15 @@ spec:
|
||||
# SAME chart installs — legitimate slow-Ready). Replaces blanket
|
||||
# spec.timeout: 15m band-aid from PR #221.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
values:
|
||||
cilium:
|
||||
# Multi-region (operator mandate 2026-05-12) — each region's k3s
|
||||
# is an INDEPENDENT cluster per NAMING-CONVENTION §1.3, so each
|
||||
# region's cilium MUST talk to its OWN local CP, not the primary's
|
||||
# 10.0.1.2. Flux postBuild.substitute in
|
||||
# cloudinit-control-plane.tftpl renders CILIUM_K8S_SERVICE_HOST to
|
||||
# the local CP's private IP per region (10.0.1.2 for primary,
|
||||
# 10.0.<10+idx>.2 for secondaries — see main.tf:267
|
||||
# secondary_region_cp_ips). Without this, secondary regions'
|
||||
# cilium-operator crash-loops with x509 unknown authority (the
|
||||
# primary's CA doesn't sign the secondary cluster's API cert).
|
||||
# The :=10.0.1.2 fallback preserves single-region (primary-only)
|
||||
# provisions where the substitute var would be empty/absent.
|
||||
k8sServiceHost: ${CILIUM_K8S_SERVICE_HOST:=10.0.1.2}
|
||||
# Phase-8a bug #15 (otech8 deployment 1bfc46347564467b 2026-05-01):
|
||||
# cilium-agent waits forever for the operator to register
|
||||
# ciliumenvoyconfigs + ciliumclusterwideenvoyconfigs CRDs.
|
||||
@ -115,36 +73,13 @@ spec:
|
||||
enabled: false
|
||||
hubble:
|
||||
metrics:
|
||||
# `null` (NOT [] and NOT a populated list) suppresses the
|
||||
# upstream chart's metrics ServiceMonitor render. Hubble flow
|
||||
# collection still works for Hubble Relay/UI without a
|
||||
# ServiceMonitor — that pulls the kube-prometheus-stack CRDs
|
||||
# which do not exist on a fresh Sovereign at bp-cilium install
|
||||
# time. Operators flip metrics on once
|
||||
# bp-kube-prometheus-stack is reconciled (issue #182).
|
||||
enabled: null
|
||||
serviceMonitor:
|
||||
enabled: false
|
||||
# qa-loop iter-16 Fix #70: default Hubble UI ON for every
|
||||
# Sovereign so TC-289 (https://hubble.<fqdn>/) resolves out of
|
||||
# the box. Hubble flow telemetry is the canonical L3-L7 visibility
|
||||
# surface for the Catalyst control plane (EPIC-5 #1100); shipping
|
||||
# it default-OFF made every Sovereign blind by default and
|
||||
# required a per-Sovereign overlay touch that nobody remembered
|
||||
# to wire. Per-Sovereign overlay can still set HUBBLE_ENABLED=false
|
||||
# for fully air-gapped lab Sovereigns where Relay traffic to
|
||||
# cilium-agents is not desired.
|
||||
relay:
|
||||
enabled: ${HUBBLE_ENABLED:=true}
|
||||
enabled: false
|
||||
ui:
|
||||
enabled: ${HUBBLE_ENABLED:=true}
|
||||
|
||||
# qa-loop iter-12 Fix #53C: BGP control plane (default off, opt-in
|
||||
# via BGP_ENABLED=true). Per ADR-0001 §9 the BGP control plane is the
|
||||
# canonical path for Sovereign-to-customer-router prefix advertisement
|
||||
# (LoadBalancer VIPs, Pod CIDRs to customer's existing core network).
|
||||
bgpControlPlane:
|
||||
enabled: ${BGP_ENABLED:=false}
|
||||
enabled: false
|
||||
|
||||
# ── Cilium ClusterMesh — multi-region peering ──────────────────
|
||||
#
|
||||
@ -179,100 +114,8 @@ spec:
|
||||
useAPIServer: true
|
||||
apiserver:
|
||||
service:
|
||||
# 2026-05-15: default flipped NodePort → LoadBalancer per DoD A3
|
||||
# (docs/SOVEREIGN-MULTI-REGION-DOD.md). Founder ruling:
|
||||
# "ClusterMesh apiserver Service = LoadBalancer (NEVER NodePort)".
|
||||
#
|
||||
# On Hetzner, hcloud-ccm allocates a public-IPv4 LB per peer
|
||||
# region; AutoEstablishClusterMesh (handler/clustermesh.go,
|
||||
# PR #1508) hard-fails on type != LoadBalancer and reads the
|
||||
# LB ingress IP for the peer endpoint. Cilium WG node
|
||||
# encryption secures the LB→node→pod path end-to-end.
|
||||
#
|
||||
# ${CLUSTERMESH_SERVICE_TYPE:=LoadBalancer} keeps the
|
||||
# operator escape hatch (e.g. bare-metal Sovereigns with
|
||||
# MetalLB or non-cloud peers can override to NodePort) but
|
||||
# the cloud-Hetzner default is now A3-compliant out of the
|
||||
# box.
|
||||
type: ${CLUSTERMESH_SERVICE_TYPE:=LoadBalancer}
|
||||
# Hetzner CCM requires location OR network-zone annotation
|
||||
# to allocate the LB. ${HCLOUD_LB_LOCATION} flows from the
|
||||
# bootstrap-kit Kustomization substitute, set by the
|
||||
# cloud-init template for EVERY region (primary CP renders
|
||||
# var.region; secondary CPs render each.value.cloudRegion).
|
||||
# No default fallback: a missing substitute is a tofu
|
||||
# rendering bug, not a runtime fallback opportunity. The
|
||||
# previous `:=hel1` default silently masked the 2026-05-16
|
||||
# multi-region rendering regression (t114-omani-works
|
||||
# primary=hel1 — fallback APPEARED correct but every
|
||||
# secondary also rendered hel1; an explicit empty render
|
||||
# would have failed cilium chart admission and surfaced
|
||||
# the bug at provision time instead of at clustermesh-
|
||||
# apiserver LB allocation time).
|
||||
annotations:
|
||||
load-balancer.hetzner.cloud/location: "${HCLOUD_LB_LOCATION}"
|
||||
load-balancer.hetzner.cloud/type: "lb11"
|
||||
# use-private-ip: false — LB→backend connection transits
|
||||
# the PUBLIC IP. PR #1537 had set this to "true" attempting
|
||||
# to bypass the firewall NodePort block; that approach was
|
||||
# NOT viable because the per-region Hetzner LB has no
|
||||
# private-network attachment by default. CCM rejected:
|
||||
# "ReconcileHCLBTargets: use private ip: missing network id"
|
||||
# → LB never allocated → clustermesh apiserver Service
|
||||
# stayed `<pending>` → clustermesh orchestrator waited 5min
|
||||
# for LB IP then bailed with empty peerEntries.
|
||||
#
|
||||
# PR #1538's canonical fix opens TCP 30000-32767 in the
|
||||
# Hetzner firewall so the public-IP LB health checks pass.
|
||||
# This file reverts to use-private-ip=false to align with
|
||||
# that approach. Caught on t130 (30463cd0a5a931be, 2026-05-16).
|
||||
load-balancer.hetzner.cloud/use-private-ip: "false"
|
||||
# 2026-05-16: per-region LB name suffix. Without
|
||||
# ${SOVEREIGN_REGION_KEY} interpolated, all 3 regions'
|
||||
# clustermesh-apiserver Services adopted the FIRST LB
|
||||
# CCM-created (Hetzner LBs are unique by name; second
|
||||
# creation just reuses the first). Caught on t121
|
||||
# (48d8fe77...): primary + nbg1 both reported external_ip
|
||||
# 167.233.14.208 (nbg1 LB), sin stayed <pending>.
|
||||
load-balancer.hetzner.cloud/name: "${SOVEREIGN_FQDN_SLUG:=catalyst}-${SOVEREIGN_REGION_KEY:=primary}-clustermesh"
|
||||
|
||||
# ── Catalyst overlay templates (chart/templates/) ────────────────────
|
||||
# qa-loop iter-16 Fix #70: Hubble UI HTTPRoute now defaults ON for
|
||||
# every Sovereign. The chart auto-derives hostname `hubble.${SOVEREIGN_FQDN}`
|
||||
# so the operator only needs the SOVEREIGN_FQDN substitute (already
|
||||
# mandatory for every Sovereign — see clusters/_template/bootstrap-kit/
|
||||
# 13-bp-catalyst-platform.yaml `host: console.${SOVEREIGN_FQDN}`).
|
||||
# Per-Sovereign overlay can still:
|
||||
# - HUBBLE_ENABLED=false → disable Hubble UI on this Sovereign
|
||||
# - HUBBLE_HOSTNAME=... → override the auto-derived hostname
|
||||
# - HUBBLE_AUTH=oidc → enable OIDC enforcement once the
|
||||
# Keycloak realm wires the hubble-ui client
|
||||
catalystOverlay:
|
||||
hubbleUI:
|
||||
enabled: ${HUBBLE_ENABLED:=true}
|
||||
# Explicit override; empty triggers the chart to derive
|
||||
# `hubble.${SOVEREIGN_FQDN}` from sovereignFQDN below.
|
||||
hostname: ${HUBBLE_HOSTNAME:=}
|
||||
sovereignFQDN: ${SOVEREIGN_FQDN}
|
||||
gatewayRef:
|
||||
# The Sovereign Gateway lives in kube-system — installed by
|
||||
# clusters/_template/sovereign-tls/cilium-gateway.yaml. Every
|
||||
# other bootstrap-kit HTTPRoute (gitea, auth, grafana, harbor,
|
||||
# openbao, powerdns, console/catalyst-platform) attaches to
|
||||
# cilium-gateway/kube-system; this overlay matches.
|
||||
name: cilium-gateway
|
||||
namespace: kube-system
|
||||
# `none` until the Keycloak `hubble-ui` OIDC client is wired by
|
||||
# bp-keycloak realm-config; flip to `oidc` per per-Sovereign
|
||||
# overlay once that lands. Until then Hubble UI is publicly
|
||||
# reachable — acceptable for the in-progress qa-loop iter-16
|
||||
# observability slice; lock down before production handover via
|
||||
# HUBBLE_AUTH=oidc.
|
||||
auth: ${HUBBLE_AUTH:=none}
|
||||
serviceRef:
|
||||
name: hubble-ui
|
||||
namespace: kube-system
|
||||
port: 80
|
||||
type: NodePort
|
||||
nodePort: 32379
|
||||
---
|
||||
# ─── Per-Sovereign Gateway API resources (issue #387) ────────────────────
|
||||
#
|
||||
|
||||
@ -48,8 +48,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-gateway-api
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "01a"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: gateway-api
|
||||
@ -73,12 +71,10 @@ spec:
|
||||
# `dependsOn: bp-gateway-api` so Flux gates them on this release's
|
||||
# Ready condition.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -29,8 +29,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-cert-manager
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "02"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: cert-manager
|
||||
@ -40,10 +38,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-cert-manager
|
||||
# 1.2.1 (Fix #158): crdGate hook image switched from
|
||||
# bitnami/kubectl:1.30.4 (deleted from Docker Hub 2025-08) to
|
||||
# bitnamilegacy/kubectl:1.30.7.
|
||||
version: 1.2.2
|
||||
version: 1.1.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-cert-manager
|
||||
@ -54,12 +49,10 @@ spec:
|
||||
# Helm install completes when manifests apply; subsequent dependsOn
|
||||
# checks Ready=True independently. Replaces PR #221 spec.timeout: 15m.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -50,8 +50,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-flux
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "03"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: flux
|
||||
@ -61,10 +59,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-flux
|
||||
# 1.2.1 (Fix #158): stuckHelmReleaseRecovery image switched from
|
||||
# bitnami/kubectl:1.31 (deleted from Docker Hub 2025-08) to
|
||||
# bitnamilegacy/kubectl:1.31.4. (Catches up from 1.1.3 → 1.2.1.)
|
||||
version: 1.2.2
|
||||
version: 1.1.3
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-flux
|
||||
@ -74,7 +69,6 @@ spec:
|
||||
# a target of the chart, so blocking on Ready=True is structurally
|
||||
# impossible. disableWait avoids the deadlock. Replaces PR #221 timeout.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
# Adopt cloud-init-installed Flux objects rather than fail on
|
||||
# ownership conflict (the objects exist before the HelmRelease ever
|
||||
@ -83,7 +77,6 @@ spec:
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
# Keep operator-supplied values (e.g. resource overrides applied via
|
||||
# helm-controller out-of-band, or dry-run patches during incident
|
||||
|
||||
@ -29,8 +29,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-crossplane
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "04"
|
||||
spec:
|
||||
interval: 15m
|
||||
timeout: 15m
|
||||
@ -41,16 +39,14 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-crossplane
|
||||
version: 1.1.4
|
||||
version: 1.1.3
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-crossplane
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -25,8 +25,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-sealed-secrets
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "05"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: sealed-secrets
|
||||
@ -36,7 +34,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-sealed-secrets
|
||||
version: 1.1.2
|
||||
version: 1.1.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-sealed-secrets
|
||||
@ -44,12 +42,10 @@ spec:
|
||||
# Event-driven install: single-replica controller + CRD; install
|
||||
# completes when manifests apply. Replaces PR #221 spec.timeout: 15m.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -37,8 +37,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-reflector
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "05a"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: reflector
|
||||
@ -57,12 +55,10 @@ spec:
|
||||
# when manifests apply. disableWait per architecture convention —
|
||||
# replaces blanket spec.timeout band-aid.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -52,13 +52,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-self-sovereign-cutover
|
||||
namespace: flux-system
|
||||
labels:
|
||||
# slot drives the openova-flow adapter's Phase derivation
|
||||
# (clusters/_template/bootstrap-kit/<NN>-...yaml encodes the
|
||||
# install order). component=cutover overrides the slot rule so
|
||||
# this HR lands in Phase 2 (Cutover) on the canvas, NOT Phase 1.
|
||||
catalyst.openova.io/slot: "06a"
|
||||
catalyst.openova.io/component: cutover
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: self-sovereign-cutover
|
||||
@ -211,97 +204,17 @@ spec:
|
||||
# commits, and pushes. Subsequent reconciles see local Harbor
|
||||
# as steady-state. Image bumped to alpine/k8s:1.31.4 (kubectl
|
||||
# + git in one image; verified live on otech116).
|
||||
# 0.1.24: Step-06 phase-0 ghcr-pull harbor.<sov-fqdn> auth merge
|
||||
# (#1184, bounded-cycle backfill). Once 0.1.20's phase-1 pivots
|
||||
# HelmRepository URLs to oci://harbor.<sov-fqdn>/openova-io,
|
||||
# source-controller hits a 401 on every pull because the
|
||||
# ghcr-pull Secret only carries auth for ghcr.io and
|
||||
# harbor.openova.io (cloud-init writes those two; harbor.<sov
|
||||
# -fqdn> is a per-Sovereign coordinate that doesn't exist at
|
||||
# bake time). Manually fixed on omantel 2026-05-10 (session
|
||||
# 5c468708) — bp-guacamole / bp-netbird / bp-dmz-vcluster all
|
||||
# stuck Reconciling until `kubectl patch secret ghcr-pull` was
|
||||
# run by hand. 0.1.24 codifies that patch as Phase-0 of Step-06
|
||||
# so the next fresh `tofu apply` comes up GREEN with zero
|
||||
# manual intervention. Idempotent (no-op when entry already
|
||||
# matches), reads HARBOR_PASSWORD from the harbor-admin Secret
|
||||
# already mirrored into `catalyst` ns by bp-harbor 1.2.14+.
|
||||
# Adds `secrets: [update,patch]` to the runner ClusterRole.
|
||||
# 0.1.25: Step-06 Phase-0 probe brittleness fix (qa-loop bounded-
|
||||
# cycle Wave 5 Fix #77, Gap A). 0.1.24 used kubectl jsonpath
|
||||
# `{.data['.dockerconfigjson']}` which silently returns EMPTY
|
||||
# because kubectl interprets the leading dot inside the bracket
|
||||
# as a child accessor (escaping it as `\.dockerconfigjson` would work
|
||||
# but is a footgun). Caught live on omantel prov #7 2026-05-10:
|
||||
# `cutover-helmrepository-patches` Job FAILED 4× with
|
||||
# `FATAL: ghcr-pull Secret has no .dockerconfigjson key —
|
||||
# cloud-init did not run?` despite `kubectl get secret -o yaml`
|
||||
# showing the key present. 0.1.25 replaces the probe with
|
||||
# `kubectl get -o json | jq -r --arg k ...` (escape-free), adds
|
||||
# a 60s wait-loop for Reflector lag, and falls back to the
|
||||
# source namespace (flux-system) if the local copy is still
|
||||
# missing. Idempotent path unchanged.
|
||||
# 0.1.26: HR install/upgrade timeout 15m + values
|
||||
# autoWaitForAPISeconds=720, autoTimeoutSeconds=840 (Fix #127).
|
||||
# Provisions #12 + #14 wedged at phase1-watching because the
|
||||
# HR had no explicit timeout → Helm 5m default → hit before
|
||||
# the auto-trigger Job's 600s activeDeadline could complete.
|
||||
# 0.1.27: HR install/upgrade timeout 15m → 30m + values
|
||||
# autoWaitForAPISeconds 720→1500s (25m wait), autoTimeoutSeconds
|
||||
# 840→1740s (29m Job deadline) (Fix #152). Prov #23 wedged
|
||||
# identically with 3× consecutive DeadlineExceeded on the auto-
|
||||
# trigger Job: catalyst-api had not yet become reachable inside
|
||||
# the 14m Job deadline. Cold-start of catalyst-platform on a
|
||||
# fresh Sovereign exceeds 14m on slow Hetzner regions; 2×
|
||||
# headroom (29m Job, 30m HR) restores the safety margin Fix #127
|
||||
# intended. NOTE: also bumps HR version pin from 0.1.25 → 0.1.27
|
||||
# — Fix #127 (commit 58f518ff) bumped Chart.yaml to 0.1.26 but
|
||||
# left this pin at 0.1.25, so the new HR-timeout/values changes
|
||||
# never landed on any Sovereign. The pin update here is what
|
||||
# actually delivers BOTH Fix #127 and Fix #152.
|
||||
# 0.1.28 (Fix #158, 2026-05-11): values.yaml comment cleanup —
|
||||
# platform-wide migration off bitnami/kubectl (deleted from
|
||||
# Docker Hub 2025-08). This Blueprint already uses alpine/k8s
|
||||
# + alpine since 0.1.10; no functional image change here.
|
||||
# 0.1.30 (TBD-C18, 2026-05-18): NEW step 09 (gitea-token-mint)
|
||||
# mints a real Gitea API token at cutover + patches Secret
|
||||
# sme/provisioning-github-token.GITHUB_TOKEN. The catalyst-
|
||||
# platform chart's provisioning-github-token.yaml template
|
||||
# previously mirrored the Gitea admin PASSWORD verbatim into
|
||||
# that Secret; SME provisioning then sent `Authorization: token
|
||||
# <PWD>` to Gitea which 401s ("user does not exist [uid: 0]").
|
||||
# On t22 2026-05-18: voucher checkout completed 200 + /jobs
|
||||
# redirect fired, but no Organization CR was ever created.
|
||||
# Step 09 closes the loop: DELETE-then-POST /api/v1/users/
|
||||
# gitea_admin/tokens, capture .sha1, GET /api/v1/user validate,
|
||||
# kubectl patch dest Secret, rollout-restart provisioning
|
||||
# Deployment. Order=9 (last) is fine — none of steps 02-08 read
|
||||
# the Secret and the SME provisioning service first consumes
|
||||
# the token at voucher checkout time (always postdates cutover).
|
||||
# 0.1.31 (TBD-C19, 2026-05-18): step-06 now also pivots
|
||||
# openova-catalog HelmRepository (rendered by bp-catalyst-
|
||||
# platform chart, not directly from bootstrap-kit). Adds
|
||||
# `openova-catalog` to helmRepositories.names; new Phase-1.6
|
||||
# patches the parent HelmRelease's spec.values.catalog.
|
||||
# helmRepository.url; new Phase-2.5 injects same override
|
||||
# into 13-bp-catalyst-platform.yaml in local Gitea so
|
||||
# bootstrap-kit Kustomization reconcile preserves it. Without
|
||||
# this pin bump, step-08 catches openova-catalog as the lone
|
||||
# OFFENDER ~1m after step-06 (chart re-render reverts the
|
||||
# live HR patch). Caught live on t22.omantel.biz 2026-05-18.
|
||||
version: 0.1.31
|
||||
version: 0.1.23
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-self-sovereign-cutover
|
||||
namespace: flux-system
|
||||
install:
|
||||
disableWait: true
|
||||
timeout: 30m
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
disableWait: true
|
||||
timeout: 30m
|
||||
remediation:
|
||||
retries: 3
|
||||
# Per-Sovereign overrides — the chart's values.yaml carries
|
||||
@ -311,15 +224,6 @@ spec:
|
||||
sovereign:
|
||||
fqdn: ${SOVEREIGN_FQDN}
|
||||
harborInternalURL: http://harbor-core.harbor.svc.cluster.local
|
||||
# NB: Harbor HTTPRoute publishes at `registry.<sov>` (see
|
||||
# `clusters/_template/bootstrap-kit/19-harbor.yaml` gateway.host),
|
||||
# NOT `harbor.<sov>`. Step-06 phase-1 rewrites every HelmRepository
|
||||
# to `oci://${harbor_host}/openova-io`, so this MUST be the public
|
||||
# hostname that actually answers — `registry.${SOVEREIGN_FQDN}`.
|
||||
# Pre-2026-05-18 this said `harbor.${SOVEREIGN_FQDN}`, which no
|
||||
# HTTPRoute matched → all post-pivot OCI pulls EOF → bp-sandbox HR
|
||||
# never Ready → bootstrap-kit Ks stuck (chicken-and-egg). See
|
||||
# t20 debug matrix.
|
||||
harborPublicURL: https://registry.${SOVEREIGN_FQDN}
|
||||
harborPublicURL: https://harbor.${SOVEREIGN_FQDN}
|
||||
giteaInternalURL: http://gitea-http.gitea.svc.cluster.local:3000
|
||||
giteaPublicURL: https://gitea.${SOVEREIGN_FQDN}
|
||||
|
||||
@ -29,8 +29,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-nats-jetstream
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "07"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: nats-jetstream
|
||||
@ -42,7 +40,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-nats-jetstream
|
||||
version: 1.2.0
|
||||
version: 1.1.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-nats-jetstream
|
||||
@ -52,12 +50,10 @@ spec:
|
||||
# cold start. Helm install completes when manifests apply; downstream
|
||||
# dependsOn checks Ready=True independently. Replaces PR #221 timeout.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -29,8 +29,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-openbao
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "08"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: openbao
|
||||
@ -54,7 +52,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-openbao
|
||||
version: 1.2.16
|
||||
version: 1.2.14
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-openbao
|
||||
|
||||
@ -29,8 +29,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-keycloak
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "09"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: keycloak
|
||||
@ -43,29 +41,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-keycloak
|
||||
# 1.5.0 (qa-loop iter-12 Fix #53A): adds .Values.sovereignRealm.name
|
||||
# parameter so each Sovereign owns its KC realm named after the tenant
|
||||
# short-name (omantel chroot → "omantel"). Default `sovereign` is kept
|
||||
# in the chart for backward compat with overlays not yet migrated.
|
||||
# 1.4.5 (issue #146, prov #21+#22 hung 30+ min on bp-keycloak install):
|
||||
# post-#140 retune. (a) availabilityCheck.timeout 900s → 300s — coarser
|
||||
# retry was busting HR window before backoff could retry; faster
|
||||
# failure means more attempts fit. (b) startupProbe enabled with 30m
|
||||
# budget so slow Liquibase doesn't get killed by livenessProbe mid-
|
||||
# migration. (c) livenessProbe.initialDelaySeconds 300 → 60 (cold-
|
||||
# start protection now lives in startupProbe). Coupled HR change
|
||||
# below: install/upgrade.remediation.retries 3 → 1 (Job's own
|
||||
# backoffLimit handles retries without losing state across Helm
|
||||
# restarts). Together: ≤ 30m wall-clock vs. 90m+ before.
|
||||
# 1.4.4 (issue #140, post-upgrade hook regression on prov #21): bumps
|
||||
# keycloakConfigCli.availabilityCheck.timeout 600s → 900s + adds
|
||||
# cleanupAfterFinished (1h TTL) so stale hook Pods don't race
|
||||
# before-hook-creation deletes on subsequent upgrades. Coupled with
|
||||
# the install/upgrade timeout bump below (15m → 30m) so Helm's
|
||||
# outer hook-wait accommodates the inner 15m availability window.
|
||||
# 1.4.3 (issue #129): bumped keycloakConfigCli.availabilityCheck.timeout
|
||||
# 120s → 600s + backoffLimit 1 → 5 (fresh-install wedge).
|
||||
version: 1.4.5
|
||||
version: 1.4.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-keycloak
|
||||
@ -75,53 +51,21 @@ spec:
|
||||
# 100+ Liquibase changesets). Helm install completes when manifests
|
||||
# apply; downstream dependsOn checks Ready=True independently.
|
||||
# Replaces PR #221 spec.timeout: 15m.
|
||||
#
|
||||
# 15m → 30m bump (issue #140, post-upgrade hook regression on prov #21):
|
||||
# `disableWait: true` skips Pod-Ready waits but does NOT skip Helm hook
|
||||
# waits — Helm always blocks on hook-Pod completion bounded by this
|
||||
# timeout. The bp-keycloak chart's keycloak-config-cli post-install/
|
||||
# post-upgrade hook now has an inner availabilityCheck.timeout of 900s
|
||||
# (15m), and on chart-roll-triggered upgrades the keycloak StatefulSet
|
||||
# rolling-restart + Liquibase re-validation can consume that full
|
||||
# window. 30m gives Helm room to wait out one full inner attempt plus
|
||||
# exponential backoff if needed, without blowing past Flux's HR timer.
|
||||
# If you bump availabilityCheck.timeout further, bump THIS too.
|
||||
# remediation.retries 3 → 1 (issue #146, prov #21+#22 hung 30+ min):
|
||||
# Flux HR remediation does a full Helm uninstall+reinstall on each retry,
|
||||
# losing all hook-Pod state and restarting Liquibase from zero. With 3
|
||||
# retries × 30m HR timeout = up to 90m of wasted work before Flux gives
|
||||
# up. The keycloak-config-cli Job already retries internally via
|
||||
# `backoffLimit: 5` (set in chart values.yaml) — Job-level backoff
|
||||
# preserves Keycloak's state and only re-runs the realm-import sidecar.
|
||||
# HR-level remediation is reserved for genuine release-failure (e.g.
|
||||
# invalid manifest) where a clean reinstall is the right answer; one
|
||||
# retry is sufficient for that. Job-level vs. HR-level retry is the
|
||||
# correct separation per the bitnami subchart's design.
|
||||
install:
|
||||
disableWait: true
|
||||
timeout: 30m
|
||||
timeout: 15m
|
||||
remediation:
|
||||
retries: 1
|
||||
retries: 3
|
||||
upgrade:
|
||||
disableWait: true
|
||||
timeout: 30m
|
||||
timeout: 15m
|
||||
remediation:
|
||||
retries: 1
|
||||
# Per-Sovereign overrides — issue #387 + #604 + qa-loop iter-12 Fix #53A:
|
||||
retries: 3
|
||||
# Per-Sovereign overrides — issue #387 + #604:
|
||||
# Wire the per-Sovereign hostname into the HTTPRoute template and
|
||||
# sovereign realm ConfigMap (catalyst-ui redirect URIs). The HTTPRoute
|
||||
# attaches to cilium-gateway/kube-system installed by 01-cilium.yaml.
|
||||
#
|
||||
# sovereignRealm.name: per `feedback_no_mvp_no_workarounds.md` target-state
|
||||
# rule, each Sovereign owns its KC realm named after the tenant short-name.
|
||||
# The bootstrap-kit Kustomization's postBuild.substitute supplies
|
||||
# SOVEREIGN_REALM_NAME (canonical: first label of SOVEREIGN_FQDN, e.g.
|
||||
# `omantel` for omantel.biz). When unset the envsubst rule
|
||||
# ${VAR:=default} resolves to "sovereign" — backward-compat with
|
||||
# overlays that haven't been migrated.
|
||||
values:
|
||||
sovereignFQDN: ${SOVEREIGN_FQDN}
|
||||
sovereignRealm:
|
||||
name: ${SOVEREIGN_REALM_NAME:=sovereign}
|
||||
gateway:
|
||||
host: auth.${SOVEREIGN_FQDN}
|
||||
|
||||
@ -30,8 +30,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-gitea
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "10"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: gitea
|
||||
@ -54,7 +52,7 @@ spec:
|
||||
# bp-self-sovereign-cutover Step 1 gitea-mirror Job mounts it. K8s
|
||||
# forbids cross-namespace secretKeyRef; reflector is the canonical
|
||||
# platform-level mirror. Caught live on otech103 2026-05-04.
|
||||
version: 1.2.7
|
||||
version: 1.2.5
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-gitea
|
||||
@ -65,12 +63,10 @@ spec:
|
||||
# checks Ready=True independently. Replaces PR #221 spec.timeout: 15m.
|
||||
install:
|
||||
disableWait: true
|
||||
timeout: 15m
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
disableWait: true
|
||||
timeout: 15m
|
||||
remediation:
|
||||
retries: 3
|
||||
values:
|
||||
@ -83,16 +79,3 @@ spec:
|
||||
# cilium-gateway from clusters/_template/bootstrap-kit/01-cilium.yaml.
|
||||
gateway:
|
||||
host: gitea.${SOVEREIGN_FQDN}
|
||||
# DoD D25 (t129 2026-05-16): override the chart's baked dev hostname
|
||||
# `gitea.catalyst.local` so the Gitea Web UI renders the LIVE
|
||||
# Sovereign FQDN in pageData.appUrl, clone URLs, and internal links.
|
||||
# Without this every Sovereign's Gitea page told the operator to
|
||||
# clone from `gitea.catalyst.local` (which public DNS can't resolve),
|
||||
# breaking the canonical "Sovereign-local Git server" contract that
|
||||
# bp-self-sovereign-cutover relies on.
|
||||
gitea:
|
||||
gitea:
|
||||
config:
|
||||
server:
|
||||
DOMAIN: gitea.${SOVEREIGN_FQDN}
|
||||
ROOT_URL: https://gitea.${SOVEREIGN_FQDN}
|
||||
|
||||
@ -70,8 +70,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-powerdns
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "11"
|
||||
spec:
|
||||
interval: 15m
|
||||
timeout: 15m
|
||||
@ -102,29 +100,7 @@ spec:
|
||||
# provisioning paths operative.
|
||||
# 1.2.1: zone-bootstrap Job needs /tmp emptyDir (readOnlyRootFS+
|
||||
# curl -o /tmp/zone-resp). Caught live on otech103 2026-05-04.
|
||||
# 1.2.2 (issue #144): zone-bootstrap Job activeDeadlineSeconds
|
||||
# raised 300s → 840s. Cold Sovereign on prov #22 had bp-cnpg
|
||||
# still synthesising the `pdns-pg-app` Secret when this Job
|
||||
# ran; powerdns Pod was not Ready, curl against
|
||||
# http://powerdns:8081 looped, Job hit 5m DeadlineExceeded,
|
||||
# Helm post-install hook failed, HR FAILED 4× → terminal.
|
||||
# New deadline (14m) sits below the HR install.timeout cap of
|
||||
# 15m so Flux's remediation can still reclaim a true failure.
|
||||
# 1.2.3 (Fix #144-followup, prov #37+#38 recurrence 2026-05-12):
|
||||
# bumping activeDeadlineSeconds alone was insufficient — the Job
|
||||
# hit BackoffLimitExceeded (NOT DeadlineExceeded) at ~10min
|
||||
# because each container invocation curl'd a Service with empty
|
||||
# Ready endpoints (powerdns Pods Pending behind a worker-capacity
|
||||
# wedge that kept bp-cnpg's pdns-pg-1-initdb itself Pending).
|
||||
# Container restartPolicy=OnFailure + backoffLimit=6 killed the
|
||||
# Job long before activeDeadlineSeconds had any effect. Fix moves
|
||||
# the wait-for-API loop INSIDE the container (restartPolicy=Never,
|
||||
# bounded by new apiReadyTimeoutSeconds=600s) so one Pod owns
|
||||
# the full 14m budget. Trace: in chroot prov #38, HR status
|
||||
# message read "Helm install failed for release powerdns/powerdns
|
||||
# with chart bp-powerdns@1.2.2: failed post-install: 1 error
|
||||
# occurred: * job powerdns-zone-bootstrap failed: BackoffLimitExceeded".
|
||||
version: 1.2.3
|
||||
version: 1.2.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-powerdns
|
||||
@ -139,12 +115,10 @@ spec:
|
||||
# cleanly; runtime convergence (powerdns pods becoming Ready once
|
||||
# CNPG lands) is observed via kubectl, not gated on Helm.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -44,8 +44,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-external-dns
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "12"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: external-dns
|
||||
@ -73,12 +71,10 @@ spec:
|
||||
# slow-Ready cascade. Helm install completes when manifests apply.
|
||||
# Replaces PR #221 spec.timeout: 15m.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -34,13 +34,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-catalyst-platform
|
||||
namespace: flux-system
|
||||
labels:
|
||||
# slot encodes bootstrap-kit install order; component=catalyst-platform
|
||||
# overrides the default Phase 1 mapping so this HR lands in
|
||||
# Phase 3 (Sovereign Live) on the openova-flow canvas — once
|
||||
# Ready=True the Sovereign is fully self-sufficient.
|
||||
catalyst.openova.io/slot: "13"
|
||||
catalyst.openova.io/component: catalyst-platform
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: catalyst-platform
|
||||
@ -62,16 +55,6 @@ spec:
|
||||
# the umbrella install, eliminating the race.
|
||||
- name: bp-keycloak
|
||||
- name: bp-cnpg
|
||||
# bp-crossplane-claims (chart-roll-rca iter-15, 2026-05-10): owns the
|
||||
# access.openova.io/v1alpha1 XRD that qa-fixtures UserAccess CRs
|
||||
# require. Without this dep, slot 13 races slot 14 and the umbrella
|
||||
# upgrade fails admission with `no matches for kind "UserAccess" in
|
||||
# version "access.openova.io/v1alpha1"`. The release Secret then
|
||||
# enters `pending-upgrade` and waits the full 15m timeout × 3 retries
|
||||
# before any operator-visible failure (the 2026-05-10 omantel.biz
|
||||
# 90-min wedge). With this edge, the chart never enters the failing
|
||||
# state on a fresh roll.
|
||||
- name: bp-crossplane-claims
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-catalyst-platform
|
||||
@ -380,230 +363,7 @@ spec:
|
||||
# EnsureOrg / EnsureRepo blocking qa-wp Application reconcile.
|
||||
# bootstrap-kit qaFixtures.cnpgPairName default qa-cnpg → qa-cnpgpair
|
||||
# so TC-306's "cnpgpair" substring assertion passes.
|
||||
# 1.4.123 (qa-loop iter-12 Fix #53A): triggers catalyst-api StatefulSet
|
||||
# restart so it picks up the new CATALYST_KC_REALM=omantel value from
|
||||
# the bp-keycloak 1.5.0 mirrored Secret (realm-rename target-state).
|
||||
# 1.4.127 (qa-loop iter-12 Fix #54 Workstream 4): chart-side
|
||||
# templates/catalyst-gitea-token-secret.yaml + post-install Job
|
||||
# auto-mints the Gitea PAT into catalyst-gitea-token (replaces
|
||||
# kubectl-applied operational hack).
|
||||
# 1.4.133 (qa-loop iter-1 prefetch Fix #113, prov #9 wedge):
|
||||
# qa-fixtures Kyverno disallow-privileged-containers exclusion
|
||||
# list now includes `catalyst` namespace so the registry-pivot
|
||||
# DaemonSet shipped by bp-self-sovereign-cutover (which legitimately
|
||||
# needs `securityContext.privileged: true` to rewrite
|
||||
# /etc/rancher/k3s/registries.yaml on every node) is not blocked
|
||||
# by the validating admission webhook. Without this, prov #9
|
||||
# bp-self-sovereign-cutover HR went Ready=False and bp-catalyst-
|
||||
# platform never reached Ready → console.<sov> Ingress never
|
||||
# materialised → iter-1 was unrunnable.
|
||||
# 1.4.134 (qa-loop iter-1 prefetch Fix #114, prov #9 unwedge):
|
||||
# New pre-install hook Job (qa-finalizer-strip, weight -99)
|
||||
# strips orphaned controller finalizers off Application /
|
||||
# Organization / Environment / UserAccess CRs in the qa-
|
||||
# namespace + force-finalizes the namespace itself if it's
|
||||
# stuck Terminating. Breaks the rollback-orphan finalizer
|
||||
# deadlock that left prov #9 in an unrecoverable install loop:
|
||||
# 1. install creates qa-omantel ns + Application + controllers
|
||||
# in same pass (no hook ordering)
|
||||
# 2. qa-cnpg-backup-s3-seed post-install hook stalls 15m
|
||||
# 3. cleanupOnFail rolls back, killing controllers BEFORE they
|
||||
# can process Application's deletion finalizer
|
||||
# 4. qa-omantel ns wedged in Terminating; no controller exists
|
||||
# 5. retry: "namespace is being terminated" → seed Job RBAC
|
||||
# creation rejected → 15m hook timeout → loop forever.
|
||||
# This Job runs at the very start of every install attempt and
|
||||
# guarantees a clean slate.
|
||||
# 1.4.136 (qa-loop iter-1 Fix #124, secondary Fix #122): convert
|
||||
# catalyst-gitea-token bootstrap from post-install to pre-install
|
||||
# hook so catalyst-catalog + catalyst-organization-controller
|
||||
# (which validate non-empty CATALYST_GITEA_TOKEN at startup) see
|
||||
# a populated Secret at first container start. Prior post-install
|
||||
# ordering caused chicken-and-egg deadlock: Deployments crashed
|
||||
# because Secret was empty; mint Job ran AFTER Deployments,
|
||||
# exponential back-off blew past Helm's 15m install timeout,
|
||||
# remediation looped forever. Pre-install hook (weight=10) now
|
||||
# populates the Secret (weight=5) BEFORE any consumer Deployment
|
||||
# rolls. See Chart.yaml top comment for the full diagnostic chain.
|
||||
# 1.4.135 (qa-loop bounded-provision-cycle Fix #119): sanitize
|
||||
# illegal `/` in qa-fixtures Continuum mirror label value. Prov
|
||||
# #10 wedge — helm install crashed on Continuum CR validation
|
||||
# because the Fix #102 platform-mirror label
|
||||
# `openova.io/continuum-mirror-of: <ns>/<name>` violates k8s
|
||||
# label-value spec (`/` forbidden in values, allowed only in
|
||||
# keys as the prefix separator). Split into two valid labels:
|
||||
# `openova.io/continuum-mirror-of-namespace` +
|
||||
# `openova.io/continuum-mirror-of-name`. Unblocks prov #11+.
|
||||
# 1.4.138 (qa-loop iter-1 Fix #138, prov #20 wedge): converts
|
||||
# qa-fixtures qa-cnpg-backup-s3-seed + qa-cnpg-status-seed Jobs
|
||||
# from post-install hooks → regular release resources. Resolves
|
||||
# the circular bootstrap-kit DAG (this slot 13 install hook needed
|
||||
# bp-seaweedfs slot 18 to be Ready, which couldn't happen until
|
||||
# this HR was Ready). bp-catalyst-platform install now completes
|
||||
# in ~5 min instead of timing out at 15 min then loop-rolling back.
|
||||
# 1.4.137: deploy-bot auto-bump (no template changes).
|
||||
# 1.4.139 (Fix #163, 2026-05-11, MIRROR-EVERYTHING): every
|
||||
# chart-hook image reference in this Blueprint uses the explicit
|
||||
# harbor.openova.io/proxy-dockerhub prefix per CLAUDE.md
|
||||
# inviolable rule. SBOM-auditable, no functional change.
|
||||
# 1.4.140 (qa-loop Wave 27 Fix #184, prov #33 wedge, 2026-05-11):
|
||||
# catalyst-gitea-token-mint pre-install hook Gitea-API wait loop
|
||||
# raised from hardcoded 60×5s (300s = 5m) to values-driven knob
|
||||
# (giteaWait.iterations × giteaWait.intervalSeconds, default
|
||||
# 168×5 = 840s = 14m). Covers the autoscaler-hcloud cold-start
|
||||
# observed on multi-region prov #33: workerCount=0 (Fix #157
|
||||
# sizing) means the autoscaler must spawn a worker in fsn1/hel1
|
||||
# before bp-gitea's Pod can schedule, which takes 10-15m on a
|
||||
# fresh provision. Pre-Fix #184 budget (300s) always expired
|
||||
# before gitea was reachable → bp-catalyst-platform installFailed
|
||||
# and HR loop-rolled forever. Budget arithmetic: hook 840s + 60s
|
||||
# slack ≤ HR install.timeout 900s (15m).
|
||||
# 1.4.141 (qa-loop Fix #185, prov #38/#39/#41 recurrence,
|
||||
# 2026-05-12): qa-finalizer-strip pre-install hook (helm.sh/hook-
|
||||
# weight -99) now tolerates the control-plane NoSchedule taint
|
||||
# and runs with priorityClassName: system-cluster-critical so it
|
||||
# is ALWAYS schedulable regardless of worker-node CPU saturation.
|
||||
# Root cause on prov #41: after bootstrap-kit fan-out the worker
|
||||
# (cpx32, 8vCPU/16GB) sat at 99% CPU requests; the autoscaler
|
||||
# had backed off scale-up of a second worker; the Job's 50m CPU
|
||||
# request couldn't be satisfied; Helm pre-install timed out at
|
||||
# 15m; Flux remediated 3× and gave up. Same recurring failure on
|
||||
# prov #38, #39, #41 — all on chart pin 1.4.140 which (correctly)
|
||||
# had no scheduling concession for the -99 hook. Image switched
|
||||
# from bitnamilegacy/kubectl:1.29.3 → alpine/k8s:1.31.4 in same
|
||||
# commit (rule-17 MIRROR-EVERYTHING hygiene; bitnamilegacy is
|
||||
# the Docker-Hub redirect for deprecated Bitnami 2025-08 cutover).
|
||||
# 1.4.147 (D31 wordpress-tenant activeHotStandby + D21 owner auto-seed):
|
||||
# - PR #1562 wires bp-cnpg-pair Primary+Replica pattern into
|
||||
# wordpress-tenant chart via pg.activeHotStandby knob
|
||||
# - PR #1564 baked into catalyst-api:8d2a947 — handover now
|
||||
# auto-seeds the operator's UserAccess CR (D21 zero-touch)
|
||||
# 1.4.146 (D29 billing internal JWT bypass for public routes):
|
||||
# - PR #1561 mirrors PR #1559's gateway public routes in the billing
|
||||
# service's own JWT middleware. Without this, the gateway passed
|
||||
# through but billing still 401-d.
|
||||
# 1.4.145 (D29 gateway public routes for redeem flow):
|
||||
# - PR #1559 makes /api/billing/{vouchers/redeem-preview,plans,addons}
|
||||
# public so the marketplace /redeem?code=XXX landing can validate
|
||||
# codes without auth (the entire D29 voucher-redeem zero-touch
|
||||
# flow is broken without this)
|
||||
# 1.4.144 (D27 admin tag override + D28 voucher email wire):
|
||||
# - PR #1557 decouples admin tag from smeTag bundle (admin image
|
||||
# may not publish for every SME services CI SHA — caught t132
|
||||
# 2026-05-16 with admin:b0ed216 stuck in ImagePullBackOff)
|
||||
# - PR #1556 adds the billing→notification wire so the voucher
|
||||
# issuance flow emails the recipient (D28 zero-touch contract)
|
||||
#
|
||||
# 1.4.148 (D16 + D17 + D27 founder-flagged bug fixes, t139 verify cycle):
|
||||
# - PR #1583: D16 /cloud nodes multi-cluster fan-out + handover
|
||||
# export retry/reorder/auth-bypass (catalyst-api 2ab8a0e)
|
||||
# - PR #1584: D27 catalog fresh-seed Published=true default
|
||||
# (sme services catalog 964dc15)
|
||||
# - PR #1585: D17 /app/$componentId route-collision fix (catalyst-ui 2ab8a0e)
|
||||
# Caught on t136/t138 fresh-prov runs that bootstrap-kit was
|
||||
# still pinned to 1.4.147 → none of the fixes reached the chroot.
|
||||
# 1.4.153 — D17 Wave-1 Family A: /cloud?view=list&kind=<X>
|
||||
# no longer drifts to /dashboard (kind-alias map in
|
||||
# router.tsx validateSearch). Caught on t10.omantel.biz
|
||||
# test agents E/C2 2026-05-17.
|
||||
# 1.4.155 — Wave 5 UX polish (founder review 2026-05-17):
|
||||
# - Sidebar reorder: Dashboard → Cloud → Apps → Jobs → Users →
|
||||
# BSS → Settings (operator mental model: overview → infra →
|
||||
# workloads → ops → access → commerce → config).
|
||||
# - BSS icon swapped from bespoke receipt glyph to briefcase
|
||||
# line-glyph matching the rest of the icon family.
|
||||
# - Marketplace toggle moved off Settings sub-nav + standalone
|
||||
# /settings/marketplace page INTO SettingsPage as a
|
||||
# <SectionCard id="marketplace"> anchor section (same pattern
|
||||
# as #dns, #sovereign, #notifications). MarketplaceSettings.tsx
|
||||
# page deleted; MarketplaceSection.tsx new inner component;
|
||||
# /settings/marketplace route + sidebar sub-nav child removed.
|
||||
# Old URL now 404s — operators click Settings then scroll to
|
||||
# the Marketplace anchor.
|
||||
# - Save flow UNCHANGED: POST /api/v1/sovereigns/{id}/marketplace
|
||||
# still commits per-Sovereign overlay to GitOps repo, Flux
|
||||
# reconciles ~1 min.
|
||||
#
|
||||
# 1.4.154 — Wave 2 collector PR. Bundles 6 Fix-Author PRs that
|
||||
# landed AFTER the 1.4.153 Wave-1 roll, all from the same t10
|
||||
# test sweep:
|
||||
# - #1598 Family F: BSS menu in Sovereign Console
|
||||
# (Billing/Orders/Revenue/Vouchers/Tenants iframe-embed of
|
||||
# marketplace.<fqdn>/back-office/*). Founder bug #1.
|
||||
# - #1599 Family D: dashboard treemap fan-out for cluster /
|
||||
# region / vcluster / family + Layer-1 cluster default.
|
||||
# Founder bug #2.
|
||||
# - #1600 Family C: ResourceDetailPage real-data rewrite —
|
||||
# per-kind summary, owner chain, navigate (not assign).
|
||||
# Founder bug #5.
|
||||
# - #1601 Family G: 6 singletons — hcloud-volumes StorageClass
|
||||
# (C9-006), /fleet/applications aggregator (C10-002),
|
||||
# secondary install-* Job bridge backfill (C10-003), legacy
|
||||
# wildcard-tls cert cleanup (C7-007), D22 settings em-dash
|
||||
# placeholder lift (C8-001), /jobs region filter (C8-005).
|
||||
# - #1602 Family E: Compliance UI — Falco runtime alerts +
|
||||
# SBOM/CVE tab + framework filter chip strip + policy
|
||||
# drilldown live-cluster fallback + PolicyReport /
|
||||
# ClusterPolicyReport list kinds (C11-003/005/006/007/008/
|
||||
# 009/010).
|
||||
# - #1603 Family B: AppDetail HR-overlay status sync +
|
||||
# Resources/Logs tab namespace+label fix (HR.spec.target-
|
||||
# Namespace + chart-name label) + "Bootstrap blueprint"
|
||||
# chip for bp-* (founder bug #4, C4-003/004/005/007/013).
|
||||
# 1.4.163 (Wave 16 collector, 2026-05-18): republishes the chart
|
||||
# OCI artifact so it actually contains every chart-template change
|
||||
# merged after the 1.4.162 publish (commit 0ad78790). Without the
|
||||
# republish, bootstrap-kit pin 1.4.162 pulls an artifact missing
|
||||
# the new templates and Sovereigns boot with stale chart bytes.
|
||||
# Baked: #1644 tenantPublic HTTPRoute reconciler + #1650
|
||||
# tenantPublic setter on product-install + #1640 Cilium Gateway
|
||||
# per-zone listener pairs + #1654 bp-newapi attestation gate +
|
||||
# sandbox-controller post-handover refinements (D31 HS env vars,
|
||||
# sovereign-fqdn ConfigMap keys, cutover-driver sandboxes RBAC,
|
||||
# values.yaml sovereign.{enableHotStandby,primaryRegion,
|
||||
# replicaRegion} defaults). See Chart.yaml header comment for
|
||||
# the full change list.
|
||||
# 1.4.166 (TBD-E8 / C4-015, 2026-05-18): seed 13 baseline Blueprint
|
||||
# CRs unconditionally so `/api/v1/catalog` returns a non-empty
|
||||
# items[] from handover-time. Pre-fix every fresh Sovereign had
|
||||
# empty catalog because (a) self-sovereign-cutover step-01 only
|
||||
# mirrors `openova-io/openova` into Gitea — not the `catalog` /
|
||||
# `catalog-sovereign` Orgs that catalyst-catalog reads from — and
|
||||
# (b) qa-fixtures (the only chart-shipped Blueprint CRs) defaults
|
||||
# OFF on production. Adds templates/catalog-seed/blueprints.yaml
|
||||
# (bp-wordpress-tenant, bp-cnpg, bp-keycloak, bp-grafana,
|
||||
# bp-prometheus, bp-loki, bp-redis, bp-clickhouse, bp-opensearch,
|
||||
# bp-temporal, bp-n8n, bp-langfuse, bp-llm-gateway) which the
|
||||
# chained catalog client surfaces via in-cluster LIST fallback.
|
||||
# 1.4.168 (TBD-C18b, 2026-05-18): stop clobbering the cutover-minted
|
||||
# Gitea API token. templates/sme-services/provisioning-github-
|
||||
# token.yaml gains a lookup-persistence guard — if the destination
|
||||
# Secret carries annotation `catalyst.openova.io/token-source:
|
||||
# self-sovereign-cutover-step-09` (stamped by Step 09 of bp-self-
|
||||
# sovereign-cutover when it mints the real Gitea API token), the
|
||||
# template preserves the existing GITHUB_TOKEN bytes instead of
|
||||
# mirroring gitea-admin-secret.password over them on every Flux
|
||||
# reconcile. Pre-fix on t22: Step 09 minted a real token at
|
||||
# 13:43:33Z; ~5 min later helm reconcile rewrote GITHUB_TOKEN back
|
||||
# to the admin password bytes, so every subsequent SME provisioning
|
||||
# call to Gitea returned 401 "user does not exist" and journey
|
||||
# step 16 (tenant repo creation) silently stuck.
|
||||
# 1.4.179 (TBD-A14/A15/A10b, 2026-05-18): three t24 zero-touch
|
||||
# Wave 36 P1 fresh-prov blockers — see chart Chart.yaml header for
|
||||
# the full diagnostic + fix description per gate.
|
||||
# - A14 issue #1843: networkpolicies (networking.k8s.io) RBAC
|
||||
# get/list/watch verbs added to clusterrole-cutover-driver.
|
||||
# - A15 issue #1844: sovereign-fqdn ConfigMap empty fields
|
||||
# populated end-to-end via the cloud-init → bootstrap-kit →
|
||||
# chart substitute chain (configuredRegions / controlPlaneIP /
|
||||
# primaryRegion / replicaRegion / selfDeploymentId /
|
||||
# enableHotStandby / qaApplications). This Kustomization gains
|
||||
# 3 new value mappings: global.sovereignSelfDeploymentId,
|
||||
# sovereign.configuredRegions, sovereign.qaApplications.
|
||||
# - A10b issue #1845: GET kubeconfig?region=<cloudRegion>
|
||||
# resolves the slot-suffixed on-disk shape
|
||||
# `<id>-<region>-<i>.yaml` (handler-side glob fallback).
|
||||
version: 1.4.179
|
||||
version: 1.4.123
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-catalyst-platform
|
||||
@ -622,56 +382,16 @@ spec:
|
||||
# specifically for this umbrella chart — every other bp-* chart
|
||||
# remains at its previous (or default) timeout because they install
|
||||
# in well under 5 minutes empirically.
|
||||
#
|
||||
# chart-roll-rca iter-15 (2026-05-10): timeout reduced 25m → 15m and
|
||||
# remediation hardened with cleanupOnFail + strategy: rollback +
|
||||
# remediateLastFailure. Background: the 25m ceiling existed to absorb
|
||||
# the dep-ordering race RC-1 (qa-fixtures UserAccess CRs rendering
|
||||
# before the bp-crossplane-claims XRD existed). With that race fixed
|
||||
# via the bp-crossplane-claims dependsOn edge above, 15m is plenty for
|
||||
# the umbrella's true install latency on a healthy cluster.
|
||||
# cleanupOnFail purges partial release artifacts on retry; rollback
|
||||
# strategy reverts to the last good release before retrying instead of
|
||||
# leaving the release Secret pinned at `pending-upgrade` for the full
|
||||
# timeout ceiling. Net effect: a failed-then-recoverable upgrade
|
||||
# collapses from ~75m worst case → ~15m worst case.
|
||||
#
|
||||
# post-prov7 fix (2026-05-10, refs chart-roll-rca-iter15): the
|
||||
# HelmRelease v2 schema only allows `cleanupOnFail` and
|
||||
# `remediation.strategy` on the `upgrade` block. The previous version
|
||||
# of this file placed both fields on the `install` block as well,
|
||||
# which caused the bootstrap-kit Kustomization to fail dry-run on a
|
||||
# fresh Sovereign with `field not declared in schema`, blocking ALL
|
||||
# HRs from rendering. The install block here keeps only the schema-
|
||||
# legal fields (`retries`, `remediateLastFailure`); rollback semantics
|
||||
# apply naturally to upgrades, and a failed first install is
|
||||
# remediated via retry without rollback (no prior release to roll
|
||||
# back to).
|
||||
#
|
||||
# F8 fix (2026-05-12, prov #44 RCA): bumped install + upgrade timeout
|
||||
# 15m → 30m. F1-F7 ship live on main, qa-finalizer-strip Completed
|
||||
# and autoscaler workers joined, but bp-catalyst-platform HR was
|
||||
# still mid-retry (failures=3) at the catalyst-api 60m phase1 watch
|
||||
# cap on d9399223c3caa4f9. Total bootstrap-kit install on a fresh
|
||||
# cpx42×1 Sovereign genuinely exceeds the 15m PR #221 ceiling when
|
||||
# the umbrella chart's full SME + Catalyst service stack rolls
|
||||
# without a warm Harbor proxy-cache. Paired with the F8 catalyst-api
|
||||
# DefaultWatchTimeout bump (60m → 120m) so the outer watch budget
|
||||
# comfortably contains the new 30m × 3-retry inner HR ceiling.
|
||||
install:
|
||||
disableWait: true
|
||||
timeout: 30m
|
||||
timeout: 25m
|
||||
remediation:
|
||||
retries: 3
|
||||
remediateLastFailure: true
|
||||
upgrade:
|
||||
disableWait: true
|
||||
timeout: 30m
|
||||
timeout: 25m
|
||||
remediation:
|
||||
retries: 3
|
||||
strategy: rollback
|
||||
remediateLastFailure: true
|
||||
cleanupOnFail: true
|
||||
# Per-Sovereign overrides for the umbrella — sovereign-FQDN-derived hostnames
|
||||
# for console/admin/api. All chart-level Catalyst service config (image refs,
|
||||
# OTel endpoints, NATS subjects) lives in products/catalyst/chart/values.yaml.
|
||||
@ -689,15 +409,6 @@ spec:
|
||||
# then short-circuits the glue registration and falls back to plain
|
||||
# set_ns (legacy behaviour).
|
||||
sovereignLBIP: ${SOVEREIGN_LB_IP}
|
||||
# sovereignSelfDeploymentId — the catalyst-api deployment-record id
|
||||
# this Sovereign was provisioned under on the contabo mothership.
|
||||
# Threaded from cloud-init's SOVEREIGN_DEPLOYMENT_ID Kustomization
|
||||
# postBuild substitute. Consumed by the chart's sovereign-fqdn
|
||||
# ConfigMap `selfDeploymentId` key so the chroot catalyst-api's
|
||||
# GET /api/v1/sovereign/self answers with the correct id at
|
||||
# handover-time (no wait for the orchestrator's chart-values
|
||||
# overlay write). TBD-A15 (t24 zero-touch, 2026-05-18, issue #1844).
|
||||
sovereignSelfDeploymentId: '${SOVEREIGN_DEPLOYMENT_ID:-}'
|
||||
ingress:
|
||||
hosts:
|
||||
console:
|
||||
@ -727,106 +438,6 @@ spec:
|
||||
# zero-touch flow), cloud-init pre-renders this variable to a
|
||||
# single-entry array derived from ${sovereign_fqdn}.
|
||||
parentZones: ${PARENT_DOMAINS_YAML}
|
||||
# ─── Wildcard cert issuer environment (Fix #123, LE rate-limit) ────
|
||||
# Default-OFF (production LE issuer); flipped to true via envsubst
|
||||
# WILDCARD_CERT_USE_STAGING=true on the per-Sovereign overlay for any
|
||||
# Sovereign that should issue staging-LE certs instead of production.
|
||||
# The qa-loop coordinator pairs this knob with QA_FIXTURES_ENABLED on
|
||||
# QA Sovereigns (omantel.biz and qa.* pools) so the wipe + re-provision
|
||||
# cadence never trips Let's Encrypt's 5-certs/168h production ceiling
|
||||
# per exact set of identifiers. Customer Sovereigns leave this empty (=false)
|
||||
# and get real-trusted production certs.
|
||||
#
|
||||
# Staging certs are signed by Fake LE Intermediate X1; browsers
|
||||
# reject without an explicit exception, but `curl -sk` and Playwright
|
||||
# (ignoreHTTPSErrors:true) accept them — sufficient for the qa-loop
|
||||
# Test Executor's contract assertions.
|
||||
#
|
||||
# Per docs/INVIOLABLE-PRINCIPLES.md #4 every Sovereign may flip this
|
||||
# independently; the chart values.yaml carries the staging issuer
|
||||
# name (`letsencrypt-dns01-staging-powerdns`, shipped by
|
||||
# bp-cert-manager-powerdns-webhook 1.1.0+) as an overridable default.
|
||||
wildcardCert:
|
||||
useStaging: ${WILDCARD_CERT_USE_STAGING:-false}
|
||||
# ─── Sovereign-side region seeding (DoD D5) ─────────────────────
|
||||
# regionsJson — JSON-array literal of the canonical multi-region
|
||||
# RegionSpec[] this Sovereign was provisioned with. Threaded
|
||||
# through from the mothership prov body via the tofu cloud-init
|
||||
# `SOVEREIGN_REGIONS_JSON` envsubst placeholder. The chart writes
|
||||
# this string into the `sovereign-fqdn` ConfigMap's `regionsJson`
|
||||
# key (sovereign-fqdn-configmap.yaml); the catalyst-api Pod reads
|
||||
# via env `SOVEREIGN_REGIONS_JSON`; chrootEnsureDeployment parses
|
||||
# and stamps Request.Regions so /infrastructure/topology emits
|
||||
# the right per-region tree and /cloud?view=graph renders all
|
||||
# N regions correctly. Without this the chroot fell back to the
|
||||
# live-Nodes path and emitted "1 cluster 1 region" on every
|
||||
# multi-region Sovereign (caught on t126, 2026-05-16).
|
||||
sovereign:
|
||||
# MUST be quoted: SOVEREIGN_REGIONS_JSON contains valid JSON like
|
||||
# `[{"cloudRegion":"hel1",...}]`. Without quotes, YAML interprets
|
||||
# the JSON as a YAML flow-sequence-of-flow-mappings, parses into
|
||||
# `[]map[string]interface{}`, then Helm's chart template `{{ .Values.
|
||||
# sovereign.regionsJson }}` stringifies via Go's `%v` printf —
|
||||
# producing `[map[cloudRegion:hel1 ...]]` (Go map syntax, NOT JSON).
|
||||
# The chroot's chrootRegionsFromEnv then can't json.Unmarshal it →
|
||||
# falls back to live-Nodes path → /cloud renders "1 region 1 cluster"
|
||||
# on every multi-region Sovereign. Caught on t131 2026-05-16.
|
||||
# Single-quoted so embedded double-quotes in the JSON are literal.
|
||||
regionsJson: '${SOVEREIGN_REGIONS_JSON:-}'
|
||||
# ─── D22 (settings empty values) sovereign-side identity ──────────
|
||||
# ORG_EMAIL / ORG_NAME / SOVEREIGN_CONTROL_PLANE_IP / GITOPS_REPO_URL
|
||||
# threaded from cloud-init (provisioner.go::writeTfvars + Hetzner
|
||||
# tofu cloudinit-control-plane.tftpl). Chart's sovereign-fqdn
|
||||
# ConfigMap exposes these as keys; catalyst-api reads via env in
|
||||
# api-deployment.yaml (PR #1569); chrootEnsureDeployment populates
|
||||
# the deployment record so Sovereign Console Settings page renders
|
||||
# real ownerEmail/region/controlPlaneIP/gitopsRepoURL/consoleURL
|
||||
# instead of `—` placeholders. Empty default = same as today,
|
||||
# backwards-compatible for charts that don't have the cloud-init
|
||||
# placeholders wired yet.
|
||||
orgEmail: '${ORG_EMAIL:-}'
|
||||
orgName: '${ORG_NAME:-}'
|
||||
controlPlaneIP: '${SOVEREIGN_CONTROL_PLANE_IP:-}'
|
||||
gitopsRepoURL: '${GITOPS_REPO_URL:-}'
|
||||
# ─── D31 active-hot-standby (cross-region CNPG) ──────────────────
|
||||
# Sovereign-level opt-in for the active-hot-standby Postgres shape
|
||||
# on every CNPG-backed tenant app the marketplace installs.
|
||||
# Default-OFF — every Sovereign that has not flipped
|
||||
# SOVEREIGN_ENABLE_HOT_STANDBY=true on the per-Sovereign overlay
|
||||
# keeps rendering single-Cluster CNPG (no regression). When ON
|
||||
# AND both region keys are non-empty AND distinct, the SME-tenant
|
||||
# gitops writer injects pg.activeHotStandby.* into every fresh
|
||||
# bp-wordpress-tenant HelmRelease so the chart's
|
||||
# cnpg-cluster.yaml template renders a primary + replica
|
||||
# Cluster.postgresql.cnpg.io pair across the two regions, WAL
|
||||
# streaming over Cilium ClusterMesh (DoD D11 + D31). Same wiring
|
||||
# extends to any future tenant product chart (gitlab-tenant,
|
||||
# nextcloud-tenant) that adopts the same value contract.
|
||||
#
|
||||
# Region keys MUST match the canonical openova.io/region node
|
||||
# label value (e.g. `hz-fsn-rtz-prod`, `hz-hel-rtz-prod`) — the
|
||||
# WordPress chart's cnpg-cluster.yaml uses nodeAffinity on that
|
||||
# label to pin the primary + replica Pods to the right regions.
|
||||
enableHotStandby: '${SOVEREIGN_ENABLE_HOT_STANDBY:-}'
|
||||
primaryRegion: '${SOVEREIGN_PRIMARY_REGION:-}'
|
||||
replicaRegion: '${SOVEREIGN_REPLICA_REGION:-}'
|
||||
# configuredRegions — YAML list of region keys this Sovereign was
|
||||
# provisioned with (e.g. ["fsn1", "hel1"]). Threaded from cloud-init's
|
||||
# SOVEREIGN_CONFIGURED_REGIONS_YAML Kustomization postBuild substitute
|
||||
# which the tofu module renders as a YAML inline list literal from
|
||||
# var.regions[*].cloudRegion. The chart's sovereign-fqdn ConfigMap
|
||||
# joins this list into a comma-separated `configuredRegions` key for
|
||||
# the catalyst-ui Dashboard SovereignCard + Networking → ClusterMesh
|
||||
# tab to render configured-but-not-active chips. Defaults to an empty
|
||||
# list so non-multi-region Sovereigns surface only their live region.
|
||||
# TBD-A15 (t24 zero-touch, 2026-05-18, issue #1844).
|
||||
configuredRegions: ${SOVEREIGN_CONFIGURED_REGIONS_YAML:-[]}
|
||||
# qaApplications — YAML list of qa-fixtures applicationRef literals
|
||||
# the chroot Sovereign's /compliance/scorecard surface emits via
|
||||
# appRefs[]. Default empty so production Sovereigns surface only
|
||||
# PolicyReport-observed apps. QA Sovereigns set via QA_APPLICATIONS_YAML.
|
||||
# TBD-A15 (t24 zero-touch, 2026-05-18, issue #1844).
|
||||
qaApplications: ${QA_APPLICATIONS_YAML:-[]}
|
||||
# ─── QA fixtures (qa-loop iter-6 Cluster-F + EPIC-6 iter-6) ────────
|
||||
# Default-OFF on production; flipped to true via envsubst
|
||||
# QA_FIXTURES_ENABLED=true on the per-Sovereign overlay for any
|
||||
|
||||
@ -28,8 +28,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-crossplane-claims
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "14"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: crossplane-claims
|
||||
@ -46,20 +44,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-crossplane-claims
|
||||
# 1.1.3 (qa-loop iter-16 Fix #71): legacy XUserAccess Composition
|
||||
# gated behind new `userAccess.compositionEnabled` (default false).
|
||||
# The catalyst-useraccess-controller is now the canonical day-2
|
||||
# path; the Composition was setting `Ready=False` on every CR
|
||||
# because (a) provider-kubernetes is not installed and (b) post-EPIC-3
|
||||
# CRs use `tierRoleRef` not `applications[0]`. The composite
|
||||
# controller's status-write was overwriting the controller's
|
||||
# `Ready=True`. Disabling the Composition (and `defaultCompositionRef`)
|
||||
# leaves the controller in sole charge of `useraccesses.access.openova.io`
|
||||
# while the XRD itself stays installed (it owns the CRD the
|
||||
# controller watches).
|
||||
# 1.1.4 (Fix #158): kubectlImage switched from bitnami/kubectl:1.31
|
||||
# (deleted from Docker Hub 2025-08) to bitnamilegacy/kubectl:1.31.4.
|
||||
version: 1.1.5
|
||||
version: 1.0.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-crossplane-claims
|
||||
@ -69,12 +54,10 @@ spec:
|
||||
# HR on the upstream CRDs being live; disableWait replaces PR #221's
|
||||
# blanket spec.timeout: 15m band-aid.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -41,8 +41,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-external-secrets
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "15"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: external-secrets
|
||||
@ -59,7 +57,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-external-secrets
|
||||
version: 1.1.1
|
||||
version: 1.1.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-external-secrets
|
||||
@ -68,12 +66,10 @@ spec:
|
||||
# dependsOn is the gate, not Helm timeout). Replaces blanket
|
||||
# spec.timeout: 15m band-aid pattern from PR #221, removed in PR #250.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -34,8 +34,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-external-secrets-stores
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "15a"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: external-secrets-stores
|
||||
@ -50,29 +48,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-external-secrets-stores
|
||||
# 1.0.3 (issue #147 / prov #23): bump webhookGate.timeoutSeconds
|
||||
# 300 -> 600. Fix #141's 300s budget raced on prov #23
|
||||
# (`3ea80c75e1568a5c`, 3rd consecutive FAIL of this HR) where
|
||||
# cold-node image pull (+60-120s, no warmed cache) compounded on
|
||||
# cert-manager's own earlier retry latency, pushing total webhook
|
||||
# convergence past 300s. 600s = realistic max cold-start budget.
|
||||
#
|
||||
# 1.0.2 (issue #141 / prov #21): bump webhookGate.timeoutSeconds
|
||||
# 60 -> 300. Fix #137's 60s budget raced on prov #21
|
||||
# (`f84f6c3ff2b60296`, HR FAILED `failed pre-install: timed out`)
|
||||
# where webhook convergence took 75-105s on slow Hetzner cold-start.
|
||||
#
|
||||
# 1.0.1 (issue #137 / prov #20): pre-install hook gates the
|
||||
# ClusterSecretStore apply on the upstream ESO admission webhook
|
||||
# actually being dial-able (Pod-Ready ≠ Endpoints-populated +
|
||||
# cert-manager Cert mounted + CABundle injected). Without it the
|
||||
# HR FAILED with `exceeded max retries` on cold-cluster provisions
|
||||
# even though dependsOn (bp-external-secrets) was satisfied.
|
||||
#
|
||||
# 1.0.4 (Fix #158): webhookGate hook image switched from
|
||||
# bitnami/kubectl:1.30.4 (deleted from Docker Hub 2025-08) to
|
||||
# bitnamilegacy/kubectl:1.30.7.
|
||||
version: 1.0.5
|
||||
version: 1.0.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-external-secrets-stores
|
||||
@ -81,11 +57,9 @@ spec:
|
||||
# dependsOn is the gate, not Helm timeout).
|
||||
install:
|
||||
disableWait: true
|
||||
timeout: 15m
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
disableWait: true
|
||||
timeout: 15m
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -44,8 +44,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-cnpg
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "16"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: cnpg
|
||||
@ -57,33 +55,17 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-cnpg
|
||||
version: 1.0.1
|
||||
version: 1.0.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-cnpg
|
||||
namespace: flux-system
|
||||
# CNPG: KEEP Helm wait (disableWait: false / default). Consumers
|
||||
# bp-harbor + bp-powerdns + bp-keycloak + bp-gitea apply
|
||||
# postgresql.cnpg.io/v1.Cluster CRs gated by the cnpg mutating webhook
|
||||
# `mcluster.cnpg.io`. If bp-cnpg's HelmRelease goes Ready before the
|
||||
# cnpg-webhook-service has endpoints, Flux dependsOn lets downstream
|
||||
# HRs proceed → their Cluster CR apply gets:
|
||||
# "failed calling webhook \"mcluster.cnpg.io\": no endpoints
|
||||
# available for service \"cnpg-webhook-service\""
|
||||
# → Helm install fails → RetriesExceeded → entire DB-backed chain
|
||||
# (Harbor/PowerDNS/Keycloak/Gitea) wedges. Caught on prov #55/#56
|
||||
# (2026-05-12). disableWait: false (the default) tells Helm to block
|
||||
# the HR's Ready until the webhook deployment is rolled and the
|
||||
# service has endpoints, which is exactly what downstream consumers
|
||||
# need. This is the carve-out from the INVIOLABLE-PRINCIPLES #3
|
||||
# event-driven blanket — the rule's WHY (avoiding agent-waits-for-
|
||||
# its-own-CRDs cilium-style deadlock) does NOT apply here because
|
||||
# bp-cnpg's CRDs are loaded by helm-controller before pods schedule.
|
||||
# Event-driven install per docs/INVIOLABLE-PRINCIPLES.md #3.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -36,8 +36,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-valkey
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "17"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: valkey
|
||||
@ -48,19 +46,17 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-valkey
|
||||
version: 1.0.1
|
||||
version: 1.0.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-valkey
|
||||
namespace: flux-system
|
||||
# Event-driven install per docs/INVIOLABLE-PRINCIPLES.md #3.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -41,8 +41,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-seaweedfs
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "18"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: seaweedfs
|
||||
@ -57,24 +55,17 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-seaweedfs
|
||||
# 1.2.0 — qa-loop Wave 5 Fix #79 Gap B: ships
|
||||
# `seaweedfs-storage` StorageClass (chart-rendered) so PVCs that
|
||||
# default to it (bp-guacamole recordings, future
|
||||
# bp-loki/mimir/tempo cache) bind day-1 on bare-k3s Sovereigns
|
||||
# without waiting for bp-hcloud-csi or the SeaweedFS CSI driver.
|
||||
version: 1.2.0
|
||||
version: 1.1.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-seaweedfs
|
||||
namespace: flux-system
|
||||
# Event-driven install per docs/INVIOLABLE-PRINCIPLES.md #3.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -67,8 +67,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-harbor
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "19"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: harbor
|
||||
@ -101,24 +99,17 @@ spec:
|
||||
# live on otech113 2026-05-05 (issue #935 Bug 1) — Step 02 was
|
||||
# in CreateContainerConfigError for 11+ retries, blocking
|
||||
# cutover indefinitely.
|
||||
version: 1.2.17
|
||||
version: 1.2.15
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-harbor
|
||||
namespace: flux-system
|
||||
# Event-driven install per docs/INVIOLABLE-PRINCIPLES.md #3.
|
||||
# timeout: 15m — Harbor's post-install hooks (DB migration, job-service
|
||||
# init) legitimately need >5m on cold k3s. Same canonical-seam pattern
|
||||
# as Fix #127 (cutover), Fix #131 (gitea), Fix #143 (es-stores):
|
||||
# explicit HR-level timeout overrides Helm's 5m default which expires
|
||||
# before Harbor reaches Ready (prov #24 c776423270f4ae30 04:17 incident).
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -1,151 +0,0 @@
|
||||
# bp-sandbox — Catalyst bootstrap-kit Blueprint slot 19a (post-harbor).
|
||||
#
|
||||
# Deploys the sandbox-controller (Wave 1 + Wave 8) on a Sovereign so
|
||||
# that `sandbox.openova.io/v1.Sandbox` CRs are actually reconciled.
|
||||
# Wave 8 extends the controller to ALSO render per-Sandbox pty-server
|
||||
# StatefulSet + MCP Deployment + Service + HTTPRoute (architecture.md
|
||||
# §7) — without this slot enabled, every Sandbox CR sits unreconciled.
|
||||
#
|
||||
# ─── Slot history: 61 → 19a (Wave 11 convergence fix, 2026-05-18) ────
|
||||
# Originally slot 61. Caught live on t16.omantel.biz: bp-sandbox HR
|
||||
# stuck Reconciling because its chart pull went through
|
||||
# harbor.<sov-fqdn> (bp-self-sovereign-cutover Step-06 phase-1 rewrites
|
||||
# every HelmRepository URL `oci://ghcr.io/openova-io` →
|
||||
# `oci://harbor.<sov-fqdn>/openova-io` after handover), but harbor.<sov
|
||||
# -fqdn> wasn't reachable yet because bp-harbor itself hadn't reached
|
||||
# Ready — chicken-and-egg. Same failure shape as Wave 7 #1610 with
|
||||
# bp-hcloud-csi (REMOVED — see kustomization.yaml comment block).
|
||||
#
|
||||
# Fix here is the cleaner long-term cousin of the Wave 7 hotfix:
|
||||
# instead of removing the slot, sequence it AFTER bp-harbor (slot 19)
|
||||
# by renumbering to 19a + adding `bp-harbor` to dependsOn. Once
|
||||
# bp-harbor is Ready (its chart pull goes through harbor.openova.io,
|
||||
# the mothership-warmed proxy-cache wired into k3s registries.yaml at
|
||||
# cloud-init time — NOT through harbor.<sov-fqdn>, so no cycle there),
|
||||
# this slot's chart pull can resolve against either ghcr.io
|
||||
# (pre-cutover) or harbor.<sov-fqdn> (post-cutover) and find the
|
||||
# artifact. The cutover Step-06 phase-1 URL rewrite is safe by then.
|
||||
|
||||
---
|
||||
apiVersion: source.toolkit.fluxcd.io/v1beta2
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: bp-sandbox
|
||||
namespace: flux-system
|
||||
spec:
|
||||
type: oci
|
||||
interval: 15m
|
||||
url: oci://ghcr.io/openova-io
|
||||
secretRef:
|
||||
name: ghcr-pull
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-sandbox
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "19a"
|
||||
catalyst.openova.io/component: sandbox-controller
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: sandbox
|
||||
targetNamespace: catalyst-system
|
||||
dependsOn:
|
||||
- name: bp-vcluster-helmrepo
|
||||
- name: bp-catalyst-platform
|
||||
# bp-harbor (slot 19, Wave 11 convergence fix 2026-05-18) — sandbox's
|
||||
# chart pull goes through harbor.<sov-fqdn> after the post-handover
|
||||
# cutover Step-06 phase-1 HelmRepository URL rewrite. Without this
|
||||
# edge, source-controller hits harbor.<sov-fqdn> before bp-harbor
|
||||
# is Ready, the OCI fetch 503s, and bp-sandbox sits Reconciling for
|
||||
# the entire bootstrap-kit timeout window — preventing the umbrella
|
||||
# Kustomization from ever reaching Ready. Same chicken-and-egg as
|
||||
# Wave 7 #1610 (bp-hcloud-csi, REMOVED) but resolved by sequencing
|
||||
# rather than removal so the slot remains available for Wave 11
|
||||
# Sandbox MVP without manual Day-2 add-app re-introduction.
|
||||
- name: bp-harbor
|
||||
chart:
|
||||
spec:
|
||||
chart: sandbox
|
||||
version: 0.1.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-sandbox
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 10m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 10m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
# Per-Sovereign overlay surface.
|
||||
#
|
||||
# enabled — default-ON via ${SANDBOX_ENABLED:-true} on the
|
||||
# bootstrap-kit Kustomization substitute. Wave 11 convergence fix
|
||||
# (TBD-D11, t22.omantel.biz 2026-05-18): every Sandbox CR sat
|
||||
# unreconciled because the bootstrap-kit Kustomization's substitute
|
||||
# map never wires SANDBOX_ENABLED, so the envsubst resolved to the
|
||||
# `:-false` fallback and the chart skip-rendered the entire
|
||||
# controller Deployment. With Wave 8 pty-server + MCP images now
|
||||
# SHA-stamped in chart values.yaml (auto-bumped by .github/workflows/
|
||||
# build-sandbox-{pty-server,mcp-server}.yaml), the gate's original
|
||||
# purpose is satisfied — flip default-ON so the controller materialises
|
||||
# on every fresh prov. Operators may still opt-OUT by setting
|
||||
# `SANDBOX_ENABLED=false` on the per-Sovereign overlay's substitute
|
||||
# map (mirrors how MARKETPLACE_ENABLED works in slot 13).
|
||||
#
|
||||
# runtime.* — Wave 8 pty-server / MCP / NEWAPI wiring. The
|
||||
# controller surfaces these to its per-Sandbox renderer (manifests
|
||||
# rendered into the per-Org `catalyst-tenant` Gitea repo at
|
||||
# sandbox/<owner-uid>/).
|
||||
#
|
||||
# Image overrides are OMITTED from this slot's HR values — the
|
||||
# chart's values.yaml already SHA-pins both images (auto-bumped by
|
||||
# CI) and exposing them as substitute vars without the corresponding
|
||||
# entries in the bootstrap-kit Kustomization postBuild.substitute
|
||||
# map causes Flux to substitute empty strings → null → the chart's
|
||||
# `required` guard would fail render once enabled=true. Day-2 SHA
|
||||
# overrides remain available via Sovereign-overlay HelmRelease
|
||||
# patches under spec.values.runtime.{ptyServerImage,mcpImage} — but
|
||||
# the canonical path is bumping chart values.yaml + bootstrap-kit
|
||||
# pin (single source of truth, INVIOLABLE-PRINCIPLES.md #4a).
|
||||
values:
|
||||
enabled: ${SANDBOX_ENABLED:-true}
|
||||
env:
|
||||
hostCluster: ${SOVEREIGN_REGION_CANONICAL_LABEL}
|
||||
sovereignFQDN: ${SOVEREIGN_FQDN}
|
||||
# TBD-D35c (Wave 32 verifier fix) — comma-separated list of
|
||||
# NewAPI channel names the controller stamps as `allowed_channels`
|
||||
# on every per-Sandbox token mint. Default `qwen` matches the
|
||||
# only channel bp-newapi's channel-seed-job.yaml writes on a
|
||||
# fresh Sovereign install (alias for `qwen3.6-bankdhofar`,
|
||||
# products/sandbox/docs/newapi-proxy-contract.md §2). Per-
|
||||
# Sovereign overlays MUST extend this list to mirror their
|
||||
# channel rollout (e.g. `qwen,anthropic,openai`) — the chart's
|
||||
# NoAllowedChannels guard fails every mint if this resolves to
|
||||
# empty.
|
||||
newapiDefaultChannels: ${SANDBOX_DEFAULT_CHANNELS:-qwen}
|
||||
runtime:
|
||||
newapiURL: https://newapi.${SOVEREIGN_FQDN}/v1
|
||||
# D31 active-hot-standby — when SOVEREIGN_ENABLE_HOT_STANDBY=true on
|
||||
# the per-Sovereign overlay (and both regions are non-empty AND
|
||||
# distinct), sandbox.db.provision materialises a primary + replica
|
||||
# Cluster.postgresql.cnpg.io pair instead of a single Cluster
|
||||
# (mirrors the bp-cnpg-pair pattern + bp-wordpress-tenant chart
|
||||
# 0.2.0+). Same trio of envsubst placeholders bp-catalyst-platform
|
||||
# slot 13 consumes for the marketplace tenant path — flipping one
|
||||
# knob on the per-Sovereign overlay covers BOTH paths so HA stays
|
||||
# consistent across the marketplace tenant install and the
|
||||
# sandbox.db plane. Default empty = single-Cluster CNPG (zero
|
||||
# regression). Region keys MUST match the canonical openova.io/
|
||||
# region node label value (e.g. `hz-fsn-rtz-prod`).
|
||||
cnpg:
|
||||
activeHotStandby:
|
||||
enabled: ${SOVEREIGN_ENABLE_HOT_STANDBY:-}
|
||||
primaryRegion: ${SOVEREIGN_PRIMARY_REGION:-}
|
||||
replicaRegion: ${SOVEREIGN_REPLICA_REGION:-}
|
||||
@ -49,8 +49,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-opentelemetry
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "20"
|
||||
spec:
|
||||
interval: 15m
|
||||
timeout: 15m
|
||||
@ -67,12 +65,10 @@ spec:
|
||||
name: bp-opentelemetry
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -46,8 +46,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-alloy
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "21"
|
||||
spec:
|
||||
interval: 15m
|
||||
timeout: 15m
|
||||
@ -64,12 +62,10 @@ spec:
|
||||
name: bp-alloy
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -43,8 +43,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-loki
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "22"
|
||||
spec:
|
||||
interval: 15m
|
||||
timeout: 15m
|
||||
@ -61,12 +59,10 @@ spec:
|
||||
name: bp-loki
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -43,8 +43,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-mimir
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "23"
|
||||
spec:
|
||||
interval: 15m
|
||||
timeout: 15m
|
||||
@ -55,18 +53,16 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-mimir
|
||||
version: 1.0.4
|
||||
version: 1.0.2
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-mimir
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -41,8 +41,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-tempo
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "24"
|
||||
spec:
|
||||
interval: 15m
|
||||
timeout: 15m
|
||||
@ -59,12 +57,10 @@ spec:
|
||||
name: bp-tempo
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -46,8 +46,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-grafana
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "25"
|
||||
spec:
|
||||
interval: 15m
|
||||
timeout: 15m
|
||||
@ -65,18 +63,16 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-grafana
|
||||
version: 1.0.1
|
||||
version: 1.0.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-grafana
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -43,8 +43,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-kyverno
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "27"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: kyverno
|
||||
@ -54,7 +52,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-kyverno
|
||||
version: 1.1.0
|
||||
version: 1.0.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-kyverno
|
||||
@ -66,12 +64,10 @@ spec:
|
||||
# past the point where downstream HRs could legitimately reconcile.
|
||||
# disableWait lets Flux mark this Ready as soon as manifests apply.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -41,8 +41,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-reloader
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "28"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: reloader
|
||||
@ -60,12 +58,10 @@ spec:
|
||||
# HR Ready signal aligned with manifest apply rather than runtime
|
||||
# convergence, matching the rest of the bootstrap-kit.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -41,8 +41,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-vpa
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "29"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: vpa
|
||||
@ -59,12 +57,10 @@ spec:
|
||||
# updater, admission-controller) plus admission webhook TLS bootstrap.
|
||||
# disableWait keeps Flux's Ready signal aligned with manifest apply.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -42,8 +42,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-trivy
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "30"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: trivy
|
||||
@ -64,12 +62,10 @@ spec:
|
||||
# mark this Ready as soon as manifests apply; runtime convergence
|
||||
# (DB hydration, first scan reports landing) is observed via kubectl.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -40,8 +40,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-falco
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "31"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: falco
|
||||
@ -62,12 +60,10 @@ spec:
|
||||
# Helm `--wait`. disableWait keeps Flux's signal aligned with
|
||||
# manifest apply.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -43,8 +43,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-sigstore
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "32"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: sigstore
|
||||
@ -64,12 +62,10 @@ spec:
|
||||
# Certificate is issued + bound. disableWait avoids holding the HR
|
||||
# signal on a runtime-convergence event.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -40,8 +40,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-syft-grype
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "33"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: syft-grype
|
||||
@ -61,12 +59,10 @@ spec:
|
||||
# meaningful — disableWait is the correct shape so Flux marks Ready
|
||||
# as soon as manifests apply.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -58,8 +58,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-velero
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "34"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: velero
|
||||
@ -67,7 +65,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-velero
|
||||
version: 1.2.2
|
||||
version: 1.2.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-velero
|
||||
@ -78,12 +76,10 @@ spec:
|
||||
# observes via the BSL CR phase, not via Helm `--wait`. disableWait
|
||||
# keeps the HR's Ready signal aligned with manifest apply.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -45,8 +45,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-coraza
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "35"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: coraza
|
||||
@ -63,12 +61,10 @@ spec:
|
||||
name: bp-coraza
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -82,8 +82,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-cert-manager-powerdns-webhook
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "49"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: cert-manager-powerdns-webhook
|
||||
@ -97,7 +95,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-cert-manager-powerdns-webhook
|
||||
version: 1.1.0
|
||||
version: 1.0.4
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-cert-manager-powerdns-webhook
|
||||
@ -107,12 +105,10 @@ spec:
|
||||
# so blocking on Helm `--wait` for the leaf Certificate to reach
|
||||
# Ready is unnecessary. Replaces blanket spec.timeout band-aids.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -68,8 +68,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-cluster-autoscaler-hcloud
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "50"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: cluster-autoscaler
|
||||
@ -77,24 +75,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-cluster-autoscaler-hcloud
|
||||
# 1.3.0 — qa-loop chroot-canvas Fix: wire HCLOUD_NETWORK /
|
||||
# HCLOUD_FIREWALL / HCLOUD_SSH_KEY env vars onto the autoscaler
|
||||
# deployment so scale-up VMs land on the Phase-0 private network +
|
||||
# firewall + ssh-key, identical to Phase-0 workers. Without these
|
||||
# the autoscaler-spawned VMs only receive a public IP, the worker
|
||||
# cloud-init's `K3S_URL=https://10.0.1.2:6443` is unreachable, the
|
||||
# k3s agent join silently fails, and every scale-up times out at
|
||||
# 15m → backoff. Live evidence: prov #38/#39/#41/#43 omantel.biz.
|
||||
#
|
||||
# 1.2.0 — qa-loop Wave 5 Fix #79 Gap D: chart-derived
|
||||
# HCLOUD_CLUSTER_CONFIG fallback. When the per-Sovereign
|
||||
# cloud-init has not stamped the `hcloud-cloud-init` key into
|
||||
# `flux-system/cloud-credentials`, the chart synthesises a
|
||||
# minimal HCLOUD_CLUSTER_CONFIG JSON from the existing
|
||||
# `cluster-autoscaler.autoscalingGroups[]` so the autoscaler
|
||||
# never FATALs with the generic
|
||||
# "HCLOUD_CLUSTER_CONFIG or HCLOUD_CLOUD_INIT is not specified".
|
||||
version: 1.3.0
|
||||
version: 1.0.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-cluster-autoscaler-hcloud
|
||||
@ -105,12 +86,10 @@ spec:
|
||||
# not a Helm-wait concern. disableWait keeps Flux's Ready signal
|
||||
# aligned with manifest apply.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
@ -152,39 +131,6 @@ spec:
|
||||
# operators rotate by re-running cloud-init or by patching
|
||||
# cloud-credentials directly).
|
||||
optional: true
|
||||
# ── Issue #1778 — Hetzner network/firewall/ssh-key attachment ──────
|
||||
# The cluster-autoscaler-hcloud provider only attaches scale-up
|
||||
# servers to an existing private network when HCLOUD_NETWORK is set
|
||||
# at startup. Without it, the Phase-0 workers (which join via
|
||||
# 10.0.1.2:6443 on the private subnet) and the autoscaler-spawned
|
||||
# workers (which only have a public IP) live on different network
|
||||
# planes — the autoscaler VMs cannot reach the apiserver private
|
||||
# endpoint, the k3s agent join times out, the node never registers,
|
||||
# the autoscaler hits a 15m scale-up timeout and enters backoff.
|
||||
# Names are written by cloud-init (see
|
||||
# infra/hetzner/cloudinit-control-plane.tftpl `hcloud-network-name`
|
||||
# etc.) so the autoscaler attaches every scale-up VM to the
|
||||
# SAME network + firewall + ssh-key the Phase-0 Tofu module created.
|
||||
# The chart's values.yaml default of empty-string keeps the upstream
|
||||
# deployment shape valid for legacy Sovereigns whose cloud-init
|
||||
# never stamped these keys; on those Sovereigns Flux just skips the
|
||||
# entry (optional: true) and the autoscaler runs in its pre-#1778
|
||||
# shape (still broken, but no Helm render error).
|
||||
- kind: Secret
|
||||
name: cloud-credentials
|
||||
valuesKey: hcloud-network-name
|
||||
targetPath: cluster-autoscaler.extraEnv.HCLOUD_NETWORK
|
||||
optional: true
|
||||
- kind: Secret
|
||||
name: cloud-credentials
|
||||
valuesKey: hcloud-firewall-name
|
||||
targetPath: cluster-autoscaler.extraEnv.HCLOUD_FIREWALL
|
||||
optional: true
|
||||
- kind: Secret
|
||||
name: cloud-credentials
|
||||
valuesKey: hcloud-ssh-key-name
|
||||
targetPath: cluster-autoscaler.extraEnv.HCLOUD_SSH_KEY
|
||||
optional: true
|
||||
# Per-Sovereign baseline values. clusters/<sovereign>/bootstrap-kit/
|
||||
# 40-cluster-autoscaler.yaml MAY override `autoscalingGroups` to set
|
||||
# the actual instanceType + region + min/max + name the Tofu module
|
||||
|
||||
@ -49,8 +49,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-k8s-ws-proxy
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "51"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: k8s-ws-proxy
|
||||
@ -66,34 +64,16 @@ spec:
|
||||
# in values.yaml. The imagePullSecrets default is required so
|
||||
# omantel pods can pull from private GHCR without per-Sovereign
|
||||
# overlay (the catalyst-system `ghcr-pull` secret is canonical).
|
||||
# 0.1.6 (qa-loop bounded-cycle Wave 5 Fix #78, Gap E): adds
|
||||
# pre-install hook-weight -10 Job that auto-generates the
|
||||
# `k8s-ws-proxy-hmac` Secret from /dev/urandom when absent.
|
||||
# Pre-this, every fresh Sovereign sat with three k8s-ws-proxy
|
||||
# pods ContainerCreating forever — the chart referenced a
|
||||
# Secret that nothing ever created. Idempotent on upgrade
|
||||
# (preserves the existing key — rotating it would invalidate
|
||||
# every in-flight catalyst-api signature).
|
||||
# 0.1.9 (qa-loop bounded-cycle Fix #95, regression of Fix #78):
|
||||
# explicit hook-weight ordering for the hmac-bootstrap quartet
|
||||
# (SA=-20, Role+RoleBinding=-15, Job=-10) so the SA lands BEFORE
|
||||
# the Job that references it. Pre-this, prov #8 failed with
|
||||
# `serviceaccount "k8s-ws-proxy-hmac-bootstrap" not found`
|
||||
# because the Job (weight -10, lower=earlier in Helm) was
|
||||
# applied before its SA (weight 0). Bumps Chart.yaml 0.1.7 ->
|
||||
# 0.1.8; CI promote auto-bumps to 0.1.9 with new image SHA.
|
||||
version: 0.1.11
|
||||
version: 0.1.5
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-k8s-ws-proxy
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -63,8 +63,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-guacamole
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "52"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: guacamole
|
||||
@ -91,55 +89,16 @@ spec:
|
||||
# chart wanted seaweedfs-storage, K8s rejected the immutable-spec
|
||||
# patch with `cannot patch ... PersistentVolumeClaim ... is
|
||||
# invalid: spec: Forbidden: spec is immutable after creation`).
|
||||
# 0.1.15 (Fix #158): migrationImage bumped to
|
||||
# bitnamilegacy/kubectl:1.30.7 (was 1.29.3); template fallback no
|
||||
# longer references bitnami/kubectl (deleted from Docker Hub 2025-08).
|
||||
# 0.1.19 (2026-05-12): chart-bump mirror chain caught up — tests/
|
||||
# render.sh expect_total realigned from 15 → 19 (Fix #125's
|
||||
# bootstrap Job for guacamole-oidc Secret added 4 resources: 1 Job
|
||||
# + 1 ServiceAccount + 1 Role + 1 RoleBinding). Prior to the test
|
||||
# fix, every Build bp-guacamole run published images but the
|
||||
# Blueprint Release dispatched on the bump commit failed render.sh
|
||||
# → 0.1.13–0.1.18 were never published to GHCR → bootstrap-kit
|
||||
# HRs wedged at "ghcr.io/openova-io/bp-guacamole:0.1.17: not found".
|
||||
# 0.1.21 (Refs TBD-G4 / C5-009, 2026-05-18): pulls in PR #1684
|
||||
# (guacamole-deployment.yaml mount /home/guacamole instead of
|
||||
# /home/guacamole/.guacamole). The official Apache Guacamole
|
||||
# image entrypoint runs `rm -rf $GUACAMOLE_HOME` before
|
||||
# repopulating the directory on every start; when the emptyDir
|
||||
# was mounted directly at /home/guacamole/.guacamole the path
|
||||
# was a mount point and `rm` failed with `Read-only file
|
||||
# system`, crash-looping the webapp before Tomcat ever booted
|
||||
# (observed on t22, 16 restarts). Mounting the parent dir
|
||||
# makes .guacamole a regular subdirectory the entrypoint can
|
||||
# freely rm and recreate.
|
||||
# 0.1.22 (Refs TBD-G6 / C12-004, 2026-05-18): pulls in PR #1692
|
||||
# (values default guacamole.httproute.parentRef.namespace
|
||||
# gateway-system -> kube-system). The
|
||||
# catalyst-system/guacamole-server HTTPRoute on t22 went
|
||||
# Accepted=False because gateway-system/cilium-gateway does not
|
||||
# exist on any Sovereign — the canonical gateway is
|
||||
# kube-system/cilium-gateway installed by 01-cilium.yaml and
|
||||
# used by every other Sovereign HTTPRoute.
|
||||
# 0.1.23 (Refs TBD-G4 phase 2 / C12-005, 2026-05-18): pulls in
|
||||
# PR #1699 (liveness + readiness probe paths flipped from `/`
|
||||
# to `/guacamole/`). The Apache Guacamole webapp deploys under
|
||||
# Tomcat's context path /guacamole/, not /, so probing `/`
|
||||
# made kubelet restart the Pod every ~60s and the kube-system
|
||||
# Cilium gateway returned 503 to the public hostname because
|
||||
# the Endpoint was never Ready (observed on t22, 5 restarts).
|
||||
version: 0.1.24
|
||||
version: 0.1.9
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-guacamole
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
|
||||
@ -1,107 +0,0 @@
|
||||
# bp-dmz-vcluster — Catalyst bootstrap-kit Blueprint slot 54.
|
||||
#
|
||||
# Per-region DMZ vCluster — installed on EVERY region (primary AND
|
||||
# every secondary). The DMZ vCluster is:
|
||||
# - the public-fronted vCluster (hosts Cilium Gateway HTTPS ingress)
|
||||
# - the inter-region WireGuard hop per docs/SOVEREIGN-MULTI-REGION-
|
||||
# DOD.md A2 (inter-region link = DMZ WG over PUBLIC IPs, ALWAYS)
|
||||
# - the home of clustermesh-apiserver Service type=LoadBalancer
|
||||
# per DoD A3
|
||||
#
|
||||
# Per docs/SOVEREIGN-MULTI-REGION-DOD.md A4 (vCluster topology):
|
||||
# primary region → MGMT + DMZ vCluster (slot 58 + this slot)
|
||||
# secondary region → DMZ + RTZ vCluster (this slot + slot 59)
|
||||
#
|
||||
# Supersedes the inert `bp-dmz-vcluster` slot 54 entry declared in
|
||||
# scripts/expected-bootstrap-deps.yaml since qa-loop iter-12 Fix #53C
|
||||
# (the chart was authored at products/dmz-vcluster but never wired
|
||||
# into the bootstrap kit). The chart at products/dmz-vcluster
|
||||
# remains the per-tenant marketplace deliverable — different artifact,
|
||||
# now also a different chart name: `bp-dmz-vcluster-tenant` (renamed
|
||||
# 2026-05-18 per TBD-A6c, issue #1719, so the pin-sync audit can
|
||||
# disambiguate the two). This slot pins THIS chart (platform/) at
|
||||
# the version declared in platform/bp-dmz-vcluster/chart/Chart.yaml.
|
||||
#
|
||||
# Wrapper chart: platform/bp-dmz-vcluster/chart/
|
||||
# Bundles loft-sh/vcluster 0.20.0 as a Helm subchart.
|
||||
#
|
||||
# Reconciled by: Flux on every region's k3s control plane.
|
||||
#
|
||||
# dependsOn:
|
||||
# - bp-cilium — CNI + Gateway API
|
||||
# - bp-cert-manager — TLS for ClusterIssuers / wildcard cert
|
||||
|
||||
---
|
||||
apiVersion: source.toolkit.fluxcd.io/v1beta2
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: bp-dmz-vcluster
|
||||
namespace: flux-system
|
||||
spec:
|
||||
type: oci
|
||||
interval: 15m
|
||||
url: oci://ghcr.io/openova-io
|
||||
secretRef:
|
||||
name: ghcr-pull
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-dmz-vcluster
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "54"
|
||||
catalyst.openova.io/vcluster-role: dmz
|
||||
catalyst.openova.io/topology: dod-a4
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: dmz-vcluster
|
||||
targetNamespace: dmz
|
||||
dependsOn:
|
||||
- name: bp-cilium
|
||||
- name: bp-cert-manager
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-dmz-vcluster
|
||||
version: 0.1.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-dmz-vcluster
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
# Per-Sovereign overlay surface.
|
||||
#
|
||||
# dmzVcluster.enabled — DMZ runs on every region by design (DoD A4).
|
||||
# Default-ON to deliver the topology contract on day-one.
|
||||
values:
|
||||
dmzVcluster:
|
||||
enabled: true
|
||||
hostNamespace: dmz
|
||||
vclusterName: dmz
|
||||
role: every-region
|
||||
nodeSelector:
|
||||
regionLabelKey: openova.io/region
|
||||
# Substituted by the bootstrap-kit Kustomization with THIS
|
||||
# region's CANONICAL k3s node-label value (e.g.
|
||||
# `hz-hel-rtz-prod` on hel1, `hz-nbg-rtz-prod` on nbg1,
|
||||
# `hz-sin-rtz-prod` on sin). Caught on t126 (2026-05-16): the
|
||||
# prior `${SOVEREIGN_REGION_KEY}` ("hel1"/"nbg1-1"/"sin-2")
|
||||
# didn't match the node label written by cloud-init (which uses
|
||||
# `region_canonical_label` not `sovereign_region_key`) so every
|
||||
# DMZ vCluster Pod sat Pending with FailedScheduling.
|
||||
regionLabelValue: ${SOVEREIGN_REGION_CANONICAL_LABEL}
|
||||
vcluster:
|
||||
controlPlane:
|
||||
statefulSet:
|
||||
scheduling:
|
||||
nodeSelector:
|
||||
openova.io/region: ${SOVEREIGN_REGION_CANONICAL_LABEL}
|
||||
@ -1,110 +0,0 @@
|
||||
# bp-hcloud-ccm — Catalyst bootstrap-kit Blueprint #55
|
||||
# (Tier 5 — Cloud Integration). Pairs with bp-cluster-autoscaler-hcloud
|
||||
# (slot 50) and bp-hcloud-csi (slot 51, when present) — the full
|
||||
# Hetzner-cloud-direct trio.
|
||||
#
|
||||
# Wires hcloud-cloud-controller-manager into the cluster as the canonical
|
||||
# Hetzner cloud-provider integration. Without this CCM running:
|
||||
# - Node providerID stays as k3s://<node-name> (kube-controller-manager
|
||||
# + scheduler cannot correlate Pods with Hetzner zones).
|
||||
# - Service-of-type-LoadBalancer stays in EXTERNAL-IP: <pending> forever
|
||||
# (no cloud-provider to call out to).
|
||||
#
|
||||
# The second consequence is the proximate reason clustermesh-apiserver
|
||||
# could not migrate from NodePort to LB on omantel multi-region (qa-loop
|
||||
# iter-12 Fix #53D + Fix #54 Workstream 1) — until hcloud-CCM is
|
||||
# installed, the LB-typed Service from Fix #53D's PR #1274 stays Pending.
|
||||
#
|
||||
# Wrapper chart: platform/hcloud-ccm/chart/ — umbrella over upstream
|
||||
# hetznercloud/hcloud-cloud-controller-manager chart 1.20.0
|
||||
# (appVersion 1.20.0). Catalyst-curated values flow under the
|
||||
# `hcloud-cloud-controller-manager:` key + a vendor-agnostic
|
||||
# `hcloudCcm.*` block that ships the namespace-local Hetzner-API-token
|
||||
# Secret (`hcloud-token`).
|
||||
#
|
||||
# Reconciled by: Flux on the new Sovereign's k3s control plane.
|
||||
#
|
||||
# Hetzner-token wiring (mirrors bp-cluster-autoscaler-hcloud at slot 50
|
||||
# + bp-velero at slot 34 + bp-harbor at slot 19):
|
||||
# - cloud-init writes `flux-system/cloud-credentials` Secret with the
|
||||
# `hcloud-token` key (see infra/hetzner/cloudinit-control-plane.tftpl
|
||||
# §"cloud-credentials-secret").
|
||||
# - This HelmRelease lifts the `hcloud-token` value into the umbrella
|
||||
# chart's `hcloudCcm.hcloudToken` value via Flux `valuesFrom`. The
|
||||
# umbrella chart then synthesises a namespace-local
|
||||
# `kube-system/hcloud-token` Secret (templates/hcloud-token-secret.yaml)
|
||||
# that the upstream chart's `env.HCLOUD_TOKEN` wiring binds as the
|
||||
# deployment's HCLOUD_TOKEN env var.
|
||||
#
|
||||
# dependsOn: (none) — hcloud-CCM is the FIRST cloud-provider seam, must
|
||||
# install BEFORE any blueprint that creates a LoadBalancer Service. The
|
||||
# cloud-credentials Secret is provisioned by cloud-init BEFORE Flux
|
||||
# installs anything.
|
||||
|
||||
---
|
||||
# kube-system is built into every Kubernetes cluster — never re-declare it.
|
||||
# The HelmRelease's targetNamespace below installs hcloud-CCM into
|
||||
# kube-system (canonical CCM placement; mirrors hcloud-CCM upstream
|
||||
# documentation).
|
||||
apiVersion: source.toolkit.fluxcd.io/v1beta2
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: bp-hcloud-ccm
|
||||
namespace: flux-system
|
||||
spec:
|
||||
type: oci
|
||||
interval: 15m
|
||||
url: oci://ghcr.io/openova-io
|
||||
secretRef:
|
||||
name: ghcr-pull
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-hcloud-ccm
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "55"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: hcloud-ccm
|
||||
targetNamespace: kube-system
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-hcloud-ccm
|
||||
version: 1.0.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-hcloud-ccm
|
||||
namespace: flux-system
|
||||
# Event-driven install: hcloud-CCM is a single Deployment +
|
||||
# ServiceAccount + RBAC. Helm install completes when manifests apply;
|
||||
# the binary's Hetzner-API connectivity check is a runtime concern,
|
||||
# not a Helm-wait concern. disableWait keeps Flux's Ready signal
|
||||
# aligned with manifest apply.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
# ── Hetzner-token wiring ─────────────────────────────────────────────
|
||||
# Pulls the `hcloud-token` key from the canonical
|
||||
# `flux-system/cloud-credentials` Secret cloud-init writes at Phase 0
|
||||
# (infra/hetzner/cloudinit-control-plane.tftpl §"cloud-credentials-secret").
|
||||
# Flux dereferences `valuesFrom` at HelmRelease apply time, so the
|
||||
# plaintext payload never appears in this committed manifest.
|
||||
#
|
||||
# The chart's templates/hcloud-token-secret.yaml renders this value
|
||||
# into a namespace-local `kube-system/hcloud-token` Secret which the
|
||||
# upstream chart's `env.HCLOUD_TOKEN.valueFrom.secretKeyRef` binding
|
||||
# lifts onto the deployment's env.
|
||||
valuesFrom:
|
||||
- kind: Secret
|
||||
name: cloud-credentials
|
||||
valuesKey: hcloud-token
|
||||
targetPath: hcloudCcm.hcloudToken
|
||||
@ -1,113 +0,0 @@
|
||||
# bp-openova-flow-server — Catalyst bootstrap-kit Blueprint slot 56
|
||||
# (Observability / OpenovaFlow event router).
|
||||
#
|
||||
# Stateless HTTP+SSE event router for OpenovaFlow. Emitters
|
||||
# (bp-openova-flow-emitter on every cluster, catalyst-api proxy on the
|
||||
# mother) POST FlowMessage envelopes; consumers (Sovereign Console
|
||||
# canvas) GET snapshots and subscribe to the SSE stream.
|
||||
#
|
||||
# Architecture:
|
||||
# - Primary-cluster only — one Service per Sovereign, reached
|
||||
# cross-region via Cilium Gateway HTTPRoute over public HTTPS.
|
||||
# No NetBird required for v1.
|
||||
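# - Flow state lives in a CNPG-backed store (see the bp-cnpg
|
||||
# dependsOn entry in the HelmRelease spec below). It replaced the
|
||||
# original in-memory ring buffer per flowId (default 4096
|
||||
# envelopes), whose state was lost on Pod restart; emitters
|
||||
# re-emit snapshot on reconnect.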
|
||||
# - Workload: single Deployment, ClusterIP Service, optional
|
||||
# HTTPRoute for cross-cluster reachability.
|
||||
#
|
||||
# Wrapper chart: platform/openova-flow-server/chart/
|
||||
# Catalyst-curated values: platform/openova-flow-server/chart/values.yaml
|
||||
# Reconciled by: Flux on the new Sovereign's k3s control plane.
|
||||
#
|
||||
# dependsOn:
|
||||
# - bp-cilium — Pod network + Gateway API for the operator-
|
||||
# facing HTTPRoute.
|
||||
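# - bp-cert-manager — TLS for openova-flow.<sovereign-fqdn>.
|
||||
# - bp-cnpg — provides postgresql.cnpg.io/v1, the CRD used by the
|
||||
# chart's cnpg-cluster.yaml.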
|
||||
#
|
||||
# Per docs/INVIOLABLE-PRINCIPLES.md #1 (target-state) the chart ships
|
||||
# the real workload. Per #4 (never hardcode) the hostname,
|
||||
# ringCapacity, and image tag are operator-driven.
|
||||
|
||||
---
|
||||
apiVersion: source.toolkit.fluxcd.io/v1beta2
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: bp-openova-flow-server
|
||||
namespace: flux-system
|
||||
spec:
|
||||
type: oci
|
||||
interval: 15m
|
||||
url: oci://ghcr.io/openova-io
|
||||
secretRef:
|
||||
name: ghcr-pull
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-openova-flow-server
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "56"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: openova-flow-server
|
||||
# Lands in catalyst-system co-located with the rest of the
|
||||
# Catalyst control-plane stack (catalyst-api / catalyst-ui / etc.).
|
||||
targetNamespace: catalyst-system
|
||||
dependsOn:
|
||||
- name: bp-cilium
|
||||
- name: bp-cert-manager
|
||||
# CNPG provides postgresql.cnpg.io/v1, the CRD used by the chart's
|
||||
# cnpg-cluster.yaml. Without this dep the cold install's `kind:
|
||||
# Cluster` manifest is rejected and the HR loops on InstallFailed.
|
||||
# Added 2026-05-14 alongside the in-memory → CNPG-backed store
|
||||
# rewrite (PR replacing the brittle in-memory map+RingBuffer that
|
||||
# lost ALL flow state on pod restart).
|
||||
- name: bp-cnpg
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-openova-flow-server
|
||||
version: 0.2.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-openova-flow-server
|
||||
namespace: flux-system
|
||||
# Event-driven install: openova-flow-server is a single Deployment +
|
||||
# Service + ServiceAccount. Helm install completes when manifests
|
||||
# apply; readiness signalled via Flux dependsOn, never via
|
||||
# spec.timeout watchdogs.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
# Per-Sovereign overlay surface. The Sovereign's FQDN is interpolated
|
||||
# at Flux apply time via the bootstrap-kit Kustomization's
|
||||
# postBuild.substitute env hook — `${SOVEREIGN_FQDN}` is replaced
|
||||
# with the concrete sovereign FQDN before the HR bytes land in the
|
||||
# cluster.
|
||||
values:
|
||||
flowServer:
|
||||
enabled: true
|
||||
httproute:
|
||||
# Default ON — cross-cluster emitters reach this server's
|
||||
# public HTTPS endpoint via the Cilium Gateway. Per-Sovereign
|
||||
# overlay disables when only the in-cluster Service is needed.
|
||||
enabled: true
|
||||
hostname: openova-flow.${SOVEREIGN_FQDN}
|
||||
# Canonical Sovereign Gateway — every other HTTPRoute
|
||||
# (catalyst-api, catalyst-ui, marketplace, gitea, harbor,
|
||||
# keycloak, …) parents to kube-system/cilium-gateway installed
|
||||
# by bootstrap-kit/01-cilium.yaml. Fix (TBD-G6 / C12-004):
|
||||
# the previous value `catalyst-gateway` does not exist on any
|
||||
# Sovereign — the HTTPRoute went Accepted=False with "no
|
||||
# matching parent" on t22.
|
||||
gatewayRef:
|
||||
name: cilium-gateway
|
||||
namespace: kube-system
|
||||
@ -1,101 +0,0 @@
|
||||
# bp-openova-flow-emitter — Catalyst bootstrap-kit Blueprint slot 57
|
||||
# (Observability / OpenovaFlow Flux adapter).
|
||||
#
|
||||
# Region-aware DaemonSet sidecar that watches HelmRelease + HelmChart
|
||||
# CRs on the LOCAL cluster's Flux and POSTs FlowMessage envelopes to
|
||||
# the configured openova-flow-server (slot 56, primary cluster only).
|
||||
#
|
||||
# Topology — runs on EVERY cluster (mother + primary Sovereign + every
|
||||
# secondary region). The receiving server sits on the primary cluster;
|
||||
# cross-cluster reachability is via the Cilium Gateway HTTPRoute over
|
||||
# public HTTPS.
|
||||
#
|
||||
# Wrapper chart: platform/openova-flow-emitter/chart/
|
||||
# Catalyst-curated values: platform/openova-flow-emitter/chart/values.yaml
|
||||
# Reconciled by: Flux on the new Sovereign's k3s control plane.
|
||||
#
|
||||
# dependsOn:
|
||||
# - bp-flux — informer needs Flux's helmrelease CRDs.
|
||||
#
|
||||
# Per docs/INVIOLABLE-PRINCIPLES.md #1 (target-state) the emitter runs
|
||||
# from first cut on every cluster. Per #4 (never hardcode) the
|
||||
# FLOW_SERVER_URL, FLOW_ID, and REGION_KEY all flow from the
|
||||
# per-Sovereign overlay's substitute env.
|
||||
|
||||
---
|
||||
apiVersion: source.toolkit.fluxcd.io/v1beta2
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: bp-openova-flow-emitter
|
||||
namespace: flux-system
|
||||
spec:
|
||||
type: oci
|
||||
interval: 15m
|
||||
url: oci://ghcr.io/openova-io
|
||||
secretRef:
|
||||
name: ghcr-pull
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-openova-flow-emitter
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "57"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: openova-flow-emitter
|
||||
targetNamespace: catalyst-system
|
||||
dependsOn:
|
||||
- name: bp-flux
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-openova-flow-emitter
|
||||
version: 0.1.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-openova-flow-emitter
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
# Per-Sovereign overlay surface. ${SOVEREIGN_FQDN},
|
||||
# ${SOVEREIGN_DEPLOYMENT_ID} and ${SOVEREIGN_REGION_KEY} are all
|
||||
# provided by the bootstrap-kit Kustomization's postBuild.substitute
|
||||
# env hook (see infra/hetzner/cloudinit-control-plane.tftpl, wired in
|
||||
# main.tf for primary CP + secondary CP for_each so multi-region
|
||||
# Sovereigns get distinct region tags on FlowNodes).
|
||||
#
|
||||
# FlowID — the catalyst-api per-deployment 16-char hex id. The catalyst-
|
||||
# api proxy /api/v1/flows/{deploymentId}/* queries the openova-flow-
|
||||
# server under the same id, so this is the canonical key linking the
|
||||
# canvas to the emitter.
|
||||
# RegionKey — Hetzner region code for this cluster ("fsn1" for primary,
|
||||
# "hel1"/etc for secondaries). Stamped onto every FlowNode.region so
|
||||
# the canvas groups bubbles into per-region super-bubbles via
|
||||
# `contains` relationships.
|
||||
values:
|
||||
flowEmitter:
|
||||
enabled: true
|
||||
# In-cluster Service URL — the emitter DaemonSet lives in the same
|
||||
# k3s as the openova-flow-server Deployment, so the POST stays
|
||||
# cluster-local with no TLS dependency. The public HTTPRoute at
|
||||
# https://openova-flow.<fqdn> exists for the MOTHERSHIP
|
||||
# catalyst-api proxy (Agent #8 PR #1405) and any external consumer,
|
||||
# NOT for the in-cluster emitter. Using the public URL was a live
|
||||
# regression on prov #34, 2026-05-11: emitter posted to
|
||||
# https://openova-flow.omantel.biz, TLS handshake EOF'd because
|
||||
# bp-catalyst-platform InstallFailed → no wildcard *.<fqdn> cert
|
||||
# → no Gateway listener → emitter retry-loop → server stays empty
|
||||
# → canvas showed "No nodes to render".
|
||||
flowServerUrl: http://openova-flow-server.catalyst-system.svc.cluster.local
|
||||
flowId: ${SOVEREIGN_DEPLOYMENT_ID}
|
||||
regionKey: ${SOVEREIGN_REGION_KEY}
|
||||
namespaceFilter: flux-system
|
||||
@ -1,126 +0,0 @@
|
||||
# bp-mgmt-vcluster — Catalyst bootstrap-kit Blueprint slot 58.
|
||||
#
|
||||
# Primary-region MGMT vCluster. Hosts catalyst-api, catalyst-ui,
|
||||
# openova-flow-server, and other Sovereign control-plane workloads.
|
||||
#
|
||||
# Per docs/SOVEREIGN-MULTI-REGION-DOD.md A4 (vCluster topology):
|
||||
# primary region → MGMT + DMZ vCluster (this slot + slot 54)
|
||||
# secondary region → DMZ + RTZ vCluster (slot 54 + slot 59)
|
||||
#
|
||||
# Cross-vCluster intra-region traffic between MGMT and DMZ stays
|
||||
# inside the host k3s via Cilium endpoint identity routing. Inter-
|
||||
# region traffic goes over the DMZ WireGuard hop per DoD A2.
|
||||
#
|
||||
# Wrapper chart: platform/bp-mgmt-vcluster/chart/
|
||||
# Bundles loft-sh/vcluster 0.20.0 as a Helm subchart so
|
||||
# `helm dependency build` packages it into the OCI artifact.
|
||||
#
|
||||
# Reconciled by: Flux on the new Sovereign's k3s control plane.
|
||||
#
|
||||
# dependsOn:
|
||||
# - bp-cilium — CNI + Gateway API
|
||||
# - bp-cert-manager — TLS for ClusterIssuers (vCluster's exported
|
||||
# kubeconfig needs the cluster's CA chain)
|
||||
#
|
||||
# Per-role gating: this slot defaults to DISABLED (mgmtVcluster.
|
||||
# enabled=${MGMT_VCLUSTER_ENABLED:=false}). The Sovereign-provisioning
|
||||
# tofu module (infra/hetzner/main.tf primary-CP block) flips this to
|
||||
# "true" via postBuild.substitute on the PRIMARY region's CP only.
|
||||
# Secondary regions leave it unset → defaults to false → no resources
|
||||
# render.
|
||||
#
|
||||
# Until the tofu-substitute follow-up PR lands (which adds
|
||||
# MGMT_VCLUSTER_ENABLED to the substitute block per the v3.2 Gap A
|
||||
# refactor), operators can opt in per-Sovereign overlay by patching
|
||||
# `values.mgmtVcluster.enabled: true` on the primary cluster's
|
||||
# bootstrap-kit slot.
|
||||
|
||||
---
|
||||
apiVersion: source.toolkit.fluxcd.io/v1beta2
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: bp-mgmt-vcluster
|
||||
namespace: flux-system
|
||||
spec:
|
||||
type: oci
|
||||
interval: 15m
|
||||
url: oci://ghcr.io/openova-io
|
||||
secretRef:
|
||||
name: ghcr-pull
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-mgmt-vcluster
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "58"
|
||||
catalyst.openova.io/vcluster-role: mgmt
|
||||
catalyst.openova.io/topology: dod-a4
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: mgmt-vcluster
|
||||
# The chart's templates/namespace.yaml creates the host namespace,
|
||||
# so we point Flux to install INTO that namespace. The vCluster
|
||||
# subchart's StatefulSet + Service + RBAC land here.
|
||||
targetNamespace: mgmt
|
||||
dependsOn:
|
||||
- name: bp-cilium
|
||||
- name: bp-cert-manager
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-mgmt-vcluster
|
||||
version: 0.1.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-mgmt-vcluster
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
# Per-Sovereign overlay surface.
|
||||
#
|
||||
# mgmtVcluster.enabled — flipped on by the primary CP's tofu postBuild
|
||||
# substitute (MGMT_VCLUSTER_ENABLED). When the substitute is unset
|
||||
# (secondary regions, or pre-substitute-rollout primary), Flux's
|
||||
# postBuild substitution falls back to the `:=false` default embedded
|
||||
# in the placeholder below, so the value renders as `enabled: false`.
|
||||
# That matches the chart's own default in values.yaml, which is
|
||||
# already `enabled: false`, so an unset substitute and the chart
|
||||
# default agree and no resources render. To turn the slot on, either
|
||||
# set MGMT_VCLUSTER_ENABLED to true in the primary CP's tofu
|
||||
# substitute block, or patch `values.mgmtVcluster.enabled: true` via
|
||||
# a per-Sovereign overlay on the primary cluster's bootstrap-kit
|
||||
# slot.
|
||||
values:
|
||||
mgmtVcluster:
|
||||
# Flipped on for the primary region by tofu's primary postBuild
|
||||
# (mgmt_vcluster_enabled=true). Secondaries render this slot but
|
||||
# tofu sets the substitute false → chart renders zero resources.
|
||||
enabled: ${MGMT_VCLUSTER_ENABLED:=false}
|
||||
hostNamespace: mgmt
|
||||
vclusterName: mgmt
|
||||
role: primary
|
||||
nodeSelector:
|
||||
regionLabelKey: openova.io/region
|
||||
# Canonical region label (hz-<stem>-rtz-prod) — matches the
|
||||
# k3s node-label written at install time. See slot 54 + the
|
||||
# SOVEREIGN_REGION_CANONICAL_LABEL substitute in
|
||||
# infra/hetzner/cloudinit-control-plane.tftpl.
|
||||
regionLabelValue: ${SOVEREIGN_REGION_CANONICAL_LABEL}
|
||||
# Subchart values overlay — pinned to the per-region canonical label
|
||||
# so the upstream vcluster StatefulSet's nodeSelector binds the same
|
||||
# CP node as the umbrella's nodeSelector helper.
|
||||
vcluster:
|
||||
controlPlane:
|
||||
statefulSet:
|
||||
scheduling:
|
||||
nodeSelector:
|
||||
openova.io/region: ${SOVEREIGN_REGION_CANONICAL_LABEL}
|
||||
@ -1,97 +0,0 @@
|
||||
# bp-rtz-vcluster — Catalyst bootstrap-kit Blueprint slot 59.
|
||||
#
|
||||
# Per-secondary-region RTZ vCluster — installed ONLY on secondary
|
||||
# regions. Hosts regional tenant workloads + caches.
|
||||
#
|
||||
# Per docs/SOVEREIGN-MULTI-REGION-DOD.md A4 (vCluster topology):
|
||||
# primary region → MGMT + DMZ vCluster (slot 58 + slot 54)
|
||||
# secondary region → DMZ + RTZ vCluster (slot 54 + this slot)
|
||||
#
|
||||
# Cross-vCluster intra-region traffic between RTZ and DMZ stays
|
||||
# inside the host k3s via Cilium endpoint identity routing. Cross-
|
||||
# region traffic (RTZ secondary ↔ MGMT primary) goes through the DMZ
|
||||
# WireGuard hop per DoD A2.
|
||||
#
|
||||
# Wrapper chart: platform/bp-rtz-vcluster/chart/
|
||||
# Bundles loft-sh/vcluster 0.20.0 as a Helm subchart.
|
||||
#
|
||||
# Reconciled by: Flux on every secondary region's k3s control plane.
|
||||
#
|
||||
# dependsOn:
|
||||
# - bp-cilium — CNI + Gateway API
|
||||
# - bp-cert-manager — TLS for ClusterIssuers
|
||||
#
|
||||
# Per-role gating: defaults to DISABLED (rtzVcluster.enabled=false).
|
||||
# The Sovereign-provisioning tofu module (infra/hetzner/main.tf
|
||||
# secondary-CP block) flips this to "true" via postBuild.substitute
|
||||
# on secondary regions only — pending the follow-up substitute PR
|
||||
# (which adds RTZ_VCLUSTER_ENABLED to the substitute block). Until
|
||||
# that lands, operators opt in per-Sovereign overlay by patching
|
||||
# `values.rtzVcluster.enabled: true` on secondary clusters'
|
||||
# bootstrap-kit slot.
|
||||
|
||||
---
|
||||
apiVersion: source.toolkit.fluxcd.io/v1beta2
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: bp-rtz-vcluster
|
||||
namespace: flux-system
|
||||
spec:
|
||||
type: oci
|
||||
interval: 15m
|
||||
url: oci://ghcr.io/openova-io
|
||||
secretRef:
|
||||
name: ghcr-pull
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-rtz-vcluster
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "59"
|
||||
catalyst.openova.io/vcluster-role: rtz
|
||||
catalyst.openova.io/topology: dod-a4
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: rtz-vcluster
|
||||
targetNamespace: rtz
|
||||
dependsOn:
|
||||
- name: bp-cilium
|
||||
- name: bp-cert-manager
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-rtz-vcluster
|
||||
version: 0.1.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-rtz-vcluster
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
values:
|
||||
rtzVcluster:
|
||||
# Flipped on by tofu's secondary-CP postBuild (rtz_vcluster_enabled=true).
|
||||
# Primary renders this slot too but tofu sets the substitute false →
|
||||
# chart renders zero resources on the primary.
|
||||
enabled: ${RTZ_VCLUSTER_ENABLED:=false}
|
||||
hostNamespace: rtz
|
||||
vclusterName: rtz
|
||||
role: secondary
|
||||
nodeSelector:
|
||||
regionLabelKey: openova.io/region
|
||||
regionLabelValue: ${SOVEREIGN_REGION_CANONICAL_LABEL}
|
||||
vcluster:
|
||||
controlPlane:
|
||||
statefulSet:
|
||||
scheduling:
|
||||
nodeSelector:
|
||||
openova.io/region: ${SOVEREIGN_REGION_CANONICAL_LABEL}
|
||||
@ -1,109 +0,0 @@
|
||||
# bp-vcluster-helmrepo — Catalyst bootstrap-kit Blueprint slot 60.
|
||||
#
|
||||
# Pre-stages the upstream loft-sh vcluster Helm chart source on the
|
||||
# Sovereign cluster so the Organization controller
|
||||
# (core/controllers/organization) can render per-tenant
|
||||
# `helm.toolkit.fluxcd.io/v2 HelmRelease` CRs whose `sourceRef` points
|
||||
# at `name=loft, namespace=vcluster-system` (the controller's defaults
|
||||
# at core/controllers/organization/cmd/main.go).
|
||||
#
|
||||
# Without this slot, every per-tenant vcluster HelmRelease the
|
||||
# Organization controller writes into the per-Org Gitea repo fails
|
||||
# Source reconcile with:
|
||||
#
|
||||
# HelmRepository.source.toolkit.fluxcd.io "loft" not found
|
||||
#
|
||||
# → no per-tenant vCluster is ever spawned → the Organization
|
||||
# controller's reconciliation loop blocks on tenant onboarding.
|
||||
# Convergence blocker #2 (vCluster source install on Sovereign).
|
||||
#
|
||||
# Wrapper chart: platform/bp-vcluster-helmrepo/chart/
|
||||
# Pure source-registration chart — registers a HelmRepository CR +
|
||||
# the vcluster-system namespace it lives in. Ships NO upstream
|
||||
# subchart (same shape as bp-gateway-api). The upstream chart is
|
||||
# pulled per-tenant by Flux at HelmRelease reconcile time, NOT
|
||||
# bundled into this slot's OCI artifact.
|
||||
#
|
||||
# Reconciled by: Flux on the new Sovereign's k3s control plane.
|
||||
#
|
||||
# Slot 60 chosen as the first free slot after the existing vCluster
|
||||
# cohort (54/58/59 — DMZ/MGMT/RTZ Sovereign-tier vClusters). This
|
||||
# slot is the per-TENANT vCluster source registration (a different
|
||||
# layer): the Sovereign-tier slots embed loft-sh/vcluster 0.20.0 as
|
||||
# a Helm subchart so they ship a single OCI artifact; this slot
|
||||
# registers a live `source.toolkit.fluxcd.io/HelmRepository` CR so
|
||||
# the Organization controller's per-tenant rendered HelmReleases
|
||||
# can resolve `chart.spec.sourceRef name=loft namespace=vcluster-
|
||||
# system` at reconcile time. The two paths are independent — this
|
||||
# slot does NOT depend on slots 54/58/59 (and vice versa).
|
||||
#
|
||||
# dependsOn:
|
||||
# - bp-flux — Flux's source-controller must be Ready so the
|
||||
# HelmRepository CR is actually reconciled (otherwise
|
||||
# the CR sits without artifacts and downstream Flux
|
||||
# HelmReleases that reference it can't resolve).
|
||||
|
||||
---
|
||||
apiVersion: source.toolkit.fluxcd.io/v1beta2
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: bp-vcluster-helmrepo
|
||||
namespace: flux-system
|
||||
spec:
|
||||
type: oci
|
||||
interval: 15m
|
||||
url: oci://ghcr.io/openova-io
|
||||
secretRef:
|
||||
name: ghcr-pull
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-vcluster-helmrepo
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "60"
|
||||
catalyst.openova.io/tenant-spawn: vcluster
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: vcluster-helmrepo
|
||||
# The release marker Secret lives next to every other bootstrap-kit
|
||||
# release. The chart's templates/namespace.yaml creates the actual
|
||||
# vcluster-system namespace (cluster-scoped Namespace resource).
|
||||
targetNamespace: flux-system
|
||||
dependsOn:
|
||||
- name: bp-flux
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-vcluster-helmrepo
|
||||
version: 0.1.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-vcluster-helmrepo
|
||||
namespace: flux-system
|
||||
install:
|
||||
timeout: 5m
|
||||
disableWait: false
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 5m
|
||||
disableWait: false
|
||||
remediation:
|
||||
retries: 3
|
||||
# Per-Sovereign overlay surface — operators MAY swap the upstream URL
|
||||
# for a Harbor proxy cache (MIRROR-EVERYTHING per
|
||||
# docs/INVIOLABLE-PRINCIPLES.md #4a) or rename the CR / namespace
|
||||
# to align with a custom Organization-controller config.
|
||||
#
|
||||
# Defaults match the controller's hardcoded defaults at
|
||||
# core/controllers/organization/cmd/main.go:
|
||||
# CATALYST_VCLUSTER_HELMREPO_NAME = "loft"
|
||||
# CATALYST_VCLUSTER_HELMREPO_NAMESPACE = "vcluster-system"
|
||||
values:
|
||||
vclusterHelmRepo:
|
||||
name: loft
|
||||
namespace: vcluster-system
|
||||
url: https://charts.loft.sh
|
||||
interval: 15m
|
||||
createNamespace: true
|
||||
@ -40,8 +40,6 @@ kind: HelmRelease
|
||||
metadata:
|
||||
name: bp-newapi
|
||||
namespace: flux-system
|
||||
labels:
|
||||
catalyst.openova.io/slot: "80"
|
||||
spec:
|
||||
interval: 15m
|
||||
releaseName: newapi
|
||||
@ -86,64 +84,7 @@ spec:
|
||||
# of the PRIVATE newapi-mirror + metering-sidecar images. Paired
|
||||
# with cloud-init adding `newapi` to flux-system/ghcr-pull's
|
||||
# reflector auto-namespaces list.
|
||||
# 1.4.2 (qa-loop bounded-cycle audit prov #7 Gap F, 2026-05-10):
|
||||
# `.Values.newapi.image.tag` repointed from `v0.4.5` (fictitious —
|
||||
# never built by any CI workflow) to `v0.13.2` (actual upstream
|
||||
# Calcium-Ion/new-api Docker Hub release, mirrored into
|
||||
# ghcr.io/openova-io/openova/newapi-mirror by the new
|
||||
# `.github/workflows/build-bp-newapi.yaml` workflow). Pre-1.4.2
|
||||
# the NewAPI Pod ImagePullBackOff'd 403 on every fresh Sovereign,
|
||||
# blocking alice signup gate 5 (LLM).
|
||||
# 1.4.4 (qa-loop bounded-cycle audit prov #20 Fix #138, 2026-05-11):
|
||||
# add pre-install/pre-upgrade hook that polls the external-secrets
|
||||
# validating-admission webhook until it returns a structured HTTP
|
||||
# response — closes the race between bp-external-secrets reaching
|
||||
# HR Ready=True and the apiserver-side EndpointSlice for the
|
||||
# webhook Service being observable. Pre-1.4.4 the chart's
|
||||
# ExternalSecret apply was rejected with `no endpoints available
|
||||
# for service "external-secrets-webhook"` on every fresh provision,
|
||||
# blocking the chart from reaching Ready and the Catalyst signup
|
||||
# hook (ADR-0003 §3.2) from finding the admin-token Secret.
|
||||
# 1.4.10 (fix-convergence-wave11, 2026-05-18): gate the
|
||||
# defaultChannels.qwenBankDhofar entry on attestation-complete
|
||||
# rather than hard-failing the helm template. Pre-1.4.10 the
|
||||
# chart raised `commercial-contract attestation requires accountId`
|
||||
# on every Sovereign that opted in to marketplace
|
||||
# (MARKETPLACE_ENABLED=true) without ALSO supplying a signed
|
||||
# commercial contract's `LLM_BANK_DHOFAR_ACCOUNT_ID` /
|
||||
# `LLM_BANK_DHOFAR_CONTRACT_REF` envsubst variables. Post-1.4.10
|
||||
# the chart silently skips the qwenBankDhofar channel when
|
||||
# attestation is incomplete; once the operator overlay supplies
|
||||
# the attestation values the channel composes on the next
|
||||
# reconcile.
|
||||
# 1.4.12 (PR #1677, 2026-05-18): default
|
||||
# `.Values.sandboxTokenSigningKey.reflectorNamespaces` flipped
|
||||
# from `"sandbox"` → `"catalyst-system,sandbox"`. Pre-1.4.12 the
|
||||
# chart-emitted `newapi-bp-newapi-token-signing-key` Secret was
|
||||
# mirrored only into a `sandbox` namespace (which does NOT exist
|
||||
# on a stock Sovereign — bp-sandbox installs into
|
||||
# `catalyst-system` per slot 19a `targetNamespace`); the sandbox-
|
||||
# controller's `NEWAPI_ADMIN_SECRET` env var (secretKeyRef
|
||||
# `optional: true`) landed EMPTY, the controller silently dropped
|
||||
# into gitops-only mode, and zero per-Sandbox LLM-gateway tokens
|
||||
# were ever minted (operator-visible only via the controller's
|
||||
# `newapi_admin_secret_set=false` startup log). Caught on t22
|
||||
# 2026-05-18 (TBD-D14). Bumping the pin pulls the post-#1677
|
||||
# default so reflector mirrors into `catalyst-system` too.
|
||||
# 1.4.14 (current main, 2026-05-18): latest upstream-tracking
|
||||
# chart cut — includes 1.4.12's reflector fix.
|
||||
# 1.4.19 (TBD-A12 #1798, 2026-05-18): add startupProbe so kubelet
|
||||
# does NOT SIGKILL the binary at the 50s mark while GORM
|
||||
# AutoMigrate is still in-flight on the freshly-provisioned empty
|
||||
# `newapi` CNPG database. Pre-1.4.19 the empty DB on t22 sat with
|
||||
# ZERO tables after 29 CrashLoopBackOff restarts — every kill
|
||||
# raced AutoMigrate's first CREATE TABLE call mid-TLS-handshake;
|
||||
# pg_stat_activity on the CNPG primary showed no `newapi` user
|
||||
# connections because the kill happened before the GORM
|
||||
# connection pool's first wire write completed. Probe budget:
|
||||
# 30 × 10s = 5 min, comfortably above the observed 60-120s
|
||||
# ceiling on cpx21/cpx31 nodes with sslmode=require.
|
||||
version: 1.4.20
|
||||
version: 1.4.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-newapi
|
||||
@ -153,12 +94,10 @@ spec:
|
||||
# ~10 s once the Postgres DSN Secret is present; the long pole is
|
||||
# waiting for the operator's Crossplane claim to materialise the DB.
|
||||
install:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
upgrade:
|
||||
timeout: 15m
|
||||
disableWait: true
|
||||
remediation:
|
||||
retries: 3
|
||||
@ -189,20 +128,6 @@ spec:
|
||||
tls:
|
||||
enabled: true
|
||||
issuer: letsencrypt-prod
|
||||
# Cilium Gateway HTTPRoute for `newapi.<fqdn>` (TBD-D35d, issue
|
||||
# #1778). Sandbox runtimes hit the LLM gateway at the URL the
|
||||
# sandbox controller mints into their environment
|
||||
# (`NEWAPI_BASE_URL=https://newapi.${SOVEREIGN_FQDN}/v1`). Without
|
||||
# this HTTPRoute the marketplace `tenant-wildcard` (hostnames=
|
||||
# `*.${SOVEREIGN_FQDN}`) absorbs every newapi.${SOVEREIGN_FQDN}
|
||||
# request and forwards to the storefront `console` Service —
|
||||
# blocking the entire BYOS Claude Code journey at the LLM gate.
|
||||
# An exact-hostname HTTPRoute outranks the wildcard per Gateway
|
||||
# API spec, so enabling this on every Sovereign restores LLM
|
||||
# reachability without touching the marketplace wildcard.
|
||||
httpRoute:
|
||||
enabled: true
|
||||
host: newapi.${SOVEREIGN_FQDN}
|
||||
auth:
|
||||
adminUI:
|
||||
mode: keycloak
|
||||
@ -262,27 +187,11 @@ spec:
|
||||
# operators that run their own bp-vllm + open-weight model in-
|
||||
# cluster; it composes after `qwenBankDhofar` and any operator
|
||||
# `.Values.channels`.
|
||||
# Sandbox Wave 4 (2026-05-18, retry of sandbox-wave4-newapi-sovereign-install):
|
||||
# qwenBankDhofar is now gated on `${MARKETPLACE_ENABLED:-false}` — the
|
||||
# same envsubst variable bp-catalyst-platform (slot 13) reads to flip
|
||||
# marketplace.enabled on the Catalyst control plane. This lets a
|
||||
# franchised Sovereign with `MARKETPLACE_ENABLED=true` auto-seed the
|
||||
# default Bank Dhofar Qwen3.6 channel without the operator having to
|
||||
# supply per-Sovereign overlay values. The endpoint defaults to the
|
||||
# canonical first-otech relay; `LLM_BANK_DHOFAR_BASE_URL` overrides
|
||||
# it (e.g. for staging at https://omtd.bankdhofar.com). The upstream
|
||||
# API key MUST be present in the Secret `newapi-channel-qwen-bankdhofar`
|
||||
# under key `API_KEY` — either pre-seeded by cloud-init or pulled from
|
||||
# OpenBao via the operator's ExternalSecret at path
|
||||
# `sovereign/<fqdn>/newapi/channel-qwen-bankdhofar`. Sandbox agents
|
||||
# (sandbox-wave4) depend on this channel being live on every Sovereign
|
||||
# that opted in to marketplace; without it the agents fall back to
|
||||
# mothership newapi, defeating the per-Sovereign sandboxing.
|
||||
defaultChannels:
|
||||
qwenBankDhofar:
|
||||
enabled: ${MARKETPLACE_ENABLED:-false}
|
||||
enabled: false
|
||||
name: qwen3.6-bankdhofar
|
||||
endpoint: ${LLM_BANK_DHOFAR_BASE_URL:-https://llm-api.omtd.bankdhofar.com}
|
||||
endpoint: ""
|
||||
models:
|
||||
- qwen3.6
|
||||
- qwen3-coder
|
||||
@ -290,8 +199,8 @@ spec:
|
||||
existingSecretKey: API_KEY
|
||||
attestation:
|
||||
kind: commercial-contract
|
||||
accountId: ${LLM_BANK_DHOFAR_ACCOUNT_ID:-}
|
||||
contractRef: ${LLM_BANK_DHOFAR_CONTRACT_REF:-}
|
||||
accountId: ""
|
||||
contractRef: ""
|
||||
vllm:
|
||||
enabled: false
|
||||
name: qwen
|
||||
|
||||
@ -24,48 +24,8 @@ resources:
|
||||
- 15a-external-secrets-stores.yaml
|
||||
- 16-cnpg.yaml
|
||||
- 17-valkey.yaml
|
||||
# bp-hcloud-csi (formerly slot 17a) REMOVED 2026-05-17 (Wave 7):
|
||||
# the Flux source-controller chart pull went through harbor.t11.* OCI
|
||||
# endpoint BEFORE harbor itself was reachable (chicken-and-egg —
|
||||
# harbor depends on Gateway, Gateway lives in sovereign-tls which
|
||||
# dependsOn bootstrap-kit Ready, which never went Ready because
|
||||
# bp-hcloud-csi was stuck on harbor pull). Caught live on t11 fresh
|
||||
# prov 2026-05-17: bootstrap-kit Reconciliation-in-progress for 30+
|
||||
# min → sovereign-tls "not ready: dependency bootstrap-kit not ready"
|
||||
# → no Gateway CR → console.t11.<sov> ERR_CONNECTION_CLOSED →
|
||||
# entire UI test matrix BLOCKED. C9-006 (hcloud-volumes default SC)
|
||||
# is a cosmetic operator-facing nice-to-have; Gateway availability
|
||||
# is launch-critical. Removing this slot unblocks the chain. Follow-
|
||||
# up PR will re-add at a later slot (e.g., 19a, AFTER bp-harbor 19)
|
||||
# OR fix the pull path to bypass the registry pivot during bootstrap.
|
||||
- 18-seaweedfs.yaml
|
||||
- 19-harbor.yaml
|
||||
# bp-sandbox (slot 19a) — sandbox-controller Wave 1 (PR #1622) + Wave 8
|
||||
# pty-server / MCP / NEWAPI runtime wiring. Reconciles
|
||||
# `sandbox.openova.io/v1.Sandbox` CRs into per-Sandbox manifests
|
||||
# written into the per-Org `catalyst-tenant` Gitea repo.
|
||||
#
|
||||
# Wave 11 convergence fix (2026-05-18, caught on t16.omantel.biz):
|
||||
# originally slot 61 — moved here after bp-harbor (slot 19) because the
|
||||
# post-handover cutover (slot 06a, Step-06 phase-1) rewrites every
|
||||
# HelmRepository URL `oci://ghcr.io/openova-io` →
|
||||
# `oci://harbor.<sov-fqdn>/openova-io`, and the bp-sandbox chart pull
|
||||
# then hits harbor.<sov-fqdn> BEFORE bp-harbor is Ready — chicken-and-
|
||||
# egg. Same failure shape as Wave 7 #1610 with bp-hcloud-csi (REMOVED,
|
||||
# see the slot-17a comment block above) but resolved here by
|
||||
# sequencing rather than removal so the slot remains available for
|
||||
# the Wave 11 Sandbox MVP without manual Day-2 add-app re-introduction.
|
||||
# HR's dependsOn pins ordering to AFTER bp-harbor + bp-vcluster-
|
||||
# helmrepo + bp-catalyst-platform. Wave 11 convergence fix (TBD-D11,
|
||||
# 2026-05-18): now gated default-ON via ${SANDBOX_ENABLED:-true} on
|
||||
# the bootstrap-kit Kustomization substitute so the controller
|
||||
# materialises on every fresh prov (Wave 8 pty-server + MCP images
|
||||
# are SHA-stamped in chart values.yaml). Operators may opt-OUT via
|
||||
# SANDBOX_ENABLED=false on the per-Sovereign overlay's substitute
|
||||
# map. The chart's own values.enabled default remains false (defence
|
||||
# in depth — a stale per-Sovereign overlay that hand-installs the
|
||||
# HR without our envsubst layer still default-OFFs gracefully).
|
||||
- 19a-bp-sandbox.yaml
|
||||
# 06a — Post-handover Self-Sovereignty Cutover (issue #791). Filename
|
||||
# carries the 06a prefix to colocate cohorts visually, but the slot's
|
||||
# dependsOn pins actual install order to AFTER bp-gitea (slot 10) and
|
||||
@ -99,63 +59,6 @@ resources:
|
||||
# installs proxy → gateway.
|
||||
- 51-bp-k8s-ws-proxy.yaml
|
||||
- 52-bp-guacamole.yaml
|
||||
# qa-loop iter-12 Fix #53C — EPIC-5 leftovers (NetBird zero-trust mesh
|
||||
# + DMZ vCluster isolation). Slots 53/54. Both default-OFF; flip on
|
||||
# via NETBIRD_ENABLED=true / DMZ_VCLUSTER_ENABLED=true on the
|
||||
# bootstrap-kit Kustomization substitute.
|
||||
#
|
||||
# Slot 54 (bp-dmz-vcluster) implements docs/SOVEREIGN-MULTI-REGION-
|
||||
# DOD.md A4 ("each region runs a DMZ vCluster") + A2 ("inter-region
|
||||
# link = DMZ WireGuard over PUBLIC IPs"). Default-ON because the DMZ
|
||||
# vCluster is the public-fronted vCluster AND the inter-region WG
|
||||
# hop — every region needs it for the topology to converge.
|
||||
- 54-bp-dmz-vcluster.yaml
|
||||
# qa-loop iter-12 Fix #54 Workstream 1 — bp-hcloud-ccm (slot 55).
|
||||
# Hetzner Cloud Controller Manager. The CCM owns node providerID
|
||||
# flips (k3s://… → hcloud://<server-id>) AND materialisation of
|
||||
# Service-of-type-LoadBalancer as Hetzner Cloud LBs. Without this,
|
||||
# every LB-typed Service stays Pending — the proximate reason
|
||||
# clustermesh-apiserver could not migrate from NodePort to LB on
|
||||
# omantel multi-region (qa-loop iter-12 Fix #53D).
|
||||
- 55-bp-hcloud-ccm.yaml
|
||||
# OpenovaFlow observability cohort — slots 56/57. Three-agent split
|
||||
# (Agent #1: TS @openova/flow-core + @openova/flow-canvas, Agent #2:
|
||||
# Go server + flux adapter, Agent #3: bootstrap-kit + catalyst-api
|
||||
# proxy integration). Slot 56 (server) installs on PRIMARY clusters
|
||||
# only; per-Sovereign overlay disables on secondaries. Slot 57
|
||||
# (emitter) is a DaemonSet — runs on every cluster (mother + every
|
||||
# Sovereign + every secondary region) so each region's Flux events
|
||||
# land in the same per-deployment flow.
|
||||
- 56-bp-openova-flow-server.yaml
|
||||
- 57-bp-openova-flow-emitter.yaml
|
||||
# DoD A4 vCluster topology (2026-05-16) — slots 58 + 59 finish the
|
||||
# primary-mgmt + secondary-rtz pair that goes alongside the slot 54
|
||||
# DMZ vCluster (every region). Combined topology per region:
|
||||
# primary region → MGMT (58) + DMZ (54) vCluster
|
||||
# secondary region → DMZ (54) + RTZ (59) vCluster
|
||||
# Slot 58 default-OFF until the per-CP postBuild substitute follow-up
|
||||
# PR adds MGMT_VCLUSTER_ENABLED only on primary. Slot 59 same shape
|
||||
# for secondaries via RTZ_VCLUSTER_ENABLED. See each slot's header
|
||||
# comment for the migration plan.
|
||||
- 58-bp-mgmt-vcluster.yaml
|
||||
- 59-bp-rtz-vcluster.yaml
|
||||
# bp-vcluster-helmrepo (slot 60) — pre-stages the upstream loft-sh
|
||||
# vcluster Helm chart source so the Organization controller
|
||||
# (core/controllers/organization) can render per-tenant
|
||||
# `helm.toolkit.fluxcd.io/v2 HelmRelease` CRs whose `chart.spec.
|
||||
# sourceRef` points at `name=loft, namespace=vcluster-system`.
|
||||
# Convergence blocker #2 (vCluster source install on Sovereign).
|
||||
# Different layer from slots 54/58/59 (those bundle loft-sh/vcluster
|
||||
# 0.20.0 as a subchart for the Sovereign-tier DMZ/MGMT/RTZ vClusters;
|
||||
# this slot registers a live Flux source so per-TENANT vClusters can
|
||||
# be spawned by the Organization controller at runtime). Default-ON.
|
||||
- 60-bp-vcluster-helmrepo.yaml
|
||||
# bp-sandbox MOVED 2026-05-18 (Wave 11 convergence fix) from slot 61
|
||||
# to slot 19a (above, after bp-harbor) to break a chicken-and-egg
|
||||
# cycle with harbor.<sov-fqdn> chart pulls during bootstrap. See the
|
||||
# slot-19a comment block + 19a-bp-sandbox.yaml header for full
|
||||
# diagnostic chain. No functional difference for operators — the
|
||||
# SANDBOX_ENABLED knob still gates rendering identically.
|
||||
# bp-newapi (slot 80) — multi-tenant LLM marketplace gateway. Sequenced
|
||||
# after the W2.K1 dependency wave (cnpg/keycloak/openbao Ready) so
|
||||
# NewAPI's ExternalSecret + DSN dependencies resolve on first reconcile.
|
||||
|
||||
@ -1,283 +0,0 @@
|
||||
# cilium-envoy SDS hot-reload trigger.
|
||||
#
|
||||
# Root cause (qa-loop bounded-cycle Provision #7, ad2532a8):
|
||||
#
|
||||
# cilium-envoy DaemonSet starts as part of the bp-cilium HelmRelease,
|
||||
# which lands ~20 min before the wildcard cert backing the
|
||||
# `sovereign-wildcard-tls` Secret is issued by cert-manager (the cert
|
||||
# resource itself is in this same Kustomization, but the DNS-01 challenge
|
||||
# round-trip against the central PowerDNS adds 60-90s on top of the
|
||||
# Kustomization apply). Envoy's xDS subscription for the SDS Secret
|
||||
# observed the Secret was missing at startup, hit its initial-fetch
|
||||
# timeout, and marked the Gateway listener unready. cilium-envoy does
|
||||
# NOT re-subscribe after the Secret materialises — once the SDS bind
|
||||
# is in `error` state the listener stays down until the envoy process
|
||||
# restarts. Symptom: `console.<sov>` returns curl rc=000 (TLS handshake
|
||||
# failure: "no listener on this port") indefinitely after a fresh
|
||||
# provision, even though every HelmRelease reaches Ready.
|
||||
#
|
||||
# Affected: every fresh Sovereign provision will hit this until the
|
||||
# upstream cilium-envoy ships SDS hot-reload. Each one previously
|
||||
# required a manual `kubectl rollout restart ds/cilium-envoy` for the
|
||||
# console to come up — exactly the kind of out-of-band step that
|
||||
# violates the zero-touch-provision rule (memory entry
|
||||
# feedback_zero_touch_provision_no_questions.md).
|
||||
#
|
||||
# Fix path B from the brief: ship a Job in this same Kustomization that
|
||||
# (a) waits for the `sovereign-wildcard-tls` Secret to materialise with
|
||||
# a non-empty `tls.crt` field, then (b) bumps the cilium-envoy DaemonSet
|
||||
# pod template via `kubectl rollout restart`. The fresh envoy pods
|
||||
# subscribe to the now-existing Secret and the listener comes up cleanly
|
||||
# within ~30s of the cert appearing.
|
||||
#
|
||||
# Why this Kustomization (and not the bp-cilium chart):
|
||||
# - The cert lifecycle and the restart trigger live together. When the
|
||||
# cert resource is removed (multi-zone migration via #831), removing
|
||||
# this Job is a single-PR delete.
|
||||
# - bp-cilium chart has wide blast radius; bumping its OCI artifact for
|
||||
# a Sovereign-bootstrap-only behaviour would force every cluster
|
||||
# (including SME, contabo) to take the change.
|
||||
# - The SDS Secret name + namespace are the contract between this
|
||||
# Kustomization (cert producer) and cilium-envoy (cert consumer);
|
||||
# keeping the restart trigger here keeps both ends of the contract
|
||||
# in one file tree.
|
||||
#
|
||||
# Re-fire semantics:
|
||||
# - The Job has a deterministic name (no generateName) so re-applying
|
||||
# this Kustomization with the same revision is a no-op once the Job
|
||||
# reached Complete.
|
||||
# - On a chart-bump or re-provision (different revision), Kustomize
|
||||
# re-creates the Job with the new generation; it runs again, re-
|
||||
# bumps the DaemonSet, and exits — idempotent.
|
||||
# - ttlSecondsAfterFinished cleans up the Pod/Job after 1h so kubectl
|
||||
# output stays clean.
|
||||
#
|
||||
# Idempotency at the envoy side:
|
||||
# - `kubectl rollout restart` patches a `kubectl.kubernetes.io/restartedAt`
|
||||
# annotation on the pod template. If the cert was already present at
|
||||
# envoy startup (steady-state cluster), the restart still happens but
|
||||
# incurs only a ~10s data-plane blip; the bootstrap window is short
|
||||
# enough that this is the expected hot-path on every fresh provision
|
||||
# and a no-op-cost on every subsequent revision.
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: cilium-envoy-tls-restart
|
||||
namespace: kube-system
|
||||
labels:
|
||||
catalyst.openova.io/sovereign: ${SOVEREIGN_FQDN}
|
||||
catalyst.openova.io/component: cilium-envoy-tls-restart
|
||||
---
|
||||
# Namespaced Role — only needs to (a) read the Secret to detect
|
||||
# materialisation and (b) patch the DaemonSet pod template to trigger
|
||||
# rollout. Both verbs scoped to kube-system, both resources scoped to
|
||||
# the exact resource names so this SA cannot be repurposed to mutate
|
||||
# other workloads.
|
||||
#
|
||||
# Note: `patch` (not `update`) on daemonsets is sufficient for
|
||||
# `kubectl rollout restart` because that command issues a strategic-
|
||||
# merge patch on `.spec.template.metadata.annotations`. Verified
|
||||
# against kubectl 1.31.4 source (pkg/cmd/rollout/rollout_restart.go).
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: cilium-envoy-tls-restart
|
||||
namespace: kube-system
|
||||
labels:
|
||||
catalyst.openova.io/sovereign: ${SOVEREIGN_FQDN}
|
||||
catalyst.openova.io/component: cilium-envoy-tls-restart
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
# 2026-05-17 t143 dual-cert collision cleanup: the per-zone Secret
|
||||
# the Cilium Gateway now references is named
|
||||
# `sovereign-wildcard-tls-${SOVEREIGN_FQDN_DASHED}`
|
||||
# (see clusters/_template/sovereign-tls/cilium-gateway.yaml:44 +
|
||||
# clusters/_template/sovereign-tls/cilium-gateway-cert.yaml). The
|
||||
# legacy `sovereign-wildcard-tls` (no dashed suffix) is no longer
|
||||
# produced anywhere — drop it from the resourceNames allowlist so
|
||||
# this Role grants the minimum needed for the live Secret name.
|
||||
resourceNames: ["sovereign-wildcard-tls-${SOVEREIGN_FQDN_DASHED}"]
|
||||
verbs: ["get", "watch", "list"]
|
||||
- apiGroups: ["apps"]
|
||||
resources: ["daemonsets"]
|
||||
resourceNames: ["cilium-envoy"]
|
||||
verbs: ["get", "patch", "list", "watch"]
|
||||
# ALSO patch the cilium-operator Deployment. Reason: on a fresh
|
||||
# Sovereign, cilium-operator's first CEC reconciliation produces a
|
||||
# CiliumEnvoyConfig WITHOUT the hostNetwork bind `additionalAddresses.
|
||||
# socketAddress: 0.0.0.0:30443` — even though `gateway-api-hostnetwork-
|
||||
# enabled=true` and `gateway-api-hostnetwork-nodelabelselector=kubernetes.io/os=linux`
|
||||
# are correctly set in cilium-config. After an operator pod-restart
|
||||
# the next CEC reconcile DOES populate the bind, and cilium-envoy
|
||||
# binds host:30443 cleanly. Without this restart, Hetzner LB targets
|
||||
# stay `unhealthy` on 30080/30443 forever and console.<sov-fqdn>
|
||||
# never serves. Caught live on prov 492c81e2 (omantel.biz, 2026-05-15)
|
||||
# plus every prior multi-region prov where operators were doing the
|
||||
# restart manually. This rule lets the Job fix it without operator
|
||||
# intervention.
|
||||
- apiGroups: ["apps"]
|
||||
resources: ["deployments"]
|
||||
resourceNames: ["cilium-operator"]
|
||||
verbs: ["get", "patch", "list", "watch"]
|
||||
# Read rollout status so the Job can wait for new pods to come up
|
||||
# before exiting. `kubectl rollout status` does NOT just GET — it
|
||||
# uses client-go informerwatcher to LIST+WATCH the
|
||||
# Deployment/DaemonSet resource. Without list+watch verbs the
|
||||
# informer fails with "forbidden: cannot list resource ..." and the
|
||||
# Job stalls at the rollout-status check until activeDeadlineSeconds.
|
||||
# Caught on prov t110.omani.works (fe09897a1b6b3c1d, 2026-05-15):
|
||||
# tls-restart Job stuck Running 10m+ on the cilium-operator rollout
|
||||
# check, never restarted cilium-envoy, console.<fqdn> never served.
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: cilium-envoy-tls-restart
|
||||
namespace: kube-system
|
||||
labels:
|
||||
catalyst.openova.io/sovereign: ${SOVEREIGN_FQDN}
|
||||
catalyst.openova.io/component: cilium-envoy-tls-restart
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: cilium-envoy-tls-restart
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: cilium-envoy-tls-restart
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: cilium-envoy-tls-restart
|
||||
namespace: kube-system
|
||||
labels:
|
||||
catalyst.openova.io/sovereign: ${SOVEREIGN_FQDN}
|
||||
catalyst.openova.io/component: cilium-envoy-tls-restart
|
||||
spec:
|
||||
# backoffLimit 6 with the default exponential backoff (10s..6m) gives ≈10 min of total
|
||||
# retry headroom. The cert provisioning DNS-01 round-trip is the only
|
||||
# thing this Job waits on, and that completes ≤90s in steady-state. A
|
||||
# higher limit absorbs the rare PowerDNS propagation flake.
|
||||
backoffLimit: 6
|
||||
# Clean up Job + Pod 1h after success so kubectl get jobs stays sane.
|
||||
ttlSecondsAfterFinished: 3600
|
||||
# 15 min hard cap. If the cert hasn't arrived in 15 min, something is
|
||||
# broken upstream (cert-manager, PowerDNS webhook, ACME) and a Job
|
||||
# restart loop won't fix it; surface the failure to Flux so the
|
||||
# operator sees Kustomization NotReady.
|
||||
activeDeadlineSeconds: 900
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: cilium-envoy-tls-restart
|
||||
catalyst.openova.io/sovereign: ${SOVEREIGN_FQDN}
|
||||
catalyst.openova.io/component: cilium-envoy-tls-restart
|
||||
spec:
|
||||
serviceAccountName: cilium-envoy-tls-restart
|
||||
restartPolicy: OnFailure
|
||||
# alpine/k8s:1.31.4 — canonical kubectl image used across the
|
||||
# Catalyst fleet (self-sovereign-cutover, seaweedfs, harbor have
|
||||
# all converged on this after Bitnami deprecated public Docker
|
||||
# Hub in 2025). Ships kubectl + sh + standard busybox toolchain.
|
||||
containers:
|
||||
- name: wait-and-restart
|
||||
image: alpine/k8s:1.31.4
|
||||
imagePullPolicy: IfNotPresent
|
||||
resources:
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 32Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 128Mi
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65532
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
# NOTE: Flux postBuild.substitute processes ${...} in this
|
||||
# YAML BEFORE it lands as a Job. Bash variable references
|
||||
# below MUST be escaped as $${...} so Flux emits a literal
|
||||
# ${...} that bash then evaluates at Job runtime. Without
|
||||
# the escape, Flux replaces $${SECRET_NS} (etc.) with an
|
||||
# empty string because those names aren't in
|
||||
# substituteFrom, and the Job ends up running
|
||||
# `kubectl get secret -n "" ""` forever (caught live on
|
||||
# prov c9df5eed1c1ba6cf, t101.omani.works, 2026-05-15).
|
||||
- |
|
||||
set -eu
|
||||
|
||||
SECRET_NS=kube-system
|
||||
# 2026-05-17 t143 dual-cert collision cleanup: the canonical
|
||||
# SDS Secret the Cilium Gateway now references is the
|
||||
# per-zone `sovereign-wildcard-tls-${SOVEREIGN_FQDN_DASHED}`.
|
||||
# Cloud-init substitutes SOVEREIGN_FQDN_DASHED via Flux
|
||||
# postBuild.substitute, so the literal cluster value lands
|
||||
# here at apply time (verified in
|
||||
# infra/hetzner/cloudinit-control-plane.tftpl §SOVEREIGN_FQDN_DASHED).
|
||||
SECRET_NAME=sovereign-wildcard-tls-${SOVEREIGN_FQDN_DASHED}
|
||||
DS_NS=kube-system
|
||||
DS_NAME=cilium-envoy
|
||||
|
||||
echo "[tls-restart] waiting for $${SECRET_NS}/$${SECRET_NAME} with non-empty tls.crt"
|
||||
# Poll up to ~14 min (84 * 10s); activeDeadlineSeconds=900
|
||||
# is the outer hard limit. We treat "not yet" and "empty
|
||||
# tls.crt" the same — both mean the cert hasn't been issued.
|
||||
for i in $(seq 1 84); do
|
||||
# `--ignore-not-found` so the early polls (Secret not
|
||||
# created yet) don't error — kubectl returns empty
|
||||
# output and the for-loop continues.
|
||||
tls_crt=$(kubectl get secret -n "$${SECRET_NS}" "$${SECRET_NAME}" \
|
||||
--ignore-not-found \
|
||||
-o jsonpath='{.data.tls\.crt}' 2>/dev/null || true)
|
||||
if [ -n "$${tls_crt}" ]; then
|
||||
echo "[tls-restart] $${SECRET_NAME} present with non-empty tls.crt (attempt $${i})"
|
||||
break
|
||||
fi
|
||||
if [ "$${i}" = "84" ]; then
|
||||
echo "[tls-restart] FATAL: $${SECRET_NAME} did not become non-empty after 14m" >&2
|
||||
exit 1
|
||||
fi
|
||||
sleep 10
|
||||
done
|
||||
|
||||
# Step 1 — restart cilium-operator FIRST so it regenerates the
|
||||
# CiliumEnvoyConfig with the hostNetwork bind
|
||||
# `additionalAddresses.socketAddress: 0.0.0.0:30443`. On a
|
||||
# fresh Sovereign, the FIRST CEC reconcile (driven by
|
||||
# Gateway create) misses this bind even though the
|
||||
# configmap keys gateway-api-hostnetwork-{enabled,
|
||||
# nodelabelselector} are correct. An operator pod-restart
|
||||
# forces a fresh CEC render that includes the bind. Without
|
||||
# this, cilium-envoy is restarted in step 2 but binds only
|
||||
# 127.0.0.1:* control sockets — host:30443 stays empty,
|
||||
# Hetzner LB targets stay unhealthy, console returns 000.
|
||||
echo "[tls-restart] bumping deploy/cilium-operator to regenerate CEC with hostNetwork bind"
|
||||
kubectl rollout restart -n "$${DS_NS}" deploy/cilium-operator
|
||||
echo "[tls-restart] waiting for deploy/cilium-operator rollout"
|
||||
kubectl rollout status -n "$${DS_NS}" deploy/cilium-operator --timeout=3m
|
||||
|
||||
# Step 2 — restart cilium-envoy so it picks up (a) the
|
||||
# freshly-regenerated CEC with the hostNetwork bind, AND
|
||||
# (b) the now-existing sovereign-wildcard-tls SDS Secret.
|
||||
echo "[tls-restart] bumping ds/$${DS_NAME} in $${DS_NS} to force envoy SDS re-subscribe + CEC reload"
|
||||
kubectl rollout restart -n "$${DS_NS}" "ds/$${DS_NAME}"
|
||||
|
||||
# Block until the new pods are Ready. `kubectl rollout
|
||||
# status` exits 0 on full rollout, non-zero on timeout.
|
||||
# Bound to 5 min — a single-node k3s rolls 1 pod, a 3-node
|
||||
# HA cluster rolls 3 in parallel; both finish ≤90s in
|
||||
# practice.
|
||||
echo "[tls-restart] waiting for ds/$${DS_NAME} rollout"
|
||||
kubectl rollout status -n "$${DS_NS}" "ds/$${DS_NAME}" --timeout=5m
|
||||
|
||||
echo "[tls-restart] complete — cilium-operator regenerated hostNetwork CEC + cilium-envoy serves SDS Secret $${SECRET_NAME}"
|
||||
@ -1,87 +1,68 @@
|
||||
# Per-name TLS Certificates for the Cilium Gateway listeners.
|
||||
# Wildcard TLS Certificate for the Cilium Gateway listener.
|
||||
#
|
||||
# Architecture change (2026-05-15): switched from ONE wildcard cert
|
||||
# `*.<sovereignFQDN>` to N per-name certs (console, auth, gitea, harbor,
|
||||
# registry, api, bao, grafana, hubble, openova-flow, pdns, marketplace).
|
||||
# Split from clusters/_template/bootstrap-kit/01-cilium.yaml in
|
||||
# fix/cilium-cert-split-from-bootstrap-kit (Phase-8a bug #13). The
|
||||
# Cert lives in its OWN Flux Kustomization (`sovereign-tls`) which
|
||||
# depends on bootstrap-kit being Ready — i.e. cert-manager + the
|
||||
# powerdns-webhook are both installed and their CRDs registered.
|
||||
#
|
||||
# Why: Let's Encrypt enforces "5 New Certificates per Exact Set of
|
||||
# Identifiers per 168h". The wildcard pattern bundled ALL hostnames
|
||||
# under ONE identifier set `[*.<fqdn>, <fqdn>]` — every prov-cycle
|
||||
# burned 1 of 5 slots from that single bucket. Five iterations on the
|
||||
# same FQDN locked the apex for a week. Hit live three times:
|
||||
# - omantel.biz exhausted 2026-05-13 (12 reprovs)
|
||||
# - omani.works exhausted 2026-05-14 (5 reprovs in 90 min)
|
||||
# - omantel.biz exhausted again 2026-05-15 (this PR's origin)
|
||||
# Without this split, Flux's server-side dry-run on the bootstrap-kit
|
||||
# Kustomization fails with `no matches for kind "Certificate" in
|
||||
# version "cert-manager.io/v1"` because the validation runs BEFORE any
|
||||
# HelmRelease has installed the cert-manager CRDs — and a single
|
||||
# dry-run failure aborts the entire Kustomization apply, leaving the
|
||||
# Sovereign with zero HRs reconciled.
|
||||
#
|
||||
# Per-name model gives each hostname its OWN 5/168h bucket:
|
||||
# - console.<fqdn> → 5 reprovs/week
|
||||
# - auth.<fqdn> → 5 reprovs/week
|
||||
# - gitea.<fqdn> → 5 reprovs/week
|
||||
# ... × 12 hostnames = 60 effective reprov-slots/week
|
||||
# The Gateway resource stays in 01-cilium.yaml: Gateway.networking.k8s.io
|
||||
# CRDs ship with Cilium itself (gatewayAPI.enabled=true) and dry-run
|
||||
# against them only requires the Gateway API CRD bundle which Cilium
|
||||
# pre-installs at chart-time. The Certificate is the ONLY resource
|
||||
# whose CRD is provided by a HelmRelease in the same Kustomization
|
||||
# that needs to validate it.
|
||||
#
|
||||
# 2026-05-17 t143 dual-cert collision cleanup
|
||||
# -------------------------------------------
|
||||
# Previously this Certificate was named `sovereign-wildcard-tls` and
|
||||
# wrote a Secret of the same name. After PR O (2026-05-17) moved the
|
||||
# Cilium Gateway listener's certificateRefs to the per-zone Secret
|
||||
# `sovereign-wildcard-tls-${SOVEREIGN_FQDN_DASHED}` (see
|
||||
# clusters/_template/sovereign-tls/cilium-gateway.yaml:44), the legacy
|
||||
# Secret stopped being referenced by anything — but the Certificate
|
||||
# kept renewing, burning LE budget for no production value and showing
|
||||
# up in audits as an orphan TLS Secret on every Sovereign.
|
||||
# Issuer: `letsencrypt-dns01-prod-powerdns` is shipped by
|
||||
# bp-cert-manager-powerdns-webhook (bootstrap-kit slot 49). It writes
|
||||
# the ACME challenge TXT record to contabo's central PowerDNS at
|
||||
# https://pdns.openova.io (authoritative for omani.works) so Let's
|
||||
# Encrypt validation succeeds even before the Sovereign's own NS
|
||||
# delegation has propagated. Replaces the previous letsencrypt-dns01-prod
|
||||
# (dynadot-webhook-backed) — Dynadot is not the API-level authority for
|
||||
# omani.works subdomains. Caught live on otech43–46.
|
||||
#
|
||||
# Single-source-of-truth fix: this Certificate now writes to the SAME
|
||||
# dashed-suffix Secret the Gateway already references. One Cert, one
|
||||
# Secret, one LE issuance per renewal. No more dual-cert collision
|
||||
# and no extra LE budget consumed.
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
# Multi-zone Sovereign (issue #827, parent epic #825) coexistence note
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
# bp-catalyst-platform 1.4.0+ ships templates/sovereign-wildcard-certs.yaml
|
||||
# which renders one Certificate PER ENTRY in `.Values.parentZones`, each
|
||||
# named `sovereign-wildcard-tls-<sanitised-zone>` (e.g.
|
||||
# `sovereign-wildcard-tls-omani-trade`). Those resource names are DISTINCT
|
||||
# from this file's `sovereign-wildcard-tls` so the two paths never collide:
|
||||
# - Single-zone Sovereigns (parentZones empty) — this file owns the only
|
||||
# wildcard cert.
|
||||
# - Multi-zone Sovereigns (parentZones populated) — this file STILL owns
|
||||
# `sovereign-wildcard-tls` (covering the operator's primary parent
|
||||
# zone) AND the chart adds N additional zone-specific certs. The
|
||||
# Cilium Gateway listener is updated in the per-cluster overlay to
|
||||
# reference the appropriate Secret per zone listener.
|
||||
#
|
||||
# This pattern is the standard production approach (see Cloudflare,
|
||||
# Vercel, Render). Wildcards are reserved for the limited cases where
|
||||
# customer-provided subdomains aren't known in advance.
|
||||
# Once issue #831 lands a multi-listener Gateway template in
|
||||
# bp-catalyst-platform itself, this file becomes redundant and is
|
||||
# deletable.
|
||||
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
# Match the Secret name the Gateway listener references
|
||||
# (clusters/_template/sovereign-tls/cilium-gateway.yaml:44). Cloud-init
|
||||
# substitutes SOVEREIGN_FQDN_DASHED = SOVEREIGN_FQDN with `.` → `-`
|
||||
# (infra/hetzner/cloudinit-control-plane.tftpl §SOVEREIGN_FQDN_DASHED).
|
||||
name: sovereign-wildcard-tls-${SOVEREIGN_FQDN_DASHED}
|
||||
name: sovereign-wildcard-tls
|
||||
namespace: kube-system
|
||||
labels:
|
||||
catalyst.openova.io/sovereign: ${SOVEREIGN_FQDN}
|
||||
catalyst.openova.io/component: cilium-gateway
|
||||
spec:
|
||||
secretName: sovereign-wildcard-tls-${SOVEREIGN_FQDN_DASHED}
|
||||
secretName: sovereign-wildcard-tls
|
||||
issuerRef:
|
||||
name: ${WILDCARD_CERT_ISSUER}
|
||||
name: letsencrypt-dns01-prod-powerdns
|
||||
kind: ClusterIssuer
|
||||
commonName: "console.${SOVEREIGN_FQDN}"
|
||||
# SubjectAltNames — explicit list of Sovereign-facing surfaces. New
|
||||
# services added to bootstrap-kit MUST be added here so the cert
|
||||
# covers them at issuance time. Order is preserved in the cert; for
|
||||
# cosmetic reasons the operator-facing names come first.
|
||||
commonName: "*.${SOVEREIGN_FQDN}"
|
||||
dnsNames:
|
||||
- "console.${SOVEREIGN_FQDN}"
|
||||
- "auth.${SOVEREIGN_FQDN}"
|
||||
- "gitea.${SOVEREIGN_FQDN}"
|
||||
- "registry.${SOVEREIGN_FQDN}"
|
||||
- "api.${SOVEREIGN_FQDN}"
|
||||
- "bao.${SOVEREIGN_FQDN}"
|
||||
- "grafana.${SOVEREIGN_FQDN}"
|
||||
- "hubble.${SOVEREIGN_FQDN}"
|
||||
- "pdns.${SOVEREIGN_FQDN}"
|
||||
- "openova-flow.${SOVEREIGN_FQDN}"
|
||||
- "guacamole.${SOVEREIGN_FQDN}"
|
||||
- "marketplace.${SOVEREIGN_FQDN}"
|
||||
# sandbox.<sov-fqdn> — public URL for the per-Sandbox pty-server (PR #1641).
|
||||
# The sandbox-controller renders an HTTPRoute on sandbox.<sov-fqdn>/
|
||||
# sessions/<owner-uid>/* attached to the cilium-gateway. The wildcard
|
||||
# `*.<sov-fqdn>` Gateway listener already matches the hostname, but the
|
||||
# per-name SAN cert here must include `sandbox.<sov-fqdn>` for
|
||||
# cilium-envoy to serve the right cert (otherwise browsers see
|
||||
# NET::ERR_CERT_COMMON_NAME_INVALID). Matches the entry in
|
||||
# products/catalyst/bootstrap/api/internal/handler/sovereign_dns_records.go
|
||||
# CanonicalSovereignSubdomains so the parent-zone A-record set + cert
|
||||
# SAN list stay aligned.
|
||||
- "sandbox.${SOVEREIGN_FQDN}"
|
||||
- "*.${SOVEREIGN_FQDN}"
|
||||
- "${SOVEREIGN_FQDN}"
|
||||
|
||||
@ -4,79 +4,6 @@
|
||||
# whole Kustomization before applying any HR, so Gateway dry-run fails on
|
||||
# a fresh cluster. The sovereign-tls Kustomization dependsOn bootstrap-kit
|
||||
# Ready, so by the time Gateway is applied here, Cilium has installed.
|
||||
#
|
||||
# Multi-zone listeners (issue #831, parent epic #827)
|
||||
# ---------------------------------------------------
|
||||
# Before this change the Gateway declared a single listener pair (HTTPS +
|
||||
# HTTP) on `*.${SOVEREIGN_FQDN}`. That worked for legacy single-zone
|
||||
# Sovereigns but BROKE every tenant URL under a non-primary parent zone:
|
||||
# - Primary zone: omani.works → console.omani.works ✅ TLS terminates
|
||||
# - SME pool: omani.homes → wp-foo.omani.homes ❌ TLS handshake
|
||||
# mismatch (cert
|
||||
# exists per chart's
|
||||
# sovereign-wildcard-
|
||||
# certs.yaml but no
|
||||
# Gateway listener
|
||||
# claims the
|
||||
# hostname).
|
||||
# Symptom: cilium-envoy serves the default fallback cert, browser shows
|
||||
# NET::ERR_CERT_COMMON_NAME_INVALID, marketplace WordPress tenants on
|
||||
# omani.homes are unreachable.
|
||||
#
|
||||
# Fix: render one listener pair per parent zone. The listener block is
|
||||
# materialised at Terraform plan time (infra/hetzner/main.tf
|
||||
# locals.parent_domains_listeners_yaml — jsonencode of the listener
|
||||
# objects), threaded through Flux postBuild.substitute as
|
||||
# ${PARENT_DOMAINS_LISTENERS_YAML}, and consumed BELOW as a YAML inline-
|
||||
# flow array value on `spec.listeners`. Each pair's certificateRefs
|
||||
# target the per-zone Secret rendered by products/catalyst/chart/
|
||||
# templates/sovereign-wildcard-certs.yaml (PR #827) so the Gateway
|
||||
# listener and the cert resource are always in lockstep.
|
||||
#
|
||||
# Why a scalar placeholder, not a multi-line block:
|
||||
# - kustomize-build PARSES the YAML before Flux runs envsubst. A
|
||||
# placeholder on its own line at column 0 is rejected by the YAML
|
||||
# parser ("could not find expected ':'"), and kustomize fails. A
|
||||
# scalar like `listeners: ${VAR}` parses cleanly — kustomize sees
|
||||
# the value as the literal string `${VAR}` and emits it unchanged.
|
||||
# Flux's envsubst step then swaps it for the JSON-flow array string
|
||||
# `[{name: https-omani-works, ...}, ...]`, which the apiserver
|
||||
# parses as the real listener list.
|
||||
#
|
||||
# Why not a Helm template here: the Cilium Gateway resource lives in the
|
||||
# Kustomize-managed sovereign-tls path (not the chart) because its
|
||||
# Kustomization dependsOn bootstrap-kit Ready — i.e. it lands BEFORE
|
||||
# bp-catalyst-platform reconciles. Moving it into the chart would invert
|
||||
# the ordering and produce a transient "no Gateway → no envoy listener →
|
||||
# console unreachable" gap during every Helm upgrade. envsubst-driven
|
||||
# pre-rendered YAML is the canonical pattern for this slot.
|
||||
#
|
||||
# Listener naming convention (t20 critical fix #3):
|
||||
# - SINGLE parent zone (the common case) → bare names `https` /
|
||||
# `http`. Every platform chart's HTTPRoute (harbor, keycloak,
|
||||
# grafana, gitea, openbao, powerdns, stalwart-tenant) hardcodes
|
||||
# `parentRefs[0].sectionName: https`. If we rename the listener to
|
||||
# `https-<sanitised-zone>` for a single-zone Sovereign, every
|
||||
# HTTPRoute reports `Accepted=False NoMatchingListener` and the
|
||||
# Sovereign Console / Harbor / Keycloak etc. are unreachable at
|
||||
# the Gateway. Keeping bare names for the single-zone case is the
|
||||
# safer rollback. (Was broken between PR #1640 and the t20 fix.)
|
||||
# - MULTIPLE parent zones → unique names `https-<sanitised-zone>` /
|
||||
# `http-<sanitised-zone>` where sanitised-zone = zone name with
|
||||
# '.' → '-' (e.g. omani-works, omani-homes). Distinct names per
|
||||
# listener so the Gateway controller programs them all (duplicate
|
||||
# `name: https` produces a Conflicting status condition and skips
|
||||
# all but the first). For multi-zone Sovereigns whose HTTPRoutes
|
||||
# must attach under a non-primary zone, override `sectionName` via
|
||||
# values.yaml at the chart level.
|
||||
# - The certificateRefs.name is ALWAYS the per-zone
|
||||
# `sovereign-wildcard-tls-<sanitised-zone>` (see
|
||||
# products/catalyst/chart/templates/sovereign-wildcard-certs.yaml)
|
||||
# — independent of the listener-name choice above.
|
||||
#
|
||||
# The listener block is rendered by infra/hetzner/main.tf locals.
|
||||
# parent_domains_listeners_yaml using local.parent_domains_single_zone
|
||||
# to switch between the two naming schemes.
|
||||
|
||||
apiVersion: gateway.networking.k8s.io/v1
|
||||
kind: Gateway
|
||||
@ -105,4 +32,23 @@ spec:
|
||||
#
|
||||
# See infra/hetzner/main.tf hcloud_load_balancer_service.{http,https}
|
||||
# destination_port settings — they MUST match these listener ports.
|
||||
listeners: ${PARENT_DOMAINS_LISTENERS_YAML}
|
||||
listeners:
|
||||
- name: https
|
||||
port: 30443
|
||||
protocol: HTTPS
|
||||
hostname: "*.${SOVEREIGN_FQDN}"
|
||||
tls:
|
||||
mode: Terminate
|
||||
certificateRefs:
|
||||
- kind: Secret
|
||||
name: sovereign-wildcard-tls
|
||||
allowedRoutes:
|
||||
namespaces:
|
||||
from: All
|
||||
- name: http
|
||||
port: 30080
|
||||
protocol: HTTP
|
||||
hostname: "*.${SOVEREIGN_FQDN}"
|
||||
allowedRoutes:
|
||||
namespaces:
|
||||
from: All
|
||||
|
||||
@ -3,18 +3,3 @@ kind: Kustomization
|
||||
resources:
|
||||
- cilium-gateway-cert.yaml
|
||||
- cilium-gateway.yaml
|
||||
# Watch+rollout-restart Job for cilium-envoy. cilium-envoy's xDS SDS
|
||||
# subscription does NOT recover after the initial-fetch timeout, so a
|
||||
# fresh Sovereign whose envoy started before the wildcard cert was
|
||||
# issued serves no listener forever. This Job waits for the Secret
|
||||
# then bumps the DaemonSet, restoring the listener within ≤90s of
|
||||
# the cert appearing. See file header for full root cause + design
|
||||
# rationale (qa-loop bounded-cycle Provision #7).
|
||||
- cilium-envoy-tls-restart-job.yaml
|
||||
# C7-007 (2026-05-17 t143) — one-shot cleanup of the pre-PR-O legacy
|
||||
# `sovereign-wildcard-tls` Certificate + Secret pair. Idempotent
|
||||
# (`--ignore-not-found`), runs once per Flux reconciliation
|
||||
# generation. Fresh Sovereigns succeed as a no-op; pre-PR-O
|
||||
# Sovereigns delete the orphan resources. Removable from the list
|
||||
# once every live prov has reconciled past it.
|
||||
- legacy-cert-cleanup-job.yaml
|
||||
|
||||
@ -1,151 +0,0 @@
|
||||
# C7-007 (2026-05-17 t143) — one-shot cleanup Job for the legacy
|
||||
# `sovereign-wildcard-tls` Certificate + Secret pair.
|
||||
#
|
||||
# Background
|
||||
# ----------
|
||||
# Pre-PR-O Sovereigns rendered a Certificate named `sovereign-wildcard-tls`
|
||||
# (with a Secret of the same name) AND, after PR O moved the Cilium
|
||||
# Gateway listener to the per-zone `sovereign-wildcard-tls-${SOVEREIGN_FQDN_DASHED}`
|
||||
# Secret, the legacy Certificate kept renewing on cert-manager's
|
||||
# default schedule. Result: every audit on a pre-PR-O Sovereign showed
|
||||
# an orphan TLS Secret in kube-system, cert-manager wasted LE budget
|
||||
# renewing a Secret nothing consumed, and operators had to remember to
|
||||
# `kubectl delete` it after every Flux reconciliation re-asserted the
|
||||
# legacy resource (which it no longer does — PR O's `cilium-gateway-cert.yaml`
|
||||
# now produces ONLY the dashed-suffix shape).
|
||||
#
|
||||
# What this Job does
|
||||
# ------------------
|
||||
# Idempotent delete of:
|
||||
# 1. `kube-system/sovereign-wildcard-tls` Certificate (cert-manager.io/v1)
|
||||
# 2. `kube-system/sovereign-wildcard-tls` Secret (kubernetes.io/tls)
|
||||
#
|
||||
# Each delete is `--ignore-not-found` so a fresh Sovereign that never
|
||||
# carried the legacy shape reports "no-op" and Succeeds. The Job runs
|
||||
# ONCE per Flux reconciliation generation (the helm.sh/hook
|
||||
# annotations on the bp-self-sovereign-cutover chart aren't applicable
|
||||
# here because this lives in the per-Sovereign overlay, not a Helm
|
||||
# chart — Flux's Kustomization re-applies idempotently).
|
||||
#
|
||||
# Image
|
||||
# -----
|
||||
# Uses the canonical OpenOva-mirrored alpine/k8s image (mothership
|
||||
# Harbor proxy-cache for Docker Hub, per CLAUDE.md mirror rule).
|
||||
# Bitnami/kubectl was deprecated 2025-08; alpine/k8s is the standard
|
||||
# replacement (see platform/self-sovereign-cutover/chart/values.yaml:252
|
||||
# for the canonical reasoning, captured live on otech103 2026-05-04).
|
||||
#
|
||||
# Why a Job and not a Helm hook
|
||||
# -----------------------------
|
||||
# This file lives in `clusters/_template/sovereign-tls/` — a per-Sovereign
|
||||
# Kustomize overlay reconciled by Flux, NOT a Helm chart. Helm hooks
|
||||
# only run as part of a Helm release; this is a single one-shot K8s Job.
|
||||
# Flux's Kustomization reconciliation drives idempotent re-apply.
|
||||
#
|
||||
# Removal plan
|
||||
# ------------
|
||||
# Once every live Sovereign has reconciled past this Job (verified via
|
||||
# `kubectl get jobs -n kube-system | grep legacy-cert-cleanup` showing
|
||||
# Complete on every prov), this file may be deleted from
|
||||
# clusters/_template/sovereign-tls/kustomization.yaml.
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: legacy-cert-cleanup
|
||||
namespace: kube-system
|
||||
labels:
|
||||
catalyst.openova.io/component: legacy-cert-cleanup
|
||||
catalyst.openova.io/sovereign: ${SOVEREIGN_FQDN}
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: legacy-cert-cleanup
|
||||
namespace: kube-system
|
||||
labels:
|
||||
catalyst.openova.io/component: legacy-cert-cleanup
|
||||
rules:
|
||||
# Legacy Secret to delete. Only the specific name — RBAC stays
|
||||
# least-privilege.
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
resourceNames: ["sovereign-wildcard-tls"]
|
||||
verbs: ["get", "delete"]
|
||||
# cert-manager Certificate to delete. Only the specific name.
|
||||
- apiGroups: ["cert-manager.io"]
|
||||
resources: ["certificates"]
|
||||
resourceNames: ["sovereign-wildcard-tls"]
|
||||
verbs: ["get", "delete"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: legacy-cert-cleanup
|
||||
namespace: kube-system
|
||||
labels:
|
||||
catalyst.openova.io/component: legacy-cert-cleanup
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: legacy-cert-cleanup
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: legacy-cert-cleanup
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: legacy-cert-cleanup
|
||||
namespace: kube-system
|
||||
labels:
|
||||
catalyst.openova.io/component: legacy-cert-cleanup
|
||||
catalyst.openova.io/sovereign: ${SOVEREIGN_FQDN}
|
||||
spec:
|
||||
# Keep the Job around 5 minutes after completion so an operator can
|
||||
# `kubectl logs job/legacy-cert-cleanup -n kube-system` to confirm
|
||||
# what was (or wasn't) cleaned up. After TTL the GC reclaims.
|
||||
ttlSecondsAfterFinished: 300
|
||||
backoffLimit: 2
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
catalyst.openova.io/component: legacy-cert-cleanup
|
||||
spec:
|
||||
serviceAccountName: legacy-cert-cleanup
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: cleanup
|
||||
# Pinned via Harbor proxy-cache. See CLAUDE.md mirror-everything
|
||||
# rule + values.yaml:252 in self-sovereign-cutover for the
|
||||
# Bitnami→alpine/k8s decision history.
|
||||
image: harbor.openova.io/proxy-dockerhub/alpine/k8s:1.31.1
|
||||
imagePullPolicy: IfNotPresent
|
||||
command: ["/bin/sh", "-c"]
|
||||
args:
|
||||
- |
|
||||
set -eu
|
||||
echo "[legacy-cert-cleanup] starting on ${SOVEREIGN_FQDN}"
|
||||
# The dashed-suffix Secret (the live one PR O introduced)
|
||||
# MUST remain — only delete the bare-name legacy pair.
|
||||
echo "[legacy-cert-cleanup] removing legacy Certificate sovereign-wildcard-tls"
|
||||
kubectl -n kube-system delete certificate.cert-manager.io sovereign-wildcard-tls --ignore-not-found=true --wait=false
|
||||
echo "[legacy-cert-cleanup] removing legacy Secret sovereign-wildcard-tls"
|
||||
kubectl -n kube-system delete secret sovereign-wildcard-tls --ignore-not-found=true --wait=false
|
||||
echo "[legacy-cert-cleanup] complete"
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65532
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
resources:
|
||||
requests:
|
||||
cpu: "10m"
|
||||
memory: "32Mi"
|
||||
limits:
|
||||
cpu: "100m"
|
||||
memory: "64Mi"
|
||||
@ -36,12 +36,11 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-cilium
|
||||
# 1.3.1 (qa-loop iter-12 Fix #54 Workstream 2): bpf.preallocateMaps=true
|
||||
# + socketLB.hostNamespaceOnly=true defaults so fresh worker pods on
|
||||
# catalyst-omantel-biz-w2/w3 can resolve DNS on first-join.
|
||||
# 1.3.0 (qa-loop iter-12 Fix #53C): Hubble UI HTTPRoute overlay +
|
||||
# Cilium ClusterMesh LoadBalancer-typed Service shape.
|
||||
version: 1.3.2
|
||||
# 1.3.0 (qa-loop iter-12 Fix #53C): Hubble UI HTTPRoute overlay
|
||||
# (slice H7 #1095) that the catalystOverlay.hubbleUI block depends
|
||||
# on; plus the Cilium ClusterMesh values shape (LoadBalancer-typed Service
|
||||
# for cross-region peering per Fix #53D).
|
||||
version: 1.3.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-cilium
|
||||
|
||||
@ -38,7 +38,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-cert-manager
|
||||
version: 1.2.0
|
||||
version: 1.1.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-cert-manager
|
||||
|
||||
@ -38,7 +38,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-keycloak
|
||||
version: 1.4.1
|
||||
version: 1.3.0
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-keycloak
|
||||
@ -58,17 +58,10 @@ spec:
|
||||
timeout: 15m
|
||||
remediation:
|
||||
retries: 3
|
||||
# Per-Sovereign overrides — issue #387 + #604 + qa-loop iter-12:
|
||||
# Per-Sovereign overrides — issue #387 + #604:
|
||||
# Wire the per-Sovereign hostname into the HTTPRoute template and
|
||||
# sovereign realm ConfigMap (catalyst-ui redirect URIs).
|
||||
# sovereignRealm.name: per `feedback_no_mvp_no_workarounds.md` target-state
|
||||
# rule, each Sovereign owns its KC realm named after the tenant short-name.
|
||||
# Matrix tests (TC-124, TC-125, TC-159, TC-160, TC-161, TC-176, TC-190,
|
||||
# TC-285) assert paths like `/admin/realms/omantel/...`.
|
||||
values:
|
||||
sovereignFQDN: omantel.omani.works
|
||||
sovereignRealm:
|
||||
name: omantel
|
||||
displayName: "Omantel Sovereign"
|
||||
gateway:
|
||||
host: auth.omantel.omani.works
|
||||
|
||||
@ -55,9 +55,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-seaweedfs
|
||||
# 1.2.0 — qa-loop Wave 5 Fix #79 Gap B: chart-rendered
|
||||
# `seaweedfs-storage` StorageClass.
|
||||
version: 1.2.0
|
||||
version: 1.1.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-seaweedfs
|
||||
|
||||
@ -38,23 +38,7 @@ spec:
|
||||
# 0.1.5: 0.1.4 (default imagePullSecrets) + CI auto-bumped
|
||||
# image.tag. imagePullSecrets default required for omantel pods
|
||||
# to pull from private GHCR.
|
||||
# 0.1.6 (qa-loop bounded-cycle Wave 5 Fix #78, Gap E): adds
|
||||
# pre-install hook-weight -10 Job that auto-generates the
|
||||
# `k8s-ws-proxy-hmac` Secret from /dev/urandom. Without this,
|
||||
# every fresh Sovereign provision left k8s-ws-proxy pods stuck
|
||||
# ContainerCreating forever — the chart referenced a Secret
|
||||
# that nothing ever created. Idempotent on upgrade (preserves
|
||||
# the existing key — rotating it would invalidate every
|
||||
# in-flight catalyst-api signature).
|
||||
# 0.1.9 (qa-loop bounded-cycle Fix #95, regression of Fix #78):
|
||||
# explicit hook-weight ordering for the hmac-bootstrap quartet
|
||||
# (SA=-20, Role+RoleBinding=-15, Job=-10) so the SA lands BEFORE
|
||||
# the Job that references it. Before this fix, prov #8 failed with
|
||||
# `serviceaccount "k8s-ws-proxy-hmac-bootstrap" not found`
|
||||
# because the Job (weight -10, lower=earlier in Helm) was
|
||||
# applied before its SA (weight 0). CI promote auto-bumps from
|
||||
# Chart.yaml 0.1.8 to 0.1.9 with the new image SHA on merge.
|
||||
version: 0.1.9
|
||||
version: 0.1.5
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-k8s-ws-proxy
|
||||
|
||||
@ -51,14 +51,3 @@ resources:
|
||||
# via the catalystOverlay block in 01-cilium.yaml (no separate slot).
|
||||
- 53-bp-netbird.yaml
|
||||
- 54-bp-dmz-vcluster.yaml
|
||||
# qa-loop iter-12 Fix #54 Workstream 1 — bp-hcloud-ccm (slot 55).
|
||||
# The chroot omantel reconciles from clusters/_template/bootstrap-kit/
|
||||
# which carries the actual 55-bp-hcloud-ccm.yaml file. The line below
|
||||
# is intentionally omitted from this per-Sovereign overlay until Phase-2
|
||||
# cutover separates the chroot reconcile from the per-Sovereign one
|
||||
# (per the same pattern as 26-langfuse.yaml — present in this overlay
|
||||
# but NOT in _template, vs slot 55 — present in _template only). When
|
||||
# the per-Sovereign overlay becomes the canonical reconcile target,
|
||||
# copy clusters/_template/bootstrap-kit/55-bp-hcloud-ccm.yaml here AND
|
||||
# uncomment the resource list entry below.
|
||||
# - 55-bp-hcloud-ccm.yaml
|
||||
|
||||
@ -38,7 +38,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-cert-manager
|
||||
version: 1.2.0
|
||||
version: 1.1.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-cert-manager
|
||||
|
||||
@ -55,9 +55,7 @@ spec:
|
||||
chart:
|
||||
spec:
|
||||
chart: bp-seaweedfs
|
||||
# 1.2.0 — qa-loop Wave 5 Fix #79 Gap B: chart-rendered
|
||||
# `seaweedfs-storage` StorageClass.
|
||||
version: 1.2.0
|
||||
version: 1.1.1
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-seaweedfs
|
||||
|
||||
@ -7,8 +7,6 @@ go 1.23
|
||||
|
||||
require (
|
||||
github.com/go-logr/logr v1.4.2
|
||||
github.com/nats-io/nats.go v1.37.0
|
||||
github.com/prometheus/client_golang v1.19.1
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1
|
||||
github.com/stretchr/testify v1.9.0
|
||||
k8s.io/api v0.31.1
|
||||
@ -45,10 +43,12 @@ require (
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/nats-io/nats.go v1.37.0 // indirect
|
||||
github.com/nats-io/nkeys v0.4.7 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/prometheus/client_golang v1.19.1 // indirect
|
||||
github.com/prometheus/client_model v0.6.1 // indirect
|
||||
github.com/prometheus/common v0.55.0 // indirect
|
||||
github.com/prometheus/procfs v0.15.1 // indirect
|
||||
|
||||
@ -11,7 +11,6 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
@ -23,13 +22,11 @@ import (
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/healthz"
|
||||
"sigs.k8s.io/controller-runtime/pkg/log/zap"
|
||||
"sigs.k8s.io/controller-runtime/pkg/manager"
|
||||
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
|
||||
|
||||
"github.com/openova-io/openova/core/controllers/pkg/gitea"
|
||||
"github.com/openova-io/openova/core/controllers/organization/internal/controller"
|
||||
orgapi "github.com/openova-io/openova/core/controllers/organization/internal/orgapi"
|
||||
"github.com/openova-io/openova/core/controllers/pkg/gitea"
|
||||
"github.com/openova-io/openova/core/controllers/pkg/natsbus"
|
||||
)
|
||||
|
||||
var scheme = runtime.NewScheme()
|
||||
@ -120,57 +117,6 @@ func main() {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// D35 consume-leg — subscribe to the two canonical Catalyst NATS
|
||||
// subjects so a `tenant.created` / `order.placed` envelope nudges
|
||||
// the matching Organization CR into a fresh Reconcile within ~50ms
|
||||
// of the publish. Best-effort wiring: when NATS_URL is unset (e.g.
|
||||
// Catalyst-Zero contabo path where NATS is not deployed) we log
|
||||
// "NATS not wired" and continue — the existing 30s informer
|
||||
// requeue fallback inside r.Reconcile keeps the controller correct.
|
||||
natsURL := strings.TrimSpace(os.Getenv("NATS_URL"))
|
||||
if natsURL != "" {
|
||||
if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
|
||||
sub, err := natsbus.Connect(natsURL)
|
||||
if err != nil {
|
||||
log.Error(err, "natsbus: connect failed — D35 consume-leg disabled",
|
||||
"nats_url", natsURL)
|
||||
return nil // non-fatal — informer requeue is the canonical fallback
|
||||
}
|
||||
bridge := &controller.NATSBridge{
|
||||
Client: mgr.GetClient(),
|
||||
Log: log.WithName("natsbridge"),
|
||||
}
|
||||
if err := sub.Subscribe(ctx,
|
||||
natsbus.SubjectTenantCreated,
|
||||
"organization-controller-tenant-created",
|
||||
bridge.HandleTenantCreated,
|
||||
natsbus.SubscribeOptions{},
|
||||
); err != nil {
|
||||
log.Error(err, "natsbus: subscribe tenant.created failed")
|
||||
}
|
||||
if err := sub.Subscribe(ctx,
|
||||
natsbus.SubjectOrderPlaced,
|
||||
"organization-controller-order-placed",
|
||||
bridge.HandleOrderPlaced,
|
||||
natsbus.SubscribeOptions{},
|
||||
); err != nil {
|
||||
log.Error(err, "natsbus: subscribe order.placed failed")
|
||||
}
|
||||
<-ctx.Done()
|
||||
sub.Close()
|
||||
return nil
|
||||
})); err != nil {
|
||||
log.Error(err, "natsbus: add runnable failed")
|
||||
os.Exit(1)
|
||||
}
|
||||
log.Info("natsbus: D35 consume-leg wired",
|
||||
"nats_url", natsURL,
|
||||
"subjects", []string{natsbus.SubjectTenantCreated, natsbus.SubjectOrderPlaced},
|
||||
)
|
||||
} else {
|
||||
log.Info("natsbus: NATS_URL unset — D35 consume-leg disabled (informer-requeue fallback only)")
|
||||
}
|
||||
|
||||
log.Info("starting manager",
|
||||
"host_cluster", hostCluster,
|
||||
"keycloak_addr", kcAddr,
|
||||
|
||||
@ -1,202 +0,0 @@
|
||||
// nats_bridge wires the canonical Catalyst NATS subjects (D35 consume
|
||||
// leg) into the organization-controller's reconcile loop.
|
||||
//
|
||||
// PR #1626 closed the publish-side of D35 — tenant + billing services
|
||||
// now emit `catalyst.tenant.created` + `catalyst.billing.order.placed`
|
||||
// on the NATS JetStream `CATALYST_SME` stream per ADR-0001 §6. The
|
||||
// consume-side was missing: no in-cluster controller subscribed, so the
|
||||
// envelopes accumulated on the broker and the only path that
|
||||
// reconciled an Organization CR was the 30s informer requeue plus
|
||||
// whatever wrote the CR in the first place. D35 (gate: "NATS broker
|
||||
// round-trips end-to-end") therefore stayed yellow even though the
|
||||
// publish leg shipped.
|
||||
//
|
||||
// This bridge subscribes to both subjects and, on each envelope:
|
||||
//
|
||||
// 1. Decodes the Event body into the tenant_id / slug fields the
|
||||
// publishers stamp (see core/services/tenant/handlers/handlers.go +
|
||||
// core/services/billing/handlers/handlers.go dispatchOrderPlaced).
|
||||
// 2. Looks up the Organization CR whose `spec.slug` matches the event's
|
||||
// slug. The CR may not exist yet (e.g. tenant.created arrives
|
||||
// before the operator wrote the CR) — that's a soft miss, we log
|
||||
// and Ack so JetStream advances.
|
||||
// 3. Stamps `openova.io/last-event-observed-at` (RFC3339) +
|
||||
// `openova.io/last-event-subject` on the CR via a patch. The
|
||||
// annotation patch is treated as a generation-2 mutation by
|
||||
// controller-runtime, which enqueues a fresh Reconcile within
|
||||
// ~50ms — far faster than the 30s informer requeue fallback. The
|
||||
// 30s requeue is RETAINED inside Reconcile so a missed NATS message
|
||||
// never strands a CR; subscription is an accelerator, not the only
|
||||
// path.
|
||||
//
|
||||
// The bridge is intentionally idempotent — JetStream guarantees
|
||||
// at-least-once delivery, so the same envelope may arrive twice on a
|
||||
// broker rebalance. Stamping an annotation with the broker-side
|
||||
// Event.Timestamp keeps the patch byte-stable on duplicate delivery,
|
||||
// so controller-runtime does NOT enqueue a redundant Reconcile.
|
||||
//
|
||||
// Per HARD CONSTRAINT: no credential write-paths. The bridge reads
|
||||
// only the Event envelope + the matching CR; it never touches Secrets
|
||||
// or Keycloak service-account creds.
|
||||
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
orgapi "github.com/openova-io/openova/core/controllers/organization/internal/orgapi"
|
||||
"github.com/openova-io/openova/core/controllers/pkg/natsbus"
|
||||
)
|
||||
|
||||
// Annotation keys the NATS bridge writes onto an Organization CR once a
// canonical envelope for it has been observed. The observed-at value is
// derived from the envelope's own Timestamp, so a redelivered envelope
// produces the same annotation value and no second write.
const (
	// AnnotationLastNATSObservedAt carries the timestamp of the most
	// recently observed envelope.
	AnnotationLastNATSObservedAt = "openova.io/last-event-observed-at"
	// AnnotationLastNATSSubject carries the NATS subject of the most
	// recently observed envelope.
	AnnotationLastNATSSubject = "openova.io/last-event-subject"
)
|
||||
|
||||
// NATSBridge is the consume-leg adapter for the organization-controller.
|
||||
// One bridge instance per canonical subject; the bridge handles all
|
||||
// envelopes that match its handler shape.
|
||||
type NATSBridge struct {
|
||||
Client client.Client
|
||||
Log logr.Logger
|
||||
}
|
||||
|
||||
// HandleTenantCreated reacts to a `catalyst.tenant.created` envelope.
|
||||
// The publish-side (PR #1626) ships the tenant doc as the Data payload
|
||||
// — we read `slug` (canonical Org slug) and `id` (tenant id, used for
|
||||
// the audit log). When the matching Organization CR exists, we stamp
|
||||
// the observation annotation so controller-runtime enqueues a fresh
|
||||
// Reconcile. When it does not exist, we log + Ack — the operator (or a
|
||||
// future provisioning controller) is responsible for creating the CR.
|
||||
func (b *NATSBridge) HandleTenantCreated(ctx context.Context, ev *natsbus.Event) error {
|
||||
if ev == nil {
|
||||
return nil
|
||||
}
|
||||
var payload struct {
|
||||
ID string `json:"id"`
|
||||
Slug string `json:"slug"`
|
||||
TenID string `json:"tenant_id"`
|
||||
}
|
||||
if err := json.Unmarshal(ev.Data, &payload); err != nil {
|
||||
// Malformed inside the envelope — log and Ack via the
|
||||
// natsbus dispatcher (returning nil acks). Don't Nak; the
|
||||
// next delivery would fail identically.
|
||||
b.Log.Error(err, "tenant.created: malformed Data payload — ack to skip",
|
||||
"event_id", ev.ID)
|
||||
return nil
|
||||
}
|
||||
slug := strings.TrimSpace(payload.Slug)
|
||||
if slug == "" {
|
||||
// PR #1626 stamps `slug` on the tenant doc; if it's missing
|
||||
// the publish side regressed. Log loudly so the operator
|
||||
// notices but Ack so the subscriber doesn't hot-loop.
|
||||
b.Log.Error(fmt.Errorf("missing slug"), "tenant.created: payload has no slug — ack to skip",
|
||||
"event_id", ev.ID, "tenant_id", payload.TenID)
|
||||
return nil
|
||||
}
|
||||
return b.stampObservation(ctx, slug, natsbus.SubjectTenantCreated, ev)
|
||||
}
|
||||
|
||||
// HandleOrderPlaced reacts to a `catalyst.billing.order.placed`
|
||||
// envelope. The publish-side (PR #1626 dispatchOrderPlaced) ships a
|
||||
// payload enriched with the tenant's subdomain — we read `subdomain`
|
||||
// (matches Org slug on the Sovereign-side wildcard tenancy model) and
|
||||
// `tenant_id` for the audit trail.
|
||||
func (b *NATSBridge) HandleOrderPlaced(ctx context.Context, ev *natsbus.Event) error {
|
||||
if ev == nil {
|
||||
return nil
|
||||
}
|
||||
var payload struct {
|
||||
TenantID string `json:"tenant_id"`
|
||||
Subdomain string `json:"subdomain"`
|
||||
OrgSlug string `json:"org_slug"`
|
||||
}
|
||||
if err := json.Unmarshal(ev.Data, &payload); err != nil {
|
||||
b.Log.Error(err, "order.placed: malformed Data payload — ack to skip",
|
||||
"event_id", ev.ID)
|
||||
return nil
|
||||
}
|
||||
// Prefer the explicit org_slug field when present (forward-compat);
|
||||
// fall back to subdomain which dispatchOrderPlaced currently stamps.
|
||||
slug := strings.TrimSpace(payload.OrgSlug)
|
||||
if slug == "" {
|
||||
slug = strings.TrimSpace(payload.Subdomain)
|
||||
}
|
||||
if slug == "" {
|
||||
b.Log.Error(fmt.Errorf("missing slug"), "order.placed: payload has neither org_slug nor subdomain — ack to skip",
|
||||
"event_id", ev.ID, "tenant_id", payload.TenantID)
|
||||
return nil
|
||||
}
|
||||
return b.stampObservation(ctx, slug, natsbus.SubjectOrderPlaced, ev)
|
||||
}
|
||||
|
||||
// stampObservation looks up the Organization CR by slug and patches in
|
||||
// the two observation annotations. The patch is byte-stable on
|
||||
// duplicate delivery (Event.Timestamp is the broker-side timestamp,
|
||||
// which is fixed per envelope), so controller-runtime does NOT enqueue
|
||||
// a redundant Reconcile.
|
||||
//
|
||||
// Missing CR is not an error — log + return nil so the natsbus
|
||||
// dispatcher Acks. A Nak on a soft miss would hot-loop the subscriber
|
||||
// against a permanently-absent CR.
|
||||
func (b *NATSBridge) stampObservation(ctx context.Context, slug, subject string, ev *natsbus.Event) error {
|
||||
var org orgapi.Organization
|
||||
// Organization is cluster-scoped (see orgapi/types.go), name == slug.
|
||||
if err := b.Client.Get(ctx, types.NamespacedName{Name: slug}, &org); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
b.Log.Info("nats observation: no matching Organization CR — ack and skip",
|
||||
"subject", subject, "slug", slug, "event_id", ev.ID)
|
||||
return nil
|
||||
}
|
||||
// Transient API-server error — return so the dispatcher Naks
|
||||
// and JetStream redelivers after backoff.
|
||||
return fmt.Errorf("get organization %s: %w", slug, err)
|
||||
}
|
||||
|
||||
observedAt := ev.Timestamp.UTC().Format(time.RFC3339Nano)
|
||||
if observedAt == "" || ev.Timestamp.IsZero() {
|
||||
observedAt = time.Now().UTC().Format(time.RFC3339Nano)
|
||||
}
|
||||
|
||||
// Skip the patch when the annotations already match — JetStream's
|
||||
// at-least-once delivery means we will see the same envelope on
|
||||
// broker rebalance, and a redundant patch would churn the informer.
|
||||
cur := org.GetAnnotations()
|
||||
if cur != nil &&
|
||||
cur[AnnotationLastNATSObservedAt] == observedAt &&
|
||||
cur[AnnotationLastNATSSubject] == subject {
|
||||
b.Log.V(1).Info("nats observation: duplicate envelope — skip patch",
|
||||
"subject", subject, "slug", slug, "event_id", ev.ID)
|
||||
return nil
|
||||
}
|
||||
|
||||
desired := &orgapi.Organization{}
|
||||
org.DeepCopyInto(desired)
|
||||
anns := desired.GetAnnotations()
|
||||
if anns == nil {
|
||||
anns = map[string]string{}
|
||||
}
|
||||
anns[AnnotationLastNATSObservedAt] = observedAt
|
||||
anns[AnnotationLastNATSSubject] = subject
|
||||
desired.SetAnnotations(anns)
|
||||
|
||||
if err := b.Client.Patch(ctx, desired, client.MergeFrom(&org)); err != nil {
|
||||
return fmt.Errorf("patch organization %s: %w", slug, err)
|
||||
}
|
||||
b.Log.Info("nats observation stamped — reconcile enqueued",
|
||||
"subject", subject, "slug", slug, "event_id", ev.ID, "observed_at", observedAt)
|
||||
return nil
|
||||
}
|
||||
@ -1,261 +0,0 @@
|
||||
// Unit tests for the NATS consume-leg bridge (D35).
|
||||
//
|
||||
// The handler is wired through a fake controller-runtime client so we
|
||||
// can assert:
|
||||
//
|
||||
// - tenant.created envelope with a matching CR → annotations stamped.
|
||||
// - order.placed envelope with the legacy subdomain field → CR found
|
||||
// and annotated (back-compat with PR #1626 publish-side).
|
||||
// - envelope with no matching CR → handler returns nil (Ack-to-skip),
|
||||
// no patch attempted (assert via list count).
|
||||
// - duplicate envelope (same timestamp) → no redundant patch.
|
||||
// - malformed Data payload → handler returns nil so dispatcher Acks
|
||||
// instead of hot-looping.
|
||||
//
|
||||
// The bridge is decoupled from JetStream by construction — the
|
||||
// natsbus.Handler signature is `func(ctx, *Event) error`, so these
|
||||
// tests exercise the same surface the live subscriber drives.
|
||||
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-logr/logr/testr"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client/fake"
|
||||
|
||||
orgapi "github.com/openova-io/openova/core/controllers/organization/internal/orgapi"
|
||||
"github.com/openova-io/openova/core/controllers/pkg/natsbus"
|
||||
)
|
||||
|
||||
func newBridgeFixture(t *testing.T, objs ...runtime.Object) *NATSBridge {
|
||||
t.Helper()
|
||||
scheme := runtime.NewScheme()
|
||||
if err := clientgoscheme.AddToScheme(scheme); err != nil {
|
||||
t.Fatalf("clientgo addtoscheme: %v", err)
|
||||
}
|
||||
if err := orgapi.AddToScheme(scheme); err != nil {
|
||||
t.Fatalf("orgapi addtoscheme: %v", err)
|
||||
}
|
||||
cb := fake.NewClientBuilder().WithScheme(scheme)
|
||||
if len(objs) > 0 {
|
||||
cb = cb.WithRuntimeObjects(objs...)
|
||||
}
|
||||
return &NATSBridge{
|
||||
Client: cb.Build(),
|
||||
Log: testr.New(t),
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_TenantCreated_HappyPath pins: an envelope on
|
||||
// catalyst.tenant.created with a matching Organization CR results in
|
||||
// both observation annotations being patched. This is the D35 happy
|
||||
// path proof.
|
||||
func TestNATSBridge_TenantCreated_HappyPath(t *testing.T) {
|
||||
org := &orgapi.Organization{
|
||||
ObjectMeta: metav1.ObjectMeta{Name: "acme"},
|
||||
Spec: orgapi.OrganizationSpec{Slug: "acme"},
|
||||
}
|
||||
bridge := newBridgeFixture(t, org)
|
||||
|
||||
ts := time.Date(2026, 5, 18, 12, 34, 56, 789012345, time.UTC)
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"id": "tnt-1",
|
||||
"slug": "acme",
|
||||
})
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-tc-1",
|
||||
Type: "tenant.created",
|
||||
Source: "tenant-service",
|
||||
Timestamp: ts,
|
||||
TenantID: "tnt-1",
|
||||
Data: body,
|
||||
}
|
||||
if err := bridge.HandleTenantCreated(context.Background(), ev); err != nil {
|
||||
t.Fatalf("HandleTenantCreated: %v", err)
|
||||
}
|
||||
|
||||
var got orgapi.Organization
|
||||
if err := bridge.Client.Get(context.Background(), types.NamespacedName{Name: "acme"}, &got); err != nil {
|
||||
t.Fatalf("get organization: %v", err)
|
||||
}
|
||||
anns := got.GetAnnotations()
|
||||
if anns[AnnotationLastNATSSubject] != natsbus.SubjectTenantCreated {
|
||||
t.Errorf("subject annotation: got %q want %q",
|
||||
anns[AnnotationLastNATSSubject], natsbus.SubjectTenantCreated)
|
||||
}
|
||||
wantObservedAt := ts.Format(time.RFC3339Nano)
|
||||
if anns[AnnotationLastNATSObservedAt] != wantObservedAt {
|
||||
t.Errorf("observed-at annotation: got %q want %q",
|
||||
anns[AnnotationLastNATSObservedAt], wantObservedAt)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_OrderPlaced_BackCompatSubdomain pins the back-compat
|
||||
// path: when org_slug is absent, the bridge falls back to the
|
||||
// `subdomain` field PR #1626's dispatchOrderPlaced enriches in.
|
||||
// Bodyguard against the publish-side renaming the field without the
|
||||
// consume-side noticing.
|
||||
func TestNATSBridge_OrderPlaced_BackCompatSubdomain(t *testing.T) {
|
||||
org := &orgapi.Organization{
|
||||
ObjectMeta: metav1.ObjectMeta{Name: "globex"},
|
||||
Spec: orgapi.OrganizationSpec{Slug: "globex"},
|
||||
}
|
||||
bridge := newBridgeFixture(t, org)
|
||||
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"tenant_id": "tnt-2",
|
||||
"subdomain": "globex",
|
||||
})
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-op-1",
|
||||
Type: "order.placed",
|
||||
Source: "billing-service",
|
||||
Timestamp: time.Date(2026, 5, 18, 13, 0, 0, 0, time.UTC),
|
||||
TenantID: "tnt-2",
|
||||
Data: body,
|
||||
}
|
||||
if err := bridge.HandleOrderPlaced(context.Background(), ev); err != nil {
|
||||
t.Fatalf("HandleOrderPlaced: %v", err)
|
||||
}
|
||||
|
||||
var got orgapi.Organization
|
||||
if err := bridge.Client.Get(context.Background(), types.NamespacedName{Name: "globex"}, &got); err != nil {
|
||||
t.Fatalf("get organization: %v", err)
|
||||
}
|
||||
if got.GetAnnotations()[AnnotationLastNATSSubject] != natsbus.SubjectOrderPlaced {
|
||||
t.Errorf("subject annotation: got %q want %q",
|
||||
got.GetAnnotations()[AnnotationLastNATSSubject], natsbus.SubjectOrderPlaced)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_NoMatchingCR pins: an envelope referencing a slug
|
||||
// that doesn't exist returns nil (Ack-to-skip) and does NOT churn the
|
||||
// API server. Critical for cold-start ordering — tenant.created may
|
||||
// arrive before the operator's Organization CR write.
|
||||
func TestNATSBridge_NoMatchingCR(t *testing.T) {
|
||||
bridge := newBridgeFixture(t) // empty fake client
|
||||
|
||||
body, _ := json.Marshal(map[string]any{"slug": "nonexistent"})
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-miss",
|
||||
Type: "tenant.created",
|
||||
Timestamp: time.Now().UTC(),
|
||||
Data: body,
|
||||
}
|
||||
if err := bridge.HandleTenantCreated(context.Background(), ev); err != nil {
|
||||
t.Fatalf("HandleTenantCreated on missing CR returned error (should soft-miss): %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_DuplicateEnvelope_NoChurn pins: replaying the same
|
||||
// envelope (same Timestamp) does not mutate the CR a second time. The
|
||||
// gen-2 controller-runtime informer enqueues on annotation drift; a
|
||||
// byte-stable patch keeps the reconcile queue clean.
|
||||
func TestNATSBridge_DuplicateEnvelope_NoChurn(t *testing.T) {
|
||||
org := &orgapi.Organization{
|
||||
ObjectMeta: metav1.ObjectMeta{Name: "dup", ResourceVersion: "1"},
|
||||
Spec: orgapi.OrganizationSpec{Slug: "dup"},
|
||||
}
|
||||
bridge := newBridgeFixture(t, org)
|
||||
ts := time.Date(2026, 5, 18, 14, 0, 0, 0, time.UTC)
|
||||
body, _ := json.Marshal(map[string]any{"slug": "dup"})
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-dup",
|
||||
Type: "tenant.created",
|
||||
Timestamp: ts,
|
||||
Data: body,
|
||||
}
|
||||
|
||||
// First delivery — patches.
|
||||
if err := bridge.HandleTenantCreated(context.Background(), ev); err != nil {
|
||||
t.Fatalf("first delivery: %v", err)
|
||||
}
|
||||
var afterFirst orgapi.Organization
|
||||
if err := bridge.Client.Get(context.Background(), types.NamespacedName{Name: "dup"}, &afterFirst); err != nil {
|
||||
t.Fatalf("get after first: %v", err)
|
||||
}
|
||||
rvAfterFirst := afterFirst.GetResourceVersion()
|
||||
|
||||
// Second delivery (same envelope, same timestamp) — skip path.
|
||||
if err := bridge.HandleTenantCreated(context.Background(), ev); err != nil {
|
||||
t.Fatalf("second delivery: %v", err)
|
||||
}
|
||||
var afterSecond orgapi.Organization
|
||||
if err := bridge.Client.Get(context.Background(), types.NamespacedName{Name: "dup"}, &afterSecond); err != nil {
|
||||
t.Fatalf("get after second: %v", err)
|
||||
}
|
||||
if afterSecond.GetResourceVersion() != rvAfterFirst {
|
||||
t.Errorf("duplicate envelope mutated CR — rv went %q → %q",
|
||||
rvAfterFirst, afterSecond.GetResourceVersion())
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_MalformedData pins: a Data blob that fails
|
||||
// json.Unmarshal returns nil so the natsbus dispatcher Acks-to-skip.
|
||||
// A Nak would hot-loop the consumer against a poison pill.
|
||||
func TestNATSBridge_MalformedData(t *testing.T) {
|
||||
bridge := newBridgeFixture(t)
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-bad",
|
||||
Type: "tenant.created",
|
||||
Timestamp: time.Now().UTC(),
|
||||
Data: []byte("{not-json"),
|
||||
}
|
||||
if err := bridge.HandleTenantCreated(context.Background(), ev); err != nil {
|
||||
t.Errorf("malformed Data should NOT return error (would Nak + hot-loop), got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_OrderPlaced_PreferOrgSlug pins: when both org_slug and
|
||||
// subdomain are present, org_slug wins. Forward-compat with the
|
||||
// publish-side normalizing to the explicit field.
|
||||
func TestNATSBridge_OrderPlaced_PreferOrgSlug(t *testing.T) {
|
||||
org := &orgapi.Organization{
|
||||
ObjectMeta: metav1.ObjectMeta{Name: "winner"},
|
||||
Spec: orgapi.OrganizationSpec{Slug: "winner"},
|
||||
}
|
||||
loser := &orgapi.Organization{
|
||||
ObjectMeta: metav1.ObjectMeta{Name: "loser"},
|
||||
Spec: orgapi.OrganizationSpec{Slug: "loser"},
|
||||
}
|
||||
bridge := newBridgeFixture(t, org, loser)
|
||||
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"tenant_id": "tnt-99",
|
||||
"org_slug": "winner",
|
||||
"subdomain": "loser",
|
||||
})
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-pref",
|
||||
Type: "order.placed",
|
||||
Timestamp: time.Now().UTC(),
|
||||
Data: body,
|
||||
}
|
||||
if err := bridge.HandleOrderPlaced(context.Background(), ev); err != nil {
|
||||
t.Fatalf("HandleOrderPlaced: %v", err)
|
||||
}
|
||||
|
||||
var winner orgapi.Organization
|
||||
if err := bridge.Client.Get(context.Background(), types.NamespacedName{Name: "winner"}, &winner); err != nil {
|
||||
t.Fatalf("get winner: %v", err)
|
||||
}
|
||||
if _, ok := winner.GetAnnotations()[AnnotationLastNATSSubject]; !ok {
|
||||
t.Error("winner Organization was not annotated despite org_slug match")
|
||||
}
|
||||
|
||||
var loserGot orgapi.Organization
|
||||
if err := bridge.Client.Get(context.Background(), types.NamespacedName{Name: "loser"}, &loserGot); err != nil {
|
||||
t.Fatalf("get loser: %v", err)
|
||||
}
|
||||
if _, ok := loserGot.GetAnnotations()[AnnotationLastNATSSubject]; ok {
|
||||
t.Error("loser Organization was unexpectedly annotated; org_slug should outrank subdomain")
|
||||
}
|
||||
}
|
||||
@ -38,9 +38,9 @@ import (
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/openova-io/openova/core/controllers/pkg/gitea"
|
||||
"github.com/openova-io/openova/core/controllers/organization/internal/gitops"
|
||||
orgapi "github.com/openova-io/openova/core/controllers/organization/internal/orgapi"
|
||||
"github.com/openova-io/openova/core/controllers/pkg/gitea"
|
||||
)
|
||||
|
||||
// userAccessGVR is the namespace-scoped UserAccess CR group/version/kind
|
||||
@ -243,19 +243,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
|
||||
return r.fail(ctx, &org, idpCond.Reason, fedErr.Error())
|
||||
}
|
||||
|
||||
// 5b. Per-tenant public-hostname HTTPRoute (issue #1629 follow-up).
|
||||
// When `spec.tenantPublic.parentDomain` is set, render a Gateway-API
|
||||
// HTTPRoute attaching `<subdomain>.<parentDomain>` to the supplied
|
||||
// backend Service on the canonical cilium-gateway. No-op when the
|
||||
// field is empty — Orgs that don't yet have a public hostname keep
|
||||
// working via the Sovereign-wide `*.<sovFQDN>` tenant-wildcard
|
||||
// route. Failure is non-fatal for the Org's other reconciliation
|
||||
// outputs (Keycloak group + Gitea Org + vCluster manifests already
|
||||
// landed) so we requeue instead of marking the whole Org Failed.
|
||||
if _, err := r.reconcileTenantRoute(ctx, &org); err != nil {
|
||||
return r.fail(ctx, &org, "TenantRouteFailed", err.Error())
|
||||
}
|
||||
|
||||
// 6. Status update — Ready=True plus the per-step federation
|
||||
// conditions (always present so the access-matrix UI can render
|
||||
// the federation column without conditional logic).
|
||||
|
||||
@ -36,8 +36,8 @@ import (
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client/fake"
|
||||
|
||||
orgapi "github.com/openova-io/openova/core/controllers/organization/internal/orgapi"
|
||||
"github.com/openova-io/openova/core/controllers/pkg/gitea"
|
||||
orgapi "github.com/openova-io/openova/core/controllers/organization/internal/orgapi"
|
||||
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
)
|
||||
@ -53,10 +53,10 @@ type fakeKeycloak struct {
|
||||
groupPath string
|
||||
|
||||
// Federation surface (F2).
|
||||
idps map[string]KCIdentityProvider
|
||||
mappers map[string][]KCIdentityProviderMapper // key = alias
|
||||
idpEnsureCalls int
|
||||
idpDeleteCalls int
|
||||
idps map[string]KCIdentityProvider
|
||||
mappers map[string][]KCIdentityProviderMapper // key = alias
|
||||
idpEnsureCalls int
|
||||
idpDeleteCalls int
|
||||
mapperEnsureCalls int
|
||||
}
|
||||
|
||||
@ -661,10 +661,10 @@ func TestReconcile_Missing_NoError(t *testing.T) {
|
||||
// no Pod was ever scheduled.
|
||||
//
|
||||
// This test asserts:
|
||||
// 1. Upsert writes the UserAccess CR into the configured
|
||||
// r.UserAccessNamespace (default `catalyst-system`).
|
||||
// 2. The CR carries metadata.namespace == that namespace (NOT empty).
|
||||
// 3. The owner-per-CR mapping holds (1 owner = 1 CR).
|
||||
// 1. Upsert writes the UserAccess CR into the configured
|
||||
// r.UserAccessNamespace (default `catalyst-system`).
|
||||
// 2. The CR carries metadata.namespace == that namespace (NOT empty).
|
||||
// 3. The owner-per-CR mapping holds (1 owner = 1 CR).
|
||||
func TestUpsertUserAccess_NamespaceScoped(t *testing.T) {
|
||||
t.Parallel()
|
||||
org := sampleOrg()
|
||||
@ -752,114 +752,3 @@ func TestUpsertUserAccess_DefaultsToCatalystSystem(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_TenantPublic_RendersHTTPRoute covers the issue #1629
|
||||
// follow-up: when spec.tenantPublic.parentDomain is set, the reconciler
|
||||
// MUST render an HTTPRoute in the Org's namespace pointing at the
|
||||
// supplied backend Service. Without this, PowerDNS-resolved tenant
|
||||
// hostnames (e.g. `acme.omani.homes`) fall through to the marketplace
|
||||
// `tenant-wildcard` route and 404 instead of hitting the tenant's
|
||||
// installed WordPress.
|
||||
func TestReconcile_TenantPublic_RendersHTTPRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
org := sampleOrg()
|
||||
org.Spec.TenantPublic = orgapi.OrganizationTenantPublic{
|
||||
ParentDomain: "omani.homes",
|
||||
BackendService: "wordpress-x-acme-x-vcluster",
|
||||
BackendPort: 80,
|
||||
Product: "wordpress",
|
||||
}
|
||||
|
||||
// Register HTTPRoute (Gateway API) with the fake client's scheme so
|
||||
// it can serialise the unstructured object the reconciler writes.
|
||||
r, _, _ := makeReconciler(t, org)
|
||||
scheme := r.Scheme()
|
||||
scheme.AddKnownTypeWithName(schema.GroupVersionKind{
|
||||
Group: "gateway.networking.k8s.io", Version: "v1", Kind: "HTTPRoute",
|
||||
}, &unstructured.Unstructured{})
|
||||
scheme.AddKnownTypeWithName(schema.GroupVersionKind{
|
||||
Group: "gateway.networking.k8s.io", Version: "v1", Kind: "HTTPRouteList",
|
||||
}, &unstructured.UnstructuredList{})
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: "acme"},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
|
||||
hr := unstructured.Unstructured{}
|
||||
hr.SetGroupVersionKind(schema.GroupVersionKind{
|
||||
Group: "gateway.networking.k8s.io", Version: "v1", Kind: "HTTPRoute",
|
||||
})
|
||||
if err := r.Get(context.Background(), client.ObjectKey{Namespace: "acme", Name: "acme"}, &hr); err != nil {
|
||||
t.Fatalf("get HTTPRoute acme/acme: %v", err)
|
||||
}
|
||||
hostnames, _, _ := unstructured.NestedSlice(hr.Object, "spec", "hostnames")
|
||||
if len(hostnames) != 1 || hostnames[0] != "acme.omani.homes" {
|
||||
t.Errorf("hostnames: got %v, want [acme.omani.homes]", hostnames)
|
||||
}
|
||||
parents, _, _ := unstructured.NestedSlice(hr.Object, "spec", "parentRefs")
|
||||
if len(parents) != 1 {
|
||||
t.Fatalf("parentRefs: got %d, want 1", len(parents))
|
||||
}
|
||||
pr := parents[0].(map[string]any)
|
||||
if pr["name"] != "cilium-gateway" || pr["namespace"] != "kube-system" {
|
||||
t.Errorf("parentRef: got %+v, want cilium-gateway/kube-system", pr)
|
||||
}
|
||||
rules, _, _ := unstructured.NestedSlice(hr.Object, "spec", "rules")
|
||||
if len(rules) != 1 {
|
||||
t.Fatalf("rules: got %d, want 1", len(rules))
|
||||
}
|
||||
brs, _, _ := unstructured.NestedSlice(rules[0].(map[string]any), "backendRefs")
|
||||
if len(brs) != 1 {
|
||||
t.Fatalf("backendRefs: got %d, want 1", len(brs))
|
||||
}
|
||||
br := brs[0].(map[string]any)
|
||||
if br["name"] != "wordpress-x-acme-x-vcluster" {
|
||||
t.Errorf("backendRef name: got %v, want wordpress-x-acme-x-vcluster", br["name"])
|
||||
}
|
||||
labels := hr.GetLabels()
|
||||
if labels["catalyst.openova.io/tenant-product"] != "wordpress" {
|
||||
t.Errorf("expected tenant-product=wordpress label, got %q",
|
||||
labels["catalyst.openova.io/tenant-product"])
|
||||
}
|
||||
if labels["catalyst.openova.io/parent-zone"] != "omani.homes" {
|
||||
t.Errorf("expected parent-zone=omani.homes label, got %q",
|
||||
labels["catalyst.openova.io/parent-zone"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_TenantPublic_DisabledByDefault covers the no-op path:
|
||||
// when spec.tenantPublic.parentDomain is empty (the default for every
|
||||
// existing Org CR), NO HTTPRoute MUST be written. Without this guard
|
||||
// every legacy Org would suddenly try to render an HTTPRoute and the
|
||||
// reconciler would surface TenantRouteFailed because BackendService is
|
||||
// empty.
|
||||
func TestReconcile_TenantPublic_DisabledByDefault(t *testing.T) {
|
||||
t.Parallel()
|
||||
org := sampleOrg() // no TenantPublic set
|
||||
r, _, _ := makeReconciler(t, org)
|
||||
scheme := r.Scheme()
|
||||
scheme.AddKnownTypeWithName(schema.GroupVersionKind{
|
||||
Group: "gateway.networking.k8s.io", Version: "v1", Kind: "HTTPRoute",
|
||||
}, &unstructured.Unstructured{})
|
||||
scheme.AddKnownTypeWithName(schema.GroupVersionKind{
|
||||
Group: "gateway.networking.k8s.io", Version: "v1", Kind: "HTTPRouteList",
|
||||
}, &unstructured.UnstructuredList{})
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: "acme"},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
|
||||
hrList := unstructured.UnstructuredList{}
|
||||
hrList.SetGroupVersionKind(schema.GroupVersionKind{
|
||||
Group: "gateway.networking.k8s.io", Version: "v1", Kind: "HTTPRouteList",
|
||||
})
|
||||
if err := r.List(context.Background(), &hrList); err != nil {
|
||||
t.Fatalf("list HTTPRoute: %v", err)
|
||||
}
|
||||
if len(hrList.Items) != 0 {
|
||||
t.Errorf("expected 0 HTTPRoutes when tenantPublic is unset, got %d", len(hrList.Items))
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,190 +0,0 @@
|
||||
// tenant_route.go — per-Organization HTTPRoute reconciler.
|
||||
//
|
||||
// Issue #1629 follow-up. PowerDNS now resolves `<slug>.<parentDomain>`
|
||||
// (e.g. `acme.omani.homes`) for every Org whose Sovereign has a
|
||||
// parent_domains entry with role=sme-pool, but no HTTPRoute attaches
|
||||
// that hostname to the Org's installed product Service. Result: the
|
||||
// Cilium Gateway happily terminates TLS on the wildcard cert, then
|
||||
// returns the storefront landing page (the only HTTPRoute attached
|
||||
// to `*.<sovFQDN>` is the `tenant-wildcard` route → marketplace
|
||||
// console Service) instead of the tenant's WordPress / Nextcloud /
|
||||
// GitLab install.
|
||||
//
|
||||
// The fix is reconciler-side: when `spec.tenantPublic.parentDomain`
|
||||
// is set on an Organization, the controller renders a per-tenant
|
||||
// HTTPRoute in the Org's namespace (= spec.slug) pointing at the
|
||||
// supplied BackendService. The route attaches to the canonical
|
||||
// `cilium-gateway/kube-system` parent — the same parent the
|
||||
// marketplace, back-office, and tenant-wildcard routes already attach
|
||||
// to — and surfaces `<subdomain>.<parentDomain>` as its hostname so
|
||||
// the Cilium Gateway hostname matcher picks the per-tenant route
|
||||
// over the wildcard for any request matching the exact host.
|
||||
//
|
||||
// Design notes:
|
||||
//
|
||||
// - HTTPRoute is created/updated via the controller-runtime client
|
||||
// with an Unstructured object (same pattern continuum/switchover
|
||||
// uses for HTTPRoute weight drains). This avoids pulling in the
|
||||
// gateway-api Go types for a single resource.
|
||||
// - BackendService is treated as a Service in the Org's own
|
||||
// namespace — no ReferenceGrant required. Operators that point
|
||||
// at a cross-namespace Service (rare) can ship the
|
||||
// ReferenceGrant alongside the Org.
|
||||
// - The HTTPRoute name is the Org slug (deterministic, idempotent).
|
||||
// OwnerReferences are intentionally NOT set: Organizations are
|
||||
// cluster-scoped while the HTTPRoute is namespaced, and K8s rejects
|
||||
// namespaced→cluster OwnerReferences. Deletion is handled by the
|
||||
// Org's namespace teardown (when the Org's vCluster ns is
|
||||
// removed, every HTTPRoute under it goes with it).
|
||||
// - Skipped silently when ParentDomain is empty (the zero-value
|
||||
// case for Orgs that don't yet have a public hostname).
|
||||
//
|
||||
// Per docs/INVIOLABLE-PRINCIPLES.md #4 every operationally-meaningful
|
||||
// value flows through the CR — no hardcoded gateway name, parent
|
||||
// namespace, or port number in the renderer.
|
||||
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
|
||||
"k8s.io/apimachinery/pkg/runtime/schema"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
orgapi "github.com/openova-io/openova/core/controllers/organization/internal/orgapi"
|
||||
)
|
||||
|
||||
// httpRouteGVK identifies the Gateway API HTTPRoute v1 resource the
|
||||
// reconciler writes. Matches the GVK referenced by the existing
|
||||
// marketplace-routes.yaml, httproute.yaml, and continuum/switchover
|
||||
// drainers — every Cilium Gateway-API path on a Sovereign goes through
|
||||
// gateway.networking.k8s.io/v1.HTTPRoute.
|
||||
var httpRouteGVK = schema.GroupVersionKind{
|
||||
Group: "gateway.networking.k8s.io",
|
||||
Version: "v1",
|
||||
Kind: "HTTPRoute",
|
||||
}
|
||||
|
||||
// Defaults the tenant-route reconciler falls back on when the
// Organization spec leaves the corresponding value unset.
const (
	// tenantRouteDefaultGatewayName is the default parent Gateway name.
	tenantRouteDefaultGatewayName = "cilium-gateway"
	// tenantRouteDefaultGatewayNamespace is the default namespace of
	// the parent Gateway.
	tenantRouteDefaultGatewayNamespace = "kube-system"
	// tenantRouteDefaultBackendPort is the backend Service port used
	// when the spec's port is zero.
	tenantRouteDefaultBackendPort int32 = 80
)
|
||||
|
||||
// reconcileTenantRoute creates or updates the per-Organization
|
||||
// HTTPRoute when `spec.tenantPublic.parentDomain` is set. Returns
|
||||
// (rendered=true, nil) when the route was written, (false, nil) when
|
||||
// the feature is disabled (empty parentDomain), or (false, err) on a
|
||||
// transient write failure (the parent reconciler requeues).
|
||||
func (r *Reconciler) reconcileTenantRoute(ctx context.Context, org *orgapi.Organization) (bool, error) {
|
||||
tp := org.Spec.TenantPublic
|
||||
parentDomain := strings.TrimSpace(tp.ParentDomain)
|
||||
if parentDomain == "" {
|
||||
// Feature disabled — Orgs that don't yet have a public
|
||||
// hostname are accessed via the Sovereign-wide
|
||||
// `*.<sovFQDN>` tenant-wildcard route. No-op + no condition
|
||||
// surfacing (matches the existing reconciler's quiet-mode
|
||||
// for unset optional fields).
|
||||
return false, nil
|
||||
}
|
||||
|
||||
subdomain := strings.TrimSpace(tp.Subdomain)
|
||||
if subdomain == "" {
|
||||
subdomain = org.Spec.Slug
|
||||
}
|
||||
backend := strings.TrimSpace(tp.BackendService)
|
||||
if backend == "" {
|
||||
return false, fmt.Errorf("tenantPublic.backendService is required when parentDomain is set")
|
||||
}
|
||||
port := tp.BackendPort
|
||||
if port == 0 {
|
||||
port = tenantRouteDefaultBackendPort
|
||||
}
|
||||
|
||||
hostname := fmt.Sprintf("%s.%s", subdomain, parentDomain)
|
||||
ns := org.Spec.Slug
|
||||
name := org.Spec.Slug
|
||||
|
||||
labels := map[string]string{
|
||||
"openova.io/organization": org.Spec.Slug,
|
||||
"openova.io/sovereign": org.Spec.SovereignRef,
|
||||
"openova.io/managed-by": "organization-controller",
|
||||
"app.kubernetes.io/managed-by": "catalyst",
|
||||
"catalyst.openova.io/component": "tenant-public-route",
|
||||
"catalyst.openova.io/parent-zone": parentDomain,
|
||||
}
|
||||
if p := strings.TrimSpace(tp.Product); p != "" {
|
||||
labels["catalyst.openova.io/tenant-product"] = p
|
||||
}
|
||||
|
||||
desiredSpec := map[string]any{
|
||||
"parentRefs": []any{
|
||||
map[string]any{
|
||||
"name": tenantRouteDefaultGatewayName,
|
||||
"namespace": tenantRouteDefaultGatewayNamespace,
|
||||
},
|
||||
},
|
||||
"hostnames": []any{hostname},
|
||||
"rules": []any{
|
||||
map[string]any{
|
||||
"matches": []any{
|
||||
map[string]any{
|
||||
"path": map[string]any{
|
||||
"type": "PathPrefix",
|
||||
"value": "/",
|
||||
},
|
||||
},
|
||||
},
|
||||
"backendRefs": []any{
|
||||
map[string]any{
|
||||
"name": backend,
|
||||
"port": int64(port),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
desired := unstructured.Unstructured{}
|
||||
desired.SetGroupVersionKind(httpRouteGVK)
|
||||
desired.SetName(name)
|
||||
desired.SetNamespace(ns)
|
||||
desired.SetLabels(labels)
|
||||
desired.Object["spec"] = desiredSpec
|
||||
|
||||
current := unstructured.Unstructured{}
|
||||
current.SetGroupVersionKind(httpRouteGVK)
|
||||
err := r.Get(ctx, client.ObjectKey{Namespace: ns, Name: name}, ¤t)
|
||||
if err != nil {
|
||||
if !apierrors.IsNotFound(err) {
|
||||
return false, fmt.Errorf("get HTTPRoute %s/%s: %w", ns, name, err)
|
||||
}
|
||||
if err := r.Create(ctx, &desired); err != nil {
|
||||
if apierrors.IsAlreadyExists(err) {
|
||||
// Race: another reconcile created it between Get
|
||||
// and Create. Re-Get + Update on next pass.
|
||||
return true, nil
|
||||
}
|
||||
return false, fmt.Errorf("create HTTPRoute %s/%s: %w", ns, name, err)
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// Update: copy desired spec + labels onto current (preserves
|
||||
// resourceVersion + any operator-added annotations).
|
||||
current.Object["spec"] = desiredSpec
|
||||
current.SetLabels(labels)
|
||||
if err := r.Update(ctx, ¤t); err != nil {
|
||||
return false, fmt.Errorf("update HTTPRoute %s/%s: %w", ns, name, err)
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
@ -84,66 +84,6 @@ type OrganizationSpec struct {
|
||||
// Identity holds optional federation config — empty means use the
|
||||
// Sovereign's own Keycloak realm.
|
||||
Identity OrganizationIdentity `json:"identity,omitempty"`
|
||||
|
||||
// TenantPublic optionally exposes the Org's installed product on a
|
||||
// per-tenant public hostname. When set the organization-controller
|
||||
// renders a Gateway-API HTTPRoute in the Org's namespace pointing at
|
||||
// the supplied backend Service. When empty (the zero value) the
|
||||
// controller skips the HTTPRoute step — Orgs that don't yet have a
|
||||
// product installed (or that are accessed only via the per-Sovereign
|
||||
// console wildcard `*.<sovFQDN>`) keep working unchanged.
|
||||
//
|
||||
// The motivating use case (issue #1629 follow-up) is the
|
||||
// `<slug>.omani.homes` family of tenant hostnames: PowerDNS now
|
||||
// resolves them via the sme-pool parent zone reconciler, but no
|
||||
// HTTPRoute was attaching them to the tenant's WordPress install.
|
||||
// Without this struct that traffic 404s at the Cilium Gateway.
|
||||
TenantPublic OrganizationTenantPublic `json:"tenantPublic,omitempty"`
|
||||
}
|
||||
|
||||
// OrganizationTenantPublic is the per-tenant public-hostname binding
// the organization-controller renders into an HTTPRoute on Ready Orgs.
//
// All fields are optional at the CRD level — the controller treats an
// empty ParentDomain as "do not render". Defaulting rules:
//
//   - Subdomain defaults to spec.slug.
//   - BackendPort defaults to 80 (the conventional HTTP port WordPress,
//     Nextcloud, GitLab, BookStack, and Ghost all listen on inside the
//     vCluster).
//
// Per docs/INVIOLABLE-PRINCIPLES.md #4 no value is hardcoded inside the
// renderer — every knob flows through the CR.
type OrganizationTenantPublic struct {
	// ParentDomain is the apex zone the per-tenant hostname lives
	// under (e.g. "omani.homes"). Sovereign-wide parentZones lists the
	// pool of valid candidates; this field picks one specific apex per
	// Organization. Required to render the HTTPRoute — empty disables
	// the whole TenantPublic feature for this Org.
	ParentDomain string `json:"parentDomain,omitempty"`

	// Subdomain is the leftmost label of the per-tenant hostname.
	// Defaults to spec.slug when empty so the canonical
	// `<slug>.<parentDomain>` hostname renders without extra config.
	Subdomain string `json:"subdomain,omitempty"`

	// BackendService is the Service name the HTTPRoute routes "/" to —
	// e.g. `wordpress` for an in-cluster WordPress install, or the
	// vCluster-synced `wordpress-x-<slug>-x-vcluster` name when the
	// product lives inside a vCluster. The Service MUST resolve in the
	// Org's host namespace (= spec.slug) so the HTTPRoute backendRefs
	// don't need cross-namespace ReferenceGrants.
	BackendService string `json:"backendService,omitempty"`

	// BackendPort is the Service port number to route to. Defaults to
	// 80 when zero.
	BackendPort int32 `json:"backendPort,omitempty"`

	// Product is an operator-meaningful tag carried on the rendered
	// HTTPRoute's labels (e.g. "wordpress", "nextcloud", "gitlab").
	// Surfaced on the access-matrix UI so operators can filter routes
	// by installed product. Optional — empty just omits the label.
	Product string `json:"product,omitempty"`
}
|
||||
|
||||
// OrganizationOwner is an entry in spec.owners.
|
||||
|
||||
@ -1,223 +0,0 @@
|
||||
// Issue CRUD on top of the canonical pkg/gitea Client.
|
||||
//
|
||||
// pulls.go covers the PR read surface (Wave 8) + merge (Wave 11). This
|
||||
// file covers the Issue read+write surface needed by Wave 11's
|
||||
// openova-sandbox-mcp tools (`gitea.issue.list / get / create /
|
||||
// comment`). Same client envelope; same error mapping (ErrRepoNotFound
|
||||
// on 404; *HTTPError otherwise).
|
||||
//
|
||||
// New endpoints (Gitea Admin REST API):
|
||||
//
|
||||
// GET /api/v1/repos/{owner}/{repo}/issues?state=...&page=...&limit=50
|
||||
// GET /api/v1/repos/{owner}/{repo}/issues/{index}
|
||||
// POST /api/v1/repos/{owner}/{repo}/issues
|
||||
// POST /api/v1/repos/{owner}/{repo}/issues/{index}/comments
|
||||
//
|
||||
// Gitea conflates issues + PRs on the same /issues collection (PRs have
|
||||
// `pull_request != nil`); the MCP `gitea.pr.*` family uses the dedicated
|
||||
// /pulls endpoint, so callers wanting true issues only should either
|
||||
// pass `type=issues` (Gitea ≥1.20) or filter on Issue.IsPullRequest().
|
||||
package gitea
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
)
|
||||
|
||||
// IssueState constrains the `state` query param on ListIssues. Gitea
// accepts "open" | "closed" | "all"; empty defaults server-side to
// "open" but we make the wire shape explicit.
type IssueState string

const (
	// IssueStateOpen lists only open issues (Gitea default).
	IssueStateOpen IssueState = "open"
	// IssueStateClosed lists only closed issues.
	IssueStateClosed IssueState = "closed"
	// IssueStateAll lists every issue regardless of state.
	IssueStateAll IssueState = "all"
)
|
||||
|
||||
// Issue is the slice of Gitea Issue fields the MCP tools surface.
//
// Gitea's `pull_request` field is non-nil when the row is actually a PR
// — callers wanting true issues should `.IsPullRequest()` filter, or
// pass `type=issues` to ListIssues for server-side scoping.
type Issue struct {
	ID          int64      `json:"id,omitempty"`
	Number      int64      `json:"number,omitempty"`
	URL         string     `json:"html_url,omitempty"`
	State       string     `json:"state,omitempty"`
	Title       string     `json:"title,omitempty"`
	Body        string     `json:"body,omitempty"`
	CreatedAt   *time.Time `json:"created_at,omitempty"`
	UpdatedAt   *time.Time `json:"updated_at,omitempty"`
	ClosedAt    *time.Time `json:"closed_at,omitempty"`
	PullRequest *struct {
		// Only set when the row is a PR. Non-nil → IsPullRequest()==true.
		Merged bool `json:"merged,omitempty"`
	} `json:"pull_request,omitempty"`
}
|
||||
|
||||
// IsPullRequest reports whether the row is actually a Pull Request on
|
||||
// Gitea's shared /issues collection. Callers wanting true issues only
|
||||
// should `if !i.IsPullRequest()` filter.
|
||||
func (i Issue) IsPullRequest() bool { return i.PullRequest != nil }
|
||||
|
||||
// IssueComment is the slice of Gitea Issue-comment fields the MCP
// `gitea.issue.comment` tool surfaces back to the agent.
type IssueComment struct {
	ID        int64      `json:"id,omitempty"`
	URL       string     `json:"html_url,omitempty"`
	Body      string     `json:"body,omitempty"`
	CreatedAt *time.Time `json:"created_at,omitempty"`
	UpdatedAt *time.Time `json:"updated_at,omitempty"`
}
|
||||
|
||||
// ListIssuesOpts threads optional filters through ListIssues without
|
||||
// growing the positional signature.
|
||||
type ListIssuesOpts struct {
|
||||
// State filters by open/closed/all. Empty → server default ("open").
|
||||
State IssueState
|
||||
// Type filters by "issues" | "pulls" | "" (both). Empty → both, with
|
||||
// PRs distinguishable via Issue.IsPullRequest().
|
||||
Type string
|
||||
}
|
||||
|
||||
// issueCreate is the body of POST /repos/{owner}/{repo}/issues.
// Title is always serialized; Body is omitted when empty.
type issueCreate struct {
	Title string `json:"title"`
	Body  string `json:"body,omitempty"`
}
|
||||
|
||||
// issueCommentCreate is the body of POST /issues/{index}/comments.
type issueCommentCreate struct {
	Body string `json:"body"`
}
|
||||
|
||||
// ListIssues returns every issue on the repo matching opts, walking
|
||||
// Gitea's pagination (page=1..N, limit=50). Result order matches what
|
||||
// Gitea returns (typically newest-first by creation).
|
||||
//
|
||||
// Returns ErrRepoNotFound on a first-page 404. Subsequent pagination
|
||||
// failures bubble up as *HTTPError.
|
||||
//
|
||||
// Added Wave 11 for openova-sandbox-mcp `gitea.issue.list`.
|
||||
func (c *Client) ListIssues(ctx context.Context, org, repo string, opts ListIssuesOpts) ([]Issue, error) {
|
||||
if org == "" || repo == "" {
|
||||
return nil, errors.New("gitea: ListIssues requires non-empty org, repo")
|
||||
}
|
||||
const pageSize = 50
|
||||
out := make([]Issue, 0, pageSize)
|
||||
for page := 1; ; page++ {
|
||||
q := url.Values{}
|
||||
q.Set("limit", fmt.Sprintf("%d", pageSize))
|
||||
q.Set("page", fmt.Sprintf("%d", page))
|
||||
if opts.State != "" {
|
||||
q.Set("state", string(opts.State))
|
||||
}
|
||||
if opts.Type != "" {
|
||||
q.Set("type", opts.Type)
|
||||
}
|
||||
endpoint := fmt.Sprintf("/repos/%s/%s/issues?%s",
|
||||
url.PathEscape(org), url.PathEscape(repo), q.Encode())
|
||||
var batch []Issue
|
||||
status, _, err := c.do(ctx, http.MethodGet, endpoint, nil, &batch)
|
||||
if err != nil {
|
||||
if page == 1 && status == http.StatusNotFound {
|
||||
return nil, ErrRepoNotFound
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, batch...)
|
||||
if len(batch) < pageSize {
|
||||
break
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// GetIssue fetches a single Issue by number. Returns ErrRepoNotFound on
|
||||
// any 404 (Gitea doesn't distinguish "repo gone" from "issue number gone"
|
||||
// cleanly on this endpoint), or *HTTPError otherwise. Callers can
|
||||
// `IsNotFound(err)` to fold the 404 case.
|
||||
//
|
||||
// Added Wave 11 for openova-sandbox-mcp `gitea.issue.get`.
|
||||
func (c *Client) GetIssue(ctx context.Context, org, repo string, number int64) (Issue, error) {
|
||||
if org == "" || repo == "" {
|
||||
return Issue{}, errors.New("gitea: GetIssue requires non-empty org, repo")
|
||||
}
|
||||
if number <= 0 {
|
||||
return Issue{}, errors.New("gitea: GetIssue requires positive issue number")
|
||||
}
|
||||
endpoint := fmt.Sprintf("/repos/%s/%s/issues/%d",
|
||||
url.PathEscape(org), url.PathEscape(repo), number)
|
||||
var out Issue
|
||||
status, _, err := c.do(ctx, http.MethodGet, endpoint, nil, &out)
|
||||
if err != nil {
|
||||
if status == http.StatusNotFound {
|
||||
return Issue{}, ErrRepoNotFound
|
||||
}
|
||||
return Issue{}, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// CreateIssue opens a new Issue on (org, repo). Returns ErrRepoNotFound
|
||||
// on 404 (repo doesn't exist); other non-2xx surface as *HTTPError.
|
||||
// Idempotency: Gitea does NOT de-duplicate issues by title — calling
|
||||
// CreateIssue twice opens TWO issues. The MCP tool exposes this verbatim;
|
||||
// callers wanting find-or-create semantics should ListIssues first.
|
||||
//
|
||||
// Added Wave 11 for openova-sandbox-mcp `gitea.issue.create`.
|
||||
func (c *Client) CreateIssue(ctx context.Context, org, repo, title, body string) (Issue, error) {
|
||||
if org == "" || repo == "" {
|
||||
return Issue{}, errors.New("gitea: CreateIssue requires non-empty org, repo")
|
||||
}
|
||||
if title == "" {
|
||||
return Issue{}, errors.New("gitea: CreateIssue requires non-empty title")
|
||||
}
|
||||
endpoint := fmt.Sprintf("/repos/%s/%s/issues",
|
||||
url.PathEscape(org), url.PathEscape(repo))
|
||||
var out Issue
|
||||
status, _, err := c.do(ctx, http.MethodPost, endpoint, issueCreate{Title: title, Body: body}, &out)
|
||||
if err != nil {
|
||||
if status == http.StatusNotFound {
|
||||
return Issue{}, ErrRepoNotFound
|
||||
}
|
||||
return Issue{}, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// CommentOnIssue posts a comment on issue #number. Works on both true
|
||||
// issues and PR rows (Gitea conflates them on the /comments endpoint).
|
||||
// Returns ErrRepoNotFound on 404; other non-2xx surface as *HTTPError.
|
||||
//
|
||||
// Added Wave 11 for openova-sandbox-mcp `gitea.issue.comment`.
|
||||
func (c *Client) CommentOnIssue(ctx context.Context, org, repo string, number int64, body string) (IssueComment, error) {
|
||||
if org == "" || repo == "" {
|
||||
return IssueComment{}, errors.New("gitea: CommentOnIssue requires non-empty org, repo")
|
||||
}
|
||||
if number <= 0 {
|
||||
return IssueComment{}, errors.New("gitea: CommentOnIssue requires positive issue number")
|
||||
}
|
||||
if body == "" {
|
||||
return IssueComment{}, errors.New("gitea: CommentOnIssue requires non-empty body")
|
||||
}
|
||||
endpoint := fmt.Sprintf("/repos/%s/%s/issues/%d/comments",
|
||||
url.PathEscape(org), url.PathEscape(repo), number)
|
||||
var out IssueComment
|
||||
status, _, err := c.do(ctx, http.MethodPost, endpoint, issueCommentCreate{Body: body}, &out)
|
||||
if err != nil {
|
||||
if status == http.StatusNotFound {
|
||||
return IssueComment{}, ErrRepoNotFound
|
||||
}
|
||||
return IssueComment{}, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
@ -1,346 +0,0 @@
|
||||
// Wave-11 tests for MergePullRequest + Issue CRUD (issues.go).
|
||||
//
|
||||
// We don't extend the existing pullsFake / fakeGitea handlers — they were
|
||||
// frozen to lock the surface they cover. A focused per-feature fake
|
||||
// keeps the assertion radius small and avoids regressing the existing
|
||||
// list/get coverage when this file's expectations evolve.
|
||||
package gitea
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mergeIssuesFake handles ONLY the endpoints touched by MergePullRequest
|
||||
// + the four Issue methods. Anything else 404s so a test catching a typo
|
||||
// gets a clear "unhandled" failure instead of a silent pass.
|
||||
type mergeIssuesFake struct {
|
||||
mu sync.Mutex
|
||||
|
||||
// merged is keyed by "<org>/<repo>/<number>" → the Do style the
|
||||
// client posted. Lets a test assert "we passed style=squash".
|
||||
merged map[string]string
|
||||
|
||||
// issuesByRepo is keyed by "<org>/<repo>" → ordered issue list.
|
||||
// Mutation: CreateIssue appends a new entry with an
|
||||
// auto-incrementing Number; GET /issues/{idx} reads it back.
|
||||
issuesByRepo map[string][]Issue
|
||||
|
||||
// comments is keyed by "<org>/<repo>/<number>" → comment count.
|
||||
// We don't store the comment bodies — the test asserts the
|
||||
// returned IssueComment shape end-to-end.
|
||||
comments map[string]int
|
||||
}
|
||||
|
||||
func newMergeIssuesFake() *mergeIssuesFake {
|
||||
return &mergeIssuesFake{
|
||||
merged: map[string]string{},
|
||||
issuesByRepo: map[string][]Issue{},
|
||||
comments: map[string]int{},
|
||||
}
|
||||
}
|
||||
|
||||
func (f *mergeIssuesFake) handler() http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Header.Get("Authorization") == "" {
|
||||
http.Error(w, "no auth", http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
p := r.URL.Path
|
||||
// POST /api/v1/repos/{owner}/{repo}/pulls/{number}/merge
|
||||
if r.Method == http.MethodPost && strings.HasSuffix(p, "/merge") {
|
||||
rest := strings.TrimPrefix(p, "/api/v1/repos/")
|
||||
rest = strings.TrimSuffix(rest, "/merge")
|
||||
parts := strings.Split(rest, "/")
|
||||
if len(parts) != 4 || parts[2] != "pulls" {
|
||||
http.Error(w, "bad path", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
var body mergePullRequestPayload
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
key := parts[0] + "/" + parts[1] + "/" + parts[3]
|
||||
f.mu.Lock()
|
||||
f.merged[key] = body.Do
|
||||
f.mu.Unlock()
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return
|
||||
}
|
||||
// GET /api/v1/repos/{owner}/{repo}/issues/{index}
|
||||
if r.Method == http.MethodGet && strings.Contains(p, "/issues/") {
|
||||
rest := strings.TrimPrefix(p, "/api/v1/repos/")
|
||||
parts := strings.Split(rest, "/")
|
||||
if len(parts) != 4 || parts[2] != "issues" {
|
||||
http.Error(w, "bad path", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
repoKey := parts[0] + "/" + parts[1]
|
||||
idx, _ := strconv.Atoi(parts[3])
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
for _, i := range f.issuesByRepo[repoKey] {
|
||||
if i.Number == int64(idx) {
|
||||
writeJSON(w, http.StatusOK, i)
|
||||
return
|
||||
}
|
||||
}
|
||||
http.Error(w, "no issue", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
// POST /api/v1/repos/{owner}/{repo}/issues/{index}/comments
|
||||
if r.Method == http.MethodPost && strings.HasSuffix(p, "/comments") {
|
||||
rest := strings.TrimPrefix(p, "/api/v1/repos/")
|
||||
rest = strings.TrimSuffix(rest, "/comments")
|
||||
parts := strings.Split(rest, "/")
|
||||
if len(parts) != 4 || parts[2] != "issues" {
|
||||
http.Error(w, "bad path", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
repoKey := parts[0] + "/" + parts[1]
|
||||
idx, _ := strconv.Atoi(parts[3])
|
||||
var body issueCommentCreate
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
f.mu.Lock()
|
||||
f.comments[repoKey+"/"+parts[3]] = f.comments[repoKey+"/"+parts[3]] + 1
|
||||
cid := int64(f.comments[repoKey+"/"+parts[3]])
|
||||
f.mu.Unlock()
|
||||
writeJSON(w, http.StatusCreated, IssueComment{
|
||||
ID: cid + int64(idx)*100,
|
||||
Body: body.Body,
|
||||
URL: "http://gitea/x",
|
||||
})
|
||||
return
|
||||
}
|
||||
// GET /api/v1/repos/{owner}/{repo}/issues?state=...&type=...
|
||||
if r.Method == http.MethodGet && strings.HasSuffix(p, "/issues") {
|
||||
rest := strings.TrimSuffix(strings.TrimPrefix(p, "/api/v1/repos/"), "/issues")
|
||||
parts := strings.Split(rest, "/")
|
||||
if len(parts) != 2 {
|
||||
http.Error(w, "bad path", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
repoKey := parts[0] + "/" + parts[1]
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
issues, ok := f.issuesByRepo[repoKey]
|
||||
if !ok {
|
||||
http.Error(w, "no repo", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
page, _ := strconv.Atoi(r.URL.Query().Get("page"))
|
||||
if page == 0 {
|
||||
page = 1
|
||||
}
|
||||
limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
|
||||
if limit == 0 {
|
||||
limit = 50
|
||||
}
|
||||
start := (page - 1) * limit
|
||||
end := start + limit
|
||||
if start > len(issues) {
|
||||
start = len(issues)
|
||||
}
|
||||
if end > len(issues) {
|
||||
end = len(issues)
|
||||
}
|
||||
writeJSON(w, http.StatusOK, issues[start:end])
|
||||
return
|
||||
}
|
||||
// POST /api/v1/repos/{owner}/{repo}/issues
|
||||
if r.Method == http.MethodPost && strings.HasSuffix(p, "/issues") {
|
||||
rest := strings.TrimSuffix(strings.TrimPrefix(p, "/api/v1/repos/"), "/issues")
|
||||
parts := strings.Split(rest, "/")
|
||||
if len(parts) != 2 {
|
||||
http.Error(w, "bad path", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
repoKey := parts[0] + "/" + parts[1]
|
||||
var body issueCreate
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
n := int64(len(f.issuesByRepo[repoKey]) + 1)
|
||||
issue := Issue{
|
||||
ID: n + 1000,
|
||||
Number: n,
|
||||
Title: body.Title,
|
||||
Body: body.Body,
|
||||
State: "open",
|
||||
URL: "http://gitea/x/" + strconv.Itoa(int(n)),
|
||||
}
|
||||
f.issuesByRepo[repoKey] = append(f.issuesByRepo[repoKey], issue)
|
||||
writeJSON(w, http.StatusCreated, issue)
|
||||
return
|
||||
}
|
||||
http.Error(w, "unhandled "+r.Method+" "+p, http.StatusNotFound)
|
||||
})
|
||||
}
|
||||
|
||||
func newMergeIssuesClient(t *testing.T, f *mergeIssuesFake) *Client {
|
||||
t.Helper()
|
||||
srv := httptest.NewServer(f.handler())
|
||||
t.Cleanup(srv.Close)
|
||||
c := New(srv.URL, "test-token")
|
||||
c.HTTP = srv.Client()
|
||||
return c
|
||||
}
|
||||
|
||||
func TestMergePullRequest_DefaultStyle(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newMergeIssuesFake()
|
||||
c := newMergeIssuesClient(t, f)
|
||||
|
||||
if err := c.MergePullRequest(context.Background(), "acme", "blueprints", 42, MergePROpts{}); err != nil {
|
||||
t.Fatalf("MergePullRequest: %v", err)
|
||||
}
|
||||
if got := f.merged["acme/blueprints/42"]; got != "merge" {
|
||||
t.Errorf("default style: got %q, want merge", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergePullRequest_ExplicitStyle(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newMergeIssuesFake()
|
||||
c := newMergeIssuesClient(t, f)
|
||||
|
||||
if err := c.MergePullRequest(context.Background(), "acme", "blueprints", 7, MergePROpts{Style: "squash"}); err != nil {
|
||||
t.Fatalf("MergePullRequest: %v", err)
|
||||
}
|
||||
if got := f.merged["acme/blueprints/7"]; got != "squash" {
|
||||
t.Errorf("got %q, want squash", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergePullRequest_InvalidStyleRejected(t *testing.T) {
|
||||
t.Parallel()
|
||||
c := New("http://x", "tok")
|
||||
err := c.MergePullRequest(context.Background(), "acme", "r", 1, MergePROpts{Style: "wat"})
|
||||
if err == nil || !strings.Contains(err.Error(), "invalid style") {
|
||||
t.Errorf("err = %v, want invalid style", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergePullRequest_RejectsEmptyArgs(t *testing.T) {
|
||||
t.Parallel()
|
||||
c := New("http://x", "tok")
|
||||
if err := c.MergePullRequest(context.Background(), "", "r", 1, MergePROpts{}); err == nil {
|
||||
t.Error("want error for empty org")
|
||||
}
|
||||
if err := c.MergePullRequest(context.Background(), "o", "r", 0, MergePROpts{}); err == nil {
|
||||
t.Error("want error for non-positive number")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateIssue_HappyPath(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newMergeIssuesFake()
|
||||
c := newMergeIssuesClient(t, f)
|
||||
|
||||
issue, err := c.CreateIssue(context.Background(), "acme", "blueprints", "hello", "world")
|
||||
if err != nil {
|
||||
t.Fatalf("CreateIssue: %v", err)
|
||||
}
|
||||
if issue.Title != "hello" || issue.Body != "world" || issue.Number != 1 {
|
||||
t.Errorf("unexpected issue: %+v", issue)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateIssue_RejectsEmptyTitle(t *testing.T) {
|
||||
t.Parallel()
|
||||
c := New("http://x", "tok")
|
||||
if _, err := c.CreateIssue(context.Background(), "o", "r", "", "body"); err == nil {
|
||||
t.Error("want error for empty title")
|
||||
}
|
||||
}
|
||||
|
||||
func TestListIssues_AfterCreate(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newMergeIssuesFake()
|
||||
c := newMergeIssuesClient(t, f)
|
||||
|
||||
for _, ti := range []string{"a", "b", "c"} {
|
||||
if _, err := c.CreateIssue(context.Background(), "acme", "blueprints", ti, ""); err != nil {
|
||||
t.Fatalf("CreateIssue %q: %v", ti, err)
|
||||
}
|
||||
}
|
||||
out, err := c.ListIssues(context.Background(), "acme", "blueprints", ListIssuesOpts{})
|
||||
if err != nil {
|
||||
t.Fatalf("ListIssues: %v", err)
|
||||
}
|
||||
if len(out) != 3 {
|
||||
t.Errorf("want 3 issues, got %d", len(out))
|
||||
}
|
||||
for i, want := range []string{"a", "b", "c"} {
|
||||
if out[i].Title != want {
|
||||
t.Errorf("issues[%d].Title = %q, want %q", i, out[i].Title, want)
|
||||
}
|
||||
if out[i].IsPullRequest() {
|
||||
t.Errorf("issues[%d] reported IsPullRequest", i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestListIssues_RepoNotFound(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newMergeIssuesFake()
|
||||
c := newMergeIssuesClient(t, f)
|
||||
_, err := c.ListIssues(context.Background(), "ghost", "missing", ListIssuesOpts{})
|
||||
if !errors.Is(err, ErrRepoNotFound) {
|
||||
t.Errorf("err = %v, want ErrRepoNotFound", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetIssue_AfterCreate(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newMergeIssuesFake()
|
||||
c := newMergeIssuesClient(t, f)
|
||||
issue, _ := c.CreateIssue(context.Background(), "acme", "blueprints", "first", "body")
|
||||
|
||||
got, err := c.GetIssue(context.Background(), "acme", "blueprints", issue.Number)
|
||||
if err != nil {
|
||||
t.Fatalf("GetIssue: %v", err)
|
||||
}
|
||||
if got.Title != "first" {
|
||||
t.Errorf("got Title=%q, want first", got.Title)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetIssue_NotFound(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newMergeIssuesFake()
|
||||
// Seed an empty issue list so the repo exists in the fake's eyes,
|
||||
// but the requested number is absent → 404.
|
||||
f.issuesByRepo["acme/blueprints"] = []Issue{}
|
||||
c := newMergeIssuesClient(t, f)
|
||||
_, err := c.GetIssue(context.Background(), "acme", "blueprints", 999)
|
||||
if !IsNotFound(err) {
|
||||
t.Errorf("err = %v, want IsNotFound==true", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCommentOnIssue_HappyPath(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newMergeIssuesFake()
|
||||
c := newMergeIssuesClient(t, f)
|
||||
cm, err := c.CommentOnIssue(context.Background(), "acme", "blueprints", 7, "looks good")
|
||||
if err != nil {
|
||||
t.Fatalf("CommentOnIssue: %v", err)
|
||||
}
|
||||
if cm.Body != "looks good" {
|
||||
t.Errorf("Body = %q, want looks good", cm.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCommentOnIssue_RejectsEmptyBody(t *testing.T) {
|
||||
t.Parallel()
|
||||
c := New("http://x", "tok")
|
||||
if _, err := c.CommentOnIssue(context.Background(), "o", "r", 1, ""); err == nil {
|
||||
t.Error("want error for empty body")
|
||||
}
|
||||
}
|
||||
@ -1,213 +0,0 @@
|
||||
// Read-side PR operations on top of the canonical pkg/gitea Client.
|
||||
//
|
||||
// client.go already carries the write-side (CreatePullRequest + the
|
||||
// `findOpenPR` race fallback) but had no public read surface for the
|
||||
// MCP server's `gitea.pr.list` + `gitea.pr.get` tools (Wave 8). The two
|
||||
// helpers here add exactly that: a paginated list with state + filter
|
||||
// passthrough, and a single-PR fetch by number. Both reuse the existing
|
||||
// `Client.do` envelope so HTTP error mapping (ErrRepoNotFound) is
|
||||
// shared with the rest of the surface.
|
||||
//
|
||||
// New endpoints (Gitea Admin REST API):
|
||||
//
|
||||
// GET /api/v1/repos/{owner}/{repo}/pulls?state=...&page=...&limit=50
|
||||
// GET /api/v1/repos/{owner}/{repo}/pulls/{number}
|
||||
//
|
||||
// Why a separate file: client.go is already 800+ LOC. Wave 8 review
|
||||
// scope is the two new methods; isolating them keeps the diff scoped
|
||||
// and the canonical surface auditable from one place.
|
||||
package gitea
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
)
|
||||
|
||||
// PullRequestState constrains the `state` query param on ListPullRequests.
// Gitea accepts "open" | "closed" | "all"; empty defaults server-side to
// "open" but we make the wire shape explicit here so callers don't drift.
type PullRequestState string

const (
	// PRStateOpen lists only open PRs (Gitea default).
	PRStateOpen PullRequestState = "open"
	// PRStateClosed lists only closed/merged PRs.
	PRStateClosed PullRequestState = "closed"
	// PRStateAll lists every PR regardless of state.
	PRStateAll PullRequestState = "all"
)
|
||||
|
||||
// ListPRsOpts threads optional filters through ListPullRequests without
|
||||
// expanding the positional signature.
|
||||
type ListPRsOpts struct {
|
||||
// State filters by open/closed/all. Empty → server default ("open").
|
||||
State PullRequestState
|
||||
// Head filters by source branch (matches Gitea's `head=org:branch`).
|
||||
// Empty → no head filter.
|
||||
Head string
|
||||
// Base filters by target branch. Empty → no base filter.
|
||||
Base string
|
||||
}
|
||||
|
||||
// ListPullRequests returns every PR on the repo matching opts, walking
|
||||
// Gitea's pagination (page=1..N, limit=50). Result order matches what
|
||||
// Gitea returns (typically newest-first by creation).
|
||||
//
|
||||
// Returns ErrRepoNotFound on a first-page 404. Subsequent pagination
|
||||
// failures bubble up as *HTTPError.
|
||||
//
|
||||
// Added Wave 8 for openova-sandbox-mcp `gitea.pr.list`.
|
||||
func (c *Client) ListPullRequests(ctx context.Context, org, repo string, opts ListPRsOpts) ([]PullRequest, error) {
|
||||
if org == "" || repo == "" {
|
||||
return nil, errors.New("gitea: ListPullRequests requires non-empty org, repo")
|
||||
}
|
||||
const pageSize = 50
|
||||
out := make([]PullRequest, 0, pageSize)
|
||||
for page := 1; ; page++ {
|
||||
q := url.Values{}
|
||||
q.Set("limit", fmt.Sprintf("%d", pageSize))
|
||||
q.Set("page", fmt.Sprintf("%d", page))
|
||||
if opts.State != "" {
|
||||
q.Set("state", string(opts.State))
|
||||
}
|
||||
if opts.Head != "" {
|
||||
// Gitea's `head` filter expects `<org>:<branch>` for same-repo
|
||||
// PRs. Accept both forms — pass through verbatim when the
|
||||
// caller already included the colon.
|
||||
head := opts.Head
|
||||
if !containsRune(head, ':') {
|
||||
head = org + ":" + head
|
||||
}
|
||||
q.Set("head", head)
|
||||
}
|
||||
if opts.Base != "" {
|
||||
q.Set("base", opts.Base)
|
||||
}
|
||||
endpoint := fmt.Sprintf("/repos/%s/%s/pulls?%s",
|
||||
url.PathEscape(org), url.PathEscape(repo), q.Encode())
|
||||
var batch []PullRequest
|
||||
status, _, err := c.do(ctx, http.MethodGet, endpoint, nil, &batch)
|
||||
if err != nil {
|
||||
if page == 1 && status == http.StatusNotFound {
|
||||
return nil, ErrRepoNotFound
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, batch...)
|
||||
if len(batch) < pageSize {
|
||||
break
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// MergePROpts carries the merge payload fields the MCP tool exposes, so
// MergePullRequest keeps a fixed positional signature.
type MergePROpts struct {
	// Style selects the merge strategy: "merge" | "rebase" |
	// "rebase-merge" | "squash". MergePullRequest substitutes "merge"
	// (the conservative, non-rewriting option) when it is empty and
	// rejects any other value.
	Style string
	// Title overrides the default merge-commit title (squash + merge
	// styles only). It is left out of the payload when empty, in which
	// case Gitea picks the PR title.
	Title string
	// Message overrides the default merge-commit body. It is left out of
	// the payload when empty, in which case Gitea picks the PR body.
	Message string
}
|
||||
|
||||
// mergePullRequestPayload is the JSON body POSTed to
// /pulls/{number}/merge. The tags emit the CamelCase field names used by
// Gitea's `MergePullRequestOption` (`Do`, `MergeTitleField`,
// `MergeMessageField`); the two optional fields are dropped from the
// encoded body when empty.
type mergePullRequestPayload struct {
	Do                string `json:"Do"`
	MergeTitleField   string `json:"MergeTitleField,omitempty"`
	MergeMessageField string `json:"MergeMessageField,omitempty"`
}
|
||||
|
||||
// MergePullRequest merges PR #number on (org, repo).
|
||||
//
|
||||
// Gitea returns 200 on a successful merge, 404 if the PR doesn't exist,
|
||||
// 405 if the PR isn't mergeable (work-in-progress, draft, conflicting),
|
||||
// 409 if the head changed between Get and Merge. We surface the typed
|
||||
// sentinel for 404 (both `repo gone` and `PR gone` end up here on
|
||||
// Gitea's wire) and a *HTTPError for 405/409 so callers can inspect
|
||||
// Status for retry/abort decisions.
|
||||
//
|
||||
// Added Wave 11 for openova-sandbox-mcp `gitea.pr.merge`.
|
||||
func (c *Client) MergePullRequest(ctx context.Context, org, repo string, number int64, opts MergePROpts) error {
|
||||
if org == "" || repo == "" {
|
||||
return errors.New("gitea: MergePullRequest requires non-empty org, repo")
|
||||
}
|
||||
if number <= 0 {
|
||||
return errors.New("gitea: MergePullRequest requires positive PR number")
|
||||
}
|
||||
style := opts.Style
|
||||
if style == "" {
|
||||
style = "merge"
|
||||
}
|
||||
switch style {
|
||||
case "merge", "rebase", "rebase-merge", "squash":
|
||||
// ok
|
||||
default:
|
||||
return fmt.Errorf("gitea: MergePullRequest: invalid style %q (want merge|rebase|rebase-merge|squash)", style)
|
||||
}
|
||||
endpoint := fmt.Sprintf("/repos/%s/%s/pulls/%d/merge",
|
||||
url.PathEscape(org), url.PathEscape(repo), number)
|
||||
payload := mergePullRequestPayload{
|
||||
Do: style,
|
||||
MergeTitleField: opts.Title,
|
||||
MergeMessageField: opts.Message,
|
||||
}
|
||||
status, _, err := c.do(ctx, http.MethodPost, endpoint, payload, nil)
|
||||
if err != nil {
|
||||
if status == http.StatusNotFound {
|
||||
return ErrRepoNotFound
|
||||
}
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetPullRequest fetches a single PR by number. Returns ErrRepoNotFound
|
||||
// on 404 with "repository" in the body (matching GetFile's heuristic),
|
||||
// otherwise a plain *HTTPError with Status==404 when the PR number itself
|
||||
// doesn't resolve. Callers can `IsNotFound(err)` to fold both cases.
|
||||
//
|
||||
// Added Wave 8 for openova-sandbox-mcp `gitea.pr.get`.
|
||||
func (c *Client) GetPullRequest(ctx context.Context, org, repo string, number int64) (PullRequest, error) {
|
||||
if org == "" || repo == "" {
|
||||
return PullRequest{}, errors.New("gitea: GetPullRequest requires non-empty org, repo")
|
||||
}
|
||||
if number <= 0 {
|
||||
return PullRequest{}, errors.New("gitea: GetPullRequest requires positive PR number")
|
||||
}
|
||||
endpoint := fmt.Sprintf("/repos/%s/%s/pulls/%d",
|
||||
url.PathEscape(org), url.PathEscape(repo), number)
|
||||
var out PullRequest
|
||||
status, _, err := c.do(ctx, http.MethodGet, endpoint, nil, &out)
|
||||
if err != nil {
|
||||
if status == http.StatusNotFound {
|
||||
return PullRequest{}, ErrRepoNotFound
|
||||
}
|
||||
return PullRequest{}, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// containsRune reports whether r occurs among the runes of s. It stands
// in for strings.ContainsRune so pulls.go does not import "strings" for a
// single call; the rest of the file uses url + fmt + http + errors only.
func containsRune(s string, r rune) bool {
	for _, candidate := range s {
		if candidate != r {
			continue
		}
		return true
	}
	return false
}
|
||||
@ -1,249 +0,0 @@
|
||||
package gitea
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// pullsFake is a tiny httptest stand-in scoped to the two new endpoints
|
||||
// added in pulls.go: paginated list + get-by-number. We don't reuse the
|
||||
// big fakeGitea handler from client_test.go because that one's GET /pulls
|
||||
// branch is filter-by-head only (it was written before list-with-state
|
||||
// existed) and overriding it would risk regressing CreatePullRequest's
|
||||
// 409 path. A scoped fake keeps the new tests independent.
|
||||
type pullsFake struct {
|
||||
// repos that exist (key = "owner/repo").
|
||||
repos map[string]bool
|
||||
// prs is keyed by "owner/repo/number".
|
||||
prs map[string]PullRequest
|
||||
}
|
||||
|
||||
func newPullsFake() *pullsFake {
|
||||
return &pullsFake{
|
||||
repos: map[string]bool{},
|
||||
prs: map[string]PullRequest{},
|
||||
}
|
||||
}
|
||||
|
||||
func (f *pullsFake) handler() http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Header.Get("Authorization") == "" {
|
||||
http.Error(w, "no auth", http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
p := r.URL.Path
|
||||
|
||||
// GET /api/v1/repos/{owner}/{repo}/pulls/{number}
|
||||
if r.Method == http.MethodGet &&
|
||||
strings.HasPrefix(p, "/api/v1/repos/") &&
|
||||
strings.Contains(p, "/pulls/") {
|
||||
rest := strings.TrimPrefix(p, "/api/v1/repos/")
|
||||
// rest = "owner/repo/pulls/123"
|
||||
parts := strings.Split(rest, "/")
|
||||
if len(parts) != 4 || parts[2] != "pulls" {
|
||||
http.Error(w, "bad path", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
repoKey := parts[0] + "/" + parts[1]
|
||||
if !f.repos[repoKey] {
|
||||
http.Error(w, "no repo", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
pr, ok := f.prs[repoKey+"/"+parts[3]]
|
||||
if !ok {
|
||||
http.Error(w, "no pr", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, pr)
|
||||
return
|
||||
}
|
||||
|
||||
// GET /api/v1/repos/{owner}/{repo}/pulls?state=...
|
||||
if r.Method == http.MethodGet &&
|
||||
strings.HasPrefix(p, "/api/v1/repos/") &&
|
||||
strings.HasSuffix(p, "/pulls") {
|
||||
rest := strings.TrimSuffix(strings.TrimPrefix(p, "/api/v1/repos/"), "/pulls")
|
||||
parts := strings.Split(rest, "/")
|
||||
if len(parts) != 2 {
|
||||
http.Error(w, "bad path", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
repoKey := parts[0] + "/" + parts[1]
|
||||
if !f.repos[repoKey] {
|
||||
http.Error(w, "no repo", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
stateWanted := r.URL.Query().Get("state")
|
||||
page, _ := strconv.Atoi(r.URL.Query().Get("page"))
|
||||
if page == 0 {
|
||||
page = 1
|
||||
}
|
||||
limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
|
||||
if limit == 0 {
|
||||
limit = 50
|
||||
}
|
||||
out := []PullRequest{}
|
||||
for k, pr := range f.prs {
|
||||
if !strings.HasPrefix(k, repoKey+"/") {
|
||||
continue
|
||||
}
|
||||
if stateWanted != "" && stateWanted != "all" && pr.State != stateWanted {
|
||||
continue
|
||||
}
|
||||
out = append(out, pr)
|
||||
}
|
||||
// Stable order by Number ascending so test assertions can index.
|
||||
for i := 1; i < len(out); i++ {
|
||||
for j := i; j > 0 && out[j-1].Number > out[j].Number; j-- {
|
||||
out[j-1], out[j] = out[j], out[j-1]
|
||||
}
|
||||
}
|
||||
// Apply pagination window.
|
||||
start := (page - 1) * limit
|
||||
end := start + limit
|
||||
if start > len(out) {
|
||||
start = len(out)
|
||||
}
|
||||
if end > len(out) {
|
||||
end = len(out)
|
||||
}
|
||||
writeJSON(w, http.StatusOK, out[start:end])
|
||||
return
|
||||
}
|
||||
|
||||
http.Error(w, "unhandled "+r.Method+" "+p, http.StatusNotFound)
|
||||
})
|
||||
}
|
||||
|
||||
func newPullsClient(t *testing.T, f *pullsFake) *Client {
|
||||
t.Helper()
|
||||
srv := httptest.NewServer(f.handler())
|
||||
t.Cleanup(srv.Close)
|
||||
c := New(srv.URL, "test-token")
|
||||
c.HTTP = srv.Client()
|
||||
return c
|
||||
}
|
||||
|
||||
func TestListPullRequests_StateFilter(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newPullsFake()
|
||||
f.repos["acme/blueprints"] = true
|
||||
for i := int64(1); i <= 3; i++ {
|
||||
pr := PullRequest{ID: i, Number: i, State: "open", Title: fmt.Sprintf("open #%d", i)}
|
||||
pr.Head.Ref = "feature/" + fmt.Sprint(i)
|
||||
pr.Base.Ref = "main"
|
||||
f.prs[fmt.Sprintf("acme/blueprints/%d", i)] = pr
|
||||
}
|
||||
for i := int64(10); i <= 11; i++ {
|
||||
pr := PullRequest{ID: i, Number: i, State: "closed", Title: fmt.Sprintf("closed #%d", i)}
|
||||
pr.Head.Ref = "old/" + fmt.Sprint(i)
|
||||
pr.Base.Ref = "main"
|
||||
f.prs[fmt.Sprintf("acme/blueprints/%d", i)] = pr
|
||||
}
|
||||
c := newPullsClient(t, f)
|
||||
|
||||
open, err := c.ListPullRequests(context.Background(), "acme", "blueprints", ListPRsOpts{State: PRStateOpen})
|
||||
if err != nil {
|
||||
t.Fatalf("ListPullRequests open: %v", err)
|
||||
}
|
||||
if len(open) != 3 {
|
||||
t.Fatalf("want 3 open PRs, got %d (%v)", len(open), open)
|
||||
}
|
||||
for _, pr := range open {
|
||||
if pr.State != "open" {
|
||||
t.Errorf("unexpected state %q on open list", pr.State)
|
||||
}
|
||||
}
|
||||
|
||||
closed, err := c.ListPullRequests(context.Background(), "acme", "blueprints", ListPRsOpts{State: PRStateClosed})
|
||||
if err != nil {
|
||||
t.Fatalf("ListPullRequests closed: %v", err)
|
||||
}
|
||||
if len(closed) != 2 {
|
||||
t.Fatalf("want 2 closed PRs, got %d", len(closed))
|
||||
}
|
||||
|
||||
all, err := c.ListPullRequests(context.Background(), "acme", "blueprints", ListPRsOpts{State: PRStateAll})
|
||||
if err != nil {
|
||||
t.Fatalf("ListPullRequests all: %v", err)
|
||||
}
|
||||
if len(all) != 5 {
|
||||
t.Fatalf("want 5 PRs (all), got %d", len(all))
|
||||
}
|
||||
}
|
||||
|
||||
func TestListPullRequests_RepoNotFound(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newPullsFake()
|
||||
c := newPullsClient(t, f)
|
||||
|
||||
_, err := c.ListPullRequests(context.Background(), "ghost", "missing", ListPRsOpts{})
|
||||
if !errors.Is(err, ErrRepoNotFound) {
|
||||
t.Errorf("err = %v, want ErrRepoNotFound", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestListPullRequests_RejectsEmptyArgs(t *testing.T) {
|
||||
t.Parallel()
|
||||
c := New("http://x", "tok")
|
||||
if _, err := c.ListPullRequests(context.Background(), "", "r", ListPRsOpts{}); err == nil {
|
||||
t.Error("want error for empty org")
|
||||
}
|
||||
if _, err := c.ListPullRequests(context.Background(), "o", "", ListPRsOpts{}); err == nil {
|
||||
t.Error("want error for empty repo")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetPullRequest_HappyPath(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newPullsFake()
|
||||
f.repos["acme/blueprints"] = true
|
||||
pr := PullRequest{ID: 42, Number: 42, State: "open", Title: "hello"}
|
||||
pr.Head.Ref = "feature/x"
|
||||
pr.Base.Ref = "main"
|
||||
f.prs["acme/blueprints/42"] = pr
|
||||
c := newPullsClient(t, f)
|
||||
|
||||
got, err := c.GetPullRequest(context.Background(), "acme", "blueprints", 42)
|
||||
if err != nil {
|
||||
t.Fatalf("GetPullRequest: %v", err)
|
||||
}
|
||||
if got.Number != 42 || got.Title != "hello" || got.Head.Ref != "feature/x" {
|
||||
t.Errorf("unexpected PR: %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetPullRequest_NotFound(t *testing.T) {
|
||||
t.Parallel()
|
||||
f := newPullsFake()
|
||||
f.repos["acme/blueprints"] = true
|
||||
c := newPullsClient(t, f)
|
||||
|
||||
_, err := c.GetPullRequest(context.Background(), "acme", "blueprints", 999)
|
||||
if !IsNotFound(err) {
|
||||
t.Errorf("err = %v, want IsNotFound==true", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetPullRequest_RejectsBadArgs(t *testing.T) {
|
||||
t.Parallel()
|
||||
c := New("http://x", "tok")
|
||||
if _, err := c.GetPullRequest(context.Background(), "", "r", 1); err == nil {
|
||||
t.Error("want error for empty org")
|
||||
}
|
||||
if _, err := c.GetPullRequest(context.Background(), "o", "r", 0); err == nil {
|
||||
t.Error("want error for non-positive number")
|
||||
}
|
||||
}
|
||||
|
||||
// Keeps the encoding/json import referenced. On its own this statement
// does not verify that PullRequest JSON-decodes into the fields pulls.go
// reads; it was added after a regression in the original Wave-8 draft
// where the alias name diverged from the canonical struct.
|
||||
var _ = json.Unmarshal
|
||||
@ -1,300 +0,0 @@
|
||||
// Package natsbus is a minimal NATS JetStream subscriber for the
|
||||
// in-cluster Group-C controllers (organization-controller,
|
||||
// sandbox-controller). It exists to close the consume-leg of DoD D35
|
||||
// (`catalyst.tenant.created` / `catalyst.billing.order.placed` /
|
||||
// `catalyst.tenant.sandbox_requested` round-trip end-to-end) when the
|
||||
// publish-leg PR #1626 wired into core/services/shared/events landed.
|
||||
//
|
||||
// The package is deliberately self-contained inside core/controllers/
|
||||
// (separate Go module from core/services/shared/events) — copying the
|
||||
// ~80 LOC of JetStream connect + durable-consumer attach is cheaper
|
||||
// than dragging the entire core/services/shared/events package
|
||||
// (BrokerPublisher / MultiSubscriber / Kafka transport) into a
|
||||
// controller binary that only needs to subscribe.
|
||||
//
|
||||
// What the controllers do with each envelope is up to the caller —
|
||||
// the package surface is intentionally narrow:
|
||||
//
|
||||
// - Connect(url) → *Subscriber, Close() teardown.
|
||||
// - Subscriber.Subscribe(ctx, subject, durable, handler, opts) starts a
|
||||
// durable JetStream consumer and dispatches every envelope to
|
||||
// handler. Handler returning nil → Ack; non-nil → Nak (5s
|
||||
// backoff so transient downstream blips don't hot-loop).
|
||||
// - Event mirrors core/services/shared/events.Event so the JSON
|
||||
// wire format is identical (id / type / source / timestamp /
|
||||
// tenant_id / data / metadata).
|
||||
//
|
||||
// The expected operational pattern is:
|
||||
//
|
||||
// r.Reconcile is the canonical path for steady-state convergence.
|
||||
// NATS subscribers in main.go observe domain events as they fire
|
||||
// and enqueue the corresponding CR for reconcile (so the controller
|
||||
// responds within ~50ms instead of waiting up to 30s for the next
|
||||
// requeue). The 30s RequeueAfter inside r.Reconcile remains the
|
||||
// fallback — NATS message loss never strands a CR; the next
|
||||
// informer-driven Reconcile picks it up.
|
||||
//
|
||||
// Connection options mirror core/cmd/projector/internal/nats:
|
||||
// MaxReconnects=-1 (retry forever — JetStream rollouts shouldn't
|
||||
// crash-loop the controller), 2s ReconnectWait, 20s PingInterval.
|
||||
//
|
||||
// Per Inviolable Principle #4 every knob is env-driven; this package
|
||||
// reads nothing from os.Getenv directly — main.go owns the env
|
||||
// surface and passes URL + StreamName + DurableName into Connect /
|
||||
// Subscribe.
|
||||
package natsbus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// StreamCatalystSME names the canonical JetStream Stream that backs every
// SME convergence event (catalyst.tenant.*, catalyst.billing.*,
// catalyst.domain.*, catalyst.provision.*). It mirrors
// core/services/shared/events.StreamCatalystSME; the two are kept in sync
// by lifting the value up to the chart / per-Sovereign overlay.
const StreamCatalystSME = "CATALYST_SME"
|
||||
|
||||
// Subjects consumed by the Group-C controllers. Each value matches the
// publish-side subject derived by
// core/services/shared/events.CanonicalSubject; the D35 test plan keeps
// the two sides in sync.
const (
	// SubjectTenantCreated is published when tenant-service finalises a
	// new tenant (publish-side PR #1626). organization-controller
	// subscribes so the matching Organization CR converges within ~50ms
	// of the marketplace checkout rather than on the 30s fallback.
	SubjectTenantCreated = "catalyst.tenant.created"

	// SubjectOrderPlaced is published when billing-service records a
	// paid order (publish-side PR #1626). organization-controller
	// subscribes to observe the round-trip and trigger reconcile of the
	// per-tenant Organization CR, so day-1 app installs in the basket
	// land on the per-Org Gitea repo without the operator polling
	// catalyst-api.
	SubjectOrderPlaced = "catalyst.billing.order.placed"

	// SubjectTenantSandboxRequested is published when the marketplace
	// cart contained the sandbox product (publish-side tenant-service in
	// PR #1633). sandbox-controller subscribes so the per-Sandbox
	// reconcile loop runs immediately on cart completion.
	SubjectTenantSandboxRequested = "catalyst.tenant.sandbox_requested"
)
|
||||
|
||||
// Event is the JSON envelope on every catalyst.* subject. The shape
|
||||
// mirrors core/services/shared/events.Event exactly — fields are
|
||||
// duplicated here (rather than imported) to keep the controllers
|
||||
// module free of a dependency on core/services/shared, which pulls
|
||||
// in franz-go + Kafka transports the controllers never touch.
|
||||
type Event struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"`
|
||||
Source string `json:"source"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
TenantID string `json:"tenant_id"`
|
||||
Data json.RawMessage `json:"data"`
|
||||
Metadata map[string]string `json:"metadata,omitempty"`
|
||||
}
|
||||
|
||||
// Subscriber holds an open NATS+JetStream connection. Construct via
|
||||
// Connect; close via Close. Subscribe may be called from multiple
|
||||
// goroutines — each call attaches an independent durable consumer.
|
||||
type Subscriber struct {
|
||||
nc *nats.Conn
|
||||
js jetstream.JetStream
|
||||
}
|
||||
|
||||
// Connect opens a NATS connection at url and binds a JetStream client
|
||||
// on top. Empty url falls back to nats.DefaultURL so unit tests can
|
||||
// exercise the package against a local nats-server without env wiring.
|
||||
//
|
||||
// Returns an error if the broker is unreachable; the caller (main.go)
|
||||
// is expected to either bail out (NATS is canonical on Sovereigns) or
|
||||
// log and continue (Catalyst-Zero / contabo, where REDPANDA_BROKERS
|
||||
// is the authoritative bus).
|
||||
func Connect(url string) (*Subscriber, error) {
|
||||
if url == "" {
|
||||
url = nats.DefaultURL
|
||||
}
|
||||
nc, err := nats.Connect(url,
|
||||
nats.Name("catalyst-controllers"),
|
||||
nats.MaxReconnects(-1),
|
||||
nats.ReconnectWait(2*time.Second),
|
||||
nats.PingInterval(20*time.Second),
|
||||
nats.MaxPingsOutstanding(3),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("natsbus: connect %s: %w", url, err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
return nil, fmt.Errorf("natsbus: jetstream init: %w", err)
|
||||
}
|
||||
return &Subscriber{nc: nc, js: js}, nil
|
||||
}
|
||||
|
||||
// Handler is the per-message callback. Returning a nil error Acks the
|
||||
// message (JetStream advances the consumer cursor); returning a
|
||||
// non-nil error Naks with a 5s backoff so transient handler failures
|
||||
// redeliver instead of stranding the envelope.
|
||||
//
|
||||
// Implementations SHOULD be idempotent — JetStream guarantees
|
||||
// at-least-once delivery, so the same Event.ID may arrive twice on a
|
||||
// broker rebalance.
|
||||
type Handler func(ctx context.Context, ev *Event) error
|
||||
|
||||
// SubscribeOptions tunes Subscribe behaviour. Zero values yield sane
|
||||
// production defaults (Stream=CATALYST_SME, AckWait=30s, no MaxDeliver
|
||||
// cap so a permanently-failing handler does NOT silently drop events
|
||||
// — operator-visible nak loops are the right failure mode).
|
||||
type SubscribeOptions struct {
|
||||
// Stream is the JetStream Stream the FilterSubject lives on.
|
||||
// Defaults to StreamCatalystSME.
|
||||
Stream string
|
||||
// AckWait bounds how long JetStream waits for Ack before redeliver.
|
||||
// Defaults to 30 seconds.
|
||||
AckWait time.Duration
|
||||
// NakBackoff is the backoff inserted before redelivery when the
|
||||
// Handler returns a non-nil error. Defaults to 5 seconds.
|
||||
NakBackoff time.Duration
|
||||
}
|
||||
|
||||
// Subscribe attaches a durable consumer to subject on options.Stream
|
||||
// (default StreamCatalystSME) under the supplied durable name, and
|
||||
// dispatches every envelope to handler.
|
||||
//
|
||||
// Subscribe is non-blocking: it returns once the consumer has been
|
||||
// created AND the underlying Consume loop has been started. The
|
||||
// loop runs until ctx is cancelled. To stop, cancel ctx — the
|
||||
// underlying JetStream ConsumeContext is then stopped automatically.
|
||||
//
|
||||
// Durable names are stable across pod restarts so JetStream resumes
|
||||
// from the committed sequence after a controller-manager rollout.
|
||||
// MaxDeliver=-1 (retry forever) matches the
|
||||
// core/services/shared/events.MultiSubscriber convention: operator-
|
||||
// visible nak loops are the right failure mode for unrecoverable
|
||||
// handler errors, not silent drops.
|
||||
func (s *Subscriber) Subscribe(
|
||||
ctx context.Context,
|
||||
subject, durable string,
|
||||
handler Handler,
|
||||
opts SubscribeOptions,
|
||||
) error {
|
||||
if s == nil || s.js == nil {
|
||||
return errors.New("natsbus: subscriber not initialised")
|
||||
}
|
||||
if subject == "" {
|
||||
return errors.New("natsbus: Subscribe requires subject")
|
||||
}
|
||||
if durable == "" {
|
||||
return errors.New("natsbus: Subscribe requires durable name")
|
||||
}
|
||||
if handler == nil {
|
||||
return errors.New("natsbus: Subscribe requires handler")
|
||||
}
|
||||
stream := opts.Stream
|
||||
if stream == "" {
|
||||
stream = StreamCatalystSME
|
||||
}
|
||||
ackWait := opts.AckWait
|
||||
if ackWait <= 0 {
|
||||
ackWait = 30 * time.Second
|
||||
}
|
||||
nakBackoff := opts.NakBackoff
|
||||
if nakBackoff <= 0 {
|
||||
nakBackoff = 5 * time.Second
|
||||
}
|
||||
|
||||
cons, err := s.js.CreateOrUpdateConsumer(ctx, stream, jetstream.ConsumerConfig{
|
||||
Durable: durable,
|
||||
Description: fmt.Sprintf("controllers natsbus %s on %s", durable, subject),
|
||||
AckPolicy: jetstream.AckExplicitPolicy,
|
||||
AckWait: ackWait,
|
||||
FilterSubject: subject,
|
||||
DeliverPolicy: jetstream.DeliverAllPolicy,
|
||||
MaxDeliver: -1,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("natsbus: create consumer %s on %s/%s: %w", durable, stream, subject, err)
|
||||
}
|
||||
|
||||
cc, err := cons.Consume(func(msg jetstream.Msg) {
|
||||
dispatchOne(ctx, msg, handler, subject, durable, nakBackoff)
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("natsbus: start consume %s: %w", durable, err)
|
||||
}
|
||||
|
||||
slog.Info("natsbus: subscribed",
|
||||
"subject", subject, "durable", durable, "stream", stream)
|
||||
|
||||
// Stop the JetStream consume context when ctx is cancelled. We do
|
||||
// this in a goroutine so Subscribe returns immediately — the caller
|
||||
// (main.go) wires the same ctx to manager.Start so SIGTERM unwinds
|
||||
// every Subscribe at once.
|
||||
go func() {
|
||||
<-ctx.Done()
|
||||
cc.Stop()
|
||||
slog.Info("natsbus: stopped", "subject", subject, "durable", durable)
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
// dispatchOne parses one JetStream message into an Event, invokes the
|
||||
// handler with a per-message timeout, and Acks / Naks per the result.
|
||||
// Malformed JSON is Ack'd-skipped (with an error log) so a poison-pill
|
||||
// envelope cannot hot-loop the consumer.
|
||||
func dispatchOne(
|
||||
parent context.Context,
|
||||
msg jetstream.Msg,
|
||||
handler Handler,
|
||||
subject, durable string,
|
||||
nakBackoff time.Duration,
|
||||
) {
|
||||
var ev Event
|
||||
if err := json.Unmarshal(msg.Data(), &ev); err != nil {
|
||||
slog.Error("natsbus: malformed envelope — ack to skip",
|
||||
"subject", subject, "durable", durable,
|
||||
"err", err, "body_size", len(msg.Data()))
|
||||
_ = msg.Ack()
|
||||
return
|
||||
}
|
||||
|
||||
// Per-message timeout — 25s leaves 5s of slack inside the 30s
|
||||
// default AckWait so the broker does not redeliver while the handler
|
||||
// is still running.
|
||||
hctx, cancel := context.WithTimeout(parent, 25*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := handler(hctx, &ev); err != nil {
|
||||
slog.Warn("natsbus: handler error — nak for retry",
|
||||
"subject", subject, "durable", durable,
|
||||
"event_id", ev.ID, "event_type", ev.Type, "err", err)
|
||||
if nakErr := msg.NakWithDelay(nakBackoff); nakErr != nil {
|
||||
slog.Error("natsbus: nak failed",
|
||||
"subject", subject, "durable", durable, "err", nakErr)
|
||||
}
|
||||
return
|
||||
}
|
||||
if ackErr := msg.Ack(); ackErr != nil {
|
||||
slog.Error("natsbus: ack failed",
|
||||
"subject", subject, "durable", durable,
|
||||
"event_id", ev.ID, "err", ackErr)
|
||||
}
|
||||
}
|
||||
|
||||
// Close drains the underlying NATS connection. Idempotent.
|
||||
func (s *Subscriber) Close() {
|
||||
if s == nil || s.nc == nil {
|
||||
return
|
||||
}
|
||||
_ = s.nc.Drain()
|
||||
}
|
||||
@ -1,223 +0,0 @@
|
||||
// Unit tests for natsbus. Verifies the per-message dispatch contract
|
||||
// (Ack on handler success, Nak on handler error, Ack-to-skip on
|
||||
// malformed JSON) without spinning up an embedded NATS server. Live-
|
||||
// broker integration is covered by the D35 fresh-prov verifier.
|
||||
package natsbus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// fakeMsg implements just enough of jetstream.Msg for dispatchOne to
|
||||
// drive Ack / Nak / NakWithDelay outcomes. Method set mirrors the
|
||||
// interface so a typecheck against jetstream.Msg keeps the fake honest.
|
||||
type fakeMsg struct {
|
||||
data []byte
|
||||
|
||||
mu sync.Mutex
|
||||
ackCount int
|
||||
nakCount int
|
||||
termCount int
|
||||
nakDelay time.Duration
|
||||
inProgress int
|
||||
headers map[string][]string
|
||||
subject string
|
||||
reply string
|
||||
}
|
||||
|
||||
var _ jetstream.Msg = (*fakeMsg)(nil)
|
||||
|
||||
func (f *fakeMsg) Data() []byte { return f.data }
|
||||
func (f *fakeMsg) Headers() nats.Header {
|
||||
out := nats.Header{}
|
||||
for k, vs := range f.headers {
|
||||
out[k] = vs
|
||||
}
|
||||
return out
|
||||
}
|
||||
func (f *fakeMsg) Subject() string { return f.subject }
|
||||
func (f *fakeMsg) Reply() string { return f.reply }
|
||||
func (f *fakeMsg) Ack() error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.ackCount++
|
||||
return nil
|
||||
}
|
||||
func (f *fakeMsg) DoubleAck(context.Context) error {
|
||||
return f.Ack()
|
||||
}
|
||||
func (f *fakeMsg) Nak() error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.nakCount++
|
||||
return nil
|
||||
}
|
||||
func (f *fakeMsg) NakWithDelay(d time.Duration) error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.nakCount++
|
||||
f.nakDelay = d
|
||||
return nil
|
||||
}
|
||||
func (f *fakeMsg) InProgress() error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.inProgress++
|
||||
return nil
|
||||
}
|
||||
func (f *fakeMsg) Term() error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.termCount++
|
||||
return nil
|
||||
}
|
||||
func (f *fakeMsg) TermWithReason(string) error { return f.Term() }
|
||||
func (f *fakeMsg) Metadata() (*jetstream.MsgMetadata, error) {
|
||||
return &jetstream.MsgMetadata{}, nil
|
||||
}
|
||||
|
||||
// TestDispatchOne_HandlerSuccess pins: a handler returning nil Acks
|
||||
// the message exactly once and never Naks. Bodyguard for the D35
|
||||
// happy path — every successful round-trip moves the consumer cursor.
|
||||
func TestDispatchOne_HandlerSuccess(t *testing.T) {
|
||||
payload := Event{
|
||||
ID: "evt-1",
|
||||
Type: "tenant.created",
|
||||
Source: "tenant-service",
|
||||
Timestamp: time.Now().UTC(),
|
||||
TenantID: "tnt-abc",
|
||||
Data: json.RawMessage(`{"id":"tnt-abc","slug":"acme"}`),
|
||||
}
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal payload: %v", err)
|
||||
}
|
||||
msg := &fakeMsg{data: body, subject: SubjectTenantCreated}
|
||||
|
||||
var seen *Event
|
||||
handler := Handler(func(_ context.Context, ev *Event) error {
|
||||
seen = ev
|
||||
return nil
|
||||
})
|
||||
|
||||
dispatchOne(context.Background(), msg, handler, SubjectTenantCreated, "test-d", 5*time.Second)
|
||||
|
||||
if msg.ackCount != 1 {
|
||||
t.Errorf("want Ack count 1, got %d", msg.ackCount)
|
||||
}
|
||||
if msg.nakCount != 0 {
|
||||
t.Errorf("want Nak count 0, got %d", msg.nakCount)
|
||||
}
|
||||
if seen == nil || seen.ID != "evt-1" || seen.Type != "tenant.created" {
|
||||
t.Errorf("handler did not receive the parsed envelope: %+v", seen)
|
||||
}
|
||||
if seen.TenantID != "tnt-abc" {
|
||||
t.Errorf("envelope tenant_id mismatch: got %q want %q", seen.TenantID, "tnt-abc")
|
||||
}
|
||||
}
|
||||
|
||||
// TestDispatchOne_HandlerError pins: a handler returning a non-nil
|
||||
// error Naks (with the configured backoff) and does NOT Ack. Bodyguard
|
||||
// for transient downstream failures — JetStream must redeliver.
|
||||
func TestDispatchOne_HandlerError(t *testing.T) {
|
||||
body, _ := json.Marshal(Event{ID: "evt-err", Type: "order.placed"})
|
||||
msg := &fakeMsg{data: body, subject: SubjectOrderPlaced}
|
||||
|
||||
handler := Handler(func(context.Context, *Event) error {
|
||||
return errors.New("downstream API unreachable")
|
||||
})
|
||||
|
||||
const backoff = 7 * time.Second
|
||||
dispatchOne(context.Background(), msg, handler, SubjectOrderPlaced, "test-d", backoff)
|
||||
|
||||
if msg.ackCount != 0 {
|
||||
t.Errorf("want Ack count 0 on handler error, got %d", msg.ackCount)
|
||||
}
|
||||
if msg.nakCount != 1 {
|
||||
t.Errorf("want Nak count 1 on handler error, got %d", msg.nakCount)
|
||||
}
|
||||
if msg.nakDelay != backoff {
|
||||
t.Errorf("want Nak delay %v, got %v", backoff, msg.nakDelay)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDispatchOne_MalformedJSON pins: a payload that fails json.Unmarshal
|
||||
// is Ack'd-skipped (the consumer cursor advances) so a poison pill cannot
|
||||
// hot-loop the subscriber. Caught by the operator log line, not the
|
||||
// transport.
|
||||
func TestDispatchOne_MalformedJSON(t *testing.T) {
|
||||
msg := &fakeMsg{data: []byte("not-json{"), subject: SubjectTenantSandboxRequested}
|
||||
|
||||
called := false
|
||||
handler := Handler(func(context.Context, *Event) error {
|
||||
called = true
|
||||
return nil
|
||||
})
|
||||
|
||||
dispatchOne(context.Background(), msg, handler, SubjectTenantSandboxRequested, "test-d", 5*time.Second)
|
||||
|
||||
if called {
|
||||
t.Error("handler should NOT be invoked on malformed JSON")
|
||||
}
|
||||
if msg.ackCount != 1 {
|
||||
t.Errorf("want Ack count 1 to skip poison pill, got %d", msg.ackCount)
|
||||
}
|
||||
if msg.nakCount != 0 {
|
||||
t.Errorf("want Nak count 0 on poison pill (Term/Ack only), got %d", msg.nakCount)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDispatchOne_SandboxRequestedPayload pins the exact wire format
|
||||
// the publish-side (PR #1633 tenant.handlers.CreateOrg) emits — the
|
||||
// downstream sandbox-controller handler reads tenant_id, org_slug,
|
||||
// owner_id, owner_email, agent_catalogue out of the Data blob. If
|
||||
// the publish-side renames a field this test goes red so the consumer
|
||||
// stays in lockstep.
|
||||
func TestDispatchOne_SandboxRequestedPayload(t *testing.T) {
|
||||
payload := Event{
|
||||
ID: "evt-sb",
|
||||
Type: "tenant.sandbox_requested",
|
||||
Source: "tenant-service",
|
||||
Timestamp: time.Now().UTC(),
|
||||
TenantID: "tnt-sb",
|
||||
Data: json.RawMessage(`{
|
||||
"tenant_id":"tnt-sb",
|
||||
"org_slug":"acme",
|
||||
"owner_id":"u-1",
|
||||
"owner_email":"ceo@acme.com",
|
||||
"agent_catalogue":["qwen","claude"]
|
||||
}`),
|
||||
}
|
||||
body, _ := json.Marshal(payload)
|
||||
msg := &fakeMsg{data: body, subject: SubjectTenantSandboxRequested}
|
||||
|
||||
var got struct {
|
||||
TenantID string `json:"tenant_id"`
|
||||
OrgSlug string `json:"org_slug"`
|
||||
OwnerEmail string `json:"owner_email"`
|
||||
AgentCatalogue []string `json:"agent_catalogue"`
|
||||
}
|
||||
handler := Handler(func(_ context.Context, ev *Event) error {
|
||||
return json.Unmarshal(ev.Data, &got)
|
||||
})
|
||||
|
||||
dispatchOne(context.Background(), msg, handler, SubjectTenantSandboxRequested, "test-d", 5*time.Second)
|
||||
|
||||
if msg.ackCount != 1 {
|
||||
t.Fatalf("expected Ack=1 on success, got Ack=%d Nak=%d", msg.ackCount, msg.nakCount)
|
||||
}
|
||||
if got.OrgSlug != "acme" || got.OwnerEmail != "ceo@acme.com" {
|
||||
t.Errorf("payload fields not surfaced: %+v", got)
|
||||
}
|
||||
if len(got.AgentCatalogue) != 2 || got.AgentCatalogue[0] != "qwen" {
|
||||
t.Errorf("agent_catalogue not surfaced: %+v", got.AgentCatalogue)
|
||||
}
|
||||
}
|
||||
@ -1,53 +0,0 @@
|
||||
# sandbox-controller — Wave 1 of the Sandbox product.
|
||||
#
|
||||
# A Catalyst-built Go binary that reconciles Sandbox.sandbox.openova.io/v1
|
||||
# CRs into per-Sandbox namespace + RBAC + PVCs + placeholder Secret
|
||||
# manifests written to the per-Org `catalyst-tenant` Gitea repo. Flux on
|
||||
# the host cluster picks up the manifests and reconciles them into the
|
||||
# Org vcluster (sister of organization-controller — same patterns).
|
||||
#
|
||||
# Build context: invoked with the repository ROOT as the build context.
|
||||
# Mirrors core/controllers/organization/Containerfile (slice CC1 layout:
|
||||
# shared go.mod at core/controllers/, shared pkg at core/controllers/pkg).
|
||||
#
|
||||
# Two stages:
|
||||
# build — golang:1.23-alpine
|
||||
# final — alpine:3.20 minimal runtime (CA certs + the binary)
|
||||
|
||||
FROM docker.io/library/golang:1.23-alpine AS build
|
||||
WORKDIR /workspace
|
||||
|
||||
# Stage 1: cache module downloads — go.mod/go.sum at the shared root.
|
||||
COPY core/controllers/go.mod core/controllers/go.sum core/controllers/
|
||||
WORKDIR /workspace/core/controllers
|
||||
RUN go mod download
|
||||
|
||||
# Stage 2: copy source + build. Same layout the organization-controller
|
||||
# Containerfile uses (Fix #42 follow-up — shared internal + pkg dirs
|
||||
# MUST be copied before the per-controller dir, else `go build` fails
|
||||
# resolving the github.com/openova-io/openova/core/controllers/pkg/gitea
|
||||
# import).
|
||||
WORKDIR /workspace
|
||||
COPY core/controllers/internal /workspace/core/controllers/internal
|
||||
COPY core/controllers/pkg /workspace/core/controllers/pkg
|
||||
COPY core/controllers/sandbox /workspace/core/controllers/sandbox
|
||||
|
||||
WORKDIR /workspace/core/controllers/sandbox
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build \
|
||||
-ldflags="-s -w" \
|
||||
-o /sandbox-controller ./cmd/sandbox-controller
|
||||
|
||||
# Stage 3: minimal runtime.
|
||||
FROM docker.io/library/alpine:3.20
|
||||
RUN apk add --no-cache ca-certificates tzdata
|
||||
|
||||
COPY --from=build /sandbox-controller /sandbox-controller
|
||||
|
||||
# Alpine 3.20 already ships UID 65534 as `nobody`. The numeric form
|
||||
# satisfies runAsNonRoot=true + runAsUser=65534 in the chart's
|
||||
# Deployment.
|
||||
USER 65534:65534
|
||||
|
||||
EXPOSE 8080 8081
|
||||
|
||||
ENTRYPOINT ["/sandbox-controller"]
|
||||
@ -1,294 +0,0 @@
|
||||
// sandbox-controller — Wave 1 + Wave 8 + Wave 9 of the Sandbox product
|
||||
// (products/sandbox/docs/architecture.md §7).
|
||||
//
|
||||
// Production entry point. Reads configuration from environment vars,
|
||||
// constructs the controller-runtime manager, and starts the Sandbox
|
||||
// reconciler with leader election.
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
|
||||
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/healthz"
|
||||
"sigs.k8s.io/controller-runtime/pkg/log/zap"
|
||||
"sigs.k8s.io/controller-runtime/pkg/manager"
|
||||
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
|
||||
|
||||
"github.com/openova-io/openova/core/controllers/pkg/gitea"
|
||||
"github.com/openova-io/openova/core/controllers/pkg/natsbus"
|
||||
"github.com/openova-io/openova/core/controllers/sandbox/internal/controller"
|
||||
"github.com/openova-io/openova/core/controllers/sandbox/internal/idlescaler"
|
||||
"github.com/openova-io/openova/core/controllers/sandbox/internal/newapi"
|
||||
sandboxapi "github.com/openova-io/openova/core/controllers/sandbox/internal/sandboxapi"
|
||||
)
|
||||
|
||||
// scheme is the runtime.Scheme handed to the controller-runtime
// manager; init registers the client-go and sandboxapi types on it.
var scheme = runtime.NewScheme()
|
||||
|
||||
func init() {
|
||||
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
|
||||
utilruntime.Must(sandboxapi.AddToScheme(scheme))
|
||||
}
|
||||
|
||||
func main() {
|
||||
var (
|
||||
metricsAddr string
|
||||
probeAddr string
|
||||
enableLeaderElection bool
|
||||
)
|
||||
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
|
||||
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
|
||||
flag.BoolVar(&enableLeaderElection, "leader-elect", true,
|
||||
"Enable leader election for controller manager. Defaults to true so HA replicas don't double-write.")
|
||||
|
||||
opts := zap.Options{Development: false}
|
||||
opts.BindFlags(flag.CommandLine)
|
||||
flag.Parse()
|
||||
|
||||
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
|
||||
log := ctrl.Log.WithName("sandbox-controller")
|
||||
|
||||
giteaURL := mustEnv("CATALYST_GITEA_URL", log)
|
||||
giteaToken := mustEnv("CATALYST_GITEA_TOKEN", log)
|
||||
hostCluster := mustEnv("CATALYST_HOST_CLUSTER", log)
|
||||
sovereignFQDN := mustEnv("CATALYST_SOVEREIGN_FQDN", log)
|
||||
|
||||
branch := envOr("CATALYST_GITEA_BRANCH", "main")
|
||||
tenantRepo := envOr("CATALYST_TENANT_REPO_NAME", "catalyst-tenant")
|
||||
|
||||
// Wave 8 runtime env — per-Sandbox pty-server / MCP / NEWAPI for
|
||||
// the rendered Pod manifests.
|
||||
ptyServerImage := mustEnv("SANDBOX_PTY_SERVER_IMAGE", log)
|
||||
mcpImage := mustEnv("SANDBOX_MCP_IMAGE", log)
|
||||
sandboxNewapiURL := mustEnv("SANDBOX_NEWAPI_URL", log)
|
||||
llmGatewayTokenSecret := envOr("SANDBOX_LLM_GATEWAY_TOKEN_SECRET", "sandbox-tokens")
|
||||
byosSecretPrefix := envOr("SANDBOX_BYOS_SECRET_PREFIX", "sandbox-byos-claude-code")
|
||||
idleTimeoutMinutes := envOrInt("SANDBOX_IDLE_TIMEOUT_MINUTES", 30)
|
||||
|
||||
// Wave 9 — NewAPI bridge wiring. Two env vars carry the bridge URL +
|
||||
// admin bearer used by the controller to call POST
|
||||
// /admin/tokens/sandbox (catalyst-api bridge handler, PR #1638).
|
||||
// Both are REQUIRED in production — a sandbox-controller without
|
||||
// the bridge wired silently ships Sandboxes without an LLM
|
||||
// connection. Permit unset for compatibility with smoke tests
|
||||
// that exercise only the gitops path (env both unset ⇒ controller
|
||||
// runs without the token-mint path; log line announces it).
|
||||
newapiBaseURL := strings.TrimSpace(os.Getenv("NEWAPI_BASE_URL"))
|
||||
newapiAdmin := strings.TrimSpace(os.Getenv("NEWAPI_ADMIN_SECRET"))
|
||||
defaultChannels := splitAndTrim(envOr("NEWAPI_DEFAULT_CHANNELS", ""), ",")
|
||||
|
||||
// D31 active-hot-standby — Sovereign-level toggle + region pair the
|
||||
// controller threads into every per-Sandbox MCP Pod. The MCP
|
||||
// server's sandbox.db.provision handler reads these at call time
|
||||
// and, when valid, materialises a primary + replica Cluster.
|
||||
// postgresql.cnpg.io pair instead of a single Cluster (DoD D31).
|
||||
// Default-empty keeps every existing Sandbox on single-Cluster
|
||||
// CNPG (zero regression). Bootstrap-kit slot 61 wires these from
|
||||
// the per-Sovereign overlay's envsubst placeholders into the
|
||||
// bp-sandbox HelmRelease values.
|
||||
enableHotStandby := envOr("SOVEREIGN_ENABLE_HOT_STANDBY", "")
|
||||
primaryRegion := envOr("SOVEREIGN_PRIMARY_REGION", "")
|
||||
replicaRegion := envOr("SOVEREIGN_REPLICA_REGION", "")
|
||||
|
||||
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
|
||||
Scheme: scheme,
|
||||
Metrics: metricsserver.Options{BindAddress: metricsAddr},
|
||||
HealthProbeBindAddress: probeAddr,
|
||||
LeaderElection: enableLeaderElection,
|
||||
LeaderElectionID: "sandbox-controller.sandbox.openova.io",
|
||||
})
|
||||
if err != nil {
|
||||
log.Error(err, "manager init")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
|
||||
log.Error(err, "healthz")
|
||||
os.Exit(1)
|
||||
}
|
||||
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
|
||||
log.Error(err, "readyz")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
var newapiClient newapi.Client
|
||||
if newapiBaseURL != "" && newapiAdmin != "" {
|
||||
c, err := newapi.New(newapiBaseURL, newapiAdmin, nil)
|
||||
if err != nil {
|
||||
log.Error(err, "newapi client init")
|
||||
os.Exit(1)
|
||||
}
|
||||
newapiClient = c
|
||||
} else {
|
||||
log.Info("newapi bridge not wired — sandbox-controller running in gitops-only mode",
|
||||
"newapi_base_url_set", newapiBaseURL != "",
|
||||
"newapi_admin_secret_set", newapiAdmin != "",
|
||||
)
|
||||
}
|
||||
|
||||
r := &controller.Reconciler{
|
||||
Client: mgr.GetClient(),
|
||||
Log: log.WithName("reconciler"),
|
||||
GiteaClient: gitea.New(giteaURL, giteaToken),
|
||||
HostCluster: hostCluster,
|
||||
SovereignFQDN: sovereignFQDN,
|
||||
Branch: branch,
|
||||
TenantRepoName: tenantRepo,
|
||||
PtyServerImage: ptyServerImage,
|
||||
MCPImage: mcpImage,
|
||||
NewapiURL: sandboxNewapiURL,
|
||||
LLMGatewayTokenSecret: llmGatewayTokenSecret,
|
||||
BYOSSecretPrefix: byosSecretPrefix,
|
||||
IdleTimeoutMinutes: idleTimeoutMinutes,
|
||||
NewAPIClient: newapiClient,
|
||||
DefaultChannels: defaultChannels,
|
||||
EnableHotStandby: enableHotStandby,
|
||||
PrimaryRegion: primaryRegion,
|
||||
ReplicaRegion: replicaRegion,
|
||||
}
|
||||
if err := r.SetupWithManager(mgr); err != nil {
|
||||
log.Error(err, "setup reconciler")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Wave 10 (PR #1641 follow-up) — IdleScaler reads the
|
||||
// `openova.io/sandbox-idle-timeout-minutes` annotation the
|
||||
// renderer writes on every pty-server StatefulSet, polls each
|
||||
// pty-server Service for live activity, and scales replicas to 0
|
||||
// once the idle window has elapsed. Leader-elected so HA
|
||||
// controller replicas don't race.
|
||||
scaler := idlescaler.New(mgr.GetClient(),
|
||||
log.WithName("idle-scaler"),
|
||||
idlescaler.Options{
|
||||
DefaultIdleTimeoutMinutes: idleTimeoutMinutes,
|
||||
})
|
||||
if err := mgr.Add(scaler); err != nil {
|
||||
log.Error(err, "add idle-scaler to manager")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// D35 consume-leg — subscribe to `catalyst.tenant.sandbox_requested`
|
||||
// so the publish from tenant-service nudges the matching Sandbox CR
|
||||
// into a fresh Reconcile within ~50ms. Same wiring shape as the
|
||||
// organization-controller's NATS bridge. Best-effort: NATS_URL
|
||||
// unset → log + continue (informer requeue fallback intact).
|
||||
natsURL := strings.TrimSpace(os.Getenv("NATS_URL"))
|
||||
sandboxNs := envOr("SANDBOX_NAMESPACE", "catalyst-system")
|
||||
if natsURL != "" {
|
||||
if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
|
||||
sub, err := natsbus.Connect(natsURL)
|
||||
if err != nil {
|
||||
log.Error(err, "natsbus: connect failed — D35 consume-leg disabled",
|
||||
"nats_url", natsURL)
|
||||
return nil
|
||||
}
|
||||
bridge := &controller.NATSBridge{
|
||||
Client: mgr.GetClient(),
|
||||
Log: log.WithName("natsbridge"),
|
||||
Namespace: sandboxNs,
|
||||
}
|
||||
if err := sub.Subscribe(ctx,
|
||||
natsbus.SubjectTenantSandboxRequested,
|
||||
"sandbox-controller-sandbox-requested",
|
||||
bridge.HandleSandboxRequested,
|
||||
natsbus.SubscribeOptions{},
|
||||
); err != nil {
|
||||
log.Error(err, "natsbus: subscribe tenant.sandbox_requested failed")
|
||||
}
|
||||
<-ctx.Done()
|
||||
sub.Close()
|
||||
return nil
|
||||
})); err != nil {
|
||||
log.Error(err, "natsbus: add runnable failed")
|
||||
os.Exit(1)
|
||||
}
|
||||
log.Info("natsbus: D35 consume-leg wired",
|
||||
"nats_url", natsURL,
|
||||
"subjects", []string{natsbus.SubjectTenantSandboxRequested},
|
||||
"sandbox_namespace", sandboxNs,
|
||||
)
|
||||
} else {
|
||||
log.Info("natsbus: NATS_URL unset — D35 consume-leg disabled (informer-requeue fallback only)")
|
||||
}
|
||||
|
||||
log.Info("starting manager",
|
||||
"host_cluster", hostCluster,
|
||||
"sovereign_fqdn", sovereignFQDN,
|
||||
"gitea_url", giteaURL,
|
||||
"tenant_repo", tenantRepo,
|
||||
"pty_server_image", ptyServerImage,
|
||||
"mcp_image", mcpImage,
|
||||
"newapi_url", sandboxNewapiURL,
|
||||
"llm_gateway_token_secret", llmGatewayTokenSecret,
|
||||
"byos_secret_prefix", byosSecretPrefix,
|
||||
"idle_timeout_minutes", idleTimeoutMinutes,
|
||||
"newapi_wired", newapiClient != nil,
|
||||
"default_channels", defaultChannels,
|
||||
)
|
||||
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
|
||||
log.Error(err, "manager start")
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// mustEnv returns the whitespace-trimmed value of the environment
// variable key. When the trimmed value is empty it reports the key
// through log.Error and terminates the process with exit status 2.
func mustEnv(key string, log interface {
	Error(err error, msg string, kvs ...any)
},
) string {
	if val := strings.TrimSpace(os.Getenv(key)); val != "" {
		return val
	}
	log.Error(fmt.Errorf("missing env"), "required env var unset", "key", key)
	os.Exit(2)
	return "" // unreachable; os.Exit does not return
}
|
||||
|
||||
// envOr returns the whitespace-trimmed value of the environment
// variable key, or fallback when that trimmed value is empty.
func envOr(key, fallback string) string {
	if val := strings.TrimSpace(os.Getenv(key)); val != "" {
		return val
	}
	return fallback
}
|
||||
|
||||
// envOrInt reads the environment variable key as a positive integer.
// The fallback is returned when the trimmed value is empty, is not a
// base-10 integer, or parses to zero or a negative number — a
// mistyped value never crashes the controller. Used for
// SANDBOX_IDLE_TIMEOUT_MINUTES.
func envOrInt(key string, fallback int) int {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return fallback
	}
	parsed, err := strconv.Atoi(raw)
	switch {
	case err != nil:
		return fallback
	case parsed <= 0:
		return fallback
	}
	return parsed
}
|
||||
|
||||
// splitAndTrim cuts s at every sep, trims surrounding whitespace from
// each piece and drops the pieces that end up empty, e.g.
// "qwen,vllm , " → ["qwen","vllm"]. A blank s yields nil; a non-blank
// s made only of separators yields an empty, non-nil slice. Either
// way len()==0 tells the caller there is nothing to use.
func splitAndTrim(s, sep string) []string {
	if strings.TrimSpace(s) == "" {
		return nil
	}
	fields := strings.Split(s, sep)
	kept := make([]string, 0, len(fields))
	for i := range fields {
		if piece := strings.TrimSpace(fields[i]); piece != "" {
			kept = append(kept, piece)
		}
	}
	return kept
}
|
||||
@ -1,223 +0,0 @@
|
||||
// nats_bridge wires the canonical Catalyst NATS subject
|
||||
// `catalyst.tenant.sandbox_requested` (D35 consume leg, sandbox-controller side)
|
||||
// into the sandbox-controller's reconcile loop.
|
||||
//
|
||||
// Why this lives in sandbox-controller, not just in tenant-service:
|
||||
// the tenant-service SandboxOrchestrator (PR #1633) already consumes
|
||||
// `catalyst.tenant.sandbox_requested` and creates the Sandbox CR. The
|
||||
// missing leg was an in-cluster controller that, after the CR
|
||||
// materialised, OBSERVES the same envelope on its broker side and
|
||||
// triggers a fresh Reconcile within ~50ms instead of waiting for the
|
||||
// 30s informer requeue. That tightens the cart-completion → CR-Ready
|
||||
// loop end-to-end and closes D35: NATS round-trips end-to-end with
|
||||
// the controllers as the consume-side leg.
|
||||
//
|
||||
// The bridge looks up the matching Sandbox CR by the same name
|
||||
// derivation tenant-service uses (sanitised owner email/UID inside the
|
||||
// sandbox namespace) and stamps two annotations:
|
||||
//
|
||||
// - openova.io/last-event-observed-at: RFC3339Nano timestamp from the
|
||||
// broker envelope. Stable across duplicate JetStream delivery so
|
||||
// the annotation patch is byte-equal on replay.
|
||||
// - openova.io/last-event-subject: the canonical subject string.
|
||||
//
|
||||
// Patching either annotation triggers an informer event →
|
||||
// controller-runtime enqueues the CR's NamespacedName → Reconcile
|
||||
// runs within ~50ms.
|
||||
//
|
||||
// The 30s RequeueAfter in r.Reconcile remains untouched — this bridge
|
||||
// is an accelerator, not the only path. NATS message loss never
|
||||
// strands a CR.
|
||||
//
|
||||
// Per HARD CONSTRAINT: no credential write-paths. The bridge reads
|
||||
// only the Event envelope + the matching CR; it never touches Secrets
|
||||
// or NewAPI bearer tokens.
|
||||
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/openova-io/openova/core/controllers/pkg/natsbus"
|
||||
sandboxapi "github.com/openova-io/openova/core/controllers/sandbox/internal/sandboxapi"
|
||||
)
|
||||
|
||||
// Annotation keys stamped on the matching Sandbox CR when a canonical
|
||||
// NATS envelope is observed. Identical to the organization-controller
|
||||
// keys for operator-visible symmetry across the two Group-C
|
||||
// controllers — `kubectl get sandboxes,organizations -o jsonpath`
|
||||
// surfaces the same field across both kinds.
|
||||
const (
|
||||
AnnotationLastNATSObservedAt = "openova.io/last-event-observed-at"
|
||||
AnnotationLastNATSSubject = "openova.io/last-event-subject"
|
||||
)
|
||||
|
||||
// DefaultSandboxNamespace is the namespace tenant-service writes
|
||||
// Sandbox CRs into when SANDBOX_NAMESPACE is unset. Mirrors the
|
||||
// publish-side default in core/services/tenant/handlers/sandbox_consumer.go.
|
||||
const DefaultSandboxNamespace = "catalyst-system"
|
||||
|
||||
// NATSBridge is the consume-leg adapter for the sandbox-controller.
|
||||
type NATSBridge struct {
|
||||
Client client.Client
|
||||
Log logr.Logger
|
||||
|
||||
// Namespace is the sandbox-namespace tenant-service writes Sandbox
|
||||
// CRs into. Defaults to DefaultSandboxNamespace when empty.
|
||||
Namespace string
|
||||
}
|
||||
|
||||
// HandleSandboxRequested reacts to a `catalyst.tenant.sandbox_requested`
|
||||
// envelope. The publish-side (tenant.handlers.CreateOrg in PR #1633)
|
||||
// stamps owner_email + owner_id + org_slug + agents on the Data
|
||||
// payload. We derive the deterministic Sandbox CR name using the same
|
||||
// rules tenant-service applies (sanitised email leaf, "sandbox-"
|
||||
// prefix, RFC1123-bounded) and patch the observation annotations.
|
||||
func (b *NATSBridge) HandleSandboxRequested(ctx context.Context, ev *natsbus.Event) error {
|
||||
if ev == nil {
|
||||
return nil
|
||||
}
|
||||
var payload struct {
|
||||
TenantID string `json:"tenant_id"`
|
||||
OrgSlug string `json:"org_slug"`
|
||||
OwnerID string `json:"owner_id"`
|
||||
OwnerEmail string `json:"owner_email"`
|
||||
}
|
||||
if err := json.Unmarshal(ev.Data, &payload); err != nil {
|
||||
b.Log.Error(err, "sandbox_requested: malformed Data payload — ack to skip",
|
||||
"event_id", ev.ID)
|
||||
return nil
|
||||
}
|
||||
|
||||
name := sandboxCRNameFromEvent(payload.OwnerEmail, payload.OwnerID)
|
||||
if name == "" {
|
||||
b.Log.Error(fmt.Errorf("empty derived name"),
|
||||
"sandbox_requested: payload has neither owner_email nor owner_id — ack to skip",
|
||||
"event_id", ev.ID, "tenant_id", payload.TenantID)
|
||||
return nil
|
||||
}
|
||||
|
||||
ns := b.Namespace
|
||||
if ns == "" {
|
||||
ns = DefaultSandboxNamespace
|
||||
}
|
||||
|
||||
var sb sandboxapi.Sandbox
|
||||
if err := b.Client.Get(ctx, types.NamespacedName{Namespace: ns, Name: name}, &sb); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
// Cold-start ordering: the broker delivered our copy of
|
||||
// the envelope before the tenant-service orchestrator
|
||||
// finished writing the CR. Soft-miss; tenant-service's
|
||||
// Sandbox CR Create will fire an informer event of its
|
||||
// own when it lands, so we don't need to retry.
|
||||
b.Log.Info("nats observation: no matching Sandbox CR — ack and skip",
|
||||
"subject", natsbus.SubjectTenantSandboxRequested,
|
||||
"namespace", ns, "name", name, "event_id", ev.ID)
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("get sandbox %s/%s: %w", ns, name, err)
|
||||
}
|
||||
|
||||
observedAt := ev.Timestamp.UTC().Format(time.RFC3339Nano)
|
||||
if observedAt == "" || ev.Timestamp.IsZero() {
|
||||
observedAt = time.Now().UTC().Format(time.RFC3339Nano)
|
||||
}
|
||||
|
||||
// Byte-stable patch on duplicate JetStream delivery: skip when
|
||||
// the annotations already match.
|
||||
cur := sb.GetAnnotations()
|
||||
if cur != nil &&
|
||||
cur[AnnotationLastNATSObservedAt] == observedAt &&
|
||||
cur[AnnotationLastNATSSubject] == natsbus.SubjectTenantSandboxRequested {
|
||||
b.Log.V(1).Info("nats observation: duplicate envelope — skip patch",
|
||||
"subject", natsbus.SubjectTenantSandboxRequested,
|
||||
"namespace", ns, "name", name, "event_id", ev.ID)
|
||||
return nil
|
||||
}
|
||||
|
||||
desired := &sandboxapi.Sandbox{}
|
||||
sb.DeepCopyInto(desired)
|
||||
anns := desired.GetAnnotations()
|
||||
if anns == nil {
|
||||
anns = map[string]string{}
|
||||
}
|
||||
anns[AnnotationLastNATSObservedAt] = observedAt
|
||||
anns[AnnotationLastNATSSubject] = natsbus.SubjectTenantSandboxRequested
|
||||
desired.SetAnnotations(anns)
|
||||
|
||||
if err := b.Client.Patch(ctx, desired, client.MergeFrom(&sb)); err != nil {
|
||||
return fmt.Errorf("patch sandbox %s/%s: %w", ns, name, err)
|
||||
}
|
||||
b.Log.Info("nats observation stamped — reconcile enqueued",
|
||||
"subject", natsbus.SubjectTenantSandboxRequested,
|
||||
"namespace", ns, "name", name,
|
||||
"event_id", ev.ID, "observed_at", observedAt)
|
||||
return nil
|
||||
}
|
||||
|
||||
// sandboxCRNameFromEvent mirrors core/services/tenant/handlers/sandbox_consumer.go
|
||||
// `sandboxCRName(email, ownerID)`. The two functions MUST stay in
|
||||
// sync — tenant-service writes the CR under this name, and
|
||||
// sandbox-controller's NATSBridge looks it up by the same name.
|
||||
//
|
||||
// Rules (verbatim from the publish-side):
|
||||
//
|
||||
// 1. Prefer the email; fall back to ownerID when email is empty.
|
||||
// 2. Sanitise to a DNS-1123 leaf via sanitizeSandboxLeaf.
|
||||
// 3. Empty post-sanitise → literal "user" so the consumer never
|
||||
// returns an empty-name lookup.
|
||||
// 4. Final name = "sandbox-" + leaf, truncated to 63 chars and
|
||||
// trailing-hyphen-stripped.
|
||||
func sandboxCRNameFromEvent(email, ownerID string) string {
|
||||
candidate := strings.TrimSpace(email)
|
||||
if candidate == "" {
|
||||
candidate = strings.TrimSpace(ownerID)
|
||||
}
|
||||
leaf := sanitizeSandboxLeaf(candidate)
|
||||
if leaf == "" {
|
||||
leaf = "user"
|
||||
}
|
||||
name := "sandbox-" + leaf
|
||||
if len(name) > 63 {
|
||||
name = name[:63]
|
||||
}
|
||||
name = strings.TrimRight(name, "-")
|
||||
return name
|
||||
}
|
||||
|
||||
// sanitizeSandboxLeaf mirrors `sanitizeSandboxLeaf` in
// core/services/tenant/handlers/sandbox_consumer.go. It lowercases the
// input, spells "@" as "-at-" and "+" as "-plus-", turns every other
// rune outside [a-z0-9-] (including "." and "_") into "-", collapses
// runs of hyphens into one, and trims leading/trailing hyphens.
//
// Example: "CEO@Acme.com" → "ceo-at-acme-com".
func sanitizeSandboxLeaf(in string) string {
	lowered := strings.ToLower(in)

	var sb strings.Builder
	sb.Grow(len(lowered))

	// prevDash tracks whether the last byte written was a hyphen, so
	// consecutive hyphens are never emitted in the first place.
	prevDash := false
	dash := func() {
		if !prevDash {
			sb.WriteByte('-')
			prevDash = true
		}
	}
	word := func(w string) {
		sb.WriteString(w)
		prevDash = false
	}

	for _, r := range lowered {
		switch {
		case r == '@':
			dash()
			word("at")
			dash()
		case r == '+':
			dash()
			word("plus")
			dash()
		case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9'):
			sb.WriteRune(r)
			prevDash = false
		default:
			// '-', '.', '_' and every other rune become one hyphen.
			dash()
		}
	}
	return strings.Trim(sb.String(), "-")
}
|
||||
@ -1,193 +0,0 @@
|
||||
// Unit tests for the sandbox-controller NATS consume-leg bridge (D35).
|
||||
//
|
||||
// Mirrors organization/internal/controller/nats_bridge_test.go for
|
||||
// `catalyst.tenant.sandbox_requested`. The bridge surface is the same
|
||||
// signature the live JetStream subscriber drives, so these tests
|
||||
// exercise the same code path the runtime uses.
|
||||
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-logr/logr/testr"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client/fake"
|
||||
|
||||
"github.com/openova-io/openova/core/controllers/pkg/natsbus"
|
||||
sandboxapi "github.com/openova-io/openova/core/controllers/sandbox/internal/sandboxapi"
|
||||
)
|
||||
|
||||
func newSandboxBridgeFixture(t *testing.T, ns string, objs ...runtime.Object) *NATSBridge {
|
||||
t.Helper()
|
||||
scheme := runtime.NewScheme()
|
||||
if err := clientgoscheme.AddToScheme(scheme); err != nil {
|
||||
t.Fatalf("clientgo addtoscheme: %v", err)
|
||||
}
|
||||
if err := sandboxapi.AddToScheme(scheme); err != nil {
|
||||
t.Fatalf("sandboxapi addtoscheme: %v", err)
|
||||
}
|
||||
cb := fake.NewClientBuilder().WithScheme(scheme)
|
||||
if len(objs) > 0 {
|
||||
cb = cb.WithRuntimeObjects(objs...)
|
||||
}
|
||||
return &NATSBridge{
|
||||
Client: cb.Build(),
|
||||
Log: testr.New(t),
|
||||
Namespace: ns,
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_SandboxRequested_HappyPath pins the D35 sandbox round-trip:
|
||||
// an envelope with owner_email matching a real Sandbox CR results in
|
||||
// both observation annotations being patched.
|
||||
func TestNATSBridge_SandboxRequested_HappyPath(t *testing.T) {
|
||||
const ns = "catalyst-system"
|
||||
// tenant-service derives sandbox name as "sandbox-" + sanitised
|
||||
// email → ceo@acme.com → "sandbox-ceo-at-acme-com".
|
||||
sb := &sandboxapi.Sandbox{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Namespace: ns,
|
||||
Name: "sandbox-ceo-at-acme-com",
|
||||
},
|
||||
}
|
||||
bridge := newSandboxBridgeFixture(t, ns, sb)
|
||||
|
||||
ts := time.Date(2026, 5, 18, 15, 0, 0, 123456789, time.UTC)
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"tenant_id": "tnt-9",
|
||||
"org_slug": "acme",
|
||||
"owner_id": "u-1",
|
||||
"owner_email": "ceo@acme.com",
|
||||
})
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-sb-1",
|
||||
Type: "tenant.sandbox_requested",
|
||||
Source: "tenant-service",
|
||||
Timestamp: ts,
|
||||
TenantID: "tnt-9",
|
||||
Data: body,
|
||||
}
|
||||
if err := bridge.HandleSandboxRequested(context.Background(), ev); err != nil {
|
||||
t.Fatalf("HandleSandboxRequested: %v", err)
|
||||
}
|
||||
|
||||
var got sandboxapi.Sandbox
|
||||
if err := bridge.Client.Get(context.Background(),
|
||||
types.NamespacedName{Namespace: ns, Name: "sandbox-ceo-at-acme-com"}, &got); err != nil {
|
||||
t.Fatalf("get sandbox: %v", err)
|
||||
}
|
||||
anns := got.GetAnnotations()
|
||||
if anns[AnnotationLastNATSSubject] != natsbus.SubjectTenantSandboxRequested {
|
||||
t.Errorf("subject annotation: got %q want %q",
|
||||
anns[AnnotationLastNATSSubject], natsbus.SubjectTenantSandboxRequested)
|
||||
}
|
||||
wantObservedAt := ts.Format(time.RFC3339Nano)
|
||||
if anns[AnnotationLastNATSObservedAt] != wantObservedAt {
|
||||
t.Errorf("observed-at annotation: got %q want %q",
|
||||
anns[AnnotationLastNATSObservedAt], wantObservedAt)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_SandboxRequested_OwnerIDFallback pins: when owner_email
|
||||
// is absent, the bridge falls back to owner_id for the CR name
|
||||
// derivation. Mirrors tenant-service's sandboxCRName fallback rule
|
||||
// (PR #1633) — both sides must stay in lockstep.
|
||||
func TestNATSBridge_SandboxRequested_OwnerIDFallback(t *testing.T) {
|
||||
const ns = "catalyst-system"
|
||||
sb := &sandboxapi.Sandbox{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Namespace: ns,
|
||||
Name: "sandbox-u-1",
|
||||
},
|
||||
}
|
||||
bridge := newSandboxBridgeFixture(t, ns, sb)
|
||||
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"tenant_id": "tnt-no-email",
|
||||
"owner_id": "u-1",
|
||||
})
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-sb-no-email",
|
||||
Type: "tenant.sandbox_requested",
|
||||
Timestamp: time.Now().UTC(),
|
||||
Data: body,
|
||||
}
|
||||
if err := bridge.HandleSandboxRequested(context.Background(), ev); err != nil {
|
||||
t.Fatalf("HandleSandboxRequested: %v", err)
|
||||
}
|
||||
|
||||
var got sandboxapi.Sandbox
|
||||
if err := bridge.Client.Get(context.Background(),
|
||||
types.NamespacedName{Namespace: ns, Name: "sandbox-u-1"}, &got); err != nil {
|
||||
t.Fatalf("get sandbox: %v", err)
|
||||
}
|
||||
if _, ok := got.GetAnnotations()[AnnotationLastNATSSubject]; !ok {
|
||||
t.Error("owner_id fallback failed — CR not annotated")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_SandboxRequested_NoMatchingCR pins: cold-start ordering
|
||||
// (broker delivered before tenant-service finished creating the CR)
|
||||
// is a soft miss — return nil so the dispatcher Acks, do not Nak.
|
||||
func TestNATSBridge_SandboxRequested_NoMatchingCR(t *testing.T) {
|
||||
bridge := newSandboxBridgeFixture(t, "catalyst-system")
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"owner_email": "ghost@nowhere.io",
|
||||
})
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-miss-sb",
|
||||
Type: "tenant.sandbox_requested",
|
||||
Timestamp: time.Now().UTC(),
|
||||
Data: body,
|
||||
}
|
||||
if err := bridge.HandleSandboxRequested(context.Background(), ev); err != nil {
|
||||
t.Fatalf("HandleSandboxRequested on missing CR returned error (should soft-miss): %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATSBridge_SandboxRequested_MalformedData pins poison-pill
|
||||
// behaviour: malformed JSON returns nil so the dispatcher Acks-to-skip.
|
||||
func TestNATSBridge_SandboxRequested_MalformedData(t *testing.T) {
|
||||
bridge := newSandboxBridgeFixture(t, "catalyst-system")
|
||||
ev := &natsbus.Event{
|
||||
ID: "evt-bad-sb",
|
||||
Type: "tenant.sandbox_requested",
|
||||
Timestamp: time.Now().UTC(),
|
||||
Data: []byte("not-json{"),
|
||||
}
|
||||
if err := bridge.HandleSandboxRequested(context.Background(), ev); err != nil {
|
||||
t.Errorf("malformed Data should not Nak, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSandboxCRName_MatchesTenantServiceConvention pins the name
|
||||
// derivation rules verbatim against the publish-side convention. If
|
||||
// tenant-service changes its naming rule, this test goes red so the
|
||||
// bridge stays in lockstep.
|
||||
func TestSandboxCRName_MatchesTenantServiceConvention(t *testing.T) {
|
||||
cases := []struct {
|
||||
email, ownerID, want string
|
||||
}{
|
||||
{"ceo@acme.com", "u-1", "sandbox-ceo-at-acme-com"},
|
||||
{"", "u-99", "sandbox-u-99"},
|
||||
{"Mixed.Case+User@Globex.io", "", "sandbox-mixed-case-plus-user-at-globex-io"},
|
||||
{"", "", "sandbox-user"},
|
||||
{"a@b", "", "sandbox-a-at-b"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.email+"|"+c.ownerID, func(t *testing.T) {
|
||||
got := sandboxCRNameFromEvent(c.email, c.ownerID)
|
||||
if got != c.want {
|
||||
t.Errorf("sandboxCRNameFromEvent(%q,%q) = %q, want %q",
|
||||
c.email, c.ownerID, got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -1,437 +0,0 @@
|
||||
// Package controller hosts the Sandbox reconciler — the Wave 1 + Wave 8
|
||||
// slice of the Sandbox product (#1615 brief + products/sandbox/docs/
|
||||
// architecture.md §7).
|
||||
//
|
||||
// Per architecture.md §7 the sandbox-controller is the sister of
|
||||
// organization-controller. It reconciles a Sandbox CR into manifests
|
||||
// the per-Org Flux Kustomization (host cluster) materializes inside
|
||||
// the Org vcluster. Wave 8 adds the pty-server StatefulSet + MCP
|
||||
// Deployment + Service + HTTPRoute (in addition to the Wave-1
|
||||
// namespace + RBAC + PVCs + placeholder Secret).
|
||||
//
|
||||
// Idempotency: every "ensure" step is find-or-create + byte-equal
|
||||
// short-circuit. Re-reconciling on a steady-state CR writes nothing
|
||||
// downstream.
|
||||
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
|
||||
"github.com/openova-io/openova/core/controllers/pkg/gitea"
|
||||
"github.com/openova-io/openova/core/controllers/sandbox/internal/gitops"
|
||||
"github.com/openova-io/openova/core/controllers/sandbox/internal/newapi"
|
||||
sandboxapi "github.com/openova-io/openova/core/controllers/sandbox/internal/sandboxapi"
|
||||
)
|
||||
|
||||
// Annotation keys stamped on the Sandbox CR to track the lifecycle of its
// per-Sandbox NewAPI token. Only timestamps are stored here; the token
// value itself is never written to the CR — the Secret rendered into the
// per-Org Gitea repo carries the bytes.
const (
	// annotationTokenExpiresAt holds the expiry instant of the current token.
	annotationTokenExpiresAt = "openova.io/sandbox-token-expires-at"
	// annotationTokenRotatedAt holds the instant the token was last minted.
	annotationTokenRotatedAt = "openova.io/sandbox-token-rotated-at"
)
|
||||
|
||||
// DefaultTokenRotationLeadTime is the window before expiry in which the
// reconciler re-mints the per-Sandbox NewAPI token.
//
// The bridge handler currently issues 7-day tokens (SandboxTokenTTL in
// platform/newapi/internal/handler/sandbox_token.go). With a one-day lead
// the rendered Secret stays byte-stable between reconciles for the six
// days in which the token is still fresh.
//
// The Wave 9 brief asks for "15 days before expiry"; that target only
// makes sense once the bridge TTL is raised to 30+ days. Until then 24h is
// the default, and per-Sovereign overlays can override it through
// Reconciler.TokenRotationLeadTime.
const DefaultTokenRotationLeadTime = time.Hour * 24
|
||||
|
||||
// Reconciler reconciles Sandbox CRs.
|
||||
type Reconciler struct {
|
||||
client.Client
|
||||
Log logr.Logger
|
||||
|
||||
GiteaClient *gitea.Client
|
||||
HostCluster string
|
||||
SovereignFQDN string
|
||||
Branch string
|
||||
TenantRepoName string
|
||||
|
||||
// Wave 8 per-Sandbox runtime knobs (plumbed from chart env).
|
||||
PtyServerImage string
|
||||
MCPImage string
|
||||
NewapiURL string
|
||||
LLMGatewayTokenSecret string
|
||||
BYOSSecretPrefix string
|
||||
IdleTimeoutMinutes int
|
||||
|
||||
// D31 active-hot-standby — Sovereign-level toggle + region pair the
|
||||
// controller threads from its chart env (SOVEREIGN_ENABLE_HOT_STANDBY,
|
||||
// SOVEREIGN_PRIMARY_REGION, SOVEREIGN_REPLICA_REGION) into every
|
||||
// per-Sandbox MCP Pod via gitops.Inputs. The MCP server's
|
||||
// sandbox.db.provision handler reads them at call time and renders a
|
||||
// primary + replica Cluster.postgresql.cnpg.io pair when valid.
|
||||
// Default-empty keeps every existing Sandbox on single-Cluster CNPG
|
||||
// (zero regression). Bootstrap-kit slot 61 wires the per-Sovereign
|
||||
// overlay's envsubst placeholders into the bp-sandbox HelmRelease
|
||||
// values; the chart surfaces them as the controller's env.
|
||||
EnableHotStandby string
|
||||
PrimaryRegion string
|
||||
ReplicaRegion string
|
||||
|
||||
// Wave 9 — NewAPI bridge client used by Reconcile to mint
|
||||
// per-Sandbox LLM-gateway tokens (POST /admin/tokens/sandbox,
|
||||
// PR #1638). When nil the reconciler renders the Wave 1+8
|
||||
// manifests but skips the token-mint path — the controller is
|
||||
// operable on a Sovereign whose bridge handler is not yet rolled
|
||||
// out (e.g. fresh prov mid-handover) without silently shipping a
|
||||
// Sandbox without an LLM connection. main.go logs a warning in
|
||||
// that case.
|
||||
NewAPIClient newapi.Client
|
||||
|
||||
// DefaultChannels is the operator-configured list of NewAPI
|
||||
// channel names every freshly-minted Sandbox token is allowed to
|
||||
// call. Currently a single channel per Sovereign ("qwen" today,
|
||||
// see products/sandbox/docs/newapi-proxy-contract.md §2); future
|
||||
// per-tier work will allow per-Sandbox overrides via spec.
|
||||
DefaultChannels []string
|
||||
|
||||
// TokenRotationLeadTime overrides DefaultTokenRotationLeadTime. The
|
||||
// controller re-mints when the previously-issued token's expiry is
|
||||
// within this window of now. Zero ⇒ DefaultTokenRotationLeadTime.
|
||||
TokenRotationLeadTime time.Duration
|
||||
|
||||
// Now is the wall-clock source. Defaults to time.Now when nil;
|
||||
// injected by tests for deterministic rotation behaviour.
|
||||
Now func() time.Time
|
||||
}
|
||||
|
||||
func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
|
||||
return ctrl.NewControllerManagedBy(mgr).
|
||||
For(&sandboxapi.Sandbox{}).
|
||||
Complete(r)
|
||||
}
|
||||
|
||||
func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
|
||||
log := r.Log.WithValues("sandbox", req.NamespacedName.String())
|
||||
log.Info("reconcile")
|
||||
|
||||
var sb sandboxapi.Sandbox
|
||||
if err := r.Get(ctx, req.NamespacedName, &sb); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
return ctrl.Result{}, fmt.Errorf("get sandbox: %w", err)
|
||||
}
|
||||
|
||||
if strings.TrimSpace(sb.Spec.Owner.OrgRef.Slug) == "" {
|
||||
return r.fail(ctx, &sb, "OwnerOrgRefMissing",
|
||||
"spec.owner.orgRef.slug must be non-empty (the parent Organization slug)")
|
||||
}
|
||||
if strings.TrimSpace(sb.Spec.Owner.Email) == "" {
|
||||
return r.fail(ctx, &sb, "OwnerEmailMissing",
|
||||
"spec.owner.email must be non-empty")
|
||||
}
|
||||
|
||||
ownerUID := sanitizeEmail(sb.Spec.Owner.Email)
|
||||
if ownerUID == "" {
|
||||
return r.fail(ctx, &sb, "OwnerEmailInvalid",
|
||||
fmt.Sprintf("spec.owner.email %q did not yield a DNS-safe owner UID", sb.Spec.Owner.Email))
|
||||
}
|
||||
|
||||
// ── Per-Sandbox NewAPI bearer ──────────────────────────────────────
|
||||
// When wired (r.NewAPIClient non-nil), the controller drives the
|
||||
// full token lifecycle:
|
||||
//
|
||||
// - No prior token (annotation absent) → mint fresh.
|
||||
// - Token within tokenRotationLeadTime of expiry → re-mint, bump
|
||||
// the `kubectl.kubernetes.io/restartedAt` annotation on the
|
||||
// rendered Secret so Wave 8's pty-server StatefulSet picks up
|
||||
// a rolling restart.
|
||||
// - Steady state (token healthy) → leave the previously-rendered
|
||||
// Secret manifest in Gitea untouched (PutFile's byte-equal
|
||||
// guard short-circuits).
|
||||
//
|
||||
// When the bridge call fails the reconciler records a Failed
|
||||
// condition (TokenMintFailed) and requeues 30s — namespace/RBAC/PVC
|
||||
// manifests are NOT rendered until the bridge is reachable, so a
|
||||
// Sandbox without an LLM gateway never lands in steady state.
|
||||
now := time.Now
|
||||
if r.Now != nil {
|
||||
now = r.Now
|
||||
}
|
||||
leadTime := r.TokenRotationLeadTime
|
||||
if leadTime <= 0 {
|
||||
leadTime = DefaultTokenRotationLeadTime
|
||||
}
|
||||
|
||||
var (
|
||||
tokenValue string
|
||||
tokenExpiresAt string
|
||||
tokenRotatedAt string
|
||||
)
|
||||
if r.NewAPIClient != nil {
|
||||
nowT := now()
|
||||
mustMint, prevExpiry := r.shouldMintToken(&sb, nowT, leadTime)
|
||||
if mustMint {
|
||||
channels := r.channelsForSandbox(&sb)
|
||||
if len(channels) == 0 {
|
||||
return r.fail(ctx, &sb, "NoAllowedChannels",
|
||||
"sandbox-controller has no DefaultChannels configured AND spec exposes none — refusing to mint a token with empty allowed_channels")
|
||||
}
|
||||
sandboxID := string(sb.UID)
|
||||
if strings.TrimSpace(sandboxID) == "" {
|
||||
// Fresh CR without a UID stamped (only happens in
|
||||
// pathological hand-rolled fixtures). Fall back to the
|
||||
// stable namespace/name pair.
|
||||
sandboxID = fmt.Sprintf("%s/%s", sb.Namespace, sb.Name)
|
||||
}
|
||||
// Tier-bound MCP capabilities (PR #1671) — derived from
|
||||
// spec.capabilities (operator override) or spec.planId via
|
||||
// sandboxapi.ResolveCapabilities. Empty list is permitted by
|
||||
// the bridge handler and produces an introspection-only
|
||||
// token; the controller never short-circuits on a missing
|
||||
// capability list because the operator can grant on-demand
|
||||
// by patching spec.capabilities.
|
||||
caps := sandboxapi.ResolveCapabilities(&sb.Spec)
|
||||
mint, mintErr := r.NewAPIClient.MintSandboxToken(ctx, newapi.MintRequest{
|
||||
OrgID: sb.Spec.Owner.OrgRef.Slug,
|
||||
UserID: sb.Spec.Owner.Email,
|
||||
SandboxID: sandboxID,
|
||||
AllowedChannels: channels,
|
||||
Capabilities: caps,
|
||||
})
|
||||
if mintErr != nil {
|
||||
r.Log.Error(mintErr, "newapi mint failed",
|
||||
"sandbox", sb.Namespace+"/"+sb.Name,
|
||||
"prev_expiry", prevExpiry.Format(time.RFC3339))
|
||||
return r.fail(ctx, &sb, "TokenMintFailed", mintErr.Error())
|
||||
}
|
||||
tokenValue = mint.Token
|
||||
tokenExpiresAt = mint.ExpiresAt.UTC().Format(time.RFC3339)
|
||||
tokenRotatedAt = nowT.UTC().Format(time.RFC3339)
|
||||
|
||||
// Persist the rotation marker on the CR BEFORE the Gitea
|
||||
// write so a crash between this point and the PutFile pass
|
||||
// surfaces on the next reconcile as "prev_expiry already
|
||||
// past, re-mint" rather than "token rendered but CR has no
|
||||
// expiry annotation, mint again". Both paths converge but
|
||||
// stamping first keeps the operator-visible state honest.
|
||||
if err := r.stampTokenAnnotations(ctx, &sb, tokenExpiresAt, tokenRotatedAt); err != nil {
|
||||
return ctrl.Result{}, fmt.Errorf("stamp annotations: %w", err)
|
||||
}
|
||||
} else {
|
||||
tokenExpiresAt = prevExpiry.UTC().Format(time.RFC3339)
|
||||
// tokenRotatedAt left empty — renderer drops the
|
||||
// kubectl.kubernetes.io/restartedAt annotation only when
|
||||
// non-empty, so steady-state reconciles never bump it.
|
||||
}
|
||||
}
|
||||
|
||||
in := gitops.Inputs{
|
||||
Name: sb.Name,
|
||||
OwnerUID: ownerUID,
|
||||
OwnerEmail: sb.Spec.Owner.Email,
|
||||
OrgSlug: sb.Spec.Owner.OrgRef.Slug,
|
||||
SovereignFQDN: r.SovereignFQDN,
|
||||
Quota: sb.Spec.Quota,
|
||||
Repos: sb.Spec.Repos,
|
||||
PreviewDomain: sb.Spec.PreviewDomain,
|
||||
AgentCatalogue: sb.Spec.AgentCatalogue,
|
||||
PtyServerImage: r.PtyServerImage,
|
||||
MCPImage: r.MCPImage,
|
||||
NewapiURL: r.NewapiURL,
|
||||
LLMGatewayTokenSecret: r.LLMGatewayTokenSecret,
|
||||
BYOSSecretPrefix: r.BYOSSecretPrefix,
|
||||
IdleTimeoutMinutes: r.IdleTimeoutMinutes,
|
||||
IdleScalingDisabled: sb.Spec.IdleScaling != nil && !sb.Spec.IdleScaling.Enabled,
|
||||
NewAPIToken: tokenValue,
|
||||
NewAPITokenSecretName: fmt.Sprintf("sandbox-%s-newapi-token", ownerUID),
|
||||
NewAPITokenExpiresAt: tokenExpiresAt,
|
||||
NewAPITokenRotatedAt: tokenRotatedAt,
|
||||
EnableHotStandby: r.EnableHotStandby,
|
||||
PrimaryRegion: r.PrimaryRegion,
|
||||
ReplicaRegion: r.ReplicaRegion,
|
||||
}
|
||||
manifests, err := gitops.Render(in)
|
||||
if err != nil {
|
||||
return r.fail(ctx, &sb, "ManifestRenderFailed", err.Error())
|
||||
}
|
||||
|
||||
branch := r.Branch
|
||||
if branch == "" {
|
||||
branch = "main"
|
||||
}
|
||||
repo := r.TenantRepoName
|
||||
if repo == "" {
|
||||
repo = "catalyst-tenant"
|
||||
}
|
||||
|
||||
prefix := fmt.Sprintf("sandbox/%s", ownerUID)
|
||||
for path, data := range manifests {
|
||||
fullPath := fmt.Sprintf("%s/%s", prefix, path)
|
||||
if _, _, err := r.GiteaClient.PutFile(ctx,
|
||||
sb.Spec.Owner.OrgRef.Slug, repo, branch, fullPath, data,
|
||||
fmt.Sprintf("sandbox-controller: reconcile %s for sandbox %s/%s",
|
||||
fullPath, sb.Namespace, sb.Name)); err != nil {
|
||||
return r.fail(ctx, &sb, "GitopsWriteFailed",
|
||||
fmt.Sprintf("write %s: %s", fullPath, err))
|
||||
}
|
||||
}
|
||||
|
||||
desired := sandboxapi.SandboxStatus{
|
||||
Phase: "Provisioning",
|
||||
GitopsPath: prefix,
|
||||
Conditions: []sandboxapi.SandboxCondition{
|
||||
{
|
||||
Type: "Ready",
|
||||
Status: "True",
|
||||
Reason: "GitopsReconciled",
|
||||
Message: fmt.Sprintf("Wave 1+8 manifests reconciled to gitea %s/%s@%s:%s", sb.Spec.Owner.OrgRef.Slug, repo, branch, prefix),
|
||||
LastTransitionTime: metav1.NewTime(time.Now()),
|
||||
},
|
||||
},
|
||||
ObservedGeneration: sb.Generation,
|
||||
}
|
||||
if err := r.patchStatus(ctx, &sb, desired); err != nil {
|
||||
return ctrl.Result{}, fmt.Errorf("patch status: %w", err)
|
||||
}
|
||||
|
||||
log.Info("reconcile ok",
|
||||
"org", sb.Spec.Owner.OrgRef.Slug,
|
||||
"owner_uid", ownerUID,
|
||||
"gitops_path", prefix,
|
||||
"files", len(manifests),
|
||||
)
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
func (r *Reconciler) fail(ctx context.Context, sb *sandboxapi.Sandbox, reason, message string) (ctrl.Result, error) {
|
||||
r.Log.Error(errors.New(reason), message,
|
||||
"sandbox", sb.Namespace+"/"+sb.Name,
|
||||
"owner", sb.Spec.Owner.Email)
|
||||
st := sandboxapi.SandboxStatus{
|
||||
Phase: "Failed",
|
||||
Conditions: []sandboxapi.SandboxCondition{
|
||||
{
|
||||
Type: "Ready",
|
||||
Status: "False",
|
||||
Reason: reason,
|
||||
Message: message,
|
||||
LastTransitionTime: metav1.NewTime(time.Now()),
|
||||
},
|
||||
},
|
||||
ObservedGeneration: sb.Generation,
|
||||
}
|
||||
_ = r.patchStatus(ctx, sb, st)
|
||||
switch reason {
|
||||
case "OwnerOrgRefMissing", "OwnerEmailMissing", "OwnerEmailInvalid":
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
|
||||
}
|
||||
|
||||
func (r *Reconciler) patchStatus(ctx context.Context, sb *sandboxapi.Sandbox, desired sandboxapi.SandboxStatus) error {
|
||||
updated := sb.DeepCopyObject().(*sandboxapi.Sandbox)
|
||||
updated.Status = desired
|
||||
return r.Status().Update(ctx, updated)
|
||||
}
|
||||
|
||||
// shouldMintToken inspects the CR's annotations and decides whether
|
||||
// the reconciler should call the NewAPI bridge handler this pass.
|
||||
// Returns (true, zeroExpiry) on first issuance or unparseable
|
||||
// annotation; (true, prevExpiry) when the previously-issued token is
|
||||
// within leadTime of expiry; (false, prevExpiry) when the token is
|
||||
// healthy.
|
||||
func (r *Reconciler) shouldMintToken(sb *sandboxapi.Sandbox, nowT time.Time, leadTime time.Duration) (bool, time.Time) {
|
||||
raw := strings.TrimSpace(sb.GetAnnotations()[annotationTokenExpiresAt])
|
||||
if raw == "" {
|
||||
return true, time.Time{}
|
||||
}
|
||||
prev, err := time.Parse(time.RFC3339, raw)
|
||||
if err != nil {
|
||||
// Corrupt annotation — re-mint and overwrite. Operator-debug
|
||||
// path is the log line in the mint branch above.
|
||||
return true, time.Time{}
|
||||
}
|
||||
// Re-mint when expiry is within leadTime of now (covers the
|
||||
// already-expired case too: nowT.Add(leadTime).After(prev) is
|
||||
// trivially true when prev < nowT).
|
||||
if !prev.After(nowT.Add(leadTime)) {
|
||||
return true, prev
|
||||
}
|
||||
return false, prev
|
||||
}
|
||||
|
||||
// channelsForSandbox derives the AllowedChannels list for a freshly
|
||||
// minted token. Wave 9: the operator-supplied DefaultChannels are
|
||||
// the source of truth. Future waves (per architecture.md §3) will
|
||||
// add a spec.allowedChannels overlay for per-Sandbox restriction.
|
||||
func (r *Reconciler) channelsForSandbox(_ *sandboxapi.Sandbox) []string {
|
||||
if len(r.DefaultChannels) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]string, 0, len(r.DefaultChannels))
|
||||
for _, c := range r.DefaultChannels {
|
||||
c = strings.TrimSpace(c)
|
||||
if c == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, c)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// stampTokenAnnotations patches the Sandbox CR with the new expiry +
|
||||
// rotation timestamps. Uses a deep-copy + Update against the cached
|
||||
// client so the patch is one round-trip; the controller-runtime
|
||||
// cache reflects the change on the next reconcile.
|
||||
//
|
||||
// IMPORTANT: an Update() bumps the metadata.resourceVersion. The
|
||||
// subsequent status update (patchStatus) operates on the same local
|
||||
// `sb` value; we sync the bumped ResourceVersion back onto sb so the
|
||||
// status-subresource patch does not 409 on stale-version.
|
||||
func (r *Reconciler) stampTokenAnnotations(ctx context.Context, sb *sandboxapi.Sandbox, expiresAt, rotatedAt string) error {
|
||||
updated := sb.DeepCopyObject().(*sandboxapi.Sandbox)
|
||||
if updated.Annotations == nil {
|
||||
updated.Annotations = map[string]string{}
|
||||
}
|
||||
updated.Annotations[annotationTokenExpiresAt] = expiresAt
|
||||
updated.Annotations[annotationTokenRotatedAt] = rotatedAt
|
||||
if err := r.Update(ctx, updated); err != nil {
|
||||
return err
|
||||
}
|
||||
// Reflect changes back onto the local copy so the rest of this
|
||||
// reconcile reads consistent annotations + the post-Update
|
||||
// resourceVersion (required by the cached client's optimistic-
|
||||
// concurrency check on the next .Status().Update call).
|
||||
sb.Annotations = updated.Annotations
|
||||
sb.ResourceVersion = updated.ResourceVersion
|
||||
return nil
|
||||
}
|
||||
|
||||
// sanitizeEmail derives the owner-UID leaf from an email address.
//
// The input is whitespace-trimmed and lower-cased, then "@" becomes
// "-at-", "+" becomes "-plus-", and "." and "_" each become "-". The
// result is cut to at most 200 bytes and leading/trailing "-" are
// stripped. Other characters pass through unchanged. An empty result
// means no usable leaf could be derived.
func sanitizeEmail(email string) string {
	const maxLen = 200

	// One pass is equivalent to applying the substitutions in sequence:
	// no replacement text contains a character that is itself replaced.
	mapper := strings.NewReplacer(
		"@", "-at-",
		".", "-",
		"+", "-plus-",
		"_", "-",
	)
	leaf := mapper.Replace(strings.ToLower(strings.TrimSpace(email)))
	if len(leaf) > maxLen {
		leaf = leaf[:maxLen]
	}
	return strings.Trim(leaf, "-")
}
|
||||
@ -1,883 +0,0 @@
|
||||
// sandbox_controller_test.go — Wave 1 + Wave 8 happy-path + drift +
|
||||
// idempotency coverage for the sandbox reconciler.
|
||||
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client/fake"
|
||||
|
||||
"github.com/openova-io/openova/core/controllers/pkg/gitea"
|
||||
"github.com/openova-io/openova/core/controllers/sandbox/internal/newapi"
|
||||
sandboxapi "github.com/openova-io/openova/core/controllers/sandbox/internal/sandboxapi"
|
||||
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
)
|
||||
|
||||
// stubNewAPI is an in-process newapi.Client used by the reconciler
|
||||
// tests. Captures every MintRequest + replies with the configured
|
||||
// MintResponse / error.
|
||||
type stubNewAPI struct {
|
||||
mu sync.Mutex
|
||||
calls []newapi.MintRequest
|
||||
resp newapi.MintResponse
|
||||
err error
|
||||
mintError func(newapi.MintRequest) (*newapi.MintResponse, error)
|
||||
}
|
||||
|
||||
func (s *stubNewAPI) MintSandboxToken(_ context.Context, req newapi.MintRequest) (*newapi.MintResponse, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.calls = append(s.calls, req)
|
||||
if s.mintError != nil {
|
||||
return s.mintError(req)
|
||||
}
|
||||
if s.err != nil {
|
||||
return nil, s.err
|
||||
}
|
||||
r := s.resp
|
||||
return &r, nil
|
||||
}
|
||||
|
||||
func (s *stubNewAPI) callCount() int {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
return len(s.calls)
|
||||
}
|
||||
|
||||
type giteaServer struct {
|
||||
t *testing.T
|
||||
|
||||
mu sync.Mutex
|
||||
|
||||
files map[string]fileEntry
|
||||
|
||||
createFiles int
|
||||
updateFiles int
|
||||
|
||||
server *httptest.Server
|
||||
}
|
||||
|
||||
// fileEntry is one stored file in the fake Gitea server: the synthetic
// SHA assigned on write and the decoded file bytes.
type fileEntry struct {
	sha     string
	content []byte
}
|
||||
|
||||
func newGiteaServer(t *testing.T) *giteaServer {
|
||||
gs := &giteaServer{
|
||||
t: t,
|
||||
files: map[string]fileEntry{},
|
||||
}
|
||||
gs.server = httptest.NewServer(http.HandlerFunc(gs.handle))
|
||||
t.Cleanup(gs.server.Close)
|
||||
return gs
|
||||
}
|
||||
|
||||
// URL returns the base URL of the underlying httptest server.
func (g *giteaServer) URL() string {
	return g.server.URL
}
|
||||
|
||||
func (g *giteaServer) handle(w http.ResponseWriter, r *http.Request) {
|
||||
g.mu.Lock()
|
||||
defer g.mu.Unlock()
|
||||
|
||||
if r.Header.Get("Authorization") == "" {
|
||||
http.Error(w, "no auth", http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
p := r.URL.Path
|
||||
|
||||
if strings.HasPrefix(p, "/api/v1/repos/") && strings.Contains(p, "/contents/") {
|
||||
const prefix = "/api/v1/repos/"
|
||||
rest := p[len(prefix):]
|
||||
idx := strings.Index(rest, "/contents/")
|
||||
if idx < 0 {
|
||||
http.Error(w, "bad path", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
ownerRepo := rest[:idx]
|
||||
filePath := rest[idx+len("/contents/"):]
|
||||
key := ownerRepo + "/" + filePath
|
||||
|
||||
switch r.Method {
|
||||
case http.MethodGet:
|
||||
f, ok := g.files[key]
|
||||
if !ok {
|
||||
http.Error(w, "not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, gitea.File{
|
||||
Path: filePath,
|
||||
SHA: f.sha,
|
||||
Type: "file",
|
||||
ContentBase64: base64.StdEncoding.EncodeToString(f.content),
|
||||
})
|
||||
return
|
||||
case http.MethodPost, http.MethodPut:
|
||||
var body struct {
|
||||
Message string `json:"message"`
|
||||
Content string `json:"content"`
|
||||
Branch string `json:"branch"`
|
||||
SHA string `json:"sha"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
data, err := base64.StdEncoding.DecodeString(body.Content)
|
||||
if err != nil {
|
||||
http.Error(w, "bad b64", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if r.Method == http.MethodPost {
|
||||
if _, exists := g.files[key]; exists {
|
||||
http.Error(w, "exists", http.StatusUnprocessableEntity)
|
||||
return
|
||||
}
|
||||
g.createFiles++
|
||||
} else {
|
||||
g.updateFiles++
|
||||
}
|
||||
g.files[key] = fileEntry{
|
||||
sha: fmt.Sprintf("sha-%d", g.createFiles+g.updateFiles),
|
||||
content: data,
|
||||
}
|
||||
writeJSON(w, http.StatusCreated, map[string]any{
|
||||
"content": gitea.File{
|
||||
Path: filePath,
|
||||
SHA: g.files[key].sha,
|
||||
Type: "file",
|
||||
},
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
g.t.Logf("giteaServer: unhandled %s %s", r.Method, r.URL.Path)
|
||||
http.Error(w, "not found", http.StatusNotFound)
|
||||
}
|
||||
|
||||
// writeJSON sends v as a JSON response with the given status code and an
// application/json content type. Encoding errors are ignored; this is a
// test helper writing to an in-process server.
func writeJSON(w http.ResponseWriter, code int, v any) {
	hdr := w.Header()
	hdr.Set("Content-Type", "application/json")
	w.WriteHeader(code)

	enc := json.NewEncoder(w)
	_ = enc.Encode(v)
}
|
||||
|
||||
func makeReconciler(t *testing.T, objs ...client.Object) (*Reconciler, *giteaServer) {
|
||||
t.Helper()
|
||||
|
||||
scheme := runtime.NewScheme()
|
||||
if err := clientgoscheme.AddToScheme(scheme); err != nil {
|
||||
t.Fatalf("add clientgo scheme: %v", err)
|
||||
}
|
||||
if err := sandboxapi.AddToScheme(scheme); err != nil {
|
||||
t.Fatalf("add sandboxapi scheme: %v", err)
|
||||
}
|
||||
|
||||
cl := fake.NewClientBuilder().
|
||||
WithScheme(scheme).
|
||||
WithStatusSubresource(&sandboxapi.Sandbox{}).
|
||||
WithObjects(objs...).
|
||||
Build()
|
||||
|
||||
gs := newGiteaServer(t)
|
||||
|
||||
r := &Reconciler{
|
||||
Client: cl,
|
||||
Log: logr.Discard(),
|
||||
GiteaClient: gitea.New(gs.URL(), "test-token"),
|
||||
HostCluster: "ct-eu-mgt-prod",
|
||||
SovereignFQDN: "omantel.omani.works",
|
||||
Branch: "main",
|
||||
TenantRepoName: "catalyst-tenant",
|
||||
PtyServerImage: "ghcr.io/openova-io/openova/sandbox-pty-server:test-sha",
|
||||
MCPImage: "ghcr.io/openova-io/openova/sandbox-mcp:test-sha",
|
||||
NewapiURL: "https://newapi.omantel.omani.works/v1",
|
||||
LLMGatewayTokenSecret: "sandbox-tokens",
|
||||
BYOSSecretPrefix: "sandbox-byos-claude-code",
|
||||
IdleTimeoutMinutes: 30,
|
||||
}
|
||||
return r, gs
|
||||
}
|
||||
|
||||
func sampleSandbox() *sandboxapi.Sandbox {
|
||||
return &sandboxapi.Sandbox{
|
||||
TypeMeta: metav1.TypeMeta{
|
||||
APIVersion: sandboxapi.GroupVersion.String(),
|
||||
Kind: "Sandbox",
|
||||
},
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "emrah",
|
||||
Namespace: "acme",
|
||||
Generation: 1,
|
||||
UID: "00000000-0000-0000-0000-000000000001",
|
||||
},
|
||||
Spec: sandboxapi.SandboxSpec{
|
||||
Owner: sandboxapi.SandboxOwner{
|
||||
Email: "ceo@acme.com",
|
||||
OrgRef: sandboxapi.SandboxOrgRef{Slug: "acme"},
|
||||
},
|
||||
Quota: sandboxapi.SandboxQuota{
|
||||
CPU: "4",
|
||||
Memory: "8Gi",
|
||||
Storage: "50Gi",
|
||||
ConcurrentSessions: 3,
|
||||
},
|
||||
Repos: []sandboxapi.SandboxRepo{
|
||||
{GiteaRepo: "acme/eventforge"},
|
||||
{GiteaRepo: "acme/internal-tools"},
|
||||
},
|
||||
AgentCatalogue: []string{"claude-code", "cursor-agent"},
|
||||
PreviewDomain: "sb-emrah.rzk7.openova.io",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func TestReconcile_HappyPath(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
r, gs := makeReconciler(t, sb)
|
||||
|
||||
res, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("reconcile error: %v", err)
|
||||
}
|
||||
if res.RequeueAfter != 0 {
|
||||
t.Errorf("happy path should not requeue: got %v", res)
|
||||
}
|
||||
|
||||
// Wave 1 + Wave 8: 6 fixed + 1 kust + 2 repo PVCs + 4 wave-8 = 13.
|
||||
expectedFiles := 6 + 1 + 2 + 4
|
||||
if gs.createFiles != expectedFiles {
|
||||
t.Errorf("expected %d file creates, got %d", expectedFiles, gs.createFiles)
|
||||
}
|
||||
if gs.updateFiles != 0 {
|
||||
t.Errorf("expected 0 file updates on first reconcile, got %d", gs.updateFiles)
|
||||
}
|
||||
|
||||
wantPrefix := "acme/catalyst-tenant/sandbox/ceo-at-acme-com/"
|
||||
for key := range gs.files {
|
||||
if !strings.HasPrefix(key, wantPrefix) {
|
||||
t.Errorf("file %q not under expected prefix %q", key, wantPrefix)
|
||||
}
|
||||
}
|
||||
|
||||
var got sandboxapi.Sandbox
|
||||
if err := r.Get(context.Background(),
|
||||
client.ObjectKey{Name: sb.Name, Namespace: sb.Namespace}, &got); err != nil {
|
||||
t.Fatalf("get post-reconcile: %v", err)
|
||||
}
|
||||
if got.Status.ObservedGeneration != 1 {
|
||||
t.Errorf("observedGeneration: got %d want 1", got.Status.ObservedGeneration)
|
||||
}
|
||||
if got.Status.Phase != "Provisioning" {
|
||||
t.Errorf("phase: got %q want %q", got.Status.Phase, "Provisioning")
|
||||
}
|
||||
if got.Status.GitopsPath != "sandbox/ceo-at-acme-com" {
|
||||
t.Errorf("gitopsPath: got %q", got.Status.GitopsPath)
|
||||
}
|
||||
if len(got.Status.Conditions) != 1 ||
|
||||
got.Status.Conditions[0].Type != "Ready" ||
|
||||
got.Status.Conditions[0].Status != "True" ||
|
||||
got.Status.Conditions[0].Reason != "GitopsReconciled" {
|
||||
t.Errorf("expected Ready=True/GitopsReconciled, got %+v", got.Status.Conditions)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReconcile_Idempotent(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
r, gs := makeReconciler(t, sb)
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("first reconcile: %v", err)
|
||||
}
|
||||
firstCreates := gs.createFiles
|
||||
firstUpdates := gs.updateFiles
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("second reconcile: %v", err)
|
||||
}
|
||||
|
||||
if delta := gs.createFiles - firstCreates; delta != 0 {
|
||||
t.Errorf("idempotency: expected zero new creates, got %d", delta)
|
||||
}
|
||||
if delta := gs.updateFiles - firstUpdates; delta != 0 {
|
||||
t.Errorf("idempotency: expected zero file updates, got %d", delta)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReconcile_OwnerOrgRefMissing(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
sb.Spec.Owner.OrgRef.Slug = ""
|
||||
r, gs := makeReconciler(t, sb)
|
||||
|
||||
res, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("reconcile (drift): %v", err)
|
||||
}
|
||||
if res.RequeueAfter != 0 {
|
||||
t.Errorf("drift should not requeue: got %v", res)
|
||||
}
|
||||
if gs.createFiles != 0 || gs.updateFiles != 0 {
|
||||
t.Errorf("drift: no Gitea writes expected, got creates=%d updates=%d",
|
||||
gs.createFiles, gs.updateFiles)
|
||||
}
|
||||
|
||||
var got sandboxapi.Sandbox
|
||||
if err := r.Get(context.Background(),
|
||||
client.ObjectKey{Name: sb.Name, Namespace: sb.Namespace}, &got); err != nil {
|
||||
t.Fatalf("get: %v", err)
|
||||
}
|
||||
if got.Status.Phase != "Failed" {
|
||||
t.Errorf("phase: got %q want Failed", got.Status.Phase)
|
||||
}
|
||||
if len(got.Status.Conditions) != 1 ||
|
||||
got.Status.Conditions[0].Status != "False" ||
|
||||
got.Status.Conditions[0].Reason != "OwnerOrgRefMissing" {
|
||||
t.Errorf("expected OwnerOrgRefMissing False condition, got %+v", got.Status.Conditions)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReconcile_OwnerEmailMissing(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
sb.Spec.Owner.Email = ""
|
||||
r, gs := makeReconciler(t, sb)
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile (drift): %v", err)
|
||||
}
|
||||
if gs.createFiles != 0 {
|
||||
t.Errorf("drift: no Gitea writes expected, got %d creates", gs.createFiles)
|
||||
}
|
||||
|
||||
var got sandboxapi.Sandbox
|
||||
if err := r.Get(context.Background(),
|
||||
client.ObjectKey{Name: sb.Name, Namespace: sb.Namespace}, &got); err != nil {
|
||||
t.Fatalf("get: %v", err)
|
||||
}
|
||||
if len(got.Status.Conditions) != 1 ||
|
||||
got.Status.Conditions[0].Reason != "OwnerEmailMissing" {
|
||||
t.Errorf("expected OwnerEmailMissing False condition, got %+v", got.Status.Conditions)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReconcile_Missing_NoError(t *testing.T) {
|
||||
t.Parallel()
|
||||
r, _ := makeReconciler(t)
|
||||
res, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: "ghost", Namespace: "acme"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("reconcile of missing CR should be a no-op, got: %v", err)
|
||||
}
|
||||
if res.RequeueAfter != 0 {
|
||||
t.Errorf("missing CR should not requeue, got %v", res)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_Wave8RuntimeShape asserts the Wave 8 runtime manifests
|
||||
// (pty-server StatefulSet, MCP Deployment, Service, HTTPRoute) carry
|
||||
// the right identity + env wiring + BYOS branching + hostname derivation.
|
||||
func TestReconcile_Wave8RuntimeShape(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
r, gs := makeReconciler(t, sb)
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
|
||||
prefix := "acme/catalyst-tenant/sandbox/ceo-at-acme-com/"
|
||||
get := func(name string) string {
|
||||
gs.mu.Lock()
|
||||
defer gs.mu.Unlock()
|
||||
entry, ok := gs.files[prefix+name]
|
||||
if !ok {
|
||||
t.Fatalf("expected rendered file %q in gitea stub", prefix+name)
|
||||
}
|
||||
return string(entry.content)
|
||||
}
|
||||
|
||||
ss := get("statefulset-pty-server.yaml")
|
||||
for _, want := range []string{
|
||||
"kind: StatefulSet",
|
||||
"name: pty-server",
|
||||
"namespace: sandbox-ceo-at-acme-com",
|
||||
"replicas: 3",
|
||||
`image: "ghcr.io/openova-io/openova/sandbox-pty-server:test-sha"`,
|
||||
"PTY_SERVER_ADDR",
|
||||
"SANDBOX_OWNER_UID",
|
||||
`value: "ceo-at-acme-com"`,
|
||||
"ORG_ID",
|
||||
`value: "acme"`,
|
||||
"NEWAPI_URL",
|
||||
`value: "https://newapi.omantel.omani.works/v1"`,
|
||||
"OPENAI_BASE_URL",
|
||||
"LLM_GATEWAY_TOKEN",
|
||||
"OPENAI_API_KEY",
|
||||
"ANTHROPIC_API_KEY",
|
||||
`name: "sandbox-byos-claude-code-ceo-at-acme-com"`,
|
||||
"key: access_token",
|
||||
"openova.io/sandbox-idle-timeout-minutes",
|
||||
"name: repo-acme-eventforge",
|
||||
"mountPath: /workspace/acme-eventforge",
|
||||
"name: repo-acme-internal-tools",
|
||||
} {
|
||||
if !strings.Contains(ss, want) {
|
||||
t.Errorf("statefulset-pty-server.yaml missing %q", want)
|
||||
}
|
||||
}
|
||||
|
||||
dep := get("deployment-mcp.yaml")
|
||||
for _, want := range []string{
|
||||
"kind: Deployment",
|
||||
"name: openova-sandbox-mcp",
|
||||
`image: "ghcr.io/openova-io/openova/sandbox-mcp:test-sha"`,
|
||||
"PTY_SERVER_URL",
|
||||
"pty-server.sandbox-ceo-at-acme-com.svc.cluster.local:7681",
|
||||
} {
|
||||
if !strings.Contains(dep, want) {
|
||||
t.Errorf("deployment-mcp.yaml missing %q", want)
|
||||
}
|
||||
}
|
||||
|
||||
svc := get("service-pty-server.yaml")
|
||||
for _, want := range []string{
|
||||
"kind: Service",
|
||||
"name: pty-server",
|
||||
"port: 7681",
|
||||
"targetPort: 7681",
|
||||
} {
|
||||
if !strings.Contains(svc, want) {
|
||||
t.Errorf("service-pty-server.yaml missing %q", want)
|
||||
}
|
||||
}
|
||||
|
||||
rt := get("httproute-pty-server.yaml")
|
||||
for _, want := range []string{
|
||||
"kind: HTTPRoute",
|
||||
`- "sandbox.omantel.omani.works"`,
|
||||
"value: /sessions/ceo-at-acme-com/",
|
||||
// Sandbox HTTPRoute now attaches to the canonical Cilium Gateway
|
||||
// (cilium-gateway/kube-system) so the wildcard *.<sov-fqdn>
|
||||
// listener serves traffic to sandbox.<sov-fqdn>. The previous
|
||||
// "catalyst-public/catalyst-system/https" parentRefs pointed at a
|
||||
// Gateway that doesn't exist on a Sovereign.
|
||||
"name: cilium-gateway",
|
||||
"namespace: kube-system",
|
||||
"name: pty-server",
|
||||
"port: 7681",
|
||||
} {
|
||||
if !strings.Contains(rt, want) {
|
||||
t.Errorf("httproute-pty-server.yaml missing %q", want)
|
||||
}
|
||||
}
|
||||
|
||||
kust := get("kustomization.yaml")
|
||||
for _, want := range []string{
|
||||
"statefulset-pty-server.yaml",
|
||||
"service-pty-server.yaml",
|
||||
"deployment-mcp.yaml",
|
||||
"httproute-pty-server.yaml",
|
||||
} {
|
||||
if !strings.Contains(kust, want) {
|
||||
t.Errorf("kustomization.yaml missing %q", want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_Wave8NoBYOSWhenAgentMissing asserts that a Sandbox
|
||||
// without claude-code in spec.agentCatalogue does NOT wire the
|
||||
// ANTHROPIC_API_KEY env into the rendered StatefulSet.
|
||||
func TestReconcile_Wave8NoBYOSWhenAgentMissing(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
sb.Spec.AgentCatalogue = []string{"cursor-agent"}
|
||||
r, gs := makeReconciler(t, sb)
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
|
||||
gs.mu.Lock()
|
||||
entry, ok := gs.files["acme/catalyst-tenant/sandbox/ceo-at-acme-com/statefulset-pty-server.yaml"]
|
||||
gs.mu.Unlock()
|
||||
if !ok {
|
||||
t.Fatalf("expected statefulset-pty-server.yaml")
|
||||
}
|
||||
body := string(entry.content)
|
||||
if strings.Contains(body, "ANTHROPIC_API_KEY") {
|
||||
t.Errorf("expected NO ANTHROPIC_API_KEY env when claude-code not in agentCatalogue")
|
||||
}
|
||||
if strings.Contains(body, "sandbox-byos-claude-code-ceo-at-acme-com") {
|
||||
t.Errorf("expected NO BYOS Secret reference when claude-code not in agentCatalogue")
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_NewAPI_MintsAndRendersSecret exercises the Wave 9 mint
|
||||
// path: NewAPIClient wired + no prior token annotation → the
|
||||
// controller calls the bridge once, stamps both lifecycle annotations
|
||||
// on the CR, and renders secret-newapi-token.yaml under the Gitea
|
||||
// prefix with the expected token bytes.
|
||||
func TestReconcile_NewAPI_MintsAndRendersSecret(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
r, gs := makeReconciler(t, sb)
|
||||
|
||||
fixedNow := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
|
||||
exp := fixedNow.Add(7 * 24 * time.Hour)
|
||||
stub := &stubNewAPI{resp: newapi.MintResponse{Token: "jwt-fresh", ExpiresAt: exp}}
|
||||
r.NewAPIClient = stub
|
||||
r.DefaultChannels = []string{"qwen"}
|
||||
r.Now = func() time.Time { return fixedNow }
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
|
||||
if stub.callCount() != 1 {
|
||||
t.Errorf("mint calls: got %d want 1", stub.callCount())
|
||||
}
|
||||
gotReq := stub.calls[0]
|
||||
if gotReq.OrgID != "acme" {
|
||||
t.Errorf("mint req OrgID: got %q", gotReq.OrgID)
|
||||
}
|
||||
if gotReq.UserID != "ceo@acme.com" {
|
||||
t.Errorf("mint req UserID: got %q", gotReq.UserID)
|
||||
}
|
||||
if gotReq.SandboxID != string(sb.UID) {
|
||||
t.Errorf("mint req SandboxID: got %q want %q", gotReq.SandboxID, sb.UID)
|
||||
}
|
||||
if len(gotReq.AllowedChannels) != 1 || gotReq.AllowedChannels[0] != "qwen" {
|
||||
t.Errorf("mint req channels: got %v", gotReq.AllowedChannels)
|
||||
}
|
||||
|
||||
// The rendered Secret manifest must exist + carry the token bytes
|
||||
// + expiry annotation + rotation marker (first issuance is also a
|
||||
// rotation event, so kubectl.kubernetes.io/restartedAt is present).
|
||||
secretKey := "acme/catalyst-tenant/sandbox/ceo-at-acme-com/secret-newapi-token.yaml"
|
||||
entry, ok := gs.files[secretKey]
|
||||
if !ok {
|
||||
t.Fatalf("expected secret-newapi-token.yaml under %q; files=%v",
|
||||
secretKey, gsKeys(gs))
|
||||
}
|
||||
if !strings.Contains(string(entry.content), "LLM_GATEWAY_TOKEN: \"jwt-fresh\"") {
|
||||
t.Errorf("rendered Secret missing token bytes: %s", string(entry.content))
|
||||
}
|
||||
if !strings.Contains(string(entry.content), "openova.io/sandbox-token-expires-at: \""+exp.UTC().Format(time.RFC3339)+"\"") {
|
||||
t.Errorf("rendered Secret missing expires-at annotation: %s", string(entry.content))
|
||||
}
|
||||
if !strings.Contains(string(entry.content), "kubectl.kubernetes.io/restartedAt:") {
|
||||
t.Errorf("rendered Secret missing restartedAt annotation: %s", string(entry.content))
|
||||
}
|
||||
|
||||
// The Sandbox CR must carry both lifecycle annotations.
|
||||
var got sandboxapi.Sandbox
|
||||
if err := r.Get(context.Background(),
|
||||
client.ObjectKey{Name: sb.Name, Namespace: sb.Namespace}, &got); err != nil {
|
||||
t.Fatalf("get post-reconcile: %v", err)
|
||||
}
|
||||
if got.Annotations[annotationTokenExpiresAt] != exp.UTC().Format(time.RFC3339) {
|
||||
t.Errorf("CR expires-at annotation: got %q", got.Annotations[annotationTokenExpiresAt])
|
||||
}
|
||||
if got.Annotations[annotationTokenRotatedAt] != fixedNow.UTC().Format(time.RFC3339) {
|
||||
t.Errorf("CR rotated-at annotation: got %q", got.Annotations[annotationTokenRotatedAt])
|
||||
}
|
||||
|
||||
// kustomization.yaml must reference the new secret.
|
||||
kustKey := "acme/catalyst-tenant/sandbox/ceo-at-acme-com/kustomization.yaml"
|
||||
kustEntry, ok := gs.files[kustKey]
|
||||
if !ok {
|
||||
t.Fatalf("expected kustomization.yaml at %q", kustKey)
|
||||
}
|
||||
if !strings.Contains(string(kustEntry.content), "secret-newapi-token.yaml") {
|
||||
t.Errorf("kustomization.yaml missing secret-newapi-token entry: %s", string(kustEntry.content))
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_NewAPI_RotationOnExpiry verifies that a token whose
|
||||
// expiry sits within the rotation lead-time triggers a fresh mint +
|
||||
// fresh restart marker.
|
||||
func TestReconcile_NewAPI_RotationOnExpiry(t *testing.T) {
|
||||
t.Parallel()
|
||||
fixedNow := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
|
||||
expSoon := fixedNow.Add(30 * time.Minute) // inside default 24h lead time
|
||||
sb := sampleSandbox()
|
||||
sb.Annotations = map[string]string{
|
||||
annotationTokenExpiresAt: expSoon.UTC().Format(time.RFC3339),
|
||||
annotationTokenRotatedAt: fixedNow.Add(-6 * 24 * time.Hour).UTC().Format(time.RFC3339),
|
||||
}
|
||||
r, gs := makeReconciler(t, sb)
|
||||
|
||||
newExp := fixedNow.Add(7 * 24 * time.Hour)
|
||||
stub := &stubNewAPI{resp: newapi.MintResponse{Token: "jwt-rotated", ExpiresAt: newExp}}
|
||||
r.NewAPIClient = stub
|
||||
r.DefaultChannels = []string{"qwen"}
|
||||
r.Now = func() time.Time { return fixedNow }
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
|
||||
if stub.callCount() != 1 {
|
||||
t.Errorf("expected exactly one mint call, got %d", stub.callCount())
|
||||
}
|
||||
secretKey := "acme/catalyst-tenant/sandbox/ceo-at-acme-com/secret-newapi-token.yaml"
|
||||
entry := gs.files[secretKey]
|
||||
if !strings.Contains(string(entry.content), "LLM_GATEWAY_TOKEN: \"jwt-rotated\"") {
|
||||
t.Errorf("rotation did not write new token: %s", string(entry.content))
|
||||
}
|
||||
var got sandboxapi.Sandbox
|
||||
if err := r.Get(context.Background(),
|
||||
client.ObjectKey{Name: sb.Name, Namespace: sb.Namespace}, &got); err != nil {
|
||||
t.Fatalf("get: %v", err)
|
||||
}
|
||||
if got.Annotations[annotationTokenExpiresAt] != newExp.UTC().Format(time.RFC3339) {
|
||||
t.Errorf("rotation did not bump expires-at: got %q",
|
||||
got.Annotations[annotationTokenExpiresAt])
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_NewAPI_NoMintWhenHealthy verifies the steady-state
|
||||
// path: a CR with a token whose expiry is well outside the rotation
|
||||
// lead-time triggers zero mint calls AND the rendered Secret carries
|
||||
// the previous bytes.
|
||||
func TestReconcile_NewAPI_NoMintWhenHealthy(t *testing.T) {
|
||||
t.Parallel()
|
||||
fixedNow := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
|
||||
farExp := fixedNow.Add(5 * 24 * time.Hour) // outside default 24h lead
|
||||
sb := sampleSandbox()
|
||||
sb.Annotations = map[string]string{
|
||||
annotationTokenExpiresAt: farExp.UTC().Format(time.RFC3339),
|
||||
annotationTokenRotatedAt: fixedNow.Add(-2 * 24 * time.Hour).UTC().Format(time.RFC3339),
|
||||
}
|
||||
r, gs := makeReconciler(t, sb)
|
||||
|
||||
stub := &stubNewAPI{} // any call would explode (empty MintResponse)
|
||||
r.NewAPIClient = stub
|
||||
r.DefaultChannels = []string{"qwen"}
|
||||
r.Now = func() time.Time { return fixedNow }
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
if stub.callCount() != 0 {
|
||||
t.Errorf("steady-state should not call mint, got %d", stub.callCount())
|
||||
}
|
||||
// The Secret manifest is NOT rendered because tokenValue is empty
|
||||
// when the controller decides not to mint. The previous Secret
|
||||
// content remains in Gitea untouched (we trust PutFile's byte-
|
||||
// equal guard) — for this in-memory test there was no prior file,
|
||||
// so the in-memory store simply doesn't have a secret-newapi-token
|
||||
// entry. The kustomization.yaml must therefore NOT reference it.
|
||||
kustKey := "acme/catalyst-tenant/sandbox/ceo-at-acme-com/kustomization.yaml"
|
||||
kust := gs.files[kustKey]
|
||||
if strings.Contains(string(kust.content), "secret-newapi-token.yaml") {
|
||||
t.Errorf("kustomization should not reference secret-newapi-token when not minted")
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_NewAPI_MintFailureSurfacesCondition exercises the
|
||||
// failure path: the bridge returns a non-2xx → controller records a
|
||||
// Failed/TokenMintFailed condition + requeues + NO manifests written.
|
||||
func TestReconcile_NewAPI_MintFailureSurfacesCondition(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
r, gs := makeReconciler(t, sb)
|
||||
stub := &stubNewAPI{err: errors.New("newapi: POST .../admin/tokens/sandbox: status 503: outage")}
|
||||
r.NewAPIClient = stub
|
||||
r.DefaultChannels = []string{"qwen"}
|
||||
r.Now = func() time.Time { return time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC) }
|
||||
|
||||
res, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("reconcile error: %v", err)
|
||||
}
|
||||
if res.RequeueAfter == 0 {
|
||||
t.Errorf("expected non-zero requeue on bridge failure")
|
||||
}
|
||||
if gs.createFiles != 0 {
|
||||
t.Errorf("no Gitea writes expected on token-mint failure, got %d creates", gs.createFiles)
|
||||
}
|
||||
var got sandboxapi.Sandbox
|
||||
if err := r.Get(context.Background(),
|
||||
client.ObjectKey{Name: sb.Name, Namespace: sb.Namespace}, &got); err != nil {
|
||||
t.Fatalf("get: %v", err)
|
||||
}
|
||||
if got.Status.Phase != "Failed" {
|
||||
t.Errorf("phase: got %q want Failed", got.Status.Phase)
|
||||
}
|
||||
if len(got.Status.Conditions) != 1 ||
|
||||
got.Status.Conditions[0].Reason != "TokenMintFailed" ||
|
||||
got.Status.Conditions[0].Status != "False" {
|
||||
t.Errorf("expected TokenMintFailed False condition, got %+v", got.Status.Conditions)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_NewAPI_NoChannelsConfigured surfaces the misconfig
|
||||
// path: operator didn't wire DefaultChannels → fail-loud rather than
|
||||
// minting a token with an empty allowed_channels list (the bridge
|
||||
// would 400 anyway, but the controller fails earlier with a more
|
||||
// helpful Reason).
|
||||
func TestReconcile_NewAPI_NoChannelsConfigured(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
r, gs := makeReconciler(t, sb)
|
||||
stub := &stubNewAPI{}
|
||||
r.NewAPIClient = stub
|
||||
r.DefaultChannels = nil // misconfig
|
||||
r.Now = func() time.Time { return time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC) }
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
if stub.callCount() != 0 {
|
||||
t.Errorf("misconfig should not call bridge, got %d calls", stub.callCount())
|
||||
}
|
||||
if gs.createFiles != 0 {
|
||||
t.Errorf("misconfig: no gitea writes expected, got %d", gs.createFiles)
|
||||
}
|
||||
var got sandboxapi.Sandbox
|
||||
if err := r.Get(context.Background(),
|
||||
client.ObjectKey{Name: sb.Name, Namespace: sb.Namespace}, &got); err != nil {
|
||||
t.Fatalf("get: %v", err)
|
||||
}
|
||||
if len(got.Status.Conditions) != 1 || got.Status.Conditions[0].Reason != "NoAllowedChannels" {
|
||||
t.Errorf("expected NoAllowedChannels condition, got %+v", got.Status.Conditions)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_NewAPI_CapabilitiesFromPlan exercises the tier-bound
|
||||
// capability path (PR #1671): when the CR carries spec.planId without
|
||||
// an explicit spec.capabilities overlay, the controller resolves the
|
||||
// plan's capability allowlist and threads it into the MintRequest.
|
||||
func TestReconcile_NewAPI_CapabilitiesFromPlan(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
sb.Spec.PlanID = sandboxapi.PlanSandboxPro
|
||||
|
||||
r, _ := makeReconciler(t, sb)
|
||||
stub := &stubNewAPI{resp: newapi.MintResponse{
|
||||
Token: "jwt-pro", ExpiresAt: time.Now().Add(7 * 24 * time.Hour),
|
||||
}}
|
||||
r.NewAPIClient = stub
|
||||
r.DefaultChannels = []string{"qwen"}
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
|
||||
if stub.callCount() != 1 {
|
||||
t.Fatalf("mint calls: got %d want 1", stub.callCount())
|
||||
}
|
||||
gotCaps := stub.calls[0].Capabilities
|
||||
wantSubset := []string{
|
||||
"gitea.repo.list", // Free baseline.
|
||||
"sandbox.db.*", // Pro extra.
|
||||
"sandbox.storage.*", // Pro extra.
|
||||
"flux.status", // Pro extra.
|
||||
}
|
||||
got := make(map[string]bool, len(gotCaps))
|
||||
for _, c := range gotCaps {
|
||||
got[c] = true
|
||||
}
|
||||
for _, w := range wantSubset {
|
||||
if !got[w] {
|
||||
t.Errorf("Pro plan capability %q missing from MintRequest: %v", w, gotCaps)
|
||||
}
|
||||
}
|
||||
// Pro plan MUST NOT grant Ent-only capabilities.
|
||||
for _, forbidden := range []string{
|
||||
"sandbox.deploy.production", "sandbox.stripe.*", "flux.reconcile",
|
||||
} {
|
||||
if got[forbidden] {
|
||||
t.Errorf("Pro plan unexpectedly granted Ent capability %q: %v", forbidden, gotCaps)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcile_NewAPI_CapabilitiesSpecOverride asserts that an explicit
|
||||
// spec.capabilities overlay wins over the plan default — the operator
|
||||
// can tighten or widen the per-Sandbox grant by patching the CR.
|
||||
func TestReconcile_NewAPI_CapabilitiesSpecOverride(t *testing.T) {
|
||||
t.Parallel()
|
||||
sb := sampleSandbox()
|
||||
sb.Spec.PlanID = sandboxapi.PlanSandboxEnt
|
||||
// Override: drop every Ent grant down to read-only intersect.
|
||||
sb.Spec.Capabilities = []string{"gitea.repo.list", "k8s.read.get"}
|
||||
|
||||
r, _ := makeReconciler(t, sb)
|
||||
stub := &stubNewAPI{resp: newapi.MintResponse{
|
||||
Token: "jwt-override", ExpiresAt: time.Now().Add(7 * 24 * time.Hour),
|
||||
}}
|
||||
r.NewAPIClient = stub
|
||||
r.DefaultChannels = []string{"qwen"}
|
||||
|
||||
if _, err := r.Reconcile(context.Background(), ctrl.Request{
|
||||
NamespacedName: types.NamespacedName{Name: sb.Name, Namespace: sb.Namespace},
|
||||
}); err != nil {
|
||||
t.Fatalf("reconcile: %v", err)
|
||||
}
|
||||
|
||||
gotCaps := stub.calls[0].Capabilities
|
||||
if len(gotCaps) != 2 {
|
||||
t.Fatalf("override caps len: got %d (%v) want 2", len(gotCaps), gotCaps)
|
||||
}
|
||||
if gotCaps[0] != "gitea.repo.list" || gotCaps[1] != "k8s.read.get" {
|
||||
t.Errorf("override caps: got %v want [gitea.repo.list k8s.read.get]", gotCaps)
|
||||
}
|
||||
}
|
||||
|
||||
func gsKeys(gs *giteaServer) []string {
|
||||
gs.mu.Lock()
|
||||
defer gs.mu.Unlock()
|
||||
out := make([]string, 0, len(gs.files))
|
||||
for k := range gs.files {
|
||||
out = append(out, k)
|
||||
}
|
||||
return out
|
||||
}
|
||||
@ -1,790 +0,0 @@
|
||||
// Package gitops renders the per-Sandbox manifests the sandbox-controller
|
||||
// writes into the per-Org `catalyst-tenant` Gitea repo under
|
||||
// `sandbox/<owner-uid>/`.
|
||||
//
|
||||
// Per products/sandbox/docs/architecture.md §7 the sandbox-controller
|
||||
// is the sister of organization-controller — it reconciles a
|
||||
// per-Sandbox namespace + RBAC + PVCs + placeholder Secret INSIDE the
|
||||
// Org vcluster (not the host cluster). The controller writes manifests
|
||||
// to the per-Org Gitea repo following the SAME idiom
|
||||
// organization-controller uses for vcluster manifests
|
||||
// (core/controllers/organization/internal/gitops/manifests.go) — Flux
|
||||
// on the host picks them up and reconciles into the Org vcluster.
|
||||
//
|
||||
// Wave 1 materialized only namespace + RBAC + PVCs + placeholder
|
||||
// Secret. Wave 8 (this slice — PR follow-up to #1622) extends the
|
||||
// renderer to ALSO spawn the per-Sandbox runtime:
|
||||
//
|
||||
// - Namespace `sandbox-<owner-uid>`
|
||||
// - ResourceQuota (mirrors spec.quota)
|
||||
// - ServiceAccount `sandbox` + Role + RoleBinding
|
||||
// - One PVC per spec.repos[] entry
|
||||
// - Placeholder Secret `sandbox-tokens`
|
||||
// - NEW: StatefulSet `pty-server` (replicas = spec.quota.concurrentSessions)
|
||||
// - NEW: Deployment `openova-sandbox-mcp`
|
||||
// - NEW: Service `pty-server` ClusterIP :7681
|
||||
// - NEW: HTTPRoute exposing `sandbox.<sov-fqdn>/sessions/<owner-uid>/*`
|
||||
//
|
||||
// Per Inviolable Principle #4 (no hardcoded values) every knob comes
|
||||
// from Inputs — nothing in the template literals encodes a cluster /
|
||||
// region / version / image / hostname.
|
||||
package gitops
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"text/template"
|
||||
|
||||
sandboxapi "github.com/openova-io/openova/core/controllers/sandbox/internal/sandboxapi"
|
||||
)
|
||||
|
||||
// Inputs is the subset of Sandbox spec + controller-level metadata the
|
||||
// renderer needs.
|
||||
type Inputs struct {
|
||||
Name string
|
||||
OwnerUID string
|
||||
OwnerEmail string
|
||||
OrgSlug string
|
||||
SovereignFQDN string
|
||||
Quota sandboxapi.SandboxQuota
|
||||
Repos []sandboxapi.SandboxRepo
|
||||
PreviewDomain string
|
||||
AgentCatalogue []string
|
||||
PtyServerImage string
|
||||
MCPImage string
|
||||
NewapiURL string
|
||||
LLMGatewayTokenSecret string
|
||||
BYOSSecretPrefix string
|
||||
IdleTimeoutMinutes int
|
||||
|
||||
// IdleScalingDisabled (TBD-D8b #1725) — when true the renderer
|
||||
// stamps `openova.io/sandbox-idle-scaling-disabled=true` on the
|
||||
// pty-server StatefulSet so the cluster-wide idle scaler skips it
|
||||
// on every pass. Default false preserves the existing scale-to-zero
|
||||
// policy. Sourced from Sandbox.spec.idleScaling.enabled (false →
|
||||
// disabled true; nil OR true → disabled false).
|
||||
IdleScalingDisabled bool
|
||||
|
||||
// Wave 9 — per-Sandbox NewAPI bearer rendered into a dedicated
|
||||
// Secret manifest. When NewAPIToken is non-empty the renderer
|
||||
// emits secret-newapi-token.yaml carrying stringData
|
||||
// LLM_GATEWAY_TOKEN + openova.io/sandbox-token-expires-at
|
||||
// annotation; when NewAPITokenRotatedAt is also non-empty the
|
||||
// rendered Secret additionally carries
|
||||
// kubectl.kubernetes.io/restartedAt so Wave 8's pty-server
|
||||
// StatefulSet picks up rolling restarts on token rotation.
|
||||
NewAPIToken string
|
||||
NewAPITokenSecretName string
|
||||
NewAPITokenExpiresAt string
|
||||
NewAPITokenRotatedAt string
|
||||
|
||||
// D31 active-hot-standby — Sovereign-level toggle + region pair the
|
||||
// sandbox-controller propagates into every per-Sandbox MCP Pod via
|
||||
// SOVEREIGN_ENABLE_HOT_STANDBY / SOVEREIGN_PRIMARY_REGION /
|
||||
// SOVEREIGN_REPLICA_REGION env. The MCP server's sandbox.db.provision
|
||||
// handler reads them at call time and, when valid, materialises a
|
||||
// primary + replica Cluster.postgresql.cnpg.io pair instead of a
|
||||
// single Cluster (mirrors the bp-cnpg-pair pattern). Default empty
|
||||
// (zero regression): every Sandbox stays on single-Cluster CNPG.
|
||||
// Sourced from the sandbox-controller's own env (chart values
|
||||
// `cnpg.activeHotStandby.*` plumbed by bootstrap-kit slot 61 from
|
||||
// the per-Sovereign overlay's envsubst placeholders).
|
||||
EnableHotStandby string
|
||||
PrimaryRegion string
|
||||
ReplicaRegion string
|
||||
}
|
||||
|
||||
// namespaceTemplate is the text/template source for the per-Sandbox
// Namespace manifest. It reads .NamespaceName, .OrgSlug, .SovereignFQDN,
// .Name, .OwnerUID, .OwnerEmail and the optional .PreviewDomain from its
// data, and uses a `quote` template function that must be registered by
// the code that parses it (not shown here).
const namespaceTemplate = `apiVersion: v1
kind: Namespace
metadata:
  name: {{ .NamespaceName }}
  labels:
    openova.io/organization: {{ .OrgSlug }}
    openova.io/sovereign: {{ .SovereignFQDN }}
    openova.io/sandbox: {{ .Name }}
    openova.io/sandbox-owner: {{ .OwnerUID }}
    openova.io/managed-by: catalyst
  annotations:
    openova.io/sandbox-owner-email: {{ .OwnerEmail | quote }}
{{- if .PreviewDomain }}
    openova.io/sandbox-preview-domain: {{ .PreviewDomain | quote }}
{{- end }}
`
|
||||
|
||||
// resourceQuotaTemplate is the text/template source for the
// `sandbox-quota` ResourceQuota. CPU and memory each set both the
// requests.* and limits.* entries from the same .Quota field, and
// count/pods is taken from .Quota.ConcurrentSessions. It relies on a
// `quote` template function registered by the parsing code (not shown
// here).
const resourceQuotaTemplate = `apiVersion: v1
kind: ResourceQuota
metadata:
  name: sandbox-quota
  namespace: {{ .NamespaceName }}
  labels:
    openova.io/sandbox: {{ .Name }}
    openova.io/managed-by: catalyst
spec:
  hard:
    requests.cpu: {{ .Quota.CPU | quote }}
    limits.cpu: {{ .Quota.CPU | quote }}
    requests.memory: {{ .Quota.Memory | quote }}
    limits.memory: {{ .Quota.Memory | quote }}
    requests.storage: {{ .Quota.Storage | quote }}
    count/pods: {{ .Quota.ConcurrentSessions | quote }}
`
|
||||
|
||||
// serviceAccountTemplate is the text/template source for the `sandbox`
// ServiceAccount in the per-Sandbox namespace.
const serviceAccountTemplate = `apiVersion: v1
kind: ServiceAccount
metadata:
  name: sandbox
  namespace: {{ .NamespaceName }}
  labels:
    openova.io/sandbox: {{ .Name }}
    openova.io/managed-by: catalyst
`
|
||||
|
||||
// roleTemplate is the text/template source for the namespaced `sandbox`
// Role. It grants get/list/watch/create/update/patch/delete on the
// listed core, apps and batch resources within the Sandbox namespace.
const roleTemplate = `apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: sandbox
  namespace: {{ .NamespaceName }}
  labels:
    openova.io/sandbox: {{ .Name }}
    openova.io/managed-by: catalyst
rules:
  - apiGroups: [""]
    resources: ["pods", "pods/log", "pods/exec", "services", "configmaps", "secrets", "persistentvolumeclaims", "events"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  - apiGroups: ["apps"]
    resources: ["deployments", "statefulsets", "replicasets"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  - apiGroups: ["batch"]
    resources: ["jobs", "cronjobs"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
`
|
||||
|
||||
// roleBindingTemplate is the text/template source for the `sandbox`
// RoleBinding, which binds the `sandbox` Role to the `sandbox`
// ServiceAccount in the same namespace.
const roleBindingTemplate = `apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: sandbox
  namespace: {{ .NamespaceName }}
  labels:
    openova.io/sandbox: {{ .Name }}
    openova.io/managed-by: catalyst
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: sandbox
subjects:
  - kind: ServiceAccount
    name: sandbox
    namespace: {{ .NamespaceName }}
`
|
||||
|
||||
// pvcTemplate is the text/template source for one ReadWriteOnce
// PersistentVolumeClaim. It reads .PVCName, .NamespaceName, .Name,
// .RepoSlug and .RepoStorage from its data and relies on a `quote`
// template function registered by the parsing code (not shown here).
const pvcTemplate = `apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ .PVCName }}
  namespace: {{ .NamespaceName }}
  labels:
    openova.io/sandbox: {{ .Name }}
    openova.io/sandbox-repo: {{ .RepoSlug | quote }}
    openova.io/managed-by: catalyst
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ .RepoStorage | quote }}
`
|
||||
|
||||
// secretTemplate is the text/template source for the placeholder
// `sandbox-tokens` Opaque Secret, whose only stringData key is an empty
// `placeholder` entry.
const secretTemplate = `apiVersion: v1
kind: Secret
metadata:
  name: sandbox-tokens
  namespace: {{ .NamespaceName }}
  labels:
    openova.io/sandbox: {{ .Name }}
    openova.io/managed-by: catalyst
type: Opaque
stringData:
  placeholder: ""
`
|
||||
|
||||
// newapiTokenSecretTemplate renders the per-Sandbox NewAPI bearer
|
||||
// Secret (Wave 9). Materialized into the Org vcluster's
|
||||
// sandbox-<owner-uid> namespace by Flux; Wave 8's pty-server
|
||||
// StatefulSet mounts the LLM_GATEWAY_TOKEN key as an env var on
|
||||
// every Sandbox-agent Pod.
|
||||
//
|
||||
// The Secret carries TWO operator-visible annotations:
|
||||
// - openova.io/sandbox-token-expires-at — absolute expiry of the
|
||||
// embedded JWT (operator + rotation observer).
|
||||
// - kubectl.kubernetes.io/restartedAt — rotation marker; Wave 8's
|
||||
// pty-server StatefulSet propagates this onto its Pod template via
|
||||
// a stringData → annotation reference so a fresh Secret triggers
|
||||
// a rolling restart.
|
||||
// Template fields read: .SecretName, .NamespaceName, .Name, .OwnerUID,
// .ExpiresAt, .RotatedAt (annotation emitted only when non-empty) and
// .Token.
//
// NOTE(review): the token is stored under the key LLM_GATEWAY_TOKEN, while
// the pty-server StatefulSet and MCP Deployment templates reference the key
// "llm-gateway-token" (optional: true) on .LLMGatewayTokenSecret. Confirm
// which Secret and key actually feed those env vars.
const newapiTokenSecretTemplate = `apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: {{ .SecretName }}
|
||||
namespace: {{ .NamespaceName }}
|
||||
labels:
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
openova.io/sandbox-owner: {{ .OwnerUID }}
|
||||
openova.io/managed-by: catalyst
|
||||
annotations:
|
||||
openova.io/sandbox-token-expires-at: {{ .ExpiresAt | quote }}
|
||||
{{- if .RotatedAt }}
|
||||
kubectl.kubernetes.io/restartedAt: {{ .RotatedAt | quote }}
|
||||
{{- end }}
|
||||
type: Opaque
|
||||
stringData:
|
||||
LLM_GATEWAY_TOKEN: {{ .Token | quote }}
|
||||
`
|
||||
|
||||
// ptyServerStatefulSetTemplate renders the "pty-server" StatefulSet of a
// Sandbox. The single container listens on port 7681, runs as UID/GID
// 65532 with all capabilities dropped, and mounts one PVC-backed volume
// per entry of .RuntimeRepos at /workspace/<slug>. The idle-timeout
// annotation is always written; the idle-scaling-disabled annotation only
// when .IdleScalingDisabled is set. The ANTHROPIC_* env vars are emitted
// only when .ClaudeCodeBYOSActive is true.
//
// NOTE(review): LLM_GATEWAY_TOKEN and OPENAI_API_KEY read the key
// "llm-gateway-token" from .LLMGatewayTokenSecret with optional: true, so
// a missing Secret or key leaves the variable unset without an error. The
// NewAPI token Secret template stores its value under LLM_GATEWAY_TOKEN —
// confirm the intended Secret/key pairing.
// NOTE(review): label values such as .Name and .OwnerUID are rendered
// unquoted; a purely numeric value would presumably parse as a YAML number.
const ptyServerStatefulSetTemplate = `apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: pty-server
|
||||
namespace: {{ .NamespaceName }}
|
||||
labels:
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
openova.io/sandbox-owner: {{ .OwnerUID }}
|
||||
openova.io/managed-by: catalyst
|
||||
app.kubernetes.io/name: pty-server
|
||||
app.kubernetes.io/component: pty-server
|
||||
annotations:
|
||||
openova.io/sandbox-idle-timeout-minutes: {{ .IdleTimeoutMinutes | quote }}
|
||||
{{- if .IdleScalingDisabled }}
|
||||
openova.io/sandbox-idle-scaling-disabled: "true"
|
||||
{{- end }}
|
||||
spec:
|
||||
serviceName: pty-server
|
||||
replicas: {{ .Replicas }}
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: pty-server
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: pty-server
|
||||
app.kubernetes.io/component: pty-server
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
openova.io/sandbox-owner: {{ .OwnerUID }}
|
||||
openova.io/managed-by: catalyst
|
||||
spec:
|
||||
serviceAccountName: sandbox
|
||||
automountServiceAccountToken: true
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65532
|
||||
runAsGroup: 65532
|
||||
fsGroup: 65532
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: pty-server
|
||||
image: {{ .PtyServerImage | quote }}
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 7681
|
||||
env:
|
||||
- name: PTY_SERVER_ADDR
|
||||
value: ":7681"
|
||||
- name: SANDBOX_OWNER_UID
|
||||
value: {{ .OwnerUID | quote }}
|
||||
- name: SANDBOX_OWNER_EMAIL
|
||||
value: {{ .OwnerEmail | quote }}
|
||||
- name: ORG_ID
|
||||
value: {{ .OrgSlug | quote }}
|
||||
- name: SOVEREIGN_FQDN
|
||||
value: {{ .SovereignFQDN | quote }}
|
||||
- name: NEWAPI_URL
|
||||
value: {{ .NewapiURL | quote }}
|
||||
- name: OPENAI_BASE_URL
|
||||
value: {{ .NewapiURL | quote }}
|
||||
- name: LLM_GATEWAY_URL
|
||||
value: {{ .NewapiURL | quote }}
|
||||
- name: LLM_GATEWAY_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ .LLMGatewayTokenSecret | quote }}
|
||||
key: llm-gateway-token
|
||||
optional: true
|
||||
- name: OPENAI_API_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ .LLMGatewayTokenSecret | quote }}
|
||||
key: llm-gateway-token
|
||||
optional: true
|
||||
{{- if .ClaudeCodeBYOSActive }}
|
||||
- name: ANTHROPIC_API_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ .BYOSSecretName | quote }}
|
||||
key: access_token
|
||||
optional: true
|
||||
- name: ANTHROPIC_BASE_URL
|
||||
value: ""
|
||||
{{- end }}
|
||||
volumeMounts:
|
||||
{{- range .RuntimeRepos }}
|
||||
- name: repo-{{ .Slug }}
|
||||
mountPath: /workspace/{{ .Slug }}
|
||||
{{- end }}
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 3
|
||||
periodSeconds: 5
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 15
|
||||
resources:
|
||||
requests:
|
||||
cpu: "100m"
|
||||
memory: "256Mi"
|
||||
limits:
|
||||
cpu: {{ .Quota.CPU | quote }}
|
||||
memory: {{ .Quota.Memory | quote }}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
readOnlyRootFilesystem: false
|
||||
volumes:
|
||||
{{- range .RuntimeRepos }}
|
||||
- name: repo-{{ .Slug }}
|
||||
persistentVolumeClaim:
|
||||
claimName: repo-{{ .Slug }}
|
||||
{{- end }}
|
||||
terminationGracePeriodSeconds: 30
|
||||
`
|
||||
|
||||
// mcpDeploymentTemplate renders the single-replica "openova-sandbox-mcp"
// Deployment of a Sandbox. The container runs as UID/GID 65532 with a
// read-only root filesystem, reaches pty-server through the in-cluster
// Service URL on port 7681, and receives the Sovereign hot-standby toggle
// plus the primary/replica region pair as env vars.
//
// NOTE(review): LLM_GATEWAY_TOKEN reads the key "llm-gateway-token" from
// .LLMGatewayTokenSecret with optional: true; the NewAPI token Secret
// template stores its value under LLM_GATEWAY_TOKEN — confirm the pairing.
const mcpDeploymentTemplate = `apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: openova-sandbox-mcp
|
||||
namespace: {{ .NamespaceName }}
|
||||
labels:
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
openova.io/sandbox-owner: {{ .OwnerUID }}
|
||||
openova.io/managed-by: catalyst
|
||||
app.kubernetes.io/name: openova-sandbox-mcp
|
||||
app.kubernetes.io/component: mcp-server
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: openova-sandbox-mcp
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: openova-sandbox-mcp
|
||||
app.kubernetes.io/component: mcp-server
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
openova.io/sandbox-owner: {{ .OwnerUID }}
|
||||
openova.io/managed-by: catalyst
|
||||
spec:
|
||||
serviceAccountName: sandbox
|
||||
automountServiceAccountToken: true
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65532
|
||||
runAsGroup: 65532
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: mcp
|
||||
image: {{ .MCPImage | quote }}
|
||||
imagePullPolicy: IfNotPresent
|
||||
env:
|
||||
- name: SANDBOX_OWNER_UID
|
||||
value: {{ .OwnerUID | quote }}
|
||||
- name: SANDBOX_OWNER_EMAIL
|
||||
value: {{ .OwnerEmail | quote }}
|
||||
- name: ORG_ID
|
||||
value: {{ .OrgSlug | quote }}
|
||||
- name: SOVEREIGN_FQDN
|
||||
value: {{ .SovereignFQDN | quote }}
|
||||
- name: PTY_SERVER_URL
|
||||
value: "http://pty-server.{{ .NamespaceName }}.svc.cluster.local:7681"
|
||||
- name: LLM_GATEWAY_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ .LLMGatewayTokenSecret | quote }}
|
||||
key: llm-gateway-token
|
||||
optional: true
|
||||
# ── D31 active-hot-standby — Sovereign-level toggle + region
|
||||
# pair. When SOVEREIGN_ENABLE_HOT_STANDBY parses truthy AND
|
||||
# both region values are non-empty AND distinct, sandbox.db.
|
||||
# provision materialises a primary + replica Cluster.
|
||||
# postgresql.cnpg.io pair instead of a single Cluster (DoD
|
||||
# D31). Default-off keeps every existing Sandbox on single-
|
||||
# Cluster CNPG (zero regression). The values flow:
|
||||
# bootstrap-kit slot 19a envsubst (per-Sovereign overlay)
|
||||
# -> bp-sandbox HelmRelease values
|
||||
# -> sandbox-controller env (host cluster)
|
||||
# -> here, into every per-Sandbox MCP Pod
|
||||
- name: SOVEREIGN_ENABLE_HOT_STANDBY
|
||||
value: {{ .EnableHotStandby | quote }}
|
||||
- name: SOVEREIGN_PRIMARY_REGION
|
||||
value: {{ .PrimaryRegion | quote }}
|
||||
- name: SOVEREIGN_REPLICA_REGION
|
||||
value: {{ .ReplicaRegion | quote }}
|
||||
resources:
|
||||
requests:
|
||||
cpu: "50m"
|
||||
memory: "128Mi"
|
||||
limits:
|
||||
cpu: "500m"
|
||||
memory: "512Mi"
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
readOnlyRootFilesystem: true
|
||||
terminationGracePeriodSeconds: 10
|
||||
`
|
||||
|
||||
// ptyServerServiceTemplate renders the ClusterIP Service "pty-server".
// It selects Pods by app.kubernetes.io/name=pty-server plus the
// openova.io/sandbox label and forwards TCP port 7681 to container port
// 7681. Template fields read: .NamespaceName and .Name.
const ptyServerServiceTemplate = `apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: pty-server
|
||||
namespace: {{ .NamespaceName }}
|
||||
labels:
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
openova.io/managed-by: catalyst
|
||||
app.kubernetes.io/name: pty-server
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app.kubernetes.io/name: pty-server
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
ports:
|
||||
- name: http
|
||||
port: 7681
|
||||
targetPort: 7681
|
||||
protocol: TCP
|
||||
`
|
||||
|
||||
// httpRouteTemplate renders the Gateway API HTTPRoute "pty-server". The
// route attaches to the Gateway "cilium-gateway" in kube-system, matches
// host "sandbox.<SovereignFQDN>" with path prefix "/sessions/<OwnerUID>/",
// rewrites that prefix to "/sessions/" and forwards to the pty-server
// Service on port 7681. Template fields read: .NamespaceName, .Name,
// .SovereignFQDN and .OwnerUID.
const httpRouteTemplate = `apiVersion: gateway.networking.k8s.io/v1
|
||||
kind: HTTPRoute
|
||||
metadata:
|
||||
name: pty-server
|
||||
namespace: {{ .NamespaceName }}
|
||||
labels:
|
||||
openova.io/sandbox: {{ .Name }}
|
||||
openova.io/managed-by: catalyst
|
||||
spec:
|
||||
# Attach to the canonical Cilium Gateway on the host cluster. PR #1641
|
||||
# originally targeted "catalyst-public/catalyst-system/https" — that
|
||||
# Gateway does not exist on a Sovereign. The real public Gateway is
|
||||
# cilium-gateway/kube-system (clusters/_template/sovereign-tls/
|
||||
# cilium-gateway.yaml), matching the placement organization-controller's
|
||||
# tenant_route.go and products/catalyst/chart/templates/httproute.yaml
|
||||
# already use. sectionName is intentionally omitted so the HTTPRoute
|
||||
# attaches to every listener whose hostname matches "sandbox.<sov-fqdn>"
|
||||
# — currently the wildcard *.${SOVEREIGN_FQDN} HTTPS listener
|
||||
# (https-<sov-fqdn-dashed>) per infra/hetzner/main.tf
|
||||
# locals.parent_domains_listeners_yaml fallback path.
|
||||
parentRefs:
|
||||
- name: cilium-gateway
|
||||
namespace: kube-system
|
||||
hostnames:
|
||||
- "sandbox.{{ .SovereignFQDN }}"
|
||||
rules:
|
||||
- matches:
|
||||
- path:
|
||||
type: PathPrefix
|
||||
value: /sessions/{{ .OwnerUID }}/
|
||||
filters:
|
||||
- type: URLRewrite
|
||||
urlRewrite:
|
||||
path:
|
||||
type: ReplacePrefixMatch
|
||||
replacePrefixMatch: /sessions/
|
||||
backendRefs:
|
||||
- name: pty-server
|
||||
port: 7681
|
||||
`
|
||||
|
||||
// kustomizationTemplate renders the kustomization.yaml that lists every
// manifest produced for a Sandbox. The fixed entries are always present;
// secret-newapi-token.yaml is listed only when .HasNewAPIToken is true,
// and one entry is added per element of .RepoPaths.
const kustomizationTemplate = `apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- resourcequota.yaml
|
||||
- serviceaccount.yaml
|
||||
- role.yaml
|
||||
- rolebinding.yaml
|
||||
- secret.yaml
|
||||
{{- if .HasNewAPIToken }}
|
||||
- secret-newapi-token.yaml
|
||||
{{- end }}
|
||||
{{- range .RepoPaths }}
|
||||
- {{ . }}
|
||||
{{- end }}
|
||||
- statefulset-pty-server.yaml
|
||||
- service-pty-server.yaml
|
||||
- deployment-mcp.yaml
|
||||
- httproute-pty-server.yaml
|
||||
`
|
||||
|
||||
// pvcRepoStorageDefault is the storage request Render writes into every
// per-repo PersistentVolumeClaim.
const pvcRepoStorageDefault = "5Gi"
|
||||
|
||||
// Fallbacks Render applies when the matching Inputs field is blank or
// non-positive.
const (
|
||||
// Secret name used when Inputs.LLMGatewayTokenSecret is blank.
defaultLLMGatewayTokenSecret = "sandbox-tokens"
|
||||
// Prefix used when Inputs.BYOSSecretPrefix is blank; Render appends
// "-<OwnerUID>" to form the BYOS Secret name.
defaultBYOSSecretPrefix = "sandbox-byos-claude-code"
|
||||
// Idle timeout (minutes) used when Inputs.IdleTimeoutMinutes <= 0.
defaultIdleTimeoutMinutes = 30
|
||||
// Replica count used when Inputs.Quota.ConcurrentSessions <= 0.
defaultConcurrentSessions = 1
|
||||
)
|
||||
|
||||
// Render returns (path, bytes) tuples the reconciler writes into the
|
||||
// per-Org Gitea repo under `sandbox/<owner-uid>/`.
|
||||
func Render(in Inputs) (map[string][]byte, error) {
|
||||
if strings.TrimSpace(in.Name) == "" {
|
||||
return nil, fmt.Errorf("Inputs.Name is required")
|
||||
}
|
||||
if strings.TrimSpace(in.OwnerUID) == "" {
|
||||
return nil, fmt.Errorf("Inputs.OwnerUID is required")
|
||||
}
|
||||
if strings.TrimSpace(in.OrgSlug) == "" {
|
||||
return nil, fmt.Errorf("Inputs.OrgSlug is required")
|
||||
}
|
||||
if strings.TrimSpace(in.PtyServerImage) == "" {
|
||||
return nil, fmt.Errorf("Inputs.PtyServerImage is required (Wave 8 pty-server StatefulSet has no default image)")
|
||||
}
|
||||
if strings.TrimSpace(in.MCPImage) == "" {
|
||||
return nil, fmt.Errorf("Inputs.MCPImage is required (Wave 8 openova-sandbox-mcp Deployment has no default image)")
|
||||
}
|
||||
if strings.TrimSpace(in.NewapiURL) == "" {
|
||||
return nil, fmt.Errorf("Inputs.NewapiURL is required (newapi-proxy-contract.md §1 — pty-server env LLM_GATEWAY_URL)")
|
||||
}
|
||||
if strings.TrimSpace(in.SovereignFQDN) == "" {
|
||||
return nil, fmt.Errorf("Inputs.SovereignFQDN is required (HTTPRoute hostname binding)")
|
||||
}
|
||||
|
||||
if strings.TrimSpace(in.LLMGatewayTokenSecret) == "" {
|
||||
in.LLMGatewayTokenSecret = defaultLLMGatewayTokenSecret
|
||||
}
|
||||
if strings.TrimSpace(in.BYOSSecretPrefix) == "" {
|
||||
in.BYOSSecretPrefix = defaultBYOSSecretPrefix
|
||||
}
|
||||
if in.IdleTimeoutMinutes <= 0 {
|
||||
in.IdleTimeoutMinutes = defaultIdleTimeoutMinutes
|
||||
}
|
||||
|
||||
ns := fmt.Sprintf("sandbox-%s", in.OwnerUID)
|
||||
|
||||
repos := make([]sandboxapi.SandboxRepo, len(in.Repos))
|
||||
copy(repos, in.Repos)
|
||||
sort.SliceStable(repos, func(i, j int) bool {
|
||||
return repos[i].GiteaRepo < repos[j].GiteaRepo
|
||||
})
|
||||
|
||||
type baseCtx struct {
|
||||
Inputs
|
||||
NamespaceName string
|
||||
}
|
||||
base := baseCtx{Inputs: in, NamespaceName: ns}
|
||||
|
||||
out := make(map[string][]byte, 12+len(repos))
|
||||
|
||||
for path, raw := range map[string]string{
|
||||
"namespace.yaml": namespaceTemplate,
|
||||
"resourcequota.yaml": resourceQuotaTemplate,
|
||||
"serviceaccount.yaml": serviceAccountTemplate,
|
||||
"role.yaml": roleTemplate,
|
||||
"rolebinding.yaml": roleBindingTemplate,
|
||||
"secret.yaml": secretTemplate,
|
||||
} {
|
||||
buf, err := renderTemplate(path, raw, base)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out[path] = buf
|
||||
}
|
||||
|
||||
type pvcCtx struct {
|
||||
Inputs
|
||||
NamespaceName string
|
||||
PVCName string
|
||||
RepoSlug string
|
||||
RepoStorage string
|
||||
}
|
||||
repoPaths := make([]string, 0, len(repos))
|
||||
for _, repo := range repos {
|
||||
slug := sanitizeRepoSlug(repo.GiteaRepo)
|
||||
pvcName := fmt.Sprintf("repo-%s", slug)
|
||||
path := fmt.Sprintf("pvc-%s.yaml", slug)
|
||||
ctx := pvcCtx{
|
||||
Inputs: in,
|
||||
NamespaceName: ns,
|
||||
PVCName: pvcName,
|
||||
RepoSlug: repo.GiteaRepo,
|
||||
RepoStorage: pvcRepoStorageDefault,
|
||||
}
|
||||
buf, err := renderTemplate(path, pvcTemplate, ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out[path] = buf
|
||||
repoPaths = append(repoPaths, path)
|
||||
}
|
||||
|
||||
// NewAPI per-Sandbox bearer Secret — opt-in (only when the caller
|
||||
// supplied a non-empty token; reconciler skips this manifest when
|
||||
// the bridge is unreachable so namespace + RBAC + PVCs still land
|
||||
// without the token-mint side-effect).
|
||||
if strings.TrimSpace(in.NewAPIToken) != "" {
|
||||
secretName := strings.TrimSpace(in.NewAPITokenSecretName)
|
||||
if secretName == "" {
|
||||
secretName = fmt.Sprintf("sandbox-%s-newapi-token", in.OwnerUID)
|
||||
}
|
||||
type tokenCtx struct {
|
||||
Inputs
|
||||
NamespaceName string
|
||||
SecretName string
|
||||
Token string
|
||||
ExpiresAt string
|
||||
RotatedAt string
|
||||
}
|
||||
buf, err := renderTemplate("secret-newapi-token.yaml", newapiTokenSecretTemplate, tokenCtx{
|
||||
Inputs: in,
|
||||
NamespaceName: ns,
|
||||
SecretName: secretName,
|
||||
Token: in.NewAPIToken,
|
||||
ExpiresAt: in.NewAPITokenExpiresAt,
|
||||
RotatedAt: in.NewAPITokenRotatedAt,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out["secret-newapi-token.yaml"] = buf
|
||||
}
|
||||
|
||||
// Kustomization stitching — sorted repoPaths keeps output stable.
|
||||
sort.Strings(repoPaths)
|
||||
type kustCtx struct {
|
||||
Inputs
|
||||
NamespaceName string
|
||||
RepoPaths []string
|
||||
HasNewAPIToken bool
|
||||
}
|
||||
kustBuf, err := renderTemplate("kustomization.yaml", kustomizationTemplate, kustCtx{
|
||||
Inputs: in,
|
||||
NamespaceName: ns,
|
||||
RepoPaths: repoPaths,
|
||||
HasNewAPIToken: strings.TrimSpace(in.NewAPIToken) != "",
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out["kustomization.yaml"] = kustBuf
|
||||
|
||||
// Wave 8 runtime — pty-server StatefulSet, MCP Deployment,
|
||||
// pty-server Service, HTTPRoute.
|
||||
type runtimeRepo struct {
|
||||
Slug string
|
||||
}
|
||||
runtimeRepos := make([]runtimeRepo, 0, len(repos))
|
||||
for _, r := range repos {
|
||||
runtimeRepos = append(runtimeRepos, runtimeRepo{Slug: sanitizeRepoSlug(r.GiteaRepo)})
|
||||
}
|
||||
replicas := in.Quota.ConcurrentSessions
|
||||
if replicas <= 0 {
|
||||
replicas = defaultConcurrentSessions
|
||||
}
|
||||
byosActive := agentInCatalogue(in.AgentCatalogue, "claude-code")
|
||||
byosSecretName := fmt.Sprintf("%s-%s", in.BYOSSecretPrefix, in.OwnerUID)
|
||||
|
||||
type runtimeCtx struct {
|
||||
Inputs
|
||||
NamespaceName string
|
||||
Replicas int
|
||||
RuntimeRepos []runtimeRepo
|
||||
ClaudeCodeBYOSActive bool
|
||||
BYOSSecretName string
|
||||
}
|
||||
rctx := runtimeCtx{
|
||||
Inputs: in,
|
||||
NamespaceName: ns,
|
||||
Replicas: replicas,
|
||||
RuntimeRepos: runtimeRepos,
|
||||
ClaudeCodeBYOSActive: byosActive,
|
||||
BYOSSecretName: byosSecretName,
|
||||
}
|
||||
for path, raw := range map[string]string{
|
||||
"statefulset-pty-server.yaml": ptyServerStatefulSetTemplate,
|
||||
"service-pty-server.yaml": ptyServerServiceTemplate,
|
||||
"deployment-mcp.yaml": mcpDeploymentTemplate,
|
||||
"httproute-pty-server.yaml": httpRouteTemplate,
|
||||
} {
|
||||
buf, err := renderTemplate(path, raw, rctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out[path] = buf
|
||||
}
|
||||
_ = base
|
||||
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// agentInCatalogue reports whether agent appears in catalogue. Both sides
// are compared after trimming surrounding whitespace and lower-casing.
func agentInCatalogue(catalogue []string, agent string) bool {
	normalize := func(s string) string {
		return strings.ToLower(strings.TrimSpace(s))
	}

	target := normalize(agent)
	for i := range catalogue {
		if normalize(catalogue[i]) == target {
			return true
		}
	}
	return false
}
|
||||
|
||||
func renderTemplate(name, raw string, data any) ([]byte, error) {
|
||||
t, err := template.New(name).Funcs(funcs()).Parse(raw)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("template parse %s: %w", name, err)
|
||||
}
|
||||
var buf bytes.Buffer
|
||||
if err := t.Execute(&buf, data); err != nil {
|
||||
return nil, fmt.Errorf("template execute %s: %w", name, err)
|
||||
}
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
// funcs returns the helper functions available to every manifest template.
//
// quote formats its argument with %v and wraps the result in a
// double-quoted, Go-escaped string literal.
func funcs() template.FuncMap {
	quote := func(v any) string {
		return fmt.Sprintf("%q", fmt.Sprint(v))
	}
	return template.FuncMap{"quote": quote}
}
|
||||
|
||||
// sanitizeRepoSlug turns a repo reference into a slug that is safe to embed
// in Kubernetes object names.
//
// The input is trimmed and lower-cased; [a-z0-9] is kept; '/', '_', '.',
// ' ' and '-' become '-'; every other rune is dropped. Runs of '-' collapse
// to one and leading/trailing '-' are removed. The result may be empty when
// the input holds no usable character.
//
// The slug is capped so that "repo-"+slug still fits the 63-character DNS
// label limit, because the manifests use that string as a Pod volume name
// (the previous 200-character cap produced volume names the API server
// rejects). Slugs of up to 58 characters are unaffected.
func sanitizeRepoSlug(s string) string {
	const maxSlugLen = 63 - len("repo-")

	var b strings.Builder
	// Starting in the "just wrote a dash" state suppresses leading dashes.
	lastWasDash := true
	for _, r := range strings.ToLower(strings.TrimSpace(s)) {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9':
			b.WriteRune(r)
			lastWasDash = false
		case r == '-', r == '/', r == '_', r == '.', r == ' ':
			if !lastWasDash {
				b.WriteByte('-')
				lastWasDash = true
			}
		}
	}

	out := strings.TrimRight(b.String(), "-")
	if len(out) > maxSlugLen {
		// out is pure ASCII here, so byte slicing cannot split a rune.
		out = strings.TrimRight(out[:maxSlugLen], "-")
	}
	return out
}
|
||||
@ -1,444 +0,0 @@
|
||||
// Package idlescaler hosts the IdleScaler — the Wave 10 (PR #1641 follow-up)
|
||||
// goroutine that scales pty-server StatefulSets to 0 replicas after the
|
||||
// configured idle window has elapsed (architecture.md §1 idle policy).
|
||||
//
|
||||
// PR #1641 shipped the `openova.io/sandbox-idle-timeout-minutes`
|
||||
// annotation on every pty-server StatefulSet but no controller was
|
||||
// reading it. This package closes that loop:
|
||||
//
|
||||
// 1. Every Interval (default 60s) the IdleScaler lists every
|
||||
// StatefulSet labeled `app.kubernetes.io/component=pty-server` AND
|
||||
// `openova.io/managed-by=catalyst` across all `sandbox-*` namespaces
|
||||
// visible to the controller's client.
|
||||
//
|
||||
// 2. For each StatefulSet, it reads the idle-timeout annotation. If
|
||||
// absent or unparseable, it falls back to the controller-level
|
||||
// default (typically env SANDBOX_IDLE_TIMEOUT_MINUTES, 30 min).
|
||||
//
|
||||
// 3. It polls the StatefulSet's pty-server Service at
|
||||
// `http://pty-server.<ns>.svc.cluster.local:7681/idle` (the Service
|
||||
// name + port are written by the renderer in
|
||||
// core/controllers/sandbox/internal/gitops/manifests.go) — the
|
||||
// handler is contributed by products/sandbox/pty-server/internal/
|
||||
// server/routes.go and returns the in-memory lastActivityAt +
|
||||
// activeSessions counters.
|
||||
//
|
||||
// 4. It stamps `openova.io/sandbox-last-activity-at` (RFC3339) onto
|
||||
// the StatefulSet so a future operator inspecting `kubectl get
|
||||
// statefulset -o yaml` can see what the scaler observed.
|
||||
//
|
||||
// 5. If `now - lastActivityAt > idleTimeout` AND activeSessions == 0
|
||||
// AND spec.replicas > 0, the IdleScaler patches spec.replicas = 0.
|
||||
// The Sandbox reconciler will bump replicas back to
|
||||
// spec.quota.concurrentSessions the next time anything touches the
|
||||
// parent Sandbox CR (a tab connect, a session create, a CR edit).
|
||||
//
|
||||
// Mutation scope: the IdleScaler ONLY ever scales pty-server
|
||||
// StatefulSets — its OWN managed resource (architecture.md §7 — those
|
||||
// StatefulSets are written by the sandbox-controller renderer). It
|
||||
// never patches anything outside the `sandbox-*` namespace + the
|
||||
// `app.kubernetes.io/component=pty-server` label.
|
||||
package idlescaler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
)
|
||||
|
||||
// Label selector the scaler uses to find pty-server StatefulSets. Both
// labels are written by the renderer (manifests.go
// ptyServerStatefulSetTemplate).
const (
	// LabelComponent is the component label key.
	LabelComponent = "app.kubernetes.io/component"
	// ComponentValue is the LabelComponent value that is selected.
	ComponentValue = "pty-server"
	// LabelManagedBy is the ownership label key; it acts as a second
	// safety filter so foreign StatefulSets are never touched.
	LabelManagedBy = "openova.io/managed-by"
	// ManagedByValue is the LabelManagedBy value that is selected.
	ManagedByValue = "catalyst"
)

// Annotations read from or written to a pty-server StatefulSet.
const (
	// AnnIdleTimeoutMinutes is written by the renderer and overrides the
	// controller-level default idle timeout for one StatefulSet.
	AnnIdleTimeoutMinutes = "openova.io/sandbox-idle-timeout-minutes"
	// AnnLastActivityAt is written by the IdleScaler as an RFC3339 (UTC)
	// timestamp so operators and dashboards can see the last observed
	// activity without calling the pty-server endpoint.
	AnnLastActivityAt = "openova.io/sandbox-last-activity-at"
	// AnnIdleScalingDisabled is written by the renderer when the Sandbox
	// CR carries `spec.idleScaling.enabled=false` (TBD-D8b #1725 —
	// long-running agent workloads that must stay Running). A truthy
	// value (1, t, true — case-insensitive) makes the IdleScaler skip the
	// StatefulSet on every pass.
	AnnIdleScalingDisabled = "openova.io/sandbox-idle-scaling-disabled"
)

// Addressing of the pty-server instances the scaler probes.
const (
	// NamespacePrefix restricts the scaler to renderer-created namespaces
	// (`sandbox-<owner-uid>`); a labelled StatefulSet outside this prefix
	// is ignored.
	NamespacePrefix = "sandbox-"
	// PtyServicePort mirrors the port in the renderer's
	// ptyServerServiceTemplate (7681); the two must change together.
	PtyServicePort = 7681
	// IdlePath is the pty-server endpoint (routes.go) that reports idle
	// state.
	IdlePath = "/idle"
)
|
||||
|
||||
// idleDTO is the JSON payload decoded from the pty-server /idle endpoint.
// It duplicates the idleDTO declared in
// products/sandbox/pty-server/internal/server/routes.go on purpose (no
// cross-module type import); both copies are expected to change in the
// same PR.
type idleDTO struct {
	// LastActivityAt is the most recent activity time reported by pty-server.
	LastActivityAt time.Time `json:"lastActivityAt"`
	// ActiveSessions is the number of sessions pty-server reports as open.
	ActiveSessions int `json:"activeSessions"`
}
|
||||
|
||||
// Options carries the tunables accepted by New. Every field is optional;
// New substitutes a default for any zero value.
type Options struct {
	// Interval is the time between scaler passes (default 60s).
	Interval time.Duration
	// DefaultIdleTimeoutMinutes applies to a StatefulSet whose
	// idle-timeout annotation is missing or unparseable (default 30).
	// Pass the controller's SANDBOX_IDLE_TIMEOUT_MINUTES value so both
	// code paths agree.
	DefaultIdleTimeoutMinutes int
	// HTTPTimeout caps one /idle probe (default 5s).
	HTTPTimeout time.Duration
	// HTTPClient lets tests inject a client. When nil, New builds one
	// with HTTPTimeout as its timeout.
	HTTPClient *http.Client
	// ProbeURL lets tests inject the probe address. When nil, the
	// cluster-DNS form
	// `http://pty-server.<ns>.svc.cluster.local:7681/idle` is used.
	ProbeURL func(namespace string) string
	// Now lets tests inject a clock. When nil, time.Now().UTC() is used.
	Now func() time.Time
}
|
||||
|
||||
// Scaler is the IdleScaler runtime. Construct via New, register with
|
||||
// the controller-runtime manager via mgr.Add(s) (Scaler implements
|
||||
// manager.Runnable + manager.LeaderElectionRunnable so only the
|
||||
// elected leader scales — peers stay idle).
|
||||
type Scaler struct {
|
||||
client client.Client
|
||||
log logr.Logger
|
||||
|
||||
interval time.Duration
|
||||
defaultTimeout time.Duration
|
||||
httpClient *http.Client
|
||||
probeURL func(string) string
|
||||
now func() time.Time
|
||||
}
|
||||
|
||||
// New returns a Scaler ready to register with a controller-runtime manager.
|
||||
func New(c client.Client, log logr.Logger, opts Options) *Scaler {
|
||||
if opts.Interval <= 0 {
|
||||
opts.Interval = 60 * time.Second
|
||||
}
|
||||
if opts.DefaultIdleTimeoutMinutes <= 0 {
|
||||
opts.DefaultIdleTimeoutMinutes = 30
|
||||
}
|
||||
if opts.HTTPTimeout <= 0 {
|
||||
opts.HTTPTimeout = 5 * time.Second
|
||||
}
|
||||
httpc := opts.HTTPClient
|
||||
if httpc == nil {
|
||||
httpc = &http.Client{Timeout: opts.HTTPTimeout}
|
||||
}
|
||||
probe := opts.ProbeURL
|
||||
if probe == nil {
|
||||
probe = func(ns string) string {
|
||||
return fmt.Sprintf("http://pty-server.%s.svc.cluster.local:%d%s",
|
||||
ns, PtyServicePort, IdlePath)
|
||||
}
|
||||
}
|
||||
now := opts.Now
|
||||
if now == nil {
|
||||
now = func() time.Time { return time.Now().UTC() }
|
||||
}
|
||||
return &Scaler{
|
||||
client: c,
|
||||
log: log,
|
||||
interval: opts.Interval,
|
||||
defaultTimeout: time.Duration(opts.DefaultIdleTimeoutMinutes) * time.Minute,
|
||||
httpClient: httpc,
|
||||
probeURL: probe,
|
||||
now: now,
|
||||
}
|
||||
}
|
||||
|
||||
// Start runs the scaler loop until ctx is cancelled. Satisfies
|
||||
// controller-runtime's manager.Runnable interface — register via
|
||||
// `mgr.Add(scaler)`.
|
||||
func (s *Scaler) Start(ctx context.Context) error {
|
||||
s.log.Info("idle-scaler starting",
|
||||
"interval", s.interval,
|
||||
"default_timeout", s.defaultTimeout)
|
||||
|
||||
// Tick once on startup so we don't wait `interval` before the
|
||||
// first reconciliation pass.
|
||||
if err := s.runOnce(ctx); err != nil {
|
||||
s.log.Error(err, "idle-scaler initial pass failed (non-fatal — will retry)")
|
||||
}
|
||||
|
||||
t := time.NewTicker(s.interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
s.log.Info("idle-scaler shutting down")
|
||||
return nil
|
||||
case <-t.C:
|
||||
if err := s.runOnce(ctx); err != nil {
|
||||
s.log.Error(err, "idle-scaler tick failed (non-fatal — will retry)")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NeedLeaderElection makes the scaler a singleton across HA replicas.
|
||||
// Peers stay idle so we never race a scale-to-zero against a scale-back-up.
|
||||
func (s *Scaler) NeedLeaderElection() bool { return true }
|
||||
|
||||
// runOnce is one IdleScaler pass.
|
||||
func (s *Scaler) runOnce(ctx context.Context) error {
|
||||
sel, err := labels.Parse(fmt.Sprintf("%s=%s,%s=%s",
|
||||
LabelComponent, ComponentValue,
|
||||
LabelManagedBy, ManagedByValue))
|
||||
if err != nil {
|
||||
return fmt.Errorf("build label selector: %w", err)
|
||||
}
|
||||
|
||||
var list appsv1.StatefulSetList
|
||||
if err := s.client.List(ctx, &list, &client.ListOptions{LabelSelector: sel}); err != nil {
|
||||
return fmt.Errorf("list pty-server statefulsets: %w", err)
|
||||
}
|
||||
|
||||
now := s.now()
|
||||
scanned := 0
|
||||
idled := 0
|
||||
for i := range list.Items {
|
||||
ss := &list.Items[i]
|
||||
if !strings.HasPrefix(ss.Namespace, NamespacePrefix) {
|
||||
// Defence in depth — we never touch SS outside
|
||||
// `sandbox-*` even if a stray label leaked elsewhere.
|
||||
continue
|
||||
}
|
||||
scanned++
|
||||
if didScale, err := s.processOne(ctx, ss, now); err != nil {
|
||||
s.log.Error(err, "idle-scaler: process statefulset failed",
|
||||
"namespace", ss.Namespace, "name", ss.Name)
|
||||
continue
|
||||
} else if didScale {
|
||||
idled++
|
||||
}
|
||||
}
|
||||
s.log.V(1).Info("idle-scaler pass done", "scanned", scanned, "idled", idled)
|
||||
return nil
|
||||
}
|
||||
|
||||
// processOne returns (didScale, err). didScale is true if this pass
|
||||
// scaled spec.replicas to 0.
|
||||
func (s *Scaler) processOne(ctx context.Context, ss *appsv1.StatefulSet, now time.Time) (bool, error) {
|
||||
log := s.log.WithValues("namespace", ss.Namespace, "name", ss.Name)
|
||||
|
||||
// TBD-D8b #1725 — per-Sandbox opt-out. The renderer stamps the
|
||||
// disabled annotation when Sandbox.spec.idleScaling.enabled=false.
|
||||
// Skip entirely: no probe, no annotation patch, no scale decision.
|
||||
if isIdleScalingDisabled(ss) {
|
||||
log.V(1).Info("idle-scaler: skipping (idle-scaling disabled per CR)")
|
||||
return false, nil
|
||||
}
|
||||
|
||||
timeout := s.timeoutFor(ss)
|
||||
|
||||
// Probe the in-cluster service for the in-memory activity counter.
|
||||
dto, probeErr := s.probe(ctx, ss.Namespace)
|
||||
|
||||
// Decide the canonical lastActivity to stamp.
|
||||
//
|
||||
// Probe-success path: trust the live counter.
|
||||
// Probe-failure path: keep the existing annotation (don't reset)
|
||||
// — a service may be unreachable briefly during
|
||||
// Pod restart; the next tick will catch up.
|
||||
var lastActivity time.Time
|
||||
var activeSessions int
|
||||
if probeErr == nil {
|
||||
lastActivity = dto.LastActivityAt.UTC()
|
||||
activeSessions = dto.ActiveSessions
|
||||
} else {
|
||||
// Existing annotation as fallback.
|
||||
if existing, ok := ss.Annotations[AnnLastActivityAt]; ok {
|
||||
if t, perr := time.Parse(time.RFC3339, existing); perr == nil {
|
||||
lastActivity = t.UTC()
|
||||
}
|
||||
}
|
||||
// If we have neither a probe nor a prior annotation, we
|
||||
// can't make an idle decision yet. Skip — next tick.
|
||||
if lastActivity.IsZero() {
|
||||
log.V(1).Info("idle-scaler: probe failed and no prior annotation, skipping",
|
||||
"err", probeErr.Error())
|
||||
return false, nil
|
||||
}
|
||||
log.V(1).Info("idle-scaler: probe failed, using prior annotation",
|
||||
"last_activity", lastActivity.Format(time.RFC3339),
|
||||
"err", probeErr.Error())
|
||||
}
|
||||
|
||||
// Stamp the annotation (probe-success only — otherwise we'd
|
||||
// overwrite a stale value with the same stale value, which is a
|
||||
// no-op patch but still chatty).
|
||||
if probeErr == nil {
|
||||
if err := s.stampAnnotation(ctx, ss, lastActivity); err != nil {
|
||||
log.Error(err, "idle-scaler: stamp last-activity annotation failed")
|
||||
// non-fatal — fall through to the scale decision so a
|
||||
// degraded annotation-patch path can't keep a Pod alive
|
||||
// forever.
|
||||
}
|
||||
}
|
||||
|
||||
if activeSessions > 0 {
|
||||
// Sessions are open; never scale down even if lastActivity
|
||||
// drifts (lastActivity covers the trailing edge after the
|
||||
// last WS frame — Touch() also fires on attach/detach).
|
||||
return false, nil
|
||||
}
|
||||
|
||||
idleFor := now.Sub(lastActivity)
|
||||
if idleFor < timeout {
|
||||
log.V(1).Info("idle-scaler: not yet idle",
|
||||
"idle_for", idleFor.String(),
|
||||
"timeout", timeout.String())
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Already scaled to zero? Skip the patch — idempotent.
|
||||
if ss.Spec.Replicas != nil && *ss.Spec.Replicas == 0 {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
log.Info("idle-scaler: scaling pty-server to 0",
|
||||
"idle_for", idleFor.String(),
|
||||
"timeout", timeout.String(),
|
||||
"last_activity", lastActivity.Format(time.RFC3339))
|
||||
if err := s.scaleToZero(ctx, ss); err != nil {
|
||||
return false, fmt.Errorf("scale to zero: %w", err)
|
||||
}
|
||||
// Wave 15 (PR #1674 follow-up) — emit the canonical idle-timeout
|
||||
// counter so the Grafana "Idle-Timeout Scale-Down Events / hour"
|
||||
// panel ticks. Labelled by namespace to match the dashboard's
|
||||
// `sum by (namespace) (rate(...))` aggregation.
|
||||
idleTimeoutEvents.WithLabelValues(ss.Namespace).Inc()
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// isIdleScalingDisabled reports whether the StatefulSet carries the
|
||||
// `openova.io/sandbox-idle-scaling-disabled` annotation set to a truthy
|
||||
// value (TBD-D8b #1725). The annotation is renderer-stamped from
|
||||
// Sandbox.spec.idleScaling.enabled=false; absence (or any other value)
|
||||
// keeps the StatefulSet subject to the scaler. The check is
|
||||
// quote-tolerant for the same reason timeoutFor is.
|
||||
func isIdleScalingDisabled(ss *appsv1.StatefulSet) bool {
|
||||
v, ok := ss.Annotations[AnnIdleScalingDisabled]
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
v = strings.Trim(strings.TrimSpace(v), "\"'")
|
||||
switch strings.ToLower(v) {
|
||||
case "1", "t", "true", "yes", "y":
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (s *Scaler) timeoutFor(ss *appsv1.StatefulSet) time.Duration {
|
||||
if v, ok := ss.Annotations[AnnIdleTimeoutMinutes]; ok {
|
||||
// The renderer writes the annotation as a quoted integer
|
||||
// (manifests.go uses {{ .IdleTimeoutMinutes | quote }}); the
|
||||
// API server stores annotations verbatim so we get the raw
|
||||
// string back. strconv.Atoi handles the unquoted form; we
|
||||
// trim quotes defensively in case operators hand-edited.
|
||||
v = strings.Trim(strings.TrimSpace(v), "\"'")
|
||||
if n, err := strconv.Atoi(v); err == nil && n > 0 {
|
||||
return time.Duration(n) * time.Minute
|
||||
}
|
||||
}
|
||||
return s.defaultTimeout
|
||||
}
|
||||
|
||||
func (s *Scaler) probe(ctx context.Context, namespace string) (idleDTO, error) {
|
||||
url := s.probeURL(namespace)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return idleDTO{}, fmt.Errorf("build request: %w", err)
|
||||
}
|
||||
resp, err := s.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return idleDTO{}, fmt.Errorf("%s: %w", url, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return idleDTO{}, fmt.Errorf("%s: status %d", url, resp.StatusCode)
|
||||
}
|
||||
var dto idleDTO
|
||||
if err := json.NewDecoder(resp.Body).Decode(&dto); err != nil {
|
||||
return idleDTO{}, fmt.Errorf("decode %s body: %w", url, err)
|
||||
}
|
||||
return dto, nil
|
||||
}
|
||||
|
||||
func (s *Scaler) stampAnnotation(ctx context.Context, ss *appsv1.StatefulSet, lastActivity time.Time) error {
|
||||
stamp := lastActivity.UTC().Format(time.RFC3339)
|
||||
if existing, ok := ss.Annotations[AnnLastActivityAt]; ok && existing == stamp {
|
||||
// Already up-to-date (typical when probe + last poll agree
|
||||
// within a second). Skip the patch.
|
||||
return nil
|
||||
}
|
||||
|
||||
// JSON-Merge-Patch body — only the annotation we care about.
|
||||
// strategic-merge-patch over annotations is equivalent here since
|
||||
// metav1.ObjectMeta.Annotations is a map (additive merge).
|
||||
patch := []byte(fmt.Sprintf(
|
||||
`{"metadata":{"annotations":{%q:%q}}}`,
|
||||
AnnLastActivityAt, stamp))
|
||||
|
||||
if err := s.client.Patch(ctx, ss, client.RawPatch(client.Merge.Type(), patch)); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
// The StatefulSet was deleted between List and Patch —
|
||||
// nothing to do.
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
// Reflect the new value in our in-memory copy too so subsequent
|
||||
// runOnce passes within this process don't re-stamp.
|
||||
if ss.Annotations == nil {
|
||||
ss.Annotations = map[string]string{}
|
||||
}
|
||||
ss.Annotations[AnnLastActivityAt] = stamp
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Scaler) scaleToZero(ctx context.Context, ss *appsv1.StatefulSet) error {
|
||||
patch := []byte(`{"spec":{"replicas":0}}`)
|
||||
if err := s.client.Patch(ctx, ss, client.RawPatch(client.Merge.Type(), patch)); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
// Keep our local copy consistent for the rest of this pass.
|
||||
var zero int32 = 0
|
||||
ss.Spec.Replicas = &zero
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -1,488 +0,0 @@
|
||||
// idlescaler_test.go — coverage for the Wave 10 IdleScaler.
|
||||
//
|
||||
// Strategy: drive the scaler with a fake controller-runtime client +
|
||||
// a localhost httptest server that mimics pty-server's /idle endpoint.
|
||||
// We assert the following trajectories:
|
||||
//
|
||||
// (1) Active pty-server with no idle time → no scale, no harm.
|
||||
// (2) Idle pty-server past timeout → spec.replicas patched to 0.
|
||||
// (3) activeSessions > 0 keeps the pod alive even past timeout.
|
||||
// (4) /idle probe failure with NO prior annotation → skip (next tick).
|
||||
// (5) Per-StatefulSet annotation overrides the controller default.
|
||||
// (6) StatefulSets outside `sandbox-*` namespace are ignored (defence
|
||||
// in depth).
|
||||
// (7) StatefulSets already at replicas=0 are not re-patched.
|
||||
|
||||
package idlescaler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
prometheustestutil "github.com/prometheus/client_golang/prometheus/testutil"
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client/fake"
|
||||
)
|
||||
|
||||
// helper to assemble a pty-server StatefulSet with the labels +
|
||||
// annotations the renderer writes.
|
||||
func ptyStatefulSet(namespace, name string, replicas int32, annotations map[string]string) *appsv1.StatefulSet {
|
||||
ann := map[string]string{}
|
||||
for k, v := range annotations {
|
||||
ann[k] = v
|
||||
}
|
||||
return &appsv1.StatefulSet{
|
||||
TypeMeta: metav1.TypeMeta{
|
||||
APIVersion: "apps/v1",
|
||||
Kind: "StatefulSet",
|
||||
},
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
Namespace: namespace,
|
||||
Labels: map[string]string{
|
||||
LabelComponent: ComponentValue,
|
||||
LabelManagedBy: ManagedByValue,
|
||||
},
|
||||
Annotations: ann,
|
||||
},
|
||||
Spec: appsv1.StatefulSetSpec{
|
||||
Replicas: &replicas,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func newFakeClient(t *testing.T, objs ...client.Object) client.Client {
|
||||
t.Helper()
|
||||
scheme := runtime.NewScheme()
|
||||
if err := clientgoscheme.AddToScheme(scheme); err != nil {
|
||||
t.Fatalf("add clientgo scheme: %v", err)
|
||||
}
|
||||
return fake.NewClientBuilder().
|
||||
WithScheme(scheme).
|
||||
WithObjects(objs...).
|
||||
Build()
|
||||
}
|
||||
|
||||
// helper — make a pty-server probe target. fn decides response per ns.
|
||||
func newProbeServer(t *testing.T, fn func(ns string) (idleDTO, bool)) (*httptest.Server, func(ns string) string) {
|
||||
t.Helper()
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/idle/", func(w http.ResponseWriter, r *http.Request) {
|
||||
ns := r.URL.Path[len("/idle/"):]
|
||||
dto, ok := fn(ns)
|
||||
if !ok {
|
||||
http.Error(w, "no", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(dto)
|
||||
})
|
||||
srv := httptest.NewServer(mux)
|
||||
t.Cleanup(srv.Close)
|
||||
build := func(ns string) string { return srv.URL + "/idle/" + ns }
|
||||
return srv, build
|
||||
}
|
||||
|
||||
// (1) Active pty-server, very recent activity → no scale.
|
||||
func TestProcessOne_NotIdle_NoScale(t *testing.T) {
|
||||
t.Parallel()
|
||||
ss := ptyStatefulSet("sandbox-emrah", "pty-server", 2,
|
||||
map[string]string{AnnIdleTimeoutMinutes: "30"})
|
||||
|
||||
c := newFakeClient(t, ss)
|
||||
_, probe := newProbeServer(t, func(ns string) (idleDTO, bool) {
|
||||
return idleDTO{
|
||||
LastActivityAt: time.Now().UTC().Add(-1 * time.Minute),
|
||||
ActiveSessions: 0,
|
||||
}, true
|
||||
})
|
||||
|
||||
now := time.Now().UTC()
|
||||
s := New(c, logr.Discard(), Options{
|
||||
DefaultIdleTimeoutMinutes: 30,
|
||||
ProbeURL: probe,
|
||||
Now: func() time.Time { return now },
|
||||
})
|
||||
|
||||
if err := s.runOnce(context.Background()); err != nil {
|
||||
t.Fatalf("runOnce: %v", err)
|
||||
}
|
||||
var got appsv1.StatefulSet
|
||||
if err := c.Get(context.Background(),
|
||||
client.ObjectKey{Namespace: "sandbox-emrah", Name: "pty-server"}, &got); err != nil {
|
||||
t.Fatalf("get post-pass: %v", err)
|
||||
}
|
||||
if got.Spec.Replicas == nil || *got.Spec.Replicas != 2 {
|
||||
t.Errorf("replicas: got %v want 2 (not idle yet, must not scale)", got.Spec.Replicas)
|
||||
}
|
||||
if _, ok := got.Annotations[AnnLastActivityAt]; !ok {
|
||||
t.Errorf("expected %s annotation to be stamped on success probe", AnnLastActivityAt)
|
||||
}
|
||||
}
|
||||
|
||||
// (2) Idle past timeout, no active sessions → scale to zero.
|
||||
func TestProcessOne_Idle_ScalesToZero(t *testing.T) {
|
||||
t.Parallel()
|
||||
ss := ptyStatefulSet("sandbox-emrah", "pty-server", 3,
|
||||
map[string]string{AnnIdleTimeoutMinutes: "30"})
|
||||
|
||||
c := newFakeClient(t, ss)
|
||||
now := time.Now().UTC()
|
||||
stale := now.Add(-45 * time.Minute) // past the 30-min annotation timeout
|
||||
_, probe := newProbeServer(t, func(ns string) (idleDTO, bool) {
|
||||
return idleDTO{LastActivityAt: stale, ActiveSessions: 0}, true
|
||||
})
|
||||
|
||||
s := New(c, logr.Discard(), Options{
|
||||
DefaultIdleTimeoutMinutes: 30,
|
||||
ProbeURL: probe,
|
||||
Now: func() time.Time { return now },
|
||||
})
|
||||
|
||||
if err := s.runOnce(context.Background()); err != nil {
|
||||
t.Fatalf("runOnce: %v", err)
|
||||
}
|
||||
var got appsv1.StatefulSet
|
||||
if err := c.Get(context.Background(),
|
||||
client.ObjectKey{Namespace: "sandbox-emrah", Name: "pty-server"}, &got); err != nil {
|
||||
t.Fatalf("get post-pass: %v", err)
|
||||
}
|
||||
if got.Spec.Replicas == nil || *got.Spec.Replicas != 0 {
|
||||
t.Errorf("replicas: got %v want 0 (idle past timeout)", got.Spec.Replicas)
|
||||
}
|
||||
if got.Annotations[AnnLastActivityAt] == "" {
|
||||
t.Errorf("expected %s annotation to be stamped before scale", AnnLastActivityAt)
|
||||
}
|
||||
}
|
||||
|
||||
// (3) activeSessions > 0 keeps the pod alive even past timeout.
|
||||
func TestProcessOne_ActiveSessions_NeverScales(t *testing.T) {
|
||||
t.Parallel()
|
||||
ss := ptyStatefulSet("sandbox-emrah", "pty-server", 3,
|
||||
map[string]string{AnnIdleTimeoutMinutes: "5"})
|
||||
|
||||
c := newFakeClient(t, ss)
|
||||
now := time.Now().UTC()
|
||||
stale := now.Add(-2 * time.Hour) // way past timeout
|
||||
_, probe := newProbeServer(t, func(ns string) (idleDTO, bool) {
|
||||
return idleDTO{LastActivityAt: stale, ActiveSessions: 2}, true
|
||||
})
|
||||
|
||||
s := New(c, logr.Discard(), Options{
|
||||
DefaultIdleTimeoutMinutes: 5,
|
||||
ProbeURL: probe,
|
||||
Now: func() time.Time { return now },
|
||||
})
|
||||
|
||||
if err := s.runOnce(context.Background()); err != nil {
|
||||
t.Fatalf("runOnce: %v", err)
|
||||
}
|
||||
var got appsv1.StatefulSet
|
||||
if err := c.Get(context.Background(),
|
||||
client.ObjectKey{Namespace: "sandbox-emrah", Name: "pty-server"}, &got); err != nil {
|
||||
t.Fatalf("get post-pass: %v", err)
|
||||
}
|
||||
if got.Spec.Replicas == nil || *got.Spec.Replicas != 3 {
|
||||
t.Errorf("replicas: got %v want 3 (activeSessions > 0 must keep pod alive)",
|
||||
got.Spec.Replicas)
|
||||
}
|
||||
}
|
||||
|
||||
// (4) Probe failure with no prior annotation → skip (no scale, no
|
||||
// annotation written).
|
||||
func TestProcessOne_ProbeFailNoPriorAnnotation_Skips(t *testing.T) {
|
||||
t.Parallel()
|
||||
ss := ptyStatefulSet("sandbox-emrah", "pty-server", 2,
|
||||
map[string]string{AnnIdleTimeoutMinutes: "5"})
|
||||
|
||||
c := newFakeClient(t, ss)
|
||||
now := time.Now().UTC()
|
||||
// probe returns 404
|
||||
_, probe := newProbeServer(t, func(ns string) (idleDTO, bool) {
|
||||
return idleDTO{}, false
|
||||
})
|
||||
|
||||
s := New(c, logr.Discard(), Options{
|
||||
DefaultIdleTimeoutMinutes: 5,
|
||||
ProbeURL: probe,
|
||||
Now: func() time.Time { return now },
|
||||
})
|
||||
|
||||
if err := s.runOnce(context.Background()); err != nil {
|
||||
t.Fatalf("runOnce: %v", err)
|
||||
}
|
||||
var got appsv1.StatefulSet
|
||||
if err := c.Get(context.Background(),
|
||||
client.ObjectKey{Namespace: "sandbox-emrah", Name: "pty-server"}, &got); err != nil {
|
||||
t.Fatalf("get post-pass: %v", err)
|
||||
}
|
||||
if got.Spec.Replicas == nil || *got.Spec.Replicas != 2 {
|
||||
t.Errorf("replicas: got %v want 2 (probe failed, no decision)",
|
||||
got.Spec.Replicas)
|
||||
}
|
||||
if _, ok := got.Annotations[AnnLastActivityAt]; ok {
|
||||
t.Errorf("annotation: got %v, expected NO annotation when probe fails",
|
||||
got.Annotations[AnnLastActivityAt])
|
||||
}
|
||||
}
|
||||
|
||||
// (5) Per-StatefulSet annotation override the controller default.
|
||||
func TestProcessOne_AnnotationOverridesDefault(t *testing.T) {
|
||||
t.Parallel()
|
||||
// SS says timeout is 5 minutes; controller default is 60 (would
|
||||
// not have scaled at 10min idle).
|
||||
ss := ptyStatefulSet("sandbox-emrah", "pty-server", 1,
|
||||
map[string]string{AnnIdleTimeoutMinutes: "5"})
|
||||
|
||||
c := newFakeClient(t, ss)
|
||||
now := time.Now().UTC()
|
||||
_, probe := newProbeServer(t, func(ns string) (idleDTO, bool) {
|
||||
return idleDTO{
|
||||
LastActivityAt: now.Add(-10 * time.Minute),
|
||||
ActiveSessions: 0,
|
||||
}, true
|
||||
})
|
||||
|
||||
s := New(c, logr.Discard(), Options{
|
||||
DefaultIdleTimeoutMinutes: 60, // would NOT scale at 10min
|
||||
ProbeURL: probe,
|
||||
Now: func() time.Time { return now },
|
||||
})
|
||||
|
||||
if err := s.runOnce(context.Background()); err != nil {
|
||||
t.Fatalf("runOnce: %v", err)
|
||||
}
|
||||
var got appsv1.StatefulSet
|
||||
if err := c.Get(context.Background(),
|
||||
client.ObjectKey{Namespace: "sandbox-emrah", Name: "pty-server"}, &got); err != nil {
|
||||
t.Fatalf("get post-pass: %v", err)
|
||||
}
|
||||
if got.Spec.Replicas == nil || *got.Spec.Replicas != 0 {
|
||||
t.Errorf("replicas: got %v want 0 (annotation says 5min, 10min idle)",
|
||||
got.Spec.Replicas)
|
||||
}
|
||||
}
|
||||
|
||||
// (6) StatefulSets outside `sandbox-*` are ignored.
|
||||
func TestRunOnce_IgnoresNonSandboxNamespace(t *testing.T) {
|
||||
t.Parallel()
|
||||
rogue := ptyStatefulSet("kube-system", "pty-server", 1,
|
||||
map[string]string{AnnIdleTimeoutMinutes: "5"})
|
||||
|
||||
c := newFakeClient(t, rogue)
|
||||
now := time.Now().UTC()
|
||||
_, probe := newProbeServer(t, func(ns string) (idleDTO, bool) {
|
||||
return idleDTO{
|
||||
LastActivityAt: now.Add(-2 * time.Hour),
|
||||
ActiveSessions: 0,
|
||||
}, true
|
||||
})
|
||||
|
||||
s := New(c, logr.Discard(), Options{
|
||||
DefaultIdleTimeoutMinutes: 5,
|
||||
ProbeURL: probe,
|
||||
Now: func() time.Time { return now },
|
||||
})
|
||||
|
||||
if err := s.runOnce(context.Background()); err != nil {
|
||||
t.Fatalf("runOnce: %v", err)
|
||||
}
|
||||
var got appsv1.StatefulSet
|
||||
if err := c.Get(context.Background(),
|
||||
client.ObjectKey{Namespace: "kube-system", Name: "pty-server"}, &got); err != nil {
|
||||
t.Fatalf("get post-pass: %v", err)
|
||||
}
|
||||
if got.Spec.Replicas == nil || *got.Spec.Replicas != 1 {
|
||||
t.Errorf("replicas: got %v want 1 (must NOT touch kube-system)",
|
||||
got.Spec.Replicas)
|
||||
}
|
||||
}
|
||||
|
||||
// (7) StatefulSets already at replicas=0 are not re-patched (idempotent).
|
||||
func TestProcessOne_AlreadyZero_NoOp(t *testing.T) {
|
||||
t.Parallel()
|
||||
ss := ptyStatefulSet("sandbox-emrah", "pty-server", 0,
|
||||
map[string]string{AnnIdleTimeoutMinutes: "5"})
|
||||
|
||||
c := newFakeClient(t, ss)
|
||||
now := time.Now().UTC()
|
||||
_, probe := newProbeServer(t, func(ns string) (idleDTO, bool) {
|
||||
return idleDTO{
|
||||
LastActivityAt: now.Add(-2 * time.Hour),
|
||||
ActiveSessions: 0,
|
||||
}, true
|
||||
})
|
||||
|
||||
s := New(c, logr.Discard(), Options{
|
||||
DefaultIdleTimeoutMinutes: 5,
|
||||
ProbeURL: probe,
|
||||
Now: func() time.Time { return now },
|
||||
})
|
||||
|
||||
// A double pass — both passes should leave replicas==0 and not error.
|
||||
for i := 0; i < 2; i++ {
|
||||
if err := s.runOnce(context.Background()); err != nil {
|
||||
t.Fatalf("runOnce pass %d: %v", i, err)
|
||||
}
|
||||
}
|
||||
var got appsv1.StatefulSet
|
||||
if err := c.Get(context.Background(),
|
||||
client.ObjectKey{Namespace: "sandbox-emrah", Name: "pty-server"}, &got); err != nil {
|
||||
t.Fatalf("get post-pass: %v", err)
|
||||
}
|
||||
if got.Spec.Replicas == nil || *got.Spec.Replicas != 0 {
|
||||
t.Errorf("replicas: got %v want 0 (already zero, no-op)", got.Spec.Replicas)
|
||||
}
|
||||
}
|
||||
|
||||
// (8) Default URL builder produces the cluster-DNS form.
|
||||
func TestDefaultProbeURL(t *testing.T) {
|
||||
t.Parallel()
|
||||
s := New(nil, logr.Discard(), Options{})
|
||||
got := s.probeURL("sandbox-ceo-at-acme-com")
|
||||
want := fmt.Sprintf("http://pty-server.sandbox-ceo-at-acme-com.svc.cluster.local:%d%s",
|
||||
PtyServicePort, IdlePath)
|
||||
if got != want {
|
||||
t.Errorf("default probeURL:\n got %q\n want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// (9) NeedLeaderElection — singleton across HA replicas.
|
||||
func TestNeedLeaderElection_True(t *testing.T) {
|
||||
t.Parallel()
|
||||
s := New(nil, logr.Discard(), Options{})
|
||||
if !s.NeedLeaderElection() {
|
||||
t.Errorf("NeedLeaderElection: got false, want true (must be singleton)")
|
||||
}
|
||||
}
|
||||
|
||||
// (10) Wave 15 — idle-timeout counter ticks once per scale-to-zero, with
|
||||
// the namespace label set. Asserts the `sandbox_controller_idle_timeout_events_total`
|
||||
// counter the Grafana panel "Idle-Timeout Scale-Down Events / hour" reads.
|
||||
func TestProcessOne_IdleTimeoutCounter_Increments(t *testing.T) {
|
||||
// Not t.Parallel — counter is package-global and we read its value.
|
||||
ns := "sandbox-metric-test"
|
||||
ss := ptyStatefulSet(ns, "pty-server", 2,
|
||||
map[string]string{AnnIdleTimeoutMinutes: "10"})
|
||||
|
||||
c := newFakeClient(t, ss)
|
||||
now := time.Now().UTC()
|
||||
stale := now.Add(-30 * time.Minute) // past 10-min timeout
|
||||
_, probe := newProbeServer(t, func(_ string) (idleDTO, bool) {
|
||||
return idleDTO{LastActivityAt: stale, ActiveSessions: 0}, true
|
||||
})
|
||||
|
||||
s := New(c, logr.Discard(), Options{
|
||||
DefaultIdleTimeoutMinutes: 10,
|
||||
ProbeURL: probe,
|
||||
Now: func() time.Time { return now },
|
||||
})
|
||||
|
||||
before := testutilCounterValue(t, ns)
|
||||
if err := s.runOnce(context.Background()); err != nil {
|
||||
t.Fatalf("runOnce: %v", err)
|
||||
}
|
||||
after := testutilCounterValue(t, ns)
|
||||
if got, want := after-before, 1.0; got != want {
|
||||
t.Errorf("idle_timeout_events_total{namespace=%q} delta: got %v want %v", ns, got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// testutilCounterValue reads the current counter value for the namespace
|
||||
// label using the prometheus testutil package — returns 0 if the label
|
||||
// tuple has not been touched yet.
|
||||
func testutilCounterValue(t *testing.T, namespace string) float64 {
|
||||
t.Helper()
|
||||
return prometheustestutil.ToFloat64(idleTimeoutEvents.WithLabelValues(namespace))
|
||||
}
|
||||
|
||||
// (10) TBD-D8b #1725 — `openova.io/sandbox-idle-scaling-disabled=true`
|
||||
// annotation prevents scale-to-zero even when the StatefulSet has been
|
||||
// idle for far past its timeout window.
|
||||
func TestProcessOne_IdleScalingDisabled_NeverScales(t *testing.T) {
|
||||
t.Parallel()
|
||||
ss := ptyStatefulSet("sandbox-emrah", "pty-server", 2, map[string]string{
|
||||
AnnIdleTimeoutMinutes: "5",
|
||||
AnnIdleScalingDisabled: "true",
|
||||
})
|
||||
|
||||
c := newFakeClient(t, ss)
|
||||
now := time.Now().UTC()
|
||||
// Probe should never be called when scaling is disabled — return an
|
||||
// implausibly-stale lastActivity to prove that even if it WERE called
|
||||
// the scaler still wouldn't act.
|
||||
_, probe := newProbeServer(t, func(_ string) (idleDTO, bool) {
|
||||
return idleDTO{
|
||||
LastActivityAt: now.Add(-24 * time.Hour),
|
||||
ActiveSessions: 0,
|
||||
}, true
|
||||
})
|
||||
|
||||
s := New(c, logr.Discard(), Options{
|
||||
DefaultIdleTimeoutMinutes: 5,
|
||||
ProbeURL: probe,
|
||||
Now: func() time.Time { return now },
|
||||
})
|
||||
|
||||
if err := s.runOnce(context.Background()); err != nil {
|
||||
t.Fatalf("runOnce: %v", err)
|
||||
}
|
||||
|
||||
var got appsv1.StatefulSet
|
||||
if err := c.Get(context.Background(),
|
||||
client.ObjectKey{Namespace: "sandbox-emrah", Name: "pty-server"}, &got); err != nil {
|
||||
t.Fatalf("get post-pass: %v", err)
|
||||
}
|
||||
if got.Spec.Replicas == nil || *got.Spec.Replicas != 2 {
|
||||
t.Errorf("replicas: got %v want 2 (idle-scaling disabled)", got.Spec.Replicas)
|
||||
}
|
||||
// AnnLastActivityAt must NOT have been stamped — disabled path
|
||||
// skips the entire process pipeline.
|
||||
if _, stamped := got.Annotations[AnnLastActivityAt]; stamped {
|
||||
t.Errorf("AnnLastActivityAt unexpectedly stamped on disabled Sandbox")
|
||||
}
|
||||
}
|
||||
|
||||
// (11) `false` / "0" / unset → scaler still active. Confirms the
|
||||
// truthy-only matcher (architecture.md §1 idle policy default-on).
|
||||
func TestIsIdleScalingDisabled_TruthyOnly(t *testing.T) {
|
||||
t.Parallel()
|
||||
cases := []struct {
|
||||
val string
|
||||
want bool
|
||||
}{
|
||||
{"true", true},
|
||||
{"True", true},
|
||||
{"TRUE", true},
|
||||
{"1", true},
|
||||
{"yes", true},
|
||||
{"\"true\"", true},
|
||||
{"false", false},
|
||||
{"0", false},
|
||||
{"", false},
|
||||
{"maybe", false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
ss := ptyStatefulSet("sandbox-x", "pty-server", 1,
|
||||
map[string]string{AnnIdleScalingDisabled: tc.val})
|
||||
got := isIdleScalingDisabled(ss)
|
||||
if got != tc.want {
|
||||
t.Errorf("isIdleScalingDisabled(%q) = %v, want %v", tc.val, got, tc.want)
|
||||
}
|
||||
}
|
||||
// Annotation absent → never disabled.
|
||||
bare := ptyStatefulSet("sandbox-x", "pty-server", 1, nil)
|
||||
if isIdleScalingDisabled(bare) {
|
||||
t.Errorf("isIdleScalingDisabled(no annotation) = true, want false")
|
||||
}
|
||||
}
|
||||
@ -1,39 +0,0 @@
|
||||
// metrics.go — Prometheus metrics for the IdleScaler (Wave 15, PR #1674).
|
||||
//
|
||||
// Wave 14 (PR #1674) shipped a Grafana dashboard panel
|
||||
// "Idle-Timeout Scale-Down Events / hour" that targets metric
|
||||
// `sandbox_controller_idle_timeout_events_total`. The panel renders
|
||||
// "No data" until the sandbox-controller image carrying this emitter
|
||||
// rolls out across the fleet (Inviolable Principle #11 — no fabricated
|
||||
// metrics).
|
||||
//
|
||||
// This file closes that loop on the controller side:
|
||||
//
|
||||
// - Registers Counter `sandbox_controller_idle_timeout_events_total`
|
||||
// with label {namespace} via controller-runtime's metrics registry
|
||||
// (sigs.k8s.io/controller-runtime/pkg/metrics). The controller's
|
||||
// manager already wires up /metrics on :8080 — registering with
|
||||
// ctrlmetrics.Registry surfaces this counter on the same scrape.
|
||||
// - The IdleScaler calls idleTimeoutEvents.WithLabelValues(namespace).Inc()
|
||||
// right after scaleToZero() succeeds, so the counter ticks once per pty-server
|
||||
// StatefulSet scaled to 0 replicas, with the namespace label
|
||||
// matching the dashboard's `sum by (namespace) (rate(...))`
|
||||
// aggregation.
|
||||
package idlescaler
|
||||
|
||||
import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
|
||||
)
|
||||
|
||||
var idleTimeoutEvents = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: "sandbox_controller_idle_timeout_events_total",
|
||||
Help: "Number of pty-server StatefulSets scaled to 0 replicas by the IdleScaler, partitioned by namespace.",
|
||||
}, []string{"namespace"})
|
||||
|
||||
func init() {
|
||||
// Register with controller-runtime's shared registry so the
|
||||
// manager's existing :8080 /metrics endpoint exposes it. Re-
|
||||
// registration on test process reuse is guarded by ctrlmetrics.
|
||||
ctrlmetrics.Registry.MustRegister(idleTimeoutEvents)
|
||||
}
|
||||
@ -1,202 +0,0 @@
|
||||
// Package newapi is the thin HTTP client the sandbox-controller uses
|
||||
// to mint per-Sandbox LLM-gateway tokens via the catalyst-api bridge
|
||||
// handler shipped in PR #1638 — POST /admin/tokens/sandbox.
|
||||
//
|
||||
// Wire shape mirrors platform/newapi/internal/handler/sandbox_token.go
|
||||
// EXACTLY (request fields org_id/user_id/sandbox_id/allowed_channels,
|
||||
// response fields token/expires_at). If the handler's contract evolves
|
||||
// both sides must change in the same PR — there is no schema
|
||||
// generator between them on purpose: the bridge endpoint is the
|
||||
// authoritative spec, this client is its only known caller, and a
|
||||
// thin manual binding is easier to audit than yet-another generated
|
||||
// surface.
|
||||
//
|
||||
// Per Inviolable Principle #4 (no hardcoded values) every operational
|
||||
// knob (base URL, admin secret, HTTP timeout) is injected by the
|
||||
// caller — the only built-in default is the 30s-timeout HTTP client New falls back to when none is supplied.
|
||||
package newapi
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Client is the surface the sandbox-controller's reconciler depends
|
||||
// on. Defined as an interface so the controller's unit tests can
|
||||
// substitute an in-process stub without standing up a httptest.Server
|
||||
// per case.
|
||||
type Client interface {
|
||||
// MintSandboxToken POSTs the request to /admin/tokens/sandbox and
|
||||
// returns the issued bearer + its absolute expiry. Caller-supplied
|
||||
// context governs cancellation + the outbound HTTP deadline.
|
||||
MintSandboxToken(ctx context.Context, req MintRequest) (*MintResponse, error)
|
||||
}
|
||||
|
||||
// MintRequest is the JSON body sent with POST /admin/tokens/sandbox.
//
// The field names mirror handler.sandboxTokenRequest one-for-one; change
// them only in lockstep with
// platform/newapi/internal/handler/sandbox_token.go.
type MintRequest struct {
	// OrgID is the slug of the parent Organization (e.g. "acme"). The
	// handler stamps it as the `org` claim of the minted JWT.
	OrgID string `json:"org_id"`

	// UserID is the stable identity of the Sandbox owner — Keycloak sub
	// or owner email. It is forwarded as `X-User-Id` on every NewAPI
	// /v1/* call for per-user billing-ledger attribution.
	UserID string `json:"user_id"`

	// SandboxID is the opaque per-Sandbox identifier assigned by the
	// sandbox-controller. The Sandbox CR's metadata.uid is used: stable
	// across spec mutations and 1:1 with a rendered Pod's identity.
	SandboxID string `json:"sandbox_id"`

	// AllowedChannels lists the NewAPI channels the issued token is
	// restricted to. The handler rejects an empty list with 400.
	AllowedChannels []string `json:"allowed_channels"`

	// Capabilities is the MCP capability allowlist carried by the
	// token's `capabilities` claim, which `Claims.HasCapability` reads on
	// every MCP tool call. It is sourced from the Sandbox CR's
	// spec.capabilities, falling back to the plan→capabilities map via
	// sandboxapi.ResolveCapabilities. Wildcards (`sandbox.db.*`) are
	// supported by the matcher, so coarse grants are possible. An empty
	// list is allowed and downgrades the token to the introspection
	// surface only, matching a pre-PR-#1671 token.
	Capabilities []string `json:"capabilities,omitempty"`
}
|
||||
|
||||
// MintResponse is the JSON body of the 200 OK reply to a mint request.
type MintResponse struct {
	// Token is the HS256-signed JWT the Sandbox Pod presents to NewAPI as
	// its bearer credential on every /v1/* call.
	Token string `json:"token"`

	// ExpiresAt is the absolute instant at which Token expires (RFC3339
	// on the wire).
	ExpiresAt time.Time `json:"expires_at"`
}
|
||||
|
||||
// HTTPClient is the single-method slice of net/http.Client this package
// relies on, kept narrow so tests can inject their own implementation.
type HTTPClient interface {
	// Do sends req and returns the response, as http.Client.Do does.
	Do(req *http.Request) (*http.Response, error)
}
|
||||
|
||||
// liveClient is the production implementation. Constructed with a
|
||||
// pre-configured *http.Client + the shared admin bearer + base URL.
|
||||
type liveClient struct {
|
||||
baseURL string
|
||||
adminSecret string
|
||||
http HTTPClient
|
||||
}
|
||||
|
||||
// New returns a live client. baseURL is the catalyst-api root the
|
||||
// bridge handler is mounted on (e.g.
|
||||
// "http://newapi.newapi.svc.cluster.local:3000"). adminSecret is the
|
||||
// value of NEWAPI_ADMIN_SECRET — chart-emitted by the
|
||||
// newapi-token-signing-key Secret. httpClient may be nil; in that
|
||||
// case a default 30s-timeout client is used.
|
||||
//
|
||||
// Returns an error when baseURL or adminSecret is empty so the
|
||||
// controller fails-loud at process start rather than shipping a
|
||||
// no-op token-mint path.
|
||||
func New(baseURL, adminSecret string, httpClient HTTPClient) (Client, error) {
|
||||
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
|
||||
if baseURL == "" {
|
||||
return nil, errors.New("newapi.New: base URL is empty")
|
||||
}
|
||||
if strings.TrimSpace(adminSecret) == "" {
|
||||
return nil, errors.New("newapi.New: admin secret is empty")
|
||||
}
|
||||
if httpClient == nil {
|
||||
httpClient = &http.Client{Timeout: 30 * time.Second}
|
||||
}
|
||||
return &liveClient{
|
||||
baseURL: baseURL,
|
||||
adminSecret: adminSecret,
|
||||
http: httpClient,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// MintSandboxToken implements Client.
|
||||
//
|
||||
// Failure modes surfaced as wrapped errors:
|
||||
// - request marshalling — caller bug (programmer error)
|
||||
// - transport error — retry-worthy
|
||||
// - non-2xx with body — retry-worthy unless 4xx (configuration drift)
|
||||
// - response decode error — bridge contract violation (escalate)
|
||||
func (c *liveClient) MintSandboxToken(ctx context.Context, req MintRequest) (*MintResponse, error) {
|
||||
if strings.TrimSpace(req.OrgID) == "" {
|
||||
return nil, errors.New("newapi: MintRequest.OrgID is required")
|
||||
}
|
||||
if strings.TrimSpace(req.UserID) == "" {
|
||||
return nil, errors.New("newapi: MintRequest.UserID is required")
|
||||
}
|
||||
if strings.TrimSpace(req.SandboxID) == "" {
|
||||
return nil, errors.New("newapi: MintRequest.SandboxID is required")
|
||||
}
|
||||
if len(req.AllowedChannels) == 0 {
|
||||
return nil, errors.New("newapi: MintRequest.AllowedChannels is empty")
|
||||
}
|
||||
|
||||
body, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("newapi: marshal request: %w", err)
|
||||
}
|
||||
|
||||
url := c.baseURL + "/admin/tokens/sandbox"
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("newapi: build request: %w", err)
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
httpReq.Header.Set("Accept", "application/json")
|
||||
httpReq.Header.Set("Authorization", "Bearer "+c.adminSecret)
|
||||
// Wave 15 (PR #1674 follow-up) — stamp the tool header so the
|
||||
// bridge handler's `newapi_admin_token_mint_requests_total{tool,status}`
|
||||
// counter attributes mints to this controller. Header value must
|
||||
// match the dashboard's tool="sandbox-controller" panel filter.
|
||||
httpReq.Header.Set("X-Catalyst-Tool", "sandbox-controller")
|
||||
|
||||
resp, err := c.http.Do(httpReq)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("newapi: POST %s: %w", url, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
// Surface bridge error verbatim — operator log diagnoses the
|
||||
// difference between 401 (admin secret rotated out of sync),
|
||||
// 400 (controller sent malformed request) and 5xx (bridge
|
||||
// outage). Body capped to 512 bytes for sanity.
|
||||
snip := respBody
|
||||
if len(snip) > 512 {
|
||||
snip = snip[:512]
|
||||
}
|
||||
return nil, fmt.Errorf("newapi: POST %s: status %d: %s",
|
||||
url, resp.StatusCode, string(snip))
|
||||
}
|
||||
|
||||
var out MintResponse
|
||||
if err := json.Unmarshal(respBody, &out); err != nil {
|
||||
return nil, fmt.Errorf("newapi: decode response: %w", err)
|
||||
}
|
||||
if out.Token == "" {
|
||||
return nil, errors.New("newapi: response missing token")
|
||||
}
|
||||
if out.ExpiresAt.IsZero() {
|
||||
return nil, errors.New("newapi: response missing expires_at")
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
@ -1,117 +0,0 @@
|
||||
package newapi
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestMintSandboxToken_HappyPath(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var captured MintRequest
|
||||
var capturedAuth string
|
||||
exp := time.Date(2030, 1, 2, 3, 4, 5, 0, time.UTC)
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "method", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
if r.URL.Path != "/admin/tokens/sandbox" {
|
||||
http.Error(w, "path", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
capturedAuth = r.Header.Get("Authorization")
|
||||
body, _ := io.ReadAll(r.Body)
|
||||
_ = json.Unmarshal(body, &captured)
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_ = json.NewEncoder(w).Encode(MintResponse{Token: "tok-abc", ExpiresAt: exp})
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
c, err := New(srv.URL, "admin-bytes", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("New: %v", err)
|
||||
}
|
||||
got, err := c.MintSandboxToken(context.Background(), MintRequest{
|
||||
OrgID: "acme",
|
||||
UserID: "ceo@acme.com",
|
||||
SandboxID: "uid-1",
|
||||
AllowedChannels: []string{"qwen"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("MintSandboxToken: %v", err)
|
||||
}
|
||||
if got.Token != "tok-abc" {
|
||||
t.Errorf("token: got %q", got.Token)
|
||||
}
|
||||
if !got.ExpiresAt.Equal(exp) {
|
||||
t.Errorf("expires_at: got %v want %v", got.ExpiresAt, exp)
|
||||
}
|
||||
if capturedAuth != "Bearer admin-bytes" {
|
||||
t.Errorf("auth header: got %q", capturedAuth)
|
||||
}
|
||||
if captured.OrgID != "acme" || captured.UserID != "ceo@acme.com" ||
|
||||
captured.SandboxID != "uid-1" || len(captured.AllowedChannels) != 1 ||
|
||||
captured.AllowedChannels[0] != "qwen" {
|
||||
t.Errorf("request body: got %+v", captured)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMintSandboxToken_Non2xx(t *testing.T) {
|
||||
t.Parallel()
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
_, _ = io.WriteString(w, `{"error":"invalid admin credentials"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
c, err := New(srv.URL, "wrong", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("New: %v", err)
|
||||
}
|
||||
_, err = c.MintSandboxToken(context.Background(), MintRequest{
|
||||
OrgID: "a", UserID: "u", SandboxID: "s", AllowedChannels: []string{"q"},
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatalf("expected error on 401")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "401") {
|
||||
t.Errorf("error should surface status code: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNew_InputValidation(t *testing.T) {
|
||||
t.Parallel()
|
||||
if _, err := New("", "x", nil); err == nil {
|
||||
t.Errorf("expected error on empty baseURL")
|
||||
}
|
||||
if _, err := New("http://x", " ", nil); err == nil {
|
||||
t.Errorf("expected error on empty adminSecret")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMintSandboxToken_RequestValidation(t *testing.T) {
|
||||
t.Parallel()
|
||||
c, err := New("http://x", "s", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("New: %v", err)
|
||||
}
|
||||
cases := []MintRequest{
|
||||
{OrgID: "", UserID: "u", SandboxID: "s", AllowedChannels: []string{"q"}},
|
||||
{OrgID: "o", UserID: "", SandboxID: "s", AllowedChannels: []string{"q"}},
|
||||
{OrgID: "o", UserID: "u", SandboxID: "", AllowedChannels: []string{"q"}},
|
||||
{OrgID: "o", UserID: "u", SandboxID: "s", AllowedChannels: nil},
|
||||
}
|
||||
for i, tc := range cases {
|
||||
if _, err := c.MintSandboxToken(context.Background(), tc); err == nil {
|
||||
t.Errorf("case %d: expected error, got nil", i)
|
||||
}
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user