diff --git a/k8s/AGENTS.md b/k8s/AGENTS.md index b4821f1..390bf73 100644 --- a/k8s/AGENTS.md +++ b/k8s/AGENTS.md @@ -89,16 +89,16 @@ ssh -t moritz@haumdaucher.de "sudo df -h" ``` ## Ingress Configuration -Ingress resources **must** follow these strict conventions to work with the cluster's ingress controller (`nginx`) and certificate manager (`cert-manager`). +Ingress resources **must** follow these strict conventions to work with the cluster's ingress controller (`traefik`) and certificate manager (`cert-manager`). ### Annotations All Ingress resources must include: ```yaml annotations: - kubernetes.io/ingress.class: "nginx" + kubernetes.io/ingress.class: "traefik" cert-manager.io/cluster-issuer: "letsencrypt-prod" kubernetes.io/tls-acme: "true" - # Standard nginx tweaks + # Standard nginx tweaks (if using dual class) or traefik configurations nginx.ingress.kubernetes.io/proxy-body-size: "0" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" @@ -107,13 +107,13 @@ annotations: ### Hostnames & TLS * **Domain**: Use a subdomain of `haumdaucher.de` or `moritzgraf.de`. * **TLS Secret Name**: Must use **hyphens** instead of dots. 
- * Pattern: `<subdomain>-<domain>-<tld>` - * Example: `n8n.moritzgraf.de` -> `n8n-moritzgraf-de` +* **Pattern**: `<subdomain>-<domain>-<tld>` +* **Example**: `n8n.moritzgraf.de` -> `n8n-moritzgraf-de` ### Example ```yaml spec: - ingressClassName: nginx + ingressClassName: traefik tls: - hosts: - n8n.moritzgraf.de diff --git a/k8s/README.md b/k8s/README.md index 18d1d28..0ba2a42 100644 --- a/k8s/README.md +++ b/k8s/README.md @@ -34,14 +34,33 @@ kubcetl edit -n mailu secret sh.helm.release.v1.mailu.v8 # Deployment (non persistent stuff) -## [ingress-nginx](https://github.com/kubernetes/ingress-nginx/tree/master/charts/ingress-nginx) +## [Traefik Ingress Controller](https://github.com/traefik/traefik-helm-chart) (Replaces retired ingress-nginx) -Apply with helm: +Apply Dual IngressClass configuration (supporting both legacy `nginx` and new `traefik` names on the same controller): ```bash -helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx +kubectl apply -f traefik/ingress-classes.yaml +``` + +Apply with Helm: + +```bash +helm repo add traefik https://traefik.github.io/charts helm repo update -helm upgrade --install --create-namespace ingress-nginx ingress-nginx/ingress-nginx -n ingress-nginx -f ingress-nginx/ingress-nginx.yaml +helm upgrade --install traefik traefik/traefik -n kube-system -f traefik/traefik-values.yaml +``` + +### [Legacy ingress-nginx (RETIRED)](https://github.com/kubernetes/ingress-nginx/tree/master/charts/ingress-nginx) + +The community ingress-nginx is **retired** (EOL March 2026). During hot-swap, it was scaled down to 0 replicas. 
Keep it scaled down for a few days before deleting: + +```bash +# To stop all controller pods (executed during migration); DaemonSets have no scale subresource, so park them with an unmatched nodeSelector +kubectl patch daemonset ingress-nginx-controller -n ingress-nginx -p '{"spec":{"template":{"spec":{"nodeSelector":{"non-existing":"true"}}}}}' + +# Permanent cleanup (execute after 3-7 days safety buffer) +# helm uninstall ingress-nginx -n ingress-nginx +# kubectl delete ns ingress-nginx ``` ## [cert-manager](https://cert-manager.io/docs/tutorials/acme/ingress/) diff --git a/k8s/development/registry_ingress.yaml b/k8s/development/registry_ingress.yaml index d44a200..d36d0ff 100644 --- a/k8s/development/registry_ingress.yaml +++ b/k8s/development/registry_ingress.yaml @@ -8,7 +8,7 @@ metadata: kubernetes.io/tls-acme: "true" # ---------------------------------------------- cert-manager.io/cluster-issuer: letsencrypt-prod - kubernetes.io/ingress.class: nginx + kubernetes.io/ingress.class: traefik meta.helm.sh/release-name: docker-registry meta.helm.sh/release-namespace: development nginx.ingress.kubernetes.io/force-ssl-redirect: "true" @@ -21,7 +21,7 @@ metadata: release: docker-registry spec: # --- ADDED: Critical for modern K8s --- - ingressClassName: nginx + ingressClassName: traefik # -------------------------------------- rules: - host: registry.haumdaucher.de diff --git a/k8s/kuard/ingress.yaml b/k8s/kuard/ingress.yaml index a64d479..661d185 100644 --- a/k8s/kuard/ingress.yaml +++ b/k8s/kuard/ingress.yaml @@ -1,12 +1,11 @@ -apiVersion: extensions/v1beta1 +apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: kuard namespace: kuard annotations: - kubernetes.io/ingress.class: "nginx" + kubernetes.io/ingress.class: "traefik" cert-manager.io/cluster-issuer: "letsencrypt-prod" - spec: tls: - hosts: @@ -17,6 +16,10 @@ spec: http: paths: - path: / + pathType: Prefix backend: - serviceName: kuard - servicePort: 80 + service: + name: kuard + port: + number: 80 + diff --git a/k8s/nextcloud/nextcloud-phpmyadmin.yml b/k8s/nextcloud/nextcloud-phpmyadmin.yml index a132959..3548dc1 100644 --- 
a/k8s/nextcloud/nextcloud-phpmyadmin.yml +++ b/k8s/nextcloud/nextcloud-phpmyadmin.yml @@ -5,7 +5,7 @@ ingress: enabled: true hostname: "nextcloud.phpmyadmin.haumdaucher.de" tls: "true" - ingressClassName: "nginx" + ingressClassName: "traefik" # hosts: # - path: "/" # tls: true @@ -14,5 +14,5 @@ ingress: annotations: cert-manager.io/cluster-issuer: "letsencrypt-prod" nginx.ingress.kubernetes.io/proxy-body-size: "0" - kubernetes.io/ingress.class: nginx + kubernetes.io/ingress.class: traefik nginx.ingress.kubernetes.io/force-ssl-redirect: "true" \ No newline at end of file diff --git a/k8s/openclaw/AGENTS.md b/k8s/openclaw/AGENTS.md index 2e41f55..8fc3e70 100644 --- a/k8s/openclaw/AGENTS.md +++ b/k8s/openclaw/AGENTS.md @@ -1,69 +1,69 @@ -# OpenClaw Agent Guide +# OpenClaw Agent Guide (Operator-Managed) -This document provides a comprehensive technical reference for AI agents to manage the **OpenClaw** deployment in this repository. +This document provides a technical reference for AI agents managing the **OpenClaw** deployment in this repository. + +--- ## 🏗️ Architecture & Configuration Lifecycle -### 1. Status -* **Telegram**: Configured with `dmPolicy: "allowlist"` for users `306373425` and `255114390`. -* **Skills**: Integrated `gog` (Workspace), `nano-banana-pro` (Image Gen), and various utility skills. -* **Authentication**: Multi-provider setup with Gemini CLI OAuth (Primary) and Gemini API Key (Backup). -* **Ollama**: Removed from the deployment. +The deployment has been migrated to the **OpenClaw Operator** framework. -### 2. Bootstrap Process -OpenClaw uses an `initContainer` to bootstrap the configuration: -1. The `openclaw-bootstrap-config` volume is mounted at `/mnt/config`. -2. The `initContainer` copies `/mnt/config/openclaw.json` to the persistent data volume at `/mnt/data/openclaw.json`. -3. The `initContainer` provisions authentication tokens (e.g., `google-gemini-cli.json`) from environment variables/secrets. -4. 
The main `openclaw` container identifies the persistent volume at `/home/node/.openclaw`. +### 1. Persistent Storage & State Protection +* **Storage Claim**: Uses the existing PVC `openclaw-data` (`openebs-hostpath`). +* **Zero-Drift Merge**: Configured with `mergeMode: merge`. Any declarative base config applied via K8s is safely deep-merged with runtime configuration mutations (e.g. linked channels, active sessions) written by the agent inside the PVC at `/home/openclaw/.openclaw/openclaw.json`. +* **Stateful Memory**: The agent's wisdom (`MEMORY.md`, `SOUL.md`, SQLite databases) resides in the persistent claim and survives pod restarts naturally. -### 3. Gemini OAuth Setup & Sync -This deployment uses a **local-to-remote** sync for Gemini OAuth: -1. **Local Login**: The user runs `openclaw models auth login --provider google-gemini-cli` on their local machine. -2. **Credential Capture**: This generates `~/.gemini/oauth_creds.json` locally. -3. **Secret Update**: The JSON content from that file is copied into the `gemini-oauth-token` field of `openclaw.secret.yaml`. -4. **Provisioning**: The `initContainer` in the K8s manifest reads the `GEMINI_OAUTH_TOKEN` env var (populated from the secret) and writes it to `/home/node/.openclaw/auth/google-gemini-cli.json`. +### 2. Sidecar Mapping -### 4. Applying Changes -To update the configuration or rotate tokens: -1. Modify the relevant fields in [openclaw.secret.yaml](file:///Users/moritz/src/infrapuzzle/k8s/openclaw/openclaw.secret.yaml). -2. Apply the manifest: `kubectl apply -f k8s/openclaw/openclaw.secret.yaml` -3. 
**Rotate Deployment**: You MUST restart the pod to trigger the `initContainer` bootstrap and inject new env vars: - `kubectl rollout restart deployment openclaw -n openclaw` +The multi-container pod structure is managed natively by the operator controller: + +| Legacy Pod Container | Operator Controller Replacement | +|---|---| +| `chromium-sidecar` | Managed natively under `spec.chromium` with autowired CDP endpoint | +| `sidecar-proxy` | Managed natively by operator gateway proxy sidecar | +| `git-sync` | Retained under `spec.sidecars` to fetch skills from private git repository | +| `skill-stabilizer` | Retained under `spec.sidecars` to copy skills to the flat PVC structure | +| `install-uv-python` (init) | Managed natively by the operator under `spec.runtimeDeps.python: true` | --- -## 🔧 Configuration Reference (`openclaw.json`) +## 🔧 Applying Configuration Changes -### `models.providers` -- **`google`**: Built-in provider. Uses `GEMINI_API_KEY`. See [GEMINI_AUTH_GUIDE.md](file:///Users/moritz/src/infrapuzzle/k8s/openclaw/GEMINI_AUTH_GUIDE.md). -- **`google-gemini-cli`**: OAuth-based provider (Primary). Uses provisioned tokens. -### `agents.defaults` -- `model.primary`: `google-gemini-cli/gemini-3-flash-preview` -- `model.fallbacks`: `["google/gemini-flash-latest"]` - -> [!IMPORTANT] -> Gemini 3 requires `previewFeatures: true` in `~/.gemini/settings.json`, which is automatically provisioned by the `initContainer`. A **rollout restart** is required after any manifest change. - -### `plugins` -- `google-gemini-cli-auth`: MUST be enabled for the primary provider to function. +To perform modifications or rotate API keys: +1. **Secrets**: Edit string fields in [openclaw.secret.yaml](file:///Users/moritz/src/infrapuzzle/k8s/openclaw/openclaw.secret.yaml). +2. **Declarative Base Config**: Edit fields under `spec.config.raw` in [openclaw-instance.secret.yaml](file:///Users/moritz/src/infrapuzzle/k8s/openclaw/openclaw-instance.secret.yaml). +3. 
**Apply & Rollout**: Apply the files. The operator tracks config hashes and automatically performs a rolling update of the StatefulSet (no manual restart needed!): + ```bash + kubectl apply -f k8s/openclaw/openclaw.secret.yaml + kubectl apply -f k8s/openclaw/openclaw-instance.secret.yaml + ``` --- -## 🚨 Startup & Troubleshooting +## 🚨 Troubleshooting & Live Verification -### Investigating Issues +If the agent is offline or failing: + +### 1. Verify Deployment Health +Ensure that the `OpenClawInstance` is successfully reconciled: ```bash -# Check config -kubectl exec -it -n openclaw deployment/openclaw -c openclaw -- cat /home/node/.openclaw/openclaw.json - -# Check auth tokens -kubectl exec -it -n openclaw deployment/openclaw -c openclaw -- ls -la /home/node/.openclaw/auth/ +kubectl get openclawinstance openclaw -n openclaw +# Expected: PHASE=Running, READY=True ``` -### Applying Configuration Changes +### 2. Verify Pod & Container States +Check that all 6 containers inside the StatefulSet are fully functional: ```bash -kubectl apply -f k8s/openclaw/openclaw.secret.yaml -kubectl rollout restart deployment openclaw -n openclaw -kubectl rollout status deployment openclaw -n openclaw +kubectl get pods -n openclaw -l app.kubernetes.io/instance=openclaw +``` + +### 3. Check Live Logs +Inspect the core gateway engine: +```bash +kubectl logs -n openclaw openclaw-0 -c openclaw --tail=100 +``` + +Inspect the Chromium browser engine: +```bash +kubectl logs -n openclaw openclaw-0 -c chromium --tail=100 ``` diff --git a/k8s/openclaw/README.md b/k8s/openclaw/README.md new file mode 100644 index 0000000..d2c36fd --- /dev/null +++ b/k8s/openclaw/README.md @@ -0,0 +1,109 @@ +# OpenClaw Operations Guide (Operator-Managed) + +This directory manages the deployment of the **OpenClaw** stateful AI agent on the single-node `haumdaucher` cluster. 
+ +The deployment is managed by the official **OpenClaw Operator** (`openclaw-operator`) using an `OpenClawInstance` Custom Resource, replacing the legacy 571-line manual deployment with a modern, zero-drift GitOps-friendly framework. + +--- + +## 🏗️ Architecture Layout + +* **Operator System**: Watches the `openclaw` namespace and reconciles the `OpenClawInstance` resource. +* **Deep-Merge Config**: Configured with `mergeMode: merge` to prevent configuration clobbering. Declarative settings from Git are safely merged with dynamic configurations (such as linked Telegram accounts or runtime credentials) mutated by the agent on the PVC. +* **Built-in Sidecars**: + * **Headless Chromium**: Auto-injected and wired to `http://127.0.0.1:9222` for browser automation and tools. + * **Gateway Proxy**: Handles token-based authentication and secure WebSocket forwarding natively. +* **Custom Sidecars**: + * **git-sync**: Pulls custom skills dynamically from your private repository `git.moritzgraf.de/moritz/mop-skills`. + * **skill-stabilizer**: Continuously copies checked-out skills into the flat `/home/openclaw/.openclaw/skills` workspace. + +--- + +## 🚀 Installation & Setup + +### 1. Install the Operator (One-time) + +The operator is installed globally via Helm: + +```bash +helm install openclaw-operator \ + oci://ghcr.io/openclaw-rocks/charts/openclaw-operator \ + --namespace openclaw-operator-system \ + --create-namespace +``` + +### 2. Apply Custom Secrets and CRD Instance + +The deployment resources are applied directly via `kubectl`: + +```bash +# Apply raw secrets (git-crypt encrypted in the repository) +kubectl apply -f k8s/openclaw/openclaw.secret.yaml + +# Apply the custom resource definition instance +kubectl apply -f k8s/openclaw/openclaw-instance.secret.yaml +``` + +--- + +## 🔑 Gateway Access (Token Auth) + +Basic Auth (`htpasswd`) has been retired. The operator-injected gateway proxy implements token-based authentication. 
+ +To access the Control UI in your browser, append your gateway token as a URL fragment: + +``` +https://openclaw.haumdaucher.de/#token=<your-gateway-token> +``` + +> The `<your-gateway-token>` corresponds to the `gateway-token` value configured inside the `openclaw-secrets` Secret. + +--- + +## 💾 Backup & Disaster Recovery + +Because your deployment relies on `openebs-hostpath` persistent storage, **all state resides on a single physical node disk**. Local pre-migration backups should be kept in case of host or cluster recreation. + +### Perform a Local Backup +Run the following script to pull your memory files, SQLite databases, paired devices, and active configs locally: + +```bash +# Define target backup dir +BACKUP_DIR="k8s/openclaw/backup" +mkdir -p "$BACKUP_DIR" + +# Get active pod name +POD_NAME=$(kubectl get pods -n openclaw -l app.kubernetes.io/instance=openclaw -o jsonpath='{.items[0].metadata.name}') + +# Copy critical runtime files (excluding 2.9GB model files) +for item in openclaw.json workspace memory credentials identity devices telegram settings scripts tasks subagents canvas; do + kubectl cp -n openclaw -c openclaw "$POD_NAME:/home/openclaw/.openclaw/$item" "$BACKUP_DIR/$item" +done +``` + +### Restore from a Local Backup +If your PVC is recreated or wiped, you can push the backup files back to the new pod: + +```bash +# Push directories back to the running pod +kubectl cp "$BACKUP_DIR/workspace" openclaw-0:/home/openclaw/.openclaw/workspace -n openclaw -c openclaw +# Repeat for other folders as needed +``` + +--- + +## 🔧 Operational Verification Commands + +```bash +# Check the controller reconciliation phase (Expected: PHASE=Running) +kubectl get openclawinstances -n openclaw + +# View the status of all 6 containers inside the StatefulSet pod +kubectl get pods -n openclaw -o wide + +# Review live logs of the core OpenClaw engine +kubectl logs -n openclaw openclaw-0 -c openclaw --tail=100 -f + +# Review live logs of the Chromium sidecar +kubectl logs -n openclaw openclaw-0 -c 
chromium --tail=100 -f +``` diff --git a/k8s/openclaw/openclaw-instance.secret.yaml b/k8s/openclaw/openclaw-instance.secret.yaml new file mode 100644 index 0000000..54dff82 Binary files /dev/null and b/k8s/openclaw/openclaw-instance.secret.yaml differ diff --git a/k8s/openclaw/openclaw.secret.yaml b/k8s/openclaw/openclaw.secret.yaml index 2138866..96e4bb9 100644 Binary files a/k8s/openclaw/openclaw.secret.yaml and b/k8s/openclaw/openclaw.secret.yaml differ diff --git a/k8s/traefik/ingress-classes.yaml b/k8s/traefik/ingress-classes.yaml new file mode 100644 index 0000000..74b7c60 --- /dev/null +++ b/k8s/traefik/ingress-classes.yaml @@ -0,0 +1,13 @@ +apiVersion: networking.k8s.io/v1 +kind: IngressClass +metadata: + name: nginx +spec: + controller: traefik.io/ingress-controller +--- +apiVersion: networking.k8s.io/v1 +kind: IngressClass +metadata: + name: traefik +spec: + controller: traefik.io/ingress-controller diff --git a/k8s/traefik/traefik-values.yaml b/k8s/traefik/traefik-values.yaml new file mode 100644 index 0000000..900f115 --- /dev/null +++ b/k8s/traefik/traefik-values.yaml @@ -0,0 +1,66 @@ +deployment: + kind: DaemonSet + dnsPolicy: ClusterFirstWithHostNet + +hostNetwork: true + +# Bind directly to host ports 80 and 443 +ports: + web: + port: 80 + hostPort: 80 + expose: + default: true + exposedPort: 80 + websecure: + port: 443 + hostPort: 443 + expose: + default: true + exposedPort: 443 + # Avoid port collision with node-exporter on host network (9100) + metrics: + port: 9101 + hostPort: 9101 + exposedPort: 9101 + +# Configure Traefik to watch for standard Kubernetes Ingress resources +providers: + kubernetesIngress: + enabled: true + publishedService: + enabled: false + +# We will define IngressClass resources manually to achieve dual-class mapping +ingressClass: + enabled: false + +# Resource limits to ensure stable execution on a single node +resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + +# Run as root 
(UID/GID 0) to bind to host network ports 80/443 +podSecurityContext: + runAsGroup: 0 + runAsNonRoot: false + runAsUser: 0 + +securityContext: + allowPrivilegeEscalation: true + capabilities: + drop: [] + add: + - NET_BIND_SERVICE + readOnlyRootFilesystem: false + +# Required for hostNetwork DaemonSets to allow rolling updates +updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + maxSurge: 0