Latest state of openclaw and traefik
This commit is contained in:
parent
3042168c4c
commit
68345648d9
|
|
@ -89,16 +89,16 @@ ssh -t moritz@haumdaucher.de "sudo df -h"
|
|||
```
|
||||
|
||||
## Ingress Configuration
|
||||
Ingress resources **must** follow these strict conventions to work with the cluster's ingress controller (`nginx`) and certificate manager (`cert-manager`).
|
||||
Ingress resources **must** follow these strict conventions to work with the cluster's ingress controller (`traefik`) and certificate manager (`cert-manager`).
|
||||
|
||||
### Annotations
|
||||
All Ingress resources must include:
|
||||
```yaml
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: "nginx"
|
||||
kubernetes.io/ingress.class: "traefik"
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-prod"
|
||||
kubernetes.io/tls-acme: "true"
|
||||
# Standard nginx tweaks
|
||||
# Standard nginx tweaks (if using dual class) or traefik configurations
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
|
|
@ -107,13 +107,13 @@ annotations:
|
|||
### Hostnames & TLS
|
||||
* **Domain**: Use a subdomain of `haumdaucher.de` or `moritzgraf.de`.
|
||||
* **TLS Secret Name**: Must use **hyphens** instead of dots.
|
||||
* Pattern: `<subdomain>-<domain>-<tld>`
|
||||
* Example: `n8n.moritzgraf.de` -> `n8n-moritzgraf-de`
|
||||
* **Pattern**: `<subdomain>-<domain>-<tld>`
|
||||
* **Example**: `n8n.moritzgraf.de` -> `n8n-moritzgraf-de`
|
||||
|
||||
### Example
|
||||
```yaml
|
||||
spec:
|
||||
ingressClassName: nginx
|
||||
ingressClassName: traefik
|
||||
tls:
|
||||
- hosts:
|
||||
- n8n.moritzgraf.de
|
||||
|
|
|
|||
|
|
@ -34,14 +34,33 @@ kubcetl edit -n mailu secret sh.helm.release.v1.mailu.v8
|
|||
|
||||
# Deployment (non persistent stuff)
|
||||
|
||||
## [ingress-nginx](https://github.com/kubernetes/ingress-nginx/tree/master/charts/ingress-nginx)
|
||||
## [Traefik Ingress Controller](https://github.com/traefik/traefik-helm-chart) (Replaces retired ingress-nginx)
|
||||
|
||||
Apply with helm:
|
||||
Apply Dual IngressClass configuration (supporting both legacy `nginx` and new `traefik` names on the same controller):
|
||||
|
||||
```bash
|
||||
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
|
||||
kubectl apply -f traefik/ingress-classes.yaml
|
||||
```
|
||||
|
||||
Apply with Helm:
|
||||
|
||||
```bash
|
||||
helm repo add traefik https://traefik.github.io/charts
|
||||
helm repo update
|
||||
helm upgrade --install --create-namespace ingress-nginx ingress-nginx/ingress-nginx -n ingress-nginx -f ingress-nginx/ingress-nginx.yaml
|
||||
helm upgrade --install traefik traefik/traefik -n kube-system -f traefik/traefik-values.yaml
|
||||
```
|
||||
|
||||
### [Legacy ingress-nginx (RETIRED)](https://github.com/kubernetes/ingress-nginx/tree/master/charts/ingress-nginx)
|
||||
|
||||
The community ingress-nginx is **retired** (EOL March 2026). During hot-swap, it was scaled down to 0 replicas. Keep it scaled down for a few days before deleting:
|
||||
|
||||
```bash
|
||||
# To scale down (executed during migration)
|
||||
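kubectl patch daemonset ingress-nginx-controller -n ingress-nginx -p '{"spec":{"template":{"spec":{"nodeSelector":{"non-existing":"true"}}}}}'  # DaemonSets cannot be scaled with `kubectl scale`; a non-matching nodeSelector schedules 0 pods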
|
||||
|
||||
# Permanent cleanup (execute after 3-7 days safety buffer)
|
||||
# helm uninstall ingress-nginx -n ingress-nginx
|
||||
# kubectl delete ns ingress-nginx
|
||||
```
|
||||
|
||||
## [cert-manager](https://cert-manager.io/docs/tutorials/acme/ingress/)
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ metadata:
|
|||
kubernetes.io/tls-acme: "true"
|
||||
# ----------------------------------------------
|
||||
cert-manager.io/cluster-issuer: letsencrypt-prod
|
||||
kubernetes.io/ingress.class: nginx
|
||||
kubernetes.io/ingress.class: traefik
|
||||
meta.helm.sh/release-name: docker-registry
|
||||
meta.helm.sh/release-namespace: development
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
|
|
@ -21,7 +21,7 @@ metadata:
|
|||
release: docker-registry
|
||||
spec:
|
||||
# --- ADDED: Critical for modern K8s ---
|
||||
ingressClassName: nginx
|
||||
ingressClassName: traefik
|
||||
# --------------------------------------
|
||||
rules:
|
||||
- host: registry.haumdaucher.de
|
||||
|
|
|
|||
|
|
@ -1,12 +1,11 @@
|
|||
apiVersion: extensions/v1beta1
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: kuard
|
||||
namespace: kuard
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: "nginx"
|
||||
kubernetes.io/ingress.class: "traefik"
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-prod"
|
||||
|
||||
spec:
|
||||
tls:
|
||||
- hosts:
|
||||
|
|
@ -17,6 +16,10 @@ spec:
|
|||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
serviceName: kuard
|
||||
servicePort: 80
|
||||
service:
|
||||
name: kuard
|
||||
port:
|
||||
number: 80
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ ingress:
|
|||
enabled: true
|
||||
hostname: "nextcloud.phpmyadmin.haumdaucher.de"
|
||||
tls: "true"
|
||||
ingressClassName: "nginx"
|
||||
ingressClassName: "traefik"
|
||||
# hosts:
|
||||
# - path: "/"
|
||||
# tls: true
|
||||
|
|
@ -14,5 +14,5 @@ ingress:
|
|||
annotations:
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-prod"
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "0"
|
||||
kubernetes.io/ingress.class: nginx
|
||||
kubernetes.io/ingress.class: traefik
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
|
|
@ -1,69 +1,69 @@
|
|||
# OpenClaw Agent Guide
|
||||
# OpenClaw Agent Guide (Operator-Managed)
|
||||
|
||||
This document provides a comprehensive technical reference for AI agents to manage the **OpenClaw** deployment in this repository.
|
||||
This document provides a technical reference for AI agents managing the **OpenClaw** deployment in this repository.
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Architecture & Configuration Lifecycle
|
||||
|
||||
### 1. Status
|
||||
* **Telegram**: Configured with `dmPolicy: "allowlist"` for users `306373425` and `255114390`.
|
||||
* **Skills**: Integrated `gog` (Workspace), `nano-banana-pro` (Image Gen), and various utility skills.
|
||||
* **Authentication**: Multi-provider setup with Gemini CLI OAuth (Primary) and Gemini API Key (Backup).
|
||||
* **Ollama**: Removed from the deployment.
|
||||
The deployment has been migrated to the **OpenClaw Operator** framework.
|
||||
|
||||
### 2. Bootstrap Process
|
||||
OpenClaw uses an `initContainer` to bootstrap the configuration:
|
||||
1. The `openclaw-bootstrap-config` volume is mounted at `/mnt/config`.
|
||||
2. The `initContainer` copies `/mnt/config/openclaw.json` to the persistent data volume at `/mnt/data/openclaw.json`.
|
||||
3. The `initContainer` provisions authentication tokens (e.g., `google-gemini-cli.json`) from environment variables/secrets.
|
||||
4. The main `openclaw` container identifies the persistent volume at `/home/node/.openclaw`.
|
||||
### 1. Persistent Storage & State Protection
|
||||
* **Storage Claim**: Uses the existing PVC `openclaw-data` (`openebs-hostpath`).
|
||||
* **Zero-Drift Merge**: Configured with `mergeMode: merge`. Any declarative base config applied via K8s is safely deep-merged with runtime configuration mutations (e.g. linked channels, active sessions) written by the agent inside the PVC at `/home/openclaw/.openclaw/openclaw.json`.
|
||||
* **Stateful Memory**: The agent's wisdom (`MEMORY.md`, `SOUL.md`, SQLite databases) resides in the persistent claim and survives pod restarts naturally.
|
||||
|
||||
### 3. Gemini OAuth Setup & Sync
|
||||
This deployment uses a **local-to-remote** sync for Gemini OAuth:
|
||||
1. **Local Login**: The user runs `openclaw models auth login --provider google-gemini-cli` on their local machine.
|
||||
2. **Credential Capture**: This generates `~/.gemini/oauth_creds.json` locally.
|
||||
3. **Secret Update**: The JSON content from that file is copied into the `gemini-oauth-token` field of `openclaw.secret.yaml`.
|
||||
4. **Provisioning**: The `initContainer` in the K8s manifest reads the `GEMINI_OAUTH_TOKEN` env var (populated from the secret) and writes it to `/home/node/.openclaw/auth/google-gemini-cli.json`.
|
||||
### 2. Sidecar Mapping
|
||||
|
||||
### 4. Applying Changes
|
||||
To update the configuration or rotate tokens:
|
||||
1. Modify the relevant fields in [openclaw.secret.yaml](file:///Users/moritz/src/infrapuzzle/k8s/openclaw/openclaw.secret.yaml).
|
||||
2. Apply the manifest: `kubectl apply -f k8s/openclaw/openclaw.secret.yaml`
|
||||
3. **Rotate Deployment**: You MUST restart the pod to trigger the `initContainer` bootstrap and inject new env vars:
|
||||
`kubectl rollout restart deployment openclaw -n openclaw`
|
||||
The multi-container pod structure is managed natively by the operator controller:
|
||||
|
||||
| Legacy Pod Container | Operator Controller Replacement |
|
||||
|---|---|
|
||||
| `chromium-sidecar` | Managed natively under `spec.chromium` with autowired CDP endpoint |
|
||||
| `sidecar-proxy` | Managed natively by operator gateway proxy sidecar |
|
||||
| `git-sync` | Retained under `spec.sidecars` to fetch skills from private git repository |
|
||||
| `skill-stabilizer` | Retained under `spec.sidecars` to copy skills to the flat PVC structure |
|
||||
| `install-uv-python` (init) | Managed natively by the operator under `spec.runtimeDeps.python: true` |
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Configuration Reference (`openclaw.json`)
|
||||
## 🔧 Applying Configuration Changes
|
||||
|
||||
### `models.providers`
|
||||
- **`google`**: Built-in provider. Uses `GEMINI_API_KEY`. See [GEMINI_AUTH_GUIDE.md](file:///Users/moritz/src/infrapuzzle/k8s/openclaw/GEMINI_AUTH_GUIDE.md).
|
||||
- **`google-gemini-cli`**: OAuth-based provider (Primary). Uses provisioned tokens.
|
||||
### `agents.defaults`
|
||||
- `model.primary`: `google-gemini-cli/gemini-3-flash-preview`
|
||||
- `model.fallbacks`: `["google/gemini-flash-latest"]`
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Gemini 3 requires `previewFeatures: true` in `~/.gemini/settings.json`, which is automatically provisioned by the `initContainer`. A **rollout restart** is required after any manifest change.
|
||||
|
||||
### `plugins`
|
||||
- `google-gemini-cli-auth`: MUST be enabled for the primary provider to function.
|
||||
|
||||
---
|
||||
|
||||
## 🚨 Startup & Troubleshooting
|
||||
|
||||
### Investigating Issues
|
||||
```bash
|
||||
# Check config
|
||||
kubectl exec -it -n openclaw deployment/openclaw -c openclaw -- cat /home/node/.openclaw/openclaw.json
|
||||
|
||||
# Check auth tokens
|
||||
kubectl exec -it -n openclaw deployment/openclaw -c openclaw -- ls -la /home/node/.openclaw/auth/
|
||||
```
|
||||
|
||||
### Applying Configuration Changes
|
||||
To perform modifications or rotate API keys:
|
||||
1. **Secrets**: Edit string fields in [openclaw.secret.yaml](file:///Users/moritz/src/infrapuzzle/k8s/openclaw/openclaw.secret.yaml).
|
||||
2. **Declarative Base Config**: Edit fields under `spec.config.raw` in [openclaw-instance.secret.yaml](file:///Users/moritz/src/infrapuzzle/k8s/openclaw/openclaw-instance.secret.yaml).
|
||||
3. **Apply & Rollout**: Apply the files. The operator tracks config hashes and automatically performs a rolling update of the StatefulSet (no manual restart needed):
|
||||
```bash
|
||||
kubectl apply -f k8s/openclaw/openclaw.secret.yaml
|
||||
kubectl rollout restart deployment openclaw -n openclaw
|
||||
kubectl rollout status deployment openclaw -n openclaw
|
||||
kubectl apply -f k8s/openclaw/openclaw-instance.secret.yaml
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🚨 Troubleshooting & Live Verification
|
||||
|
||||
If the agent is offline or failing:
|
||||
|
||||
### 1. Verify Deployment Health
|
||||
Ensure that the `OpenClawInstance` is successfully reconciled:
|
||||
```bash
|
||||
kubectl get openclawinstance openclaw -n openclaw
|
||||
# Expected: PHASE=Running, READY=True
|
||||
```
|
||||
|
||||
### 2. Verify Pod & Container States
|
||||
Check that all 6 containers inside the StatefulSet are fully functional:
|
||||
```bash
|
||||
kubectl get pods -n openclaw -l app.kubernetes.io/instance=openclaw
|
||||
```
|
||||
|
||||
### 3. Check Live Logs
|
||||
Inspect the core gateway engine:
|
||||
```bash
|
||||
kubectl logs -n openclaw openclaw-0 -c openclaw --tail=100
|
||||
```
|
||||
|
||||
Inspect the Chromium browser engine:
|
||||
```bash
|
||||
kubectl logs -n openclaw openclaw-0 -c chromium --tail=100
|
||||
```
|
||||
|
|
|
|||
|
|
@ -0,0 +1,109 @@
|
|||
# OpenClaw Operations Guide (Operator-Managed)
|
||||
|
||||
This directory manages the deployment of the **OpenClaw** stateful AI agent on the single-node `haumdaucher` cluster.
|
||||
|
||||
The deployment is managed by the official **OpenClaw Operator** (`openclaw-operator`) using an `OpenClawInstance` Custom Resource, replacing the legacy 571-line manual deployment with a modern, zero-drift GitOps-friendly framework.
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Architecture Layout
|
||||
|
||||
* **Operator System**: Watches the `openclaw` namespace and reconciles the `OpenClawInstance` resource.
|
||||
* **Deep-Merge Config**: Configured with `mergeMode: merge` to prevent configuration clobbering. Declarative settings from Git are safely merged with dynamic configurations (such as linked Telegram accounts or runtime credentials) mutated by the agent on the PVC.
|
||||
* **Built-in Sidecars**:
|
||||
* **Headless Chromium**: Auto-injected and wired to `http://127.0.0.1:9222` for browser automation and tools.
|
||||
* **Gateway Proxy**: Handles token-based authentication and secure WebSocket forwarding natively.
|
||||
* **Custom Sidecars**:
|
||||
* **git-sync**: Pulls custom skills dynamically from your private repository `git.moritzgraf.de/moritz/mop-skills`.
|
||||
* **skill-stabilizer**: Continuously copies checked-out skills into the flat `/home/openclaw/.openclaw/skills` workspace.
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Installation & Setup
|
||||
|
||||
### 1. Install the Operator (One-time)
|
||||
|
||||
The operator is installed globally via Helm:
|
||||
|
||||
```bash
|
||||
helm install openclaw-operator \
|
||||
oci://ghcr.io/openclaw-rocks/charts/openclaw-operator \
|
||||
--namespace openclaw-operator-system \
|
||||
--create-namespace
|
||||
```
|
||||
|
||||
### 2. Apply Custom Secrets and the OpenClawInstance Custom Resource
|
||||
|
||||
The deployment resources are applied directly via `kubectl`:
|
||||
|
||||
```bash
|
||||
# Apply raw secrets (git-crypt encrypted in the repository)
|
||||
kubectl apply -f k8s/openclaw/openclaw.secret.yaml
|
||||
|
||||
# Apply the OpenClawInstance custom resource (an instance of the operator's CRD)
|
||||
kubectl apply -f k8s/openclaw/openclaw-instance.secret.yaml
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔑 Gateway Access (Token Auth)
|
||||
|
||||
Basic Auth (`htpasswd`) has been retired. The operator-injected gateway proxy implements token-based authentication.
|
||||
|
||||
To access the Control UI in your browser, append your gateway token as a URL fragment:
|
||||
|
||||
```
|
||||
https://openclaw.haumdaucher.de/#token=<your-gateway-token>
|
||||
```
|
||||
|
||||
> The `<your-gateway-token>` corresponds to the `gateway-token` value configured inside the `openclaw-secrets` Secret.
|
||||
|
||||
---
|
||||
|
||||
## 💾 Backup & Disaster Recovery
|
||||
|
||||
Because your deployment relies on `openebs-hostpath` persistent storage, **all state resides on a single physical node disk**. Local pre-migration backups should be kept in case of host or cluster recreation.
|
||||
|
||||
### Perform a Local Backup
|
||||
Run the following script to pull your memory files, SQLite databases, paired devices, and active configs locally:
|
||||
|
||||
```bash
|
||||
# Define target backup dir
|
||||
BACKUP_DIR="k8s/openclaw/backup"
|
||||
mkdir -p "$BACKUP_DIR"
|
||||
|
||||
# Get active pod name
|
||||
POD_NAME=$(kubectl get pods -n openclaw -l app.kubernetes.io/instance=openclaw -o jsonpath='{.items[0].metadata.name}')
|
||||
|
||||
# Copy critical runtime files (excluding 2.9GB model files)
|
||||
for item in openclaw.json workspace memory credentials identity devices telegram settings scripts tasks subagents canvas; do
|
||||
kubectl cp -n openclaw -c openclaw "$POD_NAME:/home/openclaw/.openclaw/$item" "$BACKUP_DIR/$item"
|
||||
done
|
||||
```
|
||||
|
||||
### Restore from a Local Backup
|
||||
If your PVC is recreated or wiped, you can push the backup files back to the new pod:
|
||||
|
||||
```bash
|
||||
# Push directories back to the running pod (BACKUP_DIR must be set to the same path used for the backup)
|
||||
kubectl cp "$BACKUP_DIR/workspace" openclaw-0:/home/openclaw/.openclaw/workspace -n openclaw -c openclaw
|
||||
# Repeat for other folders as needed
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Operational Verification Commands
|
||||
|
||||
```bash
|
||||
# Check the controller reconciliation phase (Expected: PHASE=Running)
|
||||
kubectl get openclawinstances -n openclaw
|
||||
|
||||
# View the status of all 6 containers inside the StatefulSet pod
|
||||
kubectl get pods -n openclaw -o wide
|
||||
|
||||
# Review live logs of the core OpenClaw engine
|
||||
kubectl logs -n openclaw openclaw-0 -c openclaw --tail=100 -f
|
||||
|
||||
# Review live logs of the Chromium sidecar
|
||||
kubectl logs -n openclaw openclaw-0 -c chromium --tail=100 -f
|
||||
```
|
||||
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,13 @@
|
|||
apiVersion: networking.k8s.io/v1
|
||||
kind: IngressClass
|
||||
metadata:
|
||||
name: nginx
|
||||
spec:
|
||||
controller: traefik.io/ingress-controller
|
||||
---
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: IngressClass
|
||||
metadata:
|
||||
name: traefik
|
||||
spec:
|
||||
controller: traefik.io/ingress-controller
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
deployment:
|
||||
kind: DaemonSet
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
|
||||
hostNetwork: true
|
||||
|
||||
# Bind directly to host ports 80 and 443
|
||||
ports:
|
||||
web:
|
||||
port: 80
|
||||
hostPort: 80
|
||||
expose:
|
||||
default: true
|
||||
exposedPort: 80
|
||||
websecure:
|
||||
port: 443
|
||||
hostPort: 443
|
||||
expose:
|
||||
default: true
|
||||
exposedPort: 443
|
||||
# Avoid port collision with node-exporter on host network (9100)
|
||||
metrics:
|
||||
port: 9101
|
||||
hostPort: 9101
|
||||
exposedPort: 9101
|
||||
|
||||
# Configure Traefik to watch for standard Kubernetes Ingress resources
|
||||
providers:
|
||||
kubernetesIngress:
|
||||
enabled: true
|
||||
publishedService:
|
||||
enabled: false
|
||||
|
||||
# We will define IngressClass resources manually to achieve dual-class mapping
|
||||
ingressClass:
|
||||
enabled: false
|
||||
|
||||
# Resource limits to ensure stable execution on a single node
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
|
||||
# Run as root (UID/GID 0) to bind to host network ports 80/443
|
||||
podSecurityContext:
|
||||
runAsGroup: 0
|
||||
runAsNonRoot: false
|
||||
runAsUser: 0
|
||||
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: true
|
||||
capabilities:
|
||||
drop: []
|
||||
add:
|
||||
- NET_BIND_SERVICE
|
||||
readOnlyRootFilesystem: false
|
||||
|
||||
# Required for hostNetwork DaemonSets to allow rolling updates
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 1
|
||||
maxSurge: 0
|
||||
Loading…
Reference in New Issue