# infrapuzzle/k8s/llm/ollama.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: llm
  labels:
    app: ollama
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
    spec:
      # The former curl-based init container was removed: an init container runs
      # before the main container and therefore can never reach the Ollama API
      # to pre-pull a model. The pre-pull is done in the entrypoint wrapper below.
      containers:
        - name: ollama
          # NOTE(review): pin a specific tag or digest instead of :latest for
          # reproducible rollouts.
          image: ollama/ollama:latest
          env:
            - name: OLLAMA_KEEP_ALIVE
              value: "-1"        # keep loaded models resident indefinitely
            - name: OLLAMA_HOST
              value: "0.0.0.0"   # bind all interfaces so the Service can reach it
          resources:
            requests:
              memory: "8Gi"
              cpu: "2"
            limits:
              memory: "12Gi"
              cpu: "4"
          ports:
            - containerPort: 11434
              name: http
          volumeMounts:
            - name: ollama-storage
              mountPath: /root/.ollama
          # Ollama does not serve /api/health; the original probe path would 404
          # and crash-loop the pod. /api/version returns 200 once the server is up.
          livenessProbe:
            httpGet:
              path: /api/version
              port: http
            initialDelaySeconds: 60
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /api/version
              port: http
            initialDelaySeconds: 30
            periodSeconds: 5
          command: ["/bin/sh", "-c"]
          args:
            - |
              # Start the server in the background, wait until the API actually
              # answers (instead of a blind sleep), then pre-pull the model so
              # first requests don't block on a multi-GB download.
              /bin/ollama serve &
              PID=$!
              echo "Waiting for Ollama..."
              until ollama list >/dev/null 2>&1; do
                sleep 2
              done
              echo "Pulling model..."
              ollama pull llama3.1:8b-instruct-q8_0
              echo "Model pulled. Keeping container alive."
              wait $PID
      volumes:
        - name: ollama-storage
          persistentVolumeClaim:
            claimName: ollama-storage
---
# ClusterIP Service exposing the Ollama API inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: ollama
  namespace: llm
spec:
  type: ClusterIP
  selector:
    app: ollama
  ports:
    - protocol: TCP
      port: 11434
      targetPort: 11434
---
# Persistent model cache mounted at /root/.ollama in the Deployment, so
# pulled models survive pod restarts.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ollama-storage
  namespace: llm
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: openebs-hostpath
  resources:
    requests:
      storage: 50Gi