# infrapuzzle/k8s/llm/ollama.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: llm
  labels:
    app: ollama
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
    spec:
      # The former curl-based init container was removed: an init container runs
      # before the main container and therefore can never reach the Ollama API
      # to pre-pull a model. The pre-pull is done in the entrypoint wrapper below.
      containers:
        - name: ollama
          # NOTE(review): pin a specific tag or digest instead of :latest for
          # reproducible rollouts.
          image: ollama/ollama:latest
          env:
            - name: OLLAMA_KEEP_ALIVE
              value: "-1"        # keep loaded models resident indefinitely
            - name: OLLAMA_HOST
              value: "0.0.0.0"   # bind all interfaces so the Service can reach it
          resources:
            requests:
              memory: "8Gi"
              cpu: "2"
            limits:
              memory: "12Gi"
              cpu: "4"
          ports:
            - containerPort: 11434
              name: http
          volumeMounts:
            - name: ollama-storage
              mountPath: /root/.ollama
          # Ollama does not serve /api/health; the original probe path would 404
          # and crash-loop the pod. /api/version returns 200 once the server is up.
          livenessProbe:
            httpGet:
              path: /api/version
              port: http
            initialDelaySeconds: 60
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /api/version
              port: http
            initialDelaySeconds: 30
            periodSeconds: 5
          command: ["/bin/sh", "-c"]
          args:
            - |
              # Start the server in the background, wait until the API actually
              # answers (instead of a blind sleep), then pre-pull the model so
              # first requests don't block on a multi-GB download.
              /bin/ollama serve &
              PID=$!
              echo "Waiting for Ollama..."
              until ollama list >/dev/null 2>&1; do
                sleep 2
              done
              echo "Pulling model..."
              ollama pull llama3.1:8b-instruct-q8_0
              echo "Model pulled. Keeping container alive."
              wait $PID
      volumes:
        - name: ollama-storage
          persistentVolumeClaim:
            claimName: ollama-storage
---
# ClusterIP Service exposing the Ollama API inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: ollama
  namespace: llm
spec:
  type: ClusterIP
  selector:
    app: ollama
  ports:
    - protocol: TCP
      port: 11434
      targetPort: 11434
---
# Persistent model cache mounted at /root/.ollama in the Deployment, so
# pulled models survive pod restarts.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ollama-storage
  namespace: llm
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: openebs-hostpath
  resources:
    requests:
      storage: 50Gi