# infrapuzzle/k8s/llm/llama_cpp_hosting.yaml
---
# Dedicated namespace isolating every LLM-serving resource in this file.
apiVersion: v1
kind: Namespace
metadata:
  name: llm
---
# llama.cpp HTTP server serving a quantized Llama-3-8B-Instruct GGUF model.
# An init container downloads the model into a PVC once; restarts reuse it.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-cpp-server
  namespace: llm
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-cpp-server
  strategy:
    # Recreate: the model PVC is ReadWriteOnce, so the old pod must release
    # the volume before the replacement pod can mount it.
    type: Recreate
  template:
    metadata:
      labels:
        app: llama-cpp-server
    spec:
      initContainers:
        - name: download-model
          image: curlimages/curl
          command:
            - /bin/sh
            - -c
            - |
              MODEL_URL="https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q8_0.gguf?download=true"
              MODEL_FILE="/models/Meta-Llama-3-8B-Instruct.Q8_0.gguf"
              # Purge everything (including stale partial downloads) except
              # the desired model file.
              find /models -type f ! -name "$(basename "$MODEL_FILE")" -delete
              # Download only when the model is missing.
              # --fail: exit non-zero on HTTP errors instead of saving the
              # server's error page as the "model" (which the existence check
              # above would then cache forever).
              # Download to a temp name and rename atomically so an
              # interrupted download is never mistaken for a complete model.
              if [ ! -f "$MODEL_FILE" ]; then
                curl --fail -L -o "${MODEL_FILE}.tmp" "$MODEL_URL" \
                  && mv "${MODEL_FILE}.tmp" "$MODEL_FILE"
              fi
          volumeMounts:
            - name: model-storage
              mountPath: /models
      containers:
        - name: llama-cpp-server
          image: ghcr.io/ggerganov/llama.cpp:server
          command:
            - /server
            - -m
            - "/models/Meta-Llama-3-8B-Instruct.Q8_0.gguf"
            - --port
            - "8000"
            - --host
            - "0.0.0.0"
            # -n: max tokens to predict per request.
            - -n
            - "512"
          resources:
            requests:
              memory: "18Gi"
              # Quoted quantity string: an unquoted 0.1 is parsed as a YAML
              # float before Kubernetes sees it. "100m" is exactly 0.1 CPU.
              cpu: "100m"
          volumeMounts:
            - name: model-storage
              mountPath: /models
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: llama-model-pvc
---
# Persistent storage for the downloaded GGUF model (~8.5 GB at Q8_0),
# shared between the download init container and the server container.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-model-pvc
  namespace: llm
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 20Gi
---
# Cluster-internal service fronting the llama.cpp server pods on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llama-server-service
  namespace: llm
spec:
  type: ClusterIP
  selector:
    app: llama-cpp-server
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
---
# Public HTTPS entry point for the llama.cpp server, protected by nginx
# basic auth (credentials in the llama-auth Secret) and a Let's Encrypt
# certificate issued via cert-manager.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: llama-server-service
  namespace: llm
  annotations:
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    nginx.ingress.kubernetes.io/auth-type: basic
    nginx.ingress.kubernetes.io/auth-secret: llama-auth
    nginx.ingress.kubernetes.io/auth-realm: "Authentication Required - llama webui"
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    # NOTE(review): the deprecated kubernetes.io/ingress.class annotation was
    # removed — spec.ingressClassName below is the authoritative replacement,
    # and keeping both creates two sources of truth.
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - "llama.moritzgraf.de"
      secretName: llama-moritzgraf-de
  rules:
    - host: llama.moritzgraf.de
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: llama-server-service
                port:
                  number: 8000
---
# Basic-auth credentials consumed by the Ingress auth-secret annotation.
apiVersion: v1
kind: Secret
metadata:
  name: llama-auth
  namespace: llm
type: Opaque
data:
  # base64 of an htpasswd file (generated with: htpasswd -c auth <user>).
  # SECURITY(review): the original file recorded the plaintext password in a
  # comment here — never commit plaintext credentials; rotate this credential
  # since the VCS history exposed it.
  auth: ZmFiaWFuOiRhcHIxJHRTV3YzU3hOJHJPZEJ5WXhYdG4vbVJtSzhtaENWZy4K