Howto ramalama
install nvidia drivers
install nvidia drivers:
https://wiki.vidalinux.org/index.php?title=Howto_install_nvidia_drivers
install
- fedora
install podman:
sudo dnf -y install podman podman-compose
install ramalama:
sudo dnf -y install python3-ramalama
install via pypi:
pip install ramalama
- debian/ubuntu
install podman:
apt install podman podman-compose -y
install ramalama:
curl -fsSL https://ramalama.ai/install.sh | bash
- archlinux
install podman:
pacman -Sy podman podman-compose --noconfirm
install yay using chaotic repo:
https://wiki.vidalinux.org/index.php?title=Howto_NVK#enable_chaotic_repo
install ramalama using yay:
yay -S ramalama
- mac
use the official script to install ramalama on mac:
curl -fsSL https://ramalama.ai/install.sh | bash
- windows
on windows, first install wsl:
https://wiki.vidalinux.org/index.php?title=Howto_wsl_windows
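after installing on any platform, a quick sanity check is to print the client version and the detected configuration (ramalama version and ramalama info are both upstream subcommands; info also reports the container engine and gpu support it found):
ramalama version
ramalama info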
usage
pull model openai gpt-oss:
ramalama pull gpt-oss
pull model deepseek-r1:
ramalama pull deepseek
pull model granite:
ramalama pull granite
run model ibm granite:
ramalama run granite
serve model:
ramalama serve gpt-oss
serve model with vulkan backend:
ramalama serve --image=quay.io/ramalama/ramalama:latest deepseek
serve model with intel-gpu backend:
ramalama serve --image=quay.io/ramalama/intel-gpu:latest deepseek
serve model with nvidia-gpu backend:
ramalama serve --image=quay.io/ramalama/cuda:latest deepseek
serve model with amd-gpu backend:
ramalama serve --image=quay.io/ramalama/rocm:latest deepseek
serve model as daemon:
ramalama serve --port 8080 --name llamaserver -d deepseek
open the web chat ui in a browser:
http://localhost:8080
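with the default llama.cpp runtime, the same port also exposes an OpenAI-compatible REST API; a minimal sketch with curl, assuming the daemon from the previous step is listening on port 8080 (the model field is mostly informational for a single-model server):
curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "deepseek-r1", "messages": [{"role": "user", "content": "hello"}]}'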
show the container runtime command ramalama would run, without executing it:
ramalama --dryrun run deepseek
stop model service:
ramalama stop llamaserver
convert the specified model to an oci-formatted ai model:
ramalama convert ollama://tinyllama:latest oci://quay.io/rhatdan/tiny:latest
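the resulting artifact can then be referenced with the oci:// transport; a sketch, assuming the tag from the convert step above and push access to the registry:
ramalama push oci://quay.io/rhatdan/tiny:latest
ramalama run oci://quay.io/rhatdan/tiny:latest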
file and directory locations
shortnames.conf file location:
/usr/share/ramalama/shortnames.conf
ramalama.conf file location:
/usr/share/ramalama/ramalama.conf
models directory location when running as a normal user:
~/.local/share/ramalama/store/
models directory location when running as root:
/var/lib/ramalama/store
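to see which models are already in the store (and confirm which directory is being used), the list subcommand plus a directory listing is usually enough:
ramalama ls
ls ~/.local/share/ramalama/store/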
podman-compose
podman-compose.yml for nvidia gpu:
version: '3.8'
services:
  llama:
    container_name: llama_server
    image: quay.io/ramalama/cuda:latest
    command: [
      "llama-server",
      "--port", "8080",
      "--model", "/mnt/models/deepseek-r1",
      "--no-warmup",
      "--jinja",
      "--chat-template-file", "/mnt/models/chat_template",
      "--log-colors",
      "--alias", "deepseek-r1",
      "--ctx-size", "2048",
      "--temp", "0.8",
      "--cache-reuse", "256",
      "--flash-attn",
      "-ngl", "999",
      "--threads", "12",
      "--host", "0.0.0.0"
    ]
    ports:
      - "8080:8080"
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HOME=/tmp
    labels:
      - "ai.ramalama.model=ollama://library/deepseek-r1:latest"
      - "ai.ramalama.engine=podman"
      - "ai.ramalama.runtime=llama.cpp"
      - "ai.ramalama.port=8080"
      - "ai.ramalama.command=serve"
      - "ai.ramalama=true"
    devices:
      - "/dev/dri:/dev/dri"
      - "nvidia.com/gpu=all"
    security_opt:
      - "label=disable"
      - "no-new-privileges"
    cap_drop:
      - ALL
    init: true
    pull: newer
    runtime: /usr/bin/nvidia-container-runtime
    rm: true
    volumes:
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-e6a7edc1a4d7d9b2de136a221a57336b76316cfe53a252aeba814496c5ae439d:/mnt/models/deepseek-r1:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-f64cd5418e4b038ef90cf5fab6eb7ce6ae8f18909416822751d3b9fca827c2ab:/mnt/models/config.json:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-c5ad996bda6eed4df6e3b605a9869647624851ac248209d22fd5e2c0cc1121d3:/mnt/models/chat_template:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-17f49571369a083a6f73503f294d46b7144ebbf57d90a41925cc8de77b4b2ac6:/mnt/models/chat_template_converted:ro"
      - "/tmp:/tmp"
    detach: true
podman-compose.yml for amd gpu:
version: '3.8'
services:
  llama:
    container_name: llama_server
    image: quay.io/ramalama/rocm:latest
    command: [
      "llama-server",
      "--port", "8080",
      "--model", "/mnt/models/deepseek-r1",
      "--no-warmup",
      "--jinja",
      "--chat-template-file", "/mnt/models/chat_template",
      "--log-colors",
      "--alias", "deepseek-r1",
      "--ctx-size", "2048",
      "--temp", "0.8",
      "--cache-reuse", "256",
      "--flash-attn",
      "-ngl", "999",
      "--threads", "12",
      "--host", "0.0.0.0"
    ]
    ports:
      - "8080:8080"
    environment:
      - HIP_VISIBLE_DEVICES=0
      - HOME=/tmp
    labels:
      - "ai.ramalama.model=ollama://library/deepseek-r1:latest"
      - "ai.ramalama.engine=podman"
      - "ai.ramalama.runtime=llama.cpp"
      - "ai.ramalama.port=8080"
      - "ai.ramalama.command=serve"
      - "ai.ramalama=true"
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/kfd:/dev/kfd"
    security_opt:
      - "label=disable"
      - "no-new-privileges"
    cap_drop:
      - ALL
    init: true
    pull: newer
    rm: true
    volumes:
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-e6a7edc1a4d7d9b2de136a221a57336b76316cfe53a252aeba814496c5ae439d:/mnt/models/deepseek-r1:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-f64cd5418e4b038ef90cf5fab6eb7ce6ae8f18909416822751d3b9fca827c2ab:/mnt/models/config.json:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-c5ad996bda6eed4df6e3b605a9869647624851ac248209d22fd5e2c0cc1121d3:/mnt/models/chat_template:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-17f49571369a083a6f73503f294d46b7144ebbf57d90a41925cc8de77b4b2ac6:/mnt/models/chat_template_converted:ro"
      - "/tmp:/tmp"
    detach: true
get the mount points of the running container:
CONTAINERID=$(podman ps|grep -v CONTAINER|awk '{print $1}')
podman inspect ${CONTAINERID}|grep mount=type=bind,src=|sed 's|--mount=type=bind,src=||g'|sed 's|,destination=|:|g'|sed 's|,|:|g'|sed 's|:$||'
edit podman-compose.yml and replace the volumes entries with the paths from the previous command's output.
deploy container using podman-compose.yml:
podman-compose -f podman-compose.yml up -d
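to verify the deployment, check that the container is up and that the endpoint answers (container_name llama_server and port 8080 come from the compose file above; /v1/models is the OpenAI-compatible listing endpoint served by llama-server):
podman ps
podman logs llama_server
curl -s http://localhost:8080/v1/models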
running as systemd daemon
to run the container as a systemd daemon, create this directory:
mkdir ~/.config/containers/systemd/
generate kubernetes yaml from the deployed container:
CONTAINERID=$(podman ps|grep -v CONTAINER|awk '{print $1}')
podman kube generate ${CONTAINERID} -f ~/.config/containers/systemd/llamaserver.yaml
remove all running containers:
podman rm -af
create the systemd quadlet file:
cat > ~/.config/containers/systemd/llamaserver.kube << EOF
[Unit]
Description = Run Kubernetes YAML with podman kube play
[Kube]
Yaml=llamaserver.yaml
EOF
reload systemd:
systemctl --user daemon-reload
start service using systemd:
systemctl --user start llamaserver.service
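optionally, check the unit and keep user services running after logout (enable-linger is only needed if the service must survive the login session):
systemctl --user status llamaserver.service
journalctl --user -u llamaserver.service -f
loginctl enable-linger $USER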