Howto ramalama

install nvidia drivers

install nvidia drivers:

https://wiki.vidalinux.org/index.php?title=Howto_install_nvidia_drivers

install

  • fedora

install podman:

sudo dnf -y install podman podman-compose

install ramalama:

sudo dnf -y install python3-ramalama

install via pypi:

pip install ramalama
  • debian/ubuntu

install podman:

sudo apt install podman podman-compose -y

install ramalama:

curl -fsSL https://ramalama.ai/install.sh | bash
  • archlinux

install podman:

sudo pacman -Sy podman podman-compose --noconfirm

install yay using chaotic repo:

https://wiki.vidalinux.org/index.php?title=Howto_NVK#enable_chaotic_repo

install ramalama using yay:

yay -S ramalama
  • mac

use the official script to install ramalama on mac:

curl -fsSL https://ramalama.ai/install.sh | bash
  • windows

for windows, first install wsl:

https://wiki.vidalinux.org/index.php?title=Howto_wsl_windows
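
after installing on any platform, you can verify the cli works and check which container engine and gpu ramalama detected (output details vary between versions):

ramalama version
ramalama info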

usage

pull model openai gpt-oss:

ramalama pull gpt-oss

pull model deepseek-r1:

ramalama pull deepseek

pull model granite:

ramalama pull granite
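
to confirm the pulls, list the models stored locally:

ramalama list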

run model ibm granite:

ramalama run granite

serve model:

ramalama serve gpt-oss

serve model with vulkan backend:

ramalama serve --image=quay.io/ramalama/ramalama:latest deepseek

serve model with intel-gpu backend:

ramalama serve --image=quay.io/ramalama/intel-gpu:latest deepseek

serve model with nvidia-gpu backend:

ramalama serve --image=quay.io/ramalama/cuda:latest deepseek

serve model with amd-gpu backend:

ramalama serve --image=quay.io/ramalama/rocm:latest deepseek

serve model as daemon:

ramalama serve --port 8080 --name llamaserver -d deepseek

open the web chat ui in a browser:

http://localhost:8080
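
besides the web ui, the llama.cpp server behind ramalama serve exposes an openai-compatible api; a minimal curl sketch against the daemon example above (port 8080 and the model name are assumptions taken from that example, and the model field may be ignored or need to match the served alias):

curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "deepseek-r1", "messages": [{"role": "user", "content": "hello"}]}'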

show container runtime command output without executing it:

ramalama --dryrun run deepseek

stop model service:

ramalama stop llamaserver

convert the specified model to an oci-formatted ai model:

ramalama convert ollama://tinyllama:latest oci://quay.io/rhatdan/tiny:latest
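
once converted, the oci image can be run straight from its registry reference (reusing the image name from the convert example above):

ramalama run oci://quay.io/rhatdan/tiny:latest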

file and dir location

shortname file location:

/usr/share/ramalama/shortnames.conf
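
shortnames.conf maps the short names used in this howto (deepseek, granite, tiny) to full model references; an illustrative toml entry, not the literal file shipped with your version:

[shortnames]
  "deepseek" = "ollama://deepseek-r1"
  "tiny" = "ollama://tinyllama"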

ramalama.conf file location:

/usr/share/ramalama/ramalama.conf
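
ramalama.conf is a toml file with cli defaults; a hedged sketch of typical keys (see ramalama.conf(5) for the authoritative list, names can change between releases):

[ramalama]
# assumed example values, uncomment and adjust to override defaults
#engine = "podman"
#image = "quay.io/ramalama/ramalama:latest"
#runtime = "llama.cpp"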

models directory location running as normal user:

~/.local/share/ramalama/store/

models directory location running as root user:

/var/lib/ramalama/store

podman-compose

podman-compose.yml for nvidia gpu:

version: '3.8'

services:
  llama:
    container_name: llama_server
    image: quay.io/ramalama/cuda:latest
    command: [
      "llama-server",
      "--port", "8080",
      "--model", "/mnt/models/deepseek-r1",
      "--no-warmup",
      "--jinja",
      "--chat-template-file", "/mnt/models/chat_template",
      "--log-colors",
      "--alias", "deepseek-r1",
      "--ctx-size", "2048",
      "--temp", "0.8",
      "--cache-reuse", "256",
      "--flash-attn",
      "-ngl", "999",
      "--threads", "12",
      "--host", "0.0.0.0"
    ]
    ports:
      - "8080:8080"
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HOME=/tmp
    labels:
      - "ai.ramalama.model=ollama://library/deepseek-r1:latest"
      - "ai.ramalama.engine=podman"
      - "ai.ramalama.runtime=llama.cpp"
      - "ai.ramalama.port=8080"
      - "ai.ramalama.command=serve"
      - "ai.ramalama=true"
    devices:
      - "/dev/dri:/dev/dri"
      - "nvidia.com/gpu=all"
    security_opt:
      - "label=disable"
      - "no-new-privileges"
    cap_drop:
      - ALL
    init: true
    pull: newer
    runtime: /usr/bin/nvidia-container-runtime
    rm: true
    volumes:
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-e6a7edc1a4d7d9b2de136a221a57336b76316cfe53a252aeba814496c5ae439d:/mnt/models/deepseek-r1:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-f64cd5418e4b038ef90cf5fab6eb7ce6ae8f18909416822751d3b9fca827c2ab:/mnt/models/config.json:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-c5ad996bda6eed4df6e3b605a9869647624851ac248209d22fd5e2c0cc1121d3:/mnt/models/chat_template:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-17f49571369a083a6f73503f294d46b7144ebbf57d90a41925cc8de77b4b2ac6:/mnt/models/chat_template_converted:ro"
      - "/tmp:/tmp"
    detach: true

podman-compose.yml for amd gpu:

version: '3.8'

services:
  llama:
    container_name: llama_server
    image: quay.io/ramalama/rocm:latest
    command: [
      "llama-server",
      "--port", "8080",
      "--model", "/mnt/models/deepseek-r1",
      "--no-warmup",
      "--jinja",
      "--chat-template-file", "/mnt/models/chat_template",
      "--log-colors",
      "--alias", "deepseek-r1",
      "--ctx-size", "2048",
      "--temp", "0.8",
      "--cache-reuse", "256",
      "--flash-attn",
      "-ngl", "999",
      "--threads", "12",
      "--host", "0.0.0.0"
    ]
    ports:
      - "8080:8080"
    environment:
      - HIP_VISIBLE_DEVICES=0
      - HOME=/tmp
    labels:
      - "ai.ramalama.model=ollama://library/deepseek-r1:latest"
      - "ai.ramalama.engine=podman"
      - "ai.ramalama.runtime=llama.cpp"
      - "ai.ramalama.port=8080"
      - "ai.ramalama.command=serve"
      - "ai.ramalama=true"
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/kfd:/dev/kfd"
    security_opt:
      - "label=disable"
      - "no-new-privileges"
    cap_drop:
      - ALL
    init: true
    pull: newer
    rm: true
    volumes:
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-e6a7edc1a4d7d9b2de136a221a57336b76316cfe53a252aeba814496c5ae439d:/mnt/models/deepseek-r1:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-f64cd5418e4b038ef90cf5fab6eb7ce6ae8f18909416822751d3b9fca827c2ab:/mnt/models/config.json:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-c5ad996bda6eed4df6e3b605a9869647624851ac248209d22fd5e2c0cc1121d3:/mnt/models/chat_template:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-17f49571369a083a6f73503f294d46b7144ebbf57d90a41925cc8de77b4b2ac6:/mnt/models/chat_template_converted:ro"
      - "/tmp:/tmp"
    detach: true

get the mount points of the running container:

CONTAINERID=$(podman ps|grep -v CONTAINER|awk '{print $1}')
podman inspect ${CONTAINERID}|grep mount=type=bind,src=|sed 's|--mount=type=bind,src=||g'|sed 's|,destination=|:|g'|sed 's|,|:|g'|sed 's|:$||'

edit podman-compose.yml and replace the volumes entries with the output from the previous command.

deploy container using podman-compose.yml:

podman-compose -f podman-compose.yml up -d
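
check that the container came up, and tear it down with the same compose file when finished:

podman ps
podman-compose -f podman-compose.yml down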

running as systemd daemon

to run the container as a systemd daemon, create this directory:

mkdir -p ~/.config/containers/systemd/

create a kubernetes yaml from the deployed container:

CONTAINERID=$(podman ps|grep -v CONTAINER|awk '{print $1}')
podman kube generate ${CONTAINERID} -f ~/.config/containers/systemd/llamaserver.yaml

remove all running containers:

podman rm -af

create the systemd quadlet file:

cat > ~/.config/containers/systemd/llamaserver.kube << EOF
[Unit]
Description = Run Kubernetes YAML with podman kube play

[Kube]
Yaml=llamaserver.yaml
EOF

reload systemd:

systemctl --user daemon-reload

start the service using systemd:

systemctl --user start llamaserver.service
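
check the service status and logs, and optionally enable lingering so the user service keeps running after logout:

systemctl --user status llamaserver.service
journalctl --user -u llamaserver.service
loginctl enable-linger $USER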

references

  • https://ramalama.ai
  • https://crfm.stanford.edu/2023/03/13/alpaca.html
  • https://www.youtube.com/watch?v=53NZFC-ReWs
  • https://youtu.be/WzG0-64VJG0?si=tT6QEEA-IfUTPOGB
  • https://github.com/containers/ramalama/tree/main/docs
  • https://github.com/containers/ramalama