Howto ramalama
install nvidia drivers
install nvidia drivers:
https://wiki.vidalinux.org/index.php?title=Howto_install_nvidia_drivers
install
- fedora
install podman:
sudo dnf -y install podman podman-compose
install ramalama:
sudo dnf -y install python3-ramalama
install via pypi:
pip install ramalama
- debian/ubuntu
install podman:
apt install podman podman-compose -y
install ramalama:
curl -fsSL https://ramalama.ai/install.sh | bash
- archlinux
install podman:
pacman -Sy podman podman-compose --noconfirm
install yay using chaotic repo:
https://wiki.vidalinux.org/index.php?title=Howto_NVK#enable_chaotic_repo
install ramalama using yay:
yay -S ramalama
- mac
use the official script to install ramalama on mac:
curl -fsSL https://ramalama.ai/install.sh | bash
- windows
on windows, first install wsl:
https://wiki.vidalinux.org/index.php?title=Howto_wsl_windows
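after installing on any platform, a quick sanity check is to print the client version and the detected configuration (ramalama version and ramalama info are both upstream subcommands; info also reports the container engine and gpu support it found):
ramalama version
ramalama info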
usage
pull model openai gpt-oss:
ramalama pull gpt-oss
pull model deepseek-r1:
ramalama pull deepseek
pull model granite:
ramalama pull granite
run model ibm granite:
ramalama run granite
serve model:
ramalama serve gpt-oss
serve model with vulkan backend:
ramalama serve --image=quay.io/ramalama/ramalama:latest deepseek
serve model with intel-gpu backend:
ramalama serve --image=quay.io/ramalama/intel-gpu:latest deepseek
serve model with nvidia-gpu backend:
ramalama serve --image=quay.io/ramalama/cuda:latest deepseek
serve model with amd-gpu backend:
ramalama serve --image=quay.io/ramalama/rocm:latest deepseek
serve model as daemon:
ramalama serve --port 8080 --name llamaserver -d deepseek
open the web chat ui in a browser:
http://localhost:8080
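with the default llama.cpp runtime, the same port also exposes an OpenAI-compatible REST API; a minimal sketch with curl, assuming the daemon from the previous step is listening on port 8080 (the model field is mostly informational for a single-model server):
curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "deepseek-r1", "messages": [{"role": "user", "content": "hello"}]}'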
show the container runtime command ramalama would run, without executing it:
ramalama --dryrun run deepseek
stop model service:
ramalama stop llamaserver
convert the specified model to an oci-formatted ai model:
ramalama convert ollama://tinyllama:latest oci://quay.io/rhatdan/tiny:latest
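the resulting artifact can then be referenced with the oci:// transport; a sketch, assuming the tag from the convert step above and push access to the registry:
ramalama push oci://quay.io/rhatdan/tiny:latest
ramalama run oci://quay.io/rhatdan/tiny:latest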
file and directory locations
shortnames.conf file location:
/usr/share/ramalama/shortnames.conf
ramalama.conf file location:
/usr/share/ramalama/ramalama.conf
models directory location when running as a normal user:
~/.local/share/ramalama/store/
models directory location when running as root:
/var/lib/ramalama/store
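to see which models are already in the store (and confirm which directory is being used), the list subcommand plus a directory listing is usually enough:
ramalama ls
ls ~/.local/share/ramalama/store/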
podman-compose
podman-compose.yml for nvidia gpu:
version: '3.8'
services:
  llama:
    container_name: llama_server
    image: quay.io/ramalama/cuda:latest
    command: [
      "llama-server",
      "--port", "8080",
      "--model", "/mnt/models/deepseek-r1",
      "--no-warmup",
      "--jinja",
      "--chat-template-file", "/mnt/models/chat_template",
      "--log-colors",
      "--alias", "deepseek-r1",
      "--ctx-size", "2048",
      "--temp", "0.8",
      "--cache-reuse", "256",
      "--flash-attn",
      "-ngl", "999",
      "--threads", "12",
      "--host", "0.0.0.0"
    ]
    ports:
      - "8080:8080"
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HOME=/tmp
    labels:
      - "ai.ramalama.model=ollama://library/deepseek-r1:latest"
      - "ai.ramalama.engine=podman"
      - "ai.ramalama.runtime=llama.cpp"
      - "ai.ramalama.port=8080"
      - "ai.ramalama.command=serve"
      - "ai.ramalama=true"
    devices:
      - "/dev/dri:/dev/dri"
      - "nvidia.com/gpu=all"
    security_opt:
      - "label=disable"
      - "no-new-privileges"
    cap_drop:
      - ALL
    init: true
    pull: newer
    runtime: /usr/bin/nvidia-container-runtime
    rm: true
    volumes:
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-e6a7edc1a4d7d9b2de136a221a57336b76316cfe53a252aeba814496c5ae439d:/mnt/models/deepseek-r1:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-f64cd5418e4b038ef90cf5fab6eb7ce6ae8f18909416822751d3b9fca827c2ab:/mnt/models/config.json:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-c5ad996bda6eed4df6e3b605a9869647624851ac248209d22fd5e2c0cc1121d3:/mnt/models/chat_template:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-17f49571369a083a6f73503f294d46b7144ebbf57d90a41925cc8de77b4b2ac6:/mnt/models/chat_template_converted:ro"
      - "/tmp:/tmp"
    detach: true
podman-compose.yml for amd gpu:
version: '3.8'
services:
  llama:
    container_name: llama_server
    image: quay.io/ramalama/rocm:latest
    command: [
      "llama-server",
      "--port", "8080",
      "--model", "/mnt/models/deepseek-r1",
      "--no-warmup",
      "--jinja",
      "--chat-template-file", "/mnt/models/chat_template",
      "--log-colors",
      "--alias", "deepseek-r1",
      "--ctx-size", "2048",
      "--temp", "0.8",
      "--cache-reuse", "256",
      "--flash-attn",
      "-ngl", "999",
      "--threads", "12",
      "--host", "0.0.0.0"
    ]
    ports:
      - "8080:8080"
    environment:
      - HIP_VISIBLE_DEVICES=0
      - HOME=/tmp
    labels:
      - "ai.ramalama.model=ollama://library/deepseek-r1:latest"
      - "ai.ramalama.engine=podman"
      - "ai.ramalama.runtime=llama.cpp"
      - "ai.ramalama.port=8080"
      - "ai.ramalama.command=serve"
      - "ai.ramalama=true"
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/kfd:/dev/kfd"
    security_opt:
      - "label=disable"
      - "no-new-privileges"
    cap_drop:
      - ALL
    init: true
    pull: newer
    rm: true
    volumes:
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-e6a7edc1a4d7d9b2de136a221a57336b76316cfe53a252aeba814496c5ae439d:/mnt/models/deepseek-r1:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-f64cd5418e4b038ef90cf5fab6eb7ce6ae8f18909416822751d3b9fca827c2ab:/mnt/models/config.json:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-c5ad996bda6eed4df6e3b605a9869647624851ac248209d22fd5e2c0cc1121d3:/mnt/models/chat_template:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-17f49571369a083a6f73503f294d46b7144ebbf57d90a41925cc8de77b4b2ac6:/mnt/models/chat_template_converted:ro"
      - "/tmp:/tmp"
    detach: true
get the mount points of the running container:
CONTAINERID=$(podman ps|grep -v CONTAINER|awk '{print $1}')
podman inspect ${CONTAINERID}|grep mount=type=bind,src=|sed 's|--mount=type=bind,src=||g'|sed 's|,destination=|:|g'|sed 's|,|:|g'|sed 's|:$||'
edit podman-compose.yml and replace the volumes entries with the paths from the previous command's output.
deploy container using podman-compose.yml:
podman-compose -f podman-compose.yml up -d
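to verify the deployment, check that the container is up and that the endpoint answers (container_name llama_server and port 8080 come from the compose file above; /v1/models is the OpenAI-compatible listing endpoint served by llama-server):
podman ps
podman logs llama_server
curl -s http://localhost:8080/v1/models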
running as systemd daemon
to run the container as a systemd daemon, create this directory:
mkdir ~/.config/containers/systemd/
generate kubernetes yaml from the deployed container:
CONTAINERID=$(podman ps|grep -v CONTAINER|awk '{print $1}')
podman kube generate ${CONTAINERID} -f ~/.config/containers/systemd/llamaserver.yaml
remove all running containers:
podman rm -af
create the systemd quadlet file:
cat > ~/.config/containers/systemd/llamaserver.kube << EOF
[Unit]
Description = Run Kubernetes YAML with podman kube play
[Kube]
Yaml=llamaserver.yaml
EOF
reload systemd:
systemctl --user daemon-reload
start service using systemd:
systemctl --user start llamaserver.service
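optionally, check the unit and keep user services running after logout (enable-linger is only needed if the service must survive the login session):
systemctl --user status llamaserver.service
journalctl --user -u llamaserver.service -f
loginctl enable-linger $USER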