Howto ramalama
install
- fedora
install podman:
sudo dnf -y install podman podman-compose
install ramalama:
sudo dnf -y install python3-ramalama
install via pypi:
pip install ramalama
- debian/ubuntu
install podman:
sudo apt install -y podman podman-compose
install ramalama:
curl -fsSL https://ramalama.ai/install.sh | bash
- archlinux
install podman:
sudo pacman -Sy podman podman-compose --noconfirm
install yay using the chaotic-aur repo:
https://wiki.vidalinux.org/index.php?title=Howto_NVK#enable_chaotic_repo
install ramalama using yay:
yay -S ramalama
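regardless of the install method above, confirm the cli is available by printing its version (output format may vary between releases):
ramalama version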
usage
pull model openai gpt-oss:
ramalama pull gpt-oss
pull model deepseek-r1:
ramalama pull deepseek
pull model granite:
ramalama pull granite
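to verify the downloads, list the models in the local store (the exact columns shown may differ between ramalama versions):
ramalama list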
run model ibm granite:
ramalama run granite
serve model:
ramalama serve gpt-oss
serve model with vulkan backend:
ramalama serve --image=quay.io/ramalama/ramalama:latest deepseek
serve model with intel-gpu backend:
ramalama serve --image=quay.io/ramalama/intel-gpu:latest deepseek
serve model with nvidia-gpu backend:
ramalama serve --image=quay.io/ramalama/cuda:latest deepseek
serve model with amd-gpu backend:
ramalama serve --image=quay.io/ramalama/rocm:latest deepseek
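note: pick the image that matches the host gpu; ramalama info prints the detected configuration (the exact output fields are version dependent), which helps decide between the variants above:
ramalama info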
serve model as daemon:
ramalama serve --port 8080 --name llamaserver -d deepseek
open the web chat ui in a browser:
http://localhost:8080
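besides the chat ui, the default llama.cpp llama-server runtime exposes an openai compatible rest api on the same port; a minimal curl example against the deepseek service above (the model name here is just the alias and is illustrative):
curl -s http://localhost:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "deepseek-r1", "messages": [{"role": "user", "content": "hello"}]}'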
show the container runtime command without executing it:
ramalama --dryrun run deepseek
stop model service:
ramalama stop llamaserver
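stop expects the container name (here the one set with --name above); to see the names of running model containers:
ramalama containers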
convert the specified model to an oci formatted ai model:
ramalama convert ollama://tinyllama:latest oci://quay.io/rhatdan/tiny:latest
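the resulting oci artifact behaves like any other model reference; for example it can be run straight from the same reference used above (assuming it is available locally or was pushed to the registry):
ramalama run oci://quay.io/rhatdan/tiny:latest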
file and directory locations
shortname file location:
/usr/share/ramalama/shortnames.conf
ramalama.conf file location:
/usr/share/ramalama/ramalama.conf
models directory location when running as a normal user:
~/.local/share/ramalama/store/
models directory location when running as root:
/var/lib/ramalama/store
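ramalama also reads a per-user config if present; a minimal sketch that pins the default container image (path and key as described in ramalama.conf(5), the chosen image value is only an example):
mkdir -p ~/.config/ramalama
cat > ~/.config/ramalama/ramalama.conf << EOF
[ramalama]
# example override: always use the stock vulkan-enabled image
image = "quay.io/ramalama/ramalama:latest"
EOF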
podman-compose
podman-compose.yml for nvidia gpu:
version: '3.8'
services:
  llama:
    container_name: llama_server
    image: quay.io/ramalama/cuda:latest
    command: [
      "llama-server",
      "--port", "8080",
      "--model", "/mnt/models/deepseek-r1",
      "--no-warmup",
      "--jinja",
      "--chat-template-file", "/mnt/models/chat_template",
      "--log-colors",
      "--alias", "deepseek-r1",
      "--ctx-size", "2048",
      "--temp", "0.8",
      "--cache-reuse", "256",
      "--flash-attn",
      "-ngl", "999",
      "--threads", "12",
      "--host", "0.0.0.0"
    ]
    ports:
      - "8080:8080"
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HOME=/tmp
    labels:
      - "ai.ramalama.model=ollama://library/deepseek-r1:latest"
      - "ai.ramalama.engine=podman"
      - "ai.ramalama.runtime=llama.cpp"
      - "ai.ramalama.port=8080"
      - "ai.ramalama.command=serve"
      - "ai.ramalama=true"
    devices:
      - "/dev/dri:/dev/dri"
      - "nvidia.com/gpu=all"
    security_opt:
      - "label=disable"
      - "no-new-privileges"
    cap_drop:
      - ALL
    init: true
    pull: newer
    runtime: /usr/bin/nvidia-container-runtime
    rm: true
    volumes:
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-e6a7edc1a4d7d9b2de136a221a57336b76316cfe53a252aeba814496c5ae439d:/mnt/models/deepseek-r1:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-f64cd5418e4b038ef90cf5fab6eb7ce6ae8f18909416822751d3b9fca827c2ab:/mnt/models/config.json:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-c5ad996bda6eed4df6e3b605a9869647624851ac248209d22fd5e2c0cc1121d3:/mnt/models/chat_template:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-17f49571369a083a6f73503f294d46b7144ebbf57d90a41925cc8de77b4b2ac6:/mnt/models/chat_template_converted:ro"
      - "/tmp:/tmp"
    detach: true
podman-compose.yml for amd gpu:
version: '3.8'
services:
  llama:
    container_name: llama_server
    image: quay.io/ramalama/rocm:latest
    command: [
      "llama-server",
      "--port", "8080",
      "--model", "/mnt/models/deepseek-r1",
      "--no-warmup",
      "--jinja",
      "--chat-template-file", "/mnt/models/chat_template",
      "--log-colors",
      "--alias", "deepseek-r1",
      "--ctx-size", "2048",
      "--temp", "0.8",
      "--cache-reuse", "256",
      "--flash-attn",
      "-ngl", "999",
      "--threads", "12",
      "--host", "0.0.0.0"
    ]
    ports:
      - "8080:8080"
    environment:
      - HIP_VISIBLE_DEVICES=0
      - HOME=/tmp
    labels:
      - "ai.ramalama.model=ollama://library/deepseek-r1:latest"
      - "ai.ramalama.engine=podman"
      - "ai.ramalama.runtime=llama.cpp"
      - "ai.ramalama.port=8080"
      - "ai.ramalama.command=serve"
      - "ai.ramalama=true"
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/kfd:/dev/kfd"
    security_opt:
      - "label=disable"
      - "no-new-privileges"
    cap_drop:
      - ALL
    init: true
    pull: newer
    rm: true
    volumes:
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-e6a7edc1a4d7d9b2de136a221a57336b76316cfe53a252aeba814496c5ae439d:/mnt/models/deepseek-r1:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-f64cd5418e4b038ef90cf5fab6eb7ce6ae8f18909416822751d3b9fca827c2ab:/mnt/models/config.json:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-c5ad996bda6eed4df6e3b605a9869647624851ac248209d22fd5e2c0cc1121d3:/mnt/models/chat_template:ro"
      - "/home/username/.local/share/ramalama/store/ollama/library/deepseek-r1/blobs/sha256-17f49571369a083a6f73503f294d46b7144ebbf57d90a41925cc8de77b4b2ac6:/mnt/models/chat_template_converted:ro"
      - "/tmp:/tmp"
    detach: true
get the mount points of the running container:
CONTAINERID=$(podman ps|grep -v CONTAINER|awk '{print $1}')
podman inspect ${CONTAINERID}|grep mount=type=bind,src=|sed 's|--mount=type=bind,src=||g'|sed 's|,destination=|:|g'|sed 's|,|:|g'|sed 's|:$||'
edit podman-compose.yml and replace the volumes entries with the output from the previous command.
deploy container using podman-compose.yml:
podman-compose -f podman-compose.yml up -d
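check that the stack came up and inspect the runtime logs (llama_server is the container_name set in the compose file above):
podman ps
podman logs llama_server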
running as systemd daemon
to run the container as a systemd daemon, create this directory:
mkdir -p ~/.config/containers/systemd/
create a kubernetes yaml from the deployed container:
CONTAINERID=$(podman ps|grep -v CONTAINER|awk '{print $1}')
podman kube generate ${CONTAINERID} -f ~/.config/containers/systemd/llamaserver.yaml
remove all running containers:
podman rm -af
create the systemd quadlet .kube file:
cat > ~/.config/containers/systemd/llamaserver.kube << EOF
[Unit]
Description=Run Kubernetes YAML with podman kube play

[Kube]
Yaml=llamaserver.yaml
EOF
reload systemd:
systemctl --user daemon-reload
start service using systemd:
systemctl --user start llamaserver.service
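check the service status and logs, and enable lingering so the user service keeps running after logout (standard systemd user-session tooling):
systemctl --user status llamaserver.service
journalctl --user -u llamaserver.service
loginctl enable-linger $USER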