Switch GPU backend from Vulkan to SYCL for ~2x inference performance on Intel GPUs
Build ggml-sycl from upstream llama.cpp (commit a5bb8ba4, matching ollama's vendored ggml) using Intel oneAPI 2025.1.1 in a multi-stage Docker build. Patched two ollama-specific API divergences via patch-sycl.py: added a batch_size parameter to graph_compute, and removed the GGML_TENSOR_FLAG_COMPUTE skip-check that caused all compute nodes to be bypassed. Tested: gemma3:1b — 27/27 layers on GPU, 10.2 tok/s gen, 65.3 tok/s prompt eval. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
+4
-3
@@ -8,8 +8,7 @@ services:
|
||||
container_name: ollama-intel-gpu
|
||||
restart: unless-stopped
|
||||
devices:
|
||||
# - /dev/dri:/dev/dri
|
||||
- /dev/dri/renderD128:/dev/dri/renderD128
|
||||
- /dev/dri:/dev/dri
|
||||
volumes:
|
||||
- /tmp/.X11-unix:/tmp/.X11-unix
|
||||
- ollama-intel-gpu:/root/.ollama
|
||||
@@ -17,7 +16,9 @@ services:
|
||||
environment:
|
||||
- DISPLAY=${DISPLAY}
|
||||
- OLLAMA_HOST=0.0.0.0
|
||||
- OLLAMA_VULKAN=1
|
||||
- OLLAMA_DEBUG=1
|
||||
- ONEAPI_DEVICE_SELECTOR=level_zero:0
|
||||
- ZES_ENABLE_SYSMAN=1
|
||||
- OLLAMA_DEFAULT_KEEPALIVE=6h
|
||||
- OLLAMA_KEEP_ALIVE=24h
|
||||
- OLLAMA_MAX_LOADED_MODELS=1
|
||||
|
||||
Reference in New Issue
Block a user