Switch GPU backend from Vulkan to SYCL for ~2x inference performance on Intel GPUs

Build ggml-sycl from upstream llama.cpp (commit a5bb8ba4, matching ollama's
vendored ggml) using Intel oneAPI 2025.1.1 in a multi-stage Docker build.
Patch two ollama-specific API divergences via patch-sycl.py: add the batch_size
parameter to graph_compute, and remove the GGML_TENSOR_FLAG_COMPUTE skip-check
that caused all compute nodes to be bypassed.

Tested: gemma3:1b — 27/27 layers on GPU, 10.2 tok/s gen, 65.3 tok/s prompt eval.
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-12 17:28:23 +00:00
parent 63c3b81292
commit c56646e7e7
4 changed files with 260 additions and 42 deletions
+72
View File
@@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""
Patch upstream ggml-sycl to match ollama's modified ggml backend API.

ollama v0.15.6 vendors ggml from llama.cpp commit a5bb8ba4 but makes two
divergences from upstream:
1. graph_compute() has an extra 'int batch_size' parameter (ollama addition)
2. GGML_TENSOR_FLAG_COMPUTE enum value is removed from ollama's ggml.h,
   so the skip-check in the compute loop must be removed entirely

Usage: patch-sycl.py <path-to-ggml-sycl-source>
Exit status: 0 on success or when the file is already patched, 1 otherwise.
"""
import re
import sys


def apply_patches(src: str) -> str:
    """Return *src* with ollama's two API divergences applied.

    The transformation is idempotent: feeding already-patched source
    back in returns it unchanged.
    """
    # 1. Fix graph_compute signature: add 'int batch_size' parameter.
    #    Upstream declares:
    #      static ggml_status ggml_backend_sycl_graph_compute(
    #          ggml_backend_t backend, ggml_cgraph * cgraph) {
    #    The pattern requires 'cgraph' to sit directly before the closing
    #    paren, so it cannot match a second time once patched.
    src = re.sub(
        r'(static\s+(?:enum\s+)?ggml_status\s+ggml_backend_sycl_graph_compute\s*\([^)]*cgraph)\s*\)',
        r'\1, int batch_size)',
        src,
    )
    # 2. Insert GGML_UNUSED(batch_size) after the opening brace so the new
    #    parameter does not trigger an unused-parameter warning. Guarded by
    #    a presence check: the regex alone would match again on a re-run
    #    and inject duplicate GGML_UNUSED lines.
    if "GGML_UNUSED(batch_size)" not in src:
        src = re.sub(
            r'(ggml_backend_sycl_graph_compute\([^)]*int\s+batch_size\)\s*\{)',
            r'\1\n    GGML_UNUSED(batch_size);',
            src,
        )
    # 3. Remove the GGML_TENSOR_FLAG_COMPUTE skip-check entirely.
    #    ollama's vendored ggml dropped this flag from the enum and never
    #    sets bit 16, so keeping the check would skip ALL compute nodes
    #    and produce garbage. Upstream code being removed:
    #        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
    #            continue;
    #        }
    src = re.sub(
        r'\s*if\s*\(\(node->flags\s*&\s*GGML_TENSOR_FLAG_COMPUTE\)\s*==\s*0\)\s*\{\s*continue;\s*\}',
        '',
        src,
    )
    return src


def verify(src: str) -> list[tuple[str, bool]]:
    """Return (description, passed) pairs confirming each patch took effect."""
    return [
        ("batch_size parameter", "int batch_size" in src),
        ("GGML_UNUSED(batch_size)", "GGML_UNUSED(batch_size)" in src),
        ("GGML_TENSOR_FLAG_COMPUTE removed", "GGML_TENSOR_FLAG_COMPUTE" not in src),
    ]


def main(argv: list[str]) -> int:
    """Patch the file named in argv[1] in place; return the exit status."""
    if len(argv) != 2:
        print(f"usage: {argv[0]} <ggml-sycl-source-file>", file=sys.stderr)
        return 1
    path = argv[1]
    # C++ source; decode explicitly rather than with the platform default.
    with open(path, "r", encoding="utf-8") as f:
        original = f.read()

    src = apply_patches(original)
    checks = verify(src)

    if src == original:
        if all(ok for _, ok in checks):
            # Re-running on an already-patched tree is not an error.
            print(f"{path} already patched; nothing to do")
            return 0
        print(f"WARNING: No changes made to {path}", file=sys.stderr)
        return 1

    with open(path, "w", encoding="utf-8") as f:
        f.write(src)

    # Report which of the patches applied.
    for name, ok in checks:
        status = "OK" if ok else "FAILED"
        print(f"  [{status}] {name}")
    if all(ok for _, ok in checks):
        print(f"Patched {path} successfully")
        return 0
    print(f"ERROR: Some patches failed on {path}", file=sys.stderr)
    return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv))