Switch GPU backend from Vulkan to SYCL for ~2x inference performance on Intel GPUs
Build ggml-sycl from upstream llama.cpp (commit a5bb8ba4, matching ollama's vendored ggml) using Intel oneAPI 2025.1.1 in a multi-stage Docker build.

Patch two ollama-specific API divergences via patch-sycl.py: add the batch_size parameter that ollama introduced on graph_compute, and remove the GGML_TENSOR_FLAG_COMPUTE skip-check, which would otherwise cause all compute nodes to be bypassed.

Tested with gemma3:1b: 27/27 layers on GPU, 10.2 tok/s generation, 65.3 tok/s prompt eval.

Co-authored-by: Cursor <cursoragent@cursor.com>
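
The script below is run against the upstream SYCL backend source before compilation; a hypothetical invocation from the builder stage (the in-tree path is assumed, not recorded in this commit):

    python3 patch-sycl.py llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp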
@@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""
Patch upstream ggml-sycl to match ollama's modified ggml backend API.

ollama v0.15.6 vendors ggml from llama.cpp commit a5bb8ba4 but makes two
divergences from upstream:

1. graph_compute() has an extra 'int batch_size' parameter (ollama addition)
2. GGML_TENSOR_FLAG_COMPUTE enum value is removed from ollama's ggml.h,
   so the skip-check in the compute loop must be removed entirely
"""

import re
import sys

path = sys.argv[1]
with open(path, "r") as f:
    src = f.read()

original = src

# 1. Fix graph_compute signature: add 'int batch_size' parameter
# The function is defined as:
# static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
src = re.sub(
    r'(static\s+(?:enum\s+)?ggml_status\s+ggml_backend_sycl_graph_compute\s*\([^)]*cgraph)\s*\)',
    r'\1, int batch_size)',
    src,
)
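
# After this substitution, the patched definition should read (sketch, derived
# from the upstream signature in the comment above):
#   static ggml_status ggml_backend_sycl_graph_compute(
#       ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {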

# 2. Add GGML_UNUSED(batch_size) inside the function body (after the opening brace)
src = re.sub(
    r'(ggml_backend_sycl_graph_compute\([^)]*int\s+batch_size\)\s*\{)',
    r'\1\n GGML_UNUSED(batch_size);',
    src,
)
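
# GGML_UNUSED expands to a (void) cast in ggml's headers; without it the
# now-unused parameter could trip -Wunused-parameter on strict builds.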

# 3. Remove GGML_TENSOR_FLAG_COMPUTE skip-check entirely.
# In ollama's vendored ggml, this flag doesn't exist (removed from the enum).
# Since ollama never sets bit 16, ALL nodes would be skipped, producing garbage.
# The actual code looks like:
#     if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
#         continue;
#     }
src = re.sub(
    r'\s*if\s*\(\(node->flags\s*&\s*GGML_TENSOR_FLAG_COMPUTE\)\s*==\s*0\)\s*\{\s*continue;\s*\}',
    '',
    src,
)
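
# With the check gone, the compute loop falls through and every graph node
# is executed unconditionally, which is what ollama's vendored ggml expects.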

if src == original:
    print(f"WARNING: No changes made to {path}", file=sys.stderr)
    sys.exit(1)

with open(path, "w") as f:
    f.write(src)

# Verify patches applied
checks = [
    ("batch_size parameter", "int batch_size" in src),
    ("GGML_UNUSED(batch_size)", "GGML_UNUSED(batch_size)" in src),
    ("GGML_TENSOR_FLAG_COMPUTE removed", "GGML_TENSOR_FLAG_COMPUTE" not in src),
]
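# (substring checks are deliberately loose: they confirm the regexes fired,
# not that the patched C++ actually compiles)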
for name, ok in checks:
    status = "OK" if ok else "FAILED"
    print(f" [{status}] {name}")

if all(ok for _, ok in checks):
    print(f"Patched {path} successfully")
else:
    print(f"ERROR: Some patches failed on {path}", file=sys.stderr)
    sys.exit(1)
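
# Note: a non-zero exit here fails the enclosing RUN step of the Docker
# build, so a partially applied patch cannot end up in the final image.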