Build ggml-sycl from upstream llama.cpp (commit a5bb8ba4, matching ollama's vendored ggml) using Intel oneAPI 2025.1.1 in a multi-stage Docker build. Patch two ollama-specific API divergences via patch-sycl.py: added batch_size parameter to graph_compute, removed GGML_TENSOR_FLAG_COMPUTE skip-check that caused all compute nodes to be bypassed. Tested: gemma3:1b — 27/27 layers on GPU, 10.2 tok/s gen, 65.3 tok/s prompt eval. Co-authored-by: Cursor <cursoragent@cursor.com>
73 lines
2.2 KiB
Python
73 lines
2.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Patch upstream ggml-sycl to match ollama's modified ggml backend API.
|
|
|
|
ollama v0.15.6 vendors ggml from llama.cpp commit a5bb8ba4 but makes two
|
|
divergences from upstream:
|
|
|
|
1. graph_compute() has an extra 'int batch_size' parameter (ollama addition)
|
|
2. GGML_TENSOR_FLAG_COMPUTE enum value is removed from ollama's ggml.h,
|
|
so the skip-check in the compute loop must be removed entirely
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
|
|
path = sys.argv[1]
|
|
with open(path, "r") as f:
|
|
src = f.read()
|
|
|
|
original = src
|
|
|
|
# 1. Fix graph_compute signature: add 'int batch_size' parameter
|
|
# The function is defined as:
|
|
# static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
src = re.sub(
|
|
r'(static\s+(?:enum\s+)?ggml_status\s+ggml_backend_sycl_graph_compute\s*\([^)]*cgraph)\s*\)',
|
|
r'\1, int batch_size)',
|
|
src,
|
|
)
|
|
|
|
# 2. Add GGML_UNUSED(batch_size) inside the function body (after the opening brace)
|
|
src = re.sub(
|
|
r'(ggml_backend_sycl_graph_compute\([^)]*int\s+batch_size\)\s*\{)',
|
|
r'\1\n GGML_UNUSED(batch_size);',
|
|
src,
|
|
)
|
|
|
|
# 3. Remove GGML_TENSOR_FLAG_COMPUTE skip-check entirely.
|
|
# In ollama's vendored ggml, this flag doesn't exist (removed from the enum).
|
|
# Since ollama never sets bit 16, ALL nodes would be skipped, producing garbage.
|
|
# The actual code looks like:
|
|
# if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
|
# continue;
|
|
# }
|
|
src = re.sub(
|
|
r'\s*if\s*\(\(node->flags\s*&\s*GGML_TENSOR_FLAG_COMPUTE\)\s*==\s*0\)\s*\{\s*continue;\s*\}',
|
|
'',
|
|
src,
|
|
)
|
|
|
|
if src == original:
|
|
print(f"WARNING: No changes made to {path}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
with open(path, "w") as f:
|
|
f.write(src)
|
|
|
|
# Verify patches applied
|
|
checks = [
|
|
("batch_size parameter", "int batch_size" in src),
|
|
("GGML_UNUSED(batch_size)", "GGML_UNUSED(batch_size)" in src),
|
|
("GGML_TENSOR_FLAG_COMPUTE removed", "GGML_TENSOR_FLAG_COMPUTE" not in src),
|
|
]
|
|
for name, ok in checks:
|
|
status = "OK" if ok else "FAILED"
|
|
print(f" [{status}] {name}")
|
|
|
|
if all(ok for _, ok in checks):
|
|
print(f"Patched {path} successfully")
|
|
else:
|
|
print(f"ERROR: Some patches failed on {path}", file=sys.stderr)
|
|
sys.exit(1)
|