Switch GPU backend from Vulkan to SYCL for ~2x inference performance on Intel GPUs
Build ggml-sycl from upstream llama.cpp (commit a5bb8ba4, matching ollama's vendored ggml) using Intel oneAPI 2025.1.1 in a multi-stage Docker build.

Patch two ollama-specific API divergences via patch-sycl.py: add the batch_size parameter that ollama introduced on graph_compute, and remove the GGML_TENSOR_FLAG_COMPUTE skip-check, which would otherwise cause all compute nodes to be bypassed.

Tested with gemma3:1b: 27/27 layers on GPU, 10.2 tok/s generation, 65.3 tok/s prompt eval.

Co-authored-by: Cursor <cursoragent@cursor.com>
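
The script below is run against the upstream SYCL backend source before compilation; a hypothetical invocation from the builder stage (the in-tree path is assumed, not recorded in this commit):

    python3 patch-sycl.py llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp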
@@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""
Patch upstream ggml-sycl to match ollama's modified ggml backend API.

ollama v0.15.6 vendors ggml from llama.cpp commit a5bb8ba4 but makes two
divergences from upstream:

1. graph_compute() has an extra 'int batch_size' parameter (ollama addition)
2. GGML_TENSOR_FLAG_COMPUTE enum value is removed from ollama's ggml.h,
   so the skip-check in the compute loop must be removed entirely
"""

import re
import sys

path = sys.argv[1]
with open(path, "r") as f:
    src = f.read()

original = src

# 1. Fix graph_compute signature: add 'int batch_size' parameter
# The function is defined as:
# static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
src = re.sub(
    r'(static\s+(?:enum\s+)?ggml_status\s+ggml_backend_sycl_graph_compute\s*\([^)]*cgraph)\s*\)',
    r'\1, int batch_size)',
    src,
)
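
# After this substitution, the patched definition should read (sketch, derived
# from the upstream signature in the comment above):
#   static ggml_status ggml_backend_sycl_graph_compute(
#       ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {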

# 2. Add GGML_UNUSED(batch_size) inside the function body (after the opening brace)
src = re.sub(
    r'(ggml_backend_sycl_graph_compute\([^)]*int\s+batch_size\)\s*\{)',
    r'\1\n GGML_UNUSED(batch_size);',
    src,
)
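
# GGML_UNUSED expands to a (void) cast in ggml's headers; without it the
# now-unused parameter could trip -Wunused-parameter on strict builds.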

# 3. Remove GGML_TENSOR_FLAG_COMPUTE skip-check entirely.
# In ollama's vendored ggml, this flag doesn't exist (removed from the enum).
# Since ollama never sets bit 16, ALL nodes would be skipped, producing garbage.
# The actual code looks like:
#     if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
#         continue;
#     }
src = re.sub(
    r'\s*if\s*\(\(node->flags\s*&\s*GGML_TENSOR_FLAG_COMPUTE\)\s*==\s*0\)\s*\{\s*continue;\s*\}',
    '',
    src,
)
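
# With the check gone, the compute loop falls through and every graph node
# is executed unconditionally, which is what ollama's vendored ggml expects.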

if src == original:
    print(f"WARNING: No changes made to {path}", file=sys.stderr)
    sys.exit(1)

with open(path, "w") as f:
    f.write(src)

# Verify patches applied
checks = [
    ("batch_size parameter", "int batch_size" in src),
    ("GGML_UNUSED(batch_size)", "GGML_UNUSED(batch_size)" in src),
    ("GGML_TENSOR_FLAG_COMPUTE removed", "GGML_TENSOR_FLAG_COMPUTE" not in src),
]
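# (substring checks are deliberately loose: they confirm the regexes fired,
# not that the patched C++ actually compiles)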
for name, ok in checks:
    status = "OK" if ok else "FAILED"
    print(f" [{status}] {name}")

if all(ok for _, ok in checks):
    print(f"Patched {path} successfully")
else:
    print(f"ERROR: Some patches failed on {path}", file=sys.stderr)
    sys.exit(1)
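
# Note: a non-zero exit here fails the enclosing RUN step of the Docker
# build, so a partially applied patch cannot end up in the final image.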