From 5bf170cedb4472f1257c3dabccfec9962990ba39 Mon Sep 17 00:00:00 2001
From: Chili Palmer <hugoms@me.com>
Date: Tue, 17 Mar 2026 09:20:35 +0100
Subject: [PATCH] removed kv quantization due to incompatibility with gemma3

---
 mlx_server/engine.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/mlx_server/engine.py b/mlx_server/engine.py
index 7e1cf9a..8187eeb 100644
--- a/mlx_server/engine.py
+++ b/mlx_server/engine.py
@@ -434,11 +434,9 @@ class InferenceEngine:
     # Generation
     # ------------------------------------------------------------------
 
-    # Common kwargs for mlx_vlm generate calls — optimized for Apple Silicon
-    _GENERATE_KWARGS = {
-        "kv_bits": 8,           # Quantize KV cache to 8-bit (halves memory bandwidth)
-        "kv_group_size": 64,    # Group size for KV quantization
-    }
+    # Common kwargs for mlx_vlm generate calls
+    # Note: empty because kv_bits/kv_group_size are unsupported with Gemma 3's RotatingKVCache
+    _GENERATE_KWARGS: dict = {}
 
     def generate(
         self,