Remove KV cache quantization due to incompatibility with Gemma 3

2026-03-17 09:20:35 +01:00
parent df81afe8d7
commit 5bf170cedb
1 changed files with 3 additions and 5 deletions
--- a/mlx_server/engine.py
+++ b/mlx_server/engine.py
@@ -434,11 +434,9 @@ class InferenceEngine:
    # Generation
    # ------------------------------------------------------------------

-    # Common kwargs for mlx_vlm generate calls — optimized for Apple Silicon
-    _GENERATE_KWARGS = {
-        "kv_bits": 8,           # Quantize KV cache to 8-bit (halves memory bandwidth)
-        "kv_group_size": 64,    # Group size for KV quantization
-    }
+    # Common kwargs for mlx_vlm generate calls
+    # Note: KV cache quantization is not supported with Gemma 3's RotatingKVCache
+    _GENERATE_KWARGS: dict = {}

    def generate(
        self,