From 5bf170cedb4472f1257c3dabccfec9962990ba39 Mon Sep 17 00:00:00 2001
From: Chili Palmer
Date: Tue, 17 Mar 2026 09:20:35 +0100
Subject: [PATCH] removed kv quantization due to incompatibility with gemma3

---
 mlx_server/engine.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/mlx_server/engine.py b/mlx_server/engine.py
index 7e1cf9a..8187eeb 100644
--- a/mlx_server/engine.py
+++ b/mlx_server/engine.py
@@ -434,11 +434,9 @@ class InferenceEngine:
     # Generation
     # ------------------------------------------------------------------
 
-    # Common kwargs for mlx_vlm generate calls — optimized for Apple Silicon
-    _GENERATE_KWARGS = {
-        "kv_bits": 8, # Quantize KV cache to 8-bit (halves memory bandwidth)
-        "kv_group_size": 64, # Group size for KV quantization
-    }
+    # Common kwargs for mlx_vlm generate calls
+    # Note: KV cache quantization is not supported with Gemma 3's RotatingKVCache
+    _GENERATE_KWARGS: dict = {}
 
     def generate(
         self,