Remove KV cache quantization due to incompatibility with Gemma 3

This commit is contained in:
2026-03-17 09:20:35 +01:00
parent df81afe8d7
commit 5bf170cedb

View File

@@ -434,11 +434,9 @@ class InferenceEngine:
# Generation # Generation
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Common kwargs for mlx_vlm generate calls — optimized for Apple Silicon # Common kwargs for mlx_vlm generate calls
_GENERATE_KWARGS = { # Note: KV cache quantization is not supported with Gemma 3's RotatingKVCache
"kv_bits": 8, # Quantize KV cache to 8-bit (halves memory bandwidth) _GENERATE_KWARGS: dict = {}
"kv_group_size": 64, # Group size for KV quantization
}
def generate( def generate(
self, self,