Removed KV cache quantization due to incompatibility with Gemma 3's RotatingKVCache
This commit is contained in:
@@ -434,11 +434,9 @@ class InferenceEngine:
|
|||||||
# Generation
|
# Generation
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
# Common kwargs for mlx_vlm generate calls — optimized for Apple Silicon
|
# Common kwargs for mlx_vlm generate calls
|
||||||
_GENERATE_KWARGS = {
|
# Note: KV cache quantization is not supported with Gemma 3's RotatingKVCache
|
||||||
"kv_bits": 8, # Quantize KV cache to 8-bit (halves memory bandwidth)
|
_GENERATE_KWARGS: dict = {}
|
||||||
"kv_group_size": 64, # Group size for KV quantization
|
|
||||||
}
|
|
||||||
|
|
||||||
def generate(
|
def generate(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user