From c80fe97f41efa02a8d742f32d18ac425b6de6414 Mon Sep 17 00:00:00 2001 From: Chili Palmer Date: Tue, 17 Mar 2026 13:24:43 +0100 Subject: [PATCH] feat: added gemma 3n E4B as another model for fast response --- CLAUDE.md | 5 +++-- README.md | 1 + mlx_server/engine.py | 14 ++++++++++++++ run.sh | 7 +++++-- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 245d90a..ea625ee 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,8 +29,9 @@ python -m mlx_server.main --model mlx-community/Qwen3-VL-4B-Instruct-4bit --port | Alias | HuggingFace ID | Notes | |-------|---------------|-------| -| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | Vision + tool use via `tool_code` blocks | -| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | Vision + tool use via `<tool_call>` tags | +| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | Vision + tool use via `tool_code` blocks (128k context) | +| `gemma3n` | `mlx-community/gemma-3n-E4B-it-4bit` | Vision/audio/video + tool use via `tool_code` blocks (32k context, ~1.5x faster) | +| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | Vision + tool use via `<tool_call>` tags (256k context) | ## Key Design Decisions diff --git a/README.md b/README.md index de13332..4586f81 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ OpenAI-compatible API server for running local LLMs on Apple Silicon via [MLX](h | Alias | Model | Context | Capabilities | |-------|-------|---------|-------------| | `gemma` | `mlx-community/gemma-3-4b-it-4bit` | 128k | Vision, tool use (`tool_code` blocks) | +| `gemma3n` | `mlx-community/gemma-3n-E4B-it-4bit` | 32k | Vision/audio/video, tool use (`tool_code` blocks), ~1.5x faster | | `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | 256k | Vision, tool use (`<tool_call>` tags) | ## Quick Start diff --git a/mlx_server/engine.py b/mlx_server/engine.py index 2048298..c164c51 100644 --- a/mlx_server/engine.py +++ b/mlx_server/engine.py @@ -23,9 +23,16 @@ DEFAULT_MODEL = "mlx-community/gemma-3-4b-it-4bit" # Known 
model aliases for quick selection MODEL_ALIASES: dict[str, str] = { "gemma": "mlx-community/gemma-3-4b-it-4bit", + "gemma3n": "mlx-community/gemma-3n-E4B-it-4bit", "qwen": "mlx-community/Qwen3-VL-4B-Instruct-4bit", } +# Fallback context lengths for models whose config doesn't expose +# max_position_embeddings (e.g. gemma3n uses a MatFormer architecture). +_CONTEXT_LENGTH_OVERRIDES: dict[str, int] = { + "gemma3n": 32768, +} + def _resolve_local_model_path(repo_id: str) -> Path | None: """If a HuggingFace model is already cached locally, return its snapshot path. @@ -300,6 +307,9 @@ class InferenceEngine: """Max context length from the model config.""" if self.config is None: return 0 + # Some architectures don't expose max_position_embeddings in config + if self._model_type in _CONTEXT_LENGTH_OVERRIDES: + return _CONTEXT_LENGTH_OVERRIDES[self._model_type] # VLMs nest the LLM config under text_config text_cfg = getattr(self.config, "text_config", self.config) return getattr(text_cfg, "max_position_embeddings", 0) @@ -1095,6 +1105,10 @@ class ModelManager: return None try: config = json.loads(config_file.read_text()) + model_type = config.get("model_type", "") + # Check override table for models that don't expose it in config + if model_type in _CONTEXT_LENGTH_OVERRIDES: + return _CONTEXT_LENGTH_OVERRIDES[model_type] # VLMs nest under text_config text_cfg = config.get("text_config", config) return text_cfg.get("max_position_embeddings") diff --git a/run.sh b/run.sh index ab197c1..9567dfe 100755 --- a/run.sh +++ b/run.sh @@ -8,7 +8,7 @@ cd "$SCRIPT_DIR" source .venv/bin/activate # --- Model selection --- -# Usage: ./run.sh [gemma|qwen] +# Usage: ./run.sh [gemma|gemma3n|qwen] # Or set MODEL env var directly for a custom model. 
MODEL_CHOICE="${1:-gemma}" @@ -18,12 +18,15 @@ if [[ -z "${MODEL:-}" ]]; then gemma) MODEL="mlx-community/gemma-3-4b-it-4bit" ;; + gemma3n) + MODEL="mlx-community/gemma-3n-E4B-it-4bit" + ;; qwen) MODEL="mlx-community/Qwen3-VL-4B-Instruct-4bit" ;; *) echo "Unknown model choice: $MODEL_CHOICE" - echo "Usage: $0 [gemma|qwen]" + echo "Usage: $0 [gemma|gemma3n|qwen]" exit 1 ;; esac