From c80fe97f41efa02a8d742f32d18ac425b6de6414 Mon Sep 17 00:00:00 2001
From: Chili Palmer <hugoms@me.com>
Date: Tue, 17 Mar 2026 13:24:43 +0100
Subject: [PATCH] feat: add Gemma 3n E4B as an additional model for fast responses

---
 CLAUDE.md            |  5 +++--
 README.md            |  1 +
 mlx_server/engine.py | 14 ++++++++++++++
 run.sh               |  7 +++++--
 4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 245d90a..ea625ee 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -29,8 +29,9 @@ python -m mlx_server.main --model mlx-community/Qwen3-VL-4B-Instruct-4bit --port
 
 | Alias | HuggingFace ID | Notes |
 |-------|---------------|-------|
-| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | Vision + tool use via `tool_code` blocks |
-| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | Vision + tool use via `<tool_call>` tags |
+| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | Vision + tool use via `tool_code` blocks (128k context) |
+| `gemma3n` | `mlx-community/gemma-3n-E4B-it-4bit` | Vision/audio/video + tool use via `tool_code` blocks (32k context, ~1.5x faster) |
+| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | Vision + tool use via `<tool_call>` tags (256k context) |
 
 ## Key Design Decisions
 
diff --git a/README.md b/README.md
index de13332..4586f81 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ OpenAI-compatible API server for running local LLMs on Apple Silicon via [MLX](h
 | Alias | Model | Context | Capabilities |
 |-------|-------|---------|-------------|
 | `gemma` | `mlx-community/gemma-3-4b-it-4bit` | 128k | Vision, tool use (`tool_code` blocks) |
+| `gemma3n` | `mlx-community/gemma-3n-E4B-it-4bit` | 32k | Vision/audio/video, tool use (`tool_code` blocks), ~1.5x faster |
 | `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | 256k | Vision, tool use (`<tool_call>` tags) |
 
 ## Quick Start
diff --git a/mlx_server/engine.py b/mlx_server/engine.py
index 2048298..c164c51 100644
--- a/mlx_server/engine.py
+++ b/mlx_server/engine.py
@@ -23,9 +23,16 @@ DEFAULT_MODEL = "mlx-community/gemma-3-4b-it-4bit"
 # Known model aliases for quick selection
 MODEL_ALIASES: dict[str, str] = {
     "gemma": "mlx-community/gemma-3-4b-it-4bit",
+    "gemma3n": "mlx-community/gemma-3n-E4B-it-4bit",
     "qwen": "mlx-community/Qwen3-VL-4B-Instruct-4bit",
 }
 
+# Context lengths keyed by config model_type (not alias), consulted before
+# max_position_embeddings, which some configs omit (e.g. MatFormer gemma3n).
+_CONTEXT_LENGTH_OVERRIDES: dict[str, int] = {
+    "gemma3n": 32768,
+}
+
 
 def _resolve_local_model_path(repo_id: str) -> Path | None:
     """If a HuggingFace model is already cached locally, return its snapshot path.
@@ -300,6 +307,9 @@ class InferenceEngine:
         """Max context length from the model config."""
         if self.config is None:
             return 0
+        # Some architectures don't expose max_position_embeddings in config
+        if self._model_type in _CONTEXT_LENGTH_OVERRIDES:
+            return _CONTEXT_LENGTH_OVERRIDES[self._model_type]
         # VLMs nest the LLM config under text_config
         text_cfg = getattr(self.config, "text_config", self.config)
         return getattr(text_cfg, "max_position_embeddings", 0)
@@ -1095,6 +1105,10 @@ class ModelManager:
             return None
         try:
             config = json.loads(config_file.read_text())
+            model_type = config.get("model_type", "")
+            # Overrides cover configs lacking max_position_embeddings
+            if model_type in _CONTEXT_LENGTH_OVERRIDES:
+                return _CONTEXT_LENGTH_OVERRIDES[model_type]
             # VLMs nest under text_config
             text_cfg = config.get("text_config", config)
             return text_cfg.get("max_position_embeddings")
diff --git a/run.sh b/run.sh
index ab197c1..9567dfe 100755
--- a/run.sh
+++ b/run.sh
@@ -8,7 +8,7 @@ cd "$SCRIPT_DIR"
 source .venv/bin/activate
 
 # --- Model selection ---
-# Usage:  ./run.sh [gemma|qwen]
+# Usage:  ./run.sh [gemma|gemma3n|qwen]
 # Or set MODEL env var directly for a custom model.
 
 MODEL_CHOICE="${1:-gemma}"
@@ -18,12 +18,15 @@ if [[ -z "${MODEL:-}" ]]; then
         gemma)
             MODEL="mlx-community/gemma-3-4b-it-4bit"
             ;;
+        gemma3n)
+            MODEL="mlx-community/gemma-3n-E4B-it-4bit"
+            ;;
         qwen)
             MODEL="mlx-community/Qwen3-VL-4B-Instruct-4bit"
             ;;
         *)
             echo "Unknown model choice: $MODEL_CHOICE"
-            echo "Usage: $0 [gemma|qwen]"
+            echo "Usage: $0 [gemma|gemma3n|qwen]"
             exit 1
             ;;
     esac