feat: add Gemma 3n E4B as an additional model option for faster responses
This commit is contained in:
@@ -29,8 +29,9 @@ python -m mlx_server.main --model mlx-community/Qwen3-VL-4B-Instruct-4bit --port
|
|||||||
|
|
||||||
| Alias | HuggingFace ID | Notes |
|
| Alias | HuggingFace ID | Notes |
|
||||||
|-------|---------------|-------|
|
|-------|---------------|-------|
|
||||||
| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | Vision + tool use via `tool_code` blocks |
|
| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | Vision + tool use via `tool_code` blocks (128k context) |
|
||||||
| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | Vision + tool use via `<tool_call>` tags |
|
| `gemma3n` | `mlx-community/gemma-3n-E4B-it-4bit` | Vision/audio/video + tool use via `tool_code` blocks (32k context, ~1.5x faster) |
|
||||||
|
| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | Vision + tool use via `<tool_call>` tags (256k context) |
|
||||||
|
|
||||||
## Key Design Decisions
|
## Key Design Decisions
|
||||||
|
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ OpenAI-compatible API server for running local LLMs on Apple Silicon via [MLX](h
|
|||||||
| Alias | Model | Context | Capabilities |
|
| Alias | Model | Context | Capabilities |
|
||||||
|-------|-------|---------|-------------|
|
|-------|-------|---------|-------------|
|
||||||
| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | 128k | Vision, tool use (`tool_code` blocks) |
|
| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | 128k | Vision, tool use (`tool_code` blocks) |
|
||||||
|
| `gemma3n` | `mlx-community/gemma-3n-E4B-it-4bit` | 32k | Vision/audio/video, tool use (`tool_code` blocks), ~1.5x faster |
|
||||||
| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | 256k | Vision, tool use (`<tool_call>` tags) |
|
| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | 256k | Vision, tool use (`<tool_call>` tags) |
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|||||||
@@ -23,9 +23,16 @@ DEFAULT_MODEL = "mlx-community/gemma-3-4b-it-4bit"
|
|||||||
# Known model aliases for quick selection
|
# Known model aliases for quick selection
|
||||||
MODEL_ALIASES: dict[str, str] = {
|
MODEL_ALIASES: dict[str, str] = {
|
||||||
"gemma": "mlx-community/gemma-3-4b-it-4bit",
|
"gemma": "mlx-community/gemma-3-4b-it-4bit",
|
||||||
|
"gemma3n": "mlx-community/gemma-3n-E4B-it-4bit",
|
||||||
"qwen": "mlx-community/Qwen3-VL-4B-Instruct-4bit",
|
"qwen": "mlx-community/Qwen3-VL-4B-Instruct-4bit",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Context-length overrides (checked before the config) for models whose config doesn't expose
|
||||||
|
# max_position_embeddings (e.g. gemma3n uses a MatFormer architecture).
|
||||||
|
_CONTEXT_LENGTH_OVERRIDES: dict[str, int] = {
|
||||||
|
"gemma3n": 32768,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _resolve_local_model_path(repo_id: str) -> Path | None:
|
def _resolve_local_model_path(repo_id: str) -> Path | None:
|
||||||
"""If a HuggingFace model is already cached locally, return its snapshot path.
|
"""If a HuggingFace model is already cached locally, return its snapshot path.
|
||||||
@@ -300,6 +307,9 @@ class InferenceEngine:
|
|||||||
"""Max context length from the model config."""
|
"""Max context length from the model config."""
|
||||||
if self.config is None:
|
if self.config is None:
|
||||||
return 0
|
return 0
|
||||||
|
# Some architectures don't expose max_position_embeddings in config
|
||||||
|
if self._model_type in _CONTEXT_LENGTH_OVERRIDES:
|
||||||
|
return _CONTEXT_LENGTH_OVERRIDES[self._model_type]
|
||||||
# VLMs nest the LLM config under text_config
|
# VLMs nest the LLM config under text_config
|
||||||
text_cfg = getattr(self.config, "text_config", self.config)
|
text_cfg = getattr(self.config, "text_config", self.config)
|
||||||
return getattr(text_cfg, "max_position_embeddings", 0)
|
return getattr(text_cfg, "max_position_embeddings", 0)
|
||||||
@@ -1095,6 +1105,10 @@ class ModelManager:
|
|||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
config = json.loads(config_file.read_text())
|
config = json.loads(config_file.read_text())
|
||||||
|
model_type = config.get("model_type", "")
|
||||||
|
# Check the override table first, for models whose config doesn't expose max_position_embeddings
|
||||||
|
if model_type in _CONTEXT_LENGTH_OVERRIDES:
|
||||||
|
return _CONTEXT_LENGTH_OVERRIDES[model_type]
|
||||||
# VLMs nest under text_config
|
# VLMs nest under text_config
|
||||||
text_cfg = config.get("text_config", config)
|
text_cfg = config.get("text_config", config)
|
||||||
return text_cfg.get("max_position_embeddings")
|
return text_cfg.get("max_position_embeddings")
|
||||||
|
|||||||
7
run.sh
7
run.sh
@@ -8,7 +8,7 @@ cd "$SCRIPT_DIR"
|
|||||||
source .venv/bin/activate
|
source .venv/bin/activate
|
||||||
|
|
||||||
# --- Model selection ---
|
# --- Model selection ---
|
||||||
# Usage: ./run.sh [gemma|qwen]
|
# Usage: ./run.sh [gemma|gemma3n|qwen]
|
||||||
# Or set MODEL env var directly for a custom model.
|
# Or set MODEL env var directly for a custom model.
|
||||||
|
|
||||||
MODEL_CHOICE="${1:-gemma}"
|
MODEL_CHOICE="${1:-gemma}"
|
||||||
@@ -18,12 +18,15 @@ if [[ -z "${MODEL:-}" ]]; then
|
|||||||
gemma)
|
gemma)
|
||||||
MODEL="mlx-community/gemma-3-4b-it-4bit"
|
MODEL="mlx-community/gemma-3-4b-it-4bit"
|
||||||
;;
|
;;
|
||||||
|
gemma3n)
|
||||||
|
MODEL="mlx-community/gemma-3n-E4B-it-4bit"
|
||||||
|
;;
|
||||||
qwen)
|
qwen)
|
||||||
MODEL="mlx-community/Qwen3-VL-4B-Instruct-4bit"
|
MODEL="mlx-community/Qwen3-VL-4B-Instruct-4bit"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unknown model choice: $MODEL_CHOICE"
|
echo "Unknown model choice: $MODEL_CHOICE"
|
||||||
echo "Usage: $0 [gemma|qwen]"
|
echo "Usage: $0 [gemma|gemma3n|qwen]"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|||||||
Reference in New Issue
Block a user