initial commit

2026-03-17 09:14:27 +01:00
commit df81afe8d7
10 changed files with 1389 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,10 @@
 __pycache__/
 *.py[cod]
 *$py.class
 *.egg-info/
 dist/
 build/
 .venv/
 .env
 *.log
 .DS_Store
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -0,0 +1,38 @@
 # MLX Server
 OpenAI-compatible API server for Gemma 3 4B (vision + tool use) on Apple Silicon via MLX.
 ## Quick Start
 ```bash
 # Activate virtual environment
 source .venv/bin/activate
 # Run the server (downloads model on first run)
 ./run.sh
 # Or directly:
 python -m mlx_server.main --model mlx-community/gemma-3-4b-it-4bit --port 1234
 ```
 ## Project Structure
 - `mlx_server/main.py` — FastAPI server, endpoints, CLI entrypoint
 - `mlx_server/engine.py` — Model loading, prompt building, generation (mlx_vlm)
 - `mlx_server/models.py` — Pydantic models for OpenAI API request/response types
 ## Key Design Decisions
 - Uses `mlx_vlm` (not `mlx_lm`) as the inference backend — this supports both text and vision in a single model load
 - Gemma 3 has no system role — system messages are converted to user/assistant pairs
 - Tool use is prompt-engineered: tools are injected into the system prompt with `<tool_call>` XML tags, and parsed from model output
 - Thread lock on generation (single-request-at-a-time) — MLX models aren't safe for concurrent generation
 - 128k context window supported via the model's native capabilities
 ## Dependencies
 Managed via `uv` and `pyproject.toml`. Virtual environment in `.venv/`.
 ```bash
 uv pip install -e "."
 ```
--- a/mlx_server/init.py
+++ b/mlx_server/init.py
--- a/mlx_server/main.py
+++ b/mlx_server/main.py
@@ -0,0 +1,3 @@
 from mlx_server.main import main
 main()
--- a/mlx_server/engine.py
+++ b/mlx_server/engine.py
@@ -0,0 +1,576 @@
 """Model loading and inference engine using mlx_vlm (supports both text and vision)."""
 from __future__ import annotations
 import base64
 import io
 import json
 import logging
 import re
 import tempfile
 import threading
 from collections.abc import Generator
 from pathlib import Path
 import mlx.core as mx
 import mlx_vlm
 from PIL import Image
 logger = logging.getLogger(__name__)
 DEFAULT_MODEL = "mlx-community/gemma-3-4b-it-4bit"
 # ------------------------------------------------------------------
 # Helpers for Gemma 3 tool_code format
 # ------------------------------------------------------------------
 _JSON_TO_PYTHON_TYPE = {
    "string": "str",
    "integer": "int",
    "number": "float",
    "boolean": "bool",
    "array": "list",
    "object": "dict",
 }
 _JSON_TYPE_DEFAULTS = {
    "string": '""',
    "integer": "0",
    "number": "0.0",
    "boolean": "False",
    "array": "[]",
    "object": "{}",
 }
 def _json_type_to_python(json_type: str) -> str:
    return _JSON_TO_PYTHON_TYPE.get(json_type, "str")
 def _json_type_default(json_type: str) -> str:
    return _JSON_TYPE_DEFAULTS.get(json_type, "None")
 def _python_repr(value) -> str:
    """Produce a Python-repr-style string for a value."""
    if isinstance(value, str):
        return repr(value)
    if isinstance(value, bool):
        return "True" if value else "False"
    if isinstance(value, (int, float)):
        return str(value)
    return repr(value)
 def _parse_python_call(call_str: str, tool_defs: dict[str, dict] | None = None) -> tuple[str, dict]:
    """Parse a function call string into (name, args_dict).
    Handles multiple formats:
    1. Python-style: func_name(arg1="value1", arg2=42)
    2. Shell-style:  func_name arg1 arg2  (common with small LLMs)
    3. Mixed:        func_name("value")   (positional args)
    tool_defs maps function names to their parameter schemas, used to
    infer which parameter a positional/shell-style argument maps to.
    """
    import ast
    call_str = call_str.strip()
    # Try Python-style: function_name(...)
    m = re.match(r"(\w+)\s*\((.*)\)\s*$", call_str, re.DOTALL)
    if m:
        name = m.group(1)
        args_str = m.group(2).strip()
        if not args_str:
            return name, {}
        # Try parsing as a Python function call via dict()
        try:
            tree = ast.parse(f"dict({args_str})", mode="eval")
            call_node = tree.body
            args = {}
            # Handle keyword arguments: func(key="val")
            for kw in call_node.keywords:
                args[kw.arg] = ast.literal_eval(kw.value)
            # Handle positional arguments: func("val1", "val2")
            if call_node.args and not args:
                param_names = _get_param_names(name, tool_defs)
                for i, arg_node in enumerate(call_node.args):
                    val = ast.literal_eval(arg_node)
                    if i < len(param_names):
                        args[param_names[i]] = val
                    else:
                        args[f"arg{i}"] = val
            if args:
                return name, args
        except Exception:
            pass
        # Fallback: regex-based key=value parsing
        args = {}
        for pair_match in re.finditer(r"(\w+)\s*=\s*(.+?)(?:,\s*(?=\w+\s*=)|$)", args_str, re.DOTALL):
            key = pair_match.group(1)
            val_str = pair_match.group(2).strip()
            try:
                args[key] = ast.literal_eval(val_str)
            except Exception:
                args[key] = val_str
        return name, args
    # Shell-style: "func_name arg1 arg2" or "func_name some/path"
    # Also handles: "func_name -flag arg" (common with shell tools)
    parts = call_str.split(None, 1)
    if parts and re.match(r"^\w+$", parts[0]):
        name = parts[0]
        if len(parts) == 1:
            return name, {}
        rest = parts[1].strip()
        param_names = _get_param_names(name, tool_defs)
        first_param = param_names[0] if param_names else "input"
        return name, {first_param: rest}
    # Last resort: treat the entire block as a command for the first
    # known tool that looks like a shell/command tool, or just fail
    raise ValueError(f"Cannot parse as function call: {call_str!r}")
 def _get_param_names(func_name: str, tool_defs: dict[str, dict] | None) -> list[str]:
    """Get ordered parameter names for a function from tool definitions."""
    if not tool_defs or func_name not in tool_defs:
        return []
    params = tool_defs[func_name].get("parameters", {})
    properties = params.get("properties", {})
    required = params.get("required", [])
    # Required params first, then optional
    optional = [k for k in properties if k not in required]
    return list(required) + optional
 class PromptCache:
    """Manages KV cache reuse across requests with shared prompt prefixes."""
    def __init__(self):
        self._cache = None
        self._cached_token_ids: list[int] | None = None
    def get_reusable_length(self, new_token_ids: list[int]) -> int:
        """Find how many leading tokens match the cached prefix."""
        if self._cached_token_ids is None or self._cache is None:
            return 0
        max_match = min(len(self._cached_token_ids), len(new_token_ids))
        match_len = 0
        for i in range(max_match):
            if self._cached_token_ids[i] != new_token_ids[i]:
                break
            match_len = i + 1
        return match_len
    def update(self, cache, token_ids: list[int]) -> None:
        """Store cache and the token IDs it was built from."""
        self._cache = cache
        self._cached_token_ids = list(token_ids)
    def clear(self) -> None:
        self._cache = None
        self._cached_token_ids = None
    @property
    def cache(self):
        return self._cache
 class InferenceEngine:
    """Manages model loading and text/vision generation."""
    def __init__(self, model_path: str = DEFAULT_MODEL):
        self.model_path = model_path
        self.model = None
        self.processor = None
        self.config = None
        self._lock = threading.Lock()
        self._prompt_cache = PromptCache()
    def load(self) -> None:
        logger.info("Loading model %s ...", self.model_path)
        self.model, self.processor = mlx_vlm.load(self.model_path)
        # Load model config for chat template
        from transformers import AutoConfig
        self.config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
        logger.info("Model loaded successfully.")
    # ------------------------------------------------------------------
    # Image helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _decode_image_url(url: str) -> str:
        """Convert a data URI or URL to a file path that mlx_vlm can consume."""
        if url.startswith("data:"):
            # data:image/png;base64,iVBOR...
            header, b64data = url.split(",", 1)
            img_bytes = base64.b64decode(b64data)
            img = Image.open(io.BytesIO(img_bytes))
            tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
            img.save(tmp, format="PNG")
            tmp.close()
            return tmp.name
        # Assume it's a URL or local path – mlx_vlm handles URLs natively
        return url
    # ------------------------------------------------------------------
    # Prompt building
    # ------------------------------------------------------------------
    def build_prompt(
        self,
        messages: list[dict],
        tools: list[dict] | None = None,
    ) -> tuple[str, list[str]]:
        """Build a prompt string and collect image paths from messages.
        Returns (prompt_str, image_paths).
        """
        image_paths: list[str] = []
        formatted_messages: list[dict] = []
        for msg in messages:
            role = msg["role"]
            content = msg.get("content")
            tool_calls = msg.get("tool_calls")
            tool_call_id = msg.get("tool_call_id")
            if role == "system":
                text = self._get_text_content(content)
                # Inject tool definitions into system prompt
                if tools:
                    text = self._inject_tools_into_system(text, tools)
                formatted_messages.append({"role": "user", "content": text})
                # Gemma 3 doesn't have a system role; we use the user role
                # and add a model acknowledgment
                formatted_messages.append({
                    "role": "assistant",
                    "content": "Understood. I will follow these instructions.",
                })
            elif role == "user":
                text, imgs = self._extract_content_parts(content)
                image_paths.extend(imgs)
                formatted_messages.append({"role": "user", "content": text})
            elif role == "assistant":
                text = self._get_text_content(content) or ""
                if tool_calls:
                    # Format tool calls in the way Gemma 3 expects
                    tc_text = self._format_tool_calls_for_prompt(tool_calls)
                    text = (text + "\n" + tc_text).strip()
                formatted_messages.append({"role": "assistant", "content": text})
            elif role == "tool":
                # Tool results use Gemma 3's tool_output format
                tool_text = self._get_text_content(content) or ""
                result_msg = f"```tool_output\n{tool_text}\n```"
                formatted_messages.append({"role": "user", "content": result_msg})
        # If the first system prompt had no tools but we have tools, inject at start
        if tools and not any(m.get("role") == "system" for m in messages):
            tool_system = self._build_tool_system_prompt(tools)
            formatted_messages.insert(0, {"role": "user", "content": tool_system})
            formatted_messages.insert(1, {
                "role": "assistant",
                "content": "Understood. I will follow these instructions and use tools when appropriate.",
            })
        # Gemma 3 requires strictly alternating user/assistant turns.
        # Merge consecutive same-role messages and ensure it starts with user.
        formatted_messages = self._merge_consecutive_roles(formatted_messages)
        # Apply chat template via mlx_vlm
        prompt = mlx_vlm.apply_chat_template(
            self.processor,
            self.config,
            formatted_messages,
            add_generation_prompt=True,
            num_images=len(image_paths),
        )
        return prompt, image_paths
    @staticmethod
    def _merge_consecutive_roles(messages: list[dict]) -> list[dict]:
        """Merge consecutive messages with the same role into one.
        Gemma 3's chat template enforces strict user/assistant alternation.
        """
        if not messages:
            return messages
        merged = [messages[0].copy()]
        for msg in messages[1:]:
            if msg["role"] == merged[-1]["role"]:
                # Merge content with the previous message
                merged[-1]["content"] = (
                    merged[-1].get("content", "") + "\n\n" + msg.get("content", "")
                )
            else:
                merged.append(msg.copy())
        # Ensure conversation starts with user
        if merged and merged[0]["role"] != "user":
            merged.insert(0, {"role": "user", "content": ""})
        return merged
    def _get_text_content(self, content) -> str:
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        # list of content parts
        parts = []
        for part in content:
            if isinstance(part, dict) and part.get("type") == "text":
                parts.append(part["text"])
        return "\n".join(parts)
    def _extract_content_parts(self, content) -> tuple[str, list[str]]:
        """Extract text and image paths from content parts."""
        if isinstance(content, str):
            return content, []
        if content is None:
            return "", []
        texts = []
        images = []
        for part in content:
            if isinstance(part, dict):
                if part.get("type") == "text":
                    texts.append(part["text"])
                elif part.get("type") == "image_url":
                    url = part["image_url"]["url"]
                    images.append(self._decode_image_url(url))
        return "\n".join(texts), images
    def _inject_tools_into_system(self, system_text: str, tools: list[dict]) -> str:
        tool_block = self._build_tool_system_prompt(tools)
        return f"{system_text}\n\n{tool_block}"
    def _build_tool_system_prompt(self, tools: list[dict]) -> str:
        """Build the tool system prompt using Google's official Gemma 3 format.
        Uses the tool_code/tool_output convention recommended by Google:
        - Tools defined as Python function signatures with docstrings
        - Model outputs calls in ```tool_code``` fenced blocks
        - Results returned in ```tool_output``` fenced blocks
        """
        func_defs = []
        for tool in tools:
            func = tool.get("function", tool)
            func_defs.append(self._tool_to_python_signature(func))
        functions_block = "\n\n".join(func_defs)
        return (
            "At each turn, if you decide to invoke any of the function(s), "
            "it should be wrapped with ```tool_code```. "
            "The python methods described below are imported and available, "
            "you can only use defined methods. "
            "The generated code should be readable and efficient. "
            "The response to a method will be wrapped in ```tool_output``` "
            "use it to call more tools or generate a helpful, friendly response.\n"
            "\n"
            f"{functions_block}"
        )
    @staticmethod
    def _tool_to_python_signature(func: dict) -> str:
        """Convert an OpenAI function definition to a Python function signature with docstring."""
        name = func["name"]
        desc = func.get("description", "")
        params = func.get("parameters", {})
        properties = params.get("properties", {})
        required = set(params.get("required", []))
        # Build parameter list
        param_parts = []
        doc_args = []
        for pname, pinfo in properties.items():
            ptype = _json_type_to_python(pinfo.get("type", "str"))
            pdesc = pinfo.get("description", "")
            if pname in required:
                param_parts.append(f"{pname}: {ptype}")
            else:
                default = _json_type_default(pinfo.get("type", "str"))
                param_parts.append(f"{pname}: {ptype} = {default}")
            doc_args.append(f"      {pname}: {pdesc}" if pdesc else f"      {pname}")
        sig = f"def {name}({', '.join(param_parts)}):"
        doc_lines = [f'    """{desc}']
        if doc_args:
            doc_lines.append("")
            doc_lines.append("    Args:")
            doc_lines.extend(doc_args)
        doc_lines.append('    """')
        return sig + "\n" + "\n".join(doc_lines)
    def _format_tool_calls_for_prompt(self, tool_calls: list[dict]) -> str:
        """Format OpenAI-style tool calls back into Gemma 3 tool_code blocks."""
        parts = []
        for tc in tool_calls:
            func = tc.get("function", tc)
            name = func["name"]
            args = func.get("arguments", "{}")
            if isinstance(args, str):
                args = json.loads(args)
            # Format as Python function call
            arg_parts = [f"{k}={_python_repr(v)}" for k, v in args.items()]
            call_str = f"{name}({', '.join(arg_parts)})"
            parts.append(f"```tool_code\n{call_str}\n```")
        return "\n".join(parts)
    # ------------------------------------------------------------------
    # Generation
    # ------------------------------------------------------------------
    # Common kwargs for mlx_vlm generate calls — optimized for Apple Silicon
    _GENERATE_KWARGS = {
        "kv_bits": 8,           # Quantize KV cache to 8-bit (halves memory bandwidth)
        "kv_group_size": 64,    # Group size for KV quantization
    }
    def generate(
        self,
        prompt: str,
        images: list[str] | None = None,
        max_tokens: int = 4096,
        temperature: float = 0.7,
        top_p: float = 0.9,
        stop: list[str] | None = None,
        repetition_penalty: float = 1.1,
    ) -> tuple[str, int, int]:
        """Generate a complete response. Returns (text, prompt_tokens, completion_tokens)."""
        with self._lock:
            image_arg = images if images else None
            result = mlx_vlm.generate(
                self.model,
                self.processor,
                prompt,
                image=image_arg,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                verbose=False,
                **self._GENERATE_KWARGS,
            )
            text = result.text
            if stop:
                text = self._apply_stop(text, stop)
            return text, result.prompt_tokens, result.generation_tokens
    def stream_generate(
        self,
        prompt: str,
        images: list[str] | None = None,
        max_tokens: int = 4096,
        temperature: float = 0.7,
        top_p: float = 0.9,
        stop: list[str] | None = None,
        repetition_penalty: float = 1.1,
    ) -> Generator[tuple[str, bool, int, int], None, None]:
        """Stream tokens. Yields (token_text, is_final, prompt_tokens, gen_tokens)."""
        with self._lock:
            image_arg = images if images else None
            accumulated = ""
            prompt_tokens = 0
            gen_tokens = 0
            for result in mlx_vlm.stream_generate(
                self.model,
                self.processor,
                prompt,
                image=image_arg,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                **self._GENERATE_KWARGS,
            ):
                # result.text is the incremental segment (detokenizer.last_segment),
                # NOT the full accumulated text.
                token_text = result.text
                accumulated += token_text
                prompt_tokens = result.prompt_tokens
                gen_tokens = result.generation_tokens
                if stop and self._check_stop(accumulated, stop):
                    # Trim the accumulated text and yield what's safe
                    trimmed = self._apply_stop(accumulated, stop)
                    # Only yield the part we haven't yielded yet
                    safe_delta = trimmed[len(accumulated) - len(token_text):]
                    yield safe_delta, True, prompt_tokens, gen_tokens
                    return
                yield token_text, False, prompt_tokens, gen_tokens
            # Final yield to signal completion
            yield "", True, prompt_tokens, gen_tokens
    @staticmethod
    def _apply_stop(text: str, stop: list[str]) -> str:
        for s in stop:
            idx = text.find(s)
            if idx != -1:
                text = text[:idx]
        return text
    @staticmethod
    def _check_stop(text: str, stop: list[str]) -> bool:
        return any(s in text for s in stop)
    # ------------------------------------------------------------------
    # Tool call parsing from model output
    # ------------------------------------------------------------------
    @staticmethod
    def parse_tool_calls(
        text: str, tools: list[dict] | None = None
    ) -> tuple[str, list[dict]]:
        """Parse tool calls from model output using Gemma 3's tool_code format.
        Detects ```tool_code ... ``` blocks containing Python-style or
        shell-style function calls.
        Returns (clean_text, tool_calls) where tool_calls is a list of
        {"id": str, "type": "function", "function": {"name": str, "arguments": str}}.
        """
        # Build a lookup of function name -> parameter schema
        tool_defs: dict[str, dict] = {}
        if tools:
            for tool in tools:
                func = tool.get("function", tool)
                tool_defs[func["name"]] = func
        tool_calls = []
        pattern = r"```tool_code\s*(.*?)\s*```"
        matches = re.findall(pattern, text, re.DOTALL)
        clean_text = re.sub(r"```tool_code\s*.*?\s*```", "", text, flags=re.DOTALL).strip()
        for i, match in enumerate(matches):
            call_str = match.strip()
            try:
                name, args = _parse_python_call(call_str, tool_defs)
                tool_calls.append({
                    "id": f"call_{i}_{hash(call_str) % 10**8:08d}",
                    "type": "function",
                    "function": {
                        "name": name,
                        "arguments": json.dumps(args),
                    },
                })
            except Exception as e:
                logger.warning("Failed to parse tool_code call %r: %s", call_str, e)
        return clean_text, tool_calls
--- a/mlx_server/main.py
+++ b/mlx_server/main.py
@@ -0,0 +1,278 @@
 """OpenAI-compatible API server for Gemma 3 4B via MLX."""
 from __future__ import annotations
 import argparse
 import json
 import logging
 import time
 import uuid
 import uvicorn
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse
 from .engine import DEFAULT_MODEL, InferenceEngine
 from .models import (
    ChatCompletionChunk,
    ChatCompletionRequest,
    ChatCompletionResponse,
    Choice,
    ChoiceMessage,
    DeltaMessage,
    ModelInfo,
    ModelListResponse,
    StreamChoice,
    ToolCall,
    FunctionCall,
    UsageInfo,
 )
 logger = logging.getLogger(__name__)
 app = FastAPI(title="MLX Server", description="OpenAI-compatible API for Gemma 3 4B")
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 engine: InferenceEngine | None = None
 def get_engine() -> InferenceEngine:
    if engine is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    return engine
 def _make_id() -> str:
    return f"chatcmpl-{uuid.uuid4().hex[:12]}"
 # ------------------------------------------------------------------
 # Endpoints
 # ------------------------------------------------------------------
@app.get("/v1/models")
 async def list_models() -> ModelListResponse:
    e = get_engine()
    return ModelListResponse(data=[ModelInfo(id=e.model_path)])
@app.post("/v1/chat/completions")
 async def chat_completions(request: ChatCompletionRequest):
    e = get_engine()
    # Convert pydantic messages to dicts
    messages = [m.model_dump(exclude_none=True) for m in request.messages]
    tools = None
    if request.tools:
        tools = [t.model_dump(exclude_none=True) for t in request.tools]
    prompt, images = e.build_prompt(messages, tools)
    stop = request.stop
    if isinstance(stop, str):
        stop = [stop]
    temperature = request.temperature if request.temperature is not None else 0.7
    top_p = request.top_p if request.top_p is not None else 0.9
    max_tokens = request.max_tokens if request.max_tokens is not None else 4096
    if request.stream:
        return EventSourceResponse(
            _stream_response(e, prompt, images, max_tokens, temperature, top_p, stop, tools, request.model),
            media_type="text/event-stream",
        )
    # Non-streaming
    text, prompt_tokens, completion_tokens = e.generate(
        prompt=prompt,
        images=images or None,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=stop,
    )
    # Check for tool calls in the response
    finish_reason = "stop"
    tool_calls_parsed = None
    if tools:
        clean_text, parsed = e.parse_tool_calls(text, tools)
        if parsed:
            tool_calls_parsed = [
                ToolCall(
                    index=i,
                    id=tc["id"],
                    type="function",
                    function=FunctionCall(
                        name=tc["function"]["name"],
                        arguments=tc["function"]["arguments"],
                    ),
                )
                for i, tc in enumerate(parsed)
            ]
            text = clean_text if clean_text else None
            finish_reason = "tool_calls"
    return ChatCompletionResponse(
        id=_make_id(),
        model=request.model,
        choices=[
            Choice(
                message=ChoiceMessage(
                    role="assistant",
                    content=text if not tool_calls_parsed else (text or None),
                    tool_calls=tool_calls_parsed,
                ),
                finish_reason=finish_reason,
            )
        ],
        usage=UsageInfo(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
    )
 async def _stream_response(
    e: InferenceEngine,
    prompt: str,
    images: list[str] | None,
    max_tokens: int,
    temperature: float,
    top_p: float,
    stop: list[str] | None,
    tools: list[dict] | None,
    model_name: str,
 ):
    request_id = _make_id()
    created = int(time.time())
    # Send initial chunk with role
    initial_chunk = ChatCompletionChunk(
        id=request_id,
        created=created,
        model=model_name,
        choices=[StreamChoice(delta=DeltaMessage(role="assistant"))],
    )
    yield {"data": initial_chunk.model_dump_json()}
    full_text = ""
    prompt_tokens = 0
    gen_tokens = 0
    for token_text, is_final, pt, gt in e.stream_generate(
        prompt=prompt,
        images=images or None,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=stop,
    ):
        prompt_tokens = pt
        gen_tokens = gt
        full_text += token_text
        if not is_final and token_text:
            chunk = ChatCompletionChunk(
                id=request_id,
                created=created,
                model=model_name,
                choices=[StreamChoice(delta=DeltaMessage(content=token_text))],
            )
            yield {"data": chunk.model_dump_json()}
    # Check for tool calls in complete response
    finish_reason = "stop"
    if tools:
        clean_text, parsed = e.parse_tool_calls(full_text, tools)
        if parsed:
            finish_reason = "tool_calls"
            # Emit tool call chunks
            for i, tc in enumerate(parsed):
                tc_chunk = ChatCompletionChunk(
                    id=request_id,
                    created=created,
                    model=model_name,
                    choices=[
                        StreamChoice(
                            delta=DeltaMessage(
                                tool_calls=[
                                    ToolCall(
                                        index=i,
                                        id=tc["id"],
                                        type="function",
                                        function=FunctionCall(
                                            name=tc["function"]["name"],
                                            arguments=tc["function"]["arguments"],
                                        ),
                                    )
                                ]
                            )
                        )
                    ],
                )
                yield {"data": tc_chunk.model_dump_json()}
    # Final chunk with finish reason and usage
    final_chunk = ChatCompletionChunk(
        id=request_id,
        created=created,
        model=model_name,
        choices=[StreamChoice(delta=DeltaMessage(), finish_reason=finish_reason)],
        usage=UsageInfo(
            prompt_tokens=prompt_tokens,
            completion_tokens=gen_tokens,
            total_tokens=prompt_tokens + gen_tokens,
        ),
    )
    yield {"data": final_chunk.model_dump_json()}
    yield {"data": "[DONE]"}
 # ------------------------------------------------------------------
 # Health / utility
 # ------------------------------------------------------------------
@app.get("/health")
 async def health():
    return {"status": "ok"}
 # ------------------------------------------------------------------
 # Entrypoint
 # ------------------------------------------------------------------
 def main():
    parser = argparse.ArgumentParser(description="MLX Server – OpenAI-compatible API")
    parser.add_argument("--model", type=str, default=DEFAULT_MODEL, help="HuggingFace model path")
    parser.add_argument("--host", type=str, default="127.0.0.1")
    parser.add_argument("--port", type=int, default=1234)
    parser.add_argument("--log-level", type=str, default="info")
    args = parser.parse_args()
    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    global engine
    engine = InferenceEngine(model_path=args.model)
    engine.load()
    uvicorn.run(app, host=args.host, port=args.port, log_level=args.log_level)
 if __name__ == "__main__":
    main()
--- a/mlx_server/models.py
+++ b/mlx_server/models.py
@@ -0,0 +1,144 @@
 """OpenAI API compatible request/response models."""
 from __future__ import annotations
 import time
 from typing import Any, Literal
 from pydantic import BaseModel, Field
 # --- Request models ---
 class FunctionDefinition(BaseModel):
    name: str
    description: str | None = None
    parameters: dict[str, Any] | None = None
 class ToolDefinition(BaseModel):
    type: Literal["function"] = "function"
    function: FunctionDefinition
 class FunctionCall(BaseModel):
    name: str
    arguments: str  # JSON string
 class ToolCall(BaseModel):
    index: int = 0
    id: str
    type: Literal["function"] = "function"
    function: FunctionCall
 class ContentPartText(BaseModel):
    type: Literal["text"] = "text"
    text: str
 class ImageURL(BaseModel):
    url: str  # Can be a URL or base64 data URI
    detail: str | None = None
 class ContentPartImage(BaseModel):
    type: Literal["image_url"] = "image_url"
    image_url: ImageURL
 ContentPart = ContentPartText | ContentPartImage
 class ChatMessage(BaseModel):
    role: Literal["system", "user", "assistant", "tool"]
    content: str | list[ContentPart] | None = None
    name: str | None = None
    tool_calls: list[ToolCall] | None = None
    tool_call_id: str | None = None
 class ChatCompletionRequest(BaseModel):
    model: str = "gemma-3-4b-it"
    messages: list[ChatMessage]
    temperature: float | None = 0.7
    top_p: float | None = 0.9
    max_tokens: int | None = 4096
    stream: bool = False
    stop: str | list[str] | None = None
    tools: list[ToolDefinition] | None = None
    tool_choice: str | dict | None = None
    frequency_penalty: float | None = None
    presence_penalty: float | None = None
    n: int | None = 1
 # --- Response models ---
 class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0
 class ChoiceMessage(BaseModel):
    role: str = "assistant"
    content: str | None = None
    tool_calls: list[ToolCall] | None = None
 class Choice(BaseModel):
    index: int = 0
    message: ChoiceMessage
    finish_reason: str | None = "stop"
 class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[Choice]
    usage: UsageInfo
 # --- Streaming response models ---
 class DeltaMessage(BaseModel):
    role: str | None = None
    content: str | None = None
    tool_calls: list[ToolCall] | None = None
 class StreamChoice(BaseModel):
    index: int = 0
    delta: DeltaMessage
    finish_reason: str | None = None
 class ChatCompletionChunk(BaseModel):
    id: str
    object: str = "chat.completion.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[StreamChoice]
    usage: UsageInfo | None = None
 # --- Model listing ---
 class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "local"
 class ModelListResponse(BaseModel):
    object: str = "list"
    data: list[ModelInfo]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,20 @@
 [project]
 name = "mlx-server"
 version = "0.1.0"
 description = "OpenAI-compatible API server for Gemma 3 4B via MLX"
 requires-python = ">=3.11"
 dependencies = [
    "fastapi>=0.115.0",
    "uvicorn[standard]>=0.30.0",
    "mlx>=0.22.0",
    "mlx-lm>=0.22.0",
    "mlx-vlm>=0.1.18",
    "pydantic>=2.0.0",
    "sse-starlette>=2.0.0",
    "pillow>=10.0.0",
    "httpx>=0.27.0",
    "torchvision>=0.20.0",
 ]
 [project.scripts]
 mlx-server = "mlx_server.main:main"
--- a/run.sh
+++ b/run.sh
@@ -0,0 +1,24 @@
 #!/usr/bin/env bash
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 cd "$SCRIPT_DIR"
 # Activate virtual environment
 source .venv/bin/activate
 # Default model – 4-bit quantized Gemma 3 4B IT (vision-capable)
 MODEL="${MODEL:-mlx-community/gemma-3-4b-it-4bit}"
 HOST="${HOST:-127.0.0.1}"
 PORT="${PORT:-1234}"
 echo "Starting MLX Server..."
 echo "  Model: $MODEL"
 echo "  Endpoint: http://$HOST:$PORT"
 echo ""
 exec python -m mlx_server.main \
    --model "$MODEL" \
    --host "$HOST" \
    --port "$PORT" \
    "$@"
--- a/test_server.py
+++ b/test_server.py
@@ -0,0 +1,296 @@
 """Test script for MLX Server – exercises chat, streaming, vision, and tool use."""
 import base64
 import io
 import json
 import sys
 import httpx
 from PIL import Image, ImageDraw
 BASE_URL = "http://127.0.0.1:1234/v1"
 MODEL = "mlx-community/gemma-3-4b-it-4bit"
 def test_models():
    """Test GET /v1/models."""
    print("=" * 60)
    print("TEST: List models")
    print("=" * 60)
    r = httpx.get(f"{BASE_URL}/models")
    r.raise_for_status()
    data = r.json()
    print(f"Models: {[m['id'] for m in data['data']]}")
    print("PASS\n")
 def test_chat_basic():
    """Test basic non-streaming chat."""
    print("=" * 60)
    print("TEST: Basic chat (non-streaming)")
    print("=" * 60)
    r = httpx.post(
        f"{BASE_URL}/chat/completions",
        json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Say exactly: 'The sky is blue.' Nothing else."}],
            "max_tokens": 50,
            "temperature": 0.1,
        },
        timeout=120,
    )
    r.raise_for_status()
    data = r.json()
    msg = data["choices"][0]["message"]["content"]
    usage = data["usage"]
    print(f"Response: {msg}")
    print(f"Usage: {usage}")
    print(f"Finish reason: {data['choices'][0]['finish_reason']}")
    print("PASS\n")
 def test_chat_streaming():
    """Test streaming chat."""
    print("=" * 60)
    print("TEST: Streaming chat")
    print("=" * 60)
    collected = ""
    with httpx.stream(
        "POST",
        f"{BASE_URL}/chat/completions",
        json={
            "model": MODEL,
            "messages": [{"role": "user", "content": "Count from 1 to 5, one number per line."}],
            "max_tokens": 100,
            "temperature": 0.1,
            "stream": True,
        },
        timeout=120,
    ) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line.startswith("data: "):
                continue
            payload = line[len("data: "):]
            if payload == "[DONE]":
                break
            chunk = json.loads(payload)
            delta = chunk["choices"][0]["delta"]
            if delta.get("content"):
                collected += delta["content"]
                print(delta["content"], end="", flush=True)
            if chunk["choices"][0].get("finish_reason"):
                print(f"\n[finish_reason: {chunk['choices'][0]['finish_reason']}]")
            if chunk.get("usage") and chunk["usage"].get("total_tokens", 0) > 0:
                print(f"[usage: {chunk['usage']}]")
    print(f"Full collected: {collected!r}")
    print("PASS\n")
 def _make_test_image() -> str:
    """Create a simple test image and return it as a base64 data URI."""
    img = Image.new("RGB", (200, 200), color=(135, 206, 235))
    draw = ImageDraw.Draw(img)
    # Draw a red circle
    draw.ellipse([50, 50, 150, 150], fill=(255, 0, 0), outline=(0, 0, 0), width=2)
    # Draw a green triangle
    draw.polygon([(100, 20), (60, 80), (140, 80)], fill=(0, 180, 0), outline=(0, 0, 0))
    # Draw yellow text area
    draw.rectangle([10, 160, 190, 190], fill=(255, 255, 0))
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/png;base64,{b64}"
 def test_vision():
    """Test vision with an image."""
    print("=" * 60)
    print("TEST: Vision (image description)")
    print("=" * 60)
    image_uri = _make_test_image()
    print(f"Image: 200x200 PNG with red circle, green triangle, yellow bar")
    r = httpx.post(
        f"{BASE_URL}/chat/completions",
        json={
            "model": MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe what shapes and colors you see in this image. Be brief."},
                        {"type": "image_url", "image_url": {"url": image_uri}},
                    ],
                }
            ],
            "max_tokens": 200,
            "temperature": 0.1,
        },
        timeout=120,
    )
    r.raise_for_status()
    data = r.json()
    msg = data["choices"][0]["message"]["content"]
    print(f"Response: {msg}")
    print("PASS\n")
 def test_tool_use():
    """Test tool calling."""
    print("=" * 60)
    print("TEST: Tool use")
    print("=" * 60)
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a given city",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {
                            "type": "string",
                            "description": "The city name, e.g. 'London'",
                        },
                        "units": {
                            "type": "string",
                            "description": "Temperature units: 'celsius' or 'fahrenheit'",
                        },
                    },
                    "required": ["city"],
                },
            },
        }
    ]
    # Step 1: Ask the model to use the tool
    print("Step 1: Asking model to get weather for Paris...")
    r = httpx.post(
        f"{BASE_URL}/chat/completions",
        json={
            "model": MODEL,
            "messages": [
                {"role": "user", "content": "What is the weather in Paris right now? Use the get_weather tool."},
            ],
            "tools": tools,
            "max_tokens": 300,
            "temperature": 0.1,
        },
        timeout=120,
    )
    r.raise_for_status()
    data = r.json()
    choice = data["choices"][0]
    print(f"Finish reason: {choice['finish_reason']}")
    print(f"Content: {choice['message'].get('content')}")
    print(f"Tool calls: {choice['message'].get('tool_calls')}")
    if choice["message"].get("tool_calls"):
        tc = choice["message"]["tool_calls"][0]
        print(f"\nTool call detected:")
        print(f"  ID: {tc['id']}")
        print(f"  Function: {tc['function']['name']}")
        print(f"  Arguments: {tc['function']['arguments']}")
        # Step 2: Send the tool result back
        print("\nStep 2: Sending mock tool result back...")
        r2 = httpx.post(
            f"{BASE_URL}/chat/completions",
            json={
                "model": MODEL,
                "messages": [
                    {"role": "user", "content": "What is the weather in Paris right now? Use the get_weather tool."},
                    {
                        "role": "assistant",
                        "content": choice["message"].get("content"),
                        "tool_calls": choice["message"]["tool_calls"],
                    },
                    {
                        "role": "tool",
                        "tool_call_id": tc["id"],
                        "content": json.dumps({"temperature": 18, "condition": "Partly cloudy", "humidity": 65}),
                    },
                ],
                "tools": tools,
                "max_tokens": 300,
                "temperature": 0.1,
            },
            timeout=120,
        )
        r2.raise_for_status()
        data2 = r2.json()
        msg2 = data2["choices"][0]["message"]["content"]
        print(f"Final response: {msg2}")
    else:
        print("WARNING: Model did not produce a tool call. Raw response above.")
    print("PASS\n")
 def test_multi_turn():
    """Test multi-turn conversation."""
    print("=" * 60)
    print("TEST: Multi-turn conversation")
    print("=" * 60)
    messages = [
        {"role": "user", "content": "My name is Alice."},
    ]
    r = httpx.post(
        f"{BASE_URL}/chat/completions",
        json={"model": MODEL, "messages": messages, "max_tokens": 100, "temperature": 0.1},
        timeout=120,
    )
    r.raise_for_status()
    reply1 = r.json()["choices"][0]["message"]["content"]
    print(f"Turn 1 reply: {reply1}")
    messages.append({"role": "assistant", "content": reply1})
    messages.append({"role": "user", "content": "What is my name?"})
    r2 = httpx.post(
        f"{BASE_URL}/chat/completions",
        json={"model": MODEL, "messages": messages, "max_tokens": 100, "temperature": 0.1},
        timeout=120,
    )
    r2.raise_for_status()
    reply2 = r2.json()["choices"][0]["message"]["content"]
    print(f"Turn 2 reply: {reply2}")
    assert "alice" in reply2.lower(), f"Expected 'Alice' in response, got: {reply2}"
    print("PASS\n")
 if __name__ == "__main__":
    tests = [
        test_models,
        test_chat_basic,
        test_chat_streaming,
        test_vision,
        test_tool_use,
        test_multi_turn,
    ]
    # Allow running a single test by name
    if len(sys.argv) > 1:
        name = sys.argv[1]
        tests = [t for t in tests if name in t.__name__]
        if not tests:
            print(f"No test matching '{name}'. Available: models, chat_basic, chat_streaming, vision, tool_use, multi_turn")
            sys.exit(1)
    passed = 0
    failed = 0
    for test in tests:
        try:
            test()
            passed += 1
        except Exception as e:
            print(f"FAIL: {e}\n")
            failed += 1
    print("=" * 60)
    print(f"Results: {passed} passed, {failed} failed")
    print("=" * 60)