From 27849ccbd7eed5e8aabd63f44326190edf6a6cb3 Mon Sep 17 00:00:00 2001 From: Chili Palmer Date: Wed, 18 Mar 2026 13:08:21 +0100 Subject: [PATCH] feat: added stheno (Llama-based) text-only model, too --- MLXServer/ContentView.swift | 2 +- MLXServer/Models/ModelConfig.swift | 32 +++++++++++++++++++++--- MLXServer/Server/APIServer.swift | 21 +++++++++++++++- MLXServer/ViewModels/ChatViewModel.swift | 10 +++++++- MLXServer/ViewModels/ModelManager.swift | 21 ++++++++++++---- MLXServer/Views/ChatInputView.swift | 11 ++++++-- README.md | 25 ++++++++++++------ 7 files changed, 101 insertions(+), 21 deletions(-) diff --git a/MLXServer/ContentView.swift b/MLXServer/ContentView.swift index 9ab55db..6f52594 100644 --- a/MLXServer/ContentView.swift +++ b/MLXServer/ContentView.swift @@ -22,7 +22,7 @@ struct ContentView: View { } } .onChange(of: modelManager.currentModel) { - chatVM?.resetSession() + chatVM?.handleModelChange() // Persist last used model if let id = modelManager.currentModel?.id { Preferences.lastModelId = id diff --git a/MLXServer/Models/ModelConfig.swift b/MLXServer/Models/ModelConfig.swift index ac68653..d393cdd 100644 --- a/MLXServer/Models/ModelConfig.swift +++ b/MLXServer/Models/ModelConfig.swift @@ -3,10 +3,18 @@ import MLXLMCommon /// Defines a supported model with its metadata. struct ModelConfig: Identifiable, Hashable { + enum LoaderKind: Hashable { + case llm + case vlm + } + let id: String // alias: "gemma", "gemma3n", "qwen" let repoId: String // HuggingFace ID let displayName: String let contextLength: Int + let loaderKind: LoaderKind + let supportsImages: Bool + let supportsTools: Bool /// All models supported by the app. 
static let availableModels: [ModelConfig] = [ @@ -14,19 +22,37 @@ struct ModelConfig: Identifiable, Hashable { id: "gemma", repoId: "mlx-community/gemma-3-4b-it-4bit", displayName: "Gemma 3 4B", - contextLength: 128_000 + contextLength: 128_000, + loaderKind: .vlm, + supportsImages: true, + supportsTools: true ), ModelConfig( id: "qwen", repoId: "mlx-community/Qwen3-VL-4B-Instruct-4bit", displayName: "Qwen3 VL 4B", - contextLength: 256_000 + contextLength: 256_000, + loaderKind: .vlm, + supportsImages: true, + supportsTools: true ), ModelConfig( id: "qwen3.5-9b", repoId: "mlx-community/Qwen3.5-9B-4bit", displayName: "Qwen3.5 9B", - contextLength: 256_000 + contextLength: 256_000, + loaderKind: .llm, + supportsImages: false, + supportsTools: true + ), + ModelConfig( + id: "stheno", + repoId: "synk/L3-8B-Stheno-v3.2-MLX", + displayName: "Stheno L3 8B", + contextLength: 8_192, + loaderKind: .llm, + supportsImages: false, + supportsTools: false ), ] diff --git a/MLXServer/Server/APIServer.swift b/MLXServer/Server/APIServer.swift index 9da6a70..ab6c102 100644 --- a/MLXServer/Server/APIServer.swift +++ b/MLXServer/Server/APIServer.swift @@ -221,12 +221,22 @@ final class APIServer { let requestId = "chatcmpl-\(UUID().uuidString.prefix(12).lowercased())" let created = Int(Date().timeIntervalSince1970) let modelName = request.model ?? modelManager.currentModel?.repoId ?? "unknown" + let currentModel = modelManager.currentModel let contextLength = modelManager.currentModel?.contextLength ?? 
0 + if let tools = request.tools, !tools.isEmpty, currentModel?.supportsTools != true { + sendResponse( + connection: connection, + status: 400, + body: #"{"error":{"message":"The currently selected model does not support tool calls.","type":"invalid_request_error","code":"tools_not_supported"}}"# + ) + return + } + // Convert API messages to Chat.Message, extracting images from content parts var chatMessages: [Chat.Message] = [] var images: [UserInput.Image] = [] - let currentModelRepoId = modelManager.currentModel?.repoId ?? modelName + let currentModelRepoId = currentModel?.repoId ?? modelName // Build the instructions string (system prompt + tool definitions). // This is passed to ChatSession via `instructions:` rather than injected @@ -298,6 +308,15 @@ final class APIServer { images.append(contentsOf: messageImages) } + if !images.isEmpty, currentModel?.supportsImages != true { + sendResponse( + connection: connection, + status: 400, + body: #"{"error":{"message":"The currently selected model does not support image inputs.","type":"invalid_request_error","code":"vision_not_supported"}}"# + ) + return + } + // Context window check: estimate token count and reject if over limit if contextLength > 0 { let totalChars = chatMessages.reduce(0) { $0 + $1.content.count } diff --git a/MLXServer/ViewModels/ChatViewModel.swift b/MLXServer/ViewModels/ChatViewModel.swift index fe4c8db..76c353a 100644 --- a/MLXServer/ViewModels/ChatViewModel.swift +++ b/MLXServer/ViewModels/ChatViewModel.swift @@ -53,7 +53,7 @@ final class ChatViewModel { ensureSession() guard let session = chatSession else { return } - let images = attachedImages + let images = modelManager.currentModel?.supportsImages == true ? 
attachedImages : [] inputText = "" attachedImages = [] @@ -135,6 +135,7 @@ final class ChatViewModel { } func attachImage(_ image: NSImage) { + guard modelManager.currentModel?.supportsImages == true else { return } attachedImages.append(image) } @@ -154,6 +155,13 @@ final class ChatViewModel { chatSession = nil } + func handleModelChange() { + resetSession() + if modelManager.currentModel?.supportsImages != true { + attachedImages = [] + } + } + // MARK: - API Server func startAPIServer() { diff --git a/MLXServer/ViewModels/ModelManager.swift b/MLXServer/ViewModels/ModelManager.swift index aa22335..a6e914c 100644 --- a/MLXServer/ViewModels/ModelManager.swift +++ b/MLXServer/ViewModels/ModelManager.swift @@ -1,6 +1,7 @@ import Foundation import Hub import MLX +import MLXLLM import MLXLMCommon import MLXVLM @@ -77,11 +78,21 @@ final class ModelManager { configuration = config.modelConfiguration } - let container = try await VLMModelFactory.shared.loadContainer( - hub: Self.hub, - configuration: configuration, - progressHandler: progressHandler - ) + let container: ModelContainer + switch config.loaderKind { + case .llm: + container = try await LLMModelFactory.shared.loadContainer( + hub: Self.hub, + configuration: configuration, + progressHandler: progressHandler + ) + case .vlm: + container = try await VLMModelFactory.shared.loadContainer( + hub: Self.hub, + configuration: configuration, + progressHandler: progressHandler + ) + } self.isDownloading = false self.modelContainer = container diff --git a/MLXServer/Views/ChatInputView.swift b/MLXServer/Views/ChatInputView.swift index 1c544a7..6117a22 100644 --- a/MLXServer/Views/ChatInputView.swift +++ b/MLXServer/Views/ChatInputView.swift @@ -5,10 +5,14 @@ struct ChatInputView: View { @Bindable var viewModel: ChatViewModel @State private var pasteMonitor: Any? 
+ private var supportsImages: Bool { + viewModel.modelManager.currentModel?.supportsImages == true + } + var body: some View { VStack(spacing: 8) { // Image preview strip - if !viewModel.attachedImages.isEmpty { + if supportsImages && !viewModel.attachedImages.isEmpty { ScrollView(.horizontal, showsIndicators: false) { HStack(spacing: 8) { ForEach(Array(viewModel.attachedImages.enumerated()), id: \.offset) { index, image in @@ -46,7 +50,7 @@ struct ChatInputView: View { .font(.title3) } .buttonStyle(.plain) - .disabled(!viewModel.modelManager.isReady) + .disabled(!viewModel.modelManager.isReady || !supportsImages) // Text field TextField("Message…", text: $viewModel.inputText, axis: .vertical) @@ -87,6 +91,7 @@ struct ChatInputView: View { } .padding(.top, 4) .onDrop(of: [.image, .fileURL], isTargeted: nil) { providers in + guard supportsImages else { return false } for provider in providers { if provider.hasItemConformingToTypeIdentifier(UTType.fileURL.identifier) { provider.loadItem(forTypeIdentifier: UTType.fileURL.identifier, options: nil) { data, _ in @@ -121,6 +126,7 @@ struct ChatInputView: View { private func installPasteMonitor() { guard pasteMonitor == nil else { return } pasteMonitor = NSEvent.addLocalMonitorForEvents(matching: .keyDown) { event in + guard supportsImages else { return event } // Check for Cmd+V guard event.modifierFlags.contains(.command), event.charactersIgnoringModifiers == "v" else { @@ -178,6 +184,7 @@ struct ChatInputView: View { // MARK: - File picker private func pickImage() { + guard supportsImages else { return } let panel = NSOpenPanel() panel.allowedContentTypes = [.image] panel.allowsMultipleSelection = true diff --git a/README.md b/README.md index 1075e12..cbac6d8 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,17 @@ # MLX Server -Native macOS app for running local LLMs on Apple Silicon via [MLX](https://github.com/ml-explore/mlx). 
Built with SwiftUI, it provides both a **chat UI** and an embedded **OpenAI-compatible API server**. Supports vision, tool use, and thinking mode. +Native macOS app for running local LLMs on Apple Silicon via [MLX](https://github.com/ml-explore/mlx). Built with SwiftUI, it provides both a **chat UI** and an embedded **OpenAI-compatible API server**. Supports both vision-capable and text-only MLX models, plus tool use and thinking mode where the selected model supports them. ## Supported Models -| Alias | Model | Context | Capabilities | -|-------|-------|---------|-------------| -| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | 128k | Vision, tool use (`tool_code` blocks) | -| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | 256k | Vision, tool use (`<tool_call>` tags) | -| `qwen3.5-9b` | `mlx-community/Qwen3.5-9B-4bit` | 256k | Thinking mode, tool use | +| Alias | Model | Context | Loader | Capabilities | +|-------|-------|---------|--------|-------------| +| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | 128k | `VLMModelFactory` | Vision, tool use (`tool_code` blocks) | +| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | 256k | `VLMModelFactory` | Vision, tool use (`<tool_call>` tags) | +| `qwen3.5-9b` | `mlx-community/Qwen3.5-9B-4bit` | 256k | `LLMModelFactory` | Text-only, thinking mode, tool use | +| `stheno` | `synk/L3-8B-Stheno-v3.2-MLX` | 8k | `LLMModelFactory` | Text-only | + +`stheno` is loaded as a standard MLX text model. The Hugging Face card provides an `mlx_lm.load(...)` sample rather than a VLM example, and its config reports `model_type: llama` with `max_position_embeddings: 8192`, so the app treats it as an 8k Llama-family text model. Any model in MLX format on HuggingFace can be added — there is no restriction on uploader or architecture. 
@@ -23,7 +26,7 @@ open "build/Debug/MLX Server.app" ## App Features -- **Chat interface** with markdown rendering, image attachments (file picker, drag & drop, clipboard paste, Finder copy-paste) +- **Chat interface** with markdown rendering and model-aware image attachments (file picker, drag & drop, clipboard paste, Finder copy-paste on vision-capable models) - **Model picker** in toolbar with local/download status indicators and re-download button - **Download progress modal** — shows file progress, percentage, and speed when downloading a new model - **Thinking mode** — models like Qwen3.5 can reason internally before responding; thinking content appears in a collapsible box. Toggle on/off in Settings. @@ -42,6 +45,8 @@ The embedded API server (toggle in toolbar) runs on port 1234 by default. Standa - `POST /v1/chat/completions` — chat completions (streaming and non-streaming) - `GET /health` — health check +Capability checks are enforced server-side. If a request sends images to a text-only model or tools to a model without tool support, the server returns a `400 invalid_request_error`. + ### Model Swapping Send any model ID or alias in the `model` field. If it differs from the currently loaded model, the server swaps automatically: @@ -69,10 +74,14 @@ Pass images as base64 data URIs in the `image_url` content part: } ``` +Text-only models such as `qwen3.5-9b` and `stheno` reject image inputs. + ### Tool Use Pass tools in the `tools` field (OpenAI format). The server handles model-specific formatting (Gemma `tool_code` blocks, Qwen `<tool_call>` XML tags) and parses tool calls from output automatically. When tools are present during streaming, output is buffered to strip tool-call markup before sending to the client. +`stheno` is currently documented and configured as a plain text model, so tool requests to it are rejected. 
+ ## Project Structure ``` @@ -112,7 +121,7 @@ build.sh — One-command build script (xcodegen + xcodebuild) ## Key Design Decisions -- Uses `mlx-swift-lm` (`MLXVLM` / `VLMModelFactory`) for inference — loads any MLX-format model from HuggingFace +- Uses `mlx-swift-lm` for inference — `VLMModelFactory` for vision models and `LLMModelFactory` for text-only models - **Offline-first**: `LocalModelResolver` checks both the sandboxed app container and `~/.cache/huggingface/hub/` for locally-cached models before downloading - **No duplicate storage**: custom `HubApi` with blob cache disabled — models are stored once in the snapshot cache - **KV cache reuse** across API requests — reuses `ChatSession` when conversation history prefix matches