feat: added stheno (Llama-based) text-only model, too

2026-03-18 13:08:21 +01:00
parent 6a87fe6f08
commit 27849ccbd7
7 changed files with 101 additions and 21 deletions
--- a/MLXServer/ContentView.swift
+++ b/MLXServer/ContentView.swift
@@ -22,7 +22,7 @@ struct ContentView: View {
                }
            }
            .onChange(of: modelManager.currentModel) {
-                chatVM?.resetSession()
+                chatVM?.handleModelChange()
                // Persist last used model
                if let id = modelManager.currentModel?.id {
                    Preferences.lastModelId = id
--- a/MLXServer/Models/ModelConfig.swift
+++ b/MLXServer/Models/ModelConfig.swift
@@ -3,10 +3,18 @@ import MLXLMCommon
 /// Defines a supported model with its metadata.
 struct ModelConfig: Identifiable, Hashable {
    /// Identifies which loading path a model uses: `llm` for the language-model
    /// loader, `vlm` for the vision-language-model loader.
    enum LoaderKind: Hashable {
        case llm, vlm
    }
    let id: String          // alias: "gemma", "gemma3n", "qwen"
    let repoId: String      // HuggingFace ID
    let displayName: String
    let contextLength: Int
    let loaderKind: LoaderKind
    let supportsImages: Bool
    let supportsTools: Bool
    /// All models supported by the app.
    static let availableModels: [ModelConfig] = [
@@ -14,19 +22,37 @@ struct ModelConfig: Identifiable, Hashable {
            id: "gemma",
            repoId: "mlx-community/gemma-3-4b-it-4bit",
            displayName: "Gemma 3 4B",
-            contextLength: 128_000
+            contextLength: 128_000,
            loaderKind: .vlm,
            supportsImages: true,
            supportsTools: true
        ),
        ModelConfig(
            id: "qwen",
            repoId: "mlx-community/Qwen3-VL-4B-Instruct-4bit",
            displayName: "Qwen3 VL 4B",
-            contextLength: 256_000
+            contextLength: 256_000,
            loaderKind: .vlm,
            supportsImages: true,
            supportsTools: true
        ),
        ModelConfig(
            id: "qwen3.5-9b",
            repoId: "mlx-community/Qwen3.5-9B-4bit",
            displayName: "Qwen3.5 9B",
-            contextLength: 256_000
+            contextLength: 256_000,
            loaderKind: .llm,
            supportsImages: false,
            supportsTools: true
        ),
        ModelConfig(
            id: "stheno",
            repoId: "synk/L3-8B-Stheno-v3.2-MLX",
            displayName: "Stheno L3 8B",
            contextLength: 8_192,
            loaderKind: .llm,
            supportsImages: false,
            supportsTools: false
        ),
    ]
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -221,12 +221,22 @@ final class APIServer {
        let requestId = "chatcmpl-\(UUID().uuidString.prefix(12).lowercased())"
        let created = Int(Date().timeIntervalSince1970)
        let modelName = request.model ?? modelManager.currentModel?.repoId ?? "unknown"
        let currentModel = modelManager.currentModel
        let contextLength = modelManager.currentModel?.contextLength ?? 0
        if let tools = request.tools, !tools.isEmpty, currentModel?.supportsTools != true {
            sendResponse(
                connection: connection,
                status: 400,
                body: #"{"error":{"message":"The currently selected model does not support tool calls.","type":"invalid_request_error","code":"tools_not_supported"}}"#
            )
            return
        }
        // Convert API messages to Chat.Message, extracting images from content parts
        var chatMessages: [Chat.Message] = []
        var images: [UserInput.Image] = []
-        let currentModelRepoId = modelManager.currentModel?.repoId ?? modelName
+        let currentModelRepoId = currentModel?.repoId ?? modelName
        // Build the instructions string (system prompt + tool definitions).
        // This is passed to ChatSession via `instructions:` rather than injected
@@ -298,6 +308,15 @@ final class APIServer {
            images.append(contentsOf: messageImages)
        }
        if !images.isEmpty, currentModel?.supportsImages != true {
            sendResponse(
                connection: connection,
                status: 400,
                body: #"{"error":{"message":"The currently selected model does not support image inputs.","type":"invalid_request_error","code":"vision_not_supported"}}"#
            )
            return
        }
        // Context window check: estimate token count and reject if over limit
        if contextLength > 0 {
            let totalChars = chatMessages.reduce(0) { $0 + $1.content.count }
--- a/MLXServer/ViewModels/ChatViewModel.swift
+++ b/MLXServer/ViewModels/ChatViewModel.swift
@@ -53,7 +53,7 @@ final class ChatViewModel {
        ensureSession()
        guard let session = chatSession else { return }
-        let images = attachedImages
+        let images = modelManager.currentModel?.supportsImages == true ? attachedImages : []
        inputText = ""
        attachedImages = []
@@ -135,6 +135,7 @@ final class ChatViewModel {
    }
    /// Adds `image` to the pending attachments for the next message.
    ///
    /// The image is ignored when no model is selected or when the selected
    /// model does not report image support.
    /// - Parameter image: The image to attach.
    func attachImage(_ image: NSImage) {
        let modelAcceptsImages = modelManager.currentModel?.supportsImages == true
        if modelAcceptsImages {
            attachedImages.append(image)
        }
    }
@@ -154,6 +155,13 @@ final class ChatViewModel {
        chatSession = nil
    }
    /// Responds to a change of the selected model.
    ///
    /// The chat session is always reset. Pending image attachments are kept
    /// only when the newly selected model reports image support; otherwise
    /// (including when no model is selected) they are discarded.
    func handleModelChange() {
        resetSession()

        let modelAcceptsImages = modelManager.currentModel?.supportsImages == true
        guard !modelAcceptsImages else { return }
        attachedImages = []
    }
    // MARK: - API Server
    func startAPIServer() {
--- a/MLXServer/ViewModels/ModelManager.swift
+++ b/MLXServer/ViewModels/ModelManager.swift
@@ -1,6 +1,7 @@
 import Foundation
 import Hub
 import MLX
 import MLXLLM
 import MLXLMCommon
 import MLXVLM
@@ -77,11 +78,21 @@ final class ModelManager {
                configuration = config.modelConfiguration
            }
-            let container = try await VLMModelFactory.shared.loadContainer(
+            let container: ModelContainer
            switch config.loaderKind {
            case .llm:
                container = try await LLMModelFactory.shared.loadContainer(
                    hub: Self.hub,
                    configuration: configuration,
                    progressHandler: progressHandler
                )
            case .vlm:
                container = try await VLMModelFactory.shared.loadContainer(
                    hub: Self.hub,
                    configuration: configuration,
                    progressHandler: progressHandler
                )
            }
            self.isDownloading = false
            self.modelContainer = container
--- a/MLXServer/Views/ChatInputView.swift
+++ b/MLXServer/Views/ChatInputView.swift
@@ -5,10 +5,14 @@ struct ChatInputView: View {
    @Bindable var viewModel: ChatViewModel
    @State private var pasteMonitor: Any?
    /// Whether the currently selected model accepts image input.
    /// Evaluates to `false` when no model is selected.
    private var supportsImages: Bool {
        viewModel.modelManager.currentModel?.supportsImages ?? false
    }
    var body: some View {
        VStack(spacing: 8) {
            // Image preview strip
-            if !viewModel.attachedImages.isEmpty {
+            if supportsImages && !viewModel.attachedImages.isEmpty {
                ScrollView(.horizontal, showsIndicators: false) {
                    HStack(spacing: 8) {
                        ForEach(Array(viewModel.attachedImages.enumerated()), id: \.offset) { index, image in
@@ -46,7 +50,7 @@ struct ChatInputView: View {
                        .font(.title3)
                }
                .buttonStyle(.plain)
-                .disabled(!viewModel.modelManager.isReady)
+                .disabled(!viewModel.modelManager.isReady || !supportsImages)
                // Text field
                TextField("Message…", text: $viewModel.inputText, axis: .vertical)
@@ -87,6 +91,7 @@ struct ChatInputView: View {
        }
        .padding(.top, 4)
        .onDrop(of: [.image, .fileURL], isTargeted: nil) { providers in
            guard supportsImages else { return false }
            for provider in providers {
                if provider.hasItemConformingToTypeIdentifier(UTType.fileURL.identifier) {
                    provider.loadItem(forTypeIdentifier: UTType.fileURL.identifier, options: nil) { data, _ in
@@ -121,6 +126,7 @@ struct ChatInputView: View {
    private func installPasteMonitor() {
        guard pasteMonitor == nil else { return }
        pasteMonitor = NSEvent.addLocalMonitorForEvents(matching: .keyDown) { event in
            guard supportsImages else { return event }
            // Check for Cmd+V
            guard event.modifierFlags.contains(.command),
                  event.charactersIgnoringModifiers == "v" else {
@@ -178,6 +184,7 @@ struct ChatInputView: View {
    // MARK: - File picker
    private func pickImage() {
        guard supportsImages else { return }
        let panel = NSOpenPanel()
        panel.allowedContentTypes = [.image]
        panel.allowsMultipleSelection = true
--- a/README.md
+++ b/README.md
@@ -1,14 +1,17 @@
 # MLX Server
-Native macOS app for running local LLMs on Apple Silicon via [MLX](https://github.com/ml-explore/mlx). Built with SwiftUI, it provides both a **chat UI** and an embedded **OpenAI-compatible API server**. Supports vision, tool use, and thinking mode.
+Native macOS app for running local LLMs on Apple Silicon via [MLX](https://github.com/ml-explore/mlx). Built with SwiftUI, it provides both a **chat UI** and an embedded **OpenAI-compatible API server**. Supports both vision-capable and text-only MLX models, plus tool use and thinking mode where the selected model supports them.
 ## Supported Models
-| Alias | Model | Context | Capabilities |
+| Alias | Model | Context | Loader | Capabilities |
-|-------|-------|---------|-------------|
+|-------|-------|---------|--------|-------------|
-| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | 128k | Vision, tool use (`tool_code` blocks) |
+| `gemma` | `mlx-community/gemma-3-4b-it-4bit` | 128k | `VLMModelFactory` | Vision, tool use (`tool_code` blocks) |
-| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | 256k | Vision, tool use (`<tool_call>` tags) |
+| `qwen` | `mlx-community/Qwen3-VL-4B-Instruct-4bit` | 256k | `VLMModelFactory` | Vision, tool use (`<tool_call>` tags) |
-| `qwen3.5-9b` | `mlx-community/Qwen3.5-9B-4bit` | 256k | Thinking mode, tool use |
+| `qwen3.5-9b` | `mlx-community/Qwen3.5-9B-4bit` | 256k | `LLMModelFactory` | Text-only, thinking mode, tool use |
 | `stheno` | `synk/L3-8B-Stheno-v3.2-MLX` | 8k | `LLMModelFactory` | Text-only |
 `stheno` is loaded as a standard MLX text model. The Hugging Face card provides an `mlx_lm.load(...)` sample rather than a VLM example, and its config reports `model_type: llama` with `max_position_embeddings: 8192`, so the app treats it as an 8k Llama-family text model.
 Any model in MLX format on HuggingFace can be added — there is no restriction on uploader or architecture.
@@ -23,7 +26,7 @@ open "build/Debug/MLX Server.app"
 ## App Features
- **Chat interface** with markdown rendering, image attachments (file picker, drag & drop, clipboard paste, Finder copy-paste)
+- **Chat interface** with markdown rendering and model-aware image attachments (file picker, drag & drop, clipboard paste, Finder copy-paste on vision-capable models)
 - **Model picker** in toolbar with local/download status indicators and re-download button
 - **Download progress modal** — shows file progress, percentage, and speed when downloading a new model
 - **Thinking mode** — models like Qwen3.5 can reason internally before responding; thinking content appears in a collapsible box. Toggle on/off in Settings.
@@ -42,6 +45,8 @@ The embedded API server (toggle in toolbar) runs on port 1234 by default. Standa
 - `POST /v1/chat/completions` — chat completions (streaming and non-streaming)
 - `GET /health` — health check
 Capability checks are enforced server-side. If a request sends images to a text-only model or tools to a model without tool support, the server returns a `400` response with type `invalid_request_error` and code `vision_not_supported` or `tools_not_supported`, respectively.
 ### Model Swapping
 Send any model ID or alias in the `model` field. If it differs from the currently loaded model, the server swaps automatically:
@@ -69,10 +74,14 @@ Pass images as base64 data URIs in the `image_url` content part:
 }
 ```
 Text-only models such as `qwen3.5-9b` and `stheno` reject image inputs.
 ### Tool Use
 Pass tools in the `tools` field (OpenAI format). The server handles model-specific formatting (Gemma `tool_code` blocks, Qwen `<tool_call>` XML tags) and parses tool calls from output automatically. When tools are present during streaming, output is buffered to strip tool-call markup before sending to the client.
 `stheno` is currently configured as a plain text model without tool support, so requests to it that include `tools` are rejected with a `400 invalid_request_error`.
 ## Project Structure
 ```
@@ -112,7 +121,7 @@ build.sh        — One-command build script (xcodegen + xcodebuild)
 ## Key Design Decisions
- Uses `mlx-swift-lm` (`MLXVLM` / `VLMModelFactory`) for inference — loads any MLX-format model from HuggingFace
+- Uses `mlx-swift-lm` for inference — `VLMModelFactory` for vision models and `LLMModelFactory` for text-only models
 - **Offline-first**: `LocalModelResolver` checks both the sandboxed app container and `~/.cache/huggingface/hub/` for locally-cached models before downloading
 - **No duplicate storage**: custom `HubApi` with blob cache disabled — models are stored once in the snapshot cache
 - **KV cache reuse** across API requests — reuses `ChatSession` when conversation history prefix matches