feat: initial thinking-mode support, add Qwen3.5 9B model, and improve idle-time handling

2026-03-18 09:15:08 +01:00
parent ed6cc5f5d1
commit 07b71f90ec
13 changed files with 389 additions and 33 deletions
--- a/MLXServer.xcodeproj/project.pbxproj
+++ b/MLXServer.xcodeproj/project.pbxproj
@@ -26,6 +26,7 @@
 		B1D9BC407DB7DB1489230C20 /* MonitorView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4239CFF94B819C35A8D4D617 /* MonitorView.swift */; };
 		B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = B8BD93859F0291F1A3E09DA5 /* ChatViewModel.swift */; };
 		B6D3662995B885C102876B4A /* MLXLMCommon in Frameworks */ = {isa = PBXBuildFile; productRef = 9090667D4134056AE66DC2F1 /* MLXLMCommon */; };
+		C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */; };
 		D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
 		D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
 		F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
@@ -36,6 +37,7 @@
 /* Begin PBXFileReference section */
 		145B888FBDD4F931512C5473 /* Preferences.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Preferences.swift; sourceTree = "<group>"; };
 		16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolPromptBuilder.swift; sourceTree = "<group>"; };
+		2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadModalView.swift; sourceTree = "<group>"; };
 		38DFC212AF4359A45FBE22BA /* ModelConfig.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelConfig.swift; sourceTree = "<group>"; };
 		3AF462805202797F61422AEE /* MLXServer.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = MLXServer.entitlements; sourceTree = "<group>"; };
 		3D08828E16B17EF02C14243E /* APIServer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServer.swift; sourceTree = "<group>"; };
@@ -111,6 +113,7 @@
 			children = (
 				E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */,
 				DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */,
+				2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */,
 				C3C3A76C02AF70A9D8F868FC /* ModelPickerView.swift */,
 				4239CFF94B819C35A8D4D617 /* MonitorView.swift */,
 				4147321383E94E9F17A0154E /* SettingsView.swift */,
@@ -240,6 +243,7 @@
 				5C1E8FE1C521914CEF98D3AA /* ChatMessagesView.swift in Sources */,
 				B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
 				5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
+				C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
 				2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */,
 				6828CCA8B78AB40906F87CAB /* LocalModelResolver.swift in Sources */,
 				50B6861FF8610B3ED4FFAD9D /* MLXServerApp.swift in Sources */,
--- a/MLXServer/ContentView.swift
+++ b/MLXServer/ContentView.swift
@@ -56,14 +56,23 @@ struct ContentView: View {

    @ViewBuilder
    private var mainContent: some View {
-        if let chatVM {
-            if showMonitor {
-                MonitorView(stats: chatVM.apiServer.inferenceStats)
+        ZStack {
+            if let chatVM {
+                if showMonitor {
+                    MonitorView(stats: chatVM.apiServer.inferenceStats)
+                } else {
+                    ChatView(viewModel: chatVM)
+                }
            } else {
-                ChatView(viewModel: chatVM)
+                ProgressView("Initializing…")
+            }
+
+            // Download modal overlay
+            if modelManager.isDownloading {
+                Color.black.opacity(0.3)
+                    .ignoresSafeArea()
+                DownloadModalView()
            }
-        } else {
-            ProgressView("Initializing…")
        }
    }

--- a/MLXServer/Models/ChatMessage.swift
+++ b/MLXServer/Models/ChatMessage.swift
@@ -10,6 +10,16 @@ struct ChatMessage: Identifiable {
    var isStreaming: Bool
    let timestamp: Date

+    /// Raw text including any <think> tags; seeded from `content` in init and extended while streaming.
+    /// `content` and `thinkingContent` are derived from this.
+    var rawContent: String = ""
+
+    /// The thinking/reasoning content extracted from <think>...</think> tags.
+    var thinkingContent: String = ""
+
+    /// Whether the model is currently in a thinking block.
+    var isThinking: Bool = false
+
    enum Role: String {
        case system
        case user
@@ -19,6 +29,7 @@ struct ChatMessage: Identifiable {
    init(role: Role, content: String, images: [NSImage] = [], isStreaming: Bool = false) {
        self.role = role
        self.content = content
+        self.rawContent = content
        self.images = images
        self.isStreaming = isStreaming
        self.timestamp = Date()
@@ -43,15 +54,53 @@ final class Conversation {
    }

    /// Appends a text chunk to the assistant message at the given index.
+    /// Handles `<think>...</think>` tags by routing content to `thinkingContent` vs `content`.
    func appendToMessage(at index: Int, chunk: String) {
        guard index < messages.count else { return }
-        messages[index].content += chunk
+        messages[index].rawContent += chunk
+
+        // Parse the full raw content to separate thinking from response.
+        // This is simpler and more robust than incremental parsing since
+        // tag boundaries can split across chunks.
+        let raw = messages[index].rawContent
+        var thinking = ""
+        var visible = ""
+        var isInThink = false
+
+        var scanner = raw[raw.startIndex...]
+        while !scanner.isEmpty {
+            if isInThink {
+                if let endRange = scanner.range(of: "</think>") {
+                    thinking += String(scanner[scanner.startIndex..<endRange.lowerBound])
+                    scanner = scanner[endRange.upperBound...]
+                    isInThink = false
+                } else {
+                    // Still inside thinking — all remaining text is thinking
+                    thinking += String(scanner)
+                    break
+                }
+            } else {
+                if let startRange = scanner.range(of: "<think>") {
+                    visible += String(scanner[scanner.startIndex..<startRange.lowerBound])
+                    scanner = scanner[startRange.upperBound...]
+                    isInThink = true
+                } else {
+                    visible += String(scanner)
+                    break
+                }
+            }
+        }
+
+        messages[index].thinkingContent = thinking.trimmingCharacters(in: .whitespacesAndNewlines)
+        messages[index].content = visible.trimmingCharacters(in: .whitespacesAndNewlines)
+        messages[index].isThinking = isInThink
    }

    /// Marks the assistant message at the given index as done streaming.
    func finalizeMessage(at index: Int) {
        guard index < messages.count else { return }
        messages[index].isStreaming = false
+        messages[index].isThinking = false
    }

    func clear() {
--- a/MLXServer/Models/ModelConfig.swift
+++ b/MLXServer/Models/ModelConfig.swift
@@ -22,6 +22,12 @@ struct ModelConfig: Identifiable, Hashable {
            displayName: "Qwen3 VL 4B",
            contextLength: 256_000
        ),
+        ModelConfig(
+            id: "qwen3.5-9b",
+            repoId: "mlx-community/Qwen3.5-9B-4bit",
+            displayName: "Qwen3.5 9B",
+            contextLength: 256_000
+        ),
    ]

    static let `default` = availableModels[0]
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -347,18 +347,23 @@ final class APIServer {
            // Only conversation turns go in `history:` — this avoids replaying the
            // large tool prompt as history on every new session.
            let instr = instructions.isEmpty ? nil : instructions
+            let thinkingContext: [String: any Sendable]? = Preferences.enableThinking
+                ? nil
+                : ["enable_thinking": false]
            if !allButLast.isEmpty {
                session = ChatSession(
                    container,
                    instructions: instr,
                    history: allButLast,
-                    generateParameters: generateParams
+                    generateParameters: generateParams,
+                    additionalContext: thinkingContext
                )
            } else {
                session = ChatSession(
                    container,
                    instructions: instr,
-                    generateParameters: generateParams
+                    generateParameters: generateParams,
+                    additionalContext: thinkingContext
                )
            }
        }
@@ -464,6 +469,7 @@ final class APIServer {
            }

            LiveCounters.shared.requestCompleted(generationTokens: completionTokens)
+            modelManager?.touchActivity()

            // Parse tool calls: first check framework-detected ones, then our own text parser
            var finishReason = "stop"
@@ -536,6 +542,7 @@ final class APIServer {
            }
        } catch {
            LiveCounters.shared.requestCompleted(generationTokens: 0)
+            modelManager?.touchActivity()
            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
        }
    }
@@ -671,6 +678,7 @@ final class APIServer {
        ))

        LiveCounters.shared.requestCompleted(generationTokens: completionTokens)
+        modelManager?.touchActivity()

        // Send [DONE] and close
        await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
--- a/MLXServer/Utilities/LocalModelResolver.swift
+++ b/MLXServer/Utilities/LocalModelResolver.swift
@@ -3,14 +3,43 @@ import Foundation
 /// Resolves HuggingFace model repos to local snapshot directories,
 /// matching the cache layout used by Python's `huggingface_hub`.
 ///
+/// Checks two locations:
+///   1. App sandbox container: ~/Library/Containers/com.mlxserver.app/.../huggingface/hub/
+///   2. System-wide cache: ~/.cache/huggingface/hub/ (shared with Python tools)
+///
 /// Cache structure:
-///   ~/.cache/huggingface/hub/models--{org}--{name}/snapshots/{hash}/
+///   .../huggingface/hub/models--{org}--{name}/snapshots/{hash}/
 enum LocalModelResolver {

-    /// The standard HuggingFace cache directory used by Python's `huggingface_hub`.
-    private static let cacheBase: URL = {
-        FileManager.default.homeDirectoryForCurrentUser
+    /// All HuggingFace cache directories to search, in priority order.
+    /// The sandboxed container path is checked first (where the app downloads to),
+    /// then the system-wide Python cache (for models downloaded via huggingface-cli).
+    private static let cacheBases: [URL] = {
+        var bases: [URL] = []
+
+        // 1. Sandboxed app container cache (where swift-transformers Hub downloads to)
+        let containerCache = FileManager.default.homeDirectoryForCurrentUser
+            .appendingPathComponent("Library/Caches/huggingface/hub", isDirectory: true)
+        bases.append(containerCache)
+
+        // 2. System-wide ~/.cache/huggingface/hub/ (Python huggingface_hub)
+        //    NOTE(review): NSHomeDirectory() also returns the container when sandboxed, so this may not be the real home — confirm.
+        let realHome = URL(fileURLWithPath: NSHomeDirectory())
+        let systemCache = realHome
            .appendingPathComponent(".cache/huggingface/hub", isDirectory: true)
+        // Avoid duplicate if they resolve to the same path
+        if systemCache.path != containerCache.path {
+            bases.append(systemCache)
+        }
+
+        // 3. ~/.cache layout under homeDirectoryForCurrentUser (same base as 1); NOTE(review): likely equals 2 — confirm
+        let globalHome = FileManager.default.homeDirectoryForCurrentUser
+            .appendingPathComponent(".cache/huggingface/hub", isDirectory: true)
+        if globalHome.path != containerCache.path && globalHome.path != systemCache.path {
+            bases.append(globalHome)
+        }
+
+        return bases
    }()

    /// Resolve a HuggingFace repo ID (e.g. "mlx-community/gemma-3-4b-it-4bit")
@@ -18,30 +47,71 @@ enum LocalModelResolver {
    ///
    /// Returns `nil` if the model hasn't been downloaded yet.
    static func resolve(repoId: String) -> URL? {
-        // Convert "mlx-community/gemma-3-4b-it-4bit" → "models--mlx-community--gemma-3-4b-it-4bit"
        let dirName = "models--" + repoId.replacingOccurrences(of: "/", with: "--")
-        let snapshotsDir = cacheBase
-            .appendingPathComponent(dirName, isDirectory: true)
-            .appendingPathComponent("snapshots", isDirectory: true)

-        // Find the first (usually only) snapshot hash directory
-        guard let contents = try? FileManager.default.contentsOfDirectory(
-            at: snapshotsDir,
-            includingPropertiesForKeys: [.isDirectoryKey],
-            options: [.skipsHiddenFiles]
-        ) else {
-            return nil
+        for cacheBase in cacheBases {
+            let snapshotsDir = cacheBase
+                .appendingPathComponent(dirName, isDirectory: true)
+                .appendingPathComponent("snapshots", isDirectory: true)
+
+            guard let contents = try? FileManager.default.contentsOfDirectory(
+                at: snapshotsDir,
+                includingPropertiesForKeys: [.isDirectoryKey],
+                options: [.skipsHiddenFiles]
+            ) else {
+                continue
+            }
+
+            if let snapshot = contents
+                .filter({ (try? $0.resourceValues(forKeys: [.isDirectoryKey]).isDirectory) == true })
+                .sorted(by: { $0.lastPathComponent < $1.lastPathComponent })
+                .last {
+                return snapshot
+            }
        }

-        // Return the most recent snapshot (last alphabetically = latest hash)
-        return contents
-            .filter { (try? $0.resourceValues(forKeys: [.isDirectoryKey]).isDirectory) == true }
-            .sorted(by: { $0.lastPathComponent < $1.lastPathComponent })
-            .last
+        return nil
    }

    /// Check if a model is available locally.
    static func isAvailable(repoId: String) -> Bool {
        resolve(repoId: repoId) != nil
    }
+
+    /// Delete the local cache for a model so it will be re-downloaded next time.
+    /// Removes from all cache locations.
+    /// Returns true if something was deleted.
+    @discardableResult
+    static func deleteLocal(repoId: String) -> Bool {
+        let dirName = "models--" + repoId.replacingOccurrences(of: "/", with: "--")
+        var deleted = false
+
+        for cacheBase in cacheBases {
+            let modelDir = cacheBase.appendingPathComponent(dirName, isDirectory: true)
+            guard FileManager.default.fileExists(atPath: modelDir.path) else { continue }
+            do {
+                try FileManager.default.removeItem(at: modelDir)
+                print("[LocalModelResolver] Deleted \(modelDir.path)")
+                deleted = true
+            } catch {
+                print("[LocalModelResolver] Failed to delete \(modelDir.path): \(error)")
+            }
+        }
+
+        // Also clean up the per-model cache in the container (used by swift-transformers)
+        let containerModelsDir = FileManager.default.homeDirectoryForCurrentUser
+            .appendingPathComponent("Library/Caches/models", isDirectory: true)
+            .appendingPathComponent(repoId, isDirectory: true)
+        if FileManager.default.fileExists(atPath: containerModelsDir.path) {
+            do {
+                try FileManager.default.removeItem(at: containerModelsDir)
+                print("[LocalModelResolver] Deleted \(containerModelsDir.path)")
+                deleted = true
+            } catch {
+                print("[LocalModelResolver] Failed to delete \(containerModelsDir.path): \(error)")
+            }
+        }
+
+        return deleted
+    }
 }
--- a/MLXServer/Utilities/Preferences.swift
+++ b/MLXServer/Utilities/Preferences.swift
@@ -49,6 +49,17 @@ enum Preferences {
        set { defaults.set(newValue, forKey: apiAutoStartKey) }
    }

+    // MARK: - Thinking mode
+
+    private static let enableThinkingKey = "enableThinking"
+
+    /// Whether to enable thinking/reasoning mode for models that support it (e.g. Qwen3.5).
+    /// When disabled, the model skips internal reasoning and responds directly.
+    static var enableThinking: Bool {
+        get { defaults.object(forKey: enableThinkingKey) == nil ? true : defaults.bool(forKey: enableThinkingKey) }
+        set { defaults.set(newValue, forKey: enableThinkingKey) }
+    }
+
    // MARK: - Idle unload

    private static let idleUnloadMinutesKey = "idleUnloadMinutes"
--- a/MLXServer/ViewModels/ChatViewModel.swift
+++ b/MLXServer/ViewModels/ChatViewModel.swift
@@ -31,10 +31,16 @@ final class ChatViewModel {
        guard let container = modelManager.modelContainer else { return }
        if chatSession == nil {
            let systemPrompt = Preferences.systemPrompt
+            // Pass enable_thinking to the Jinja chat template context.
+            // Qwen3.5 and similar models use this to control reasoning mode.
+            let thinkingContext: [String: any Sendable]? = Preferences.enableThinking
+                ? nil
+                : ["enable_thinking": false]
            chatSession = ChatSession(
                container,
                instructions: systemPrompt.isEmpty ? nil : systemPrompt,
-                generateParameters: GenerateParameters(temperature: 0.7)
+                generateParameters: GenerateParameters(temperature: 0.7),
+                additionalContext: thinkingContext
            )
        }
    }
@@ -113,6 +119,7 @@ final class ChatViewModel {
            conversation.finalizeMessage(at: assistantIndex)
            isGenerating = false
            generationTask = nil
+            modelManager.touchActivity()
        }
    }

--- a/MLXServer/ViewModels/ModelManager.swift
+++ b/MLXServer/ViewModels/ModelManager.swift
@@ -1,4 +1,5 @@
 import Foundation
+import Hub
 import MLX
 import MLXLMCommon
 import MLXVLM
@@ -7,6 +8,11 @@ import MLXVLM
@Observable
@MainActor
 final class ModelManager {
+
+    /// HubApi with blob cache disabled to avoid storing every model twice.
+    /// swift-huggingface defaults to caching in both huggingface/hub/ (snapshots)
+    /// AND models/ (content-addressed blobs). We only need the snapshots.
+    private static let hub = HubApi(cache: nil)
    var currentModel: ModelConfig?
    var modelContainer: ModelContainer?
    var isLoading = false
@@ -14,6 +20,12 @@ final class ModelManager {
    var loadingModelName: String = ""
    var errorMessage: String?

+    // Download-specific state for the modal
+    var isDownloading = false
+    var downloadFilesTotal: Int64 = 0
+    var downloadFilesCompleted: Int64 = 0
+    var downloadSpeed: Double = 0 // bytes/sec
+
    private var idleTimer: Timer?
    private(set) var lastUsed: Date?

@@ -31,11 +43,26 @@ final class ModelManager {
        loadingModelName = config.displayName
        errorMessage = nil

+        let needsDownload = !config.isLocal
+        if needsDownload {
+            isDownloading = true
+            downloadFilesTotal = 0
+            downloadFilesCompleted = 0
+            downloadSpeed = 0
+        }
+
        do {
            let container: ModelContainer
            let progressHandler: @Sendable (Progress) -> Void = { progress in
                Task { @MainActor in
                    self.downloadProgress = progress.fractionCompleted
+                    if self.isDownloading {
+                        self.downloadFilesTotal = progress.totalUnitCount
+                        self.downloadFilesCompleted = progress.completedUnitCount
+                        if let speed = progress.userInfo[.throughputKey] as? Double {
+                            self.downloadSpeed = speed
+                        }
+                    }
                }
            }

@@ -47,20 +74,30 @@ final class ModelManager {
            }

            container = try await VLMModelFactory.shared.loadContainer(
+                hub: Self.hub,
                configuration: configuration,
                progressHandler: progressHandler
            )

+            self.isDownloading = false
            self.modelContainer = container
            self.currentModel = config
            touchActivity()
        } catch {
+            self.isDownloading = false
            self.errorMessage = "Failed to load model: \(error.localizedDescription)"
        }

        isLoading = false
    }

+    /// Delete local cache and re-download a model.
+    func redownloadModel(_ config: ModelConfig) async {
+        unloadModel()
+        LocalModelResolver.deleteLocal(repoId: config.repoId)
+        await loadModel(config)
+    }
+
    /// Unload the current model and free GPU memory.
    func unloadModel() {
        idleTimer?.invalidate()
--- a/MLXServer/Views/ChatMessagesView.swift
+++ b/MLXServer/Views/ChatMessagesView.swift
@@ -57,6 +57,7 @@ struct ChatMessagesView: View {

 struct MessageBubbleView: View {
    let message: ChatMessage
+    @State private var showThinking = false

    var body: some View {
        HStack {
@@ -76,11 +77,16 @@ struct MessageBubbleView: View {
                    }
                }

+                // Thinking block (collapsible)
+                if !message.thinkingContent.isEmpty || message.isThinking {
+                    thinkingView
+                }
+
                // Message content
-                if !message.content.isEmpty || message.isStreaming {
+                if !message.content.isEmpty || (message.isStreaming && !message.isThinking) {
                    Group {
                        if message.role == .assistant {
-                            Markdown(message.content + (message.isStreaming ? " ●" : ""))
+                            Markdown(message.content + (message.isStreaming && !message.isThinking ? " ●" : ""))
                                .textSelection(.enabled)
                        } else {
                            Text(message.content)
@@ -101,4 +107,43 @@ struct MessageBubbleView: View {
            if message.role == .assistant { Spacer(minLength: 60) }
        }
    }
+
+    private var thinkingView: some View {
+        VStack(alignment: .leading, spacing: 0) {
+            Button {
+                withAnimation(.easeInOut(duration: 0.15)) {
+                    showThinking.toggle()
+                }
+            } label: {
+                HStack(spacing: 4) {
+                    Image(systemName: showThinking ? "chevron.down" : "chevron.right")
+                        .font(.caption2)
+                    if message.isThinking {
+                        ProgressView()
+                            .controlSize(.mini)
+                        Text("Thinking…")
+                    } else {
+                        Image(systemName: "brain")
+                        Text("Thought")
+                    }
+                }
+                .font(.caption)
+                .foregroundStyle(.secondary)
+            }
+            .buttonStyle(.plain)
+
+            if showThinking {
+                Text(message.thinkingContent + (message.isThinking ? " ●" : ""))
+                    .font(.caption)
+                    .foregroundStyle(.tertiary)
+                    .textSelection(.enabled)
+                    .padding(.top, 4)
+                    .padding(.leading, 14)
+            }
+        }
+        .padding(.horizontal, 12)
+        .padding(.vertical, 6)
+        .background(Color.purple.opacity(0.06))
+        .clipShape(RoundedRectangle(cornerRadius: 8))
+    }
 }
--- a/MLXServer/Views/DownloadModalView.swift
+++ b/MLXServer/Views/DownloadModalView.swift
@@ -0,0 +1,68 @@
+import SwiftUI
+
+/// Modal overlay shown when a model is being downloaded from HuggingFace.
+struct DownloadModalView: View {
+    @Environment(ModelManager.self) private var modelManager
+
+    var body: some View {
+        VStack(spacing: 20) {
+            // Header
+            Label("Downloading Model", systemImage: "arrow.down.circle")
+                .font(.headline)
+
+            Text(modelManager.loadingModelName)
+                .font(.title3.weight(.medium))
+                .foregroundStyle(.primary)
+
+            // Progress bar
+            VStack(spacing: 8) {
+                ProgressView(value: modelManager.downloadProgress)
+                    .progressViewStyle(.linear)
+
+                HStack {
+                    // Files progress
+                    if modelManager.downloadFilesTotal > 0 {
+                        Text("File \(modelManager.downloadFilesCompleted)/\(modelManager.downloadFilesTotal)")
+                            .font(.caption.monospacedDigit())
+                            .foregroundStyle(.secondary)
+                    }
+
+                    Spacer()
+
+                    // Percentage
+                    Text("\(Int(modelManager.downloadProgress * 100))%")
+                        .font(.caption.monospacedDigit())
+                        .foregroundStyle(.secondary)
+                }
+
+                // Speed
+                if modelManager.downloadSpeed > 0 {
+                    Text(formatSpeed(modelManager.downloadSpeed))
+                        .font(.caption.monospacedDigit())
+                        .foregroundStyle(.tertiary)
+                }
+            }
+
+            Text("The model will be cached locally for future use.")
+                .font(.caption)
+                .foregroundStyle(.tertiary)
+                .multilineTextAlignment(.center)
+        }
+        .padding(32)
+        .frame(width: 380)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 16))
+        .shadow(radius: 20)
+    }
+
+    private func formatSpeed(_ bytesPerSec: Double) -> String {
+        if bytesPerSec >= 1_073_741_824 {
+            return String(format: "%.1f GB/s", bytesPerSec / 1_073_741_824)
+        } else if bytesPerSec >= 1_048_576 {
+            return String(format: "%.1f MB/s", bytesPerSec / 1_048_576)
+        } else if bytesPerSec >= 1024 {
+            return String(format: "%.0f KB/s", bytesPerSec / 1024)
+        } else {
+            return String(format: "%.0f B/s", bytesPerSec)
+        }
+    }
+}
--- a/MLXServer/Views/ModelPickerView.swift
+++ b/MLXServer/Views/ModelPickerView.swift
@@ -2,6 +2,7 @@ import SwiftUI

 struct ModelPickerView: View {
    @Environment(ModelManager.self) private var modelManager
+    @State private var confirmRedownload: ModelConfig?

    var body: some View {
        HStack(spacing: 8) {
@@ -15,6 +16,35 @@ struct ModelPickerView: View {
            }
            .frame(width: 160)
            .disabled(modelManager.isLoading)
+
+            // Re-download button (visible when a model is loaded and no load is in progress)
+            if let current = modelManager.currentModel, !modelManager.isLoading {
+                Button {
+                    confirmRedownload = current
+                } label: {
+                    Image(systemName: "arrow.clockwise")
+                        .font(.caption)
+                }
+                .buttonStyle(.borderless)
+                .help("Re-download \(current.displayName)")
+            }
+        }
+        .alert("Re-download Model?", isPresented: .init(
+            get: { confirmRedownload != nil },
+            set: { if !$0 { confirmRedownload = nil } }
+        )) {
+            Button("Re-download", role: .destructive) {
+                if let config = confirmRedownload {
+                    Task { await modelManager.redownloadModel(config) }
+                }
+            }
+            Button("Cancel", role: .cancel) {
+                confirmRedownload = nil
+            }
+        } message: {
+            if let config = confirmRedownload {
+                Text("This will delete the local cache for \(config.displayName) and download it again from HuggingFace.")
+            }
        }
    }

--- a/MLXServer/Views/SettingsView.swift
+++ b/MLXServer/Views/SettingsView.swift
@@ -6,6 +6,7 @@ struct SettingsView: View {
    @State private var apiAutoStart: Bool = Preferences.apiAutoStart
    @State private var idleUnloadMinutes: String = String(Preferences.idleUnloadMinutes)
    @State private var defaultModelId: String = Preferences.defaultModelId ?? ModelConfig.default.id
+    @State private var enableThinking: Bool = Preferences.enableThinking

    var body: some View {
        Form {
@@ -24,6 +25,17 @@ struct SettingsView: View {
                    .foregroundStyle(.secondary)
            }

+            Section("Generation") {
+                Toggle("Enable thinking mode", isOn: $enableThinking)
+                    .onChange(of: enableThinking) {
+                        Preferences.enableThinking = enableThinking
+                    }
+
+                Text("When enabled, models like Qwen3.5 reason internally before responding. Produces better answers but slower. Takes effect on the next conversation.")
+                    .font(.caption)
+                    .foregroundStyle(.secondary)
+            }
+
            Section("System Prompt") {
                TextEditor(text: $systemPrompt)
                    .font(.body.monospaced())
@@ -75,6 +87,6 @@ struct SettingsView: View {
            }
        }
        .formStyle(.grouped)
-        .frame(width: 450, height: 460)
+        .frame(width: 450, height: 550)
    }
 }