From 07b71f90ec2928ca9cb8abbe357dca13084bc63f Mon Sep 17 00:00:00 2001 From: Chili Palmer Date: Wed, 18 Mar 2026 09:15:08 +0100 Subject: [PATCH] feat: start of support for thinking mode, qwen 3.5 9b addition and better idle time handling --- MLXServer.xcodeproj/project.pbxproj | 4 + MLXServer/ContentView.swift | 21 +++- MLXServer/Models/ChatMessage.swift | 51 ++++++++- MLXServer/Models/ModelConfig.swift | 6 + MLXServer/Server/APIServer.swift | 12 +- MLXServer/Utilities/LocalModelResolver.swift | 110 +++++++++++++++---- MLXServer/Utilities/Preferences.swift | 11 ++ MLXServer/ViewModels/ChatViewModel.swift | 9 +- MLXServer/ViewModels/ModelManager.swift | 37 +++++++ MLXServer/Views/ChatMessagesView.swift | 49 ++++++++- MLXServer/Views/DownloadModalView.swift | 68 ++++++++++++ MLXServer/Views/ModelPickerView.swift | 30 +++++ MLXServer/Views/SettingsView.swift | 14 ++- 13 files changed, 389 insertions(+), 33 deletions(-) create mode 100644 MLXServer/Views/DownloadModalView.swift diff --git a/MLXServer.xcodeproj/project.pbxproj b/MLXServer.xcodeproj/project.pbxproj index 2c00ae2..1d673c1 100644 --- a/MLXServer.xcodeproj/project.pbxproj +++ b/MLXServer.xcodeproj/project.pbxproj @@ -26,6 +26,7 @@ B1D9BC407DB7DB1489230C20 /* MonitorView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4239CFF94B819C35A8D4D617 /* MonitorView.swift */; }; B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = B8BD93859F0291F1A3E09DA5 /* ChatViewModel.swift */; }; B6D3662995B885C102876B4A /* MLXLMCommon in Frameworks */ = {isa = PBXBuildFile; productRef = 9090667D4134056AE66DC2F1 /* MLXLMCommon */; }; + C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */; }; D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; }; D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; }; F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; }; @@ -36,6 +37,7 @@ /* Begin PBXFileReference section */ 145B888FBDD4F931512C5473 /* Preferences.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Preferences.swift; sourceTree = ""; }; 16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolPromptBuilder.swift; sourceTree = ""; }; + 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadModalView.swift; sourceTree = ""; }; 38DFC212AF4359A45FBE22BA /* ModelConfig.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelConfig.swift; sourceTree = ""; }; 3AF462805202797F61422AEE /* MLXServer.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = MLXServer.entitlements; sourceTree = ""; }; 3D08828E16B17EF02C14243E /* APIServer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServer.swift; sourceTree = ""; }; @@ -111,6 +113,7 @@ children = ( E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */, DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */, + 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */, C3C3A76C02AF70A9D8F868FC /* ModelPickerView.swift */, 4239CFF94B819C35A8D4D617 /* MonitorView.swift */, 4147321383E94E9F17A0154E /* SettingsView.swift */, @@ -240,6 +243,7 @@ 5C1E8FE1C521914CEF98D3AA /* ChatMessagesView.swift in Sources */, B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */, 5946258F1DE88CE904584E0B /* ContentView.swift in Sources */, + C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */, 2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */, 6828CCA8B78AB40906F87CAB /* LocalModelResolver.swift in Sources */, 50B6861FF8610B3ED4FFAD9D /* MLXServerApp.swift in Sources */, diff --git a/MLXServer/ContentView.swift b/MLXServer/ContentView.swift index 4b1d67a..e9d54e0 100644 --- a/MLXServer/ContentView.swift +++ b/MLXServer/ContentView.swift @@ -56,14 +56,23 @@ struct ContentView: View { @ViewBuilder private var mainContent: some View { - if let chatVM { - if showMonitor { - MonitorView(stats: chatVM.apiServer.inferenceStats) + ZStack { + if let chatVM { + if showMonitor { + MonitorView(stats: chatVM.apiServer.inferenceStats) + } else { + ChatView(viewModel: chatVM) + } } else { - ChatView(viewModel: chatVM) + ProgressView("Initializing…") + } + + // Download modal overlay + if modelManager.isDownloading { + Color.black.opacity(0.3) + .ignoresSafeArea() + DownloadModalView() } - } else { - ProgressView("Initializing…") } } diff --git a/MLXServer/Models/ChatMessage.swift b/MLXServer/Models/ChatMessage.swift index 5f8860d..1ff5b28 100644 --- a/MLXServer/Models/ChatMessage.swift +++ b/MLXServer/Models/ChatMessage.swift @@ -10,6 +10,16 @@ struct ChatMessage: Identifiable { var isStreaming: Bool let timestamp: Date + /// Raw streamed text including tags (only for assistant messages). + /// `content` and `thinkingContent` are derived from this. + var rawContent: String = "" + + /// The thinking/reasoning content extracted from ... tags. + var thinkingContent: String = "" + + /// Whether the model is currently in a thinking block. + var isThinking: Bool = false + enum Role: String { case system case user @@ -19,6 +29,7 @@ struct ChatMessage: Identifiable { init(role: Role, content: String, images: [NSImage] = [], isStreaming: Bool = false) { self.role = role self.content = content + self.rawContent = content self.images = images self.isStreaming = isStreaming self.timestamp = Date() @@ -43,15 +54,53 @@ final class Conversation { } /// Appends a text chunk to the assistant message at the given index. + /// Handles `...` tags by routing content to `thinkingContent` vs `content`. func appendToMessage(at index: Int, chunk: String) { guard index < messages.count else { return } - messages[index].content += chunk + messages[index].rawContent += chunk + + // Parse the full raw content to separate thinking from response. + // This is simpler and more robust than incremental parsing since + // tag boundaries can split across chunks. + let raw = messages[index].rawContent + var thinking = "" + var visible = "" + var isInThink = false + + var scanner = raw[raw.startIndex...] + while !scanner.isEmpty { + if isInThink { + if let endRange = scanner.range(of: "") { + thinking += String(scanner[scanner.startIndex..") { + visible += String(scanner[scanner.startIndex.. URL? { - // Convert "mlx-community/gemma-3-4b-it-4bit" → "models--mlx-community--gemma-3-4b-it-4bit" let dirName = "models--" + repoId.replacingOccurrences(of: "/", with: "--") - let snapshotsDir = cacheBase - .appendingPathComponent(dirName, isDirectory: true) - .appendingPathComponent("snapshots", isDirectory: true) - // Find the first (usually only) snapshot hash directory - guard let contents = try? FileManager.default.contentsOfDirectory( - at: snapshotsDir, - includingPropertiesForKeys: [.isDirectoryKey], - options: [.skipsHiddenFiles] - ) else { - return nil + for cacheBase in cacheBases { + let snapshotsDir = cacheBase + .appendingPathComponent(dirName, isDirectory: true) + .appendingPathComponent("snapshots", isDirectory: true) + + guard let contents = try? FileManager.default.contentsOfDirectory( + at: snapshotsDir, + includingPropertiesForKeys: [.isDirectoryKey], + options: [.skipsHiddenFiles] + ) else { + continue + } + + if let snapshot = contents + .filter({ (try? $0.resourceValues(forKeys: [.isDirectoryKey]).isDirectory) == true }) + .sorted(by: { $0.lastPathComponent < $1.lastPathComponent }) + .last { + return snapshot + } } - // Return the most recent snapshot (last alphabetically = latest hash) - return contents - .filter { (try? $0.resourceValues(forKeys: [.isDirectoryKey]).isDirectory) == true } - .sorted(by: { $0.lastPathComponent < $1.lastPathComponent }) - .last + return nil } /// Check if a model is available locally. static func isAvailable(repoId: String) -> Bool { resolve(repoId: repoId) != nil } + + /// Delete the local cache for a model so it will be re-downloaded next time. + /// Removes from all cache locations. + /// Returns true if something was deleted. + @discardableResult + static func deleteLocal(repoId: String) -> Bool { + let dirName = "models--" + repoId.replacingOccurrences(of: "/", with: "--") + var deleted = false + + for cacheBase in cacheBases { + let modelDir = cacheBase.appendingPathComponent(dirName, isDirectory: true) + guard FileManager.default.fileExists(atPath: modelDir.path) else { continue } + do { + try FileManager.default.removeItem(at: modelDir) + print("[LocalModelResolver] Deleted \(modelDir.path)") + deleted = true + } catch { + print("[LocalModelResolver] Failed to delete \(modelDir.path): \(error)") + } + } + + // Also clean up the per-model cache in the container (used by swift-transformers) + let containerModelsDir = FileManager.default.homeDirectoryForCurrentUser + .appendingPathComponent("Library/Caches/models", isDirectory: true) + .appendingPathComponent(repoId, isDirectory: true) + if FileManager.default.fileExists(atPath: containerModelsDir.path) { + do { + try FileManager.default.removeItem(at: containerModelsDir) + print("[LocalModelResolver] Deleted \(containerModelsDir.path)") + deleted = true + } catch { + print("[LocalModelResolver] Failed to delete \(containerModelsDir.path): \(error)") + } + } + + return deleted + } } diff --git a/MLXServer/Utilities/Preferences.swift b/MLXServer/Utilities/Preferences.swift index 2b4962e..e58f3d0 100644 --- a/MLXServer/Utilities/Preferences.swift +++ b/MLXServer/Utilities/Preferences.swift @@ -49,6 +49,17 @@ enum Preferences { set { defaults.set(newValue, forKey: apiAutoStartKey) } } + // MARK: - Thinking mode + + private static let enableThinkingKey = "enableThinking" + + /// Whether to enable thinking/reasoning mode for models that support it (e.g. Qwen3.5). + /// When disabled, the model skips internal reasoning and responds directly. + static var enableThinking: Bool { + get { defaults.object(forKey: enableThinkingKey) == nil ? true : defaults.bool(forKey: enableThinkingKey) } + set { defaults.set(newValue, forKey: enableThinkingKey) } + } + // MARK: - Idle unload private static let idleUnloadMinutesKey = "idleUnloadMinutes" diff --git a/MLXServer/ViewModels/ChatViewModel.swift b/MLXServer/ViewModels/ChatViewModel.swift index d84b4d7..fe4c8db 100644 --- a/MLXServer/ViewModels/ChatViewModel.swift +++ b/MLXServer/ViewModels/ChatViewModel.swift @@ -31,10 +31,16 @@ final class ChatViewModel { guard let container = modelManager.modelContainer else { return } if chatSession == nil { let systemPrompt = Preferences.systemPrompt + // Pass enable_thinking to the Jinja chat template context. + // Qwen3.5 and similar models use this to control reasoning mode. + let thinkingContext: [String: any Sendable]? = Preferences.enableThinking + ? nil + : ["enable_thinking": false] chatSession = ChatSession( container, instructions: systemPrompt.isEmpty ? nil : systemPrompt, - generateParameters: GenerateParameters(temperature: 0.7) + generateParameters: GenerateParameters(temperature: 0.7), + additionalContext: thinkingContext ) } } @@ -113,6 +119,7 @@ final class ChatViewModel { conversation.finalizeMessage(at: assistantIndex) isGenerating = false generationTask = nil + modelManager.touchActivity() } } diff --git a/MLXServer/ViewModels/ModelManager.swift b/MLXServer/ViewModels/ModelManager.swift index 25c350d..ec83d74 100644 --- a/MLXServer/ViewModels/ModelManager.swift +++ b/MLXServer/ViewModels/ModelManager.swift @@ -1,4 +1,5 @@ import Foundation +import Hub import MLX import MLXLMCommon import MLXVLM @@ -7,6 +8,11 @@ import MLXVLM @Observable @MainActor final class ModelManager { + + /// HubApi with blob cache disabled to avoid storing every model twice. + /// swift-huggingface defaults to caching in both huggingface/hub/ (snapshots) + /// AND models/ (content-addressed blobs). We only need the snapshots. + private static let hub = HubApi(cache: nil) var currentModel: ModelConfig? var modelContainer: ModelContainer? var isLoading = false @@ -14,6 +20,12 @@ final class ModelManager { var loadingModelName: String = "" var errorMessage: String? + // Download-specific state for the modal + var isDownloading = false + var downloadFilesTotal: Int64 = 0 + var downloadFilesCompleted: Int64 = 0 + var downloadSpeed: Double = 0 // bytes/sec + private var idleTimer: Timer? private(set) var lastUsed: Date? @@ -31,11 +43,26 @@ final class ModelManager { loadingModelName = config.displayName errorMessage = nil + let needsDownload = !config.isLocal + if needsDownload { + isDownloading = true + downloadFilesTotal = 0 + downloadFilesCompleted = 0 + downloadSpeed = 0 + } + do { let container: ModelContainer let progressHandler: @Sendable (Progress) -> Void = { progress in Task { @MainActor in self.downloadProgress = progress.fractionCompleted + if self.isDownloading { + self.downloadFilesTotal = progress.totalUnitCount + self.downloadFilesCompleted = progress.completedUnitCount + if let speed = progress.userInfo[.throughputKey] as? Double { + self.downloadSpeed = speed + } + } } } @@ -47,20 +74,30 @@ final class ModelManager { } container = try await VLMModelFactory.shared.loadContainer( + hub: Self.hub, configuration: configuration, progressHandler: progressHandler ) + self.isDownloading = false self.modelContainer = container self.currentModel = config touchActivity() } catch { + self.isDownloading = false self.errorMessage = "Failed to load model: \(error.localizedDescription)" } isLoading = false } + /// Delete local cache and re-download a model. + func redownloadModel(_ config: ModelConfig) async { + unloadModel() + LocalModelResolver.deleteLocal(repoId: config.repoId) + await loadModel(config) + } + /// Unload the current model and free GPU memory. func unloadModel() { idleTimer?.invalidate() diff --git a/MLXServer/Views/ChatMessagesView.swift b/MLXServer/Views/ChatMessagesView.swift index d7476ec..7346197 100644 --- a/MLXServer/Views/ChatMessagesView.swift +++ b/MLXServer/Views/ChatMessagesView.swift @@ -57,6 +57,7 @@ struct ChatMessagesView: View { struct MessageBubbleView: View { let message: ChatMessage + @State private var showThinking = false var body: some View { HStack { @@ -76,11 +77,16 @@ struct MessageBubbleView: View { } } + // Thinking block (collapsible) + if !message.thinkingContent.isEmpty || message.isThinking { + thinkingView + } + // Message content - if !message.content.isEmpty || message.isStreaming { + if !message.content.isEmpty || (message.isStreaming && !message.isThinking) { Group { if message.role == .assistant { - Markdown(message.content + (message.isStreaming ? " ●" : "")) + Markdown(message.content + (message.isStreaming && !message.isThinking ? " ●" : "")) .textSelection(.enabled) } else { Text(message.content) @@ -101,4 +107,43 @@ struct MessageBubbleView: View { if message.role == .assistant { Spacer(minLength: 60) } } } + + private var thinkingView: some View { + VStack(alignment: .leading, spacing: 0) { + Button { + withAnimation(.easeInOut(duration: 0.15)) { + showThinking.toggle() + } + } label: { + HStack(spacing: 4) { + Image(systemName: showThinking ? "chevron.down" : "chevron.right") + .font(.caption2) + if message.isThinking { + ProgressView() + .controlSize(.mini) + Text("Thinking…") + } else { + Image(systemName: "brain") + Text("Thought") + } + } + .font(.caption) + .foregroundStyle(.secondary) + } + .buttonStyle(.plain) + + if showThinking { + Text(message.thinkingContent + (message.isThinking ? " ●" : "")) + .font(.caption) + .foregroundStyle(.tertiary) + .textSelection(.enabled) + .padding(.top, 4) + .padding(.leading, 14) + } + } + .padding(.horizontal, 12) + .padding(.vertical, 6) + .background(Color.purple.opacity(0.06)) + .clipShape(RoundedRectangle(cornerRadius: 8)) + } } diff --git a/MLXServer/Views/DownloadModalView.swift b/MLXServer/Views/DownloadModalView.swift new file mode 100644 index 0000000..7c38dd4 --- /dev/null +++ b/MLXServer/Views/DownloadModalView.swift @@ -0,0 +1,68 @@ +import SwiftUI + +/// Modal overlay shown when a model is being downloaded from HuggingFace. +struct DownloadModalView: View { + @Environment(ModelManager.self) private var modelManager + + var body: some View { + VStack(spacing: 20) { + // Header + Label("Downloading Model", systemImage: "arrow.down.circle") + .font(.headline) + + Text(modelManager.loadingModelName) + .font(.title3.weight(.medium)) + .foregroundStyle(.primary) + + // Progress bar + VStack(spacing: 8) { + ProgressView(value: modelManager.downloadProgress) + .progressViewStyle(.linear) + + HStack { + // Files progress + if modelManager.downloadFilesTotal > 0 { + Text("File \(modelManager.downloadFilesCompleted)/\(modelManager.downloadFilesTotal)") + .font(.caption.monospacedDigit()) + .foregroundStyle(.secondary) + } + + Spacer() + + // Percentage + Text("\(Int(modelManager.downloadProgress * 100))%") + .font(.caption.monospacedDigit()) + .foregroundStyle(.secondary) + } + + // Speed + if modelManager.downloadSpeed > 0 { + Text(formatSpeed(modelManager.downloadSpeed)) + .font(.caption.monospacedDigit()) + .foregroundStyle(.tertiary) + } + } + + Text("The model will be cached locally for future use.") + .font(.caption) + .foregroundStyle(.tertiary) + .multilineTextAlignment(.center) + } + .padding(32) + .frame(width: 380) + .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 16)) + .shadow(radius: 20) + } + + private func formatSpeed(_ bytesPerSec: Double) -> String { + if bytesPerSec >= 1_073_741_824 { + return String(format: "%.1f GB/s", bytesPerSec / 1_073_741_824) + } else if bytesPerSec >= 1_048_576 { + return String(format: "%.1f MB/s", bytesPerSec / 1_048_576) + } else if bytesPerSec >= 1024 { + return String(format: "%.0f KB/s", bytesPerSec / 1024) + } else { + return String(format: "%.0f B/s", bytesPerSec) + } + } +} diff --git a/MLXServer/Views/ModelPickerView.swift b/MLXServer/Views/ModelPickerView.swift index 0c01fe0..bcc8ea4 100644 --- a/MLXServer/Views/ModelPickerView.swift +++ b/MLXServer/Views/ModelPickerView.swift @@ -2,6 +2,7 @@ import SwiftUI struct ModelPickerView: View { @Environment(ModelManager.self) private var modelManager + @State private var confirmRedownload: ModelConfig? var body: some View { HStack(spacing: 8) { @@ -15,6 +16,35 @@ struct ModelPickerView: View { } .frame(width: 160) .disabled(modelManager.isLoading) + + // Re-download button (visible when a model is loaded) + if let current = modelManager.currentModel, !modelManager.isLoading { + Button { + confirmRedownload = current + } label: { + Image(systemName: "arrow.clockwise") + .font(.caption) + } + .buttonStyle(.borderless) + .help("Re-download \(current.displayName)") + } + } + .alert("Re-download Model?", isPresented: .init( + get: { confirmRedownload != nil }, + set: { if !$0 { confirmRedownload = nil } } + )) { + Button("Re-download", role: .destructive) { + if let config = confirmRedownload { + Task { await modelManager.redownloadModel(config) } + } + } + Button("Cancel", role: .cancel) { + confirmRedownload = nil + } + } message: { + if let config = confirmRedownload { + Text("This will delete the local cache for \(config.displayName) and download it again from HuggingFace.") + } } } diff --git a/MLXServer/Views/SettingsView.swift b/MLXServer/Views/SettingsView.swift index d5e5344..2c31acf 100644 --- a/MLXServer/Views/SettingsView.swift +++ b/MLXServer/Views/SettingsView.swift @@ -6,6 +6,7 @@ struct SettingsView: View { @State private var apiAutoStart: Bool = Preferences.apiAutoStart @State private var idleUnloadMinutes: String = String(Preferences.idleUnloadMinutes) @State private var defaultModelId: String = Preferences.defaultModelId ?? ModelConfig.default.id + @State private var enableThinking: Bool = Preferences.enableThinking var body: some View { Form { @@ -24,6 +25,17 @@ struct SettingsView: View { .foregroundStyle(.secondary) } + Section("Generation") { + Toggle("Enable thinking mode", isOn: $enableThinking) + .onChange(of: enableThinking) { + Preferences.enableThinking = enableThinking + } + + Text("When enabled, models like Qwen3.5 reason internally before responding. Produces better answers but slower. Takes effect on the next conversation.") + .font(.caption) + .foregroundStyle(.secondary) + } + Section("System Prompt") { TextEditor(text: $systemPrompt) .font(.body.monospaced()) @@ -75,6 +87,6 @@ struct SettingsView: View { } } .formStyle(.grouped) - .frame(width: 450, height: 460) + .frame(width: 450, height: 550) } }