From aa2712555ad8ed2480a87e964850ba4859cc2819 Mon Sep 17 00:00:00 2001
From: Chili Palmer
Date: Tue, 17 Mar 2026 20:01:44 +0100
Subject: [PATCH] feat: idle-unload of models

Unload the loaded model after a configurable idle period (default:
3 minutes) to free memory. Loading a model, sending a chat message and
serving an API request each reset the idle timer. When an API request
arrives while no model is loaded, the last-used model is reloaded first.
---
Review notes (not part of the commit message):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; context-line whitespace is a best guess, so use
  `git apply --ignore-whitespace` if a hunk fails to match.
- touchActivity() now computes the interval as `TimeInterval(minutes) * 60`.
  The previous `TimeInterval(minutes * 60)` multiplied in Int and trapped on
  overflow for very large values typed into the settings field.
- The post-image blob hashes on the `index` lines of APIServer.swift and
  ModelManager.swift predate the edits made in this revision.

 MLXServer/Server/APIServer.swift         | 12 ++++++++++++
 MLXServer/Utilities/Preferences.swift    | 12 ++++++++++++
 MLXServer/ViewModels/ChatViewModel.swift |  1 +
 MLXServer/ViewModels/ModelManager.swift  | 23 +++++++++++++++++++++++
 MLXServer/Views/SettingsView.swift       | 22 +++++++++++++++++++++-
 5 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/MLXServer/Server/APIServer.swift b/MLXServer/Server/APIServer.swift
index 17e7d07..2a36415 100644
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -189,11 +189,23 @@ final class APIServer {
             // If we can't resolve the model, continue with whatever is loaded
         }
 
+        // No model loaded (e.g. after an idle unload): reload the last-used model
+        if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
+           let config = ModelConfig.resolve(lastModelId) {
+            print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
+            cachedSession = nil
+            cachedMessages = nil
+            cachedModelId = nil
+            await modelManager.loadModel(config)
+        }
+
         guard modelManager.isReady, let container = modelManager.modelContainer else {
             sendResponse(connection: connection, status: 503, body: #"{"error":"No model loaded"}"#)
             return
         }
 
+        modelManager.touchActivity()
+
         let isStream = request.stream ?? false
         let temperature = request.temperature ?? 0.7
         let topP = request.top_p ?? 1.0
diff --git a/MLXServer/Utilities/Preferences.swift b/MLXServer/Utilities/Preferences.swift
index 710410d..34cf519 100644
--- a/MLXServer/Utilities/Preferences.swift
+++ b/MLXServer/Utilities/Preferences.swift
@@ -39,4 +39,16 @@ enum Preferences {
         get { defaults.bool(forKey: apiAutoStartKey) }
         set { defaults.set(newValue, forKey: apiAutoStartKey) }
     }
+
+    // MARK: - Idle unload
+
+    private static let idleUnloadMinutesKey = "idleUnloadMinutes"
+
+    static var idleUnloadMinutes: Int {
+        get {
+            let val = defaults.integer(forKey: idleUnloadMinutesKey)
+            return val > 0 ? val : 3
+        }
+        set { defaults.set(newValue, forKey: idleUnloadMinutesKey) }
+    }
 }
diff --git a/MLXServer/ViewModels/ChatViewModel.swift b/MLXServer/ViewModels/ChatViewModel.swift
index d874c23..d84b4d7 100644
--- a/MLXServer/ViewModels/ChatViewModel.swift
+++ b/MLXServer/ViewModels/ChatViewModel.swift
@@ -43,6 +43,7 @@ final class ChatViewModel {
         let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
         guard !text.isEmpty, modelManager.isReady else { return }
 
+        modelManager.touchActivity()
         ensureSession()
         guard let session = chatSession else { return }
 
diff --git a/MLXServer/ViewModels/ModelManager.swift b/MLXServer/ViewModels/ModelManager.swift
index 129d527..25c350d 100644
--- a/MLXServer/ViewModels/ModelManager.swift
+++ b/MLXServer/ViewModels/ModelManager.swift
@@ -14,6 +14,9 @@ final class ModelManager {
     var loadingModelName: String = ""
     var errorMessage: String?
 
+    private var idleTimer: Timer?
+    private(set) var lastUsed: Date?
+
     /// Load a model, unloading the current one first.
     /// Prefers the local snapshot from ~/.cache/huggingface/hub/ (shared with the Python server).
     /// Only downloads if the model isn't cached locally.
@@ -50,6 +53,7 @@ final class ModelManager {
 
             self.modelContainer = container
             self.currentModel = config
+            touchActivity()
         } catch {
             self.errorMessage = "Failed to load model: \(error.localizedDescription)"
         }
@@ -59,11 +63,30 @@ final class ModelManager {
 
     /// Unload the current model and free GPU memory.
     func unloadModel() {
+        idleTimer?.invalidate()
+        idleTimer = nil
+        lastUsed = nil
         modelContainer = nil
         currentModel = nil
         MLX.GPU.clearCache()
     }
 
+    /// Record model activity and reset the idle unload timer.
+    func touchActivity() {
+        lastUsed = Date()
+        idleTimer?.invalidate()
+        let minutes = Preferences.idleUnloadMinutes
+        guard minutes > 0 else { return }
+        // Multiply as Double: `minutes * 60` in Int traps on overflow for very large stored values.
+        idleTimer = Timer.scheduledTimer(withTimeInterval: TimeInterval(minutes) * 60, repeats: false) { [weak self] _ in
+            Task { @MainActor [weak self] in
+                guard let self, self.modelContainer != nil else { return }
+                print("[ModelManager] Idle for \(minutes) min — unloading model")
+                self.unloadModel()
+            }
+        }
+    }
+
     /// Whether a model is ready for generation.
     var isReady: Bool {
         modelContainer != nil && !isLoading
diff --git a/MLXServer/Views/SettingsView.swift b/MLXServer/Views/SettingsView.swift
index 79ae560..05b4b39 100644
--- a/MLXServer/Views/SettingsView.swift
+++ b/MLXServer/Views/SettingsView.swift
@@ -4,6 +4,7 @@ struct SettingsView: View {
     @State private var systemPrompt: String = Preferences.systemPrompt
     @State private var apiPort: String = String(Preferences.apiPort)
     @State private var apiAutoStart: Bool = Preferences.apiAutoStart
+    @State private var idleUnloadMinutes: String = String(Preferences.idleUnloadMinutes)
 
     var body: some View {
         Form {
@@ -37,8 +38,27 @@ struct SettingsView: View {
                         Preferences.apiAutoStart = apiAutoStart
                     }
             }
+
+            Section("Memory") {
+                HStack {
+                    Text("Unload model after idle")
+                    TextField("3", text: $idleUnloadMinutes)
+                        .frame(width: 50)
+                        .onChange(of: idleUnloadMinutes) {
+                            if let mins = Int(idleUnloadMinutes), mins > 0 {
+                                Preferences.idleUnloadMinutes = mins
+                            }
+                        }
+                    Text("minutes")
+                        .foregroundStyle(.secondary)
+                }
+
+                Text("The model is automatically unloaded to free memory after being idle, and reloaded on the next request.")
+                    .font(.caption)
+                    .foregroundStyle(.secondary)
+            }
         }
         .formStyle(.grouped)
-        .frame(width: 450, height: 300)
+        .frame(width: 450, height: 380)
     }
 }