feat: idle-unload of models
This commit is contained in:
@@ -189,11 +189,23 @@ final class APIServer {
|
||||
// If we can't resolve the model, continue with whatever is loaded
|
||||
}
|
||||
|
||||
// Reload the last-used model whenever none is loaded (e.g. after an idle unload)
|
||||
if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
|
||||
let config = ModelConfig.resolve(lastModelId) {
|
||||
print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
|
||||
cachedSession = nil
|
||||
cachedMessages = nil
|
||||
cachedModelId = nil
|
||||
await modelManager.loadModel(config)
|
||||
}
|
||||
|
||||
guard modelManager.isReady, let container = modelManager.modelContainer else {
|
||||
sendResponse(connection: connection, status: 503, body: #"{"error":"No model loaded"}"#)
|
||||
return
|
||||
}
|
||||
|
||||
modelManager.touchActivity()
|
||||
|
||||
let isStream = request.stream ?? false
|
||||
let temperature = request.temperature ?? 0.7
|
||||
let topP = request.top_p ?? 1.0
|
||||
|
||||
@@ -39,4 +39,16 @@ enum Preferences {
|
||||
get { defaults.bool(forKey: apiAutoStartKey) }
|
||||
set { defaults.set(newValue, forKey: apiAutoStartKey) }
|
||||
}
|
||||
|
||||
// MARK: - Idle unload
|
||||
|
||||
private static let idleUnloadMinutesKey = "idleUnloadMinutes"
|
||||
|
||||
/// Number of idle minutes after which the loaded model is unloaded.
///
/// Reads fall back to 3 whenever the stored value is not positive.
/// Writes store the new value as-is under `idleUnloadMinutesKey`.
static var idleUnloadMinutes: Int {
    get {
        // NOTE(review): assuming `defaults` is a `UserDefaults`, a missing key reads
        // back as 0, which is why non-positive values map to the default — confirm.
        let stored = defaults.integer(forKey: idleUnloadMinutesKey)
        guard stored > 0 else { return 3 }
        return stored
    }
    set {
        defaults.set(newValue, forKey: idleUnloadMinutesKey)
    }
}
|
||||
}
|
||||
|
||||
@@ -43,6 +43,7 @@ final class ChatViewModel {
|
||||
let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
guard !text.isEmpty, modelManager.isReady else { return }
|
||||
|
||||
modelManager.touchActivity()
|
||||
ensureSession()
|
||||
guard let session = chatSession else { return }
|
||||
|
||||
|
||||
@@ -14,6 +14,9 @@ final class ModelManager {
|
||||
var loadingModelName: String = ""
|
||||
var errorMessage: String?
|
||||
|
||||
private var idleTimer: Timer?
|
||||
private(set) var lastUsed: Date?
|
||||
|
||||
/// Load a model, unloading the current one first.
|
||||
/// Prefers the local snapshot from ~/.cache/huggingface/hub/ (shared with the Python server).
|
||||
/// Only downloads if the model isn't cached locally.
|
||||
@@ -50,6 +53,7 @@ final class ModelManager {
|
||||
|
||||
self.modelContainer = container
|
||||
self.currentModel = config
|
||||
touchActivity()
|
||||
} catch {
|
||||
self.errorMessage = "Failed to load model: \(error.localizedDescription)"
|
||||
}
|
||||
@@ -59,11 +63,29 @@ final class ModelManager {
|
||||
|
||||
/// Drops the currently loaded model and asks MLX to clear its GPU cache.
///
/// Also cancels the pending idle-unload timer and resets `lastUsed`, so no
/// idle-tracking state survives the unload.
func unloadModel() {
    // Tear down idle tracking: detach the timer reference, then cancel the timer.
    let pendingTimer = idleTimer
    idleTimer = nil
    pendingTimer?.invalidate()
    lastUsed = nil

    // Release our references to the model before clearing the MLX cache.
    modelContainer = nil
    currentModel = nil
    MLX.GPU.clearCache()
}
|
||||
|
||||
/// Records model activity and restarts the idle-unload countdown.
///
/// Stamps `lastUsed` with the current time, cancels any pending idle timer, and
/// schedules a fresh one-shot timer for `Preferences.idleUnloadMinutes` minutes.
/// When that timer fires and a model is still loaded, the model is unloaded via
/// `unloadModel()`.
///
/// - Note: `Timer.scheduledTimer` attaches the timer to the current thread's run
///   loop, so this is presumably meant to be called on the main thread — confirm
///   against callers.
func touchActivity() {
    lastUsed = Date()

    // Cancel the previous countdown and drop the reference (as `unloadModel()`
    // does), so an invalidated timer is never left behind on the early return.
    idleTimer?.invalidate()
    idleTimer = nil

    let minutes = Preferences.idleUnloadMinutes
    guard minutes > 0 else { return }

    let interval = TimeInterval(minutes * 60)
    let timer = Timer.scheduledTimer(withTimeInterval: interval, repeats: false) { [weak self] _ in
        // Hop onto the main actor before reading or mutating model state.
        Task { @MainActor [weak self] in
            // Nothing to do if the manager is gone or the model was already unloaded.
            guard let self, self.modelContainer != nil else { return }
            print("[ModelManager] Idle for \(minutes) min — unloading model")
            self.unloadModel()
        }
    }
    // An idle timeout does not need to fire exactly on time; a 10% tolerance lets
    // the system coalesce the wake-up with other work.
    timer.tolerance = interval * 0.1
    idleTimer = timer
}
|
||||
|
||||
/// Whether a model is ready for generation.
|
||||
var isReady: Bool {
|
||||
modelContainer != nil && !isLoading
|
||||
|
||||
@@ -4,6 +4,7 @@ struct SettingsView: View {
|
||||
@State private var systemPrompt: String = Preferences.systemPrompt
|
||||
@State private var apiPort: String = String(Preferences.apiPort)
|
||||
@State private var apiAutoStart: Bool = Preferences.apiAutoStart
|
||||
@State private var idleUnloadMinutes: String = String(Preferences.idleUnloadMinutes)
|
||||
|
||||
var body: some View {
|
||||
Form {
|
||||
@@ -37,8 +38,27 @@ struct SettingsView: View {
|
||||
Preferences.apiAutoStart = apiAutoStart
|
||||
}
|
||||
}
|
||||
|
||||
Section("Memory") {
|
||||
HStack {
|
||||
Text("Unload model after idle")
|
||||
TextField("3", text: $idleUnloadMinutes)
|
||||
.frame(width: 50)
|
||||
.onChange(of: idleUnloadMinutes) {
|
||||
if let mins = Int(idleUnloadMinutes), mins > 0 {
|
||||
Preferences.idleUnloadMinutes = mins
|
||||
}
|
||||
}
|
||||
Text("minutes")
|
||||
.foregroundStyle(.secondary)
|
||||
}
|
||||
|
||||
Text("The model is automatically unloaded to free memory after being idle, and reloaded on the next request.")
|
||||
.font(.caption)
|
||||
.foregroundStyle(.secondary)
|
||||
}
|
||||
}
|
||||
.formStyle(.grouped)
|
||||
.frame(width: 450, height: 300)
|
||||
.frame(width: 450, height: 380)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user