feat: idle-unload of models
This commit is contained in:
@@ -189,11 +189,23 @@ final class APIServer {
|
|||||||
// If we can't resolve the model, continue with whatever is loaded
|
// If we can't resolve the model, continue with whatever is loaded
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reload model if it was idle-unloaded
|
||||||
|
if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
|
||||||
|
let config = ModelConfig.resolve(lastModelId) {
|
||||||
|
print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
|
||||||
|
cachedSession = nil
|
||||||
|
cachedMessages = nil
|
||||||
|
cachedModelId = nil
|
||||||
|
await modelManager.loadModel(config)
|
||||||
|
}
|
||||||
|
|
||||||
guard modelManager.isReady, let container = modelManager.modelContainer else {
|
guard modelManager.isReady, let container = modelManager.modelContainer else {
|
||||||
sendResponse(connection: connection, status: 503, body: #"{"error":"No model loaded"}"#)
|
sendResponse(connection: connection, status: 503, body: #"{"error":"No model loaded"}"#)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
modelManager.touchActivity()
|
||||||
|
|
||||||
let isStream = request.stream ?? false
|
let isStream = request.stream ?? false
|
||||||
let temperature = request.temperature ?? 0.7
|
let temperature = request.temperature ?? 0.7
|
||||||
let topP = request.top_p ?? 1.0
|
let topP = request.top_p ?? 1.0
|
||||||
|
|||||||
@@ -39,4 +39,16 @@ enum Preferences {
|
|||||||
get { defaults.bool(forKey: apiAutoStartKey) }
|
get { defaults.bool(forKey: apiAutoStartKey) }
|
||||||
set { defaults.set(newValue, forKey: apiAutoStartKey) }
|
set { defaults.set(newValue, forKey: apiAutoStartKey) }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MARK: - Idle unload
|
||||||
|
|
||||||
|
private static let idleUnloadMinutesKey = "idleUnloadMinutes"
|
||||||
|
|
||||||
|
/// Number of idle minutes after which the loaded model is unloaded.
///
/// Reads the integer stored under `idleUnloadMinutesKey`; any stored value that is
/// not strictly positive (including a missing key, which reads as 0) yields 3.
/// The setter stores the new value as given.
static var idleUnloadMinutes: Int {
    get {
        let stored = defaults.integer(forKey: idleUnloadMinutesKey)
        // Non-positive values fall back to the 3-minute default.
        guard stored > 0 else { return 3 }
        return stored
    }
    set {
        defaults.set(newValue, forKey: idleUnloadMinutesKey)
    }
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ final class ChatViewModel {
|
|||||||
let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
|
let text = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||||
guard !text.isEmpty, modelManager.isReady else { return }
|
guard !text.isEmpty, modelManager.isReady else { return }
|
||||||
|
|
||||||
|
modelManager.touchActivity()
|
||||||
ensureSession()
|
ensureSession()
|
||||||
guard let session = chatSession else { return }
|
guard let session = chatSession else { return }
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ final class ModelManager {
|
|||||||
var loadingModelName: String = ""
|
var loadingModelName: String = ""
|
||||||
var errorMessage: String?
|
var errorMessage: String?
|
||||||
|
|
||||||
|
private var idleTimer: Timer?
|
||||||
|
private(set) var lastUsed: Date?
|
||||||
|
|
||||||
/// Load a model, unloading the current one first.
|
/// Load a model, unloading the current one first.
|
||||||
/// Prefers the local snapshot from ~/.cache/huggingface/hub/ (shared with the Python server).
|
/// Prefers the local snapshot from ~/.cache/huggingface/hub/ (shared with the Python server).
|
||||||
/// Only downloads if the model isn't cached locally.
|
/// Only downloads if the model isn't cached locally.
|
||||||
@@ -50,6 +53,7 @@ final class ModelManager {
|
|||||||
|
|
||||||
self.modelContainer = container
|
self.modelContainer = container
|
||||||
self.currentModel = config
|
self.currentModel = config
|
||||||
|
touchActivity()
|
||||||
} catch {
|
} catch {
|
||||||
self.errorMessage = "Failed to load model: \(error.localizedDescription)"
|
self.errorMessage = "Failed to load model: \(error.localizedDescription)"
|
||||||
}
|
}
|
||||||
@@ -59,11 +63,29 @@ final class ModelManager {
|
|||||||
|
|
||||||
/// Unload the current model and clear the MLX GPU cache.
///
/// Also cancels any pending idle timer and resets the last-activity timestamp.
func unloadModel() {
    // Stop the idle countdown and forget the last-activity time.
    idleTimer?.invalidate()
    idleTimer = nil
    lastUsed = nil

    // Drop the model references first, then clear the GPU cache.
    modelContainer = nil
    currentModel = nil
    MLX.GPU.clearCache()
}
|
||||||
|
|
||||||
|
/// Record model activity and reset the idle unload timer.
///
/// Stamps `lastUsed` with the current time, cancels any pending idle timer, and
/// schedules a new one-shot timer for `Preferences.idleUnloadMinutes` minutes.
/// When that timer fires and a model is still loaded, the model is unloaded.
func touchActivity() {
    lastUsed = Date()

    // Cancel the previous countdown so only the most recent activity counts,
    // and drop the reference so no invalidated timer is kept around.
    idleTimer?.invalidate()
    idleTimer = nil

    let minutes = Preferences.idleUnloadMinutes
    guard minutes > 0 else { return }

    // Convert to floating point before scaling: `minutes * 60` in Int arithmetic
    // traps on overflow if a very large value has been stored in preferences.
    let interval = TimeInterval(minutes) * 60

    idleTimer = Timer.scheduledTimer(withTimeInterval: interval, repeats: false) { [weak self] _ in
        // Hop to the main actor before reading or mutating model state.
        Task { @MainActor [weak self] in
            // Skip if the manager is gone or the model was already unloaded.
            guard let self, self.modelContainer != nil else { return }
            print("[ModelManager] Idle for \(minutes) min — unloading model")
            self.unloadModel()
        }
    }
}
|
||||||
|
|
||||||
/// Whether a model is ready for generation.
|
/// Whether a model is ready for generation.
|
||||||
var isReady: Bool {
|
var isReady: Bool {
|
||||||
modelContainer != nil && !isLoading
|
modelContainer != nil && !isLoading
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ struct SettingsView: View {
|
|||||||
@State private var systemPrompt: String = Preferences.systemPrompt
|
@State private var systemPrompt: String = Preferences.systemPrompt
|
||||||
@State private var apiPort: String = String(Preferences.apiPort)
|
@State private var apiPort: String = String(Preferences.apiPort)
|
||||||
@State private var apiAutoStart: Bool = Preferences.apiAutoStart
|
@State private var apiAutoStart: Bool = Preferences.apiAutoStart
|
||||||
|
@State private var idleUnloadMinutes: String = String(Preferences.idleUnloadMinutes)
|
||||||
|
|
||||||
var body: some View {
|
var body: some View {
|
||||||
Form {
|
Form {
|
||||||
@@ -37,8 +38,27 @@ struct SettingsView: View {
|
|||||||
Preferences.apiAutoStart = apiAutoStart
|
Preferences.apiAutoStart = apiAutoStart
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Section("Memory") {
|
||||||
|
HStack {
|
||||||
|
Text("Unload model after idle")
|
||||||
|
TextField("3", text: $idleUnloadMinutes)
|
||||||
|
.frame(width: 50)
|
||||||
|
.onChange(of: idleUnloadMinutes) {
|
||||||
|
if let mins = Int(idleUnloadMinutes), mins > 0 {
|
||||||
|
Preferences.idleUnloadMinutes = mins
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Text("minutes")
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
}
|
||||||
|
|
||||||
|
Text("The model is automatically unloaded to free memory after being idle, and reloaded on the next request.")
|
||||||
|
.font(.caption)
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
.formStyle(.grouped)
|
.formStyle(.grouped)
|
||||||
.frame(width: 450, height: 300)
|
.frame(width: 450, height: 380)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user