fix: more telemetry and tighter implementation of cache

This commit is contained in:
2026-03-19 11:30:18 +01:00
parent c2e80e4066
commit 49bd165ce7
6 changed files with 1154 additions and 156 deletions

View File

@@ -42,6 +42,7 @@
D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; }; D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; }; D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
DF5C525DBD2E3153256951C1 /* SceneManagementWindow.swift in Sources */ = {isa = PBXBuildFile; fileRef = BA1592FD260014C4FBDB6995 /* SceneManagementWindow.swift */; }; DF5C525DBD2E3153256951C1 /* SceneManagementWindow.swift in Sources */ = {isa = PBXBuildFile; fileRef = BA1592FD260014C4FBDB6995 /* SceneManagementWindow.swift */; };
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */; };
F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; }; F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; }; FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; }; FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
@@ -85,6 +86,7 @@
E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; }; E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
EF518FEBF3A38E830E3CE1A5 /* FocusedValues.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusedValues.swift; sourceTree = "<group>"; }; EF518FEBF3A38E830E3CE1A5 /* FocusedValues.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusedValues.swift; sourceTree = "<group>"; };
F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; }; F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConversationSessionCache.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */ /* Begin PBXFrameworksBuildPhase section */
@@ -203,6 +205,7 @@
children = ( children = (
F1A52E2C9964ADA9D841A89B /* APIModels.swift */, F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
3D08828E16B17EF02C14243E /* APIServer.swift */, 3D08828E16B17EF02C14243E /* APIServer.swift */,
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */,
E73B165A1822729C907791AE /* ToolCallParser.swift */, E73B165A1822729C907791AE /* ToolCallParser.swift */,
16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */, 16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */,
); );
@@ -306,6 +309,7 @@
85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */, 85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */, B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
5946258F1DE88CE904584E0B /* ContentView.swift in Sources */, 5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */,
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */, C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */, 4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */, 2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */,

View File

@@ -9,9 +9,14 @@ final class LiveCounters: @unchecked Sendable {
static let shared = LiveCounters() static let shared = LiveCounters()
private let lock = OSAllocatedUnfairLock() private let lock = OSAllocatedUnfairLock()
private var requestPhases: [String: RequestPhase] = [:]
// Current request // Current request
private var _activeRequests: Int = 0 private var _activeRequests: Int = 0
private var _preparingRequests: Int = 0
private var _sessionBuildRequests: Int = 0
private var _prefillRequests: Int = 0
private var _generatingRequests: Int = 0
private var _promptTokens: Int = 0 private var _promptTokens: Int = 0
private var _generationTokens: Int = 0 private var _generationTokens: Int = 0
private var _tokensPerSecond: Double = 0 private var _tokensPerSecond: Double = 0
@@ -24,9 +29,10 @@ final class LiveCounters: @unchecked Sendable {
private var _totalPromptTokens: Int = 0 private var _totalPromptTokens: Int = 0
private var _totalGenerationTokens: Int = 0 private var _totalGenerationTokens: Int = 0
func requestStarted(contextLength: Int) { func requestStarted(requestId: String, contextLength: Int) {
lock.lock() lock.lock()
_activeRequests += 1 _activeRequests += 1
_preparingRequests += 1
_totalRequests += 1 _totalRequests += 1
_isPrefilling = true _isPrefilling = true
_isGenerating = false _isGenerating = false
@@ -34,15 +40,33 @@ final class LiveCounters: @unchecked Sendable {
_generationTokens = 0 _generationTokens = 0
_tokensPerSecond = 0 _tokensPerSecond = 0
_contextMax = contextLength _contextMax = contextLength
requestPhases[requestId] = .preparing
lock.unlock() lock.unlock()
} }
func prefillCompleted(promptTokens: Int) { func requestPhaseChanged(requestId: String, phase: RequestPhase) {
lock.lock() lock.lock()
_isPrefilling = false if let current = requestPhases[requestId] {
_isGenerating = true decrementCount(for: current)
}
incrementCount(for: phase)
requestPhases[requestId] = phase
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
_isGenerating = _generatingRequests > 0
lock.unlock()
}
func prefillCompleted(requestId: String, promptTokens: Int) {
lock.lock()
if let current = requestPhases[requestId] {
decrementCount(for: current)
}
incrementCount(for: .generating)
requestPhases[requestId] = .generating
_promptTokens = promptTokens _promptTokens = promptTokens
_totalPromptTokens += promptTokens _totalPromptTokens += promptTokens
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
_isGenerating = _generatingRequests > 0
lock.unlock() lock.unlock()
} }
@@ -53,21 +77,32 @@ final class LiveCounters: @unchecked Sendable {
lock.unlock() lock.unlock()
} }
func requestCompleted(generationTokens: Int) { func requestCompleted(requestId: String, generationTokens: Int) {
lock.lock() lock.lock()
if let current = requestPhases.removeValue(forKey: requestId) {
decrementCount(for: current)
}
_activeRequests = max(0, _activeRequests - 1) _activeRequests = max(0, _activeRequests - 1)
_totalGenerationTokens += generationTokens _totalGenerationTokens += generationTokens
if _activeRequests == 0 { if _activeRequests == 0 {
_isGenerating = false _isGenerating = false
_isPrefilling = false _isPrefilling = false
_tokensPerSecond = 0 _tokensPerSecond = 0
} else {
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
_isGenerating = _generatingRequests > 0
} }
lock.unlock() lock.unlock()
} }
func reset() { func reset() {
lock.lock() lock.lock()
requestPhases.removeAll()
_activeRequests = 0 _activeRequests = 0
_preparingRequests = 0
_sessionBuildRequests = 0
_prefillRequests = 0
_generatingRequests = 0
_promptTokens = 0 _promptTokens = 0
_generationTokens = 0 _generationTokens = 0
_tokensPerSecond = 0 _tokensPerSecond = 0
@@ -85,6 +120,10 @@ final class LiveCounters: @unchecked Sendable {
lock.lock() lock.lock()
let s = Snapshot( let s = Snapshot(
activeRequests: _activeRequests, activeRequests: _activeRequests,
preparingRequests: _preparingRequests,
sessionBuildRequests: _sessionBuildRequests,
prefillRequests: _prefillRequests,
generatingRequests: _generatingRequests,
promptTokens: _promptTokens, promptTokens: _promptTokens,
generationTokens: _generationTokens, generationTokens: _generationTokens,
tokensPerSecond: _tokensPerSecond, tokensPerSecond: _tokensPerSecond,
@@ -101,6 +140,10 @@ final class LiveCounters: @unchecked Sendable {
struct Snapshot { struct Snapshot {
let activeRequests: Int let activeRequests: Int
let preparingRequests: Int
let sessionBuildRequests: Int
let prefillRequests: Int
let generatingRequests: Int
let promptTokens: Int let promptTokens: Int
let generationTokens: Int let generationTokens: Int
let tokensPerSecond: Double let tokensPerSecond: Double
@@ -111,6 +154,39 @@ final class LiveCounters: @unchecked Sendable {
let totalPromptTokens: Int let totalPromptTokens: Int
let totalGenerationTokens: Int let totalGenerationTokens: Int
} }
/// Adds one to the per-phase request counter that corresponds to `phase`.
///
/// This method does not take `lock` itself; the visible call sites invoke it
/// between `lock.lock()` and `lock.unlock()`.
///
/// - Parameter phase: The phase whose counter should grow by one.
private func incrementCount(for phase: RequestPhase) {
    switch phase {
    case .preparing: _preparingRequests += 1
    case .sessionBuild: _sessionBuildRequests += 1
    case .prefilling: _prefillRequests += 1
    case .generating: _generatingRequests += 1
    }
}
/// Subtracts one from the per-phase request counter that corresponds to `phase`,
/// never letting the counter drop below zero.
///
/// This method does not take `lock` itself; the visible call sites invoke it
/// between `lock.lock()` and `lock.unlock()`.
///
/// - Parameter phase: The phase whose counter should shrink by one.
private func decrementCount(for phase: RequestPhase) {
    // Clamp at zero so an unmatched decrement cannot produce a negative gauge.
    func oneLess(_ count: Int) -> Int { max(0, count - 1) }

    switch phase {
    case .preparing: _preparingRequests = oneLess(_preparingRequests)
    case .sessionBuild: _sessionBuildRequests = oneLess(_sessionBuildRequests)
    case .prefilling: _prefillRequests = oneLess(_prefillRequests)
    case .generating: _generatingRequests = oneLess(_generatingRequests)
    }
}
/// The stage a tracked request is currently in; each case has its own live counter.
///
/// - `preparing`: recorded by `requestStarted(requestId:contextLength:)`.
/// - `sessionBuild`: reported by the API server when no cached session can be
///   reused and a fresh one is being created.
/// - `prefilling`: reported by the API server once a session (reused or freshly
///   built) is available, before the response handler runs.
/// - `generating`: recorded by `prefillCompleted(requestId:promptTokens:)`.
enum RequestPhase {
    case preparing, sessionBuild, prefilling, generating
}
} }
// MARK: - Observable stats for the UI (polls LiveCounters at 1Hz) // MARK: - Observable stats for the UI (polls LiveCounters at 1Hz)
@@ -121,6 +197,10 @@ final class InferenceStats {
// MARK: - Current request state (refreshed from LiveCounters) // MARK: - Current request state (refreshed from LiveCounters)
var activeRequests: Int = 0 var activeRequests: Int = 0
var preparingRequests: Int = 0
var sessionBuildRequests: Int = 0
var prefillingRequests: Int = 0
var generatingRequests: Int = 0
var currentPromptTokens: Int = 0 var currentPromptTokens: Int = 0
var currentGenerationTokens: Int = 0 var currentGenerationTokens: Int = 0
var isGenerating: Bool = false var isGenerating: Bool = false
@@ -134,6 +214,21 @@ final class InferenceStats {
var totalRequests: Int = 0 var totalRequests: Int = 0
var totalPromptTokens: Int = 0 var totalPromptTokens: Int = 0
var totalGenerationTokens: Int = 0 var totalGenerationTokens: Int = 0
var totalCacheHits: Int = 0
var totalCacheMisses: Int = 0
var totalCacheEvictions: Int = 0
var totalCacheReusePromptTokens: Int = 0
var totalCacheRebuildPromptTokens: Int = 0
// MARK: - Cache state
var cacheEntryCount: Int = 0
var warmCacheEntryCount: Int = 0
var activeCacheEntryCount: Int = 0
var generatingCacheEntryCount: Int = 0
var cacheEstimatedBytes: Int = 0
var cacheEstimatedTokens: Int = 0
var cachedSessions: [ConversationSessionCache.SessionSummary] = []
// MARK: - Time series data (ring buffers for charts) // MARK: - Time series data (ring buffers for charts)
@@ -146,6 +241,11 @@ final class InferenceStats {
private(set) var tokenRateHistory: [DataPoint] = [] private(set) var tokenRateHistory: [DataPoint] = []
private(set) var promptTokenHistory: [DataPoint] = [] private(set) var promptTokenHistory: [DataPoint] = []
private(set) var generationTokenHistory: [DataPoint] = [] private(set) var generationTokenHistory: [DataPoint] = []
private(set) var cacheEntryHistory: [DataPoint] = []
private(set) var activeSessionHistory: [DataPoint] = []
private(set) var cacheFootprintHistory: [DataPoint] = []
private(set) var cacheReuseHistory: [DataPoint] = []
private(set) var cacheRebuildHistory: [DataPoint] = []
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
@@ -153,6 +253,8 @@ final class InferenceStats {
private var sampleTimer: Timer? private var sampleTimer: Timer?
private var lastGenerationTokenCount: Int = 0 private var lastGenerationTokenCount: Int = 0
private var lastPromptTokenCount: Int = 0 private var lastPromptTokenCount: Int = 0
private var lastCacheReuseTokenCount: Int = 0
private var lastCacheRebuildTokenCount: Int = 0
func startSampling() { func startSampling() {
guard sampleTimer == nil else { return } guard sampleTimer == nil else { return }
@@ -171,8 +273,13 @@ final class InferenceStats {
private func recordSample() { private func recordSample() {
// Pull live values from the thread-safe counters // Pull live values from the thread-safe counters
let snap = LiveCounters.shared.snapshot() let snap = LiveCounters.shared.snapshot()
let cache = ConversationSessionCache.shared.snapshot()
activeRequests = snap.activeRequests activeRequests = snap.activeRequests
preparingRequests = snap.preparingRequests
sessionBuildRequests = snap.sessionBuildRequests
prefillingRequests = snap.prefillRequests
generatingRequests = snap.generatingRequests
currentPromptTokens = snap.promptTokens currentPromptTokens = snap.promptTokens
currentGenerationTokens = snap.generationTokens currentGenerationTokens = snap.generationTokens
currentTokensPerSecond = snap.tokensPerSecond currentTokensPerSecond = snap.tokensPerSecond
@@ -183,16 +290,37 @@ final class InferenceStats {
totalRequests = snap.totalRequests totalRequests = snap.totalRequests
totalPromptTokens = snap.totalPromptTokens totalPromptTokens = snap.totalPromptTokens
totalGenerationTokens = snap.totalGenerationTokens totalGenerationTokens = snap.totalGenerationTokens
totalCacheHits = cache.totalHits
totalCacheMisses = cache.totalMisses
totalCacheEvictions = cache.totalEvictions
totalCacheReusePromptTokens = cache.totalReusePromptTokens
totalCacheRebuildPromptTokens = cache.totalRebuildPromptTokens
cacheEntryCount = cache.totalEntries
warmCacheEntryCount = cache.warmEntries
activeCacheEntryCount = cache.activeEntries
generatingCacheEntryCount = cache.generatingEntries
cacheEstimatedBytes = cache.estimatedBytes
cacheEstimatedTokens = cache.cachedTokenEstimate
cachedSessions = cache.sessions
let now = Date.now let now = Date.now
let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
lastGenerationTokenCount = snap.totalGenerationTokens lastGenerationTokenCount = snap.totalGenerationTokens
lastPromptTokenCount = snap.totalPromptTokens lastPromptTokenCount = snap.totalPromptTokens
lastCacheReuseTokenCount = cache.totalReusePromptTokens
lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond)) tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta))) generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta))) promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries)))
activeSessionHistory.append(DataPoint(timestamp: now, value: Double(cache.activeEntries)))
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
if tokenRateHistory.count > Self.maxHistoryPoints { if tokenRateHistory.count > Self.maxHistoryPoints {
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints) tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -203,11 +331,31 @@ final class InferenceStats {
if promptTokenHistory.count > Self.maxHistoryPoints { if promptTokenHistory.count > Self.maxHistoryPoints {
promptTokenHistory.removeFirst(promptTokenHistory.count - Self.maxHistoryPoints) promptTokenHistory.removeFirst(promptTokenHistory.count - Self.maxHistoryPoints)
} }
if cacheEntryHistory.count > Self.maxHistoryPoints {
cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints)
}
if activeSessionHistory.count > Self.maxHistoryPoints {
activeSessionHistory.removeFirst(activeSessionHistory.count - Self.maxHistoryPoints)
}
if cacheFootprintHistory.count > Self.maxHistoryPoints {
cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints)
}
if cacheReuseHistory.count > Self.maxHistoryPoints {
cacheReuseHistory.removeFirst(cacheReuseHistory.count - Self.maxHistoryPoints)
}
if cacheRebuildHistory.count > Self.maxHistoryPoints {
cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
}
} }
func reset() { func reset() {
LiveCounters.shared.reset() LiveCounters.shared.reset()
ConversationSessionCache.shared.reset()
activeRequests = 0 activeRequests = 0
preparingRequests = 0
sessionBuildRequests = 0
prefillingRequests = 0
generatingRequests = 0
currentPromptTokens = 0 currentPromptTokens = 0
currentGenerationTokens = 0 currentGenerationTokens = 0
isGenerating = false isGenerating = false
@@ -218,10 +366,29 @@ final class InferenceStats {
totalRequests = 0 totalRequests = 0
totalPromptTokens = 0 totalPromptTokens = 0
totalGenerationTokens = 0 totalGenerationTokens = 0
totalCacheHits = 0
totalCacheMisses = 0
totalCacheEvictions = 0
totalCacheReusePromptTokens = 0
totalCacheRebuildPromptTokens = 0
cacheEntryCount = 0
warmCacheEntryCount = 0
activeCacheEntryCount = 0
generatingCacheEntryCount = 0
cacheEstimatedBytes = 0
cacheEstimatedTokens = 0
cachedSessions.removeAll()
tokenRateHistory.removeAll() tokenRateHistory.removeAll()
promptTokenHistory.removeAll() promptTokenHistory.removeAll()
generationTokenHistory.removeAll() generationTokenHistory.removeAll()
cacheEntryHistory.removeAll()
activeSessionHistory.removeAll()
cacheFootprintHistory.removeAll()
cacheReuseHistory.removeAll()
cacheRebuildHistory.removeAll()
lastGenerationTokenCount = 0 lastGenerationTokenCount = 0
lastPromptTokenCount = 0 lastPromptTokenCount = 0
lastCacheReuseTokenCount = 0
lastCacheRebuildTokenCount = 0
} }
} }

View File

@@ -16,6 +16,50 @@ struct APIToolDefinition: Codable {
struct APIFunctionCall: Codable { struct APIFunctionCall: Codable {
let name: String let name: String
let arguments: String // JSON string let arguments: String // JSON string
/// Creates a function call from a name and an already-serialized argument string.
///
/// Declared explicitly because a struct that defines its own initializer in its
/// main body (here `init(from:)`) no longer receives the implicit memberwise one.
///
/// - Parameters:
///   - name: The function name.
///   - arguments: The call arguments as a JSON string; stored as given.
init(name: String, arguments: String) {
self.name = name
self.arguments = arguments
}
/// Decodes a function call, accepting `arguments` either as a JSON string or as
/// inline JSON.
///
/// The `arguments` value is tried in this order:
/// 1. a string: stored unchanged;
/// 2. an object: re-serialized to a JSON string with sorted keys (`"{}"` if that fails);
/// 3. an array: re-serialized the same way (`"[]"` if that fails);
/// 4. anything else (explicit `null`, missing key, other scalar): stored as `"{}"`.
///
/// - Parameter decoder: The decoder to read from.
/// - Throws: If the keyed container or the required `name` cannot be decoded.
init(from decoder: Decoder) throws {
    let container = try decoder.container(keyedBy: CodingKeys.self)
    name = try container.decode(String.self, forKey: .name)

    if let argumentString = try? container.decode(String.self, forKey: .arguments) {
        arguments = argumentString
    } else if let argumentObject = try? container.decode([String: AnyCodable].self, forKey: .arguments) {
        arguments = Self.serializedJSON(argumentObject.mapValues(\.value), fallback: "{}")
    } else if let argumentArray = try? container.decode([AnyCodable].self, forKey: .arguments) {
        arguments = Self.serializedJSON(argumentArray.map(\.value), fallback: "[]")
    } else {
        // Covers explicit null, a missing key, and any other unsupported shape.
        arguments = "{}"
    }
}

/// Serializes `jsonObject` to a JSON string with sorted keys, or returns `fallback`
/// when it cannot be serialized.
///
/// `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception, which
/// `try?` cannot intercept, when handed an object that is not valid JSON, so validity
/// is checked first.
private static func serializedJSON(_ jsonObject: Any, fallback: String) -> String {
    guard JSONSerialization.isValidJSONObject(jsonObject),
          let data = try? JSONSerialization.data(withJSONObject: jsonObject, options: [.sortedKeys]),
          let string = String(data: data, encoding: .utf8) else {
        return fallback
    }
    return string
}
} }
struct APIToolCall: Codable { struct APIToolCall: Codable {
@@ -30,6 +74,14 @@ struct APIToolCall: Codable {
self.type = type self.type = type
self.function = function self.function = function
} }
/// Decodes a tool call, substituting defaults for optional fields that are absent.
///
/// Defaults applied: `index` becomes `0`, `id` becomes a freshly generated
/// `call_<lowercased UUID>`, and `type` becomes `"function"`. `function` is required.
///
/// - Parameter decoder: The decoder to read from.
/// - Throws: If the keyed container cannot be obtained, a present field has the
///   wrong type, or `function` cannot be decoded.
init(from decoder: Decoder) throws {
    let values = try decoder.container(keyedBy: CodingKeys.self)

    let decodedIndex = try values.decodeIfPresent(Int.self, forKey: .index)
    index = decodedIndex ?? 0

    let decodedID = try values.decodeIfPresent(String.self, forKey: .id)
    // A new identifier is only generated when the payload did not carry one.
    id = decodedID ?? "call_\(UUID().uuidString.lowercased())"

    let decodedType = try values.decodeIfPresent(String.self, forKey: .type)
    type = decodedType ?? "function"

    function = try values.decode(APIFunctionCall.self, forKey: .function)
}
} }
struct APIImageURL: Codable { struct APIImageURL: Codable {

View File

@@ -16,12 +16,6 @@ final class APIServer {
private var listener: NWListener? private var listener: NWListener?
private var modelManager: ModelManager? private var modelManager: ModelManager?
// Persistent ChatSession for KV cache reuse across requests
private var cachedSession: ChatSession?
private var cachedMessages: [Chat.Message]?
private var cachedModelId: String?
private var cachedInstructions: String = ""
func start(modelManager: ModelManager, port: Int = 1234) { func start(modelManager: ModelManager, port: Int = 1234) {
guard !isRunning else { return } guard !isRunning else { return }
self.modelManager = modelManager self.modelManager = modelManager
@@ -70,10 +64,7 @@ final class APIServer {
listener?.cancel() listener?.cancel()
listener = nil listener = nil
isRunning = false isRunning = false
cachedSession = nil ConversationSessionCache.shared.invalidateAll()
cachedMessages = nil
cachedModelId = nil
cachedInstructions = ""
inferenceStats.stopSampling() inferenceStats.stopSampling()
} }
@@ -186,10 +177,7 @@ final class APIServer {
if let targetConfig = ModelConfig.resolve(requestedModel) { if let targetConfig = ModelConfig.resolve(requestedModel) {
if modelManager.currentModel?.id != targetConfig.id { if modelManager.currentModel?.id != targetConfig.id {
print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)") print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
cachedSession = nil ConversationSessionCache.shared.invalidateAll()
cachedMessages = nil
cachedModelId = nil
cachedInstructions = ""
await modelManager.loadModel(targetConfig) await modelManager.loadModel(targetConfig)
} }
} }
@@ -200,10 +188,7 @@ final class APIServer {
if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId, if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
let config = ModelConfig.resolve(lastModelId) { let config = ModelConfig.resolve(lastModelId) {
print("[APIServer] Reloading idle-unloaded model: \(config.repoId)") print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
cachedSession = nil ConversationSessionCache.shared.invalidateAll()
cachedMessages = nil
cachedModelId = nil
cachedInstructions = ""
await modelManager.loadModel(config) await modelManager.loadModel(config)
} }
@@ -233,9 +218,13 @@ final class APIServer {
return return
} }
LiveCounters.shared.requestStarted(requestId: requestId, contextLength: contextLength)
// Convert API messages to Chat.Message, extracting images from content parts // Convert API messages to Chat.Message, extracting images from content parts
var chatMessages: [Chat.Message] = [] var chatMessages: [Chat.Message] = []
var messageSignatures: [UInt64] = []
var images: [UserInput.Image] = [] var images: [UserInput.Image] = []
var estimatedBytes = 0
let currentModelRepoId = currentModel?.repoId ?? modelName let currentModelRepoId = currentModel?.repoId ?? modelName
// Build the instructions string (system prompt + tool definitions). // Build the instructions string (system prompt + tool definitions).
@@ -259,8 +248,8 @@ final class APIServer {
instructions += toolSystemPrompt instructions += toolSystemPrompt
} }
let toolsForInjection = request.tools
let isQwen = currentModelRepoId.lowercased().contains("qwen") let isQwen = currentModelRepoId.lowercased().contains("qwen")
estimatedBytes += instructions.utf8.count
// Convert non-system messages to Chat.Message // Convert non-system messages to Chat.Message
for msg in request.messages where msg.role != "system" { for msg in request.messages where msg.role != "system" {
@@ -297,18 +286,25 @@ final class APIServer {
// Extract base64 images from content parts // Extract base64 images from content parts
let imageURLs = msg.content?.imageURLs ?? [] let imageURLs = msg.content?.imageURLs ?? []
var messageImages: [UserInput.Image] = [] var messageImages: [UserInput.Image] = []
var messageImageBytes = 0
for urlString in imageURLs { for urlString in imageURLs {
if let image = decodeBase64Image(urlString) { if let decoded = decodeBase64Image(urlString) {
messageImages.append(image) messageImages.append(decoded.image)
messageImageBytes += decoded.estimatedBytes
} }
} }
// Attach images to this specific message // Attach images to this specific message
chatMessages.append(Chat.Message(role: role, content: text, images: messageImages)) chatMessages.append(Chat.Message(role: role, content: text, images: messageImages))
messageSignatures.append(
Self.messageSignature(role: role, content: text, imageURLs: imageURLs)
)
estimatedBytes += text.utf8.count + messageImageBytes
images.append(contentsOf: messageImages) images.append(contentsOf: messageImages)
} }
if !images.isEmpty, currentModel?.supportsImages != true { if !images.isEmpty, currentModel?.supportsImages != true {
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
sendResponse( sendResponse(
connection: connection, connection: connection,
status: 400, status: 400,
@@ -318,18 +314,18 @@ final class APIServer {
} }
// Context window check: estimate token count and reject if over limit // Context window check: estimate token count and reject if over limit
let estimatedPromptTokens = (instructions.count + chatMessages.reduce(0) { $0 + $1.content.count }) * 10 / 35
if contextLength > 0 { if contextLength > 0 {
let totalChars = chatMessages.reduce(0) { $0 + $1.content.count } let needed = estimatedPromptTokens + maxTokens
let estimatedTokens = totalChars * 10 / 35 // ~3.5 chars per token
let needed = estimatedTokens + maxTokens
if needed > contextLength { if needed > contextLength {
let errorBody = """ let errorBody = """
{"error":{"message":"This model's maximum context length is \(contextLength) tokens. \ {"error":{"message":"This model's maximum context length is \(contextLength) tokens. \
However, your messages resulted in approximately \(estimatedTokens) tokens and \ However, your messages resulted in approximately \(estimatedPromptTokens) tokens and \
\(maxTokens) tokens were requested for the completion (\(needed) total). \ \(maxTokens) tokens were requested for the completion (\(needed) total). \
Please reduce the length of the messages or completion.",\ Please reduce the length of the messages or completion.",\
"type":"invalid_request_error","code":"context_length_exceeded"}} "type":"invalid_request_error","code":"context_length_exceeded"}}
""" """
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
sendResponse(connection: connection, status: 400, body: errorBody) sendResponse(connection: connection, status: 400, body: errorBody)
return return
} }
@@ -345,23 +341,28 @@ final class APIServer {
let allButLast = Array(chatMessages.dropLast()) let allButLast = Array(chatMessages.dropLast())
let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "") let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "")
// KV cache reuse: check if the cached session's history matches let historySignatures = Array(messageSignatures.dropLast())
let currentModelId = modelManager.currentModel?.id let currentModelId = modelManager.currentModel?.id ?? modelName
let canReuse = cachedSession != nil let lease = ConversationSessionCache.shared.checkoutSession(
&& cachedModelId == currentModelId modelId: currentModelId,
&& cachedMessages != nil instructions: instructions,
&& cachedInstructions == instructions historySignatures: historySignatures,
&& messagesMatch(cachedMessages!, allButLast) requestMessageCount: chatMessages.count,
estimatedPromptTokens: estimatedPromptTokens,
estimatedBytes: estimatedBytes
)
let session: ChatSession let session: ChatSession
if canReuse { if let reusableSession = lease.session {
print("[APIServer] Reusing cached session (\(allButLast.count) history messages)") print("[APIServer] Reusing cached session (\(allButLast.count) history messages)")
session = cachedSession! session = reusableSession
session.generateParameters = generateParams session.generateParameters = generateParams
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
} else { } else {
if cachedSession != nil { print("[APIServer] Creating fresh session")
print("[APIServer] History diverged, creating fresh session") ConversationSessionCache.shared.markSessionBuild(entryId: lease.entryId)
} LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .sessionBuild)
// Use `instructions:` for system/tool prompt (matches internal chat pattern). // Use `instructions:` for system/tool prompt (matches internal chat pattern).
// Only conversation turns go in `history:` — this avoids replaying the // Only conversation turns go in `history:` — this avoids replaying the
// large tool prompt as history on every new session. // large tool prompt as history on every new session.
@@ -385,47 +386,62 @@ final class APIServer {
additionalContext: thinkingContext additionalContext: thinkingContext
) )
} }
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
} }
// Extract images from the last message only (ChatSession.streamDetails takes images separately) // Extract images from the last message only (ChatSession.streamDetails takes images separately)
let lastImages = lastMessage.images let lastImages = lastMessage.images
LiveCounters.shared.requestStarted(contextLength: contextLength) let result: (promptTokens: Int, completionTokens: Int, succeeded: Bool)
if isStream { if isStream {
await handleStreamingResponse( result = await handleStreamingResponse(
connection: connection, connection: connection,
requestId: requestId,
cacheEntryId: lease.entryId,
session: session, session: session,
prompt: lastMessage.content, prompt: lastMessage.content,
images: lastImages, images: lastImages,
tools: request.tools, tools: request.tools,
requestId: requestId,
created: created, created: created,
modelName: modelName modelName: modelName
) )
} else { } else {
await handleNonStreamingResponse( result = await handleNonStreamingResponse(
connection: connection, connection: connection,
requestId: requestId,
cacheEntryId: lease.entryId,
session: session, session: session,
prompt: lastMessage.content, prompt: lastMessage.content,
images: lastImages, images: lastImages,
tools: request.tools, tools: request.tools,
requestId: requestId,
created: created, created: created,
modelName: modelName modelName: modelName
) )
} }
// Cache the session for reuse on next request if result.succeeded {
// allButLast + lastMessage (user) + assistant response = new cached history ConversationSessionCache.shared.completeRequest(
cachedSession = session entryId: lease.entryId,
cachedMessages = chatMessages // full messages including the one just sent session: session,
cachedModelId = currentModelId requestMessageSignatures: messageSignatures,
cachedInstructions = instructions requestMessageCount: chatMessages.count,
estimatedPromptTokens: estimatedPromptTokens,
estimatedBytes: estimatedBytes,
promptTokens: result.promptTokens,
completionTokens: result.completionTokens
)
} else {
ConversationSessionCache.shared.abandonRequest(entryId: lease.entryId)
}
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens)
modelManager.touchActivity()
} }
/// Decode a base64 data URI (data:image/png;base64,...) into a UserInput.Image. /// Decode a base64 data URI (data:image/png;base64,...) into a UserInput.Image.
private func decodeBase64Image(_ urlString: String) -> UserInput.Image? { private func decodeBase64Image(_ urlString: String) -> DecodedImage? {
// Handle data URIs: data:image/png;base64,<data> // Handle data URIs: data:image/png;base64,<data>
let base64String: String let base64String: String
if urlString.hasPrefix("data:") { if urlString.hasPrefix("data:") {
@@ -442,21 +458,23 @@ final class APIServer {
return nil return nil
} }
return .ciImage(CIImage(cgImage: cgImage)) let estimatedBytes = max(data.count, cgImage.width * cgImage.height * 4)
return DecodedImage(image: .ciImage(CIImage(cgImage: cgImage)), estimatedBytes: estimatedBytes)
} }
// MARK: - Non-streaming response // MARK: - Non-streaming response
private func handleNonStreamingResponse( private func handleNonStreamingResponse(
connection: NWConnection, connection: NWConnection,
requestId: String,
cacheEntryId: UUID,
session: ChatSession, session: ChatSession,
prompt: String, prompt: String,
images: [UserInput.Image], images: [UserInput.Image],
tools: [APIToolDefinition]?, tools: [APIToolDefinition]?,
requestId: String,
created: Int, created: Int,
modelName: String modelName: String
) async { ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
do { do {
var fullText = "" var fullText = ""
var promptTokens = 0 var promptTokens = 0
@@ -478,7 +496,12 @@ final class APIServer {
case .info(let info): case .info(let info):
promptTokens = info.promptTokenCount promptTokens = info.promptTokenCount
completionTokens = info.generationTokenCount completionTokens = info.generationTokenCount
LiveCounters.shared.prefillCompleted(promptTokens: promptTokens) ConversationSessionCache.shared.markGenerating(
entryId: cacheEntryId,
promptTokens: promptTokens,
completionTokens: completionTokens
)
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
if info.tokensPerSecond > 0 { if info.tokensPerSecond > 0 {
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens) LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
} }
@@ -487,9 +510,6 @@ final class APIServer {
} }
} }
LiveCounters.shared.requestCompleted(generationTokens: completionTokens)
modelManager?.touchActivity()
// Parse tool calls: first check framework-detected ones, then our own text parser // Parse tool calls: first check framework-detected ones, then our own text parser
var finishReason = "stop" var finishReason = "stop"
var responseContent: String? = fullText var responseContent: String? = fullText
@@ -559,10 +579,10 @@ final class APIServer {
if let json = try? JSONEncoder().encode(response) { if let json = try? JSONEncoder().encode(response) {
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}") sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
} }
return (promptTokens, completionTokens, true)
} catch { } catch {
LiveCounters.shared.requestCompleted(generationTokens: 0)
modelManager?.touchActivity()
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#) sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
return (0, 0, false)
} }
} }
@@ -570,14 +590,15 @@ final class APIServer {
private func handleStreamingResponse( private func handleStreamingResponse(
connection: NWConnection, connection: NWConnection,
requestId: String,
cacheEntryId: UUID,
session: ChatSession, session: ChatSession,
prompt: String, prompt: String,
images: [UserInput.Image], images: [UserInput.Image],
tools: [APIToolDefinition]?, tools: [APIToolDefinition]?,
requestId: String,
created: Int, created: Int,
modelName: String modelName: String
) async { ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
// Send SSE headers // Send SSE headers
let header = [ let header = [
"HTTP/1.1 200 OK", "HTTP/1.1 200 OK",
@@ -625,7 +646,16 @@ final class APIServer {
) )
}() }()
let (promptTokens, completionTokens, fullText, frameworkToolCalls) = result let (promptTokens, completionTokens, fullText, frameworkToolCalls, succeeded) = result
if promptTokens > 0 {
ConversationSessionCache.shared.markGenerating(
entryId: cacheEntryId,
promptTokens: promptTokens,
completionTokens: completionTokens
)
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
}
// Stats were already updated by LiveCounters inside the loop // Stats were already updated by LiveCounters inside the loop
@@ -696,12 +726,10 @@ final class APIServer {
) )
)) ))
LiveCounters.shared.requestCompleted(generationTokens: completionTokens)
modelManager?.touchActivity()
// Send [DONE] and close // Send [DONE] and close
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!) await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
connection.cancel() connection.cancel()
return (promptTokens, completionTokens, succeeded)
} }
/// Run the token generation + SSE send loop entirely off MainActor. /// Run the token generation + SSE send loop entirely off MainActor.
@@ -713,7 +741,7 @@ final class APIServer {
requestId: String, requestId: String,
created: Int, created: Int,
modelName: String modelName: String
) async -> (Int, Int, String, [MLXLMCommon.ToolCall]) { ) async -> (Int, Int, String, [MLXLMCommon.ToolCall], Bool) {
var promptTokens = 0 var promptTokens = 0
var completionTokens = 0 var completionTokens = 0
var fullText = "" var fullText = ""
@@ -742,7 +770,6 @@ final class APIServer {
case .info(let info): case .info(let info):
promptTokens = info.promptTokenCount promptTokens = info.promptTokenCount
completionTokens = info.generationTokenCount completionTokens = info.generationTokenCount
LiveCounters.shared.prefillCompleted(promptTokens: promptTokens)
if info.tokensPerSecond > 0 { if info.tokensPerSecond > 0 {
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens) LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
} }
@@ -754,9 +781,10 @@ final class APIServer {
} catch { } catch {
let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n" let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
await sendData(connection: connection, data: errorEvent.data(using: .utf8)!) await sendData(connection: connection, data: errorEvent.data(using: .utf8)!)
return (promptTokens, completionTokens, fullText, frameworkToolCalls, false)
} }
return (promptTokens, completionTokens, fullText, frameworkToolCalls) return (promptTokens, completionTokens, fullText, frameworkToolCalls, true)
} }
/// Send an SSE event and wait for the protocol stack to process it. /// Send an SSE event and wait for the protocol stack to process it.
@@ -819,24 +847,40 @@ final class APIServer {
] ]
} }
/// Check if the cached session can be reused for the new history. private static func messageSignature(role: Chat.Message.Role, content: String, imageURLs: [String]) -> UInt64 {
/// var hash: UInt64 = 14_695_981_039_346_656_037
/// After a request the session's KV cache contains:
/// cachedMessages (history + user prompt) + the generated assistant response. func mix(_ text: String) {
/// On the next request the client sends back the full conversation, so for byte in text.utf8 {
/// `newHistory` (allButLast) is typically `cachedMessages` + 1 assistant reply. hash ^= UInt64(byte)
/// We allow reuse when `cached` is a prefix of `newHistory` and there is at most hash &*= 1_099_511_628_211
/// one extra message (the assistant response the session already generated).
/// More than one extra message (e.g. injected tool results) means the session
/// hasn't processed them, so we must create a fresh session.
private func messagesMatch(_ cached: [Chat.Message], _ newHistory: [Chat.Message]) -> Bool {
guard cached.count <= newHistory.count,
newHistory.count <= cached.count + 1 else { return false }
for (a, b) in zip(cached, newHistory) {
if a.role != b.role || a.content != b.content { return false }
} }
return true
} }
switch role {
case .assistant:
mix("assistant")
case .system:
mix("system")
case .user:
mix("user")
@unknown default:
mix("unknown")
}
mix("|")
mix(content)
for imageURL in imageURLs {
mix("|")
mix(imageURL)
}
return hash
}
}
private struct DecodedImage {
let image: UserInput.Image
let estimatedBytes: Int
} }
// MARK: - HTTP request parser // MARK: - HTTP request parser

View File

@@ -0,0 +1,358 @@
import Foundation
import MLXLMCommon
import os
/// Lifecycle phase of a cached API chat session.
///
/// The raw values are human-readable labels; the monitor renders
/// `phase.rawValue` directly in the session list.
enum APISessionPhase: String, Sendable {
// No request is currently running against the entry.
case idle = "Idle"
// Initial phase of an entry created on a cache miss.
case sessionBuild = "Session Build"
// Phase set on a cache hit and by `markPrefilling`.
case prefilling = "Prefilling"
// Phase set by `markGenerating`, which also records token counts.
case generating = "Generating"
}
/// Bounded cache of API chat sessions keyed by normalized conversation history.
/// The cache is internal-only and safe to sample from the monitor without involving MainActor.
///
/// All mutable state (`entries`, `totals`) is guarded by `lock`; that manual
/// locking is what the `@unchecked Sendable` conformance relies on. Methods
/// whose name ends in `Locked` must only be called while `lock` is held.
///
/// Request lifecycle as seen by this type:
/// 1. `checkoutSession` returns a `Lease` (warm hit or freshly registered miss).
/// 2. `markSessionBuild` / `markPrefilling` / `markGenerating` update telemetry.
/// 3. Exactly one of `completeRequest` (success) or `abandonRequest` (failure)
///    releases the lease.
final class ConversationSessionCache: @unchecked Sendable {
    static let shared = ConversationSessionCache()

    /// Guards `entries` and `totals`.
    private let lock = OSAllocatedUnfairLock()
    /// Upper bound on the number of entries kept after a request completes.
    private let maxEntries = 8
    /// Upper bound on the summed `cachedTokenEstimate` of all entries.
    private let maxCachedTokens = 256_000
    /// Idle entries older than this (10 minutes) are pruned.
    private let idleTTL: TimeInterval = 10 * 60
    private var entries: [UUID: Entry] = [:]
    private var totals = Totals()

    private init() {}

    /// Result of `checkoutSession`.
    struct Lease {
        /// Identifier to pass to the `mark…`, `completeRequest` and `abandonRequest` calls.
        let entryId: UUID
        /// The warm session on a cache hit; `nil` on a miss.
        let session: ChatSession?
        /// Token count credited as reused on a hit; `0` on a miss.
        let reusedPromptTokens: Int
        /// `true` when an existing warm entry was matched.
        let cacheHit: Bool
    }

    /// Immutable, `Sendable` view of one cache entry for the monitor UI.
    struct SessionSummary: Identifiable, Sendable {
        let id: UUID
        let modelId: String
        let phase: APISessionPhase
        let messageCount: Int
        let cachedTokenEstimate: Int
        let estimatedBytes: Int
        let inFlightRequests: Int
        let hitCount: Int
        let lastPromptTokens: Int
        let lastCompletionTokens: Int
        let lastReuseTokens: Int
        let createdAt: Date
        let lastAccessAt: Date
    }

    /// Point-in-time aggregate of the cache plus per-session summaries.
    struct Snapshot: Sendable {
        let totalEntries: Int
        /// Entries that currently hold a session.
        let warmEntries: Int
        /// Entries with at least one in-flight request.
        let activeEntries: Int
        /// Entries whose phase is `.generating`.
        let generatingEntries: Int
        let estimatedBytes: Int
        let cachedTokenEstimate: Int
        let totalHits: Int
        let totalMisses: Int
        let totalEvictions: Int
        let totalReusePromptTokens: Int
        let totalRebuildPromptTokens: Int
        /// Sorted with in-flight entries first, then most recently accessed first.
        let sessions: [SessionSummary]
    }

    /// Finds a reusable warm session for the request, or registers a new cold entry.
    ///
    /// A warm entry qualifies when it has the same model id and instructions hash,
    /// holds a session, has no request in flight, and its stored signatures are a
    /// prefix of `historySignatures` with at most one extra incoming message. Among
    /// qualifying entries the one with the longest stored history wins.
    ///
    /// - Returns: A lease with `cacheHit == true` and the warm session, or a lease
    ///   for a newly created entry (`session == nil`) in phase `.sessionBuild`.
    func checkoutSession(
        modelId: String,
        instructions: String,
        historySignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int
    ) -> Lease {
        lock.lock()
        defer { lock.unlock() }

        let now = Date()
        pruneExpiredLocked(now: now)

        let instructionsHash = Self.stableHash(instructions)
        let match = entries.values
            .filter {
                $0.modelId == modelId
                    && $0.instructionsHash == instructionsHash
                    && $0.session != nil
                    && $0.inFlightRequests == 0
                    && Self.historyMatches(cached: $0.requestMessageSignatures, incoming: historySignatures)
            }
            .max { lhs, rhs in
                lhs.requestMessageSignatures.count < rhs.requestMessageSignatures.count
            }

        if var entry = match {
            entry.inFlightRequests += 1
            entry.lastAccessAt = now
            entry.phase = .prefilling
            // NOTE(review): reuse is credited as the larger of the cached estimate and
            // the caller's prompt estimate; confirm against the caller what
            // `estimatedPromptTokens` covers before changing this.
            entry.lastReuseTokens = max(entry.cachedTokenEstimate, estimatedPromptTokens)
            entry.hitCount += 1
            entries[entry.id] = entry
            totals.totalHits += 1
            totals.totalReusePromptTokens += entry.lastReuseTokens
            return Lease(
                entryId: entry.id,
                session: entry.session,
                reusedPromptTokens: entry.lastReuseTokens,
                cacheHit: true
            )
        }

        // Miss: register a cold entry so the request is visible in snapshots while
        // it runs. It becomes warm only once `completeRequest` attaches a session.
        let entryId = UUID()
        entries[entryId] = Entry(
            id: entryId,
            modelId: modelId,
            instructionsHash: instructionsHash,
            requestMessageSignatures: historySignatures,
            messageCount: requestMessageCount,
            cachedTokenEstimate: estimatedPromptTokens,
            estimatedBytes: estimatedBytes,
            createdAt: now,
            lastAccessAt: now,
            inFlightRequests: 1,
            hitCount: 0,
            phase: .sessionBuild,
            lastPromptTokens: 0,
            lastCompletionTokens: 0,
            lastReuseTokens: 0,
            session: nil
        )
        totals.totalMisses += 1
        totals.totalRebuildPromptTokens += estimatedPromptTokens
        return Lease(entryId: entryId, session: nil, reusedPromptTokens: 0, cacheHit: false)
    }

    /// Marks the entry as building its session. No-op for unknown ids.
    func markSessionBuild(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .sessionBuild)
    }

    /// Marks the entry as prefilling. No-op for unknown ids.
    func markPrefilling(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .prefilling)
    }

    /// Marks the entry as generating and records the latest token counts.
    ///
    /// The cached-token estimate only ever grows here: it becomes the maximum of
    /// its current value and `promptTokens + completionTokens`. No-op for unknown ids.
    func markGenerating(entryId: UUID, promptTokens: Int, completionTokens: Int) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.phase = .generating
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.cachedTokenEstimate = max(entry.cachedTokenEstimate, promptTokens + completionTokens)
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Releases a lease after a successful request and stores the session for reuse.
    ///
    /// Replaces the entry's signatures and size estimates with the values of the
    /// request that just finished, returns the entry to `.idle`, and then enforces
    /// the entry/token budget. If the entry no longer exists (for example after
    /// `invalidateAll`), the session is not cached.
    func completeRequest(
        entryId: UUID,
        session: ChatSession,
        requestMessageSignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int,
        promptTokens: Int,
        completionTokens: Int
    ) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        let now = Date()
        entry.session = session
        entry.requestMessageSignatures = requestMessageSignatures
        entry.messageCount = requestMessageCount
        entry.cachedTokenEstimate = max(estimatedPromptTokens, promptTokens + completionTokens)
        entry.estimatedBytes = estimatedBytes
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.lastAccessAt = now
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
        entry.phase = .idle
        entries[entryId] = entry
        enforceBudgetLocked(now: now)
    }

    /// Releases a lease after a failed request.
    ///
    /// Once no request remains in flight the entry is removed, whether or not it
    /// holds a session. A warm session that took part in a failed request may have
    /// already consumed some of that turn, while the entry would still be keyed by
    /// the previous history; handing it out again could replay the turn on top of
    /// a diverged context. Dropping a warm entry is counted as an eviction.
    func abandonRequest(entryId: UUID) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)

        if entry.inFlightRequests == 0 {
            if entry.session != nil {
                totals.totalEvictions += 1
            }
            entries.removeValue(forKey: entryId)
        } else {
            entry.phase = .idle
            entry.lastAccessAt = Date()
            entries[entryId] = entry
        }
    }

    /// Drops every entry (including in-flight ones) and counts them as evictions.
    func invalidateAll() {
        lock.lock()
        defer { lock.unlock() }

        totals.totalEvictions += entries.count
        entries.removeAll()
    }

    /// Drops every entry and zeroes the cumulative counters.
    func reset() {
        lock.lock()
        defer { lock.unlock() }

        entries.removeAll()
        totals = Totals()
    }

    /// Returns an aggregate view of the cache.
    ///
    /// Expired idle entries are pruned first, so taking a snapshot can itself
    /// evict entries.
    func snapshot() -> Snapshot {
        lock.lock()
        defer { lock.unlock() }

        pruneExpiredLocked(now: Date())

        let allEntries = Array(entries.values)
        let sessions = allEntries
            .sorted {
                if $0.inFlightRequests != $1.inFlightRequests {
                    return $0.inFlightRequests > $1.inFlightRequests
                }
                return $0.lastAccessAt > $1.lastAccessAt
            }
            .map {
                SessionSummary(
                    id: $0.id,
                    modelId: $0.modelId,
                    phase: $0.phase,
                    messageCount: $0.messageCount,
                    cachedTokenEstimate: $0.cachedTokenEstimate,
                    estimatedBytes: $0.estimatedBytes,
                    inFlightRequests: $0.inFlightRequests,
                    hitCount: $0.hitCount,
                    lastPromptTokens: $0.lastPromptTokens,
                    lastCompletionTokens: $0.lastCompletionTokens,
                    lastReuseTokens: $0.lastReuseTokens,
                    createdAt: $0.createdAt,
                    lastAccessAt: $0.lastAccessAt
                )
            }

        return Snapshot(
            totalEntries: allEntries.count,
            warmEntries: allEntries.filter { $0.session != nil }.count,
            activeEntries: allEntries.filter { $0.inFlightRequests > 0 }.count,
            generatingEntries: allEntries.filter { $0.phase == .generating }.count,
            estimatedBytes: allEntries.reduce(0) { $0 + $1.estimatedBytes },
            cachedTokenEstimate: allEntries.reduce(0) { $0 + $1.cachedTokenEstimate },
            totalHits: totals.totalHits,
            totalMisses: totals.totalMisses,
            totalEvictions: totals.totalEvictions,
            totalReusePromptTokens: totals.totalReusePromptTokens,
            totalRebuildPromptTokens: totals.totalRebuildPromptTokens,
            sessions: sessions
        )
    }

    /// Sets the phase and refreshes the access time of an entry. No-op for unknown ids.
    private func updatePhase(entryId: UUID, phase: APISessionPhase) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.phase = phase
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Removes idle entries whose last access is older than `idleTTL`.
    /// Caller must hold `lock`.
    private func pruneExpiredLocked(now: Date) {
        let expiredIds = entries.values
            .filter { $0.inFlightRequests == 0 && now.timeIntervalSince($0.lastAccessAt) > idleTTL }
            .map(\.id)
        guard !expiredIds.isEmpty else { return }

        for id in expiredIds {
            entries.removeValue(forKey: id)
        }
        totals.totalEvictions += expiredIds.count
    }

    /// Evicts idle entries until both the entry-count and token budgets hold.
    /// Caller must hold `lock`.
    ///
    /// Entries with a request in flight are never evicted, so the budgets can be
    /// exceeded temporarily while every remaining entry is busy.
    private func enforceBudgetLocked(now: Date) {
        pruneExpiredLocked(now: now)

        // Keep a running total instead of re-summing every entry per eviction.
        var cachedTokens = entries.values.reduce(0) { $0 + $1.cachedTokenEstimate }

        while entries.count > maxEntries || cachedTokens > maxCachedTokens {
            // `min(by:)` picks the next victim in one pass; no full sort is needed.
            guard let victim = entries.values
                .filter({ $0.inFlightRequests == 0 })
                .min(by: evictionOrder)
            else {
                break
            }
            entries.removeValue(forKey: victim.id)
            cachedTokens -= victim.cachedTokenEstimate
            totals.totalEvictions += 1
        }
    }

    /// Strict ordering of eviction candidates: `true` when `lhs` should be evicted
    /// before `rhs`. Least recently accessed first, then larger token estimate,
    /// then older creation time.
    private func evictionOrder(lhs: Entry, rhs: Entry) -> Bool {
        if lhs.lastAccessAt != rhs.lastAccessAt {
            return lhs.lastAccessAt < rhs.lastAccessAt
        }
        if lhs.cachedTokenEstimate != rhs.cachedTokenEstimate {
            return lhs.cachedTokenEstimate > rhs.cachedTokenEstimate
        }
        return lhs.createdAt < rhs.createdAt
    }

    /// Returns `true` when `cached` is a prefix of `incoming` and `incoming` has at
    /// most one message more than `cached`.
    private static func historyMatches(cached: [UInt64], incoming: [UInt64]) -> Bool {
        guard cached.count <= incoming.count,
              incoming.count <= cached.count + 1 else { return false }
        // `zip` stops at the shorter sequence, i.e. at the end of `cached`.
        return zip(cached, incoming).allSatisfy { $0 == $1 }
    }

    /// 64-bit FNV-1a hash of the UTF-8 bytes of `text`.
    ///
    /// Unlike `Hasher`, the result does not depend on per-process seeding.
    static func stableHash(_ text: String) -> UInt64 {
        var hash: UInt64 = 14_695_981_039_346_656_037 // FNV-1a 64-bit offset basis
        for byte in text.utf8 {
            hash ^= UInt64(byte)
            hash &*= 1_099_511_628_211 // FNV 64-bit prime; wrapping multiply
        }
        return hash
    }

    /// One cached conversation. Value type: mutate a copy, then store it back
    /// into `entries`.
    private struct Entry {
        let id: UUID
        let modelId: String
        let instructionsHash: UInt64
        /// Per-message signatures of the conversation this entry represents.
        var requestMessageSignatures: [UInt64]
        var messageCount: Int
        var cachedTokenEstimate: Int
        var estimatedBytes: Int
        let createdAt: Date
        var lastAccessAt: Date
        var inFlightRequests: Int
        var hitCount: Int
        var phase: APISessionPhase
        var lastPromptTokens: Int
        var lastCompletionTokens: Int
        var lastReuseTokens: Int
        /// `nil` until the first successful `completeRequest`.
        var session: ChatSession?
    }

    /// Cumulative counters; cleared only by `reset()`.
    private struct Totals {
        var totalHits: Int = 0
        var totalMisses: Int = 0
        var totalEvictions: Int = 0
        var totalReusePromptTokens: Int = 0
        var totalRebuildPromptTokens: Int = 0
    }
}

View File

@@ -6,28 +6,31 @@ import SwiftUI
struct MonitorView: View { struct MonitorView: View {
let stats: InferenceStats let stats: InferenceStats
@Environment(ModelManager.self) private var modelManager @Environment(ModelManager.self) private var modelManager
private let chartColumns = [GridItem(.flexible(minimum: 260), spacing: 16), GridItem(.flexible(minimum: 260), spacing: 16)]
private let cardColumns = [GridItem(.flexible(minimum: 180), spacing: 16), GridItem(.flexible(minimum: 180), spacing: 16)]
var body: some View { var body: some View {
ScrollView { ScrollView {
VStack(spacing: 20) { VStack(spacing: 20) {
// Live status header
liveStatusSection liveStatusSection
// Charts LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
HStack(alignment: .top, spacing: 16) {
tokenRateChart tokenRateChart
tokenThroughputChart tokenThroughputChart
cacheReuseChart
cacheFootprintChart
cacheSessionChart
} }
// Gauges row LazyVGrid(columns: cardColumns, alignment: .leading, spacing: 16) {
HStack(spacing: 16) {
contextGauge contextGauge
gpuMemoryGauge gpuMemoryGauge
requestsCard requestsCard
cacheCard
} }
// Cumulative stats
cumulativeSection cumulativeSection
sessionSection
} }
.padding(20) .padding(20)
} }
@@ -39,14 +42,14 @@ struct MonitorView: View {
@ViewBuilder @ViewBuilder
private var liveStatusSection: some View { private var liveStatusSection: some View {
VStack(alignment: .leading, spacing: 12) {
HStack(spacing: 16) { HStack(spacing: 16) {
// Activity indicator
HStack(spacing: 8) { HStack(spacing: 8) {
Circle() Circle()
.fill(activityColor) .fill(activityColor)
.frame(width: 10, height: 10) .frame(width: 10, height: 10)
.overlay { .overlay {
if stats.isGenerating || stats.isPrefilling { if stats.activeRequests > 0 {
Circle() Circle()
.stroke(activityColor.opacity(0.5), lineWidth: 2) .stroke(activityColor.opacity(0.5), lineWidth: 2)
.scaleEffect(1.8) .scaleEffect(1.8)
@@ -80,20 +83,33 @@ struct MonitorView: View {
.font(.callout) .font(.callout)
} }
} }
HStack(spacing: 8) {
phaseChip(title: "Preparing", count: stats.preparingRequests, color: .secondary)
phaseChip(title: "Session Build", count: stats.sessionBuildRequests, color: .purple)
phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue)
phaseChip(title: "Generating", count: stats.generatingRequests, color: .green)
phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange)
}
}
.padding(12) .padding(12)
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
} }
private var activityColor: Color { private var activityColor: Color {
if stats.isPrefilling { return .blue }
if stats.isGenerating { return .green } if stats.isGenerating { return .green }
if stats.prefillingRequests > 0 { return .blue }
if stats.sessionBuildRequests > 0 { return .purple }
if stats.preparingRequests > 0 { return .orange }
if stats.activeRequests > 0 { return .orange } if stats.activeRequests > 0 { return .orange }
return .secondary return .secondary
} }
private var activityLabel: String { private var activityLabel: String {
if stats.isPrefilling { return "Prefilling" }
if stats.isGenerating { return "Generating" } if stats.isGenerating { return "Generating" }
if stats.prefillingRequests > 0 { return "Prefilling" }
if stats.sessionBuildRequests > 0 { return "Building Sessions" }
if stats.preparingRequests > 0 { return "Preparing Requests" }
if stats.activeRequests > 0 { return "Processing" } if stats.activeRequests > 0 { return "Processing" }
return "Idle" return "Idle"
} }
@@ -145,6 +161,160 @@ struct MonitorView: View {
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
} }
/// Bar chart card titled "Prefill Reuse (/sec)".
///
/// Plots `stats.cacheReuseHistory` as green bars and `stats.cacheRebuildHistory`
/// as red bars against their timestamps, followed by a two-item legend.
@ViewBuilder
private var cacheReuseChart: some View {
VStack(alignment: .leading, spacing: 6) {
Text("Prefill Reuse (/sec)")
.font(.caption.bold())
.foregroundStyle(.secondary)
Chart {
// Reused series.
ForEach(stats.cacheReuseHistory) { point in
BarMark(
x: .value("Time", point.timestamp),
y: .value("Tokens", point.value)
)
.foregroundStyle(.green.opacity(0.75))
}
// Rebuilt series.
ForEach(stats.cacheRebuildHistory) { point in
BarMark(
x: .value("Time", point.timestamp),
y: .value("Tokens", point.value)
)
.foregroundStyle(.red.opacity(0.65))
}
}
// Grid line every 30 seconds; no x-axis labels.
.chartXAxis {
AxisMarks(values: .stride(by: .second, count: 30)) { _ in
AxisGridLine()
}
}
// Leading y-axis with labels formatted without decimals.
.chartYAxis {
AxisMarks(position: .leading) { value in
AxisGridLine()
AxisValueLabel {
if let v = value.as(Double.self) {
Text(String(format: "%.0f", v))
.font(.caption2.monospacedDigit())
}
}
}
}
.frame(height: 150)
// Legend; colors mirror the two series above.
HStack(spacing: 12) {
Label("Reused", systemImage: "circle.fill")
.font(.caption2)
.foregroundStyle(.green)
Label("Rebuilt", systemImage: "circle.fill")
.font(.caption2)
.foregroundStyle(.red)
}
}
.padding(12)
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
/// Line-and-area chart card titled "Cache Footprint (est)".
///
/// Plots `stats.cacheFootprintHistory` in orange, dividing each sample by
/// 1_048_576 and labelling the axis value "MB".
/// NOTE(review): presumably the samples are byte counts — confirm at the producer.
@ViewBuilder
private var cacheFootprintChart: some View {
VStack(alignment: .leading, spacing: 6) {
Text("Cache Footprint (est)")
.font(.caption.bold())
.foregroundStyle(.secondary)
Chart(stats.cacheFootprintHistory) { point in
// Outline of the series.
LineMark(
x: .value("Time", point.timestamp),
y: .value("MB", point.value / 1_048_576)
)
.foregroundStyle(.orange)
.interpolationMethod(.monotone)
// Translucent fill drawn from the same values as the line.
AreaMark(
x: .value("Time", point.timestamp),
y: .value("MB", point.value / 1_048_576)
)
.foregroundStyle(.orange.opacity(0.12))
.interpolationMethod(.monotone)
}
// Grid line every 30 seconds; no x-axis labels.
.chartXAxis {
AxisMarks(values: .stride(by: .second, count: 30)) { _ in
AxisGridLine()
}
}
// Leading y-axis with one decimal place.
.chartYAxis {
AxisMarks(position: .leading) { value in
AxisGridLine()
AxisValueLabel {
if let v = value.as(Double.self) {
Text(String(format: "%.1f", v))
.font(.caption2.monospacedDigit())
}
}
}
}
.frame(height: 150)
}
.padding(12)
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
/// Line chart card titled "Cached Sessions".
///
/// Plots `stats.cacheEntryHistory` in purple ("Cached") and
/// `stats.activeSessionHistory` in blue ("Active"), followed by a legend.
@ViewBuilder
private var cacheSessionChart: some View {
VStack(alignment: .leading, spacing: 6) {
Text("Cached Sessions")
.font(.caption.bold())
.foregroundStyle(.secondary)
Chart {
// Cached series.
ForEach(stats.cacheEntryHistory) { point in
LineMark(
x: .value("Time", point.timestamp),
y: .value("Cached", point.value)
)
.foregroundStyle(.purple)
.interpolationMethod(.monotone)
}
// Active series.
ForEach(stats.activeSessionHistory) { point in
LineMark(
x: .value("Time", point.timestamp),
y: .value("Active", point.value)
)
.foregroundStyle(.blue)
.interpolationMethod(.monotone)
}
}
// Grid line every 30 seconds; no x-axis labels.
.chartXAxis {
AxisMarks(values: .stride(by: .second, count: 30)) { _ in
AxisGridLine()
}
}
// Leading y-axis with labels formatted without decimals.
.chartYAxis {
AxisMarks(position: .leading) { value in
AxisGridLine()
AxisValueLabel {
if let v = value.as(Double.self) {
Text(String(format: "%.0f", v))
.font(.caption2.monospacedDigit())
}
}
}
}
.frame(height: 150)
// Legend; colors mirror the two series above.
HStack(spacing: 12) {
Label("Cached", systemImage: "circle.fill")
.font(.caption2)
.foregroundStyle(.purple)
Label("Active", systemImage: "circle.fill")
.font(.caption2)
.foregroundStyle(.blue)
}
}
.padding(12)
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
private var maxTokenRate: Double { private var maxTokenRate: Double {
stats.tokenRateHistory.map(\.value).max() ?? 10 stats.tokenRateHistory.map(\.value).max() ?? 10
} }
@@ -303,35 +473,69 @@ struct MonitorView: View {
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
} }
/// Summary card titled "Session Cache".
///
/// Shows `stats.cacheEntryCount` as the headline number, then labelled rows for
/// warm entries, active entries, estimated footprint, cached tokens and hit rate.
@ViewBuilder
private var cacheCard: some View {
VStack(alignment: .leading, spacing: 8) {
Text("Session Cache")
.font(.caption.bold())
.foregroundStyle(.secondary)
// Headline: total number of cache entries.
Text("\(stats.cacheEntryCount)")
.font(.title3.monospacedDigit().bold())
LabeledContent("Warm") {
Text("\(stats.warmCacheEntryCount)")
.monospacedDigit()
}
.font(.caption)
LabeledContent("Active") {
Text("\(stats.activeCacheEntryCount)")
.monospacedDigit()
}
.font(.caption)
LabeledContent("Est. Footprint") {
Text(formatByteCount(stats.cacheEstimatedBytes))
.monospacedDigit()
}
.font(.caption)
LabeledContent("Cached Tokens") {
Text(formatTokenCount(stats.cacheEstimatedTokens))
.monospacedDigit()
}
.font(.caption)
// `cacheHitRate` is a 0...1 fraction; shown as a whole-number percentage.
LabeledContent("Hit Rate") {
Text(String(format: "%.0f%%", cacheHitRate * 100))
.monospacedDigit()
}
.font(.caption)
}
.frame(maxWidth: .infinity, alignment: .leading)
.padding(12)
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
// MARK: - Cumulative // MARK: - Cumulative
@ViewBuilder @ViewBuilder
private var cumulativeSection: some View { private var cumulativeSection: some View {
HStack(spacing: 24) { VStack(alignment: .leading, spacing: 10) {
VStack(spacing: 2) { Text("Cumulative")
Text("Total Prompt Tokens") .font(.caption.bold())
.font(.caption2)
.foregroundStyle(.secondary) .foregroundStyle(.secondary)
Text(formatTokenCount(stats.totalPromptTokens))
.font(.callout.monospacedDigit().bold())
.foregroundStyle(.blue)
}
VStack(spacing: 2) { LazyVGrid(columns: cardColumns, alignment: .leading, spacing: 12) {
Text("Total Generated Tokens") statTile(title: "Prompt Tokens", value: formatTokenCount(stats.totalPromptTokens), color: .blue)
.font(.caption2) statTile(title: "Generated Tokens", value: formatTokenCount(stats.totalGenerationTokens), color: .orange)
.foregroundStyle(.secondary) statTile(title: "Cache Hits", value: "\(stats.totalCacheHits)", color: .green)
Text(formatTokenCount(stats.totalGenerationTokens)) statTile(title: "Cache Misses", value: "\(stats.totalCacheMisses)", color: .red)
.font(.callout.monospacedDigit().bold()) statTile(title: "Reused Prefill", value: formatTokenCount(stats.totalCacheReusePromptTokens), color: .green)
.foregroundStyle(.orange) statTile(title: "Rebuilt Prefill", value: formatTokenCount(stats.totalCacheRebuildPromptTokens), color: .red)
} statTile(title: "Evictions", value: "\(stats.totalCacheEvictions)", color: .secondary)
statTile(title: "Total Tokens", value: formatTokenCount(stats.totalPromptTokens + stats.totalGenerationTokens), color: .primary)
VStack(spacing: 2) {
Text("Total Tokens")
.font(.caption2)
.foregroundStyle(.secondary)
Text(formatTokenCount(stats.totalPromptTokens + stats.totalGenerationTokens))
.font(.callout.monospacedDigit().bold())
} }
} }
.frame(maxWidth: .infinity) .frame(maxWidth: .infinity)
@@ -339,8 +543,129 @@ struct MonitorView: View {
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
} }
/// Card listing every entry in `stats.cachedSessions`.
///
/// The header shows the title and the number of listed sessions; the body is
/// either a placeholder text (empty list) or one `sessionRow` per session.
@ViewBuilder
private var sessionSection: some View {
VStack(alignment: .leading, spacing: 12) {
HStack {
Text("Cached Chat Sessions")
.font(.headline)
Spacer()
Text("\(stats.cachedSessions.count) visible")
.font(.caption)
.foregroundStyle(.secondary)
}
if stats.cachedSessions.isEmpty {
// Placeholder shown when there is nothing to list.
Text("No cached sessions yet.")
.font(.callout)
.foregroundStyle(.secondary)
.frame(maxWidth: .infinity, alignment: .leading)
} else {
ForEach(stats.cachedSessions) { session in
sessionRow(session)
}
}
}
.frame(maxWidth: .infinity, alignment: .leading)
.padding(12)
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
// MARK: - Helpers // MARK: - Helpers
/// Capsule-shaped chip showing a colored dot, a title and a count.
///
/// - Parameters:
///   - title: Label shown next to the dot.
///   - count: Number shown after the title, in monospaced digits.
///   - color: Fill of the dot; also used at 12% opacity for the capsule background.
@ViewBuilder
private func phaseChip(title: String, count: Int, color: Color) -> some View {
HStack(spacing: 6) {
Circle()
.fill(color)
.frame(width: 7, height: 7)
Text(title)
Text("\(count)")
.monospacedDigit()
}
.font(.caption)
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(color.opacity(0.12), in: Capsule())
}
/// Full-width tile with a small secondary title above a bold value.
///
/// - Parameters:
///   - title: Caption shown in secondary style.
///   - value: Pre-formatted value text.
///   - color: Foreground style applied to the value only.
@ViewBuilder
private func statTile(title: String, value: String, color: Color) -> some View {
VStack(alignment: .leading, spacing: 4) {
Text(title)
.font(.caption2)
.foregroundStyle(.secondary)
Text(value)
.font(.callout.monospacedDigit().bold())
.foregroundStyle(color)
}
.frame(maxWidth: .infinity, alignment: .leading)
.padding(10)
.background(Color.primary.opacity(0.04), in: RoundedRectangle(cornerRadius: 8))
}
/// Detail row for one cached session.
///
/// Shows the model id with a phase-colored dot, a phase badge, two rows of
/// metrics, and a progress bar for the session's cached tokens relative to the
/// maximum context (see `maxContextRatio(for:)`).
///
/// - Parameter session: Summary of the cache entry to render.
@ViewBuilder
private func sessionRow(_ session: ConversationSessionCache.SessionSummary) -> some View {
VStack(alignment: .leading, spacing: 10) {
// Header: status dot + model id on the left, phase badge on the right.
HStack(alignment: .firstTextBaseline) {
HStack(spacing: 8) {
Circle()
.fill(color(for: session.phase))
.frame(width: 8, height: 8)
Text(session.modelId)
.font(.callout.weight(.semibold))
.lineLimit(1)
}
Spacer()
Text(session.phase.rawValue)
.font(.caption.monospacedDigit())
.padding(.horizontal, 8)
.padding(.vertical, 4)
.background(color(for: session.phase).opacity(0.14), in: Capsule())
}
// Size, reuse and activity metrics.
HStack(spacing: 12) {
sessionMetric("Msgs", "\(session.messageCount)")
sessionMetric("Cached", formatTokenCount(session.cachedTokenEstimate))
sessionMetric("Reuse", formatTokenCount(session.lastReuseTokens))
sessionMetric("Footprint", formatByteCount(session.estimatedBytes))
sessionMetric("Hits", "\(session.hitCount)")
sessionMetric("Active", "\(session.inFlightRequests)")
}
// Token counts of the most recent request and time since last access.
HStack(spacing: 12) {
sessionMetric("Prompt", formatTokenCount(session.lastPromptTokens))
sessionMetric("Completion", formatTokenCount(session.lastCompletionTokens))
sessionMetric("Last Access", relativeTimeString(session.lastAccessAt))
}
// `ratio` is clamped to at most 1 by `maxContextRatio(for:)`.
let ratio = maxContextRatio(for: session.cachedTokenEstimate)
ProgressView(value: ratio) {
Text("Cached Context")
.font(.caption2)
.foregroundStyle(.secondary)
} currentValueLabel: {
Text("\(Int(ratio * 100))%")
.font(.caption2.monospacedDigit())
.foregroundStyle(.secondary)
}
.tint(color(for: session.phase))
}
.padding(12)
.background(Color.primary.opacity(0.035), in: RoundedRectangle(cornerRadius: 10))
}
/// Compact metric cell: a secondary caption above a bold monospaced value.
///
/// - Parameters:
///   - title: Caption text.
///   - value: Pre-formatted value text.
@ViewBuilder
private func sessionMetric(_ title: String, _ value: String) -> some View {
VStack(alignment: .leading, spacing: 2) {
Text(title)
.font(.caption2)
.foregroundStyle(.secondary)
Text(value)
.font(.caption.monospacedDigit().bold())
}
}
private func formatTokenCount(_ count: Int) -> String { private func formatTokenCount(_ count: Int) -> String {
if count >= 1_000_000 { if count >= 1_000_000 {
return String(format: "%.1fM", Double(count) / 1_000_000) return String(format: "%.1fM", Double(count) / 1_000_000)
@@ -349,4 +674,52 @@ struct MonitorView: View {
} }
return "\(count)" return "\(count)"
} }
/// Formats a byte count as "N B", "N KB" (no decimals) or "N.N MB" (one decimal),
/// using 1024-based units.
///
/// - Parameter count: Size in bytes.
/// - Returns: The formatted string; values below 1024 are printed verbatim with " B".
private func formatByteCount(_ count: Int) -> String {
    let kibibyte = 1024.0
    let mebibyte = 1_048_576.0

    switch Double(count) {
    case let size where size >= mebibyte:
        return String(format: "%.1f MB", size / mebibyte)
    case let size where size >= kibibyte:
        return String(format: "%.0f KB", size / kibibyte)
    default:
        return "\(count) B"
    }
}
/// Describes how long ago `date` was, using the largest whole unit among
/// seconds ("s"), minutes ("m") and hours ("h").
///
/// - Parameter date: The instant to compare against the current time.
/// - Returns: For example "12s", "5m" or "3h"; dates in the future yield "0s".
private func relativeTimeString(_ date: Date) -> String {
    // Clamp so that future dates never produce a negative age.
    let elapsed = max(0, Int(Date.now.timeIntervalSince(date)))
    guard elapsed >= 60 else { return "\(elapsed)s" }

    let wholeMinutes = elapsed / 60
    guard wholeMinutes >= 60 else { return "\(wholeMinutes)m" }

    return "\(wholeMinutes / 60)h"
}
/// Maps a session phase to the accent color used for its dot, badge and
/// progress tint. The switch is exhaustive on purpose, so adding a phase
/// fails to compile until a color is chosen for it.
private func color(for phase: APISessionPhase) -> Color {
    switch phase {
    case .idle: .secondary
    case .sessionBuild: .purple
    case .prefilling: .blue
    case .generating: .green
    }
}
/// Fraction of cache lookups that were hits, in `0...1`.
/// Returns `0` while no hit or miss has been recorded.
private var cacheHitRate: Double {
    let hits = stats.totalCacheHits
    let lookups = hits + stats.totalCacheMisses
    return lookups > 0 ? Double(hits) / Double(lookups) : 0
}
/// Expresses `tokens` as a fraction of the largest known context size.
///
/// The context size is the larger of `stats.contextMax` and the current model's
/// `contextLength` (treated as 0 when no model is set).
///
/// - Parameter tokens: Token count to scale.
/// - Returns: A value capped at `1`; `0` when no positive context size is known.
private func maxContextRatio(for tokens: Int) -> Double {
    let modelLimit = modelManager.currentModel?.contextLength ?? 0
    let ceiling = max(stats.contextMax, modelLimit)
    if ceiling <= 0 {
        return 0
    }
    return min(1, Double(tokens) / Double(ceiling))
}
} }