From 49bd165ce7dffce957f7ebeda0ca11654abd67aa Mon Sep 17 00:00:00 2001 From: Chili Palmer Date: Thu, 19 Mar 2026 11:30:18 +0100 Subject: [PATCH] fix: more telemetry and tighter implementation of cache --- MLXServer.xcodeproj/project.pbxproj | 4 + MLXServer/Models/InferenceStats.swift | 177 +++++- MLXServer/Server/APIModels.swift | 52 ++ MLXServer/Server/APIServer.swift | 210 +++++--- .../Server/ConversationSessionCache.swift | 358 ++++++++++++ MLXServer/Views/MonitorView.swift | 509 +++++++++++++++--- 6 files changed, 1154 insertions(+), 156 deletions(-) create mode 100644 MLXServer/Server/ConversationSessionCache.swift diff --git a/MLXServer.xcodeproj/project.pbxproj b/MLXServer.xcodeproj/project.pbxproj index 0597bbe..abb58d5 100644 --- a/MLXServer.xcodeproj/project.pbxproj +++ b/MLXServer.xcodeproj/project.pbxproj @@ -42,6 +42,7 @@ D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; }; D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; }; DF5C525DBD2E3153256951C1 /* SceneManagementWindow.swift in Sources */ = {isa = PBXBuildFile; fileRef = BA1592FD260014C4FBDB6995 /* SceneManagementWindow.swift */; }; + F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */; }; F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; }; FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; }; FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; }; @@ -85,6 +86,7 @@ E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; 
lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = ""; }; EF518FEBF3A38E830E3CE1A5 /* FocusedValues.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusedValues.swift; sourceTree = ""; }; F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = ""; }; + FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConversationSessionCache.swift; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -203,6 +205,7 @@ children = ( F1A52E2C9964ADA9D841A89B /* APIModels.swift */, 3D08828E16B17EF02C14243E /* APIServer.swift */, + FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */, E73B165A1822729C907791AE /* ToolCallParser.swift */, 16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */, ); @@ -306,6 +309,7 @@ 85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */, B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */, 5946258F1DE88CE904584E0B /* ContentView.swift in Sources */, + F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */, C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */, 4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */, 2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */, diff --git a/MLXServer/Models/InferenceStats.swift b/MLXServer/Models/InferenceStats.swift index 66c08e2..aaa1208 100644 --- a/MLXServer/Models/InferenceStats.swift +++ b/MLXServer/Models/InferenceStats.swift @@ -9,9 +9,14 @@ final class LiveCounters: @unchecked Sendable { static let shared = LiveCounters() private let lock = OSAllocatedUnfairLock() + private var requestPhases: [String: RequestPhase] = [:] // Current request private var _activeRequests: Int = 0 + private var _preparingRequests: Int = 0 + private var _sessionBuildRequests: 
Int = 0 + private var _prefillRequests: Int = 0 + private var _generatingRequests: Int = 0 private var _promptTokens: Int = 0 private var _generationTokens: Int = 0 private var _tokensPerSecond: Double = 0 @@ -24,9 +29,10 @@ final class LiveCounters: @unchecked Sendable { private var _totalPromptTokens: Int = 0 private var _totalGenerationTokens: Int = 0 - func requestStarted(contextLength: Int) { + func requestStarted(requestId: String, contextLength: Int) { lock.lock() _activeRequests += 1 + _preparingRequests += 1 _totalRequests += 1 _isPrefilling = true _isGenerating = false @@ -34,15 +40,33 @@ final class LiveCounters: @unchecked Sendable { _generationTokens = 0 _tokensPerSecond = 0 _contextMax = contextLength + requestPhases[requestId] = .preparing lock.unlock() } - func prefillCompleted(promptTokens: Int) { + func requestPhaseChanged(requestId: String, phase: RequestPhase) { lock.lock() - _isPrefilling = false - _isGenerating = true + if let current = requestPhases[requestId] { + decrementCount(for: current) + } + incrementCount(for: phase) + requestPhases[requestId] = phase + _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0 + _isGenerating = _generatingRequests > 0 + lock.unlock() + } + + func prefillCompleted(requestId: String, promptTokens: Int) { + lock.lock() + if let current = requestPhases[requestId] { + decrementCount(for: current) + } + incrementCount(for: .generating) + requestPhases[requestId] = .generating _promptTokens = promptTokens _totalPromptTokens += promptTokens + _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0 + _isGenerating = _generatingRequests > 0 lock.unlock() } @@ -53,21 +77,32 @@ final class LiveCounters: @unchecked Sendable { lock.unlock() } - func requestCompleted(generationTokens: Int) { + func requestCompleted(requestId: String, generationTokens: Int) { lock.lock() + if let current = requestPhases.removeValue(forKey: requestId) { + 
decrementCount(for: current) + } _activeRequests = max(0, _activeRequests - 1) _totalGenerationTokens += generationTokens if _activeRequests == 0 { _isGenerating = false _isPrefilling = false _tokensPerSecond = 0 + } else { + _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0 + _isGenerating = _generatingRequests > 0 } lock.unlock() } func reset() { lock.lock() + requestPhases.removeAll() _activeRequests = 0 + _preparingRequests = 0 + _sessionBuildRequests = 0 + _prefillRequests = 0 + _generatingRequests = 0 _promptTokens = 0 _generationTokens = 0 _tokensPerSecond = 0 @@ -85,6 +120,10 @@ final class LiveCounters: @unchecked Sendable { lock.lock() let s = Snapshot( activeRequests: _activeRequests, + preparingRequests: _preparingRequests, + sessionBuildRequests: _sessionBuildRequests, + prefillRequests: _prefillRequests, + generatingRequests: _generatingRequests, promptTokens: _promptTokens, generationTokens: _generationTokens, tokensPerSecond: _tokensPerSecond, @@ -101,6 +140,10 @@ final class LiveCounters: @unchecked Sendable { struct Snapshot { let activeRequests: Int + let preparingRequests: Int + let sessionBuildRequests: Int + let prefillRequests: Int + let generatingRequests: Int let promptTokens: Int let generationTokens: Int let tokensPerSecond: Double @@ -111,6 +154,39 @@ final class LiveCounters: @unchecked Sendable { let totalPromptTokens: Int let totalGenerationTokens: Int } + + private func incrementCount(for phase: RequestPhase) { + switch phase { + case .preparing: + _preparingRequests += 1 + case .sessionBuild: + _sessionBuildRequests += 1 + case .prefilling: + _prefillRequests += 1 + case .generating: + _generatingRequests += 1 + } + } + + private func decrementCount(for phase: RequestPhase) { + switch phase { + case .preparing: + _preparingRequests = max(0, _preparingRequests - 1) + case .sessionBuild: + _sessionBuildRequests = max(0, _sessionBuildRequests - 1) + case .prefilling: + _prefillRequests = max(0, 
_prefillRequests - 1) + case .generating: + _generatingRequests = max(0, _generatingRequests - 1) + } + } + + enum RequestPhase { + case preparing + case sessionBuild + case prefilling + case generating + } } // MARK: - Observable stats for the UI (polls LiveCounters at 1Hz) @@ -121,6 +197,10 @@ final class InferenceStats { // MARK: - Current request state (refreshed from LiveCounters) var activeRequests: Int = 0 + var preparingRequests: Int = 0 + var sessionBuildRequests: Int = 0 + var prefillingRequests: Int = 0 + var generatingRequests: Int = 0 var currentPromptTokens: Int = 0 var currentGenerationTokens: Int = 0 var isGenerating: Bool = false @@ -134,6 +214,21 @@ final class InferenceStats { var totalRequests: Int = 0 var totalPromptTokens: Int = 0 var totalGenerationTokens: Int = 0 + var totalCacheHits: Int = 0 + var totalCacheMisses: Int = 0 + var totalCacheEvictions: Int = 0 + var totalCacheReusePromptTokens: Int = 0 + var totalCacheRebuildPromptTokens: Int = 0 + + // MARK: - Cache state + + var cacheEntryCount: Int = 0 + var warmCacheEntryCount: Int = 0 + var activeCacheEntryCount: Int = 0 + var generatingCacheEntryCount: Int = 0 + var cacheEstimatedBytes: Int = 0 + var cacheEstimatedTokens: Int = 0 + var cachedSessions: [ConversationSessionCache.SessionSummary] = [] // MARK: - Time series data (ring buffers for charts) @@ -146,6 +241,11 @@ final class InferenceStats { private(set) var tokenRateHistory: [DataPoint] = [] private(set) var promptTokenHistory: [DataPoint] = [] private(set) var generationTokenHistory: [DataPoint] = [] + private(set) var cacheEntryHistory: [DataPoint] = [] + private(set) var activeSessionHistory: [DataPoint] = [] + private(set) var cacheFootprintHistory: [DataPoint] = [] + private(set) var cacheReuseHistory: [DataPoint] = [] + private(set) var cacheRebuildHistory: [DataPoint] = [] private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz @@ -153,6 +253,8 @@ final class InferenceStats { private var sampleTimer: Timer? 
private var lastGenerationTokenCount: Int = 0 private var lastPromptTokenCount: Int = 0 + private var lastCacheReuseTokenCount: Int = 0 + private var lastCacheRebuildTokenCount: Int = 0 func startSampling() { guard sampleTimer == nil else { return } @@ -171,8 +273,13 @@ final class InferenceStats { private func recordSample() { // Pull live values from the thread-safe counters let snap = LiveCounters.shared.snapshot() + let cache = ConversationSessionCache.shared.snapshot() activeRequests = snap.activeRequests + preparingRequests = snap.preparingRequests + sessionBuildRequests = snap.sessionBuildRequests + prefillingRequests = snap.prefillRequests + generatingRequests = snap.generatingRequests currentPromptTokens = snap.promptTokens currentGenerationTokens = snap.generationTokens currentTokensPerSecond = snap.tokensPerSecond @@ -183,16 +290,37 @@ final class InferenceStats { totalRequests = snap.totalRequests totalPromptTokens = snap.totalPromptTokens totalGenerationTokens = snap.totalGenerationTokens + totalCacheHits = cache.totalHits + totalCacheMisses = cache.totalMisses + totalCacheEvictions = cache.totalEvictions + totalCacheReusePromptTokens = cache.totalReusePromptTokens + totalCacheRebuildPromptTokens = cache.totalRebuildPromptTokens + cacheEntryCount = cache.totalEntries + warmCacheEntryCount = cache.warmEntries + activeCacheEntryCount = cache.activeEntries + generatingCacheEntryCount = cache.generatingEntries + cacheEstimatedBytes = cache.estimatedBytes + cacheEstimatedTokens = cache.cachedTokenEstimate + cachedSessions = cache.sessions let now = Date.now let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount let promptDelta = snap.totalPromptTokens - lastPromptTokenCount + let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount + let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount lastGenerationTokenCount = snap.totalGenerationTokens lastPromptTokenCount = snap.totalPromptTokens + 
lastCacheReuseTokenCount = cache.totalReusePromptTokens + lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond)) generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta))) promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta))) + cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries))) + activeSessionHistory.append(DataPoint(timestamp: now, value: Double(cache.activeEntries))) + cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes))) + cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta))) + cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta))) if tokenRateHistory.count > Self.maxHistoryPoints { tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints) @@ -203,11 +331,31 @@ final class InferenceStats { if promptTokenHistory.count > Self.maxHistoryPoints { promptTokenHistory.removeFirst(promptTokenHistory.count - Self.maxHistoryPoints) } + if cacheEntryHistory.count > Self.maxHistoryPoints { + cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints) + } + if activeSessionHistory.count > Self.maxHistoryPoints { + activeSessionHistory.removeFirst(activeSessionHistory.count - Self.maxHistoryPoints) + } + if cacheFootprintHistory.count > Self.maxHistoryPoints { + cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints) + } + if cacheReuseHistory.count > Self.maxHistoryPoints { + cacheReuseHistory.removeFirst(cacheReuseHistory.count - Self.maxHistoryPoints) + } + if cacheRebuildHistory.count > Self.maxHistoryPoints { + cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints) + } } func reset() { LiveCounters.shared.reset() + ConversationSessionCache.shared.reset() activeRequests = 0 + preparingRequests = 0 + 
sessionBuildRequests = 0 + prefillingRequests = 0 + generatingRequests = 0 currentPromptTokens = 0 currentGenerationTokens = 0 isGenerating = false @@ -218,10 +366,29 @@ final class InferenceStats { totalRequests = 0 totalPromptTokens = 0 totalGenerationTokens = 0 + totalCacheHits = 0 + totalCacheMisses = 0 + totalCacheEvictions = 0 + totalCacheReusePromptTokens = 0 + totalCacheRebuildPromptTokens = 0 + cacheEntryCount = 0 + warmCacheEntryCount = 0 + activeCacheEntryCount = 0 + generatingCacheEntryCount = 0 + cacheEstimatedBytes = 0 + cacheEstimatedTokens = 0 + cachedSessions.removeAll() tokenRateHistory.removeAll() promptTokenHistory.removeAll() generationTokenHistory.removeAll() + cacheEntryHistory.removeAll() + activeSessionHistory.removeAll() + cacheFootprintHistory.removeAll() + cacheReuseHistory.removeAll() + cacheRebuildHistory.removeAll() lastGenerationTokenCount = 0 lastPromptTokenCount = 0 + lastCacheReuseTokenCount = 0 + lastCacheRebuildTokenCount = 0 } } diff --git a/MLXServer/Server/APIModels.swift b/MLXServer/Server/APIModels.swift index 0841412..61563ee 100644 --- a/MLXServer/Server/APIModels.swift +++ b/MLXServer/Server/APIModels.swift @@ -16,6 +16,50 @@ struct APIToolDefinition: Codable { struct APIFunctionCall: Codable { let name: String let arguments: String // JSON string + + init(name: String, arguments: String) { + self.name = name + self.arguments = arguments + } + + init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + name = try container.decode(String.self, forKey: .name) + + if let argumentString = try? container.decode(String.self, forKey: .arguments) { + arguments = argumentString + return + } + + if let argumentObject = try? container.decode([String: AnyCodable].self, forKey: .arguments) { + let jsonObject = argumentObject.mapValues(\.value) + if let data = try? 
JSONSerialization.data(withJSONObject: jsonObject, options: [.sortedKeys]), + let string = String(data: data, encoding: .utf8) { + arguments = string + } else { + arguments = "{}" + } + return + } + + if let argumentArray = try? container.decode([AnyCodable].self, forKey: .arguments) { + let jsonObject = argumentArray.map(\.value) + if let data = try? JSONSerialization.data(withJSONObject: jsonObject, options: [.sortedKeys]), + let string = String(data: data, encoding: .utf8) { + arguments = string + } else { + arguments = "[]" + } + return + } + + if (try? container.decodeNil(forKey: .arguments)) == true { + arguments = "{}" + return + } + + arguments = "{}" + } } struct APIToolCall: Codable { @@ -30,6 +74,14 @@ struct APIToolCall: Codable { self.type = type self.function = function } + + init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + index = try container.decodeIfPresent(Int.self, forKey: .index) ?? 0 + id = try container.decodeIfPresent(String.self, forKey: .id) ?? "call_\(UUID().uuidString.lowercased())" + type = try container.decodeIfPresent(String.self, forKey: .type) ?? "function" + function = try container.decode(APIFunctionCall.self, forKey: .function) + } } struct APIImageURL: Codable { diff --git a/MLXServer/Server/APIServer.swift b/MLXServer/Server/APIServer.swift index ab6c102..26b8eee 100644 --- a/MLXServer/Server/APIServer.swift +++ b/MLXServer/Server/APIServer.swift @@ -16,12 +16,6 @@ final class APIServer { private var listener: NWListener? private var modelManager: ModelManager? - // Persistent ChatSession for KV cache reuse across requests - private var cachedSession: ChatSession? - private var cachedMessages: [Chat.Message]? - private var cachedModelId: String? 
- private var cachedInstructions: String = "" - func start(modelManager: ModelManager, port: Int = 1234) { guard !isRunning else { return } self.modelManager = modelManager @@ -70,10 +64,7 @@ final class APIServer { listener?.cancel() listener = nil isRunning = false - cachedSession = nil - cachedMessages = nil - cachedModelId = nil - cachedInstructions = "" + ConversationSessionCache.shared.invalidateAll() inferenceStats.stopSampling() } @@ -186,10 +177,7 @@ final class APIServer { if let targetConfig = ModelConfig.resolve(requestedModel) { if modelManager.currentModel?.id != targetConfig.id { print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)") - cachedSession = nil - cachedMessages = nil - cachedModelId = nil - cachedInstructions = "" + ConversationSessionCache.shared.invalidateAll() await modelManager.loadModel(targetConfig) } } @@ -200,10 +188,7 @@ final class APIServer { if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId, let config = ModelConfig.resolve(lastModelId) { print("[APIServer] Reloading idle-unloaded model: \(config.repoId)") - cachedSession = nil - cachedMessages = nil - cachedModelId = nil - cachedInstructions = "" + ConversationSessionCache.shared.invalidateAll() await modelManager.loadModel(config) } @@ -233,9 +218,13 @@ final class APIServer { return } + LiveCounters.shared.requestStarted(requestId: requestId, contextLength: contextLength) + // Convert API messages to Chat.Message, extracting images from content parts var chatMessages: [Chat.Message] = [] + var messageSignatures: [UInt64] = [] var images: [UserInput.Image] = [] + var estimatedBytes = 0 let currentModelRepoId = currentModel?.repoId ?? modelName // Build the instructions string (system prompt + tool definitions). 
@@ -259,8 +248,8 @@ final class APIServer { instructions += toolSystemPrompt } - let toolsForInjection = request.tools let isQwen = currentModelRepoId.lowercased().contains("qwen") + estimatedBytes += instructions.utf8.count // Convert non-system messages to Chat.Message for msg in request.messages where msg.role != "system" { @@ -297,18 +286,25 @@ final class APIServer { // Extract base64 images from content parts let imageURLs = msg.content?.imageURLs ?? [] var messageImages: [UserInput.Image] = [] + var messageImageBytes = 0 for urlString in imageURLs { - if let image = decodeBase64Image(urlString) { - messageImages.append(image) + if let decoded = decodeBase64Image(urlString) { + messageImages.append(decoded.image) + messageImageBytes += decoded.estimatedBytes } } // Attach images to this specific message chatMessages.append(Chat.Message(role: role, content: text, images: messageImages)) + messageSignatures.append( + Self.messageSignature(role: role, content: text, imageURLs: imageURLs) + ) + estimatedBytes += text.utf8.count + messageImageBytes images.append(contentsOf: messageImages) } if !images.isEmpty, currentModel?.supportsImages != true { + LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0) sendResponse( connection: connection, status: 400, @@ -318,18 +314,18 @@ final class APIServer { } // Context window check: estimate token count and reject if over limit + let estimatedPromptTokens = (instructions.count + chatMessages.reduce(0) { $0 + $1.content.count }) * 10 / 35 if contextLength > 0 { - let totalChars = chatMessages.reduce(0) { $0 + $1.content.count } - let estimatedTokens = totalChars * 10 / 35 // ~3.5 chars per token - let needed = estimatedTokens + maxTokens + let needed = estimatedPromptTokens + maxTokens if needed > contextLength { let errorBody = """ {"error":{"message":"This model's maximum context length is \(contextLength) tokens. 
\ - However, your messages resulted in approximately \(estimatedTokens) tokens and \ + However, your messages resulted in approximately \(estimatedPromptTokens) tokens and \ \(maxTokens) tokens were requested for the completion (\(needed) total). \ Please reduce the length of the messages or completion.",\ "type":"invalid_request_error","code":"context_length_exceeded"}} """ + LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0) sendResponse(connection: connection, status: 400, body: errorBody) return } @@ -345,23 +341,28 @@ final class APIServer { let allButLast = Array(chatMessages.dropLast()) let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "") - // KV cache reuse: check if the cached session's history matches - let currentModelId = modelManager.currentModel?.id - let canReuse = cachedSession != nil - && cachedModelId == currentModelId - && cachedMessages != nil - && cachedInstructions == instructions - && messagesMatch(cachedMessages!, allButLast) + let historySignatures = Array(messageSignatures.dropLast()) + let currentModelId = modelManager.currentModel?.id ?? modelName + let lease = ConversationSessionCache.shared.checkoutSession( + modelId: currentModelId, + instructions: instructions, + historySignatures: historySignatures, + requestMessageCount: chatMessages.count, + estimatedPromptTokens: estimatedPromptTokens, + estimatedBytes: estimatedBytes + ) let session: ChatSession - if canReuse { + if let reusableSession = lease.session { print("[APIServer] Reusing cached session (\(allButLast.count) history messages)") - session = cachedSession! 
+ session = reusableSession session.generateParameters = generateParams + ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId) + LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling) } else { - if cachedSession != nil { - print("[APIServer] History diverged, creating fresh session") - } + print("[APIServer] Creating fresh session") + ConversationSessionCache.shared.markSessionBuild(entryId: lease.entryId) + LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .sessionBuild) // Use `instructions:` for system/tool prompt (matches internal chat pattern). // Only conversation turns go in `history:` — this avoids replaying the // large tool prompt as history on every new session. @@ -385,47 +386,62 @@ final class APIServer { additionalContext: thinkingContext ) } + ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId) + LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling) } // Extract images from the last message only (ChatSession.streamDetails takes images separately) let lastImages = lastMessage.images - LiveCounters.shared.requestStarted(contextLength: contextLength) + let result: (promptTokens: Int, completionTokens: Int, succeeded: Bool) if isStream { - await handleStreamingResponse( + result = await handleStreamingResponse( connection: connection, + requestId: requestId, + cacheEntryId: lease.entryId, session: session, prompt: lastMessage.content, images: lastImages, tools: request.tools, - requestId: requestId, created: created, modelName: modelName ) } else { - await handleNonStreamingResponse( + result = await handleNonStreamingResponse( connection: connection, + requestId: requestId, + cacheEntryId: lease.entryId, session: session, prompt: lastMessage.content, images: lastImages, tools: request.tools, - requestId: requestId, created: created, modelName: modelName ) } - // Cache the session for reuse on next request - // allButLast + lastMessage (user) + 
assistant response = new cached history - cachedSession = session - cachedMessages = chatMessages // full messages including the one just sent - cachedModelId = currentModelId - cachedInstructions = instructions + if result.succeeded { + ConversationSessionCache.shared.completeRequest( + entryId: lease.entryId, + session: session, + requestMessageSignatures: messageSignatures, + requestMessageCount: chatMessages.count, + estimatedPromptTokens: estimatedPromptTokens, + estimatedBytes: estimatedBytes, + promptTokens: result.promptTokens, + completionTokens: result.completionTokens + ) + } else { + ConversationSessionCache.shared.abandonRequest(entryId: lease.entryId) + } + + LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens) + modelManager.touchActivity() } /// Decode a base64 data URI (data:image/png;base64,...) into a UserInput.Image. - private func decodeBase64Image(_ urlString: String) -> UserInput.Image? { + private func decodeBase64Image(_ urlString: String) -> DecodedImage? 
{ // Handle data URIs: data:image/png;base64, let base64String: String if urlString.hasPrefix("data:") { @@ -442,21 +458,23 @@ final class APIServer { return nil } - return .ciImage(CIImage(cgImage: cgImage)) + let estimatedBytes = max(data.count, cgImage.width * cgImage.height * 4) + return DecodedImage(image: .ciImage(CIImage(cgImage: cgImage)), estimatedBytes: estimatedBytes) } // MARK: - Non-streaming response private func handleNonStreamingResponse( connection: NWConnection, + requestId: String, + cacheEntryId: UUID, session: ChatSession, prompt: String, images: [UserInput.Image], tools: [APIToolDefinition]?, - requestId: String, created: Int, modelName: String - ) async { + ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) { do { var fullText = "" var promptTokens = 0 @@ -478,7 +496,12 @@ final class APIServer { case .info(let info): promptTokens = info.promptTokenCount completionTokens = info.generationTokenCount - LiveCounters.shared.prefillCompleted(promptTokens: promptTokens) + ConversationSessionCache.shared.markGenerating( + entryId: cacheEntryId, + promptTokens: promptTokens, + completionTokens: completionTokens + ) + LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens) if info.tokensPerSecond > 0 { LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens) } @@ -487,9 +510,6 @@ final class APIServer { } } - LiveCounters.shared.requestCompleted(generationTokens: completionTokens) - modelManager?.touchActivity() - // Parse tool calls: first check framework-detected ones, then our own text parser var finishReason = "stop" var responseContent: String? = fullText @@ -559,10 +579,10 @@ final class APIServer { if let json = try? JSONEncoder().encode(response) { sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? 
"{}") } + return (promptTokens, completionTokens, true) } catch { - LiveCounters.shared.requestCompleted(generationTokens: 0) - modelManager?.touchActivity() sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#) + return (0, 0, false) } } @@ -570,14 +590,15 @@ final class APIServer { private func handleStreamingResponse( connection: NWConnection, + requestId: String, + cacheEntryId: UUID, session: ChatSession, prompt: String, images: [UserInput.Image], tools: [APIToolDefinition]?, - requestId: String, created: Int, modelName: String - ) async { + ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) { // Send SSE headers let header = [ "HTTP/1.1 200 OK", @@ -625,7 +646,16 @@ final class APIServer { ) }() - let (promptTokens, completionTokens, fullText, frameworkToolCalls) = result + let (promptTokens, completionTokens, fullText, frameworkToolCalls, succeeded) = result + + if promptTokens > 0 { + ConversationSessionCache.shared.markGenerating( + entryId: cacheEntryId, + promptTokens: promptTokens, + completionTokens: completionTokens + ) + LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens) + } // Stats were already updated by LiveCounters inside the loop @@ -696,12 +726,10 @@ final class APIServer { ) )) - LiveCounters.shared.requestCompleted(generationTokens: completionTokens) - modelManager?.touchActivity() - // Send [DONE] and close await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!) connection.cancel() + return (promptTokens, completionTokens, succeeded) } /// Run the token generation + SSE send loop entirely off MainActor. 
@@ -713,7 +741,7 @@ final class APIServer { requestId: String, created: Int, modelName: String - ) async -> (Int, Int, String, [MLXLMCommon.ToolCall]) { + ) async -> (Int, Int, String, [MLXLMCommon.ToolCall], Bool) { var promptTokens = 0 var completionTokens = 0 var fullText = "" @@ -742,7 +770,6 @@ final class APIServer { case .info(let info): promptTokens = info.promptTokenCount completionTokens = info.generationTokenCount - LiveCounters.shared.prefillCompleted(promptTokens: promptTokens) if info.tokensPerSecond > 0 { LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens) } @@ -754,9 +781,10 @@ final class APIServer { } catch { let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n" await sendData(connection: connection, data: errorEvent.data(using: .utf8)!) + return (promptTokens, completionTokens, fullText, frameworkToolCalls, false) } - return (promptTokens, completionTokens, fullText, frameworkToolCalls) + return (promptTokens, completionTokens, fullText, frameworkToolCalls, true) } /// Send an SSE event and wait for the protocol stack to process it. @@ -819,26 +847,42 @@ final class APIServer { ] } - /// Check if the cached session can be reused for the new history. - /// - /// After a request the session's KV cache contains: - /// cachedMessages (history + user prompt) + the generated assistant response. - /// On the next request the client sends back the full conversation, so - /// `newHistory` (allButLast) is typically `cachedMessages` + 1 assistant reply. - /// We allow reuse when `cached` is a prefix of `newHistory` and there is at most - /// one extra message (the assistant response the session already generated). - /// More than one extra message (e.g. injected tool results) means the session - /// hasn't processed them, so we must create a fresh session. 
- private func messagesMatch(_ cached: [Chat.Message], _ newHistory: [Chat.Message]) -> Bool { - guard cached.count <= newHistory.count, - newHistory.count <= cached.count + 1 else { return false } - for (a, b) in zip(cached, newHistory) { - if a.role != b.role || a.content != b.content { return false } + private static func messageSignature(role: Chat.Message.Role, content: String, imageURLs: [String]) -> UInt64 { + var hash: UInt64 = 14_695_981_039_346_656_037 + + func mix(_ text: String) { + for byte in text.utf8 { + hash ^= UInt64(byte) + hash &*= 1_099_511_628_211 + } } - return true + + switch role { + case .assistant: + mix("assistant") + case .system: + mix("system") + case .user: + mix("user") + @unknown default: + mix("unknown") + } + mix("|") + mix(content) + for imageURL in imageURLs { + mix("|") + mix(imageURL) + } + + return hash } } +private struct DecodedImage { + let image: UserInput.Image + let estimatedBytes: Int +} + // MARK: - HTTP request parser private struct HTTPRequest {
diff --git a/MLXServer/Server/ConversationSessionCache.swift b/MLXServer/Server/ConversationSessionCache.swift
new file mode 100644
index 0000000..a6964ff
--- /dev/null
+++ b/MLXServer/Server/ConversationSessionCache.swift
@@ -0,0 +1,404 @@
+import Foundation
+import MLXLMCommon
+import os
+
+/// Lifecycle phase of a cached API session; the raw value is the label shown in the monitor.
+enum APISessionPhase: String, Sendable {
+    case idle = "Idle"
+    case sessionBuild = "Session Build"
+    case prefilling = "Prefilling"
+    case generating = "Generating"
+}
+
+/// Bounded cache of API chat sessions keyed by normalized conversation history.
+/// The cache is internal-only and safe to sample from the monitor without involving MainActor.
+///
+/// All mutable state (`entries`, `totals`) is guarded by `lock`. Helpers whose names end in
+/// `Locked` assume the caller already holds it.
+final class ConversationSessionCache: @unchecked Sendable {
+    static let shared = ConversationSessionCache()
+
+    private let lock = OSAllocatedUnfairLock()
+
+    // Budget: entry count, summed token estimate, and idle lifetime before expiry.
+    private let maxEntries = 8
+    private let maxCachedTokens = 256_000
+    private let idleTTL: TimeInterval = 10 * 60
+
+    private var entries: [UUID: Entry] = [:]
+    private var totals = Totals()
+
+    private init() {}
+
+    /// Reservation handed out by `checkoutSession`.
+    struct Lease {
+        let entryId: UUID
+        /// The cached session on a hit; `nil` on a miss.
+        let session: ChatSession?
+        let reusedPromptTokens: Int
+        let cacheHit: Bool
+    }
+
+    /// Immutable per-entry view exposed through `snapshot()`.
+    struct SessionSummary: Identifiable, Sendable {
+        let id: UUID
+        let modelId: String
+        let phase: APISessionPhase
+        let messageCount: Int
+        let cachedTokenEstimate: Int
+        let estimatedBytes: Int
+        let inFlightRequests: Int
+        let hitCount: Int
+        let lastPromptTokens: Int
+        let lastCompletionTokens: Int
+        let lastReuseTokens: Int
+        let createdAt: Date
+        let lastAccessAt: Date
+    }
+
+    /// Point-in-time aggregate of the cache plus lifetime counters.
+    struct Snapshot: Sendable {
+        let totalEntries: Int
+        let warmEntries: Int
+        let activeEntries: Int
+        let generatingEntries: Int
+        let estimatedBytes: Int
+        let cachedTokenEstimate: Int
+        let totalHits: Int
+        let totalMisses: Int
+        let totalEvictions: Int
+        let totalReusePromptTokens: Int
+        let totalRebuildPromptTokens: Int
+        let sessions: [SessionSummary]
+    }
+
+    /// Reserves a cache entry for an incoming request.
+    ///
+    /// Expired entries are pruned first. A warm, idle entry for the same model and instructions
+    /// is reused when its recorded signatures are a prefix of `historySignatures` and at most one
+    /// incoming message is not covered; among several candidates the longest recorded history
+    /// wins. Otherwise a cold placeholder entry is inserted and counted as a miss.
+    ///
+    /// The reservation taken here is released by `completeRequest` or `abandonRequest`.
+    ///
+    /// - Returns: A lease whose `session` is non-nil only on a cache hit.
+    func checkoutSession(
+        modelId: String,
+        instructions: String,
+        historySignatures: [UInt64],
+        requestMessageCount: Int,
+        estimatedPromptTokens: Int,
+        estimatedBytes: Int
+    ) -> Lease {
+        lock.lock()
+        defer { lock.unlock() }
+
+        let now = Date()
+        pruneExpiredLocked(now: now)
+
+        let instructionsHash = Self.stableHash(instructions)
+        let match = entries
+            .values
+            .filter {
+                $0.modelId == modelId
+                    && $0.instructionsHash == instructionsHash
+                    && $0.session != nil
+                    && $0.inFlightRequests == 0
+                    && Self.historyMatches(cached: $0.requestMessageSignatures, incoming: historySignatures)
+            }
+            .max { lhs, rhs in
+                lhs.requestMessageSignatures.count < rhs.requestMessageSignatures.count
+            }
+
+        if var entry = match {
+            entry.inFlightRequests += 1
+            entry.lastAccessAt = now
+            entry.phase = .prefilling
+            // The reusable prefix cannot exceed what is cached, nor the prompt itself, so the
+            // reuse figure is the smaller of the two estimates (taking the larger over-reports).
+            entry.lastReuseTokens = min(entry.cachedTokenEstimate, estimatedPromptTokens)
+            entry.hitCount += 1
+            entries[entry.id] = entry
+            totals.totalHits += 1
+            totals.totalReusePromptTokens += entry.lastReuseTokens
+            return Lease(
+                entryId: entry.id,
+                session: entry.session,
+                reusedPromptTokens: entry.lastReuseTokens,
+                cacheHit: true
+            )
+        }
+
+        let entryId = UUID()
+        entries[entryId] = Entry(
+            id: entryId,
+            modelId: modelId,
+            instructionsHash: instructionsHash,
+            requestMessageSignatures: historySignatures,
+            messageCount: requestMessageCount,
+            cachedTokenEstimate: estimatedPromptTokens,
+            estimatedBytes: estimatedBytes,
+            createdAt: now,
+            lastAccessAt: now,
+            inFlightRequests: 1,
+            hitCount: 0,
+            phase: .sessionBuild,
+            lastPromptTokens: 0,
+            lastCompletionTokens: 0,
+            lastReuseTokens: 0,
+            session: nil
+        )
+        totals.totalMisses += 1
+        totals.totalRebuildPromptTokens += estimatedPromptTokens
+        return Lease(entryId: entryId, session: nil, reusedPromptTokens: 0, cacheHit: false)
+    }
+
+    /// Marks the entry as building a session.
+    func markSessionBuild(entryId: UUID) {
+        updatePhase(entryId: entryId, phase: .sessionBuild)
+    }
+
+    /// Marks the entry as prefilling the prompt.
+    func markPrefilling(entryId: UUID) {
+        updatePhase(entryId: entryId, phase: .prefilling)
+    }
+
+    /// Moves the entry to `.generating` and records the token counts reported so far.
+    /// The cached-token estimate only ever grows here. Unknown entry IDs are ignored.
+    func markGenerating(entryId: UUID, promptTokens: Int, completionTokens: Int) {
+        lock.lock()
+        defer { lock.unlock() }
+
+        guard var entry = entries[entryId] else { return }
+        entry.phase = .generating
+        entry.lastPromptTokens = promptTokens
+        entry.lastCompletionTokens = completionTokens
+        entry.cachedTokenEstimate = max(entry.cachedTokenEstimate, promptTokens + completionTokens)
+        entry.lastAccessAt = Date()
+        entries[entryId] = entry
+    }
+
+    /// Finishes a request: stores `session` on the entry together with the signatures and size
+    /// estimates of the request it just served, releases the reservation, returns the entry to
+    /// `.idle`, and then enforces the cache budget. Unknown entry IDs are ignored, so a request
+    /// that outlives `invalidateAll()` or `reset()` does not re-populate the cache.
+    func completeRequest(
+        entryId: UUID,
+        session: ChatSession,
+        requestMessageSignatures: [UInt64],
+        requestMessageCount: Int,
+        estimatedPromptTokens: Int,
+        estimatedBytes: Int,
+        promptTokens: Int,
+        completionTokens: Int
+    ) {
+        lock.lock()
+        defer { lock.unlock() }
+
+        guard var entry = entries[entryId] else { return }
+        let now = Date()
+        entry.session = session
+        entry.requestMessageSignatures = requestMessageSignatures
+        entry.messageCount = requestMessageCount
+        entry.cachedTokenEstimate = max(estimatedPromptTokens, promptTokens + completionTokens)
+        entry.estimatedBytes = estimatedBytes
+        entry.lastPromptTokens = promptTokens
+        entry.lastCompletionTokens = completionTokens
+        entry.lastAccessAt = now
+        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
+        entry.phase = .idle
+        entries[entryId] = entry
+        enforceBudgetLocked(now: now)
+    }
+
+    /// Releases a reservation without recording a result.
+    ///
+    /// Once no request is in flight the entry is removed, whether or not it holds a session:
+    /// a cold placeholder has nothing to keep, and a warm session may already have consumed
+    /// part of the abandoned request, so its context would no longer match the recorded
+    /// signatures and a later hit could reuse diverged state. Dropping a warm entry is
+    /// counted as an eviction.
+    /// NOTE(review): assumes an abandoned request can leave `ChatSession` state advanced;
+    /// confirm against the call sites in `APIServer`.
+    func abandonRequest(entryId: UUID) {
+        lock.lock()
+        defer { lock.unlock() }
+
+        guard var entry = entries[entryId] else { return }
+        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
+        guard entry.inFlightRequests == 0 else {
+            entry.phase = .idle
+            entry.lastAccessAt = Date()
+            entries[entryId] = entry
+            return
+        }
+        entries.removeValue(forKey: entryId)
+        if entry.session != nil {
+            totals.totalEvictions += 1
+        }
+    }
+
+    /// Drops every entry, including ones with requests in flight, counting each as an eviction.
+    func invalidateAll() {
+        lock.lock()
+        defer { lock.unlock() }
+
+        totals.totalEvictions += entries.count
+        entries.removeAll()
+    }
+
+    /// Drops every entry and zeroes the lifetime counters.
+    func reset() {
+        lock.lock()
+        defer { lock.unlock() }
+
+        entries.removeAll()
+        totals = Totals()
+    }
+
+    /// Returns an aggregate view of the cache together with per-entry summaries.
+    ///
+    /// Expired entries are pruned as a side effect. Summaries are ordered with in-flight
+    /// entries first, then by most recent access.
+    func snapshot() -> Snapshot {
+        lock.lock()
+        defer { lock.unlock() }
+
+        pruneExpiredLocked(now: Date())
+        let allEntries = Array(entries.values)
+        let sessions = allEntries
+            .sorted {
+                if $0.inFlightRequests != $1.inFlightRequests {
+                    return $0.inFlightRequests > $1.inFlightRequests
+                }
+                return $0.lastAccessAt > $1.lastAccessAt
+            }
+            .map {
+                SessionSummary(
+                    id: $0.id,
+                    modelId: $0.modelId,
+                    phase: $0.phase,
+                    messageCount: $0.messageCount,
+                    cachedTokenEstimate: $0.cachedTokenEstimate,
+                    estimatedBytes: $0.estimatedBytes,
+                    inFlightRequests: $0.inFlightRequests,
+                    hitCount: $0.hitCount,
+                    lastPromptTokens: $0.lastPromptTokens,
+                    lastCompletionTokens: $0.lastCompletionTokens,
+                    lastReuseTokens: $0.lastReuseTokens,
+                    createdAt: $0.createdAt,
+                    lastAccessAt: $0.lastAccessAt
+                )
+            }
+        return Snapshot(
+            totalEntries: allEntries.count,
+            warmEntries: allEntries.filter { $0.session != nil }.count,
+            activeEntries: allEntries.filter { $0.inFlightRequests > 0 }.count,
+            generatingEntries: allEntries.filter { $0.phase == .generating }.count,
+            estimatedBytes: allEntries.reduce(0) { $0 + $1.estimatedBytes },
+            cachedTokenEstimate: allEntries.reduce(0) { $0 + $1.cachedTokenEstimate },
+            totalHits: totals.totalHits,
+            totalMisses: totals.totalMisses,
+            totalEvictions: totals.totalEvictions,
+            totalReusePromptTokens: totals.totalReusePromptTokens,
+            totalRebuildPromptTokens: totals.totalRebuildPromptTokens,
+            sessions: sessions
+        )
+    }
+
+    /// Sets the phase of an existing entry and refreshes its access time.
+    private func updatePhase(entryId: UUID, phase: APISessionPhase) {
+        lock.lock()
+        defer { lock.unlock() }
+
+        guard var entry = entries[entryId] else { return }
+        entry.phase = phase
+        entry.lastAccessAt = Date()
+        entries[entryId] = entry
+    }
+
+    /// Removes idle entries not accessed for longer than `idleTTL`, counting them as evictions.
+    private func pruneExpiredLocked(now: Date) {
+        let expired = entries.values.filter {
+            $0.inFlightRequests == 0 && now.timeIntervalSince($0.lastAccessAt) > idleTTL
+        }
+        guard !expired.isEmpty else { return }
+        for entry in expired {
+            entries.removeValue(forKey: entry.id)
+        }
+        totals.totalEvictions += expired.count
+    }
+
+    /// Evicts idle entries, least recently used first, until both the entry-count and the
+    /// cached-token budgets hold or only in-flight entries remain.
+    private func enforceBudgetLocked(now: Date) {
+        pruneExpiredLocked(now: now)
+
+        var cachedTokens = entries.values.reduce(0) { $0 + $1.cachedTokenEstimate }
+        while entries.count > maxEntries || cachedTokens > maxCachedTokens {
+            // A single minimum scan replaces sorting every idle entry on each pass.
+            let idle = entries.values.filter { $0.inFlightRequests == 0 }
+            guard let victim = idle.min(by: evictionOrder) else { break }
+            entries.removeValue(forKey: victim.id)
+            cachedTokens -= victim.cachedTokenEstimate
+            totals.totalEvictions += 1
+        }
+    }
+
+    /// Strict ordering for eviction: oldest access first, then larger token estimate, then
+    /// oldest creation.
+    private func evictionOrder(lhs: Entry, rhs: Entry) -> Bool {
+        if lhs.lastAccessAt != rhs.lastAccessAt {
+            return lhs.lastAccessAt < rhs.lastAccessAt
+        }
+        if lhs.cachedTokenEstimate != rhs.cachedTokenEstimate {
+            return lhs.cachedTokenEstimate > rhs.cachedTokenEstimate
+        }
+        return lhs.createdAt < rhs.createdAt
+    }
+
+    /// True when `cached` is a prefix of `incoming` and `incoming` adds at most one message.
+    private static func historyMatches(cached: [UInt64], incoming: [UInt64]) -> Bool {
+        guard incoming.count <= cached.count + 1 else { return false }
+        return incoming.starts(with: cached)
+    }
+
+    /// 64-bit FNV-1a hash over the UTF-8 bytes of `text`.
+    static func stableHash(_ text: String) -> UInt64 {
+        var hash: UInt64 = 14_695_981_039_346_656_037
+        for byte in text.utf8 {
+            hash ^= UInt64(byte)
+            hash &*= 1_099_511_628_211
+        }
+        return hash
+    }
+
+    /// Internal record for one cached conversation.
+    private struct Entry {
+        let id: UUID
+        let modelId: String
+        let instructionsHash: UInt64
+        var requestMessageSignatures: [UInt64]
+        var messageCount: Int
+        var cachedTokenEstimate: Int
+        var estimatedBytes: Int
+        let createdAt: Date
+        var lastAccessAt: Date
+        var inFlightRequests: Int
+        var hitCount: Int
+        var phase: APISessionPhase
+        var lastPromptTokens: Int
+        var lastCompletionTokens: Int
+        var lastReuseTokens: Int
+        var session: ChatSession?
+    }
+
+    /// Lifetime counters; cleared only by `reset()`.
+    private struct Totals {
+        var totalHits: Int = 0
+        var totalMisses: Int = 0
+        var totalEvictions: Int = 0
+        var totalReusePromptTokens: Int = 0
+        var totalRebuildPromptTokens: Int = 0
+    }
+}
\ No newline at end of file
diff --git a/MLXServer/Views/MonitorView.swift b/MLXServer/Views/MonitorView.swift index 505110b..0cc3061 100644 --- a/MLXServer/Views/MonitorView.swift +++ b/MLXServer/Views/MonitorView.swift @@ -6,28 +6,31 @@ import SwiftUI struct MonitorView: View { let stats: InferenceStats @Environment(ModelManager.self) private var modelManager + private let chartColumns = [GridItem(.flexible(minimum: 260), spacing: 16), GridItem(.flexible(minimum: 260), spacing: 16)] + private let cardColumns = [GridItem(.flexible(minimum: 180), spacing: 16), GridItem(.flexible(minimum: 180), spacing: 16)] var body: some View { ScrollView { VStack(spacing: 20) { - // Live status header liveStatusSection - // Charts - HStack(alignment: .top, spacing: 16) { + LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) { tokenRateChart
tokenThroughputChart + cacheReuseChart + cacheFootprintChart + cacheSessionChart } - // Gauges row - HStack(spacing: 16) { + LazyVGrid(columns: cardColumns, alignment: .leading, spacing: 16) { contextGauge gpuMemoryGauge requestsCard + cacheCard } - // Cumulative stats cumulativeSection + sessionSection } .padding(20) } @@ -39,45 +42,54 @@ struct MonitorView: View { @ViewBuilder private var liveStatusSection: some View { - HStack(spacing: 16) { - // Activity indicator - HStack(spacing: 8) { - Circle() - .fill(activityColor) - .frame(width: 10, height: 10) - .overlay { - if stats.isGenerating || stats.isPrefilling { - Circle() - .stroke(activityColor.opacity(0.5), lineWidth: 2) - .scaleEffect(1.8) - .opacity(0.6) + VStack(alignment: .leading, spacing: 12) { + HStack(spacing: 16) { + HStack(spacing: 8) { + Circle() + .fill(activityColor) + .frame(width: 10, height: 10) + .overlay { + if stats.activeRequests > 0 { + Circle() + .stroke(activityColor.opacity(0.5), lineWidth: 2) + .scaleEffect(1.8) + .opacity(0.6) + } } - } - Text(activityLabel) - .font(.headline) - } - - Spacer() - - if stats.isGenerating { - Text(String(format: "%.1f tok/s", stats.currentTokensPerSecond)) - .font(.title2.monospacedDigit().bold()) - .foregroundStyle(.green) - } - - if stats.currentPromptTokens > 0 { - HStack(spacing: 4) { - Image(systemName: "arrow.down.circle.fill") - .foregroundStyle(.blue) - Text("\(stats.currentPromptTokens)") - .monospacedDigit() - Image(systemName: "arrow.up.circle.fill") - .foregroundStyle(.orange) - Text("\(stats.currentGenerationTokens)") - .monospacedDigit() + Text(activityLabel) + .font(.headline) } - .font(.callout) + + Spacer() + + if stats.isGenerating { + Text(String(format: "%.1f tok/s", stats.currentTokensPerSecond)) + .font(.title2.monospacedDigit().bold()) + .foregroundStyle(.green) + } + + if stats.currentPromptTokens > 0 { + HStack(spacing: 4) { + Image(systemName: "arrow.down.circle.fill") + .foregroundStyle(.blue) + 
Text("\(stats.currentPromptTokens)") + .monospacedDigit() + Image(systemName: "arrow.up.circle.fill") + .foregroundStyle(.orange) + Text("\(stats.currentGenerationTokens)") + .monospacedDigit() + } + .font(.callout) + } + } + + HStack(spacing: 8) { + phaseChip(title: "Preparing", count: stats.preparingRequests, color: .secondary) + phaseChip(title: "Session Build", count: stats.sessionBuildRequests, color: .purple) + phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue) + phaseChip(title: "Generating", count: stats.generatingRequests, color: .green) + phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange) } } .padding(12) @@ -85,15 +97,19 @@ struct MonitorView: View { } private var activityColor: Color { - if stats.isPrefilling { return .blue } if stats.isGenerating { return .green } + if stats.prefillingRequests > 0 { return .blue } + if stats.sessionBuildRequests > 0 { return .purple } + if stats.preparingRequests > 0 { return .orange } if stats.activeRequests > 0 { return .orange } return .secondary } private var activityLabel: String { - if stats.isPrefilling { return "Prefilling" } if stats.isGenerating { return "Generating" } + if stats.prefillingRequests > 0 { return "Prefilling" } + if stats.sessionBuildRequests > 0 { return "Building Sessions" } + if stats.preparingRequests > 0 { return "Preparing Requests" } if stats.activeRequests > 0 { return "Processing" } return "Idle" } @@ -145,6 +161,160 @@ struct MonitorView: View { .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) } + @ViewBuilder + private var cacheReuseChart: some View { + VStack(alignment: .leading, spacing: 6) { + Text("Prefill Reuse (/sec)") + .font(.caption.bold()) + .foregroundStyle(.secondary) + + Chart { + ForEach(stats.cacheReuseHistory) { point in + BarMark( + x: .value("Time", point.timestamp), + y: .value("Tokens", point.value) + ) + .foregroundStyle(.green.opacity(0.75)) + } + ForEach(stats.cacheRebuildHistory) { 
point in + BarMark( + x: .value("Time", point.timestamp), + y: .value("Tokens", point.value) + ) + .foregroundStyle(.red.opacity(0.65)) + } + } + .chartXAxis { + AxisMarks(values: .stride(by: .second, count: 30)) { _ in + AxisGridLine() + } + } + .chartYAxis { + AxisMarks(position: .leading) { value in + AxisGridLine() + AxisValueLabel { + if let v = value.as(Double.self) { + Text(String(format: "%.0f", v)) + .font(.caption2.monospacedDigit()) + } + } + } + } + .frame(height: 150) + + HStack(spacing: 12) { + Label("Reused", systemImage: "circle.fill") + .font(.caption2) + .foregroundStyle(.green) + Label("Rebuilt", systemImage: "circle.fill") + .font(.caption2) + .foregroundStyle(.red) + } + } + .padding(12) + .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) + } + + @ViewBuilder + private var cacheFootprintChart: some View { + VStack(alignment: .leading, spacing: 6) { + Text("Cache Footprint (est)") + .font(.caption.bold()) + .foregroundStyle(.secondary) + + Chart(stats.cacheFootprintHistory) { point in + LineMark( + x: .value("Time", point.timestamp), + y: .value("MB", point.value / 1_048_576) + ) + .foregroundStyle(.orange) + .interpolationMethod(.monotone) + + AreaMark( + x: .value("Time", point.timestamp), + y: .value("MB", point.value / 1_048_576) + ) + .foregroundStyle(.orange.opacity(0.12)) + .interpolationMethod(.monotone) + } + .chartXAxis { + AxisMarks(values: .stride(by: .second, count: 30)) { _ in + AxisGridLine() + } + } + .chartYAxis { + AxisMarks(position: .leading) { value in + AxisGridLine() + AxisValueLabel { + if let v = value.as(Double.self) { + Text(String(format: "%.1f", v)) + .font(.caption2.monospacedDigit()) + } + } + } + } + .frame(height: 150) + } + .padding(12) + .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) + } + + @ViewBuilder + private var cacheSessionChart: some View { + VStack(alignment: .leading, spacing: 6) { + Text("Cached Sessions") + .font(.caption.bold()) + 
.foregroundStyle(.secondary) + + Chart { + ForEach(stats.cacheEntryHistory) { point in + LineMark( + x: .value("Time", point.timestamp), + y: .value("Cached", point.value) + ) + .foregroundStyle(.purple) + .interpolationMethod(.monotone) + } + ForEach(stats.activeSessionHistory) { point in + LineMark( + x: .value("Time", point.timestamp), + y: .value("Active", point.value) + ) + .foregroundStyle(.blue) + .interpolationMethod(.monotone) + } + } + .chartXAxis { + AxisMarks(values: .stride(by: .second, count: 30)) { _ in + AxisGridLine() + } + } + .chartYAxis { + AxisMarks(position: .leading) { value in + AxisGridLine() + AxisValueLabel { + if let v = value.as(Double.self) { + Text(String(format: "%.0f", v)) + .font(.caption2.monospacedDigit()) + } + } + } + } + .frame(height: 150) + + HStack(spacing: 12) { + Label("Cached", systemImage: "circle.fill") + .font(.caption2) + .foregroundStyle(.purple) + Label("Active", systemImage: "circle.fill") + .font(.caption2) + .foregroundStyle(.blue) + } + } + .padding(12) + .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) + } + private var maxTokenRate: Double { stats.tokenRateHistory.map(\.value).max() ?? 10 } @@ -303,35 +473,69 @@ struct MonitorView: View { .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) } + @ViewBuilder + private var cacheCard: some View { + VStack(alignment: .leading, spacing: 8) { + Text("Session Cache") + .font(.caption.bold()) + .foregroundStyle(.secondary) + + Text("\(stats.cacheEntryCount)") + .font(.title3.monospacedDigit().bold()) + + LabeledContent("Warm") { + Text("\(stats.warmCacheEntryCount)") + .monospacedDigit() + } + .font(.caption) + + LabeledContent("Active") { + Text("\(stats.activeCacheEntryCount)") + .monospacedDigit() + } + .font(.caption) + + LabeledContent("Est. 
Footprint") { + Text(formatByteCount(stats.cacheEstimatedBytes)) + .monospacedDigit() + } + .font(.caption) + + LabeledContent("Cached Tokens") { + Text(formatTokenCount(stats.cacheEstimatedTokens)) + .monospacedDigit() + } + .font(.caption) + + LabeledContent("Hit Rate") { + Text(String(format: "%.0f%%", cacheHitRate * 100)) + .monospacedDigit() + } + .font(.caption) + } + .frame(maxWidth: .infinity, alignment: .leading) + .padding(12) + .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) + } + // MARK: - Cumulative @ViewBuilder private var cumulativeSection: some View { - HStack(spacing: 24) { - VStack(spacing: 2) { - Text("Total Prompt Tokens") - .font(.caption2) - .foregroundStyle(.secondary) - Text(formatTokenCount(stats.totalPromptTokens)) - .font(.callout.monospacedDigit().bold()) - .foregroundStyle(.blue) - } + VStack(alignment: .leading, spacing: 10) { + Text("Cumulative") + .font(.caption.bold()) + .foregroundStyle(.secondary) - VStack(spacing: 2) { - Text("Total Generated Tokens") - .font(.caption2) - .foregroundStyle(.secondary) - Text(formatTokenCount(stats.totalGenerationTokens)) - .font(.callout.monospacedDigit().bold()) - .foregroundStyle(.orange) - } - - VStack(spacing: 2) { - Text("Total Tokens") - .font(.caption2) - .foregroundStyle(.secondary) - Text(formatTokenCount(stats.totalPromptTokens + stats.totalGenerationTokens)) - .font(.callout.monospacedDigit().bold()) + LazyVGrid(columns: cardColumns, alignment: .leading, spacing: 12) { + statTile(title: "Prompt Tokens", value: formatTokenCount(stats.totalPromptTokens), color: .blue) + statTile(title: "Generated Tokens", value: formatTokenCount(stats.totalGenerationTokens), color: .orange) + statTile(title: "Cache Hits", value: "\(stats.totalCacheHits)", color: .green) + statTile(title: "Cache Misses", value: "\(stats.totalCacheMisses)", color: .red) + statTile(title: "Reused Prefill", value: formatTokenCount(stats.totalCacheReusePromptTokens), color: .green) + statTile(title: 
"Rebuilt Prefill", value: formatTokenCount(stats.totalCacheRebuildPromptTokens), color: .red) + statTile(title: "Evictions", value: "\(stats.totalCacheEvictions)", color: .secondary) + statTile(title: "Total Tokens", value: formatTokenCount(stats.totalPromptTokens + stats.totalGenerationTokens), color: .primary) } } .frame(maxWidth: .infinity) @@ -339,8 +543,129 @@ struct MonitorView: View { .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) } + @ViewBuilder + private var sessionSection: some View { + VStack(alignment: .leading, spacing: 12) { + HStack { + Text("Cached Chat Sessions") + .font(.headline) + Spacer() + Text("\(stats.cachedSessions.count) visible") + .font(.caption) + .foregroundStyle(.secondary) + } + + if stats.cachedSessions.isEmpty { + Text("No cached sessions yet.") + .font(.callout) + .foregroundStyle(.secondary) + .frame(maxWidth: .infinity, alignment: .leading) + } else { + ForEach(stats.cachedSessions) { session in + sessionRow(session) + } + } + } + .frame(maxWidth: .infinity, alignment: .leading) + .padding(12) + .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) + } + // MARK: - Helpers + @ViewBuilder + private func phaseChip(title: String, count: Int, color: Color) -> some View { + HStack(spacing: 6) { + Circle() + .fill(color) + .frame(width: 7, height: 7) + Text(title) + Text("\(count)") + .monospacedDigit() + } + .font(.caption) + .padding(.horizontal, 8) + .padding(.vertical, 4) + .background(color.opacity(0.12), in: Capsule()) + } + + @ViewBuilder + private func statTile(title: String, value: String, color: Color) -> some View { + VStack(alignment: .leading, spacing: 4) { + Text(title) + .font(.caption2) + .foregroundStyle(.secondary) + Text(value) + .font(.callout.monospacedDigit().bold()) + .foregroundStyle(color) + } + .frame(maxWidth: .infinity, alignment: .leading) + .padding(10) + .background(Color.primary.opacity(0.04), in: RoundedRectangle(cornerRadius: 8)) + } + + @ViewBuilder + private 
func sessionRow(_ session: ConversationSessionCache.SessionSummary) -> some View { + VStack(alignment: .leading, spacing: 10) { + HStack(alignment: .firstTextBaseline) { + HStack(spacing: 8) { + Circle() + .fill(color(for: session.phase)) + .frame(width: 8, height: 8) + Text(session.modelId) + .font(.callout.weight(.semibold)) + .lineLimit(1) + } + Spacer() + Text(session.phase.rawValue) + .font(.caption.monospacedDigit()) + .padding(.horizontal, 8) + .padding(.vertical, 4) + .background(color(for: session.phase).opacity(0.14), in: Capsule()) + } + + HStack(spacing: 12) { + sessionMetric("Msgs", "\(session.messageCount)") + sessionMetric("Cached", formatTokenCount(session.cachedTokenEstimate)) + sessionMetric("Reuse", formatTokenCount(session.lastReuseTokens)) + sessionMetric("Footprint", formatByteCount(session.estimatedBytes)) + sessionMetric("Hits", "\(session.hitCount)") + sessionMetric("Active", "\(session.inFlightRequests)") + } + + HStack(spacing: 12) { + sessionMetric("Prompt", formatTokenCount(session.lastPromptTokens)) + sessionMetric("Completion", formatTokenCount(session.lastCompletionTokens)) + sessionMetric("Last Access", relativeTimeString(session.lastAccessAt)) + } + + let ratio = maxContextRatio(for: session.cachedTokenEstimate) + ProgressView(value: ratio) { + Text("Cached Context") + .font(.caption2) + .foregroundStyle(.secondary) + } currentValueLabel: { + Text("\(Int(ratio * 100))%") + .font(.caption2.monospacedDigit()) + .foregroundStyle(.secondary) + } + .tint(color(for: session.phase)) + } + .padding(12) + .background(Color.primary.opacity(0.035), in: RoundedRectangle(cornerRadius: 10)) + } + + @ViewBuilder + private func sessionMetric(_ title: String, _ value: String) -> some View { + VStack(alignment: .leading, spacing: 2) { + Text(title) + .font(.caption2) + .foregroundStyle(.secondary) + Text(value) + .font(.caption.monospacedDigit().bold()) + } + } + private func formatTokenCount(_ count: Int) -> String { if count >= 1_000_000 { return 
String(format: "%.1fM", Double(count) / 1_000_000) @@ -349,4 +674,52 @@ struct MonitorView: View { } return "\(count)" } + + private func formatByteCount(_ count: Int) -> String { + let bytes = Double(count) + if bytes >= 1_048_576 { + return String(format: "%.1f MB", bytes / 1_048_576) + } + if bytes >= 1024 { + return String(format: "%.0f KB", bytes / 1024) + } + return "\(count) B" + } + + private func relativeTimeString(_ date: Date) -> String { + let seconds = max(0, Int(Date.now.timeIntervalSince(date))) + if seconds < 60 { + return "\(seconds)s" + } + let minutes = seconds / 60 + if minutes < 60 { + return "\(minutes)m" + } + return "\(minutes / 60)h" + } + + private func color(for phase: APISessionPhase) -> Color { + switch phase { + case .idle: + return .secondary + case .sessionBuild: + return .purple + case .prefilling: + return .blue + case .generating: + return .green + } + } + + private var cacheHitRate: Double { + let total = stats.totalCacheHits + stats.totalCacheMisses + guard total > 0 else { return 0 } + return Double(stats.totalCacheHits) / Double(total) + } + + private func maxContextRatio(for tokens: Int) -> Double { + let maxContext = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0) + guard maxContext > 0 else { return 0 } + return min(1, Double(tokens) / Double(maxContext)) + } }