fix: more telemetry and tighter implementation of cache

2026-03-19 11:30:18 +01:00
parent c2e80e4066
commit 49bd165ce7
6 changed files with 1154 additions and 156 deletions
--- a/MLXServer.xcodeproj/project.pbxproj
+++ b/MLXServer.xcodeproj/project.pbxproj
@@ -42,6 +42,7 @@
 		D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
 		D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
 		DF5C525DBD2E3153256951C1 /* SceneManagementWindow.swift in Sources */ = {isa = PBXBuildFile; fileRef = BA1592FD260014C4FBDB6995 /* SceneManagementWindow.swift */; };
+		F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */; };
 		F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
 		FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
 		FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
@@ -85,6 +86,7 @@
 		E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
 		EF518FEBF3A38E830E3CE1A5 /* FocusedValues.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusedValues.swift; sourceTree = "<group>"; };
 		F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
+		FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConversationSessionCache.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
@@ -203,6 +205,7 @@
 			children = (
 				F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
 				3D08828E16B17EF02C14243E /* APIServer.swift */,
+				FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */,
 				E73B165A1822729C907791AE /* ToolCallParser.swift */,
 				16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */,
 			);
@@ -306,6 +309,7 @@
 				85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
 				B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
 				5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
+				F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */,
 				C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
 				4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
 				2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */,
--- a/MLXServer/Models/InferenceStats.swift
+++ b/MLXServer/Models/InferenceStats.swift
@@ -9,9 +9,14 @@ final class LiveCounters: @unchecked Sendable {
    static let shared = LiveCounters()

    private let lock = OSAllocatedUnfairLock()
+    private var requestPhases: [String: RequestPhase] = [:]

    // Current request
    private var _activeRequests: Int = 0
+    private var _preparingRequests: Int = 0
+    private var _sessionBuildRequests: Int = 0
+    private var _prefillRequests: Int = 0
+    private var _generatingRequests: Int = 0
    private var _promptTokens: Int = 0
    private var _generationTokens: Int = 0
    private var _tokensPerSecond: Double = 0
@@ -24,9 +29,10 @@ final class LiveCounters: @unchecked Sendable {
    private var _totalPromptTokens: Int = 0
    private var _totalGenerationTokens: Int = 0

-    func requestStarted(contextLength: Int) {
+    func requestStarted(requestId: String, contextLength: Int) {
        lock.lock()
        _activeRequests += 1
+        _preparingRequests += 1
        _totalRequests += 1
        _isPrefilling = true
        _isGenerating = false
@@ -34,15 +40,33 @@ final class LiveCounters: @unchecked Sendable {
        _generationTokens = 0
        _tokensPerSecond = 0
        _contextMax = contextLength
+        requestPhases[requestId] = .preparing
        lock.unlock()
    }

-    func prefillCompleted(promptTokens: Int) {
+    func requestPhaseChanged(requestId: String, phase: RequestPhase) {
        lock.lock()
-        _isPrefilling = false
-        _isGenerating = true
+        if let current = requestPhases[requestId] {
+            decrementCount(for: current)
+        }
+        incrementCount(for: phase)
+        requestPhases[requestId] = phase
+        _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
+        _isGenerating = _generatingRequests > 0
+        lock.unlock()
+    }
+
+    func prefillCompleted(requestId: String, promptTokens: Int) {
+        lock.lock()
+        if let current = requestPhases[requestId] {
+            decrementCount(for: current)
+        }
+        incrementCount(for: .generating)
+        requestPhases[requestId] = .generating
        _promptTokens = promptTokens
        _totalPromptTokens += promptTokens
+        _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
+        _isGenerating = _generatingRequests > 0
        lock.unlock()
    }

@@ -53,21 +77,32 @@ final class LiveCounters: @unchecked Sendable {
        lock.unlock()
    }

-    func requestCompleted(generationTokens: Int) {
+    func requestCompleted(requestId: String, generationTokens: Int) {
        lock.lock()
+        if let current = requestPhases.removeValue(forKey: requestId) {
+            decrementCount(for: current)
+        }
        _activeRequests = max(0, _activeRequests - 1)
        _totalGenerationTokens += generationTokens
        if _activeRequests == 0 {
            _isGenerating = false
            _isPrefilling = false
            _tokensPerSecond = 0
+        } else {
+            _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
+            _isGenerating = _generatingRequests > 0
        }
        lock.unlock()
    }

    func reset() {
        lock.lock()
+        requestPhases.removeAll()
        _activeRequests = 0
+        _preparingRequests = 0
+        _sessionBuildRequests = 0
+        _prefillRequests = 0
+        _generatingRequests = 0
        _promptTokens = 0
        _generationTokens = 0
        _tokensPerSecond = 0
@@ -85,6 +120,10 @@ final class LiveCounters: @unchecked Sendable {
        lock.lock()
        let s = Snapshot(
            activeRequests: _activeRequests,
+            preparingRequests: _preparingRequests,
+            sessionBuildRequests: _sessionBuildRequests,
+            prefillRequests: _prefillRequests,
+            generatingRequests: _generatingRequests,
            promptTokens: _promptTokens,
            generationTokens: _generationTokens,
            tokensPerSecond: _tokensPerSecond,
@@ -101,6 +140,10 @@ final class LiveCounters: @unchecked Sendable {

    struct Snapshot {
        let activeRequests: Int
+        let preparingRequests: Int
+        let sessionBuildRequests: Int
+        let prefillRequests: Int
+        let generatingRequests: Int
        let promptTokens: Int
        let generationTokens: Int
        let tokensPerSecond: Double
@@ -111,6 +154,39 @@ final class LiveCounters: @unchecked Sendable {
        let totalPromptTokens: Int
        let totalGenerationTokens: Int
    }
+
+    /// Adds one to the counter that tracks requests currently in `phase`.
+    ///
+    /// Takes no lock of its own; the visible call sites invoke it between
+    /// `lock.lock()` and `lock.unlock()`.
+    private func incrementCount(for phase: RequestPhase) {
+        switch phase {
+        case .preparing: _preparingRequests += 1
+        case .sessionBuild: _sessionBuildRequests += 1
+        case .prefilling: _prefillRequests += 1
+        case .generating: _generatingRequests += 1
+        }
+    }
+
+    /// Subtracts one from the counter for `phase`, clamping at zero so an
+    /// unmatched decrement cannot drive a counter negative.
+    ///
+    /// Takes no lock of its own; the visible call sites hold `lock` around it.
+    private func decrementCount(for phase: RequestPhase) {
+        switch phase {
+        case .preparing: _preparingRequests = max(0, _preparingRequests - 1)
+        case .sessionBuild: _sessionBuildRequests = max(0, _sessionBuildRequests - 1)
+        case .prefilling: _prefillRequests = max(0, _prefillRequests - 1)
+        case .generating: _generatingRequests = max(0, _generatingRequests - 1)
+        }
+    }
+
+    /// Stage a tracked request is currently in; stored per request id in
+    /// `requestPhases`, with one counter per case.
+    enum RequestPhase {
+        case preparing, sessionBuild
+        case prefilling, generating
+    }
 }

 // MARK: - Observable stats for the UI (polls LiveCounters at 1Hz)
@@ -121,6 +197,10 @@ final class InferenceStats {
    // MARK: - Current request state (refreshed from LiveCounters)

    var activeRequests: Int = 0
+    var preparingRequests: Int = 0
+    var sessionBuildRequests: Int = 0
+    var prefillingRequests: Int = 0
+    var generatingRequests: Int = 0
    var currentPromptTokens: Int = 0
    var currentGenerationTokens: Int = 0
    var isGenerating: Bool = false
@@ -134,6 +214,21 @@ final class InferenceStats {
    var totalRequests: Int = 0
    var totalPromptTokens: Int = 0
    var totalGenerationTokens: Int = 0
+    var totalCacheHits: Int = 0
+    var totalCacheMisses: Int = 0
+    var totalCacheEvictions: Int = 0
+    var totalCacheReusePromptTokens: Int = 0
+    var totalCacheRebuildPromptTokens: Int = 0
+
+    // MARK: - Cache state
+
+    var cacheEntryCount: Int = 0
+    var warmCacheEntryCount: Int = 0
+    var activeCacheEntryCount: Int = 0
+    var generatingCacheEntryCount: Int = 0
+    var cacheEstimatedBytes: Int = 0
+    var cacheEstimatedTokens: Int = 0
+    var cachedSessions: [ConversationSessionCache.SessionSummary] = []

    // MARK: - Time series data (ring buffers for charts)

@@ -146,6 +241,11 @@ final class InferenceStats {
    private(set) var tokenRateHistory: [DataPoint] = []
    private(set) var promptTokenHistory: [DataPoint] = []
    private(set) var generationTokenHistory: [DataPoint] = []
+    private(set) var cacheEntryHistory: [DataPoint] = []
+    private(set) var activeSessionHistory: [DataPoint] = []
+    private(set) var cacheFootprintHistory: [DataPoint] = []
+    private(set) var cacheReuseHistory: [DataPoint] = []
+    private(set) var cacheRebuildHistory: [DataPoint] = []

    private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz

@@ -153,6 +253,8 @@ final class InferenceStats {
    private var sampleTimer: Timer?
    private var lastGenerationTokenCount: Int = 0
    private var lastPromptTokenCount: Int = 0
+    private var lastCacheReuseTokenCount: Int = 0
+    private var lastCacheRebuildTokenCount: Int = 0

    func startSampling() {
        guard sampleTimer == nil else { return }
@@ -171,8 +273,13 @@ final class InferenceStats {
    private func recordSample() {
        // Pull live values from the thread-safe counters
        let snap = LiveCounters.shared.snapshot()
+        let cache = ConversationSessionCache.shared.snapshot()

        activeRequests = snap.activeRequests
+        preparingRequests = snap.preparingRequests
+        sessionBuildRequests = snap.sessionBuildRequests
+        prefillingRequests = snap.prefillRequests
+        generatingRequests = snap.generatingRequests
        currentPromptTokens = snap.promptTokens
        currentGenerationTokens = snap.generationTokens
        currentTokensPerSecond = snap.tokensPerSecond
@@ -183,16 +290,37 @@ final class InferenceStats {
        totalRequests = snap.totalRequests
        totalPromptTokens = snap.totalPromptTokens
        totalGenerationTokens = snap.totalGenerationTokens
+        totalCacheHits = cache.totalHits
+        totalCacheMisses = cache.totalMisses
+        totalCacheEvictions = cache.totalEvictions
+        totalCacheReusePromptTokens = cache.totalReusePromptTokens
+        totalCacheRebuildPromptTokens = cache.totalRebuildPromptTokens
+        cacheEntryCount = cache.totalEntries
+        warmCacheEntryCount = cache.warmEntries
+        activeCacheEntryCount = cache.activeEntries
+        generatingCacheEntryCount = cache.generatingEntries
+        cacheEstimatedBytes = cache.estimatedBytes
+        cacheEstimatedTokens = cache.cachedTokenEstimate
+        cachedSessions = cache.sessions

        let now = Date.now
        let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
        let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
+        let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
+        let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
        lastGenerationTokenCount = snap.totalGenerationTokens
        lastPromptTokenCount = snap.totalPromptTokens
+        lastCacheReuseTokenCount = cache.totalReusePromptTokens
+        lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens

        tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
        generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
        promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
+        cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries)))
+        activeSessionHistory.append(DataPoint(timestamp: now, value: Double(cache.activeEntries)))
+        cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
+        cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
+        cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))

        if tokenRateHistory.count > Self.maxHistoryPoints {
            tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -203,11 +331,31 @@ final class InferenceStats {
        if promptTokenHistory.count > Self.maxHistoryPoints {
            promptTokenHistory.removeFirst(promptTokenHistory.count - Self.maxHistoryPoints)
        }
+        if cacheEntryHistory.count > Self.maxHistoryPoints {
+            cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints)
+        }
+        if activeSessionHistory.count > Self.maxHistoryPoints {
+            activeSessionHistory.removeFirst(activeSessionHistory.count - Self.maxHistoryPoints)
+        }
+        if cacheFootprintHistory.count > Self.maxHistoryPoints {
+            cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints)
+        }
+        if cacheReuseHistory.count > Self.maxHistoryPoints {
+            cacheReuseHistory.removeFirst(cacheReuseHistory.count - Self.maxHistoryPoints)
+        }
+        if cacheRebuildHistory.count > Self.maxHistoryPoints {
+            cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
+        }
    }

    func reset() {
        LiveCounters.shared.reset()
+        ConversationSessionCache.shared.reset()
        activeRequests = 0
+        preparingRequests = 0
+        sessionBuildRequests = 0
+        prefillingRequests = 0
+        generatingRequests = 0
        currentPromptTokens = 0
        currentGenerationTokens = 0
        isGenerating = false
@@ -218,10 +366,29 @@ final class InferenceStats {
        totalRequests = 0
        totalPromptTokens = 0
        totalGenerationTokens = 0
+        totalCacheHits = 0
+        totalCacheMisses = 0
+        totalCacheEvictions = 0
+        totalCacheReusePromptTokens = 0
+        totalCacheRebuildPromptTokens = 0
+        cacheEntryCount = 0
+        warmCacheEntryCount = 0
+        activeCacheEntryCount = 0
+        generatingCacheEntryCount = 0
+        cacheEstimatedBytes = 0
+        cacheEstimatedTokens = 0
+        cachedSessions.removeAll()
        tokenRateHistory.removeAll()
        promptTokenHistory.removeAll()
        generationTokenHistory.removeAll()
+        cacheEntryHistory.removeAll()
+        activeSessionHistory.removeAll()
+        cacheFootprintHistory.removeAll()
+        cacheReuseHistory.removeAll()
+        cacheRebuildHistory.removeAll()
        lastGenerationTokenCount = 0
        lastPromptTokenCount = 0
+        lastCacheReuseTokenCount = 0
+        lastCacheRebuildTokenCount = 0
    }
 }
--- a/MLXServer/Server/APIModels.swift
+++ b/MLXServer/Server/APIModels.swift
@@ -16,6 +16,50 @@ struct APIToolDefinition: Codable {
 struct APIFunctionCall: Codable {
    let name: String
    let arguments: String // JSON string
+
+    /// Creates a function call from a name and an already-serialized JSON argument string.
+    init(name: String, arguments: String) {
+        self.name = name
+        self.arguments = arguments
+    }
+
+    /// Decodes a function call, tolerating clients that send `arguments` as an
+    /// inline JSON object or array instead of a JSON-encoded string.
+    ///
+    /// Inline objects and arrays are re-serialized with sorted keys. A missing,
+    /// null, or otherwise unsupported `arguments` value becomes `"{}"`.
+    ///
+    /// - Throws: A decoding error when `name` is missing or is not a string.
+    init(from decoder: Decoder) throws {
+        let container = try decoder.container(keyedBy: CodingKeys.self)
+        name = try container.decode(String.self, forKey: .name)
+
+        if let argumentString = try? container.decode(String.self, forKey: .arguments) {
+            arguments = argumentString
+        } else if let argumentObject = try? container.decode([String: AnyCodable].self, forKey: .arguments) {
+            arguments = Self.serializedJSON(from: argumentObject.mapValues(\.value), fallback: "{}")
+        } else if let argumentArray = try? container.decode([AnyCodable].self, forKey: .arguments) {
+            arguments = Self.serializedJSON(from: argumentArray.map(\.value), fallback: "[]")
+        } else {
+            // Explicit null, a missing key, or any other shape.
+            arguments = "{}"
+        }
+    }
+
+    /// Serializes `jsonObject` to a sorted-key JSON string.
+    ///
+    /// - Returns: The JSON text, or `fallback` when `jsonObject` is not valid JSON.
+    private static func serializedJSON(from jsonObject: Any, fallback: String) -> String {
+        // `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception,
+        // which `try?` cannot catch, for invalid input, so validate first.
+        guard JSONSerialization.isValidJSONObject(jsonObject),
+              let data = try? JSONSerialization.data(withJSONObject: jsonObject, options: [.sortedKeys]),
+              let string = String(data: data, encoding: .utf8)
+        else {
+            return fallback
+        }
+        return string
+    }
 }

 struct APIToolCall: Codable {
@@ -30,6 +74,14 @@ struct APIToolCall: Codable {
        self.type = type
        self.function = function
    }
+
+    init(from decoder: Decoder) throws { // Lenient decode: only `function` is mandatory.
+        let container = try decoder.container(keyedBy: CodingKeys.self)
+        index = try container.decodeIfPresent(Int.self, forKey: .index) ?? 0 // absent or null -> 0
+        id = try container.decodeIfPresent(String.self, forKey: .id) ?? "call_\(UUID().uuidString.lowercased())" // absent or null -> freshly generated id
+        type = try container.decodeIfPresent(String.self, forKey: .type) ?? "function" // absent or null -> "function"
+        function = try container.decode(APIFunctionCall.self, forKey: .function) // required; throws when missing
+    }
 }

 struct APIImageURL: Codable {
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -16,12 +16,6 @@ final class APIServer {
    private var listener: NWListener?
    private var modelManager: ModelManager?

-    // Persistent ChatSession for KV cache reuse across requests
-    private var cachedSession: ChatSession?
-    private var cachedMessages: [Chat.Message]?
-    private var cachedModelId: String?
-    private var cachedInstructions: String = ""
-
    func start(modelManager: ModelManager, port: Int = 1234) {
        guard !isRunning else { return }
        self.modelManager = modelManager
@@ -70,10 +64,7 @@ final class APIServer {
        listener?.cancel()
        listener = nil
        isRunning = false
-        cachedSession = nil
-        cachedMessages = nil
-        cachedModelId = nil
-        cachedInstructions = ""
+        ConversationSessionCache.shared.invalidateAll()
        inferenceStats.stopSampling()
    }

@@ -186,10 +177,7 @@ final class APIServer {
            if let targetConfig = ModelConfig.resolve(requestedModel) {
                if modelManager.currentModel?.id != targetConfig.id {
                    print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
-                    cachedSession = nil
-                    cachedMessages = nil
-                    cachedModelId = nil
-                    cachedInstructions = ""
+                    ConversationSessionCache.shared.invalidateAll()
                    await modelManager.loadModel(targetConfig)
                }
            }
@@ -200,10 +188,7 @@ final class APIServer {
        if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
           let config = ModelConfig.resolve(lastModelId) {
            print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
-            cachedSession = nil
-            cachedMessages = nil
-            cachedModelId = nil
-            cachedInstructions = ""
+            ConversationSessionCache.shared.invalidateAll()
            await modelManager.loadModel(config)
        }

@@ -233,9 +218,13 @@ final class APIServer {
            return
        }

+        LiveCounters.shared.requestStarted(requestId: requestId, contextLength: contextLength)
+
        // Convert API messages to Chat.Message, extracting images from content parts
        var chatMessages: [Chat.Message] = []
+        var messageSignatures: [UInt64] = []
        var images: [UserInput.Image] = []
+        var estimatedBytes = 0
        let currentModelRepoId = currentModel?.repoId ?? modelName

        // Build the instructions string (system prompt + tool definitions).
@@ -259,8 +248,8 @@ final class APIServer {
            instructions += toolSystemPrompt
        }

-        let toolsForInjection = request.tools
        let isQwen = currentModelRepoId.lowercased().contains("qwen")
+        estimatedBytes += instructions.utf8.count

        // Convert non-system messages to Chat.Message
        for msg in request.messages where msg.role != "system" {
@@ -297,18 +286,25 @@ final class APIServer {
            // Extract base64 images from content parts
            let imageURLs = msg.content?.imageURLs ?? []
            var messageImages: [UserInput.Image] = []
+            var messageImageBytes = 0
            for urlString in imageURLs {
-                if let image = decodeBase64Image(urlString) {
-                    messageImages.append(image)
+                if let decoded = decodeBase64Image(urlString) {
+                    messageImages.append(decoded.image)
+                    messageImageBytes += decoded.estimatedBytes
                }
            }

            // Attach images to this specific message
            chatMessages.append(Chat.Message(role: role, content: text, images: messageImages))
+            messageSignatures.append(
+                Self.messageSignature(role: role, content: text, imageURLs: imageURLs)
+            )
+            estimatedBytes += text.utf8.count + messageImageBytes
            images.append(contentsOf: messageImages)
        }

        if !images.isEmpty, currentModel?.supportsImages != true {
+            LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
            sendResponse(
                connection: connection,
                status: 400,
@@ -318,18 +314,18 @@ final class APIServer {
        }

        // Context window check: estimate token count and reject if over limit
+        let estimatedPromptTokens = (instructions.count + chatMessages.reduce(0) { $0 + $1.content.count }) * 10 / 35
        if contextLength > 0 {
-            let totalChars = chatMessages.reduce(0) { $0 + $1.content.count }
-            let estimatedTokens = totalChars * 10 / 35  // ~3.5 chars per token
-            let needed = estimatedTokens + maxTokens
+            let needed = estimatedPromptTokens + maxTokens
            if needed > contextLength {
                let errorBody = """
                {"error":{"message":"This model's maximum context length is \(contextLength) tokens. \
-                However, your messages resulted in approximately \(estimatedTokens) tokens and \
+                However, your messages resulted in approximately \(estimatedPromptTokens) tokens and \
                \(maxTokens) tokens were requested for the completion (\(needed) total). \
                Please reduce the length of the messages or completion.",\
                "type":"invalid_request_error","code":"context_length_exceeded"}}
                """
+                LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
                sendResponse(connection: connection, status: 400, body: errorBody)
                return
            }
@@ -345,23 +341,28 @@ final class APIServer {
        let allButLast = Array(chatMessages.dropLast())
        let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "")

-        // KV cache reuse: check if the cached session's history matches
-        let currentModelId = modelManager.currentModel?.id
-        let canReuse = cachedSession != nil
-            && cachedModelId == currentModelId
-            && cachedMessages != nil
-            && cachedInstructions == instructions
-            && messagesMatch(cachedMessages!, allButLast)
+        let historySignatures = Array(messageSignatures.dropLast())
+        let currentModelId = modelManager.currentModel?.id ?? modelName
+        let lease = ConversationSessionCache.shared.checkoutSession(
+            modelId: currentModelId,
+            instructions: instructions,
+            historySignatures: historySignatures,
+            requestMessageCount: chatMessages.count,
+            estimatedPromptTokens: estimatedPromptTokens,
+            estimatedBytes: estimatedBytes
+        )

        let session: ChatSession
-        if canReuse {
+        if let reusableSession = lease.session {
            print("[APIServer] Reusing cached session (\(allButLast.count) history messages)")
-            session = cachedSession!
+            session = reusableSession
            session.generateParameters = generateParams
+            ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
+            LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
        } else {
-            if cachedSession != nil {
-                print("[APIServer] History diverged, creating fresh session")
-            }
+            print("[APIServer] Creating fresh session")
+            ConversationSessionCache.shared.markSessionBuild(entryId: lease.entryId)
+            LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .sessionBuild)
            // Use `instructions:` for system/tool prompt (matches internal chat pattern).
            // Only conversation turns go in `history:` — this avoids replaying the
            // large tool prompt as history on every new session.
@@ -385,47 +386,62 @@ final class APIServer {
                    additionalContext: thinkingContext
                )
            }
+            ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
+            LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
        }

        // Extract images from the last message only (ChatSession.streamDetails takes images separately)
        let lastImages = lastMessage.images

-        LiveCounters.shared.requestStarted(contextLength: contextLength)
+        let result: (promptTokens: Int, completionTokens: Int, succeeded: Bool)

        if isStream {
-            await handleStreamingResponse(
+            result = await handleStreamingResponse(
                connection: connection,
+                requestId: requestId,
+                cacheEntryId: lease.entryId,
                session: session,
                prompt: lastMessage.content,
                images: lastImages,
                tools: request.tools,
-                requestId: requestId,
                created: created,
                modelName: modelName
            )
        } else {
-            await handleNonStreamingResponse(
+            result = await handleNonStreamingResponse(
                connection: connection,
+                requestId: requestId,
+                cacheEntryId: lease.entryId,
                session: session,
                prompt: lastMessage.content,
                images: lastImages,
                tools: request.tools,
-                requestId: requestId,
                created: created,
                modelName: modelName
            )
        }

-        // Cache the session for reuse on next request
-        // allButLast + lastMessage (user) + assistant response = new cached history
-        cachedSession = session
-        cachedMessages = chatMessages  // full messages including the one just sent
-        cachedModelId = currentModelId
-        cachedInstructions = instructions
+        if result.succeeded {
+            ConversationSessionCache.shared.completeRequest(
+                entryId: lease.entryId,
+                session: session,
+                requestMessageSignatures: messageSignatures,
+                requestMessageCount: chatMessages.count,
+                estimatedPromptTokens: estimatedPromptTokens,
+                estimatedBytes: estimatedBytes,
+                promptTokens: result.promptTokens,
+                completionTokens: result.completionTokens
+            )
+        } else {
+            ConversationSessionCache.shared.abandonRequest(entryId: lease.entryId)
+        }
+
+        LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens)
+        modelManager.touchActivity()
    }

    /// Decode a base64 data URI (data:image/png;base64,...) into a UserInput.Image.
-    private func decodeBase64Image(_ urlString: String) -> UserInput.Image? {
+    private func decodeBase64Image(_ urlString: String) -> DecodedImage? {
        // Handle data URIs: data:image/png;base64,<data>
        let base64String: String
        if urlString.hasPrefix("data:") {
@@ -442,21 +458,23 @@ final class APIServer {
            return nil
        }

-        return .ciImage(CIImage(cgImage: cgImage))
+                let estimatedBytes = max(data.count, cgImage.width * cgImage.height * 4)
+                return DecodedImage(image: .ciImage(CIImage(cgImage: cgImage)), estimatedBytes: estimatedBytes)
    }

    // MARK: - Non-streaming response

    private func handleNonStreamingResponse(
        connection: NWConnection,
+        requestId: String,
+        cacheEntryId: UUID,
        session: ChatSession,
        prompt: String,
        images: [UserInput.Image],
        tools: [APIToolDefinition]?,
-        requestId: String,
        created: Int,
        modelName: String
-    ) async {
+    ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
        do {
            var fullText = ""
            var promptTokens = 0
@@ -478,7 +496,12 @@ final class APIServer {
                case .info(let info):
                    promptTokens = info.promptTokenCount
                    completionTokens = info.generationTokenCount
-                    LiveCounters.shared.prefillCompleted(promptTokens: promptTokens)
+                    ConversationSessionCache.shared.markGenerating(
+                        entryId: cacheEntryId,
+                        promptTokens: promptTokens,
+                        completionTokens: completionTokens
+                    )
+                    LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
                    if info.tokensPerSecond > 0 {
                        LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
                    }
@@ -487,9 +510,6 @@ final class APIServer {
                }
            }

-            LiveCounters.shared.requestCompleted(generationTokens: completionTokens)
-            modelManager?.touchActivity()
-
            // Parse tool calls: first check framework-detected ones, then our own text parser
            var finishReason = "stop"
            var responseContent: String? = fullText
@@ -559,10 +579,10 @@ final class APIServer {
            if let json = try? JSONEncoder().encode(response) {
                sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
            }
+            return (promptTokens, completionTokens, true)
        } catch {
-            LiveCounters.shared.requestCompleted(generationTokens: 0)
-            modelManager?.touchActivity()
            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
+            return (0, 0, false)
        }
    }

@@ -570,14 +590,15 @@ final class APIServer {

    private func handleStreamingResponse(
        connection: NWConnection,
+        requestId: String,
+        cacheEntryId: UUID,
        session: ChatSession,
        prompt: String,
        images: [UserInput.Image],
        tools: [APIToolDefinition]?,
-        requestId: String,
        created: Int,
        modelName: String
-    ) async {
+    ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
        // Send SSE headers
        let header = [
            "HTTP/1.1 200 OK",
@@ -625,7 +646,16 @@ final class APIServer {
            )
        }()

-        let (promptTokens, completionTokens, fullText, frameworkToolCalls) = result
+        let (promptTokens, completionTokens, fullText, frameworkToolCalls, succeeded) = result
+
+        if promptTokens > 0 {
+            ConversationSessionCache.shared.markGenerating(
+                entryId: cacheEntryId,
+                promptTokens: promptTokens,
+                completionTokens: completionTokens
+            )
+            LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
+        }

        // Stats were already updated by LiveCounters inside the loop

@@ -696,12 +726,10 @@ final class APIServer {
            )
        ))

-        LiveCounters.shared.requestCompleted(generationTokens: completionTokens)
-        modelManager?.touchActivity()
-
        // Send [DONE] and close
        await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
        connection.cancel()
+        return (promptTokens, completionTokens, succeeded)
    }

    /// Run the token generation + SSE send loop entirely off MainActor.
@@ -713,7 +741,7 @@ final class APIServer {
        requestId: String,
        created: Int,
        modelName: String
-    ) async -> (Int, Int, String, [MLXLMCommon.ToolCall]) {
+    ) async -> (Int, Int, String, [MLXLMCommon.ToolCall], Bool) {
        var promptTokens = 0
        var completionTokens = 0
        var fullText = ""
@@ -742,7 +770,6 @@ final class APIServer {
                case .info(let info):
                    promptTokens = info.promptTokenCount
                    completionTokens = info.generationTokenCount
-                    LiveCounters.shared.prefillCompleted(promptTokens: promptTokens)
                    if info.tokensPerSecond > 0 {
                        LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
                    }
@@ -754,9 +781,10 @@ final class APIServer {
        } catch {
            let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
            await sendData(connection: connection, data: errorEvent.data(using: .utf8)!)
+            return (promptTokens, completionTokens, fullText, frameworkToolCalls, false)
        }

-        return (promptTokens, completionTokens, fullText, frameworkToolCalls)
+        return (promptTokens, completionTokens, fullText, frameworkToolCalls, true)
    }

    /// Send an SSE event and wait for the protocol stack to process it.
@@ -819,26 +847,42 @@ final class APIServer {
        ]
    }

-    /// Check if the cached session can be reused for the new history.
-    ///
-    /// After a request the session's KV cache contains:
-    ///   cachedMessages (history + user prompt) + the generated assistant response.
-    /// On the next request the client sends back the full conversation, so
-    /// `newHistory` (allButLast) is typically `cachedMessages` + 1 assistant reply.
-    /// We allow reuse when `cached` is a prefix of `newHistory` and there is at most
-    /// one extra message (the assistant response the session already generated).
-    /// More than one extra message (e.g. injected tool results) means the session
-    /// hasn't processed them, so we must create a fresh session.
-    private func messagesMatch(_ cached: [Chat.Message], _ newHistory: [Chat.Message]) -> Bool {
-        guard cached.count <= newHistory.count,
-              newHistory.count <= cached.count + 1 else { return false }
-        for (a, b) in zip(cached, newHistory) {
-            if a.role != b.role || a.content != b.content { return false }
+    /// Folds one chat message into a 64-bit signature.
+    ///
+    /// The running hash starts at 14_695_981_039_346_656_037 and, for every UTF-8 byte, is
+    /// XORed with the byte and then wrap-multiplied by 1_099_511_628_211 (the same constants
+    /// and update step as `ConversationSessionCache.stableHash`). Bytes are fed in this order:
+    /// a fixed role name, `"|"`, the content, then `"|"` plus the URL for each image.
+    ///
+    /// - Parameters:
+    ///   - role: Message role; mapped to "assistant", "system", "user" or "unknown".
+    ///   - content: Message text.
+    ///   - imageURLs: Image references mixed in after the content, in the given order.
+    /// - Returns: The resulting hash value.
+    ///
+    /// NOTE(review): the `"|"` separator is not escaped, so content that itself contains `"|"`
+    /// can hash the same as a shorter content followed by an image URL.
+    private static func messageSignature(role: Chat.Message.Role, content: String, imageURLs: [String]) -> UInt64 {
+        var hash: UInt64 = 14_695_981_039_346_656_037
+
+        func mix(_ text: String) {
+            for byte in text.utf8 {
+                hash ^= UInt64(byte)
+                hash &*= 1_099_511_628_211
+            }
         }
-        return true
+
+        switch role {
+        case .assistant:
+            mix("assistant")
+        case .system:
+            mix("system")
+        case .user:
+            mix("user")
+        @unknown default:
+            mix("unknown")
+        }
+        mix("|")
+        mix(content)
+        for imageURL in imageURLs {
+            mix("|")
+            mix(imageURL)
+        }
+
+        return hash
    }
 }

+/// A decoded request image paired with a rough size figure for it.
+private struct DecodedImage {
+    /// The decoded image in the form passed on as model input.
+    let image: UserInput.Image
+    /// Size estimate in bytes. At the construction site visible in this file it is the larger
+    /// of the encoded data length and `width * height * 4`.
+    let estimatedBytes: Int
+}
+
 // MARK: - HTTP request parser

 private struct HTTPRequest {
--- a/MLXServer/Server/ConversationSessionCache.swift
+++ b/MLXServer/Server/ConversationSessionCache.swift
@@ -0,0 +1,358 @@
+import Foundation
+import MLXLMCommon
+import os
+
+/// Phase recorded on a cache entry as a request moves through it.
+/// The raw values are human-readable labels.
+enum APISessionPhase: String, Sendable {
+    /// Set when a request completes or is abandoned.
+    case idle = "Idle"
+    /// Initial phase of a freshly inserted entry that has no session yet.
+    case sessionBuild = "Session Build"
+    /// Set on a cache hit at checkout, or explicitly via `markPrefilling`.
+    case prefilling = "Prefilling"
+    /// Set by `markGenerating` once token counts are reported.
+    case generating = "Generating"
+}
+
+/// Bounded cache of API chat sessions, matched on model, instruction hash and normalized
+/// per-message history signatures.
+///
+/// `entries` and `totals` are only read or written while `lock` is held. The cache is
+/// internal-only and safe to sample from the monitor without involving MainActor.
+final class ConversationSessionCache: @unchecked Sendable {
+    static let shared = ConversationSessionCache()
+
+    /// Guards `entries` and `totals`.
+    private let lock = OSAllocatedUnfairLock()
+
+    /// Entry-count ceiling applied by `enforceBudgetLocked(now:)`.
+    private let maxEntries = 8
+    /// Ceiling on the summed `cachedTokenEstimate` across all entries.
+    private let maxCachedTokens = 256_000
+    /// Seconds an entry without in-flight requests may go untouched before it is pruned.
+    private let idleTTL: TimeInterval = 10 * 60
+
+    private var entries: [UUID: Entry] = [:]
+    private var totals = Totals()
+
+    private init() {}
+
+    // MARK: - Public value types
+
+    /// Result of `checkoutSession`; `entryId` names the entry the caller now holds in flight.
+    struct Lease {
+        let entryId: UUID
+        /// The cached session on a hit, `nil` on a miss.
+        let session: ChatSession?
+        /// Reuse figure recorded for a hit, `0` on a miss.
+        let reusedPromptTokens: Int
+        let cacheHit: Bool
+    }
+
+    /// Immutable copy of one entry's bookkeeping, without the session itself.
+    struct SessionSummary: Identifiable, Sendable {
+        let id: UUID
+        let modelId: String
+        let phase: APISessionPhase
+        let messageCount: Int
+        let cachedTokenEstimate: Int
+        let estimatedBytes: Int
+        let inFlightRequests: Int
+        let hitCount: Int
+        let lastPromptTokens: Int
+        let lastCompletionTokens: Int
+        let lastReuseTokens: Int
+        let createdAt: Date
+        let lastAccessAt: Date
+    }
+
+    /// Aggregate counters plus the per-entry summaries returned by `snapshot()`.
+    struct Snapshot: Sendable {
+        let totalEntries: Int
+        let warmEntries: Int
+        let activeEntries: Int
+        let generatingEntries: Int
+        let estimatedBytes: Int
+        let cachedTokenEstimate: Int
+        let totalHits: Int
+        let totalMisses: Int
+        let totalEvictions: Int
+        let totalReusePromptTokens: Int
+        let totalRebuildPromptTokens: Int
+        let sessions: [SessionSummary]
+    }
+
+    // MARK: - Request lifecycle
+
+    /// Looks up a reusable session for the incoming request, or registers a new entry.
+    ///
+    /// Expired entries are pruned first. A hit requires an entry for the same model and
+    /// instruction hash that already holds a session, has no in-flight request, and whose stored
+    /// signatures satisfy `historyMatches`; among several, the one with the most stored
+    /// signatures is taken. A miss inserts an entry without a session in the `.sessionBuild`
+    /// phase. In both cases the returned entry has one more in-flight request, which the caller
+    /// releases through `completeRequest` or `abandonRequest`.
+    func checkoutSession(
+        modelId: String,
+        instructions: String,
+        historySignatures: [UInt64],
+        requestMessageCount: Int,
+        estimatedPromptTokens: Int,
+        estimatedBytes: Int
+    ) -> Lease {
+        lock.lock()
+        defer { lock.unlock() }
+
+        let timestamp = Date()
+        pruneExpiredLocked(now: timestamp)
+        let instructionsHash = Self.stableHash(instructions)
+
+        // A candidate must be warm, idle, and belong to the same model and instructions.
+        let isReusable: (Entry) -> Bool = { candidate in
+            guard candidate.modelId == modelId,
+                  candidate.instructionsHash == instructionsHash,
+                  candidate.session != nil,
+                  candidate.inFlightRequests == 0
+            else { return false }
+            return Self.historyMatches(cached: candidate.requestMessageSignatures, incoming: historySignatures)
+        }
+        // Among the candidates, prefer the one that covers the most messages.
+        let longestMatch = entries.values
+            .filter(isReusable)
+            .max { $0.requestMessageSignatures.count < $1.requestMessageSignatures.count }
+
+        guard var hit = longestMatch else {
+            // Miss: insert a cold entry (no session yet) that already counts one in-flight request.
+            let entryId = UUID()
+            entries[entryId] = Entry(
+                id: entryId,
+                modelId: modelId,
+                instructionsHash: instructionsHash,
+                requestMessageSignatures: historySignatures,
+                messageCount: requestMessageCount,
+                cachedTokenEstimate: estimatedPromptTokens,
+                estimatedBytes: estimatedBytes,
+                createdAt: timestamp,
+                lastAccessAt: timestamp,
+                inFlightRequests: 1,
+                hitCount: 0,
+                phase: .sessionBuild,
+                lastPromptTokens: 0,
+                lastCompletionTokens: 0,
+                lastReuseTokens: 0,
+                session: nil
+            )
+            totals.totalMisses += 1
+            totals.totalRebuildPromptTokens += estimatedPromptTokens
+            return Lease(entryId: entryId, session: nil, reusedPromptTokens: 0, cacheHit: false)
+        }
+
+        hit.inFlightRequests += 1
+        hit.lastAccessAt = timestamp
+        hit.phase = .prefilling
+        // NOTE(review): this records the larger of the cached estimate and the incoming prompt
+        // estimate as "reused" - confirm that `max` rather than `min` is the intended figure.
+        hit.lastReuseTokens = max(hit.cachedTokenEstimate, estimatedPromptTokens)
+        hit.hitCount += 1
+        entries[hit.id] = hit
+        totals.totalHits += 1
+        totals.totalReusePromptTokens += hit.lastReuseTokens
+        return Lease(
+            entryId: hit.id,
+            session: hit.session,
+            reusedPromptTokens: hit.lastReuseTokens,
+            cacheHit: true
+        )
+    }
+
+    /// Moves the entry to the `.sessionBuild` phase and refreshes its access time.
+    func markSessionBuild(entryId: UUID) {
+        updatePhase(entryId: entryId, phase: .sessionBuild)
+    }
+
+    /// Moves the entry to the `.prefilling` phase and refreshes its access time.
+    func markPrefilling(entryId: UUID) {
+        updatePhase(entryId: entryId, phase: .prefilling)
+    }
+
+    /// Records reported token counts and moves the entry to `.generating`.
+    /// The cached-token estimate only ever grows here. Unknown ids are ignored.
+    func markGenerating(entryId: UUID, promptTokens: Int, completionTokens: Int) {
+        lock.lock()
+        defer { lock.unlock() }
+
+        guard var entry = entries[entryId] else { return }
+        entry.phase = .generating
+        entry.lastPromptTokens = promptTokens
+        entry.lastCompletionTokens = completionTokens
+        entry.cachedTokenEstimate = max(entry.cachedTokenEstimate, promptTokens + completionTokens)
+        entry.lastAccessAt = Date()
+        entries[entryId] = entry
+    }
+
+    /// Stores the finished request's session and bookkeeping on the entry, releases one
+    /// in-flight slot, marks it `.idle` and then enforces the cache budgets.
+    /// Does nothing when the entry no longer exists.
+    func completeRequest(
+        entryId: UUID,
+        session: ChatSession,
+        requestMessageSignatures: [UInt64],
+        requestMessageCount: Int,
+        estimatedPromptTokens: Int,
+        estimatedBytes: Int,
+        promptTokens: Int,
+        completionTokens: Int
+    ) {
+        lock.lock()
+        defer { lock.unlock() }
+
+        guard var entry = entries[entryId] else { return }
+        let finishedAt = Date()
+        entry.session = session
+        entry.requestMessageSignatures = requestMessageSignatures
+        entry.messageCount = requestMessageCount
+        entry.cachedTokenEstimate = max(estimatedPromptTokens, promptTokens + completionTokens)
+        entry.estimatedBytes = estimatedBytes
+        entry.lastPromptTokens = promptTokens
+        entry.lastCompletionTokens = completionTokens
+        entry.lastAccessAt = finishedAt
+        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
+        entry.phase = .idle
+        entries[entryId] = entry
+        enforceBudgetLocked(now: finishedAt)
+    }
+
+    /// Releases one in-flight slot without storing a session.
+    /// An entry left with neither a session nor in-flight requests is removed (this is not
+    /// counted as an eviction); any other entry goes back to `.idle`.
+    func abandonRequest(entryId: UUID) {
+        lock.lock()
+        defer { lock.unlock() }
+
+        guard var entry = entries[entryId] else { return }
+        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
+
+        let isColdAndUnused = entry.session == nil && entry.inFlightRequests == 0
+        guard !isColdAndUnused else {
+            entries[entryId] = nil
+            return
+        }
+        entry.phase = .idle
+        entry.lastAccessAt = Date()
+        entries[entryId] = entry
+    }
+
+    /// Drops every entry and counts each one as an eviction.
+    func invalidateAll() {
+        lock.lock()
+        defer { lock.unlock() }
+
+        totals.totalEvictions += entries.count
+        entries.removeAll()
+    }
+
+    /// Drops every entry and zeroes all cumulative counters.
+    func reset() {
+        lock.lock()
+        defer { lock.unlock() }
+
+        entries.removeAll()
+        totals = Totals()
+    }
+
+    // MARK: - Monitoring
+
+    /// Returns aggregate counters and per-entry summaries.
+    /// Expired entries are pruned before sampling, so this call can mutate the cache.
+    /// Summaries are ordered by in-flight count (highest first), then most recent access.
+    func snapshot() -> Snapshot {
+        lock.lock()
+        defer { lock.unlock() }
+
+        pruneExpiredLocked(now: Date())
+        let current = Array(entries.values)
+
+        let summaries = current
+            .sorted { lhs, rhs in
+                guard lhs.inFlightRequests == rhs.inFlightRequests else {
+                    return lhs.inFlightRequests > rhs.inFlightRequests
+                }
+                return lhs.lastAccessAt > rhs.lastAccessAt
+            }
+            .map { entry in
+                SessionSummary(
+                    id: entry.id,
+                    modelId: entry.modelId,
+                    phase: entry.phase,
+                    messageCount: entry.messageCount,
+                    cachedTokenEstimate: entry.cachedTokenEstimate,
+                    estimatedBytes: entry.estimatedBytes,
+                    inFlightRequests: entry.inFlightRequests,
+                    hitCount: entry.hitCount,
+                    lastPromptTokens: entry.lastPromptTokens,
+                    lastCompletionTokens: entry.lastCompletionTokens,
+                    lastReuseTokens: entry.lastReuseTokens,
+                    createdAt: entry.createdAt,
+                    lastAccessAt: entry.lastAccessAt
+                )
+            }
+
+        // Tally all aggregates in one pass over the entries.
+        var warm = 0
+        var active = 0
+        var generating = 0
+        var bytes = 0
+        var tokens = 0
+        for entry in current {
+            if entry.session != nil { warm += 1 }
+            if entry.inFlightRequests > 0 { active += 1 }
+            if entry.phase == .generating { generating += 1 }
+            bytes += entry.estimatedBytes
+            tokens += entry.cachedTokenEstimate
+        }
+
+        return Snapshot(
+            totalEntries: current.count,
+            warmEntries: warm,
+            activeEntries: active,
+            generatingEntries: generating,
+            estimatedBytes: bytes,
+            cachedTokenEstimate: tokens,
+            totalHits: totals.totalHits,
+            totalMisses: totals.totalMisses,
+            totalEvictions: totals.totalEvictions,
+            totalReusePromptTokens: totals.totalReusePromptTokens,
+            totalRebuildPromptTokens: totals.totalRebuildPromptTokens,
+            sessions: summaries
+        )
+    }
+
+    // MARK: - Internals (the `...Locked` helpers expect `lock` to be held by the caller)
+
+    /// Sets the phase of an existing entry and refreshes its access time.
+    private func updatePhase(entryId: UUID, phase: APISessionPhase) {
+        lock.lock()
+        defer { lock.unlock() }
+
+        guard var entry = entries[entryId] else { return }
+        entry.phase = phase
+        entry.lastAccessAt = Date()
+        entries[entryId] = entry
+    }
+
+    /// Removes entries with no in-flight request whose last access is older than `idleTTL`,
+    /// counting each removal as an eviction.
+    private func pruneExpiredLocked(now: Date) {
+        let expiredIds = entries.values
+            .filter { $0.inFlightRequests == 0 && now.timeIntervalSince($0.lastAccessAt) > idleTTL }
+            .map(\.id)
+        guard !expiredIds.isEmpty else { return }
+
+        expiredIds.forEach { entries[$0] = nil }
+        totals.totalEvictions += expiredIds.count
+    }
+
+    /// Prunes expired entries, then evicts idle entries in `evictionOrder` until both the
+    /// entry-count and cached-token ceilings hold, or nothing evictable is left.
+    private func enforceBudgetLocked(now: Date) {
+        pruneExpiredLocked(now: now)
+
+        var isOverBudget: Bool {
+            entries.count > maxEntries
+                || entries.values.reduce(0) { $0 + $1.cachedTokenEstimate } > maxCachedTokens
+        }
+
+        while isOverBudget {
+            let evictable = entries.values.filter { $0.inFlightRequests == 0 }
+            // Entries with in-flight requests are never evicted, so the budget may stay exceeded.
+            guard let victim = evictable.sorted(by: evictionOrder).first else { break }
+            entries[victim.id] = nil
+            totals.totalEvictions += 1
+        }
+    }
+
+    /// Ordering used to pick eviction victims: least recently accessed first, then the larger
+    /// cached-token estimate, then the older creation time.
+    private func evictionOrder(lhs: Entry, rhs: Entry) -> Bool {
+        guard lhs.lastAccessAt == rhs.lastAccessAt else {
+            return lhs.lastAccessAt < rhs.lastAccessAt
+        }
+        guard lhs.cachedTokenEstimate == rhs.cachedTokenEstimate else {
+            return lhs.cachedTokenEstimate > rhs.cachedTokenEstimate
+        }
+        return lhs.createdAt < rhs.createdAt
+    }
+
+    /// True when `cached` is a prefix of `incoming` and `incoming` has at most one more element.
+    private static func historyMatches(cached: [UInt64], incoming: [UInt64]) -> Bool {
+        let extraMessages = incoming.count - cached.count
+        guard extraMessages == 0 || extraMessages == 1 else { return false }
+        return incoming.starts(with: cached)
+    }
+
+    /// Hashes the UTF-8 bytes of `text`: starting from 14_695_981_039_346_656_037, each byte is
+    /// XORed in and the result wrap-multiplied by 1_099_511_628_211.
+    static func stableHash(_ text: String) -> UInt64 {
+        text.utf8.reduce(14_695_981_039_346_656_037 as UInt64) { partial, byte in
+            (partial ^ UInt64(byte)) &* 1_099_511_628_211
+        }
+    }
+
+    /// Mutable per-session record stored in `entries`.
+    private struct Entry {
+        let id: UUID
+        let modelId: String
+        let instructionsHash: UInt64
+        var requestMessageSignatures: [UInt64]
+        var messageCount: Int
+        var cachedTokenEstimate: Int
+        var estimatedBytes: Int
+        let createdAt: Date
+        var lastAccessAt: Date
+        var inFlightRequests: Int
+        var hitCount: Int
+        var phase: APISessionPhase
+        var lastPromptTokens: Int
+        var lastCompletionTokens: Int
+        var lastReuseTokens: Int
+        /// `nil` until the first request on this entry completes.
+        var session: ChatSession?
+    }
+
+    /// Cumulative counters; cleared only by `reset()`.
+    private struct Totals {
+        var totalHits: Int = 0
+        var totalMisses: Int = 0
+        var totalEvictions: Int = 0
+        var totalReusePromptTokens: Int = 0
+        var totalRebuildPromptTokens: Int = 0
+    }
+}
--- a/MLXServer/Views/MonitorView.swift
+++ b/MLXServer/Views/MonitorView.swift
@@ -6,28 +6,31 @@ import SwiftUI
 struct MonitorView: View {
    let stats: InferenceStats
    @Environment(ModelManager.self) private var modelManager
+    private let chartColumns = [GridItem(.flexible(minimum: 260), spacing: 16), GridItem(.flexible(minimum: 260), spacing: 16)]
+    private let cardColumns = [GridItem(.flexible(minimum: 180), spacing: 16), GridItem(.flexible(minimum: 180), spacing: 16)]

    var body: some View {
        ScrollView {
            VStack(spacing: 20) {
-                // Live status header
                liveStatusSection

-                // Charts
-                HStack(alignment: .top, spacing: 16) {
+                LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
                    tokenRateChart
                    tokenThroughputChart
+                    cacheReuseChart
+                    cacheFootprintChart
+                    cacheSessionChart
                }

-                // Gauges row
-                HStack(spacing: 16) {
+                LazyVGrid(columns: cardColumns, alignment: .leading, spacing: 16) {
                    contextGauge
                    gpuMemoryGauge
                    requestsCard
+                    cacheCard
                }

-                // Cumulative stats
                cumulativeSection
+                sessionSection
            }
            .padding(20)
        }
@@ -39,45 +42,54 @@ struct MonitorView: View {

    @ViewBuilder
    private var liveStatusSection: some View {
-        HStack(spacing: 16) {
-            // Activity indicator
-            HStack(spacing: 8) {
-                Circle()
-                    .fill(activityColor)
-                    .frame(width: 10, height: 10)
-                    .overlay {
-                        if stats.isGenerating || stats.isPrefilling {
-                            Circle()
-                                .stroke(activityColor.opacity(0.5), lineWidth: 2)
-                                .scaleEffect(1.8)
-                                .opacity(0.6)
+        VStack(alignment: .leading, spacing: 12) {
+            HStack(spacing: 16) {
+                HStack(spacing: 8) {
+                    Circle()
+                        .fill(activityColor)
+                        .frame(width: 10, height: 10)
+                        .overlay {
+                            if stats.activeRequests > 0 {
+                                Circle()
+                                    .stroke(activityColor.opacity(0.5), lineWidth: 2)
+                                    .scaleEffect(1.8)
+                                    .opacity(0.6)
+                            }
                        }
-                    }

-                Text(activityLabel)
-                    .font(.headline)
-            }
-
-            Spacer()
-
-            if stats.isGenerating {
-                Text(String(format: "%.1f tok/s", stats.currentTokensPerSecond))
-                    .font(.title2.monospacedDigit().bold())
-                    .foregroundStyle(.green)
-            }
-
-            if stats.currentPromptTokens > 0 {
-                HStack(spacing: 4) {
-                    Image(systemName: "arrow.down.circle.fill")
-                        .foregroundStyle(.blue)
-                    Text("\(stats.currentPromptTokens)")
-                        .monospacedDigit()
-                    Image(systemName: "arrow.up.circle.fill")
-                        .foregroundStyle(.orange)
-                    Text("\(stats.currentGenerationTokens)")
-                        .monospacedDigit()
+                    Text(activityLabel)
+                        .font(.headline)
                }
-                .font(.callout)
+
+                Spacer()
+
+                if stats.isGenerating {
+                    Text(String(format: "%.1f tok/s", stats.currentTokensPerSecond))
+                        .font(.title2.monospacedDigit().bold())
+                        .foregroundStyle(.green)
+                }
+
+                if stats.currentPromptTokens > 0 {
+                    HStack(spacing: 4) {
+                        Image(systemName: "arrow.down.circle.fill")
+                            .foregroundStyle(.blue)
+                        Text("\(stats.currentPromptTokens)")
+                            .monospacedDigit()
+                        Image(systemName: "arrow.up.circle.fill")
+                            .foregroundStyle(.orange)
+                        Text("\(stats.currentGenerationTokens)")
+                            .monospacedDigit()
+                    }
+                    .font(.callout)
+                }
+            }
+
+            HStack(spacing: 8) {
+                phaseChip(title: "Preparing", count: stats.preparingRequests, color: .secondary)
+                phaseChip(title: "Session Build", count: stats.sessionBuildRequests, color: .purple)
+                phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue)
+                phaseChip(title: "Generating", count: stats.generatingRequests, color: .green)
+                phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange)
            }
        }
        .padding(12)
@@ -85,15 +97,19 @@ struct MonitorView: View {
    }

+    /// Color for the activity indicator, picked by the first condition that holds:
+    /// generating (green), prefilling (blue), session build (purple), preparing or any other
+    /// active request (orange); otherwise `.secondary`.
    private var activityColor: Color {
-        if stats.isPrefilling { return .blue }
        if stats.isGenerating { return .green }
+        if stats.prefillingRequests > 0 { return .blue }
+        if stats.sessionBuildRequests > 0 { return .purple }
+        if stats.preparingRequests > 0 { return .orange }
        if stats.activeRequests > 0 { return .orange }
        return .secondary
    }

+    /// Headline text for the activity indicator, checked in the same priority order as
+    /// `activityColor`: generating, prefilling, session build, preparing, any active request,
+    /// and finally "Idle".
    private var activityLabel: String {
-        if stats.isPrefilling { return "Prefilling" }
        if stats.isGenerating { return "Generating" }
+        if stats.prefillingRequests > 0 { return "Prefilling" }
+        if stats.sessionBuildRequests > 0 { return "Building Sessions" }
+        if stats.preparingRequests > 0 { return "Preparing Requests" }
        if stats.activeRequests > 0 { return "Processing" }
        return "Idle"
    }
@@ -145,6 +161,160 @@ struct MonitorView: View {
        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
    }

+    /// Card with a bar chart of the reuse history (green) and rebuild history (red) over time,
+    /// followed by a two-item legend.
+    @ViewBuilder
+    private var cacheReuseChart: some View {
+        VStack(alignment: .leading, spacing: 6) {
+            Text("Prefill Reuse (/sec)")
+                .foregroundStyle(.secondary)
+                .font(.caption.bold())
+
+            Chart {
+                // Both series plot "Time" against "Tokens"; only the tint differs.
+                ForEach(stats.cacheReuseHistory) { sample in
+                    BarMark(x: .value("Time", sample.timestamp), y: .value("Tokens", sample.value))
+                        .foregroundStyle(.green.opacity(0.75))
+                }
+                ForEach(stats.cacheRebuildHistory) { sample in
+                    BarMark(x: .value("Time", sample.timestamp), y: .value("Tokens", sample.value))
+                        .foregroundStyle(.red.opacity(0.65))
+                }
+            }
+            .chartXAxis {
+                // Grid line every 30 seconds, no labels.
+                AxisMarks(values: .stride(by: .second, count: 30)) { _ in AxisGridLine() }
+            }
+            .chartYAxis {
+                AxisMarks(position: .leading) { tick in
+                    AxisGridLine()
+                    AxisValueLabel {
+                        if let tokens = tick.as(Double.self) {
+                            Text(String(format: "%.0f", tokens))
+                                .font(.caption2.monospacedDigit())
+                        }
+                    }
+                }
+            }
+            .frame(height: 150)
+
+            HStack(spacing: 12) {
+                Label("Reused", systemImage: "circle.fill")
+                    .foregroundStyle(.green)
+                    .font(.caption2)
+                Label("Rebuilt", systemImage: "circle.fill")
+                    .foregroundStyle(.red)
+                    .font(.caption2)
+            }
+        }
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    /// Card with a line-plus-area chart of the cache footprint history.
+    /// Each sample value is divided by 1_048_576 before plotting on the "MB" axis.
+    @ViewBuilder
+    private var cacheFootprintChart: some View {
+        VStack(alignment: .leading, spacing: 6) {
+            Text("Cache Footprint (est)")
+                .foregroundStyle(.secondary)
+                .font(.caption.bold())
+
+            Chart(stats.cacheFootprintHistory) { sample in
+                let megabytes = sample.value / 1_048_576
+
+                LineMark(x: .value("Time", sample.timestamp), y: .value("MB", megabytes))
+                    .foregroundStyle(.orange)
+                    .interpolationMethod(.monotone)
+
+                // Faint fill underneath the line.
+                AreaMark(x: .value("Time", sample.timestamp), y: .value("MB", megabytes))
+                    .foregroundStyle(.orange.opacity(0.12))
+                    .interpolationMethod(.monotone)
+            }
+            .chartXAxis {
+                // Grid line every 30 seconds, no labels.
+                AxisMarks(values: .stride(by: .second, count: 30)) { _ in AxisGridLine() }
+            }
+            .chartYAxis {
+                AxisMarks(position: .leading) { tick in
+                    AxisGridLine()
+                    AxisValueLabel {
+                        if let size = tick.as(Double.self) {
+                            Text(String(format: "%.1f", size))
+                                .font(.caption2.monospacedDigit())
+                        }
+                    }
+                }
+            }
+            .frame(height: 150)
+        }
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    /// Card with two line series over time - the cache entry history (purple) and the active
+    /// session history (blue) - followed by a matching legend.
+    @ViewBuilder
+    private var cacheSessionChart: some View {
+        VStack(alignment: .leading, spacing: 6) {
+            Text("Cached Sessions")
+                .foregroundStyle(.secondary)
+                .font(.caption.bold())
+
+            Chart {
+                ForEach(stats.cacheEntryHistory) { sample in
+                    LineMark(x: .value("Time", sample.timestamp), y: .value("Cached", sample.value))
+                        .foregroundStyle(.purple)
+                        .interpolationMethod(.monotone)
+                }
+                ForEach(stats.activeSessionHistory) { sample in
+                    LineMark(x: .value("Time", sample.timestamp), y: .value("Active", sample.value))
+                        .foregroundStyle(.blue)
+                        .interpolationMethod(.monotone)
+                }
+            }
+            .chartXAxis {
+                // Grid line every 30 seconds, no labels.
+                AxisMarks(values: .stride(by: .second, count: 30)) { _ in AxisGridLine() }
+            }
+            .chartYAxis {
+                AxisMarks(position: .leading) { tick in
+                    AxisGridLine()
+                    AxisValueLabel {
+                        if let count = tick.as(Double.self) {
+                            Text(String(format: "%.0f", count))
+                                .font(.caption2.monospacedDigit())
+                        }
+                    }
+                }
+            }
+            .frame(height: 150)
+
+            HStack(spacing: 12) {
+                Label("Cached", systemImage: "circle.fill")
+                    .foregroundStyle(.purple)
+                    .font(.caption2)
+                Label("Active", systemImage: "circle.fill")
+                    .foregroundStyle(.blue)
+                    .font(.caption2)
+            }
+        }
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
    private var maxTokenRate: Double {
        stats.tokenRateHistory.map(\.value).max() ?? 10
    }
@@ -303,35 +473,69 @@ struct MonitorView: View {
        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
    }

+    /// Summary card for the session cache: total entry count as the headline, then warm and
+    /// active counts, estimated footprint, cached tokens and hit rate as labeled rows.
+    @ViewBuilder
+    private var cacheCard: some View {
+        // Preformatted row values; the integer counts stay inline as interpolated `Text`.
+        let footprintText = formatByteCount(stats.cacheEstimatedBytes)
+        let tokensText = formatTokenCount(stats.cacheEstimatedTokens)
+        let hitRateText = String(format: "%.0f%%", cacheHitRate * 100)
+
+        VStack(alignment: .leading, spacing: 8) {
+            Text("Session Cache")
+                .foregroundStyle(.secondary)
+                .font(.caption.bold())
+
+            Text("\(stats.cacheEntryCount)")
+                .font(.title3.monospacedDigit().bold())
+
+            LabeledContent("Warm") {
+                Text("\(stats.warmCacheEntryCount)").monospacedDigit()
+            }
+            .font(.caption)
+
+            LabeledContent("Active") {
+                Text("\(stats.activeCacheEntryCount)").monospacedDigit()
+            }
+            .font(.caption)
+
+            LabeledContent("Est. Footprint") {
+                Text(footprintText).monospacedDigit()
+            }
+            .font(.caption)
+
+            LabeledContent("Cached Tokens") {
+                Text(tokensText).monospacedDigit()
+            }
+            .font(.caption)
+
+            LabeledContent("Hit Rate") {
+                Text(hitRateText).monospacedDigit()
+            }
+            .font(.caption)
+        }
+        .frame(maxWidth: .infinity, alignment: .leading)
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
    // MARK: - Cumulative

    @ViewBuilder
    private var cumulativeSection: some View {
-        HStack(spacing: 24) {
-            VStack(spacing: 2) {
-                Text("Total Prompt Tokens")
-                    .font(.caption2)
-                    .foregroundStyle(.secondary)
-                Text(formatTokenCount(stats.totalPromptTokens))
-                    .font(.callout.monospacedDigit().bold())
-                    .foregroundStyle(.blue)
-            }
+        VStack(alignment: .leading, spacing: 10) {
+            Text("Cumulative")
+                .font(.caption.bold())
+                .foregroundStyle(.secondary)

-            VStack(spacing: 2) {
-                Text("Total Generated Tokens")
-                    .font(.caption2)
-                    .foregroundStyle(.secondary)
-                Text(formatTokenCount(stats.totalGenerationTokens))
-                    .font(.callout.monospacedDigit().bold())
-                    .foregroundStyle(.orange)
-            }
-
-            VStack(spacing: 2) {
-                Text("Total Tokens")
-                    .font(.caption2)
-                    .foregroundStyle(.secondary)
-                Text(formatTokenCount(stats.totalPromptTokens + stats.totalGenerationTokens))
-                    .font(.callout.monospacedDigit().bold())
+            LazyVGrid(columns: cardColumns, alignment: .leading, spacing: 12) {
+                statTile(title: "Prompt Tokens", value: formatTokenCount(stats.totalPromptTokens), color: .blue)
+                statTile(title: "Generated Tokens", value: formatTokenCount(stats.totalGenerationTokens), color: .orange)
+                statTile(title: "Cache Hits", value: "\(stats.totalCacheHits)", color: .green)
+                statTile(title: "Cache Misses", value: "\(stats.totalCacheMisses)", color: .red)
+                statTile(title: "Reused Prefill", value: formatTokenCount(stats.totalCacheReusePromptTokens), color: .green)
+                statTile(title: "Rebuilt Prefill", value: formatTokenCount(stats.totalCacheRebuildPromptTokens), color: .red)
+                statTile(title: "Evictions", value: "\(stats.totalCacheEvictions)", color: .secondary)
+                statTile(title: "Total Tokens", value: formatTokenCount(stats.totalPromptTokens + stats.totalGenerationTokens), color: .primary)
            }
        }
        .frame(maxWidth: .infinity)
@@ -339,8 +543,129 @@ struct MonitorView: View {
        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
    }

+    /// Card listing every cached chat session in `stats.cachedSessions`,
+    /// or a placeholder line while that list is empty.
+    @ViewBuilder
+    private var sessionSection: some View {
+        let sessions = stats.cachedSessions
+        VStack(alignment: .leading, spacing: 12) {
+            HStack {
+                Text("Cached Chat Sessions").font(.headline)
+                Spacer()
+                Text("\(sessions.count) visible")
+                    .font(.caption)
+                    .foregroundStyle(.secondary)
+            }
+
+            if sessions.isEmpty {
+                Text("No cached sessions yet.")
+                    .font(.callout)
+                    .foregroundStyle(.secondary)
+                    .frame(maxWidth: .infinity, alignment: .leading)
+            } else {
+                ForEach(sessions) { sessionRow($0) }
+            }
+        }
+        .frame(maxWidth: .infinity, alignment: .leading)
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
    // MARK: - Helpers

+    /// Capsule-shaped chip: a colored dot, `title`, and `count` in monospaced
+    /// digits, drawn over a 12%-opacity wash of the same `color`.
+    @ViewBuilder
+    private func phaseChip(title: String, count: Int, color: Color) -> some View {
+        let wash = color.opacity(0.12)
+        HStack(spacing: 6) {
+            Circle().fill(color).frame(width: 7, height: 7)
+            Text(title)
+            Text("\(count)").monospacedDigit()
+        }
+        .font(.caption)
+        .padding(.horizontal, 8)
+        .padding(.vertical, 4)
+        .background(wash, in: Capsule())
+    }
+
+    /// Grid tile: a small secondary `title` caption above a bold `value`
+    /// rendered in monospaced digits and tinted with `color`.
+    @ViewBuilder
+    private func statTile(title: String, value: String, color: Color) -> some View {
+        let backdrop = Color.primary.opacity(0.04)
+        VStack(alignment: .leading, spacing: 4) {
+            Text(title).font(.caption2).foregroundStyle(.secondary)
+            Text(value).font(.callout.monospacedDigit().bold()).foregroundStyle(color)
+        }
+        // Fill the offered width, keeping the text pinned to the leading edge.
+        .frame(maxWidth: .infinity, alignment: .leading)
+        .padding(10)
+        .background(backdrop, in: RoundedRectangle(cornerRadius: 8))
+    }
+
+    /// One card per cached session: model name and phase badge, two rows of
+    /// metrics, and a bar showing cached tokens relative to the context limit.
+    @ViewBuilder
+    private func sessionRow(_ session: ConversationSessionCache.SessionSummary) -> some View {
+        // Resolve shared values once; the phase color tints dot, badge and bar.
+        let phaseColor = color(for: session.phase)
+        let ratio = maxContextRatio(for: session.cachedTokenEstimate)
+        VStack(alignment: .leading, spacing: 10) {
+            HStack(alignment: .firstTextBaseline) {
+                HStack(spacing: 8) {
+                    Circle().fill(phaseColor).frame(width: 8, height: 8)
+                    Text(session.modelId)
+                        .font(.callout.weight(.semibold))
+                        .lineLimit(1)
+                }
+                Spacer()
+                Text(session.phase.rawValue)
+                    .font(.caption.monospacedDigit())
+                    .padding(.horizontal, 8)
+                    .padding(.vertical, 4)
+                    .background(phaseColor.opacity(0.14), in: Capsule())
+            }
+
+            HStack(spacing: 12) {
+                sessionMetric("Msgs", "\(session.messageCount)")
+                sessionMetric("Cached", formatTokenCount(session.cachedTokenEstimate))
+                sessionMetric("Reuse", formatTokenCount(session.lastReuseTokens))
+                sessionMetric("Footprint", formatByteCount(session.estimatedBytes))
+                sessionMetric("Hits", "\(session.hitCount)")
+                sessionMetric("Active", "\(session.inFlightRequests)")
+            }
+
+            HStack(spacing: 12) {
+                sessionMetric("Prompt", formatTokenCount(session.lastPromptTokens))
+                sessionMetric("Completion", formatTokenCount(session.lastCompletionTokens))
+                sessionMetric("Last Access", relativeTimeString(session.lastAccessAt))
+            }
+
+            ProgressView(value: ratio) {
+                Text("Cached Context").font(.caption2).foregroundStyle(.secondary)
+            } currentValueLabel: {
+                Text("\(Int(ratio * 100))%")
+                    .font(.caption2.monospacedDigit())
+                    .foregroundStyle(.secondary)
+            }
+            .tint(phaseColor)
+        }
+        .padding(12)
+        .background(Color.primary.opacity(0.035), in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    /// Compact metric cell: a secondary `title` caption over a bold `value`
+    /// in monospaced digits.
+    @ViewBuilder
+    private func sessionMetric(_ title: String, _ value: String) -> some View {
+        let valueFont = Font.caption.monospacedDigit().bold()
+        VStack(alignment: .leading, spacing: 2) {
+            Text(title).font(.caption2).foregroundStyle(.secondary)
+            Text(value).font(valueFont)
+        }
+    }
+
    private func formatTokenCount(_ count: Int) -> String {
        if count >= 1_000_000 {
            return String(format: "%.1fM", Double(count) / 1_000_000)
@@ -349,4 +674,52 @@ struct MonitorView: View {
        }
        return "\(count)"
    }
+
+    /// Formats a byte count as "N B", "N KB" (whole KiB) or "N.N MB" (MiB).
+    private func formatByteCount(_ count: Int) -> String {
+        let bytes = Double(count)
+        // Promote on the *rounded* KB value so 1_048_575 reads "1.0 MB", not "1024 KB".
+        let wholeKB = (bytes / 1024).rounded()
+        if wholeKB >= 1024 {
+            return String(format: "%.1f MB", bytes / 1_048_576)
+        }
+        return bytes >= 1024 ? String(format: "%.0f KB", bytes / 1024) : "\(count) B"
+    }
+
+    /// Age of `date` relative to now as whole seconds ("42s"), minutes ("5m")
+    /// or hours ("3h"). Dates in the future are clamped to "0s".
+    private func relativeTimeString(_ date: Date) -> String {
+        let elapsed = max(0, Int(Date.now.timeIntervalSince(date)))
+        // Integer division truncates, so 119 s reads "1m" and 7_199 s reads "1h".
+        switch elapsed {
+        case ..<60: return "\(elapsed)s"
+        case ..<3_600: return "\(elapsed / 60)m"
+        default: return "\(elapsed / 3_600)h"
+        }
+    }
+
+    /// Accent color for a session phase: secondary when idle, purple while
+    /// building the session, blue while prefilling, green while generating.
+    private func color(for phase: APISessionPhase) -> Color {
+        let tint: Color
+        switch phase {
+        case .idle: tint = .secondary
+        case .sessionBuild: tint = .purple
+        case .prefilling: tint = .blue
+        case .generating: tint = .green
+        }
+        return tint
+    }
+
+    /// Cache hits as a fraction of hits + misses; 0 when that sum is not positive.
+    private var cacheHitRate: Double {
+        let (hits, misses) = (stats.totalCacheHits, stats.totalCacheMisses)
+        return hits + misses > 0 ? Double(hits) / Double(hits + misses) : 0
+    }
+
+    /// `tokens` as a fraction of the larger of `stats.contextMax` and the current
+    /// model's `contextLength`, capped at 1; 0 when neither limit is positive.
+    private func maxContextRatio(for tokens: Int) -> Double {
+        let limit = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0)
+        return limit > 0 ? min(1, Double(tokens) / Double(limit)) : 0
+    }
 }