feat: better hardening with unit tests and end-to-end tests

2026-03-20 10:27:39 +01:00
parent e40a2f3c45
commit aadcc308a5
7 changed files with 1395 additions and 1326 deletions
--- a/MLXServer.xcodeproj/project.pbxproj
+++ b/MLXServer.xcodeproj/project.pbxproj
@@ -46,6 +46,7 @@
 		C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */; };
 		C34F02550C584BB2547F0F6C /* ChatDocumentPackage.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */; };
 		CBA88529F8BE7BD0518994AD /* SceneSelectionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5B5ABDEB6F5C54856EB1A9E /* SceneSelectionView.swift */; };
 		CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */; };
 		CFEE79815DFB80E51FE3745A /* SceneStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = C234359924C542F07ED926A2 /* SceneStore.swift */; };
 		D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
 		D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
@@ -53,7 +54,6 @@
 		E199D0BB09B61AC128AB093A /* CancellationToken.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3489501F2F8E1BA382347CFA /* CancellationToken.swift */; };
 		E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */; };
 		EC4FC68608DDFA6A3DF133CC /* InferenceEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */; };
 		F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */; };
 		F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
 		FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
 		FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
@@ -114,6 +114,7 @@
 		DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; };
 		E1E62624B6F285479CB33041 /* PromptBuilder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PromptBuilder.swift; sourceTree = "<group>"; };
 		E35452B166893B25E765FF70 /* InferenceStats.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceStats.swift; sourceTree = "<group>"; };
 		E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServerRewriteTests.swift; sourceTree = "<group>"; };
 		E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoderTests.swift; sourceTree = "<group>"; };
 		E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; };
 		E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
@@ -121,7 +122,6 @@
 		F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
 		F4CE2D594F7433C76169151A /* MLXServerTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = MLXServerTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
 		FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CancellationTokenTests.swift; sourceTree = "<group>"; };
 		FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConversationSessionCache.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 /* Begin PBXFrameworksBuildPhase section */
@@ -172,6 +172,7 @@
 		154AF0C071A7DC02EB5F6F49 /* Server */ = {
 			isa = PBXGroup;
 			children = (
 				E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */,
 				FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */,
 				E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */,
 				D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */,
@@ -263,7 +264,6 @@
 				F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
 				3D08828E16B17EF02C14243E /* APIServer.swift */,
 				3489501F2F8E1BA382347CFA /* CancellationToken.swift */,
 				FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */,
 				7C1A89C076E717F87A60397D /* ImageDecoder.swift */,
 				02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */,
 				E1E62624B6F285479CB33041 /* PromptBuilder.swift */,
@@ -379,6 +379,7 @@
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
 				CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */,
 				962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */,
 				E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */,
 				8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */,
@@ -406,7 +407,6 @@
 				85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
 				B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
 				5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
 				F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */,
 				C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
 				4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
 				A146BBA70CFBEC505BDCDF0D /* ImageDecoder.swift in Sources */,
--- a/MLXServer/Models/InferenceStats.swift
+++ b/MLXServer/Models/InferenceStats.swift
@@ -24,11 +24,15 @@ final class LiveCounters: @unchecked Sendable {
    private var _isGenerating: Bool = false
    private var _contextMax: Int = 0
    private var _currentPhaseElapsed: TimeInterval = 0
    private var _currentCacheMatchedPromptTokens: Int = 0
    private var _currentCacheRebuiltPromptTokens: Int = 0
    // Cumulative
    private var _totalRequests: Int = 0
    private var _totalPromptTokens: Int = 0
    private var _totalGenerationTokens: Int = 0
    private var _totalCacheReusePromptTokens: Int = 0
    private var _totalCacheRebuildPromptTokens: Int = 0
    private var _totalPreparingDuration: TimeInterval = 0
    private var _totalSessionBuildDuration: TimeInterval = 0
    private var _totalPrefillDuration: TimeInterval = 0
@@ -90,6 +94,26 @@ final class LiveCounters: @unchecked Sendable {
        lock.unlock()
    }
    /// Records how many of a request's prompt tokens were matched against
    /// cached state and how many had to be rebuilt.
    ///
    /// Does nothing when `requestId` has no entry in `requestPhases`.
    /// Otherwise the two counts are added to the cumulative totals, stored on
    /// the request's state, and the current-request aggregates are refreshed.
    ///
    /// - Parameters:
    ///   - requestId: Key used to look the request up in `requestPhases`.
    ///   - matchedPromptTokens: Reported matched-token count; negative values count as 0.
    ///   - promptTokenCount: Total prompt tokens; the rebuilt count is this value
    ///     minus the matched count, floored at 0.
    func recordPrefillReuse(requestId: String, matchedPromptTokens: Int, promptTokenCount: Int) {
        lock.lock()
        // One deferred unlock covers both the early return and the normal exit.
        defer { lock.unlock() }

        guard var requestState = requestPhases[requestId] else { return }

        let reusedTokens = max(0, matchedPromptTokens)
        let rebuiltTokens = max(0, promptTokenCount - reusedTokens)

        _totalCacheReusePromptTokens += reusedTokens
        _totalCacheRebuildPromptTokens += rebuiltTokens

        requestState.matchedPromptTokens = reusedTokens
        requestState.rebuiltPromptTokens = rebuiltTokens
        requestPhases[requestId] = requestState

        // Must run while the lock is still held; the deferred unlock fires afterwards.
        refreshCurrentCachePromptStatsLocked()
    }
    func requestCompleted(requestId: String, generationTokens: Int) {
        let now = Date()
        lock.lock()
@@ -108,6 +132,7 @@ final class LiveCounters: @unchecked Sendable {
            _isGenerating = _generatingRequests > 0
        }
        refreshCurrentPhaseElapsed(now: now)
        refreshCurrentCachePromptStatsLocked()
        lock.unlock()
    }
@@ -126,9 +151,13 @@ final class LiveCounters: @unchecked Sendable {
        _isGenerating = false
        _contextMax = 0
        _currentPhaseElapsed = 0
        _currentCacheMatchedPromptTokens = 0
        _currentCacheRebuiltPromptTokens = 0
        _totalRequests = 0
        _totalPromptTokens = 0
        _totalGenerationTokens = 0
        _totalCacheReusePromptTokens = 0
        _totalCacheRebuildPromptTokens = 0
        _totalPreparingDuration = 0
        _totalSessionBuildDuration = 0
        _totalPrefillDuration = 0
@@ -154,9 +183,13 @@ final class LiveCounters: @unchecked Sendable {
            isGenerating: _isGenerating,
            contextMax: _contextMax,
            currentPhaseElapsed: _currentPhaseElapsed,
            currentCacheMatchedPromptTokens: _currentCacheMatchedPromptTokens,
            currentCacheRebuiltPromptTokens: _currentCacheRebuiltPromptTokens,
            totalRequests: _totalRequests,
            totalPromptTokens: _totalPromptTokens,
            totalGenerationTokens: _totalGenerationTokens,
            totalCacheReusePromptTokens: _totalCacheReusePromptTokens,
            totalCacheRebuildPromptTokens: _totalCacheRebuildPromptTokens,
            totalPreparingDuration: _totalPreparingDuration,
            totalSessionBuildDuration: _totalSessionBuildDuration,
            totalPrefillDuration: _totalPrefillDuration,
@@ -179,9 +212,13 @@ final class LiveCounters: @unchecked Sendable {
        let isGenerating: Bool
        let contextMax: Int
        let currentPhaseElapsed: TimeInterval
        let currentCacheMatchedPromptTokens: Int
        let currentCacheRebuiltPromptTokens: Int
        let totalRequests: Int
        let totalPromptTokens: Int
        let totalGenerationTokens: Int
        let totalCacheReusePromptTokens: Int
        let totalCacheRebuildPromptTokens: Int
        let totalPreparingDuration: TimeInterval
        let totalSessionBuildDuration: TimeInterval
        let totalPrefillDuration: TimeInterval
@@ -231,9 +268,16 @@ final class LiveCounters: @unchecked Sendable {
        _currentPhaseElapsed = requestPhases.values.map { now.timeIntervalSince($0.phaseStartedAt) }.max() ?? 0
    }
    /// Recomputes the current matched/rebuilt prompt-token aggregates by
    /// summing the per-request values over every entry in `requestPhases`.
    ///
    /// NOTE(review): the `Locked` suffix and the visible call sites suggest the
    /// caller must already hold `lock` — confirm.
    private func refreshCurrentCachePromptStatsLocked() {
        var matchedSum = 0
        var rebuiltSum = 0
        for requestState in requestPhases.values {
            matchedSum += requestState.matchedPromptTokens
            rebuiltSum += requestState.rebuiltPromptTokens
        }
        _currentCacheMatchedPromptTokens = matchedSum
        _currentCacheRebuiltPromptTokens = rebuiltSum
    }
    /// Per-request bookkeeping; the values held in `requestPhases` expose these fields.
    private struct RequestState {
        /// The `RequestPhase` value recorded for the request.
        var phase: RequestPhase
        /// Start time of the phase; elapsed phase time is measured from this date.
        var phaseStartedAt: Date
        /// Prompt tokens recorded as matched by `recordPrefillReuse`; 0 until recorded.
        var matchedPromptTokens: Int = 0
        /// Prompt tokens recorded as rebuilt by `recordPrefillReuse`; 0 until recorded.
        var rebuiltPromptTokens: Int = 0
    }
    enum RequestPhase {
@@ -264,17 +308,20 @@ final class InferenceStats {
    var contextUsed: Int = 0
    var contextMax: Int = 0
    var currentPhaseElapsed: TimeInterval = 0
    var currentCacheMatchedPromptTokens: Int = 0
    var currentCacheRebuiltPromptTokens: Int = 0
    // MARK: - Cumulative counters
    var totalRequests: Int = 0
    var totalPromptTokens: Int = 0
    var totalGenerationTokens: Int = 0
    var totalCacheReusePromptTokens: Int = 0
    var totalCacheRebuildPromptTokens: Int = 0
    var totalCacheHits: Int = 0
    var totalCacheMisses: Int = 0
    var totalCacheEvictions: Int = 0
-    var totalCacheReusePromptTokens: Int = 0
+    var cacheHitRatePercent: Double = 0
    var totalCacheRebuildPromptTokens: Int = 0
    var totalPreparingDuration: TimeInterval = 0
    var totalSessionBuildDuration: TimeInterval = 0
    var totalPrefillDuration: TimeInterval = 0
@@ -283,12 +330,11 @@ final class InferenceStats {
    // MARK: - Cache state
    var cacheEntryCount: Int = 0
    var warmCacheEntryCount: Int = 0
    var activeCacheEntryCount: Int = 0
    var generatingCacheEntryCount: Int = 0
    var cacheEstimatedBytes: Int = 0
    var cacheEstimatedTokens: Int = 0
-    var cachedSessions: [ConversationSessionCache.SessionSummary] = []
+    var cacheMemoryBudgetBytes: Int = 0
    var cacheMemoryUsagePercent: Double = 0
    var cachedEntries: [TokenPrefixCache.EntrySummary] = []
    // MARK: - Time series data (ring buffers for charts)
@@ -302,13 +348,14 @@ final class InferenceStats {
    private(set) var promptTokenHistory: [DataPoint] = []
    private(set) var generationTokenHistory: [DataPoint] = []
    private(set) var cacheEntryHistory: [DataPoint] = []
    private(set) var activeSessionHistory: [DataPoint] = []
    private(set) var cacheFootprintHistory: [DataPoint] = []
-    private(set) var cacheReuseHistory: [DataPoint] = []
+    private(set) var cacheHitRateHistory: [DataPoint] = []
-    private(set) var cacheRebuildHistory: [DataPoint] = []
+    private(set) var cacheMemoryPressureHistory: [DataPoint] = []
    private(set) var currentPhaseElapsedHistory: [DataPoint] = []
    private(set) var prefillDurationHistory: [DataPoint] = []
-    private(set) var sessionBuildDurationHistory: [DataPoint] = []
+    private(set) var cacheReusePromptHistory: [DataPoint] = []
    private(set) var cacheRebuildPromptHistory: [DataPoint] = []
    private(set) var cacheMatchQualityHistory: [DataPoint] = []
    private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
@@ -316,10 +363,9 @@ final class InferenceStats {
    private var sampleTimer: Timer?
    private var lastGenerationTokenCount: Int = 0
    private var lastPromptTokenCount: Int = 0
    private var lastCacheReuseTokenCount: Int = 0
    private var lastCacheRebuildTokenCount: Int = 0
    private var lastPrefillDuration: TimeInterval = 0
-    private var lastSessionBuildDuration: TimeInterval = 0
+    private var lastCacheReusePromptTokenCount: Int = 0
    private var lastCacheRebuildPromptTokenCount: Int = 0
    func startSampling() {
        guard sampleTimer == nil else { return }
@@ -338,7 +384,7 @@ final class InferenceStats {
    private func recordSample() {
        // Pull live values from the thread-safe counters
        let snap = LiveCounters.shared.snapshot()
-        let cache = ConversationSessionCache.shared.snapshot()
+        let cache = TokenPrefixCache.shared.snapshot()
        activeRequests = snap.activeRequests
        preparingRequests = snap.preparingRequests
@@ -353,9 +399,13 @@ final class InferenceStats {
        contextMax = snap.contextMax
        contextUsed = snap.promptTokens + snap.generationTokens
        currentPhaseElapsed = snap.currentPhaseElapsed
        currentCacheMatchedPromptTokens = snap.currentCacheMatchedPromptTokens
        currentCacheRebuiltPromptTokens = snap.currentCacheRebuiltPromptTokens
        totalRequests = snap.totalRequests
        totalPromptTokens = snap.totalPromptTokens
        totalGenerationTokens = snap.totalGenerationTokens
        totalCacheReusePromptTokens = snap.totalCacheReusePromptTokens
        totalCacheRebuildPromptTokens = snap.totalCacheRebuildPromptTokens
        totalPreparingDuration = snap.totalPreparingDuration
        totalSessionBuildDuration = snap.totalSessionBuildDuration
        totalPrefillDuration = snap.totalPrefillDuration
@@ -363,41 +413,41 @@ final class InferenceStats {
        totalCacheHits = cache.totalHits
        totalCacheMisses = cache.totalMisses
        totalCacheEvictions = cache.totalEvictions
-        totalCacheReusePromptTokens = cache.totalReusePromptTokens
+        cacheHitRatePercent = cache.hitRate
        totalCacheRebuildPromptTokens = cache.totalRebuildPromptTokens
        cacheEntryCount = cache.totalEntries
        warmCacheEntryCount = cache.warmEntries
        activeCacheEntryCount = cache.activeEntries
        generatingCacheEntryCount = cache.generatingEntries
        cacheEstimatedBytes = cache.estimatedBytes
-        cacheEstimatedTokens = cache.cachedTokenEstimate
+        cacheEstimatedTokens = cache.totalCachedTokens
-        cachedSessions = cache.sessions
+        cacheMemoryBudgetBytes = cache.memoryBudgetBytes
        cacheMemoryUsagePercent = cache.memoryUsagePercent
        cachedEntries = cache.entries
        let now = Date.now
        let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
        let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
        let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
        let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
        let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
-        let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration
+        let cacheReusePromptDelta = snap.totalCacheReusePromptTokens - lastCacheReusePromptTokenCount
        let cacheRebuildPromptDelta = snap.totalCacheRebuildPromptTokens - lastCacheRebuildPromptTokenCount
        let cacheMatchQualityDelta = cacheReusePromptDelta + cacheRebuildPromptDelta > 0
            ? (Double(cacheReusePromptDelta) / Double(cacheReusePromptDelta + cacheRebuildPromptDelta)) * 100
            : 0
        lastGenerationTokenCount = snap.totalGenerationTokens
        lastPromptTokenCount = snap.totalPromptTokens
        lastCacheReuseTokenCount = cache.totalReusePromptTokens
        lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
        lastPrefillDuration = snap.totalPrefillDuration
-        lastSessionBuildDuration = snap.totalSessionBuildDuration
+        lastCacheReusePromptTokenCount = snap.totalCacheReusePromptTokens
        lastCacheRebuildPromptTokenCount = snap.totalCacheRebuildPromptTokens
        tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
        generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
        promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
        cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries)))
        activeSessionHistory.append(DataPoint(timestamp: now, value: Double(cache.activeEntries)))
        cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
-        cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
+        cacheHitRateHistory.append(DataPoint(timestamp: now, value: cache.hitRate))
-        cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
+        cacheMemoryPressureHistory.append(DataPoint(timestamp: now, value: cache.memoryUsagePercent))
        currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
        prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
-        sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta))
+        cacheReusePromptHistory.append(DataPoint(timestamp: now, value: Double(cacheReusePromptDelta)))
        cacheRebuildPromptHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildPromptDelta)))
        cacheMatchQualityHistory.append(DataPoint(timestamp: now, value: cacheMatchQualityDelta))
        if tokenRateHistory.count > Self.maxHistoryPoints {
            tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -411,17 +461,14 @@ final class InferenceStats {
        if cacheEntryHistory.count > Self.maxHistoryPoints {
            cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints)
        }
        if activeSessionHistory.count > Self.maxHistoryPoints {
            activeSessionHistory.removeFirst(activeSessionHistory.count - Self.maxHistoryPoints)
        }
        if cacheFootprintHistory.count > Self.maxHistoryPoints {
            cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints)
        }
-        if cacheReuseHistory.count > Self.maxHistoryPoints {
+        if cacheHitRateHistory.count > Self.maxHistoryPoints {
-            cacheReuseHistory.removeFirst(cacheReuseHistory.count - Self.maxHistoryPoints)
+            cacheHitRateHistory.removeFirst(cacheHitRateHistory.count - Self.maxHistoryPoints)
        }
-        if cacheRebuildHistory.count > Self.maxHistoryPoints {
+        if cacheMemoryPressureHistory.count > Self.maxHistoryPoints {
-            cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
+            cacheMemoryPressureHistory.removeFirst(cacheMemoryPressureHistory.count - Self.maxHistoryPoints)
        }
        if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
            currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
@@ -429,14 +476,20 @@ final class InferenceStats {
        if prefillDurationHistory.count > Self.maxHistoryPoints {
            prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
        }
-        if sessionBuildDurationHistory.count > Self.maxHistoryPoints {
+        if cacheReusePromptHistory.count > Self.maxHistoryPoints {
-            sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints)
+            cacheReusePromptHistory.removeFirst(cacheReusePromptHistory.count - Self.maxHistoryPoints)
        }
        if cacheRebuildPromptHistory.count > Self.maxHistoryPoints {
            cacheRebuildPromptHistory.removeFirst(cacheRebuildPromptHistory.count - Self.maxHistoryPoints)
        }
        if cacheMatchQualityHistory.count > Self.maxHistoryPoints {
            cacheMatchQualityHistory.removeFirst(cacheMatchQualityHistory.count - Self.maxHistoryPoints)
        }
    }
    func reset() {
        LiveCounters.shared.reset()
-        ConversationSessionCache.shared.reset()
+        TokenPrefixCache.shared.reset()
        activeRequests = 0
        preparingRequests = 0
        sessionBuildRequests = 0
@@ -450,9 +503,13 @@ final class InferenceStats {
        contextUsed = 0
        contextMax = 0
        currentPhaseElapsed = 0
        currentCacheMatchedPromptTokens = 0
        currentCacheRebuiltPromptTokens = 0
        totalRequests = 0
        totalPromptTokens = 0
        totalGenerationTokens = 0
        totalCacheReusePromptTokens = 0
        totalCacheRebuildPromptTokens = 0
        totalPreparingDuration = 0
        totalSessionBuildDuration = 0
        totalPrefillDuration = 0
@@ -460,31 +517,41 @@ final class InferenceStats {
        totalCacheHits = 0
        totalCacheMisses = 0
        totalCacheEvictions = 0
-        totalCacheReusePromptTokens = 0
+        cacheHitRatePercent = 0
        totalCacheRebuildPromptTokens = 0
        cacheEntryCount = 0
        warmCacheEntryCount = 0
        activeCacheEntryCount = 0
        generatingCacheEntryCount = 0
        cacheEstimatedBytes = 0
        cacheEstimatedTokens = 0
-        cachedSessions.removeAll()
+        cacheMemoryBudgetBytes = 0
        cacheMemoryUsagePercent = 0
        cachedEntries.removeAll()
        tokenRateHistory.removeAll()
        promptTokenHistory.removeAll()
        generationTokenHistory.removeAll()
        cacheEntryHistory.removeAll()
        activeSessionHistory.removeAll()
        cacheFootprintHistory.removeAll()
-        cacheReuseHistory.removeAll()
+        cacheHitRateHistory.removeAll()
-        cacheRebuildHistory.removeAll()
+        cacheMemoryPressureHistory.removeAll()
        currentPhaseElapsedHistory.removeAll()
        prefillDurationHistory.removeAll()
-        sessionBuildDurationHistory.removeAll()
+        cacheReusePromptHistory.removeAll()
        cacheRebuildPromptHistory.removeAll()
        cacheMatchQualityHistory.removeAll()
        lastGenerationTokenCount = 0
        lastPromptTokenCount = 0
        lastCacheReuseTokenCount = 0
        lastCacheRebuildTokenCount = 0
        lastPrefillDuration = 0
-        lastSessionBuildDuration = 0
+        lastCacheReusePromptTokenCount = 0
        lastCacheRebuildPromptTokenCount = 0
    }
    /// Matched share of the current prompt tokens, as a percentage:
    /// matched / (matched + rebuilt) * 100.
    /// Returns 0 when the combined count is not positive.
    var currentCacheMatchQualityPercent: Double {
        let combined = currentCacheMatchedPromptTokens + currentCacheRebuiltPromptTokens
        if combined <= 0 {
            return 0
        }
        let matchedShare = Double(currentCacheMatchedPromptTokens) / Double(combined)
        return matchedShare * 100
    }
    /// Reused share of the cumulative prompt tokens, as a percentage:
    /// reuse / (reuse + rebuild) * 100.
    /// Returns 0 when the combined count is not positive.
    var totalCacheMatchQualityPercent: Double {
        let combined = totalCacheReusePromptTokens + totalCacheRebuildPromptTokens
        if combined <= 0 {
            return 0
        }
        let reusedShare = Double(totalCacheReusePromptTokens) / Double(combined)
        return reusedShare * 100
    }
 }
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -63,7 +63,7 @@ final class APIServer {
        listener?.cancel()
        listener = nil
        isRunning = false
-        ConversationSessionCache.shared.invalidateAll()
+        TokenPrefixCache.shared.invalidateAll()
        inferenceStats.stopSampling()
    }
@@ -176,7 +176,7 @@ final class APIServer {
            if let targetConfig = ModelConfig.resolve(requestedModel) {
                if modelManager.currentModel?.id != targetConfig.id {
                    print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
-                    ConversationSessionCache.shared.invalidateAll()
+                    TokenPrefixCache.shared.invalidateAll()
                    await modelManager.loadModel(targetConfig)
                }
            }
@@ -187,7 +187,7 @@ final class APIServer {
        if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
           let config = ModelConfig.resolve(lastModelId) {
            print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
-            ConversationSessionCache.shared.invalidateAll()
+            TokenPrefixCache.shared.invalidateAll()
            await modelManager.loadModel(config)
        }
@@ -260,110 +260,80 @@ final class APIServer {
            temperature: Float(temperature),
            topP: Float(topP)
        )
        // Feed all messages except the last as history, then send the last as the prompt
        let chatMessages = preparedPrompt.chatMessages
        let allButLast = Array(chatMessages.dropLast())
        let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "")
        let historySignatures = Array(preparedPrompt.messageSignatures.dropLast())
        let currentModelId = modelManager.currentModel?.id ?? modelName
-        let lease = ConversationSessionCache.shared.checkoutSession(
+        let engine = InferenceEngine(container: container)
-            modelId: currentModelId,
+        let preparedInference: InferenceEngine.PreparedInference
-            instructions: preparedPrompt.instructions,
+        do {
-            historySignatures: historySignatures,
+            preparedInference = try await engine.prepare(preparedPrompt.userInput)
-            requestMessageCount: chatMessages.count,
+        } catch {
-            estimatedPromptTokens: estimatedPromptTokens,
+            LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
-            estimatedBytes: preparedPrompt.estimatedBytes
+            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
-        )
+            return
        let session: ChatSession
        if let reusableSession = lease.session {
            print("[APIServer] Reusing cached session (\(allButLast.count) history messages)")
            session = reusableSession
            session.generateParameters = generateParams
            ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
            LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
        } else {
            print("[APIServer] Creating fresh session")
            ConversationSessionCache.shared.markSessionBuild(entryId: lease.entryId)
            LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .sessionBuild)
            // Use `instructions:` for system/tool prompt (matches internal chat pattern).
            // Only conversation turns go in `history:` — this avoids replaying the
            // large tool prompt as history on every new session.
            let instr = preparedPrompt.instructions.isEmpty ? nil : preparedPrompt.instructions
            if !allButLast.isEmpty {
                session = ChatSession(
                    container,
                    instructions: instr,
                    history: allButLast,
                    generateParameters: generateParams,
                    additionalContext: preparedPrompt.additionalContext
                )
            } else {
                session = ChatSession(
                    container,
                    instructions: instr,
                    generateParameters: generateParams,
                    additionalContext: preparedPrompt.additionalContext
                )
            }
            ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
            LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
        }
-        // Extract images from the last message only (ChatSession.streamDetails takes images separately)
+        // Vision requests stay uncached until image fingerprinting lands.
-        let lastImages = lastMessage.images
+        let cacheKey = preparedInference.hasImages ? nil : preparedInference.tokens
        let lease = cacheKey.map { TokenPrefixCache.shared.lookup(cacheKey: $0, modelId: currentModelId) }
            ?? TokenPrefixCache.CacheLease(entryId: UUID(), kvCache: nil, matchedTokenCount: 0, isHit: false)
-        let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool)
+        LiveCounters.shared.recordPrefillReuse(
            requestId: requestId,
            matchedPromptTokens: lease.matchedTokenCount,
            promptTokenCount: preparedInference.tokens.count
        )
        LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
        let cancellation = CancellationToken()
        let streamHandle: InferenceEngine.StreamHandle
        do {
            streamHandle = try await engine.stream(
                InferenceEngine.InferenceRequest(
                    input: preparedInference.lmInput,
                    tokens: preparedInference.tokens,
                    parameters: generateParams,
                    cachedKV: lease.kvCache,
                    cachedTokenCount: lease.matchedTokenCount
                ),
                cancellation: cancellation
            )
        } catch {
            LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
            return
        }
        let result: GenerationOutcome
        if isStream {
            result = await handleStreamingResponse(
                connection: connection,
                requestId: requestId,
-                cacheEntryId: lease.entryId,
+                cancellation: cancellation,
-                session: session,
+                stream: streamHandle.stream,
                prompt: lastMessage.content,
                images: lastImages,
                tools: request.tools,
                created: created,
-                modelName: modelName,
+                modelName: modelName
                isQwen: isQwen
            )
        } else {
            result = await handleNonStreamingResponse(
                connection: connection,
                requestId: requestId,
-                cacheEntryId: lease.entryId,
+                stream: streamHandle.stream,
                session: session,
                prompt: lastMessage.content,
                images: lastImages,
                tools: request.tools,
                created: created,
-                modelName: modelName,
+                modelName: modelName
                isQwen: isQwen
            )
        }
-        if result.succeeded {
+        if let cacheKey,
-            var cachedSignatures = preparedPrompt.messageSignatures
+           result.succeeded || result.cancelled {
-            if let assistantHistoryText = result.assistantHistoryText {
+            Self.storePromptCache(
-                cachedSignatures.append(
+                streamHandle.workingCache,
-                    Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: [])
+                promptTokenCount: preparedInference.tokens.count,
                )
            }
            ConversationSessionCache.shared.completeRequest(
                entryId: lease.entryId,
-                session: session,
+                cacheKey: cacheKey,
-                requestMessageSignatures: cachedSignatures,
+                modelId: currentModelId
                requestMessageCount: cachedSignatures.count,
                estimatedPromptTokens: estimatedPromptTokens,
                estimatedBytes: preparedPrompt.estimatedBytes,
                promptTokens: result.promptTokens,
                completionTokens: result.completionTokens
            )
        } else {
            ConversationSessionCache.shared.abandonRequest(entryId: lease.entryId)
        }
        LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens)
@@ -375,53 +345,20 @@ final class APIServer {
    private func handleNonStreamingResponse(
        connection: NWConnection,
        requestId: String,
-        cacheEntryId: UUID,
+        stream: AsyncStream<Generation>,
        session: ChatSession,
        prompt: String,
        images: [UserInput.Image],
        tools: [APIToolDefinition]?,
        created: Int,
-        modelName: String,
+        modelName: String
-        isQwen: Bool
+    ) async -> GenerationOutcome {
    ) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
        do {
-            var fullText = ""
+            let outcome = await Self.collectGenerationOutcome(
-            var promptTokens = 0
+                stream: stream,
-            var completionTokens = 0
+                requestId: requestId,
-            var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
+                cancellation: nil
            let stream = session.streamDetails(
                to: prompt,
                images: images,
                videos: []
            )
            for try await generation in stream {
                switch generation {
                case .chunk(let text):
                    fullText += text
                    completionTokens += 1
                    LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
                case .info(let info):
                    promptTokens = info.promptTokenCount
                    completionTokens = info.generationTokenCount
                    ConversationSessionCache.shared.markGenerating(
                        entryId: cacheEntryId,
                        promptTokens: promptTokens,
                        completionTokens: completionTokens
                    )
                    LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
                    if info.tokensPerSecond > 0 {
                        LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
                    }
                case .toolCall(let call):
                    frameworkToolCalls.append(call)
                }
            }
            let resolved = Self.resolveAssistantResponse(
-                fullText: fullText,
+                fullText: outcome.fullText,
-                frameworkToolCalls: frameworkToolCalls,
+                frameworkToolCalls: outcome.frameworkToolCalls,
                tools: tools
            )
@@ -442,24 +379,26 @@ final class APIServer {
                    )
                ],
                usage: APIUsageInfo(
-                    prompt_tokens: promptTokens,
+                    prompt_tokens: outcome.promptTokens,
-                    completion_tokens: completionTokens,
+                    completion_tokens: outcome.completionTokens,
-                    total_tokens: promptTokens + completionTokens
+                    total_tokens: outcome.promptTokens + outcome.completionTokens
                )
            )
            if let json = try? JSONEncoder().encode(response) {
                sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
            }
-            let assistantHistoryText = Self.normalizedAssistantHistoryContent(
+            return GenerationOutcome(
-                content: resolved.content,
+                promptTokens: outcome.promptTokens,
-                toolCalls: resolved.toolCalls,
+                completionTokens: outcome.completionTokens,
-                isQwen: isQwen
+                fullText: outcome.fullText,
                frameworkToolCalls: outcome.frameworkToolCalls,
                succeeded: true,
                cancelled: false
            )
            return (promptTokens, completionTokens, assistantHistoryText, true)
        } catch {
            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
-            return (0, 0, nil, false)
+            return GenerationOutcome(promptTokens: 0, completionTokens: 0, fullText: "", frameworkToolCalls: [], succeeded: false, cancelled: false)
        }
    }
@@ -468,15 +407,12 @@ final class APIServer {
    private func handleStreamingResponse(
        connection: NWConnection,
        requestId: String,
-        cacheEntryId: UUID,
+        cancellation: CancellationToken,
-        session: ChatSession,
+        stream: AsyncStream<Generation>,
        prompt: String,
        images: [UserInput.Image],
        tools: [APIToolDefinition]?,
        created: Int,
-        modelName: String,
+        modelName: String
-        isQwen: Bool
+    ) async -> GenerationOutcome {
    ) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
        // Send SSE headers
        let header = [
            "HTTP/1.1 200 OK",
@@ -489,55 +425,34 @@ final class APIServer {
        ].joined(separator: "\r\n")
        await Self.sendData(connection: connection, data: header.data(using: .utf8)!)
        connection.stateUpdateHandler = { state in
            switch state {
            case .cancelled, .failed:
                cancellation.cancel()
            default:
                break
            }
        }
-        // Send initial role chunk
+        let encoder = StreamingSSEEncoder(requestId: requestId, created: created, modelName: modelName)
-        await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
+        await Self.sendData(connection: connection, data: encoder.encodeRoleDelta("assistant"))
            id: requestId,
            object: "chat.completion.chunk",
            created: created,
            model: modelName,
            choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: "assistant", content: nil, tool_calls: nil), finish_reason: nil)],
            usage: nil
        ))
-        let hasTools = tools != nil && !(tools?.isEmpty ?? true)
+        let result = await Self.runStreamingLoop(
-
+            connection: connection,
-        // Run the generation loop OFF MainActor.
+            stream: stream,
-        // ChatSession and NWConnection don't need MainActor.
+            cancellation: cancellation,
-        // Running on MainActor caused every token to compete with SwiftUI
+            requestId: requestId,
-        // rendering, creating back-pressure that coalesced all output.
+            encoder: encoder
        let stream = session.streamDetails(
            to: prompt,
            images: images,
            videos: []
        )
        // Transfer non-Sendable values to the nonisolated loop.
        // Safe because we don't touch session/images again until after the loop.
        let result = await {
            nonisolated(unsafe) let stream = stream
            return await Self.runStreamingLoop(
                connection: connection,
                stream: stream,
                requestId: requestId,
                created: created,
                modelName: modelName
            )
        }()
-        let (promptTokens, completionTokens, fullText, frameworkToolCalls, succeeded) = result
+        if result.cancelled {
-
+            connection.cancel()
-        if promptTokens > 0 {
+            return result
            ConversationSessionCache.shared.markGenerating(
                entryId: cacheEntryId,
                promptTokens: promptTokens,
                completionTokens: completionTokens
            )
            LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
        }
        let resolved = Self.resolveAssistantResponse(
-            fullText: fullText,
+            fullText: result.fullText,
-            frameworkToolCalls: frameworkToolCalls,
+            frameworkToolCalls: result.frameworkToolCalls,
            tools: tools
        )
@@ -562,21 +477,16 @@ final class APIServer {
            model: modelName,
            choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
            usage: APIUsageInfo(
-                prompt_tokens: promptTokens,
+                prompt_tokens: result.promptTokens,
-                completion_tokens: completionTokens,
+                completion_tokens: result.completionTokens,
-                total_tokens: promptTokens + completionTokens
+                total_tokens: result.promptTokens + result.completionTokens
            )
        ))
        // Send [DONE] and close
        await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
        connection.cancel()
-        let assistantHistoryText = Self.normalizedAssistantHistoryContent(
+        return result
            content: resolved.content,
            toolCalls: resolved.toolCalls,
            isQwen: isQwen
        )
        return (promptTokens, completionTokens, assistantHistoryText, succeeded)
    }
    /// Run the token generation + SSE send loop entirely off MainActor.
@@ -584,54 +494,20 @@ final class APIServer {
    /// multiple actor hops competing with SwiftUI, causing all output to batch.
    nonisolated private static func runStreamingLoop(
        connection: NWConnection,
-        stream: AsyncThrowingStream<Generation, any Error>,
+        stream: AsyncStream<Generation>,
        cancellation: CancellationToken,
        requestId: String,
-        created: Int,
+        encoder: StreamingSSEEncoder
-        modelName: String
+    ) async -> GenerationOutcome {
-    ) async -> (Int, Int, String, [MLXLMCommon.ToolCall], Bool) {
+        var outcome = await collectGenerationOutcome(
-        var promptTokens = 0
+            stream: stream,
-        var completionTokens = 0
+            requestId: requestId,
-        var fullText = ""
+            cancellation: cancellation
-        var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
+        ) { text in
-
+            await sendData(connection: connection, data: encoder.encodeContentDelta(text))
        do {
            for try await generation in stream {
                switch generation {
                case .chunk(let text):
                    completionTokens += 1
                    fullText += text
                    // Update live counters directly — no MainActor hop needed
                    LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
                    // Send directly — no MainActor hop.
                    await sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
                        id: requestId,
                        object: "chat.completion.chunk",
                        created: created,
                        model: modelName,
                        choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: text, tool_calls: nil), finish_reason: nil)],
                        usage: nil
                    ))
                case .info(let info):
                    promptTokens = info.promptTokenCount
                    completionTokens = info.generationTokenCount
                    if info.tokensPerSecond > 0 {
                        LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
                    }
                case .toolCall(let call):
                    frameworkToolCalls.append(call)
                }
            }
        } catch {
            let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
            await sendData(connection: connection, data: errorEvent.data(using: .utf8)!)
            return (promptTokens, completionTokens, fullText, frameworkToolCalls, false)
        }
-
+        outcome.succeeded = !outcome.cancelled
-        return (promptTokens, completionTokens, fullText, frameworkToolCalls, true)
+        return outcome
    }
    /// Send an SSE event and wait for the protocol stack to process it.
@@ -651,6 +527,88 @@ final class APIServer {
        }
    }
    /// Drains `stream` and folds every event into a single `GenerationOutcome`.
    ///
    /// - Parameters:
    ///   - stream: Generation events to consume until the stream ends or cancellation is observed.
    ///   - requestId: Forwarded to `LiveCounters.prefillCompleted` when an `.info` event arrives.
    ///   - cancellation: Optional token polled once per event, before that event is processed.
    ///     Pass `nil` to always drain the stream to its end.
    ///   - onChunk: Optional hook awaited with each text chunk after the counters were updated.
    /// - Returns: Token counts, concatenated text and tool calls; `cancelled` is `true` only when the
    ///   token was seen cancelled, and `succeeded` is always its negation.
    nonisolated private static func collectGenerationOutcome(
        stream: AsyncStream<Generation>,
        requestId: String,
        cancellation: CancellationToken?,
        onChunk: ((String) async -> Void)? = nil
    ) async -> GenerationOutcome {
        var outcome = GenerationOutcome(
            promptTokens: 0,
            completionTokens: 0,
            fullText: "",
            frameworkToolCalls: [],
            succeeded: true,
            cancelled: false
        )

        drain: for await event in stream {
            // The event that reveals the cancellation is dropped, not processed.
            guard cancellation?.isCancelled != true else {
                outcome.cancelled = true
                break drain
            }

            switch event {
            case .toolCall(let call):
                outcome.frameworkToolCalls.append(call)

            case .info(let info):
                // The info event replaces the per-chunk running count with the reported totals.
                outcome.promptTokens = info.promptTokenCount
                outcome.completionTokens = info.generationTokenCount
                LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: outcome.promptTokens)
                if info.tokensPerSecond > 0 {
                    LiveCounters.shared.tokenGenerated(
                        tokensPerSecond: info.tokensPerSecond,
                        totalGenerated: outcome.completionTokens
                    )
                }

            case .chunk(let text):
                // Each chunk is counted as one completion token until an info event says otherwise.
                outcome.completionTokens += 1
                outcome.fullText += text
                LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: outcome.completionTokens)
                await onChunk?(text)
            }
        }

        outcome.succeeded = !outcome.cancelled
        return outcome
    }
    /// Rewinds `cache` to the prompt boundary and, only if that succeeds, hands it to
    /// `TokenPrefixCache.shared` under `entryId`, `cacheKey` and `modelId`.
    ///
    /// Nothing is stored when `trimGeneratedTokens` reports failure.
    private static func storePromptCache(
        _ cache: [KVCache],
        promptTokenCount: Int,
        entryId: UUID,
        cacheKey: [Int],
        modelId: String
    ) {
        let rewoundToPrompt = trimGeneratedTokens(cache, promptTokenCount: promptTokenCount)
        if rewoundToPrompt {
            TokenPrefixCache.shared.store(
                entryId: entryId,
                kvCache: cache,
                cacheKey: cacheKey,
                modelId: modelId
            )
        }
    }
    /// Rewinds every layer of `cache` so that none holds more than `promptTokenCount` tokens.
    ///
    /// All layers are validated before any of them is modified, so a layer that cannot be trimmed
    /// makes the call fail without leaving the other layers at different offsets.
    ///
    /// - Parameters:
    ///   - cache: Per-layer caches to rewind in place.
    ///   - promptTokenCount: Offset each layer should be cut back to; layers at or below it are untouched.
    /// - Returns: `true` when every layer ended at or below the boundary; `false` when a layer with
    ///   surplus tokens is not trimmable, or when a trim returned a count other than the one requested.
    private static func trimGeneratedTokens(_ cache: [KVCache], promptTokenCount: Int) -> Bool {
        // Validate first: a single non-trimmable layer with surplus tokens vetoes the whole operation
        // before anything has been mutated.
        let everyLayerCanRewind = cache.allSatisfy { layer in
            layer.offset <= promptTokenCount || layer.isTrimmable
        }
        guard everyLayerCanRewind else {
            return false
        }

        for layer in cache {
            let excess = layer.offset - promptTokenCount
            guard excess > 0 else { continue }
            // A trim that returns anything other than the requested amount is treated as a failure.
            guard layer.trim(excess) == excess else {
                return false
            }
        }
        return true
    }
    // MARK: - HTTP helpers
    private func sendResponse(
@@ -787,6 +745,15 @@ final class APIServer {
    }
 }
 /// Aggregated result of consuming one generation stream, as returned by
 /// `collectGenerationOutcome` and the response handlers.
 private struct GenerationOutcome {
    /// Prompt token count; `collectGenerationOutcome` copies it from the `.info` event and
    /// leaves it at 0 when no such event was observed.
    var promptTokens: Int
    /// Completion token count; incremented once per text chunk and overwritten by the
    /// `.info` event's `generationTokenCount` when that event arrives.
    var completionTokens: Int
    /// Concatenation of every text chunk received, in arrival order.
    var fullText: String
    /// Tool calls delivered by the stream as `.toolCall` events, in arrival order.
    var frameworkToolCalls: [MLXLMCommon.ToolCall]
    /// `collectGenerationOutcome` sets this to `!cancelled`; error paths construct it as `false`.
    var succeeded: Bool
    /// `true` when draining stopped because the cancellation token was observed cancelled.
    var cancelled: Bool
 }
 // MARK: - HTTP request parser
 private struct HTTPRequest {
--- a/MLXServer/Server/ConversationSessionCache.swift
+++ b/MLXServer/Server/ConversationSessionCache.swift
@@ -1,358 +0,0 @@
 import Foundation
 import MLXLMCommon
 import os
 /// Lifecycle phase of an entry in `ConversationSessionCache`.
 /// The raw values are human-readable labels.
 enum APISessionPhase: String, Sendable {
    /// Set by `completeRequest`, and by `abandonRequest` when the entry is kept.
    case idle = "Idle"
    /// Initial phase of an entry created on a cache miss; also set by `markSessionBuild`.
    case sessionBuild = "Session Build"
    /// Set by `checkoutSession` on a cache hit and by `markPrefilling`.
    case prefilling = "Prefilling"
    /// Set by `markGenerating`, which also records the prompt and completion token counts.
    case generating = "Generating"
 }
 /// Size- and age-bounded registry of API chat sessions, matched by model id, instructions hash and
 /// per-message history signatures.
 ///
 /// Internal-only. Every read and write of the mutable state happens while `lock` is held, so the
 /// cache can be sampled (for example by the monitor) without involving MainActor.
 final class ConversationSessionCache: @unchecked Sendable {
    static let shared = ConversationSessionCache()

    private let lock = OSAllocatedUnfairLock()
    /// Upper bound on the number of entries kept once a request completes.
    private let maxEntries = 8
    /// Upper bound on the sum of `cachedTokenEstimate` across all entries.
    private let maxCachedTokens = 256_000
    /// Entries with no in-flight request that were last touched longer ago than this are pruned.
    private let idleTTL: TimeInterval = 10 * 60
    private var entries: [UUID: Entry] = [:]
    private var totals = Totals()

    private init() {}

    /// Result of `checkoutSession`: the entry reserved for a request and, on a hit, its session.
    struct Lease {
        let entryId: UUID
        let session: ChatSession?
        let reusedPromptTokens: Int
        let cacheHit: Bool
    }

    /// Read-only copy of one entry's bookkeeping, without the session itself.
    struct SessionSummary: Identifiable, Sendable {
        let id: UUID
        let modelId: String
        let phase: APISessionPhase
        let messageCount: Int
        let cachedTokenEstimate: Int
        let estimatedBytes: Int
        let inFlightRequests: Int
        let hitCount: Int
        let lastPromptTokens: Int
        let lastCompletionTokens: Int
        let lastReuseTokens: Int
        let createdAt: Date
        let lastAccessAt: Date
    }

    /// Point-in-time view of the whole cache plus the running totals.
    struct Snapshot: Sendable {
        let totalEntries: Int
        let warmEntries: Int
        let activeEntries: Int
        let generatingEntries: Int
        let estimatedBytes: Int
        let cachedTokenEstimate: Int
        let totalHits: Int
        let totalMisses: Int
        let totalEvictions: Int
        let totalReusePromptTokens: Int
        let totalRebuildPromptTokens: Int
        let sessions: [SessionSummary]
    }

    /// Reserves an entry for an incoming request.
    ///
    /// An existing entry is reused when it belongs to the same model and instructions, holds a
    /// session, has no request in flight, and its stored signatures are a prefix of
    /// `historySignatures` that is at most one message shorter. Among several such entries the one
    /// with the longest stored history wins. Otherwise a fresh, session-less entry is registered
    /// and counted as a miss.
    func checkoutSession(
        modelId: String,
        instructions: String,
        historySignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int
    ) -> Lease {
        lock.lock()
        defer { lock.unlock() }

        let now = Date()
        pruneExpiredLocked(now: now)
        let instructionsHash = Self.stableHash(instructions)

        var best: Entry?
        for candidate in entries.values {
            guard candidate.modelId == modelId,
                  candidate.instructionsHash == instructionsHash,
                  candidate.session != nil,
                  candidate.inFlightRequests == 0,
                  Self.historyMatches(cached: candidate.requestMessageSignatures, incoming: historySignatures)
            else { continue }
            // Only a strictly longer history displaces the current favourite.
            if let current = best,
               current.requestMessageSignatures.count >= candidate.requestMessageSignatures.count {
                continue
            }
            best = candidate
        }

        if var hit = best {
            let reusedTokens = max(hit.cachedTokenEstimate, estimatedPromptTokens)
            hit.inFlightRequests += 1
            hit.lastAccessAt = now
            hit.phase = .prefilling
            hit.lastReuseTokens = reusedTokens
            hit.hitCount += 1
            entries[hit.id] = hit
            totals.totalHits += 1
            totals.totalReusePromptTokens += reusedTokens
            return Lease(
                entryId: hit.id,
                session: hit.session,
                reusedPromptTokens: reusedTokens,
                cacheHit: true
            )
        }

        let freshId = UUID()
        entries[freshId] = Entry(
            id: freshId,
            modelId: modelId,
            instructionsHash: instructionsHash,
            requestMessageSignatures: historySignatures,
            messageCount: requestMessageCount,
            cachedTokenEstimate: estimatedPromptTokens,
            estimatedBytes: estimatedBytes,
            createdAt: now,
            lastAccessAt: now,
            inFlightRequests: 1,
            hitCount: 0,
            phase: .sessionBuild,
            lastPromptTokens: 0,
            lastCompletionTokens: 0,
            lastReuseTokens: 0,
            session: nil
        )
        totals.totalMisses += 1
        totals.totalRebuildPromptTokens += estimatedPromptTokens
        return Lease(entryId: freshId, session: nil, reusedPromptTokens: 0, cacheHit: false)
    }

    /// Moves the entry to `.sessionBuild` and refreshes its access time.
    func markSessionBuild(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .sessionBuild)
    }

    /// Moves the entry to `.prefilling` and refreshes its access time.
    func markPrefilling(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .prefilling)
    }

    /// Moves the entry to `.generating`, records the latest token counts and raises its cached-token
    /// estimate to at least `promptTokens + completionTokens`.
    func markGenerating(entryId: UUID, promptTokens: Int, completionTokens: Int) {
        mutateEntry(entryId) { entry in
            entry.phase = .generating
            entry.lastPromptTokens = promptTokens
            entry.lastCompletionTokens = completionTokens
            entry.cachedTokenEstimate = max(entry.cachedTokenEstimate, promptTokens + completionTokens)
            entry.lastAccessAt = Date()
        }
    }

    /// Stores the finished request's session and history on its entry, releases one in-flight slot,
    /// returns the entry to `.idle`, and then enforces the cache budget.
    /// Does nothing when the entry no longer exists.
    func completeRequest(
        entryId: UUID,
        session: ChatSession,
        requestMessageSignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int,
        promptTokens: Int,
        completionTokens: Int
    ) {
        lock.lock()
        defer { lock.unlock() }

        let now = Date()
        guard var entry = entries[entryId] else { return }

        entry.session = session
        entry.requestMessageSignatures = requestMessageSignatures
        entry.messageCount = requestMessageCount
        entry.cachedTokenEstimate = max(estimatedPromptTokens, promptTokens + completionTokens)
        entry.estimatedBytes = estimatedBytes
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.lastAccessAt = now
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
        entry.phase = .idle
        entries[entryId] = entry

        enforceBudgetLocked(now: now)
    }

    /// Releases one in-flight slot without storing a session. An entry that never received a session
    /// and has no other request in flight is dropped; any other entry just returns to `.idle`.
    func abandonRequest(entryId: UUID) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)

        let isColdAndUnused = entry.session == nil && entry.inFlightRequests == 0
        guard !isColdAndUnused else {
            entries.removeValue(forKey: entryId)
            return
        }

        entry.phase = .idle
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Drops every entry and counts each one as an eviction.
    func invalidateAll() {
        lock.lock()
        defer { lock.unlock() }
        totals.totalEvictions += entries.count
        entries.removeAll()
    }

    /// Drops every entry and zeroes the running totals.
    func reset() {
        lock.lock()
        defer { lock.unlock() }
        entries.removeAll()
        totals = Totals()
    }

    /// Prunes expired entries, then returns aggregate figures and per-entry summaries.
    /// Summaries are ordered by in-flight request count (highest first), then by most recent access.
    func snapshot() -> Snapshot {
        lock.lock()
        defer { lock.unlock() }

        pruneExpiredLocked(now: Date())
        let live = Array(entries.values)

        var warmCount = 0
        var activeCount = 0
        var generatingCount = 0
        var byteSum = 0
        var tokenSum = 0
        for entry in live {
            if entry.session != nil { warmCount += 1 }
            if entry.inFlightRequests > 0 { activeCount += 1 }
            if entry.phase == .generating { generatingCount += 1 }
            byteSum += entry.estimatedBytes
            tokenSum += entry.cachedTokenEstimate
        }

        let ordered = live.sorted { lhs, rhs in
            (lhs.inFlightRequests, lhs.lastAccessAt) > (rhs.inFlightRequests, rhs.lastAccessAt)
        }

        return Snapshot(
            totalEntries: live.count,
            warmEntries: warmCount,
            activeEntries: activeCount,
            generatingEntries: generatingCount,
            estimatedBytes: byteSum,
            cachedTokenEstimate: tokenSum,
            totalHits: totals.totalHits,
            totalMisses: totals.totalMisses,
            totalEvictions: totals.totalEvictions,
            totalReusePromptTokens: totals.totalReusePromptTokens,
            totalRebuildPromptTokens: totals.totalRebuildPromptTokens,
            sessions: ordered.map { $0.summary }
        )
    }

    /// Sets the entry's phase and refreshes its access time; a missing entry is ignored.
    private func updatePhase(entryId: UUID, phase: APISessionPhase) {
        mutateEntry(entryId) { entry in
            entry.phase = phase
            entry.lastAccessAt = Date()
        }
    }

    /// Runs `mutation` on the entry stored under `entryId` while holding `lock` and writes the
    /// result back. Does nothing when no such entry exists.
    private func mutateEntry(_ entryId: UUID, _ mutation: (inout Entry) -> Void) {
        lock.lock()
        defer { lock.unlock() }
        guard var entry = entries[entryId] else { return }
        mutation(&entry)
        entries[entryId] = entry
    }

    /// Removes entries that have no request in flight and were idle for longer than `idleTTL`,
    /// counting each as an eviction. Must be called with `lock` held.
    private func pruneExpiredLocked(now: Date) {
        let staleIds = entries.values
            .filter { $0.inFlightRequests == 0 && now.timeIntervalSince($0.lastAccessAt) > idleTTL }
            .map { $0.id }
        if staleIds.isEmpty { return }

        for id in staleIds {
            entries.removeValue(forKey: id)
        }
        totals.totalEvictions += staleIds.count
    }

    /// Whether the cache currently exceeds its entry-count or cached-token limit.
    /// Must be read with `lock` held.
    private var isOverBudgetLocked: Bool {
        if entries.count > maxEntries { return true }
        let cachedTokens = entries.values.reduce(0) { $0 + $1.cachedTokenEstimate }
        return cachedTokens > maxCachedTokens
    }

    /// Prunes expired entries, then evicts idle entries in `evictionOrder` until the cache is back
    /// within budget or only in-flight entries remain. Must be called with `lock` held.
    private func enforceBudgetLocked(now: Date) {
        pruneExpiredLocked(now: now)

        while isOverBudgetLocked {
            let evictable = entries.values.filter { $0.inFlightRequests == 0 }
            guard let victim = evictable.min(by: evictionOrder) else {
                break
            }
            entries.removeValue(forKey: victim.id)
            totals.totalEvictions += 1
        }
    }

    /// Ordering used to pick eviction victims: least recently accessed first, then the larger
    /// cached-token estimate, then the older creation time.
    private func evictionOrder(lhs: Entry, rhs: Entry) -> Bool {
        guard lhs.lastAccessAt == rhs.lastAccessAt else {
            return lhs.lastAccessAt < rhs.lastAccessAt
        }
        guard lhs.cachedTokenEstimate == rhs.cachedTokenEstimate else {
            return lhs.cachedTokenEstimate > rhs.cachedTokenEstimate
        }
        return lhs.createdAt < rhs.createdAt
    }

    /// `true` when `cached` is a prefix of `incoming` and `incoming` is at most one element longer.
    private static func historyMatches(cached: [UInt64], incoming: [UInt64]) -> Bool {
        let extraMessages = incoming.count - cached.count
        guard extraMessages == 0 || extraMessages == 1 else { return false }
        return incoming.starts(with: cached)
    }

    /// Hashes the UTF-8 bytes of `text`: starting from a fixed seed, each byte is XOR-ed in and the
    /// result multiplied (with wrapping) by a fixed constant.
    static func stableHash(_ text: String) -> UInt64 {
        let seed: UInt64 = 14_695_981_039_346_656_037
        let multiplier: UInt64 = 1_099_511_628_211
        return text.utf8.reduce(seed) { running, byte in
            (running ^ UInt64(byte)) &* multiplier
        }
    }

    /// Mutable per-entry bookkeeping, including the cached session once a request has completed.
    private struct Entry {
        let id: UUID
        let modelId: String
        let instructionsHash: UInt64
        var requestMessageSignatures: [UInt64]
        var messageCount: Int
        var cachedTokenEstimate: Int
        var estimatedBytes: Int
        let createdAt: Date
        var lastAccessAt: Date
        var inFlightRequests: Int
        var hitCount: Int
        var phase: APISessionPhase
        var lastPromptTokens: Int
        var lastCompletionTokens: Int
        var lastReuseTokens: Int
        var session: ChatSession?

        /// Session-free copy of this entry's figures for use in a `Snapshot`.
        var summary: SessionSummary {
            SessionSummary(
                id: id,
                modelId: modelId,
                phase: phase,
                messageCount: messageCount,
                cachedTokenEstimate: cachedTokenEstimate,
                estimatedBytes: estimatedBytes,
                inFlightRequests: inFlightRequests,
                hitCount: hitCount,
                lastPromptTokens: lastPromptTokens,
                lastCompletionTokens: lastCompletionTokens,
                lastReuseTokens: lastReuseTokens,
                createdAt: createdAt,
                lastAccessAt: lastAccessAt
            )
        }
    }

    /// Running counters that survive entry eviction and are only cleared by `reset()`.
    private struct Totals {
        var totalHits: Int = 0
        var totalMisses: Int = 0
        var totalEvictions: Int = 0
        var totalReusePromptTokens: Int = 0
        var totalRebuildPromptTokens: Int = 0
    }
 }
--- a/MLXServer/Views/MonitorView.swift
+++ b/MLXServer/Views/MonitorView.swift
--- a/MLXServerTests/Server/APIServerRewriteTests.swift
+++ b/MLXServerTests/Server/APIServerRewriteTests.swift
@@ -0,0 +1,736 @@
 import Foundation
 import XCTest
@testable import MLX_Server
 final class APIServerRewriteTests: XCTestCase {
    /// End-to-end check of the non-streaming `/v1/chat/completions` route.
    ///
    /// Sends the same single-message request twice and verifies that
    /// 1. the first response has exactly one assistant choice with prompt usage,
    /// 2. a prefix-cache entry appears after the first request, and
    /// 3. the second request raises both the cache hit counter and the
    ///    live counter of reused prompt tokens.
    func testNonStreamingChatCompletionUsesStatelessServerPathAndCachesPrompt() async throws {
        let harness = try await makeHarness()
        defer { harness.stop() }
        let request = APIChatCompletionRequest(
            model: "gemma",
            messages: [
                APIChatMessage(role: "user", content: .text("Reply with exactly one short word."), name: nil, tool_calls: nil, tool_call_id: nil)
            ],
            temperature: 0,
            top_p: 1,
            max_tokens: 1,
            stream: false,
            stop: nil,
            tools: nil,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
        let firstResponse = try await sendChatCompletion(request, port: harness.port)
        XCTAssertEqual(firstResponse.choices.count, 1)
        // XCTAssertEqual does not stop execution, so subscripting `choices[0]` on an
        // empty array would trap and take down the whole test process. Unwrapping
        // turns that case into an ordinary test failure.
        let firstChoice = try XCTUnwrap(firstResponse.choices.first, "Response contained no choices")
        XCTAssertEqual(firstChoice.message.role, "assistant")
        XCTAssertGreaterThan(firstResponse.usage.prompt_tokens, 0)
        XCTAssertGreaterThanOrEqual(firstResponse.usage.completion_tokens, 0)
        // The cache entry is polled for rather than assumed to exist as soon as the response returns.
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalEntries > 0
        }
        let firstSnapshot = TokenPrefixCache.shared.snapshot()
        let firstLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(firstSnapshot.totalEntries, 0)
        // Identical request: expected to be served from the cached prefix.
        _ = try await sendChatCompletion(request, port: harness.port)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
        }
        let secondSnapshot = TokenPrefixCache.shared.snapshot()
        let secondLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
        XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)
    }
    /// Streams three turns of one growing conversation and verifies that each
    /// turn after the first records a new prefix-cache hit and adds to the
    /// live counter of reused prompt tokens.
    func testStreamingChatCompletionReusesCacheAcrossThreeProgressivelyLongerTurns() async throws {
        let harness = try await makeHarness()
        defer { harness.stop() }

        // Local builders: all three turns share sampling settings and differ only in transcript.
        func userTurn(_ text: String) -> APIChatMessage {
            APIChatMessage(role: "user", content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
        }
        func assistantTurn(_ text: String) -> APIChatMessage {
            APIChatMessage(role: "assistant", content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
        }
        func streamingRequest(_ transcript: [APIChatMessage]) -> APIChatCompletionRequest {
            APIChatCompletionRequest(
                model: "gemma",
                messages: transcript,
                temperature: 0,
                top_p: 1,
                max_tokens: 3,
                stream: true,
                stop: nil,
                tools: nil,
                tool_choice: nil,
                frequency_penalty: nil,
                presence_penalty: nil,
                n: nil
            )
        }

        // Turn 1: a single user question.
        var transcript = [userTurn("Answer in one word: what color is the sky on a clear day?")]
        let firstStream = try await sendStreamingChatCompletion(streamingRequest(transcript), port: harness.port)
        XCTAssertEqual(firstStream.roleDeltaCount, 1)
        XCTAssertTrue(firstStream.sawDone)
        XCTAssertEqual(firstStream.finalFinishReason, "stop")
        XCTAssertGreaterThan(firstStream.usage?.prompt_tokens ?? 0, 0)
        XCTAssertFalse(firstStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalEntries > 0
        }
        let firstSnapshot = TokenPrefixCache.shared.snapshot()
        let firstLiveSnapshot = LiveCounters.shared.snapshot()

        // Turn 2: replay turn 1 with the model's own answer, then ask a new question.
        transcript += [
            assistantTurn(firstStream.content),
            userTurn("Answer in one word: what color is grass?")
        ]
        let secondStream = try await sendStreamingChatCompletion(streamingRequest(transcript), port: harness.port)
        XCTAssertEqual(secondStream.roleDeltaCount, 1)
        XCTAssertTrue(secondStream.sawDone)
        XCTAssertEqual(secondStream.finalFinishReason, "stop")
        XCTAssertFalse(secondStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
        }
        let secondSnapshot = TokenPrefixCache.shared.snapshot()
        let secondLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
        XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)

        // Turn 3: extend the same transcript once more.
        transcript += [
            assistantTurn(secondStream.content),
            userTurn("Answer in one word: what color is snow?")
        ]
        let thirdStream = try await sendStreamingChatCompletion(streamingRequest(transcript), port: harness.port)
        XCTAssertEqual(thirdStream.roleDeltaCount, 1)
        XCTAssertTrue(thirdStream.sawDone)
        XCTAssertEqual(thirdStream.finalFinishReason, "stop")
        XCTAssertFalse(thirdStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalHits > secondSnapshot.totalHits
        }
        let thirdSnapshot = TokenPrefixCache.shared.snapshot()
        let thirdLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(thirdSnapshot.totalHits, secondSnapshot.totalHits)
        XCTAssertGreaterThan(thirdLiveSnapshot.totalCacheReusePromptTokens, secondLiveSnapshot.totalCacheReusePromptTokens)
    }
    /// Drives a tool-call round trip (tool call, tool result, direct answer,
    /// follow-up) over the streaming route and verifies that each request after
    /// the first records a new prefix-cache hit and more reused prompt tokens.
    func testStreamingChatCompletionReusesCacheAcrossToolBoundary() async throws {
        let harness = try await makeHarness()
        defer { harness.stop() }
        let tools = [mockWeatherTool]

        // The transcript grows in place; each request captures its value at construction time.
        var transcript = [
            APIChatMessage(role: "user", content: .text("You must call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
        ]

        // Request 1: the model is expected to answer with a tool call.
        let toolCallRequest = APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: 48,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
        let toolCallStream = try await sendStreamingChatCompletion(toolCallRequest, port: harness.port)
        XCTAssertEqual(toolCallStream.roleDeltaCount, 1)
        XCTAssertTrue(toolCallStream.sawDone)
        XCTAssertEqual(toolCallStream.finalFinishReason, "tool_calls")
        let toolCall = try XCTUnwrap(toolCallStream.toolCalls.first)
        XCTAssertEqual(toolCall.function.name, "weather")
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalEntries > 0
        }
        let afterToolCallSnapshot = TokenPrefixCache.shared.snapshot()
        let afterToolCallLiveSnapshot = LiveCounters.shared.snapshot()

        // Request 2: feed back the tool call and its result; expect a plain answer.
        transcript += [
            APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [toolCall], tool_call_id: nil),
            APIChatMessage(role: "tool", content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"), name: nil, tool_calls: nil, tool_call_id: toolCall.id)
        ]
        let directAnswerRequest = APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: 16,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
        let directAnswerStream = try await sendStreamingChatCompletion(directAnswerRequest, port: harness.port)
        XCTAssertEqual(directAnswerStream.roleDeltaCount, 1)
        XCTAssertTrue(directAnswerStream.sawDone)
        XCTAssertEqual(directAnswerStream.finalFinishReason, "stop")
        XCTAssertTrue(directAnswerStream.toolCalls.isEmpty)
        XCTAssertFalse(directAnswerStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalHits > afterToolCallSnapshot.totalHits
        }
        let afterDirectAnswerSnapshot = TokenPrefixCache.shared.snapshot()
        let afterDirectAnswerLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(afterDirectAnswerSnapshot.totalHits, afterToolCallSnapshot.totalHits)
        XCTAssertGreaterThan(afterDirectAnswerLiveSnapshot.totalCacheReusePromptTokens, afterToolCallLiveSnapshot.totalCacheReusePromptTokens)

        // Request 3: continue the conversation past the tool boundary.
        transcript += [
            APIChatMessage(role: "assistant", content: .text(directAnswerStream.content), name: nil, tool_calls: nil, tool_call_id: nil),
            APIChatMessage(role: "user", content: .text("Now compress that answer to two words."), name: nil, tool_calls: nil, tool_call_id: nil)
        ]
        let followUpRequest = APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: 8,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
        let thirdStream = try await sendStreamingChatCompletion(followUpRequest, port: harness.port)
        XCTAssertEqual(thirdStream.roleDeltaCount, 1)
        XCTAssertTrue(thirdStream.sawDone)
        XCTAssertEqual(thirdStream.finalFinishReason, "stop")
        XCTAssertFalse(thirdStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalHits > afterDirectAnswerSnapshot.totalHits
        }
        let finalSnapshot = TokenPrefixCache.shared.snapshot()
        let finalLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(finalSnapshot.totalHits, afterDirectAnswerSnapshot.totalHits)
        XCTAssertGreaterThan(finalLiveSnapshot.totalCacheReusePromptTokens, afterDirectAnswerLiveSnapshot.totalCacheReusePromptTokens)
    }
    /// Runs two consecutive tool round trips (Berlin, then Paris) in a single
    /// growing conversation and verifies that each of the three requests after
    /// the first records a new prefix-cache hit and more reused prompt tokens.
    func testStreamingChatCompletionReusesCacheAcrossMultipleToolTurns() async throws {
        let harness = try await makeHarness()
        defer { harness.stop() }
        let tools = [mockWeatherTool]

        // The transcript grows in place; each request captures its value at construction time.
        var transcript = [
            APIChatMessage(role: "user", content: .text("Call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
        ]

        // Request 1: Berlin tool call.
        let berlinRequest = APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: 48,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
        let firstToolTurn = try await sendStreamingChatCompletion(berlinRequest, port: harness.port)
        XCTAssertEqual(firstToolTurn.finalFinishReason, "tool_calls")
        let berlinToolCall = try XCTUnwrap(firstToolTurn.toolCalls.first)
        XCTAssertEqual(berlinToolCall.function.name, "weather")
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalEntries > 0
        }
        let firstSnapshot = TokenPrefixCache.shared.snapshot()
        let firstLiveSnapshot = LiveCounters.shared.snapshot()

        // Request 2: Berlin tool result, expecting a plain answer.
        let berlinToolResult = APIChatMessage(
            role: "tool",
            content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"),
            name: nil,
            tool_calls: nil,
            tool_call_id: berlinToolCall.id
        )
        transcript += [
            APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [berlinToolCall], tool_call_id: nil),
            berlinToolResult
        ]
        let berlinAnswerRequest = APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: 16,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
        let berlinAnswer = try await sendStreamingChatCompletion(berlinAnswerRequest, port: harness.port)
        XCTAssertEqual(berlinAnswer.finalFinishReason, "stop")
        XCTAssertFalse(berlinAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
        }
        let secondSnapshot = TokenPrefixCache.shared.snapshot()
        let secondLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
        XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)

        // Request 3: Paris tool call on top of the Berlin exchange.
        transcript += [
            APIChatMessage(role: "assistant", content: .text(berlinAnswer.content), name: nil, tool_calls: nil, tool_call_id: nil),
            APIChatMessage(role: "user", content: .text("Now call the weather tool for Paris. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
        ]
        let parisToolTurnRequest = APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: 48,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
        let secondToolTurn = try await sendStreamingChatCompletion(parisToolTurnRequest, port: harness.port)
        XCTAssertEqual(secondToolTurn.finalFinishReason, "tool_calls")
        let parisToolCall = try XCTUnwrap(secondToolTurn.toolCalls.first)
        XCTAssertEqual(parisToolCall.function.name, "weather")
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalHits > secondSnapshot.totalHits
        }
        let thirdSnapshot = TokenPrefixCache.shared.snapshot()
        let thirdLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(thirdSnapshot.totalHits, secondSnapshot.totalHits)
        XCTAssertGreaterThan(thirdLiveSnapshot.totalCacheReusePromptTokens, secondLiveSnapshot.totalCacheReusePromptTokens)

        // Request 4: Paris tool result, expecting a plain answer with no further tool calls.
        transcript += [
            APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [parisToolCall], tool_call_id: nil),
            APIChatMessage(role: "tool", content: .text("{\"city\":\"Paris\",\"temperature_c\":21,\"condition\":\"clear\"}"), name: nil, tool_calls: nil, tool_call_id: parisToolCall.id)
        ]
        let parisAnswerRequest = APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: 16,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
        let parisAnswer = try await sendStreamingChatCompletion(parisAnswerRequest, port: harness.port)
        XCTAssertEqual(parisAnswer.finalFinishReason, "stop")
        XCTAssertTrue(parisAnswer.toolCalls.isEmpty)
        XCTAssertFalse(parisAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalHits > thirdSnapshot.totalHits
        }
        let fourthSnapshot = TokenPrefixCache.shared.snapshot()
        let fourthLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(fourthSnapshot.totalHits, thirdSnapshot.totalHits)
        XCTAssertGreaterThan(fourthLiveSnapshot.totalCacheReusePromptTokens, thirdLiveSnapshot.totalCacheReusePromptTokens)
    }
    /// Verifies that a streaming request abandoned by the client after its first
    /// content chunk still leaves a prefix-cache entry, and that a later
    /// non-streaming request with the same messages records a hit against it.
    func testStreamingDisconnectStoresPromptCacheForReuse() async throws {
        let harness = try await makeHarness()
        defer { harness.stop() }

        // A long-ish generation so the stream is still active when the client disconnects.
        let streamingRequest = APIChatCompletionRequest(
            model: "gemma",
            messages: [
                APIChatMessage(role: "user", content: .text("Count from one to twenty with commas, using many tokens."), name: nil, tool_calls: nil, tool_call_id: nil)
            ],
            temperature: 0,
            top_p: 1,
            max_tokens: 64,
            stream: true,
            stop: nil,
            tools: nil,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )

        let initialSnapshot = TokenPrefixCache.shared.snapshot()
        try await cancelStreamingChatCompletionAfterFirstContent(streamingRequest, port: harness.port)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalEntries > initialSnapshot.totalEntries
        }
        let afterDisconnectSnapshot = TokenPrefixCache.shared.snapshot()
        let afterDisconnectLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(afterDisconnectSnapshot.totalEntries, initialSnapshot.totalEntries)

        // Same conversation, now non-streaming and with a smaller token budget.
        let replayRequest = APIChatCompletionRequest(
            model: streamingRequest.model,
            messages: streamingRequest.messages,
            temperature: streamingRequest.temperature,
            top_p: streamingRequest.top_p,
            max_tokens: 8,
            stream: false,
            stop: streamingRequest.stop,
            tools: streamingRequest.tools,
            tool_choice: streamingRequest.tool_choice,
            frequency_penalty: streamingRequest.frequency_penalty,
            presence_penalty: streamingRequest.presence_penalty,
            n: streamingRequest.n
        )
        _ = try await sendChatCompletion(replayRequest, port: harness.port)
        try await waitUntil(timeoutSeconds: 5) {
            TokenPrefixCache.shared.snapshot().totalHits > afterDisconnectSnapshot.totalHits
        }
        let finalSnapshot = TokenPrefixCache.shared.snapshot()
        let finalLiveSnapshot = LiveCounters.shared.snapshot()
        XCTAssertGreaterThan(finalSnapshot.totalHits, afterDisconnectSnapshot.totalHits)
        XCTAssertGreaterThan(finalLiveSnapshot.totalCacheReusePromptTokens, afterDisconnectLiveSnapshot.totalCacheReusePromptTokens)
    }
    /// Checks the event order of a streamed tool-call response: exactly one
    /// role event and it comes first, at least one tool-call event, and a single
    /// trailing final event that carries `finish_reason == "tool_calls"` and usage.
    func testStreamingToolCallChunksArriveInOpenAICompatibleOrder() async throws {
        let harness = try await makeHarness()
        defer { harness.stop() }

        let toolCallRequest = APIChatCompletionRequest(
            model: "gemma",
            messages: [
                APIChatMessage(role: "user", content: .text("Call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
            ],
            temperature: 0,
            top_p: 1,
            max_tokens: 48,
            stream: true,
            stop: nil,
            tools: [mockWeatherTool],
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
        let detailed = try await sendStreamingChatCompletionDetailed(toolCallRequest, port: harness.port)
        let events = detailed.events
        XCTAssertTrue(detailed.sawDone)
        XCTAssertFalse(events.isEmpty)

        // The stream must open with the assistant role delta.
        let openingEvent = try XCTUnwrap(events.first)
        XCTAssertEqual(openingEvent.kind, .role)
        XCTAssertEqual(openingEvent.role, "assistant")

        let toolEventIndices = events.indices.filter { events[$0].kind == .toolCall }
        XCTAssertFalse(toolEventIndices.isEmpty)

        // The final event must be the very last one, after every tool-call event.
        let finalIndex = try XCTUnwrap(events.lastIndex(where: { $0.kind == .final }))
        XCTAssertEqual(finalIndex, events.count - 1)
        for toolIndex in toolEventIndices {
            XCTAssertLessThan(toolIndex, finalIndex)
        }
        let closingEvent = events[finalIndex]
        XCTAssertEqual(closingEvent.finishReason, "tool_calls")
        XCTAssertNotNil(closingEvent.usage)

        let roleEventCount = events.reduce(into: 0) { count, event in
            if event.kind == .role { count += 1 }
        }
        XCTAssertEqual(roleEventCount, 1)
    }
    /// Tool definition shared by the tool-calling tests: a function tool named
    /// `weather` whose parameter schema is an object with one required string
    /// property, `city`. A fresh value is built on every access.
    private var mockWeatherTool: APIToolDefinition {
        APIToolDefinition(
            type: "function",
            function: APIFunctionDefinition(
                name: "weather",
                description: "Look up weather for a city.",
                // JSON-Schema-shaped dictionary; each value is wrapped in `AnyCodable`.
                parameters: [
                    "type": AnyCodable("object"),
                    "properties": AnyCodable([
                        "city": [
                            "type": "string",
                            "description": "City name"
                        ]
                    ]),
                    "required": AnyCodable(["city"])
                ]
            )
        )
    }
    /// Builds the fixture every test in this class runs against: resets the
    /// shared counters and prefix cache, loads the "gemma" model, and starts an
    /// `APIServer` on a randomly chosen loopback port.
    ///
    /// - Returns: A harness holding the server, the model manager, and the port.
    /// - Throws: If the "gemma" config cannot be resolved or a wait is cancelled.
    private func makeHarness() async throws -> TestHarness {
        let manager = await MainActor.run { ModelManager() }
        let gemmaConfig = try XCTUnwrap(ModelConfig.resolve("gemma"))

        // Start from clean global state so counter deltas are attributable to this test.
        LiveCounters.shared.reset()
        TokenPrefixCache.shared.reset()

        await manager.loadModel(gemmaConfig)
        let modelIsReady = await MainActor.run { manager.isReady }
        XCTAssertTrue(modelIsReady)

        let apiServer = await MainActor.run { APIServer() }
        // Random port in 20000...40000; nothing here checks that it is actually free.
        let chosenPort: UInt16 = .random(in: 20_000...40_000)
        await MainActor.run {
            apiServer.start(modelManager: manager, port: Int(chosenPort))
        }
        try await waitUntil(timeoutSeconds: 5) {
            await MainActor.run { apiServer.isRunning }
        }
        return TestHarness(server: apiServer, modelManager: manager, port: chosenPort)
    }
    /// POSTs `request` as JSON to the local `/v1/chat/completions` endpoint and
    /// decodes the complete (non-streaming) response body.
    ///
    /// Records a test failure (with the raw body as the message) when the status
    /// is not 200, then still attempts to decode.
    ///
    /// - Parameters:
    ///   - request: The chat-completion payload to send.
    ///   - port: Loopback port the test server listens on.
    /// - Returns: The decoded response.
    /// - Throws: Encoding, transport, or decoding errors.
    private func sendChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> APIChatCompletionResponse {
        var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
        post.httpMethod = "POST"
        post.setValue("application/json", forHTTPHeaderField: "Content-Type")
        post.httpBody = try JSONEncoder().encode(request)

        let (body, rawResponse) = try await URLSession.shared.data(for: post)
        let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
        XCTAssertEqual(http.statusCode, 200, String(data: body, encoding: .utf8) ?? "")

        let decoder = JSONDecoder()
        return try decoder.decode(APIChatCompletionResponse.self, from: body)
    }
    /// Sends a streaming request and collapses the event list into a summary:
    /// number of role events, concatenated content, all tool calls in arrival
    /// order, and the finish reason and usage of the last final event.
    ///
    /// - Parameters:
    ///   - request: The chat-completion payload to send.
    ///   - port: Loopback port the test server listens on.
    /// - Returns: The aggregated view of the stream.
    private func sendStreamingChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> StreamingResult {
        let detailed = try await sendStreamingChatCompletionDetailed(request, port: port)
        let events = detailed.events
        // Both the finish reason and the usage come from the same (last) final event.
        let closingEvent = events.last(where: { $0.kind == .final })
        let roleEvents = events.filter { $0.kind == .role }
        return StreamingResult(
            roleDeltaCount: roleEvents.count,
            content: events.compactMap(\.content).joined(),
            toolCalls: events.flatMap(\.toolCalls),
            finalFinishReason: closingEvent?.finishReason,
            usage: closingEvent?.usage,
            sawDone: detailed.sawDone
        )
    }
    /// Sends a streaming request and records every server-sent `data:` chunk as
    /// one or more typed events, in arrival order.
    ///
    /// For each chunk only the first choice is inspected. It can contribute, in
    /// this order, a role event (when the delta role is "assistant"), a content
    /// event, a tool-call event, and a final event (when a finish reason is set).
    /// Reading stops at the `[DONE]` sentinel.
    ///
    /// - Parameters:
    ///   - request: The chat-completion payload to send.
    ///   - port: Loopback port the test server listens on.
    /// - Returns: The events plus whether `[DONE]` was seen. On a non-200 status
    ///   a test failure is recorded and an empty result is returned.
    private func sendStreamingChatCompletionDetailed(_ request: APIChatCompletionRequest, port: UInt16) async throws -> DetailedStreamingResult {
        var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
        post.httpMethod = "POST"
        post.setValue("application/json", forHTTPHeaderField: "Content-Type")
        post.httpBody = try JSONEncoder().encode(request)

        let (bytes, rawResponse) = try await URLSession.shared.bytes(for: post)
        let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
        if http.statusCode != 200 {
            // Drain the error body so it can be shown in the failure message.
            var errorBody = ""
            for try await line in bytes.lines {
                errorBody.append(line)
            }
            XCTFail("Expected 200 response, got \(http.statusCode): \(errorBody)")
            return DetailedStreamingResult(events: [], sawDone: false)
        }

        let decoder = JSONDecoder()
        var collected: [StreamingEvent] = []
        var reachedDone = false
        for try await line in bytes.lines {
            // Only SSE data lines matter; anything else is ignored.
            guard line.hasPrefix("data: ") else { continue }
            let payload = String(line.dropFirst(6))
            guard payload != "[DONE]" else {
                reachedDone = true
                break
            }
            let chunk = try decoder.decode(APIChatCompletionChunk.self, from: Data(payload.utf8))
            guard let choice = chunk.choices.first else { continue }

            if let role = choice.delta.role, role == "assistant" {
                collected.append(StreamingEvent(kind: .role, role: role, content: nil, toolCalls: [], finishReason: nil, usage: nil))
            }
            if let text = choice.delta.content {
                collected.append(StreamingEvent(kind: .content, role: nil, content: text, toolCalls: [], finishReason: nil, usage: nil))
            }
            if let calls = choice.delta.tool_calls {
                collected.append(StreamingEvent(kind: .toolCall, role: nil, content: nil, toolCalls: calls, finishReason: nil, usage: nil))
            }
            if let reason = choice.finish_reason {
                collected.append(StreamingEvent(kind: .final, role: nil, content: nil, toolCalls: [], finishReason: reason, usage: chunk.usage))
            }
        }
        return DetailedStreamingResult(events: collected, sawDone: reachedDone)
    }
    /// Starts a streaming request on a private ephemeral session, waits until
    /// the first non-empty content delta arrives, then tears the connection
    /// down to simulate a client disconnecting mid-stream.
    ///
    /// - Parameters:
    ///   - request: The chat-completion payload to send (expected to stream).
    ///   - port: Loopback port the test server listens on.
    private func cancelStreamingChatCompletionAfterFirstContent(_ request: APIChatCompletionRequest, port: UInt16) async throws {
        var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
        post.httpMethod = "POST"
        post.setValue("application/json", forHTTPHeaderField: "Content-Type")
        post.httpBody = try JSONEncoder().encode(request)
        let outgoing = post

        let observer = StreamCancellationObserver()
        let session = URLSession(configuration: .ephemeral)
        let reader = Task {
            let (bytes, rawResponse) = try await session.bytes(for: outgoing)
            let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
            XCTAssertEqual(http.statusCode, 200)
            let decoder = JSONDecoder()
            for try await line in bytes.lines {
                guard line.hasPrefix("data: ") else { continue }
                let payload = String(line.dropFirst(6))
                if payload == "[DONE]" {
                    break
                }
                let chunk = try decoder.decode(APIChatCompletionChunk.self, from: Data(payload.utf8))
                guard let text = chunk.choices.first?.delta.content, !text.isEmpty else { continue }
                await observer.markFirstContentSeen()
                // Stop consuming but keep the connection open until the caller cancels.
                try await Task.sleep(nanoseconds: 30_000_000_000)
            }
        }

        try await waitUntil(timeoutSeconds: 10) {
            await observer.hasSeenFirstContent
        }
        // Drop the connection first, then cancel the reader; its error is expected and ignored.
        session.invalidateAndCancel()
        reader.cancel()
        _ = try? await reader.value
    }
    /// Polls `condition` until it returns `true` or the timeout elapses.
    ///
    /// The condition is always evaluated at least once, and once more after the
    /// last sleep, so a state change that lands in the final polling interval is
    /// not reported as a timeout. On timeout a test failure is recorded at the
    /// caller's location and the function returns normally.
    ///
    /// - Parameters:
    ///   - timeoutSeconds: Maximum time to keep polling.
    ///   - intervalNanoseconds: Sleep between evaluations (default 100 ms).
    ///   - file: Source file used for the failure; defaults to the call site.
    ///   - line: Source line used for the failure; defaults to the call site.
    ///   - condition: Async predicate to evaluate.
    /// - Throws: `CancellationError` if the task is cancelled while sleeping.
    private func waitUntil(
        timeoutSeconds: TimeInterval,
        intervalNanoseconds: UInt64 = 100_000_000,
        file: StaticString = #filePath,
        line: UInt = #line,
        condition: @escaping () async -> Bool
    ) async throws {
        let deadline = Date().addingTimeInterval(timeoutSeconds)
        while true {
            if await condition() {
                return
            }
            // Check the deadline only after evaluating, so the condition gets a
            // final look once the last sleep has finished.
            guard Date() < deadline else { break }
            try await Task.sleep(nanoseconds: intervalNanoseconds)
        }
        XCTFail("Condition not met before timeout", file: file, line: line)
    }
 }
 /// Actor-isolated one-way flag that lets the streaming reader task tell the
 /// test body that the first content chunk has arrived.
 private actor StreamCancellationObserver {
    /// `false` initially; becomes `true` after `markFirstContentSeen()` and is never reset.
    private(set) var hasSeenFirstContent = false

    /// Records that a non-empty content delta was received.
    func markFirstContentSeen() {
        hasSeenFirstContent = true
    }
 }
 /// Raw outcome of one streaming request, as produced by
 /// `sendStreamingChatCompletionDetailed`.
 private struct DetailedStreamingResult {
    /// Events derived from the stream's `data:` chunks, in arrival order.
    let events: [StreamingEvent]
    /// Whether the `[DONE]` sentinel was read before the stream ended.
    let sawDone: Bool
 }
 /// One typed observation extracted from a streamed chunk's first choice.
 /// A single chunk can yield several events (see `sendStreamingChatCompletionDetailed`).
 private struct StreamingEvent {
    /// Which part of the chunk this event represents.
    enum Kind {
        /// The delta carried `role == "assistant"`.
        case role
        /// The delta carried a `content` string.
        case content
        /// The delta carried `tool_calls`.
        case toolCall
        /// The choice carried a `finish_reason`.
        case final
    }
    let kind: Kind
    /// Set only on `.role` events.
    let role: String?
    /// Set only on `.content` events.
    let content: String?
    /// Non-empty only on `.toolCall` events; `[]` otherwise.
    let toolCalls: [APIToolCall]
    /// Set only on `.final` events.
    let finishReason: String?
    /// The chunk's usage, copied only onto `.final` events; may still be nil there.
    let usage: APIUsageInfo?
 }
 /// Aggregated view of one streaming response, built by
 /// `sendStreamingChatCompletion` from the detailed event list.
 private struct StreamingResult {
    /// Number of `.role` events seen.
    let roleDeltaCount: Int
    /// All content deltas joined in arrival order.
    let content: String
    /// All tool calls from every tool-call event, in arrival order.
    let toolCalls: [APIToolCall]
    /// Finish reason of the last `.final` event, or nil if there was none.
    let finalFinishReason: String?
    /// Usage attached to the last `.final` event, if any.
    let usage: APIUsageInfo?
    /// Whether the `[DONE]` sentinel was read.
    let sawDone: Bool
 }
 /// Bundle of the live objects one test runs against, as returned by `makeHarness()`.
 private struct TestHarness {
    let server: APIServer
    let modelManager: ModelManager
    /// Loopback port the server was started on.
    let port: UInt16
    /// Tears the fixture down. Synchronous so it can be called from `defer`.
    ///
    /// NOTE(review): the server stop and model unload are only *scheduled* here
    /// (unstructured main-actor Task, never awaited), while the cache reset runs
    /// immediately. So the cache may be reset before the server has stopped, and
    /// the next test's setup may begin before this teardown has finished —
    /// confirm this ordering is acceptable.
    func stop() {
        Task { @MainActor in
            server.stop()
            modelManager.unloadModel()
        }
        TokenPrefixCache.shared.reset()
    }
 }
--- a/docs/session-cache-upgrade.md
+++ b/docs/session-cache-upgrade.md
@@ -2572,14 +2572,18 @@ Validation note: `PromptBuilder.swift` is now covered by both shaping-parity uni
 ### Phase 3: Integration
-7. **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
+7. [x] **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
 8. **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested.
 Validation note: `APIServer.swift` now routes the API path through `PromptBuilder`, `InferenceEngine`, `TokenPrefixCache`, and `StreamingSSEEncoder`, and the full repository test workflow is green. Image-bearing requests intentionally bypass prefix-cache reuse for now until image fingerprinting is implemented.
 ### Phase 4: Statistics & Monitoring
 9. **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer.
-10. **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
+10. [x] **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
-11. **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
+11. [x] **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
 Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly and `MonitorView.swift` has been rebuilt around current system state and prefix-cache visibility rather than session-era charts. The dashboard now exposes cache match quality from matched-vs-rebuilt prompt token counters, but it still does not expose TTFT, cache match depth, or vision timing because those `LiveCounters` signals have not been implemented yet.
 ### Phase 5: Advanced Cache Matching