feat: better hardening with unit tests and end-to-end tests

2026-03-20 10:27:39 +01:00
parent e40a2f3c45
commit aadcc308a5
7 changed files with 1395 additions and 1326 deletions
--- a/MLXServer.xcodeproj/project.pbxproj
+++ b/MLXServer.xcodeproj/project.pbxproj
@@ -46,6 +46,7 @@
 		C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */; };
 		C34F02550C584BB2547F0F6C /* ChatDocumentPackage.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */; };
 		CBA88529F8BE7BD0518994AD /* SceneSelectionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5B5ABDEB6F5C54856EB1A9E /* SceneSelectionView.swift */; };
+		CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */; };
 		CFEE79815DFB80E51FE3745A /* SceneStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = C234359924C542F07ED926A2 /* SceneStore.swift */; };
 		D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
 		D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
@@ -53,7 +54,6 @@
 		E199D0BB09B61AC128AB093A /* CancellationToken.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3489501F2F8E1BA382347CFA /* CancellationToken.swift */; };
 		E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */; };
 		EC4FC68608DDFA6A3DF133CC /* InferenceEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */; };
-		F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */; };
 		F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
 		FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
 		FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
@@ -114,6 +114,7 @@
 		DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; };
 		E1E62624B6F285479CB33041 /* PromptBuilder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PromptBuilder.swift; sourceTree = "<group>"; };
 		E35452B166893B25E765FF70 /* InferenceStats.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceStats.swift; sourceTree = "<group>"; };
+		E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServerRewriteTests.swift; sourceTree = "<group>"; };
 		E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoderTests.swift; sourceTree = "<group>"; };
 		E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; };
 		E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
@@ -121,7 +122,6 @@
 		F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
 		F4CE2D594F7433C76169151A /* MLXServerTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = MLXServerTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
 		FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CancellationTokenTests.swift; sourceTree = "<group>"; };
-		FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConversationSessionCache.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
@@ -172,6 +172,7 @@
 		154AF0C071A7DC02EB5F6F49 /* Server */ = {
 			isa = PBXGroup;
 			children = (
+				E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */,
 				FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */,
 				E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */,
 				D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */,
@@ -263,7 +264,6 @@
 				F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
 				3D08828E16B17EF02C14243E /* APIServer.swift */,
 				3489501F2F8E1BA382347CFA /* CancellationToken.swift */,
-				FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */,
 				7C1A89C076E717F87A60397D /* ImageDecoder.swift */,
 				02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */,
 				E1E62624B6F285479CB33041 /* PromptBuilder.swift */,
@@ -379,6 +379,7 @@
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */,
 				962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */,
 				E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */,
 				8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */,
@@ -406,7 +407,6 @@
 				85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
 				B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
 				5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
-				F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */,
 				C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
 				4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
 				A146BBA70CFBEC505BDCDF0D /* ImageDecoder.swift in Sources */,
--- a/MLXServer/Models/InferenceStats.swift
+++ b/MLXServer/Models/InferenceStats.swift
@@ -24,11 +24,15 @@ final class LiveCounters: @unchecked Sendable {
    private var _isGenerating: Bool = false
    private var _contextMax: Int = 0
    private var _currentPhaseElapsed: TimeInterval = 0
+    private var _currentCacheMatchedPromptTokens: Int = 0
+    private var _currentCacheRebuiltPromptTokens: Int = 0

    // Cumulative
    private var _totalRequests: Int = 0
    private var _totalPromptTokens: Int = 0
    private var _totalGenerationTokens: Int = 0
+    private var _totalCacheReusePromptTokens: Int = 0
+    private var _totalCacheRebuildPromptTokens: Int = 0
    private var _totalPreparingDuration: TimeInterval = 0
    private var _totalSessionBuildDuration: TimeInterval = 0
    private var _totalPrefillDuration: TimeInterval = 0
@@ -90,6 +94,26 @@ final class LiveCounters: @unchecked Sendable {
        lock.unlock()
    }

+    /// Records, for one tracked request, how many prompt tokens matched the cache and how many
+    /// are counted as rebuilt. Does nothing when `requestId` has no entry in `requestPhases`.
+    func recordPrefillReuse(requestId: String, matchedPromptTokens: Int, promptTokenCount: Int) {
+        lock.lock()
+        defer { lock.unlock() }
+        guard var state = requestPhases[requestId] else { return }
+
+        // Clamp so that matched + rebuilt always equals the (non-negative) prompt size.
+        let promptTotal = max(0, promptTokenCount)
+        let matched = min(max(0, matchedPromptTokens), promptTotal)
+        let rebuilt = promptTotal - matched
+
+        _totalCacheReusePromptTokens += matched
+        _totalCacheRebuildPromptTokens += rebuilt
+        state.matchedPromptTokens = matched
+        state.rebuiltPromptTokens = rebuilt
+        requestPhases[requestId] = state
+        refreshCurrentCachePromptStatsLocked()
+    }
+
    func requestCompleted(requestId: String, generationTokens: Int) {
        let now = Date()
        lock.lock()
@@ -108,6 +132,7 @@ final class LiveCounters: @unchecked Sendable {
            _isGenerating = _generatingRequests > 0
        }
        refreshCurrentPhaseElapsed(now: now)
+        refreshCurrentCachePromptStatsLocked()
        lock.unlock()
    }

@@ -126,9 +151,13 @@ final class LiveCounters: @unchecked Sendable {
        _isGenerating = false
        _contextMax = 0
        _currentPhaseElapsed = 0
+        _currentCacheMatchedPromptTokens = 0
+        _currentCacheRebuiltPromptTokens = 0
        _totalRequests = 0
        _totalPromptTokens = 0
        _totalGenerationTokens = 0
+        _totalCacheReusePromptTokens = 0
+        _totalCacheRebuildPromptTokens = 0
        _totalPreparingDuration = 0
        _totalSessionBuildDuration = 0
        _totalPrefillDuration = 0
@@ -154,9 +183,13 @@ final class LiveCounters: @unchecked Sendable {
            isGenerating: _isGenerating,
            contextMax: _contextMax,
            currentPhaseElapsed: _currentPhaseElapsed,
+            currentCacheMatchedPromptTokens: _currentCacheMatchedPromptTokens,
+            currentCacheRebuiltPromptTokens: _currentCacheRebuiltPromptTokens,
            totalRequests: _totalRequests,
            totalPromptTokens: _totalPromptTokens,
            totalGenerationTokens: _totalGenerationTokens,
+            totalCacheReusePromptTokens: _totalCacheReusePromptTokens,
+            totalCacheRebuildPromptTokens: _totalCacheRebuildPromptTokens,
            totalPreparingDuration: _totalPreparingDuration,
            totalSessionBuildDuration: _totalSessionBuildDuration,
            totalPrefillDuration: _totalPrefillDuration,
@@ -179,9 +212,13 @@ final class LiveCounters: @unchecked Sendable {
        let isGenerating: Bool
        let contextMax: Int
        let currentPhaseElapsed: TimeInterval
+        let currentCacheMatchedPromptTokens: Int
+        let currentCacheRebuiltPromptTokens: Int
        let totalRequests: Int
        let totalPromptTokens: Int
        let totalGenerationTokens: Int
+        let totalCacheReusePromptTokens: Int
+        let totalCacheRebuildPromptTokens: Int
        let totalPreparingDuration: TimeInterval
        let totalSessionBuildDuration: TimeInterval
        let totalPrefillDuration: TimeInterval
@@ -231,9 +268,16 @@ final class LiveCounters: @unchecked Sendable {
        _currentPhaseElapsed = requestPhases.values.map { now.timeIntervalSince($0.phaseStartedAt) }.max() ?? 0
    }

+    /// Re-sums matched/rebuilt prompt tokens over every entry in `requestPhases`; call with `lock` held.
+    private func refreshCurrentCachePromptStatsLocked() {
+        (_currentCacheMatchedPromptTokens, _currentCacheRebuiltPromptTokens) = requestPhases.values.reduce((0, 0)) { ($0.0 + $1.matchedPromptTokens, $0.1 + $1.rebuiltPromptTokens) }
+    }
+
    private struct RequestState {
        var phase: RequestPhase
        var phaseStartedAt: Date
+        var matchedPromptTokens: Int = 0
+        var rebuiltPromptTokens: Int = 0
    }

    enum RequestPhase {
@@ -264,17 +308,20 @@ final class InferenceStats {
    var contextUsed: Int = 0
    var contextMax: Int = 0
    var currentPhaseElapsed: TimeInterval = 0
+    var currentCacheMatchedPromptTokens: Int = 0
+    var currentCacheRebuiltPromptTokens: Int = 0

    // MARK: - Cumulative counters

    var totalRequests: Int = 0
    var totalPromptTokens: Int = 0
    var totalGenerationTokens: Int = 0
+    var totalCacheReusePromptTokens: Int = 0
+    var totalCacheRebuildPromptTokens: Int = 0
    var totalCacheHits: Int = 0
    var totalCacheMisses: Int = 0
    var totalCacheEvictions: Int = 0
-    var totalCacheReusePromptTokens: Int = 0
-    var totalCacheRebuildPromptTokens: Int = 0
+    var cacheHitRatePercent: Double = 0
    var totalPreparingDuration: TimeInterval = 0
    var totalSessionBuildDuration: TimeInterval = 0
    var totalPrefillDuration: TimeInterval = 0
@@ -283,12 +330,11 @@ final class InferenceStats {
    // MARK: - Cache state

    var cacheEntryCount: Int = 0
-    var warmCacheEntryCount: Int = 0
-    var activeCacheEntryCount: Int = 0
-    var generatingCacheEntryCount: Int = 0
    var cacheEstimatedBytes: Int = 0
    var cacheEstimatedTokens: Int = 0
-    var cachedSessions: [ConversationSessionCache.SessionSummary] = []
+    var cacheMemoryBudgetBytes: Int = 0
+    var cacheMemoryUsagePercent: Double = 0
+    var cachedEntries: [TokenPrefixCache.EntrySummary] = []

    // MARK: - Time series data (ring buffers for charts)

@@ -302,13 +348,14 @@ final class InferenceStats {
    private(set) var promptTokenHistory: [DataPoint] = []
    private(set) var generationTokenHistory: [DataPoint] = []
    private(set) var cacheEntryHistory: [DataPoint] = []
-    private(set) var activeSessionHistory: [DataPoint] = []
    private(set) var cacheFootprintHistory: [DataPoint] = []
-    private(set) var cacheReuseHistory: [DataPoint] = []
-    private(set) var cacheRebuildHistory: [DataPoint] = []
+    private(set) var cacheHitRateHistory: [DataPoint] = []
+    private(set) var cacheMemoryPressureHistory: [DataPoint] = []
    private(set) var currentPhaseElapsedHistory: [DataPoint] = []
    private(set) var prefillDurationHistory: [DataPoint] = []
-    private(set) var sessionBuildDurationHistory: [DataPoint] = []
+    private(set) var cacheReusePromptHistory: [DataPoint] = []
+    private(set) var cacheRebuildPromptHistory: [DataPoint] = []
+    private(set) var cacheMatchQualityHistory: [DataPoint] = []

    private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz

@@ -316,10 +363,9 @@ final class InferenceStats {
    private var sampleTimer: Timer?
    private var lastGenerationTokenCount: Int = 0
    private var lastPromptTokenCount: Int = 0
-    private var lastCacheReuseTokenCount: Int = 0
-    private var lastCacheRebuildTokenCount: Int = 0
    private var lastPrefillDuration: TimeInterval = 0
-    private var lastSessionBuildDuration: TimeInterval = 0
+    private var lastCacheReusePromptTokenCount: Int = 0
+    private var lastCacheRebuildPromptTokenCount: Int = 0

    func startSampling() {
        guard sampleTimer == nil else { return }
@@ -338,7 +384,7 @@ final class InferenceStats {
    private func recordSample() {
        // Pull live values from the thread-safe counters
        let snap = LiveCounters.shared.snapshot()
-        let cache = ConversationSessionCache.shared.snapshot()
+        let cache = TokenPrefixCache.shared.snapshot()

        activeRequests = snap.activeRequests
        preparingRequests = snap.preparingRequests
@@ -353,9 +399,13 @@ final class InferenceStats {
        contextMax = snap.contextMax
        contextUsed = snap.promptTokens + snap.generationTokens
        currentPhaseElapsed = snap.currentPhaseElapsed
+        currentCacheMatchedPromptTokens = snap.currentCacheMatchedPromptTokens
+        currentCacheRebuiltPromptTokens = snap.currentCacheRebuiltPromptTokens
        totalRequests = snap.totalRequests
        totalPromptTokens = snap.totalPromptTokens
        totalGenerationTokens = snap.totalGenerationTokens
+        totalCacheReusePromptTokens = snap.totalCacheReusePromptTokens
+        totalCacheRebuildPromptTokens = snap.totalCacheRebuildPromptTokens
        totalPreparingDuration = snap.totalPreparingDuration
        totalSessionBuildDuration = snap.totalSessionBuildDuration
        totalPrefillDuration = snap.totalPrefillDuration
@@ -363,41 +413,41 @@ final class InferenceStats {
        totalCacheHits = cache.totalHits
        totalCacheMisses = cache.totalMisses
        totalCacheEvictions = cache.totalEvictions
-        totalCacheReusePromptTokens = cache.totalReusePromptTokens
-        totalCacheRebuildPromptTokens = cache.totalRebuildPromptTokens
+        cacheHitRatePercent = cache.hitRate
        cacheEntryCount = cache.totalEntries
-        warmCacheEntryCount = cache.warmEntries
-        activeCacheEntryCount = cache.activeEntries
-        generatingCacheEntryCount = cache.generatingEntries
        cacheEstimatedBytes = cache.estimatedBytes
-        cacheEstimatedTokens = cache.cachedTokenEstimate
-        cachedSessions = cache.sessions
+        cacheEstimatedTokens = cache.totalCachedTokens
+        cacheMemoryBudgetBytes = cache.memoryBudgetBytes
+        cacheMemoryUsagePercent = cache.memoryUsagePercent
+        cachedEntries = cache.entries

        let now = Date.now
        let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
        let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
-        let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
-        let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
        let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
-        let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration
+        let cacheReusePromptDelta = snap.totalCacheReusePromptTokens - lastCacheReusePromptTokenCount
+        let cacheRebuildPromptDelta = snap.totalCacheRebuildPromptTokens - lastCacheRebuildPromptTokenCount
+        let cacheMatchQualityDelta = cacheReusePromptDelta + cacheRebuildPromptDelta > 0
+            ? (Double(cacheReusePromptDelta) / Double(cacheReusePromptDelta + cacheRebuildPromptDelta)) * 100
+            : 0
        lastGenerationTokenCount = snap.totalGenerationTokens
        lastPromptTokenCount = snap.totalPromptTokens
-        lastCacheReuseTokenCount = cache.totalReusePromptTokens
-        lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
        lastPrefillDuration = snap.totalPrefillDuration
-        lastSessionBuildDuration = snap.totalSessionBuildDuration
+        lastCacheReusePromptTokenCount = snap.totalCacheReusePromptTokens
+        lastCacheRebuildPromptTokenCount = snap.totalCacheRebuildPromptTokens

        tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
        generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
        promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
        cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries)))
-        activeSessionHistory.append(DataPoint(timestamp: now, value: Double(cache.activeEntries)))
        cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
-        cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
-        cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
+        cacheHitRateHistory.append(DataPoint(timestamp: now, value: cache.hitRate))
+        cacheMemoryPressureHistory.append(DataPoint(timestamp: now, value: cache.memoryUsagePercent))
        currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
        prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
-        sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta))
+        cacheReusePromptHistory.append(DataPoint(timestamp: now, value: Double(cacheReusePromptDelta)))
+        cacheRebuildPromptHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildPromptDelta)))
+        cacheMatchQualityHistory.append(DataPoint(timestamp: now, value: cacheMatchQualityDelta))

        if tokenRateHistory.count > Self.maxHistoryPoints {
            tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -411,17 +461,14 @@ final class InferenceStats {
        if cacheEntryHistory.count > Self.maxHistoryPoints {
            cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints)
        }
-        if activeSessionHistory.count > Self.maxHistoryPoints {
-            activeSessionHistory.removeFirst(activeSessionHistory.count - Self.maxHistoryPoints)
-        }
        if cacheFootprintHistory.count > Self.maxHistoryPoints {
            cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints)
        }
-        if cacheReuseHistory.count > Self.maxHistoryPoints {
-            cacheReuseHistory.removeFirst(cacheReuseHistory.count - Self.maxHistoryPoints)
+        if cacheHitRateHistory.count > Self.maxHistoryPoints {
+            cacheHitRateHistory.removeFirst(cacheHitRateHistory.count - Self.maxHistoryPoints)
        }
-        if cacheRebuildHistory.count > Self.maxHistoryPoints {
-            cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
+        if cacheMemoryPressureHistory.count > Self.maxHistoryPoints {
+            cacheMemoryPressureHistory.removeFirst(cacheMemoryPressureHistory.count - Self.maxHistoryPoints)
        }
        if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
            currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
@@ -429,14 +476,20 @@ final class InferenceStats {
        if prefillDurationHistory.count > Self.maxHistoryPoints {
            prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
        }
-        if sessionBuildDurationHistory.count > Self.maxHistoryPoints {
-            sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints)
+        if cacheReusePromptHistory.count > Self.maxHistoryPoints {
+            cacheReusePromptHistory.removeFirst(cacheReusePromptHistory.count - Self.maxHistoryPoints)
+        }
+        if cacheRebuildPromptHistory.count > Self.maxHistoryPoints {
+            cacheRebuildPromptHistory.removeFirst(cacheRebuildPromptHistory.count - Self.maxHistoryPoints)
+        }
+        if cacheMatchQualityHistory.count > Self.maxHistoryPoints {
+            cacheMatchQualityHistory.removeFirst(cacheMatchQualityHistory.count - Self.maxHistoryPoints)
        }
    }

    func reset() {
        LiveCounters.shared.reset()
-        ConversationSessionCache.shared.reset()
+        TokenPrefixCache.shared.reset()
        activeRequests = 0
        preparingRequests = 0
        sessionBuildRequests = 0
@@ -450,9 +503,13 @@ final class InferenceStats {
        contextUsed = 0
        contextMax = 0
        currentPhaseElapsed = 0
+        currentCacheMatchedPromptTokens = 0
+        currentCacheRebuiltPromptTokens = 0
        totalRequests = 0
        totalPromptTokens = 0
        totalGenerationTokens = 0
+        totalCacheReusePromptTokens = 0
+        totalCacheRebuildPromptTokens = 0
        totalPreparingDuration = 0
        totalSessionBuildDuration = 0
        totalPrefillDuration = 0
@@ -460,31 +517,41 @@ final class InferenceStats {
        totalCacheHits = 0
        totalCacheMisses = 0
        totalCacheEvictions = 0
-        totalCacheReusePromptTokens = 0
-        totalCacheRebuildPromptTokens = 0
+        cacheHitRatePercent = 0
        cacheEntryCount = 0
-        warmCacheEntryCount = 0
-        activeCacheEntryCount = 0
-        generatingCacheEntryCount = 0
        cacheEstimatedBytes = 0
        cacheEstimatedTokens = 0
-        cachedSessions.removeAll()
+        cacheMemoryBudgetBytes = 0
+        cacheMemoryUsagePercent = 0
+        cachedEntries.removeAll()
        tokenRateHistory.removeAll()
        promptTokenHistory.removeAll()
        generationTokenHistory.removeAll()
        cacheEntryHistory.removeAll()
-        activeSessionHistory.removeAll()
        cacheFootprintHistory.removeAll()
-        cacheReuseHistory.removeAll()
-        cacheRebuildHistory.removeAll()
+        cacheHitRateHistory.removeAll()
+        cacheMemoryPressureHistory.removeAll()
        currentPhaseElapsedHistory.removeAll()
        prefillDurationHistory.removeAll()
-        sessionBuildDurationHistory.removeAll()
+        cacheReusePromptHistory.removeAll()
+        cacheRebuildPromptHistory.removeAll()
+        cacheMatchQualityHistory.removeAll()
        lastGenerationTokenCount = 0
        lastPromptTokenCount = 0
-        lastCacheReuseTokenCount = 0
-        lastCacheRebuildTokenCount = 0
        lastPrefillDuration = 0
-        lastSessionBuildDuration = 0
+        lastCacheReusePromptTokenCount = 0
+        lastCacheRebuildPromptTokenCount = 0
+    }
+
+    /// Matched share (0–100) of the currently sampled matched + rebuilt prompt tokens; 0 when both are zero.
+    var currentCacheMatchQualityPercent: Double {
+        let matched = currentCacheMatchedPromptTokens, combined = matched + currentCacheRebuiltPromptTokens
+        return combined > 0 ? (Double(matched) / Double(combined)) * 100 : 0
+    }
+
+    /// Reused share (0–100) of the cumulative reuse + rebuild prompt tokens; 0 when both are zero.
+    var totalCacheMatchQualityPercent: Double {
+        let reused = totalCacheReusePromptTokens, combined = reused + totalCacheRebuildPromptTokens
+        return combined > 0 ? (Double(reused) / Double(combined)) * 100 : 0
+    }
 }
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -63,7 +63,7 @@ final class APIServer {
        listener?.cancel()
        listener = nil
        isRunning = false
-        ConversationSessionCache.shared.invalidateAll()
+        TokenPrefixCache.shared.invalidateAll()
        inferenceStats.stopSampling()
    }

@@ -176,7 +176,7 @@ final class APIServer {
            if let targetConfig = ModelConfig.resolve(requestedModel) {
                if modelManager.currentModel?.id != targetConfig.id {
                    print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
-                    ConversationSessionCache.shared.invalidateAll()
+                    TokenPrefixCache.shared.invalidateAll()
                    await modelManager.loadModel(targetConfig)
                }
            }
@@ -187,7 +187,7 @@ final class APIServer {
        if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
           let config = ModelConfig.resolve(lastModelId) {
            print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
-            ConversationSessionCache.shared.invalidateAll()
+            TokenPrefixCache.shared.invalidateAll()
            await modelManager.loadModel(config)
        }

@@ -260,110 +260,80 @@ final class APIServer {
            temperature: Float(temperature),
            topP: Float(topP)
        )
-
-        // Feed all messages except the last as history, then send the last as the prompt
-        let chatMessages = preparedPrompt.chatMessages
-        let allButLast = Array(chatMessages.dropLast())
-        let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "")
-
-        let historySignatures = Array(preparedPrompt.messageSignatures.dropLast())
        let currentModelId = modelManager.currentModel?.id ?? modelName
-        let lease = ConversationSessionCache.shared.checkoutSession(
-            modelId: currentModelId,
-            instructions: preparedPrompt.instructions,
-            historySignatures: historySignatures,
-            requestMessageCount: chatMessages.count,
-            estimatedPromptTokens: estimatedPromptTokens,
-            estimatedBytes: preparedPrompt.estimatedBytes
-        )
-
-        let session: ChatSession
-        if let reusableSession = lease.session {
-            print("[APIServer] Reusing cached session (\(allButLast.count) history messages)")
-            session = reusableSession
-            session.generateParameters = generateParams
-            ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
-            LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
-        } else {
-            print("[APIServer] Creating fresh session")
-            ConversationSessionCache.shared.markSessionBuild(entryId: lease.entryId)
-            LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .sessionBuild)
-            // Use `instructions:` for system/tool prompt (matches internal chat pattern).
-            // Only conversation turns go in `history:` — this avoids replaying the
-            // large tool prompt as history on every new session.
-            let instr = preparedPrompt.instructions.isEmpty ? nil : preparedPrompt.instructions
-            if !allButLast.isEmpty {
-                session = ChatSession(
-                    container,
-                    instructions: instr,
-                    history: allButLast,
-                    generateParameters: generateParams,
-                    additionalContext: preparedPrompt.additionalContext
-                )
-            } else {
-                session = ChatSession(
-                    container,
-                    instructions: instr,
-                    generateParameters: generateParams,
-                    additionalContext: preparedPrompt.additionalContext
-                )
-            }
-            ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
-            LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
+        let engine = InferenceEngine(container: container)
+        let preparedInference: InferenceEngine.PreparedInference
+        do {
+            preparedInference = try await engine.prepare(preparedPrompt.userInput)
+        } catch {
+            LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
+            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
+            return
        }

-        // Extract images from the last message only (ChatSession.streamDetails takes images separately)
-        let lastImages = lastMessage.images
+        // Vision requests stay uncached until image fingerprinting lands.
+        let cacheKey = preparedInference.hasImages ? nil : preparedInference.tokens
+        let lease = cacheKey.map { TokenPrefixCache.shared.lookup(cacheKey: $0, modelId: currentModelId) }
+            ?? TokenPrefixCache.CacheLease(entryId: UUID(), kvCache: nil, matchedTokenCount: 0, isHit: false)

-        let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool)
+        LiveCounters.shared.recordPrefillReuse(
+            requestId: requestId,
+            matchedPromptTokens: lease.matchedTokenCount,
+            promptTokenCount: preparedInference.tokens.count
+        )

+        LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
+
+        let cancellation = CancellationToken()
+        let streamHandle: InferenceEngine.StreamHandle
+        do {
+            streamHandle = try await engine.stream(
+                InferenceEngine.InferenceRequest(
+                    input: preparedInference.lmInput,
+                    tokens: preparedInference.tokens,
+                    parameters: generateParams,
+                    cachedKV: lease.kvCache,
+                    cachedTokenCount: lease.matchedTokenCount
+                ),
+                cancellation: cancellation
+            )
+        } catch {
+            LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
+            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
+            return
+        }
+
+        let result: GenerationOutcome
        if isStream {
            result = await handleStreamingResponse(
                connection: connection,
                requestId: requestId,
-                cacheEntryId: lease.entryId,
-                session: session,
-                prompt: lastMessage.content,
-                images: lastImages,
+                cancellation: cancellation,
+                stream: streamHandle.stream,
                tools: request.tools,
                created: created,
-                modelName: modelName,
-                isQwen: isQwen
+                modelName: modelName
            )
        } else {
            result = await handleNonStreamingResponse(
                connection: connection,
                requestId: requestId,
-                cacheEntryId: lease.entryId,
-                session: session,
-                prompt: lastMessage.content,
-                images: lastImages,
+                stream: streamHandle.stream,
                tools: request.tools,
                created: created,
-                modelName: modelName,
-                isQwen: isQwen
+                modelName: modelName
            )
        }

-        if result.succeeded {
-            var cachedSignatures = preparedPrompt.messageSignatures
-            if let assistantHistoryText = result.assistantHistoryText {
-                cachedSignatures.append(
-                    Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: [])
-                )
-            }
-            ConversationSessionCache.shared.completeRequest(
+        if let cacheKey,
+           result.succeeded || result.cancelled {
+            Self.storePromptCache(
+                streamHandle.workingCache,
+                promptTokenCount: preparedInference.tokens.count,
                entryId: lease.entryId,
-                session: session,
-                requestMessageSignatures: cachedSignatures,
-                requestMessageCount: cachedSignatures.count,
-                estimatedPromptTokens: estimatedPromptTokens,
-                estimatedBytes: preparedPrompt.estimatedBytes,
-                promptTokens: result.promptTokens,
-                completionTokens: result.completionTokens
+                cacheKey: cacheKey,
+                modelId: currentModelId
            )
-        } else {
-            ConversationSessionCache.shared.abandonRequest(entryId: lease.entryId)
        }

        LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens)
@@ -375,53 +345,20 @@ final class APIServer {
    private func handleNonStreamingResponse(
        connection: NWConnection,
        requestId: String,
-        cacheEntryId: UUID,
-        session: ChatSession,
-        prompt: String,
-        images: [UserInput.Image],
+        stream: AsyncStream<Generation>,
        tools: [APIToolDefinition]?,
        created: Int,
-        modelName: String,
-        isQwen: Bool
-    ) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
+        modelName: String
+    ) async -> GenerationOutcome {
        do {
-            var fullText = ""
-            var promptTokens = 0
-            var completionTokens = 0
-            var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
-
-            let stream = session.streamDetails(
-                to: prompt,
-                images: images,
-                videos: []
+            let outcome = await Self.collectGenerationOutcome(
+                stream: stream,
+                requestId: requestId,
+                cancellation: nil
            )
-
-            for try await generation in stream {
-                switch generation {
-                case .chunk(let text):
-                    fullText += text
-                    completionTokens += 1
-                    LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
-                case .info(let info):
-                    promptTokens = info.promptTokenCount
-                    completionTokens = info.generationTokenCount
-                    ConversationSessionCache.shared.markGenerating(
-                        entryId: cacheEntryId,
-                        promptTokens: promptTokens,
-                        completionTokens: completionTokens
-                    )
-                    LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
-                    if info.tokensPerSecond > 0 {
-                        LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
-                    }
-                case .toolCall(let call):
-                    frameworkToolCalls.append(call)
-                }
-            }
-
            let resolved = Self.resolveAssistantResponse(
-                fullText: fullText,
-                frameworkToolCalls: frameworkToolCalls,
+                fullText: outcome.fullText,
+                frameworkToolCalls: outcome.frameworkToolCalls,
                tools: tools
            )

@@ -442,24 +379,26 @@ final class APIServer {
                    )
                ],
                usage: APIUsageInfo(
-                    prompt_tokens: promptTokens,
-                    completion_tokens: completionTokens,
-                    total_tokens: promptTokens + completionTokens
+                    prompt_tokens: outcome.promptTokens,
+                    completion_tokens: outcome.completionTokens,
+                    total_tokens: outcome.promptTokens + outcome.completionTokens
                )
            )

            if let json = try? JSONEncoder().encode(response) {
                sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
            }
-            let assistantHistoryText = Self.normalizedAssistantHistoryContent(
-                content: resolved.content,
-                toolCalls: resolved.toolCalls,
-                isQwen: isQwen
+            return GenerationOutcome(
+                promptTokens: outcome.promptTokens,
+                completionTokens: outcome.completionTokens,
+                fullText: outcome.fullText,
+                frameworkToolCalls: outcome.frameworkToolCalls,
+                succeeded: true,
+                cancelled: false
            )
-            return (promptTokens, completionTokens, assistantHistoryText, true)
        } catch {
            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
-            return (0, 0, nil, false)
+            return GenerationOutcome(promptTokens: 0, completionTokens: 0, fullText: "", frameworkToolCalls: [], succeeded: false, cancelled: false)
        }
    }

@@ -468,15 +407,12 @@ final class APIServer {
    private func handleStreamingResponse(
        connection: NWConnection,
        requestId: String,
-        cacheEntryId: UUID,
-        session: ChatSession,
-        prompt: String,
-        images: [UserInput.Image],
+        cancellation: CancellationToken,
+        stream: AsyncStream<Generation>,
        tools: [APIToolDefinition]?,
        created: Int,
-        modelName: String,
-        isQwen: Bool
-    ) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
+        modelName: String
+    ) async -> GenerationOutcome {
        // Send SSE headers
        let header = [
            "HTTP/1.1 200 OK",
@@ -489,55 +425,34 @@ final class APIServer {
        ].joined(separator: "\r\n")

        await Self.sendData(connection: connection, data: header.data(using: .utf8)!)
+        connection.stateUpdateHandler = { state in
+            switch state {
+            case .cancelled, .failed:
+                cancellation.cancel()
+            default:
+                break
+            }
+        }

-        // Send initial role chunk
-        await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
-            id: requestId,
-            object: "chat.completion.chunk",
-            created: created,
-            model: modelName,
-            choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: "assistant", content: nil, tool_calls: nil), finish_reason: nil)],
-            usage: nil
-        ))
+        let encoder = StreamingSSEEncoder(requestId: requestId, created: created, modelName: modelName)
+        await Self.sendData(connection: connection, data: encoder.encodeRoleDelta("assistant"))

-        let hasTools = tools != nil && !(tools?.isEmpty ?? true)
-
-        // Run the generation loop OFF MainActor.
-        // ChatSession and NWConnection don't need MainActor.
-        // Running on MainActor caused every token to compete with SwiftUI
-        // rendering, creating back-pressure that coalesced all output.
-        let stream = session.streamDetails(
-            to: prompt,
-            images: images,
-            videos: []
+        let result = await Self.runStreamingLoop(
+            connection: connection,
+            stream: stream,
+            cancellation: cancellation,
+            requestId: requestId,
+            encoder: encoder
        )
-        // Transfer non-Sendable values to the nonisolated loop.
-        // Safe because we don't touch session/images again until after the loop.
-        let result = await {
-            nonisolated(unsafe) let stream = stream
-            return await Self.runStreamingLoop(
-                connection: connection,
-                stream: stream,
-                requestId: requestId,
-                created: created,
-                modelName: modelName
-            )
-        }()

-        let (promptTokens, completionTokens, fullText, frameworkToolCalls, succeeded) = result
-
-        if promptTokens > 0 {
-            ConversationSessionCache.shared.markGenerating(
-                entryId: cacheEntryId,
-                promptTokens: promptTokens,
-                completionTokens: completionTokens
-            )
-            LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
+        if result.cancelled {
+            connection.cancel()
+            return result
        }

        let resolved = Self.resolveAssistantResponse(
-            fullText: fullText,
-            frameworkToolCalls: frameworkToolCalls,
+            fullText: result.fullText,
+            frameworkToolCalls: result.frameworkToolCalls,
            tools: tools
        )

@@ -562,21 +477,16 @@ final class APIServer {
            model: modelName,
            choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
            usage: APIUsageInfo(
-                prompt_tokens: promptTokens,
-                completion_tokens: completionTokens,
-                total_tokens: promptTokens + completionTokens
+                prompt_tokens: result.promptTokens,
+                completion_tokens: result.completionTokens,
+                total_tokens: result.promptTokens + result.completionTokens
            )
        ))

        // Send [DONE] and close
        await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
        connection.cancel()
-        let assistantHistoryText = Self.normalizedAssistantHistoryContent(
-            content: resolved.content,
-            toolCalls: resolved.toolCalls,
-            isQwen: isQwen
-        )
-        return (promptTokens, completionTokens, assistantHistoryText, succeeded)
+        return result
    }

    /// Run the token generation + SSE send loop entirely off MainActor.
@@ -584,54 +494,20 @@ final class APIServer {
    /// multiple actor hops competing with SwiftUI, causing all output to batch.
    nonisolated private static func runStreamingLoop(
        connection: NWConnection,
-        stream: AsyncThrowingStream<Generation, any Error>,
+        stream: AsyncStream<Generation>,
+        cancellation: CancellationToken, // forwarded to collectGenerationOutcome, which checks it on each stream element
        requestId: String,
-        created: Int,
-        modelName: String
-    ) async -> (Int, Int, String, [MLXLMCommon.ToolCall], Bool) {
-        var promptTokens = 0
-        var completionTokens = 0
-        var fullText = ""
-        var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
-
-        do {
-            for try await generation in stream {
-                switch generation {
-                case .chunk(let text):
-                    completionTokens += 1
-                    fullText += text
-
-                    // Update live counters directly — no MainActor hop needed
-                    LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
-
-                    // Send directly — no MainActor hop.
-                    await sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
-                        id: requestId,
-                        object: "chat.completion.chunk",
-                        created: created,
-                        model: modelName,
-                        choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: text, tool_calls: nil), finish_reason: nil)],
-                        usage: nil
-                    ))
-
-                case .info(let info):
-                    promptTokens = info.promptTokenCount
-                    completionTokens = info.generationTokenCount
-                    if info.tokensPerSecond > 0 {
-                        LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
-                    }
-
-                case .toolCall(let call):
-                    frameworkToolCalls.append(call)
-                }
-            }
-        } catch {
-            let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
-            await sendData(connection: connection, data: errorEvent.data(using: .utf8)!)
-            return (promptTokens, completionTokens, fullText, frameworkToolCalls, false)
+        encoder: StreamingSSEEncoder // produces the bytes sent for each content delta
+    ) async -> GenerationOutcome {
+        var outcome = await collectGenerationOutcome(
+            stream: stream,
+            requestId: requestId,
+            cancellation: cancellation
+        ) { text in // per-chunk callback: send each text chunk to the client as it arrives
+            await sendData(connection: connection, data: encoder.encodeContentDelta(text))
        }
-
-        return (promptTokens, completionTokens, fullText, frameworkToolCalls, true)
+        outcome.succeeded = !outcome.cancelled // same value collectGenerationOutcome already assigned; restated explicitly
+        return outcome
    }

    /// Send an SSE event and wait for the protocol stack to process it.
@@ -651,6 +527,88 @@ final class APIServer {
        }
    }

+    nonisolated private static func collectGenerationOutcome( // Drains `stream` into a GenerationOutcome: text, token counts, tool calls and a cancellation flag.
+        stream: AsyncStream<Generation>,
+        requestId: String, // passed to LiveCounters.prefillCompleted when an .info event arrives
+        cancellation: CancellationToken?, // nil means the loop never checks for cancellation
+        onChunk: ((String) async -> Void)? = nil // awaited for every .chunk text after it has been accumulated
+    ) async -> GenerationOutcome {
+        var promptTokens = 0
+        var completionTokens = 0
+        var fullText = ""
+        var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
+        var cancelled = false
+
+        for await generation in stream {
+            if let cancellation, cancellation.isCancelled { // checked only when the stream yields; the element in hand is discarded
+                cancelled = true
+                break
+            }
+
+            switch generation {
+            case .chunk(let text):
+                completionTokens += 1 // running count of one per chunk; replaced if an .info event arrives
+                fullText += text
+                LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
+                if let onChunk {
+                    await onChunk(text) // the loop does not advance until the callback returns
+                }
+            case .info(let info):
+                promptTokens = info.promptTokenCount
+                completionTokens = info.generationTokenCount // overrides the per-chunk running count
+                LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
+                if info.tokensPerSecond > 0 { // skip the counter update when the reported rate is not positive
+                    LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
+                }
+            case .toolCall(let call):
+                frameworkToolCalls.append(call) // kept in arrival order
+            }
+        }
+
+        return GenerationOutcome(
+            promptTokens: promptTokens,
+            completionTokens: completionTokens,
+            fullText: fullText,
+            frameworkToolCalls: frameworkToolCalls,
+            succeeded: !cancelled, // the stream is non-throwing, so cancellation is the only non-success exit here
+            cancelled: cancelled
+        )
+    }
+
+    private static func storePromptCache( // Trims `cache` via trimGeneratedTokens, then stores it in TokenPrefixCache; does nothing if trimming fails.
+        _ cache: [KVCache],
+        promptTokenCount: Int, // per-layer offset the cache is trimmed back to before storing
+        entryId: UUID,
+        cacheKey: [Int],
+        modelId: String
+    ) {
+        guard trimGeneratedTokens(cache, promptTokenCount: promptTokenCount) else { // false when some layer could not be trimmed to promptTokenCount
+            return // nothing is stored in that case
+        }
+        TokenPrefixCache.shared.store(
+            entryId: entryId,
+            kvCache: cache,
+            cacheKey: cacheKey,
+            modelId: modelId
+        )
+    }
+
+    private static func trimGeneratedTokens(_ cache: [KVCache], promptTokenCount: Int) -> Bool { // Trims each layer whose offset exceeds promptTokenCount; true only if every such layer was trimmed by exactly the excess.
+        for layer in cache {
+            let excess = layer.offset - promptTokenCount // amount by which this layer's offset exceeds the prompt length; <= 0 means nothing to trim
+            guard excess <= 0 || layer.isTrimmable else { // a layer with excess that cannot be trimmed fails the whole operation
+                return false // NOTE(review): layers visited earlier have already had trim(_:) called — presumably mutated in place; confirm callers discard the cache on false
+            }
+            if excess > 0 {
+                let trimmed = layer.trim(excess) // returned amount is compared with the requested amount below
+                guard trimmed == excess else { // a partial trim is treated as failure
+                    return false
+                }
+            }
+        }
+        return true // also reached when `cache` is empty
+    }
+
    // MARK: - HTTP helpers

    private func sendResponse(
@@ -787,6 +745,15 @@ final class APIServer {
    }
 }

+private struct GenerationOutcome { // Result of consuming one generation stream, as built by APIServer.collectGenerationOutcome.
+    var promptTokens: Int // promptTokenCount from the stream's .info event; 0 if none was seen
+    var completionTokens: Int // generationTokenCount from .info, otherwise the number of chunks received
+    var fullText: String // concatenation of all chunk texts in arrival order
+    var frameworkToolCalls: [MLXLMCommon.ToolCall] // .toolCall events in arrival order
+    var succeeded: Bool // collectGenerationOutcome sets this to !cancelled
+    var cancelled: Bool // true when the collection loop stopped because its CancellationToken was cancelled
+}
+
 // MARK: - HTTP request parser

 private struct HTTPRequest {
--- a/MLXServer/Server/ConversationSessionCache.swift
+++ b/MLXServer/Server/ConversationSessionCache.swift
@@ -1,358 +0,0 @@
-import Foundation
-import MLXLMCommon
-import os
-
-enum APISessionPhase: String, Sendable {
-    case idle = "Idle"
-    case sessionBuild = "Session Build"
-    case prefilling = "Prefilling"
-    case generating = "Generating"
-}
-
-/// Bounded cache of API chat sessions keyed by normalized conversation history.
-/// The cache is internal-only and safe to sample from the monitor without involving MainActor.
-final class ConversationSessionCache: @unchecked Sendable {
-    static let shared = ConversationSessionCache()
-
-    private let lock = OSAllocatedUnfairLock()
-
-    private let maxEntries = 8
-    private let maxCachedTokens = 256_000
-    private let idleTTL: TimeInterval = 10 * 60
-
-    private var entries: [UUID: Entry] = [:]
-    private var totals = Totals()
-
-    private init() {}
-
-    struct Lease {
-        let entryId: UUID
-        let session: ChatSession?
-        let reusedPromptTokens: Int
-        let cacheHit: Bool
-    }
-
-    struct SessionSummary: Identifiable, Sendable {
-        let id: UUID
-        let modelId: String
-        let phase: APISessionPhase
-        let messageCount: Int
-        let cachedTokenEstimate: Int
-        let estimatedBytes: Int
-        let inFlightRequests: Int
-        let hitCount: Int
-        let lastPromptTokens: Int
-        let lastCompletionTokens: Int
-        let lastReuseTokens: Int
-        let createdAt: Date
-        let lastAccessAt: Date
-    }
-
-    struct Snapshot: Sendable {
-        let totalEntries: Int
-        let warmEntries: Int
-        let activeEntries: Int
-        let generatingEntries: Int
-        let estimatedBytes: Int
-        let cachedTokenEstimate: Int
-        let totalHits: Int
-        let totalMisses: Int
-        let totalEvictions: Int
-        let totalReusePromptTokens: Int
-        let totalRebuildPromptTokens: Int
-        let sessions: [SessionSummary]
-    }
-
-    func checkoutSession(
-        modelId: String,
-        instructions: String,
-        historySignatures: [UInt64],
-        requestMessageCount: Int,
-        estimatedPromptTokens: Int,
-        estimatedBytes: Int
-    ) -> Lease {
-        lock.lock()
-        let now = Date()
-        pruneExpiredLocked(now: now)
-
-        let instructionsHash = Self.stableHash(instructions)
-        let match = entries
-            .values
-            .filter {
-                $0.modelId == modelId
-                    && $0.instructionsHash == instructionsHash
-                    && $0.session != nil
-                    && $0.inFlightRequests == 0
-                    && Self.historyMatches(cached: $0.requestMessageSignatures, incoming: historySignatures)
-            }
-            .max { lhs, rhs in
-                lhs.requestMessageSignatures.count < rhs.requestMessageSignatures.count
-            }
-
-        if let match {
-            var entry = match
-            entry.inFlightRequests += 1
-            entry.lastAccessAt = now
-            entry.phase = .prefilling
-            entry.lastReuseTokens = max(entry.cachedTokenEstimate, estimatedPromptTokens)
-            entry.hitCount += 1
-            entries[entry.id] = entry
-            totals.totalHits += 1
-            totals.totalReusePromptTokens += entry.lastReuseTokens
-            let lease = Lease(
-                entryId: entry.id,
-                session: entry.session,
-                reusedPromptTokens: entry.lastReuseTokens,
-                cacheHit: true
-            )
-            lock.unlock()
-            return lease
-        }
-
-        let entryId = UUID()
-        entries[entryId] = Entry(
-            id: entryId,
-            modelId: modelId,
-            instructionsHash: instructionsHash,
-            requestMessageSignatures: historySignatures,
-            messageCount: requestMessageCount,
-            cachedTokenEstimate: estimatedPromptTokens,
-            estimatedBytes: estimatedBytes,
-            createdAt: now,
-            lastAccessAt: now,
-            inFlightRequests: 1,
-            hitCount: 0,
-            phase: .sessionBuild,
-            lastPromptTokens: 0,
-            lastCompletionTokens: 0,
-            lastReuseTokens: 0,
-            session: nil
-        )
-        totals.totalMisses += 1
-        totals.totalRebuildPromptTokens += estimatedPromptTokens
-        lock.unlock()
-        return Lease(entryId: entryId, session: nil, reusedPromptTokens: 0, cacheHit: false)
-    }
-
-    func markSessionBuild(entryId: UUID) {
-        updatePhase(entryId: entryId, phase: .sessionBuild)
-    }
-
-    func markPrefilling(entryId: UUID) {
-        updatePhase(entryId: entryId, phase: .prefilling)
-    }
-
-    func markGenerating(entryId: UUID, promptTokens: Int, completionTokens: Int) {
-        lock.lock()
-        if var entry = entries[entryId] {
-            entry.phase = .generating
-            entry.lastPromptTokens = promptTokens
-            entry.lastCompletionTokens = completionTokens
-            entry.cachedTokenEstimate = max(entry.cachedTokenEstimate, promptTokens + completionTokens)
-            entry.lastAccessAt = Date()
-            entries[entryId] = entry
-        }
-        lock.unlock()
-    }
-
-    func completeRequest(
-        entryId: UUID,
-        session: ChatSession,
-        requestMessageSignatures: [UInt64],
-        requestMessageCount: Int,
-        estimatedPromptTokens: Int,
-        estimatedBytes: Int,
-        promptTokens: Int,
-        completionTokens: Int
-    ) {
-        lock.lock()
-        let now = Date()
-        if var entry = entries[entryId] {
-            entry.session = session
-            entry.requestMessageSignatures = requestMessageSignatures
-            entry.messageCount = requestMessageCount
-            entry.cachedTokenEstimate = max(estimatedPromptTokens, promptTokens + completionTokens)
-            entry.estimatedBytes = estimatedBytes
-            entry.lastPromptTokens = promptTokens
-            entry.lastCompletionTokens = completionTokens
-            entry.lastAccessAt = now
-            entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
-            entry.phase = .idle
-            entries[entryId] = entry
-            enforceBudgetLocked(now: now)
-        }
-        lock.unlock()
-    }
-
-    func abandonRequest(entryId: UUID) {
-        lock.lock()
-        if var entry = entries[entryId] {
-            entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
-            if entry.session == nil && entry.inFlightRequests == 0 {
-                entries.removeValue(forKey: entryId)
-            } else {
-                entry.phase = .idle
-                entry.lastAccessAt = Date()
-                entries[entryId] = entry
-            }
-        }
-        lock.unlock()
-    }
-
-    func invalidateAll() {
-        lock.lock()
-        totals.totalEvictions += entries.count
-        entries.removeAll()
-        lock.unlock()
-    }
-
-    func reset() {
-        lock.lock()
-        entries.removeAll()
-        totals = Totals()
-        lock.unlock()
-    }
-
-    func snapshot() -> Snapshot {
-        lock.lock()
-        let now = Date()
-        pruneExpiredLocked(now: now)
-        let allEntries = Array(entries.values)
-        let sessions = allEntries
-            .sorted {
-                if $0.inFlightRequests != $1.inFlightRequests {
-                    return $0.inFlightRequests > $1.inFlightRequests
-                }
-                return $0.lastAccessAt > $1.lastAccessAt
-            }
-            .map {
-                SessionSummary(
-                    id: $0.id,
-                    modelId: $0.modelId,
-                    phase: $0.phase,
-                    messageCount: $0.messageCount,
-                    cachedTokenEstimate: $0.cachedTokenEstimate,
-                    estimatedBytes: $0.estimatedBytes,
-                    inFlightRequests: $0.inFlightRequests,
-                    hitCount: $0.hitCount,
-                    lastPromptTokens: $0.lastPromptTokens,
-                    lastCompletionTokens: $0.lastCompletionTokens,
-                    lastReuseTokens: $0.lastReuseTokens,
-                    createdAt: $0.createdAt,
-                    lastAccessAt: $0.lastAccessAt
-                )
-            }
-        let snapshot = Snapshot(
-            totalEntries: allEntries.count,
-            warmEntries: allEntries.filter { $0.session != nil }.count,
-            activeEntries: allEntries.filter { $0.inFlightRequests > 0 }.count,
-            generatingEntries: allEntries.filter { $0.phase == .generating }.count,
-            estimatedBytes: allEntries.reduce(0) { $0 + $1.estimatedBytes },
-            cachedTokenEstimate: allEntries.reduce(0) { $0 + $1.cachedTokenEstimate },
-            totalHits: totals.totalHits,
-            totalMisses: totals.totalMisses,
-            totalEvictions: totals.totalEvictions,
-            totalReusePromptTokens: totals.totalReusePromptTokens,
-            totalRebuildPromptTokens: totals.totalRebuildPromptTokens,
-            sessions: sessions
-        )
-        lock.unlock()
-        return snapshot
-    }
-
-    private func updatePhase(entryId: UUID, phase: APISessionPhase) {
-        lock.lock()
-        if var entry = entries[entryId] {
-            entry.phase = phase
-            entry.lastAccessAt = Date()
-            entries[entryId] = entry
-        }
-        lock.unlock()
-    }
-
-    private func pruneExpiredLocked(now: Date) {
-        let expired = entries.values.filter {
-            $0.inFlightRequests == 0 && now.timeIntervalSince($0.lastAccessAt) > idleTTL
-        }
-        guard !expired.isEmpty else { return }
-        for entry in expired {
-            entries.removeValue(forKey: entry.id)
-        }
-        totals.totalEvictions += expired.count
-    }
-
-    private func enforceBudgetLocked(now: Date) {
-        pruneExpiredLocked(now: now)
-
-        func totalCachedTokens() -> Int {
-            entries.values.reduce(0) { $0 + $1.cachedTokenEstimate }
-        }
-
-        while entries.count > maxEntries || totalCachedTokens() > maxCachedTokens {
-            guard let victim = entries.values
-                .filter({ $0.inFlightRequests == 0 })
-                .sorted(by: evictionOrder)
-                .first
-            else {
-                break
-            }
-            entries.removeValue(forKey: victim.id)
-            totals.totalEvictions += 1
-        }
-    }
-
-    private func evictionOrder(lhs: Entry, rhs: Entry) -> Bool {
-        if lhs.lastAccessAt != rhs.lastAccessAt {
-            return lhs.lastAccessAt < rhs.lastAccessAt
-        }
-        if lhs.cachedTokenEstimate != rhs.cachedTokenEstimate {
-            return lhs.cachedTokenEstimate > rhs.cachedTokenEstimate
-        }
-        return lhs.createdAt < rhs.createdAt
-    }
-
-    private static func historyMatches(cached: [UInt64], incoming: [UInt64]) -> Bool {
-        guard cached.count <= incoming.count,
-              incoming.count <= cached.count + 1 else { return false }
-        for (lhs, rhs) in zip(cached, incoming) where lhs != rhs {
-            return false
-        }
-        return true
-    }
-
-    static func stableHash(_ text: String) -> UInt64 {
-        var hash: UInt64 = 14_695_981_039_346_656_037
-        for byte in text.utf8 {
-            hash ^= UInt64(byte)
-            hash &*= 1_099_511_628_211
-        }
-        return hash
-    }
-
-    private struct Entry {
-        let id: UUID
-        let modelId: String
-        let instructionsHash: UInt64
-        var requestMessageSignatures: [UInt64]
-        var messageCount: Int
-        var cachedTokenEstimate: Int
-        var estimatedBytes: Int
-        let createdAt: Date
-        var lastAccessAt: Date
-        var inFlightRequests: Int
-        var hitCount: Int
-        var phase: APISessionPhase
-        var lastPromptTokens: Int
-        var lastCompletionTokens: Int
-        var lastReuseTokens: Int
-        var session: ChatSession?
-    }
-
-    private struct Totals {
-        var totalHits: Int = 0
-        var totalMisses: Int = 0
-        var totalEvictions: Int = 0
-        var totalReusePromptTokens: Int = 0
-        var totalRebuildPromptTokens: Int = 0
-    }
-}
--- a/MLXServer/Views/MonitorView.swift
+++ b/MLXServer/Views/MonitorView.swift
--- a/MLXServerTests/Server/APIServerRewriteTests.swift
+++ b/MLXServerTests/Server/APIServerRewriteTests.swift
@@ -0,0 +1,736 @@
+import Foundation
+import XCTest
+@testable import MLX_Server
+
+final class APIServerRewriteTests: XCTestCase {
+    /// Sends the same non-streaming request twice and expects the repeat to register a prefix-cache hit.
+    func testNonStreamingChatCompletionUsesStatelessServerPathAndCachesPrompt() async throws {
+        let harness = try await makeHarness()
+        defer { harness.stop() }
+        let request = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("Reply with exactly one short word."), name: nil, tool_calls: nil, tool_call_id: nil)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 1,
+            stream: false,
+            stop: nil,
+            tools: nil,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let firstResponse = try await sendChatCompletion(request, port: harness.port)
+        XCTAssertEqual(firstResponse.choices.count, 1)
+        XCTAssertEqual(firstResponse.choices[0].message.role, "assistant")
+        XCTAssertGreaterThan(firstResponse.usage.prompt_tokens, 0)
+        XCTAssertGreaterThanOrEqual(firstResponse.usage.completion_tokens, 0)
+        // Polled rather than read once, presumably because the cache entry can land after the response returns.
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalEntries > 0
+        }
+        let firstSnapshot = TokenPrefixCache.shared.snapshot()
+        let firstLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(firstSnapshot.totalEntries, 0)
+        // Replay the identical request; hit count and reused prompt tokens must both increase.
+        _ = try await sendChatCompletion(request, port: harness.port)
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
+        }
+        let secondSnapshot = TokenPrefixCache.shared.snapshot()
+        let secondLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
+        XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)
+    }
+
+    /// Streams three turns, each replaying the earlier turns verbatim, and expects a cache hit on turns two and three.
+    func testStreamingChatCompletionReusesCacheAcrossThreeProgressivelyLongerTurns() async throws {
+        let harness = try await makeHarness()
+        defer { harness.stop() }
+        let firstRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("Answer in one word: what color is the sky on a clear day?"), name: nil, tool_calls: nil, tool_call_id: nil)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 3,
+            stream: true,
+            stop: nil,
+            tools: nil,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let firstStream = try await sendStreamingChatCompletion(firstRequest, port: harness.port)
+        XCTAssertEqual(firstStream.roleDeltaCount, 1)
+        XCTAssertTrue(firstStream.sawDone)
+        XCTAssertEqual(firstStream.finalFinishReason, "stop")
+        XCTAssertGreaterThan(firstStream.usage?.prompt_tokens ?? 0, 0)
+        XCTAssertFalse(firstStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalEntries > 0
+        }
+        let firstSnapshot = TokenPrefixCache.shared.snapshot()
+        let firstLiveSnapshot = LiveCounters.shared.snapshot()
+        // Turn two: the first user message, the model's own streamed reply, then a new user question.
+        let secondRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("Answer in one word: what color is the sky on a clear day?"), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "assistant", content: .text(firstStream.content), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "user", content: .text("Answer in one word: what color is grass?"), name: nil, tool_calls: nil, tool_call_id: nil)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 3,
+            stream: true,
+            stop: nil,
+            tools: nil,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let secondStream = try await sendStreamingChatCompletion(secondRequest, port: harness.port)
+        XCTAssertEqual(secondStream.roleDeltaCount, 1)
+        XCTAssertTrue(secondStream.sawDone)
+        XCTAssertEqual(secondStream.finalFinishReason, "stop")
+        XCTAssertFalse(secondStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
+        }
+        let secondSnapshot = TokenPrefixCache.shared.snapshot()
+        let secondLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
+        XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)
+        // Turn three: everything from turn two plus its streamed reply and one more user question.
+        let thirdRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("Answer in one word: what color is the sky on a clear day?"), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "assistant", content: .text(firstStream.content), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "user", content: .text("Answer in one word: what color is grass?"), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "assistant", content: .text(secondStream.content), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "user", content: .text("Answer in one word: what color is snow?"), name: nil, tool_calls: nil, tool_call_id: nil)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 3,
+            stream: true,
+            stop: nil,
+            tools: nil,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let thirdStream = try await sendStreamingChatCompletion(thirdRequest, port: harness.port)
+        XCTAssertEqual(thirdStream.roleDeltaCount, 1)
+        XCTAssertTrue(thirdStream.sawDone)
+        XCTAssertEqual(thirdStream.finalFinishReason, "stop")
+        XCTAssertFalse(thirdStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalHits > secondSnapshot.totalHits
+        }
+        let thirdSnapshot = TokenPrefixCache.shared.snapshot()
+        let thirdLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(thirdSnapshot.totalHits, secondSnapshot.totalHits)
+        XCTAssertGreaterThan(thirdLiveSnapshot.totalCacheReusePromptTokens, secondLiveSnapshot.totalCacheReusePromptTokens)
+    }
+
+    /// Streams a tool call, its tool result and a follow-up turn, expecting a cache hit on each later request.
+    func testStreamingChatCompletionReusesCacheAcrossToolBoundary() async throws {
+        let harness = try await makeHarness()
+        defer { harness.stop() }
+        let tools = [mockWeatherTool]
+        let firstRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("You must call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 48,
+            stream: true,
+            stop: nil,
+            tools: tools,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let toolCallStream = try await sendStreamingChatCompletion(firstRequest, port: harness.port)
+        XCTAssertEqual(toolCallStream.roleDeltaCount, 1)
+        XCTAssertTrue(toolCallStream.sawDone)
+        XCTAssertEqual(toolCallStream.finalFinishReason, "tool_calls")
+        let toolCall = try XCTUnwrap(toolCallStream.toolCalls.first)
+        XCTAssertEqual(toolCall.function.name, "weather")
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalEntries > 0
+        }
+        let afterToolCallSnapshot = TokenPrefixCache.shared.snapshot()
+        let afterToolCallLiveSnapshot = LiveCounters.shared.snapshot()
+        // Feed the model's own tool call back together with a canned tool result; the reply must be plain text.
+        let secondRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("You must call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [toolCall], tool_call_id: nil),
+                APIChatMessage(role: "tool", content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"), name: nil, tool_calls: nil, tool_call_id: toolCall.id)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 16,
+            stream: true,
+            stop: nil,
+            tools: tools,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let directAnswerStream = try await sendStreamingChatCompletion(secondRequest, port: harness.port)
+        XCTAssertEqual(directAnswerStream.roleDeltaCount, 1)
+        XCTAssertTrue(directAnswerStream.sawDone)
+        XCTAssertEqual(directAnswerStream.finalFinishReason, "stop")
+        XCTAssertTrue(directAnswerStream.toolCalls.isEmpty)
+        XCTAssertFalse(directAnswerStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalHits > afterToolCallSnapshot.totalHits
+        }
+        let afterDirectAnswerSnapshot = TokenPrefixCache.shared.snapshot()
+        let afterDirectAnswerLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(afterDirectAnswerSnapshot.totalHits, afterToolCallSnapshot.totalHits)
+        XCTAssertGreaterThan(afterDirectAnswerLiveSnapshot.totalCacheReusePromptTokens, afterToolCallLiveSnapshot.totalCacheReusePromptTokens)
+        // Extend the same history past the tool exchange with the text answer and one more user turn.
+        let thirdRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("You must call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [toolCall], tool_call_id: nil),
+                APIChatMessage(role: "tool", content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"), name: nil, tool_calls: nil, tool_call_id: toolCall.id),
+                APIChatMessage(role: "assistant", content: .text(directAnswerStream.content), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "user", content: .text("Now compress that answer to two words."), name: nil, tool_calls: nil, tool_call_id: nil)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 8,
+            stream: true,
+            stop: nil,
+            tools: tools,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let thirdStream = try await sendStreamingChatCompletion(thirdRequest, port: harness.port)
+        XCTAssertEqual(thirdStream.roleDeltaCount, 1)
+        XCTAssertTrue(thirdStream.sawDone)
+        XCTAssertEqual(thirdStream.finalFinishReason, "stop")
+        XCTAssertFalse(thirdStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalHits > afterDirectAnswerSnapshot.totalHits
+        }
+        let finalSnapshot = TokenPrefixCache.shared.snapshot()
+        let finalLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(finalSnapshot.totalHits, afterDirectAnswerSnapshot.totalHits)
+        XCTAssertGreaterThan(finalLiveSnapshot.totalCacheReusePromptTokens, afterDirectAnswerLiveSnapshot.totalCacheReusePromptTokens)
+    }
+
+    /// Runs two tool-call/answer rounds (Berlin, then Paris) in one growing history, expecting a cache hit after each step.
+    func testStreamingChatCompletionReusesCacheAcrossMultipleToolTurns() async throws {
+        let harness = try await makeHarness()
+        defer { harness.stop() }
+        let tools = [mockWeatherTool]
+        let berlinRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("Call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 48,
+            stream: true,
+            stop: nil,
+            tools: tools,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let firstToolTurn = try await sendStreamingChatCompletion(berlinRequest, port: harness.port)
+        XCTAssertEqual(firstToolTurn.finalFinishReason, "tool_calls")
+        let berlinToolCall = try XCTUnwrap(firstToolTurn.toolCalls.first)
+        XCTAssertEqual(berlinToolCall.function.name, "weather")
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalEntries > 0
+        }
+        let firstSnapshot = TokenPrefixCache.shared.snapshot()
+        let firstLiveSnapshot = LiveCounters.shared.snapshot()
+        // Canned tool result, reused unchanged in every later request so the replayed prefix stays identical.
+        let berlinToolResult = APIChatMessage(
+            role: "tool",
+            content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"),
+            name: nil,
+            tool_calls: nil,
+            tool_call_id: berlinToolCall.id
+        )
+
+        let berlinAnswerRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("Call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [berlinToolCall], tool_call_id: nil),
+                berlinToolResult
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 16,
+            stream: true,
+            stop: nil,
+            tools: tools,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let berlinAnswer = try await sendStreamingChatCompletion(berlinAnswerRequest, port: harness.port)
+        XCTAssertEqual(berlinAnswer.finalFinishReason, "stop")
+        XCTAssertFalse(berlinAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
+        }
+        let secondSnapshot = TokenPrefixCache.shared.snapshot()
+        let secondLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
+        XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)
+        // Second round: append the Berlin answer and ask for a Paris tool call on top of the same history.
+        let parisToolTurnRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("Call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [berlinToolCall], tool_call_id: nil),
+                berlinToolResult,
+                APIChatMessage(role: "assistant", content: .text(berlinAnswer.content), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "user", content: .text("Now call the weather tool for Paris. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 48,
+            stream: true,
+            stop: nil,
+            tools: tools,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let secondToolTurn = try await sendStreamingChatCompletion(parisToolTurnRequest, port: harness.port)
+        XCTAssertEqual(secondToolTurn.finalFinishReason, "tool_calls")
+        let parisToolCall = try XCTUnwrap(secondToolTurn.toolCalls.first)
+        XCTAssertEqual(parisToolCall.function.name, "weather")
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalHits > secondSnapshot.totalHits
+        }
+        let thirdSnapshot = TokenPrefixCache.shared.snapshot()
+        let thirdLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(thirdSnapshot.totalHits, secondSnapshot.totalHits)
+        XCTAssertGreaterThan(thirdLiveSnapshot.totalCacheReusePromptTokens, secondLiveSnapshot.totalCacheReusePromptTokens)
+        // Final step: return the Paris tool result and expect a plain-text answer with no further tool calls.
+        let parisAnswerRequest = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("Call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [berlinToolCall], tool_call_id: nil),
+                berlinToolResult,
+                APIChatMessage(role: "assistant", content: .text(berlinAnswer.content), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "user", content: .text("Now call the weather tool for Paris. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil),
+                APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [parisToolCall], tool_call_id: nil),
+                APIChatMessage(role: "tool", content: .text("{\"city\":\"Paris\",\"temperature_c\":21,\"condition\":\"clear\"}"), name: nil, tool_calls: nil, tool_call_id: parisToolCall.id)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 16,
+            stream: true,
+            stop: nil,
+            tools: tools,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let parisAnswer = try await sendStreamingChatCompletion(parisAnswerRequest, port: harness.port)
+        XCTAssertEqual(parisAnswer.finalFinishReason, "stop")
+        XCTAssertTrue(parisAnswer.toolCalls.isEmpty)
+        XCTAssertFalse(parisAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalHits > thirdSnapshot.totalHits
+        }
+        let fourthSnapshot = TokenPrefixCache.shared.snapshot()
+        let fourthLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(fourthSnapshot.totalHits, thirdSnapshot.totalHits)
+        XCTAssertGreaterThan(fourthLiveSnapshot.totalCacheReusePromptTokens, thirdLiveSnapshot.totalCacheReusePromptTokens)
+    }
+
+    /// Cancels a stream after its first content delta and expects the prompt to be cached and reused by a later request.
+    func testStreamingDisconnectStoresPromptCacheForReuse() async throws {
+        let harness = try await makeHarness()
+        defer { harness.stop() }
+        let request = APIChatCompletionRequest(
+            model: "gemma",
+            messages: [
+                APIChatMessage(role: "user", content: .text("Count from one to twenty with commas, using many tokens."), name: nil, tool_calls: nil, tool_call_id: nil)
+            ],
+            temperature: 0,
+            top_p: 1,
+            max_tokens: 64,
+            stream: true,
+            stop: nil,
+            tools: nil,
+            tool_choice: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            n: nil
+        )
+
+        let initialSnapshot = TokenPrefixCache.shared.snapshot()
+        try await cancelStreamingChatCompletionAfterFirstContent(request, port: harness.port)
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalEntries > initialSnapshot.totalEntries
+        }
+        let afterDisconnectSnapshot = TokenPrefixCache.shared.snapshot()
+        let afterDisconnectLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(afterDisconnectSnapshot.totalEntries, initialSnapshot.totalEntries)
+        // Same messages again, now non-streaming with max_tokens 8; this request must produce the cache hit.
+        _ = try await sendChatCompletion(
+            APIChatCompletionRequest(
+                model: request.model,
+                messages: request.messages,
+                temperature: request.temperature,
+                top_p: request.top_p,
+                max_tokens: 8,
+                stream: false,
+                stop: request.stop,
+                tools: request.tools,
+                tool_choice: request.tool_choice,
+                frequency_penalty: request.frequency_penalty,
+                presence_penalty: request.presence_penalty,
+                n: request.n
+            ),
+            port: harness.port
+        )
+
+        try await waitUntil(timeoutSeconds: 5) {
+            TokenPrefixCache.shared.snapshot().totalHits > afterDisconnectSnapshot.totalHits
+        }
+        let finalSnapshot = TokenPrefixCache.shared.snapshot()
+        let finalLiveSnapshot = LiveCounters.shared.snapshot()
+        XCTAssertGreaterThan(finalSnapshot.totalHits, afterDisconnectSnapshot.totalHits)
+        XCTAssertGreaterThan(finalLiveSnapshot.totalCacheReusePromptTokens, afterDisconnectLiveSnapshot.totalCacheReusePromptTokens)
+    }
+
+    /// Checks event order for a streamed tool call: one role delta first, tool-call deltas next, finish chunk last.
+    func testStreamingToolCallChunksArriveInOpenAICompatibleOrder() async throws {
+        let harness = try await makeHarness()
+        defer { harness.stop() }
+        let detailed = try await sendStreamingChatCompletionDetailed(
+            APIChatCompletionRequest(
+                model: "gemma",
+                messages: [
+                    APIChatMessage(role: "user", content: .text("Call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
+                ],
+                temperature: 0,
+                top_p: 1,
+                max_tokens: 48,
+                stream: true,
+                stop: nil,
+                tools: [mockWeatherTool],
+                tool_choice: nil,
+                frequency_penalty: nil,
+                presence_penalty: nil,
+                n: nil
+            ),
+            port: harness.port
+        )
+
+        XCTAssertTrue(detailed.sawDone)
+        XCTAssertFalse(detailed.events.isEmpty)
+
+        let firstEvent = try XCTUnwrap(detailed.events.first)
+        XCTAssertEqual(firstEvent.kind, .role)
+        XCTAssertEqual(firstEvent.role, "assistant")
+
+        let toolEventIndices = detailed.events.enumerated().compactMap { index, event in
+            event.kind == .toolCall ? index : nil
+        }
+        XCTAssertFalse(toolEventIndices.isEmpty)
+        // The finish event must be the very last recorded event, after every tool-call delta.
+        let finalIndex = try XCTUnwrap(detailed.events.lastIndex(where: { $0.kind == .final }))
+        XCTAssertEqual(finalIndex, detailed.events.count - 1)
+
+        for toolIndex in toolEventIndices {
+            XCTAssertLessThan(toolIndex, finalIndex)
+        }
+
+        let finalEvent = detailed.events[finalIndex]
+        XCTAssertEqual(finalEvent.finishReason, "tool_calls")
+        XCTAssertNotNil(finalEvent.usage)
+        // Exactly one role delta is expected in the whole stream.
+        let roleEventCount = detailed.events.filter { $0.kind == .role }.count
+        XCTAssertEqual(roleEventCount, 1)
+    }
+
+    /// Function-tool definition shared by the tool-calling tests: a `weather` function whose
+    /// JSON-schema parameters declare one property, `city` (a string), and mark it as required.
+    /// Built fresh on every access.
+    private var mockWeatherTool: APIToolDefinition {
+        APIToolDefinition(
+            type: "function",
+            function: APIFunctionDefinition(
+                name: "weather",
+                description: "Look up weather for a city.",
+                parameters: [
+                    "type": AnyCodable("object"),
+                    "properties": AnyCodable([
+                        "city": ["type": "string", "description": "City name"]
+                    ]),
+                    "required": AnyCodable(["city"])
+                ]
+            )
+        )
+    }
+
+    /// Resets the shared counters and cache, loads the "gemma" model, then starts an `APIServer` on a random port.
+    private func makeHarness() async throws -> TestHarness {
+        let modelManager = await MainActor.run { ModelManager() }
+        let config = try XCTUnwrap(ModelConfig.resolve("gemma"))
+        // Shared singletons are reset before the model loads, presumably clearing state left by earlier tests.
+        LiveCounters.shared.reset()
+        TokenPrefixCache.shared.reset()
+        await modelManager.loadModel(config)
+        let isReady = await MainActor.run { modelManager.isReady }
+        XCTAssertTrue(isReady)
+        // NOTE(review): the port is picked at random in 20_000...40_000 with no availability check visible here.
+        let server = await MainActor.run { APIServer() }
+        let port = UInt16.random(in: 20_000...40_000)
+        await MainActor.run {
+            server.start(modelManager: modelManager, port: Int(port))
+        }
+        try await waitUntil(timeoutSeconds: 5) {
+            await MainActor.run { server.isRunning }
+        }
+
+        return TestHarness(server: server, modelManager: modelManager, port: port)
+    }
+
+    /// POSTs `request` as JSON to the local `/v1/chat/completions` endpoint and decodes the reply.
+    /// A non-200 status is recorded as a test failure whose message is the raw response body.
+    private func sendChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> APIChatCompletionResponse {
+        var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
+        post.httpMethod = "POST"
+        post.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        post.httpBody = try JSONEncoder().encode(request)
+        let (payload, rawResponse) = try await URLSession.shared.data(for: post)
+        let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
+        XCTAssertEqual(http.statusCode, 200, String(data: payload, encoding: .utf8) ?? "")
+        return try JSONDecoder().decode(APIChatCompletionResponse.self, from: payload)
+    }
+
+    /// Runs a streaming completion and folds its recorded events into a single `StreamingResult`.
+    private func sendStreamingChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> StreamingResult {
+        let stream = try await sendStreamingChatCompletionDetailed(request, port: port)
+        let recorded = stream.events
+        let closing = recorded.last(where: { $0.kind == .final })  // supplies both finish reason and usage
+        let roleDeltas = recorded.filter { $0.kind == .role }.count
+        let text = recorded.compactMap { $0.content }.joined()
+        let calls = recorded.flatMap { $0.toolCalls }
+        return StreamingResult(roleDeltaCount: roleDeltas, content: text, toolCalls: calls,
+                               finalFinishReason: closing?.finishReason, usage: closing?.usage, sawDone: stream.sawDone)
+    }
+
+    /// Streams a chat completion over SSE and records each delta of the first choice as a `StreamingEvent`.
+    /// A non-200 status records a test failure carrying the response body and returns no events.
+    private func sendStreamingChatCompletionDetailed(_ request: APIChatCompletionRequest, port: UInt16) async throws -> DetailedStreamingResult {
+        var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
+        post.httpMethod = "POST"
+        post.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        post.httpBody = try JSONEncoder().encode(request)
+        let (stream, rawResponse) = try await URLSession.shared.bytes(for: post)
+        let status = try XCTUnwrap(rawResponse as? HTTPURLResponse).statusCode
+        if status != 200 {
+            var errorBody = ""
+            for try await line in stream.lines {
+                errorBody += line
+            }
+            XCTFail("Expected 200 response, got \(status): \(errorBody)")
+            return DetailedStreamingResult(events: [], sawDone: false)
+        }
+
+        var recorded: [StreamingEvent] = []
+        var reachedDone = false
+        let decoder = JSONDecoder()
+        for try await line in stream.lines {
+            // Only lines with the SSE "data: " prefix carry a payload; everything else is skipped.
+            guard line.hasPrefix("data: ") else { continue }
+            let payload = String(line.dropFirst(6))
+            if payload == "[DONE]" {
+                reachedDone = true
+                break
+            }
+            guard let json = payload.data(using: .utf8) else { continue }
+            let chunk = try decoder.decode(APIChatCompletionChunk.self, from: json)
+            // One chunk can yield several events; they are appended as role, content, tool calls, final.
+            let choice = chunk.choices.first
+            if let role = choice?.delta.role, role == "assistant" {
+                recorded.append(StreamingEvent(kind: .role, role: role, content: nil, toolCalls: [], finishReason: nil, usage: nil))
+            }
+            if let text = choice?.delta.content {
+                recorded.append(StreamingEvent(kind: .content, role: nil, content: text, toolCalls: [], finishReason: nil, usage: nil))
+            }
+            if let calls = choice?.delta.tool_calls {
+                recorded.append(StreamingEvent(kind: .toolCall, role: nil, content: nil, toolCalls: calls, finishReason: nil, usage: nil))
+            }
+            if let reason = choice?.finish_reason {
+                recorded.append(StreamingEvent(kind: .final, role: nil, content: nil, toolCalls: [], finishReason: reason, usage: chunk.usage))
+            }
+        }
+        return DetailedStreamingResult(events: recorded, sawDone: reachedDone)
+    }
+
+    /// Starts a streaming request, waits for the first non-empty content delta, then cancels the connection.
+    private func cancelStreamingChatCompletionAfterFirstContent(_ request: APIChatCompletionRequest, port: UInt16) async throws {
+        let url = URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!
+        var urlRequest = URLRequest(url: url)
+        urlRequest.httpMethod = "POST"
+        urlRequest.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        urlRequest.httpBody = try JSONEncoder().encode(request)
+        let observer = StreamCancellationObserver()
+        let session = URLSession(configuration: .ephemeral)
+        let task = Task {
+            let (bytes, response) = try await session.bytes(for: urlRequest)
+            let httpResponse = try XCTUnwrap(response as? HTTPURLResponse)
+            XCTAssertEqual(httpResponse.statusCode, 200)
+            // Reader loop: after signalling the first content delta it sleeps (up to 30 s, until cancelled) instead of reading on.
+            for try await line in bytes.lines {
+                guard line.hasPrefix("data: ") else { continue }
+                let payload = String(line.dropFirst(6))
+                if payload == "[DONE]" {
+                    break
+                }
+                guard let data = payload.data(using: .utf8) else { continue }
+                let chunk = try JSONDecoder().decode(APIChatCompletionChunk.self, from: data)
+                if let deltaContent = chunk.choices.first?.delta.content, !deltaContent.isEmpty {
+                    await observer.markFirstContentSeen()
+                    try await Task.sleep(nanoseconds: 30_000_000_000)
+                }
+            }
+        }
+        // Wait up to 10 s for the reader to report content; the teardown below runs even if that wait fails.
+        try await waitUntil(timeoutSeconds: 10) {
+            await observer.hasSeenFirstContent
+        }
+
+        session.invalidateAndCancel()
+        task.cancel()
+        _ = try? await task.value
+    }
+
+    /// Polls `condition` every `intervalNanoseconds` until it returns true or `timeoutSeconds` have elapsed.
+    /// The condition is evaluated before the deadline is tested, so it runs at least once and once more after
+    /// the final sleep. On timeout an XCTest failure is recorded; only `Task.sleep` cancellation is thrown.
+    private func waitUntil(
+        timeoutSeconds: TimeInterval,
+        intervalNanoseconds: UInt64 = 100_000_000,
+        condition: @escaping () async -> Bool
+    ) async throws {
+        let deadline = Date().addingTimeInterval(timeoutSeconds)
+        while !(await condition()) {
+            guard Date() < deadline else { return XCTFail("Condition not met before timeout") }
+            try await Task.sleep(nanoseconds: intervalNanoseconds)
+        }
+    }
+}
+
+/// Actor-isolated latch recording that the streaming reader has received its first content delta.
+/// The reader task sets it; the cancelling side polls it.
+private actor StreamCancellationObserver {
+    /// False until `markFirstContentSeen()` is called; nothing in this type resets it.
+    private(set) var hasSeenFirstContent = false
+
+    /// Sets the latch. Further calls leave it set.
+    func markFirstContentSeen() {
+        hasSeenFirstContent = true
+    }
+}
+
+private struct DetailedStreamingResult {  // Raw outcome of one streamed request.
+    let events: [StreamingEvent]  // Parsed events in the order they were appended.
+    let sawDone: Bool  // True when a "[DONE]" payload ended the stream.
+}
+
+/// One parsed piece of a streamed chat-completion chunk; `kind` tells which of the payload fields applies.
+private struct StreamingEvent {
+    /// Which delta field (or the finish marker) produced the event.
+    enum Kind {
+        case role, content, toolCall, final
+    }
+
+    let kind: Kind
+    let role: String?
+    let content: String?
+    let toolCalls: [APIToolCall]
+    let finishReason: String?
+    /// Usage taken from the chunk that carried the finish reason, when that chunk has one.
+    let usage: APIUsageInfo?
+}
+
+private struct StreamingResult {  // Flattened summary of one streamed request.
+    let roleDeltaCount: Int  // Number of `.role` events.
+    let content: String  // All content deltas joined in order.
+    let toolCalls: [APIToolCall]  // Tool calls from every event, flattened.
+    let finalFinishReason: String?  // Finish reason of the last `.final` event, if any.
+    let usage: APIUsageInfo?  // Usage of the last `.final` event, if any.
+    let sawDone: Bool  // True when "[DONE]" was received.
+}
+
+private struct TestHarness {  // Server, model manager and port created for one test.
+    let server: APIServer
+    let modelManager: ModelManager
+    let port: UInt16
+    /// Fire-and-forget teardown: server stop and model unload are only scheduled on the main actor, not awaited.
+    func stop() {
+        Task { @MainActor in
+            server.stop()
+            modelManager.unloadModel()
+        }
+        TokenPrefixCache.shared.reset()  // Runs immediately, possibly before the task above executes.
+    }
+}
--- a/docs/session-cache-upgrade.md
+++ b/docs/session-cache-upgrade.md
@@ -2572,14 +2572,18 @@ Validation note: `PromptBuilder.swift` is now covered by both shaping-parity uni

 ### Phase 3: Integration

-7. **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
+7. [x] **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
 8. **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested.

+Validation note: `APIServer.swift` now routes the API path through `PromptBuilder`, `InferenceEngine`, `TokenPrefixCache`, and `StreamingSSEEncoder`, and the full repository test workflow is green. Image-bearing requests intentionally bypass prefix-cache reuse until image fingerprinting is implemented.
+
 ### Phase 4: Statistics & Monitoring

 9. **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer.
-10. **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
-11. **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
+10. [x] **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
+11. [x] **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
+
+Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly and `MonitorView.swift` has been rebuilt around current system state and prefix-cache visibility rather than session-era charts. The dashboard now exposes cache match quality from matched-vs-rebuilt prompt token counters, but it still does not expose TTFT, cache match depth, or vision timing because those `LiveCounters` signals have not been implemented yet.

 ### Phase 5: Advanced Cache Matching