feat: better hardening with unit tests and end-to-end tests

This commit is contained in:
2026-03-20 10:27:39 +01:00
parent e40a2f3c45
commit aadcc308a5
7 changed files with 1395 additions and 1326 deletions

View File

@@ -46,6 +46,7 @@
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */; }; C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */; };
C34F02550C584BB2547F0F6C /* ChatDocumentPackage.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */; }; C34F02550C584BB2547F0F6C /* ChatDocumentPackage.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */; };
CBA88529F8BE7BD0518994AD /* SceneSelectionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5B5ABDEB6F5C54856EB1A9E /* SceneSelectionView.swift */; }; CBA88529F8BE7BD0518994AD /* SceneSelectionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5B5ABDEB6F5C54856EB1A9E /* SceneSelectionView.swift */; };
CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */; };
CFEE79815DFB80E51FE3745A /* SceneStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = C234359924C542F07ED926A2 /* SceneStore.swift */; }; CFEE79815DFB80E51FE3745A /* SceneStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = C234359924C542F07ED926A2 /* SceneStore.swift */; };
D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; }; D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; }; D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
@@ -53,7 +54,6 @@
E199D0BB09B61AC128AB093A /* CancellationToken.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3489501F2F8E1BA382347CFA /* CancellationToken.swift */; }; E199D0BB09B61AC128AB093A /* CancellationToken.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3489501F2F8E1BA382347CFA /* CancellationToken.swift */; };
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */; }; E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */; };
EC4FC68608DDFA6A3DF133CC /* InferenceEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */; }; EC4FC68608DDFA6A3DF133CC /* InferenceEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */; };
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */; };
F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; }; F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; }; FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; }; FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
@@ -114,6 +114,7 @@
DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; }; DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; };
E1E62624B6F285479CB33041 /* PromptBuilder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PromptBuilder.swift; sourceTree = "<group>"; }; E1E62624B6F285479CB33041 /* PromptBuilder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PromptBuilder.swift; sourceTree = "<group>"; };
E35452B166893B25E765FF70 /* InferenceStats.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceStats.swift; sourceTree = "<group>"; }; E35452B166893B25E765FF70 /* InferenceStats.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceStats.swift; sourceTree = "<group>"; };
E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServerRewriteTests.swift; sourceTree = "<group>"; };
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoderTests.swift; sourceTree = "<group>"; }; E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoderTests.swift; sourceTree = "<group>"; };
E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; }; E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; };
E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; }; E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
@@ -121,7 +122,6 @@
F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; }; F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
F4CE2D594F7433C76169151A /* MLXServerTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = MLXServerTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; F4CE2D594F7433C76169151A /* MLXServerTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = MLXServerTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CancellationTokenTests.swift; sourceTree = "<group>"; }; FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CancellationTokenTests.swift; sourceTree = "<group>"; };
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConversationSessionCache.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */ /* Begin PBXFrameworksBuildPhase section */
@@ -172,6 +172,7 @@
154AF0C071A7DC02EB5F6F49 /* Server */ = { 154AF0C071A7DC02EB5F6F49 /* Server */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */,
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */, FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */,
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */, E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */,
D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */, D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */,
@@ -263,7 +264,6 @@
F1A52E2C9964ADA9D841A89B /* APIModels.swift */, F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
3D08828E16B17EF02C14243E /* APIServer.swift */, 3D08828E16B17EF02C14243E /* APIServer.swift */,
3489501F2F8E1BA382347CFA /* CancellationToken.swift */, 3489501F2F8E1BA382347CFA /* CancellationToken.swift */,
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */,
7C1A89C076E717F87A60397D /* ImageDecoder.swift */, 7C1A89C076E717F87A60397D /* ImageDecoder.swift */,
02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */, 02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */,
E1E62624B6F285479CB33041 /* PromptBuilder.swift */, E1E62624B6F285479CB33041 /* PromptBuilder.swift */,
@@ -379,6 +379,7 @@
isa = PBXSourcesBuildPhase; isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */,
962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */, 962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */,
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */, E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */,
8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */, 8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */,
@@ -406,7 +407,6 @@
85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */, 85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */, B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
5946258F1DE88CE904584E0B /* ContentView.swift in Sources */, 5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */,
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */, C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */, 4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
A146BBA70CFBEC505BDCDF0D /* ImageDecoder.swift in Sources */, A146BBA70CFBEC505BDCDF0D /* ImageDecoder.swift in Sources */,

View File

@@ -24,11 +24,15 @@ final class LiveCounters: @unchecked Sendable {
private var _isGenerating: Bool = false private var _isGenerating: Bool = false
private var _contextMax: Int = 0 private var _contextMax: Int = 0
private var _currentPhaseElapsed: TimeInterval = 0 private var _currentPhaseElapsed: TimeInterval = 0
private var _currentCacheMatchedPromptTokens: Int = 0
private var _currentCacheRebuiltPromptTokens: Int = 0
// Cumulative // Cumulative
private var _totalRequests: Int = 0 private var _totalRequests: Int = 0
private var _totalPromptTokens: Int = 0 private var _totalPromptTokens: Int = 0
private var _totalGenerationTokens: Int = 0 private var _totalGenerationTokens: Int = 0
private var _totalCacheReusePromptTokens: Int = 0
private var _totalCacheRebuildPromptTokens: Int = 0
private var _totalPreparingDuration: TimeInterval = 0 private var _totalPreparingDuration: TimeInterval = 0
private var _totalSessionBuildDuration: TimeInterval = 0 private var _totalSessionBuildDuration: TimeInterval = 0
private var _totalPrefillDuration: TimeInterval = 0 private var _totalPrefillDuration: TimeInterval = 0
@@ -90,6 +94,26 @@ final class LiveCounters: @unchecked Sendable {
lock.unlock() lock.unlock()
} }
/// Records how many prompt tokens of a tracked request were matched versus rebuilt.
///
/// Does nothing when `requestId` has no entry in `requestPhases`. Otherwise the
/// clamped counts are stored on that request's state, added to the cumulative
/// totals, and the current-request aggregates are recomputed.
///
/// - Parameters:
///   - requestId: Key of the request in `requestPhases`.
///   - matchedPromptTokens: Matched prompt-token count; negative values are treated as 0.
///   - promptTokenCount: Prompt-token count; the rebuilt count is this value minus
///     the (clamped) matched count, floored at 0.
func recordPrefillReuse(requestId: String, matchedPromptTokens: Int, promptTokenCount: Int) {
    lock.lock()
    // Released on every exit path, including the early return below.
    defer { lock.unlock() }

    guard var entry = requestPhases[requestId] else { return }

    let reusedCount = max(0, matchedPromptTokens)
    let rebuiltCount = max(0, promptTokenCount - reusedCount)

    // Per-request figures feed the "current" aggregates recomputed below.
    entry.matchedPromptTokens = reusedCount
    entry.rebuiltPromptTokens = rebuiltCount
    requestPhases[requestId] = entry

    _totalCacheReusePromptTokens += reusedCount
    _totalCacheRebuildPromptTokens += rebuiltCount

    refreshCurrentCachePromptStatsLocked()
}
func requestCompleted(requestId: String, generationTokens: Int) { func requestCompleted(requestId: String, generationTokens: Int) {
let now = Date() let now = Date()
lock.lock() lock.lock()
@@ -108,6 +132,7 @@ final class LiveCounters: @unchecked Sendable {
_isGenerating = _generatingRequests > 0 _isGenerating = _generatingRequests > 0
} }
refreshCurrentPhaseElapsed(now: now) refreshCurrentPhaseElapsed(now: now)
refreshCurrentCachePromptStatsLocked()
lock.unlock() lock.unlock()
} }
@@ -126,9 +151,13 @@ final class LiveCounters: @unchecked Sendable {
_isGenerating = false _isGenerating = false
_contextMax = 0 _contextMax = 0
_currentPhaseElapsed = 0 _currentPhaseElapsed = 0
_currentCacheMatchedPromptTokens = 0
_currentCacheRebuiltPromptTokens = 0
_totalRequests = 0 _totalRequests = 0
_totalPromptTokens = 0 _totalPromptTokens = 0
_totalGenerationTokens = 0 _totalGenerationTokens = 0
_totalCacheReusePromptTokens = 0
_totalCacheRebuildPromptTokens = 0
_totalPreparingDuration = 0 _totalPreparingDuration = 0
_totalSessionBuildDuration = 0 _totalSessionBuildDuration = 0
_totalPrefillDuration = 0 _totalPrefillDuration = 0
@@ -154,9 +183,13 @@ final class LiveCounters: @unchecked Sendable {
isGenerating: _isGenerating, isGenerating: _isGenerating,
contextMax: _contextMax, contextMax: _contextMax,
currentPhaseElapsed: _currentPhaseElapsed, currentPhaseElapsed: _currentPhaseElapsed,
currentCacheMatchedPromptTokens: _currentCacheMatchedPromptTokens,
currentCacheRebuiltPromptTokens: _currentCacheRebuiltPromptTokens,
totalRequests: _totalRequests, totalRequests: _totalRequests,
totalPromptTokens: _totalPromptTokens, totalPromptTokens: _totalPromptTokens,
totalGenerationTokens: _totalGenerationTokens, totalGenerationTokens: _totalGenerationTokens,
totalCacheReusePromptTokens: _totalCacheReusePromptTokens,
totalCacheRebuildPromptTokens: _totalCacheRebuildPromptTokens,
totalPreparingDuration: _totalPreparingDuration, totalPreparingDuration: _totalPreparingDuration,
totalSessionBuildDuration: _totalSessionBuildDuration, totalSessionBuildDuration: _totalSessionBuildDuration,
totalPrefillDuration: _totalPrefillDuration, totalPrefillDuration: _totalPrefillDuration,
@@ -179,9 +212,13 @@ final class LiveCounters: @unchecked Sendable {
let isGenerating: Bool let isGenerating: Bool
let contextMax: Int let contextMax: Int
let currentPhaseElapsed: TimeInterval let currentPhaseElapsed: TimeInterval
let currentCacheMatchedPromptTokens: Int
let currentCacheRebuiltPromptTokens: Int
let totalRequests: Int let totalRequests: Int
let totalPromptTokens: Int let totalPromptTokens: Int
let totalGenerationTokens: Int let totalGenerationTokens: Int
let totalCacheReusePromptTokens: Int
let totalCacheRebuildPromptTokens: Int
let totalPreparingDuration: TimeInterval let totalPreparingDuration: TimeInterval
let totalSessionBuildDuration: TimeInterval let totalSessionBuildDuration: TimeInterval
let totalPrefillDuration: TimeInterval let totalPrefillDuration: TimeInterval
@@ -231,9 +268,16 @@ final class LiveCounters: @unchecked Sendable {
_currentPhaseElapsed = requestPhases.values.map { now.timeIntervalSince($0.phaseStartedAt) }.max() ?? 0 _currentPhaseElapsed = requestPhases.values.map { now.timeIntervalSince($0.phaseStartedAt) }.max() ?? 0
} }
/// Recomputes the current matched/rebuilt prompt-token aggregates by summing the
/// per-request values of every entry in `requestPhases`.
///
/// The visible call sites invoke this while holding `lock`; it does no locking itself.
private func refreshCurrentCachePromptStatsLocked() {
    var matchedSum = 0
    var rebuiltSum = 0
    for entry in requestPhases.values {
        matchedSum += entry.matchedPromptTokens
        rebuiltSum += entry.rebuiltPromptTokens
    }
    _currentCacheMatchedPromptTokens = matchedSum
    _currentCacheRebuiltPromptTokens = rebuiltSum
}
private struct RequestState { private struct RequestState {
var phase: RequestPhase var phase: RequestPhase
var phaseStartedAt: Date var phaseStartedAt: Date
var matchedPromptTokens: Int = 0
var rebuiltPromptTokens: Int = 0
} }
enum RequestPhase { enum RequestPhase {
@@ -264,17 +308,20 @@ final class InferenceStats {
var contextUsed: Int = 0 var contextUsed: Int = 0
var contextMax: Int = 0 var contextMax: Int = 0
var currentPhaseElapsed: TimeInterval = 0 var currentPhaseElapsed: TimeInterval = 0
var currentCacheMatchedPromptTokens: Int = 0
var currentCacheRebuiltPromptTokens: Int = 0
// MARK: - Cumulative counters // MARK: - Cumulative counters
var totalRequests: Int = 0 var totalRequests: Int = 0
var totalPromptTokens: Int = 0 var totalPromptTokens: Int = 0
var totalGenerationTokens: Int = 0 var totalGenerationTokens: Int = 0
var totalCacheReusePromptTokens: Int = 0
var totalCacheRebuildPromptTokens: Int = 0
var totalCacheHits: Int = 0 var totalCacheHits: Int = 0
var totalCacheMisses: Int = 0 var totalCacheMisses: Int = 0
var totalCacheEvictions: Int = 0 var totalCacheEvictions: Int = 0
var totalCacheReusePromptTokens: Int = 0 var cacheHitRatePercent: Double = 0
var totalCacheRebuildPromptTokens: Int = 0
var totalPreparingDuration: TimeInterval = 0 var totalPreparingDuration: TimeInterval = 0
var totalSessionBuildDuration: TimeInterval = 0 var totalSessionBuildDuration: TimeInterval = 0
var totalPrefillDuration: TimeInterval = 0 var totalPrefillDuration: TimeInterval = 0
@@ -283,12 +330,11 @@ final class InferenceStats {
// MARK: - Cache state // MARK: - Cache state
var cacheEntryCount: Int = 0 var cacheEntryCount: Int = 0
var warmCacheEntryCount: Int = 0
var activeCacheEntryCount: Int = 0
var generatingCacheEntryCount: Int = 0
var cacheEstimatedBytes: Int = 0 var cacheEstimatedBytes: Int = 0
var cacheEstimatedTokens: Int = 0 var cacheEstimatedTokens: Int = 0
var cachedSessions: [ConversationSessionCache.SessionSummary] = [] var cacheMemoryBudgetBytes: Int = 0
var cacheMemoryUsagePercent: Double = 0
var cachedEntries: [TokenPrefixCache.EntrySummary] = []
// MARK: - Time series data (ring buffers for charts) // MARK: - Time series data (ring buffers for charts)
@@ -302,13 +348,14 @@ final class InferenceStats {
private(set) var promptTokenHistory: [DataPoint] = [] private(set) var promptTokenHistory: [DataPoint] = []
private(set) var generationTokenHistory: [DataPoint] = [] private(set) var generationTokenHistory: [DataPoint] = []
private(set) var cacheEntryHistory: [DataPoint] = [] private(set) var cacheEntryHistory: [DataPoint] = []
private(set) var activeSessionHistory: [DataPoint] = []
private(set) var cacheFootprintHistory: [DataPoint] = [] private(set) var cacheFootprintHistory: [DataPoint] = []
private(set) var cacheReuseHistory: [DataPoint] = [] private(set) var cacheHitRateHistory: [DataPoint] = []
private(set) var cacheRebuildHistory: [DataPoint] = [] private(set) var cacheMemoryPressureHistory: [DataPoint] = []
private(set) var currentPhaseElapsedHistory: [DataPoint] = [] private(set) var currentPhaseElapsedHistory: [DataPoint] = []
private(set) var prefillDurationHistory: [DataPoint] = [] private(set) var prefillDurationHistory: [DataPoint] = []
private(set) var sessionBuildDurationHistory: [DataPoint] = [] private(set) var cacheReusePromptHistory: [DataPoint] = []
private(set) var cacheRebuildPromptHistory: [DataPoint] = []
private(set) var cacheMatchQualityHistory: [DataPoint] = []
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
@@ -316,10 +363,9 @@ final class InferenceStats {
private var sampleTimer: Timer? private var sampleTimer: Timer?
private var lastGenerationTokenCount: Int = 0 private var lastGenerationTokenCount: Int = 0
private var lastPromptTokenCount: Int = 0 private var lastPromptTokenCount: Int = 0
private var lastCacheReuseTokenCount: Int = 0
private var lastCacheRebuildTokenCount: Int = 0
private var lastPrefillDuration: TimeInterval = 0 private var lastPrefillDuration: TimeInterval = 0
private var lastSessionBuildDuration: TimeInterval = 0 private var lastCacheReusePromptTokenCount: Int = 0
private var lastCacheRebuildPromptTokenCount: Int = 0
func startSampling() { func startSampling() {
guard sampleTimer == nil else { return } guard sampleTimer == nil else { return }
@@ -338,7 +384,7 @@ final class InferenceStats {
private func recordSample() { private func recordSample() {
// Pull live values from the thread-safe counters // Pull live values from the thread-safe counters
let snap = LiveCounters.shared.snapshot() let snap = LiveCounters.shared.snapshot()
let cache = ConversationSessionCache.shared.snapshot() let cache = TokenPrefixCache.shared.snapshot()
activeRequests = snap.activeRequests activeRequests = snap.activeRequests
preparingRequests = snap.preparingRequests preparingRequests = snap.preparingRequests
@@ -353,9 +399,13 @@ final class InferenceStats {
contextMax = snap.contextMax contextMax = snap.contextMax
contextUsed = snap.promptTokens + snap.generationTokens contextUsed = snap.promptTokens + snap.generationTokens
currentPhaseElapsed = snap.currentPhaseElapsed currentPhaseElapsed = snap.currentPhaseElapsed
currentCacheMatchedPromptTokens = snap.currentCacheMatchedPromptTokens
currentCacheRebuiltPromptTokens = snap.currentCacheRebuiltPromptTokens
totalRequests = snap.totalRequests totalRequests = snap.totalRequests
totalPromptTokens = snap.totalPromptTokens totalPromptTokens = snap.totalPromptTokens
totalGenerationTokens = snap.totalGenerationTokens totalGenerationTokens = snap.totalGenerationTokens
totalCacheReusePromptTokens = snap.totalCacheReusePromptTokens
totalCacheRebuildPromptTokens = snap.totalCacheRebuildPromptTokens
totalPreparingDuration = snap.totalPreparingDuration totalPreparingDuration = snap.totalPreparingDuration
totalSessionBuildDuration = snap.totalSessionBuildDuration totalSessionBuildDuration = snap.totalSessionBuildDuration
totalPrefillDuration = snap.totalPrefillDuration totalPrefillDuration = snap.totalPrefillDuration
@@ -363,41 +413,41 @@ final class InferenceStats {
totalCacheHits = cache.totalHits totalCacheHits = cache.totalHits
totalCacheMisses = cache.totalMisses totalCacheMisses = cache.totalMisses
totalCacheEvictions = cache.totalEvictions totalCacheEvictions = cache.totalEvictions
totalCacheReusePromptTokens = cache.totalReusePromptTokens cacheHitRatePercent = cache.hitRate
totalCacheRebuildPromptTokens = cache.totalRebuildPromptTokens
cacheEntryCount = cache.totalEntries cacheEntryCount = cache.totalEntries
warmCacheEntryCount = cache.warmEntries
activeCacheEntryCount = cache.activeEntries
generatingCacheEntryCount = cache.generatingEntries
cacheEstimatedBytes = cache.estimatedBytes cacheEstimatedBytes = cache.estimatedBytes
cacheEstimatedTokens = cache.cachedTokenEstimate cacheEstimatedTokens = cache.totalCachedTokens
cachedSessions = cache.sessions cacheMemoryBudgetBytes = cache.memoryBudgetBytes
cacheMemoryUsagePercent = cache.memoryUsagePercent
cachedEntries = cache.entries
let now = Date.now let now = Date.now
let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration let cacheReusePromptDelta = snap.totalCacheReusePromptTokens - lastCacheReusePromptTokenCount
let cacheRebuildPromptDelta = snap.totalCacheRebuildPromptTokens - lastCacheRebuildPromptTokenCount
let cacheMatchQualityDelta = cacheReusePromptDelta + cacheRebuildPromptDelta > 0
? (Double(cacheReusePromptDelta) / Double(cacheReusePromptDelta + cacheRebuildPromptDelta)) * 100
: 0
lastGenerationTokenCount = snap.totalGenerationTokens lastGenerationTokenCount = snap.totalGenerationTokens
lastPromptTokenCount = snap.totalPromptTokens lastPromptTokenCount = snap.totalPromptTokens
lastCacheReuseTokenCount = cache.totalReusePromptTokens
lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
lastPrefillDuration = snap.totalPrefillDuration lastPrefillDuration = snap.totalPrefillDuration
lastSessionBuildDuration = snap.totalSessionBuildDuration lastCacheReusePromptTokenCount = snap.totalCacheReusePromptTokens
lastCacheRebuildPromptTokenCount = snap.totalCacheRebuildPromptTokens
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond)) tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta))) generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta))) promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries))) cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries)))
activeSessionHistory.append(DataPoint(timestamp: now, value: Double(cache.activeEntries)))
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes))) cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta))) cacheHitRateHistory.append(DataPoint(timestamp: now, value: cache.hitRate))
cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta))) cacheMemoryPressureHistory.append(DataPoint(timestamp: now, value: cache.memoryUsagePercent))
currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed)) currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta)) prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta)) cacheReusePromptHistory.append(DataPoint(timestamp: now, value: Double(cacheReusePromptDelta)))
cacheRebuildPromptHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildPromptDelta)))
cacheMatchQualityHistory.append(DataPoint(timestamp: now, value: cacheMatchQualityDelta))
if tokenRateHistory.count > Self.maxHistoryPoints { if tokenRateHistory.count > Self.maxHistoryPoints {
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints) tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -411,17 +461,14 @@ final class InferenceStats {
if cacheEntryHistory.count > Self.maxHistoryPoints { if cacheEntryHistory.count > Self.maxHistoryPoints {
cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints) cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints)
} }
if activeSessionHistory.count > Self.maxHistoryPoints {
activeSessionHistory.removeFirst(activeSessionHistory.count - Self.maxHistoryPoints)
}
if cacheFootprintHistory.count > Self.maxHistoryPoints { if cacheFootprintHistory.count > Self.maxHistoryPoints {
cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints) cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints)
} }
if cacheReuseHistory.count > Self.maxHistoryPoints { if cacheHitRateHistory.count > Self.maxHistoryPoints {
cacheReuseHistory.removeFirst(cacheReuseHistory.count - Self.maxHistoryPoints) cacheHitRateHistory.removeFirst(cacheHitRateHistory.count - Self.maxHistoryPoints)
} }
if cacheRebuildHistory.count > Self.maxHistoryPoints { if cacheMemoryPressureHistory.count > Self.maxHistoryPoints {
cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints) cacheMemoryPressureHistory.removeFirst(cacheMemoryPressureHistory.count - Self.maxHistoryPoints)
} }
if currentPhaseElapsedHistory.count > Self.maxHistoryPoints { if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints) currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
@@ -429,14 +476,20 @@ final class InferenceStats {
if prefillDurationHistory.count > Self.maxHistoryPoints { if prefillDurationHistory.count > Self.maxHistoryPoints {
prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints) prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
} }
if sessionBuildDurationHistory.count > Self.maxHistoryPoints { if cacheReusePromptHistory.count > Self.maxHistoryPoints {
sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints) cacheReusePromptHistory.removeFirst(cacheReusePromptHistory.count - Self.maxHistoryPoints)
}
if cacheRebuildPromptHistory.count > Self.maxHistoryPoints {
cacheRebuildPromptHistory.removeFirst(cacheRebuildPromptHistory.count - Self.maxHistoryPoints)
}
if cacheMatchQualityHistory.count > Self.maxHistoryPoints {
cacheMatchQualityHistory.removeFirst(cacheMatchQualityHistory.count - Self.maxHistoryPoints)
} }
} }
func reset() { func reset() {
LiveCounters.shared.reset() LiveCounters.shared.reset()
ConversationSessionCache.shared.reset() TokenPrefixCache.shared.reset()
activeRequests = 0 activeRequests = 0
preparingRequests = 0 preparingRequests = 0
sessionBuildRequests = 0 sessionBuildRequests = 0
@@ -450,9 +503,13 @@ final class InferenceStats {
contextUsed = 0 contextUsed = 0
contextMax = 0 contextMax = 0
currentPhaseElapsed = 0 currentPhaseElapsed = 0
currentCacheMatchedPromptTokens = 0
currentCacheRebuiltPromptTokens = 0
totalRequests = 0 totalRequests = 0
totalPromptTokens = 0 totalPromptTokens = 0
totalGenerationTokens = 0 totalGenerationTokens = 0
totalCacheReusePromptTokens = 0
totalCacheRebuildPromptTokens = 0
totalPreparingDuration = 0 totalPreparingDuration = 0
totalSessionBuildDuration = 0 totalSessionBuildDuration = 0
totalPrefillDuration = 0 totalPrefillDuration = 0
@@ -460,31 +517,41 @@ final class InferenceStats {
totalCacheHits = 0 totalCacheHits = 0
totalCacheMisses = 0 totalCacheMisses = 0
totalCacheEvictions = 0 totalCacheEvictions = 0
totalCacheReusePromptTokens = 0 cacheHitRatePercent = 0
totalCacheRebuildPromptTokens = 0
cacheEntryCount = 0 cacheEntryCount = 0
warmCacheEntryCount = 0
activeCacheEntryCount = 0
generatingCacheEntryCount = 0
cacheEstimatedBytes = 0 cacheEstimatedBytes = 0
cacheEstimatedTokens = 0 cacheEstimatedTokens = 0
cachedSessions.removeAll() cacheMemoryBudgetBytes = 0
cacheMemoryUsagePercent = 0
cachedEntries.removeAll()
tokenRateHistory.removeAll() tokenRateHistory.removeAll()
promptTokenHistory.removeAll() promptTokenHistory.removeAll()
generationTokenHistory.removeAll() generationTokenHistory.removeAll()
cacheEntryHistory.removeAll() cacheEntryHistory.removeAll()
activeSessionHistory.removeAll()
cacheFootprintHistory.removeAll() cacheFootprintHistory.removeAll()
cacheReuseHistory.removeAll() cacheHitRateHistory.removeAll()
cacheRebuildHistory.removeAll() cacheMemoryPressureHistory.removeAll()
currentPhaseElapsedHistory.removeAll() currentPhaseElapsedHistory.removeAll()
prefillDurationHistory.removeAll() prefillDurationHistory.removeAll()
sessionBuildDurationHistory.removeAll() cacheReusePromptHistory.removeAll()
cacheRebuildPromptHistory.removeAll()
cacheMatchQualityHistory.removeAll()
lastGenerationTokenCount = 0 lastGenerationTokenCount = 0
lastPromptTokenCount = 0 lastPromptTokenCount = 0
lastCacheReuseTokenCount = 0
lastCacheRebuildTokenCount = 0
lastPrefillDuration = 0 lastPrefillDuration = 0
lastSessionBuildDuration = 0 lastCacheReusePromptTokenCount = 0
lastCacheRebuildPromptTokenCount = 0
}
/// Share of the current request's prompt tokens that were matched in the cache,
/// expressed as a percentage of matched + rebuilt tokens. Returns 0 when neither
/// counter has recorded anything, which also avoids a division by zero.
var currentCacheMatchQualityPercent: Double {
    let matched = currentCacheMatchedPromptTokens
    let denominator = matched + currentCacheRebuiltPromptTokens
    if denominator <= 0 {
        return 0
    }
    return (Double(matched) / Double(denominator)) * 100
}
var totalCacheMatchQualityPercent: Double {
let total = totalCacheReusePromptTokens + totalCacheRebuildPromptTokens
guard total > 0 else { return 0 }
return (Double(totalCacheReusePromptTokens) / Double(total)) * 100
} }
} }

View File

@@ -63,7 +63,7 @@ final class APIServer {
listener?.cancel() listener?.cancel()
listener = nil listener = nil
isRunning = false isRunning = false
ConversationSessionCache.shared.invalidateAll() TokenPrefixCache.shared.invalidateAll()
inferenceStats.stopSampling() inferenceStats.stopSampling()
} }
@@ -176,7 +176,7 @@ final class APIServer {
if let targetConfig = ModelConfig.resolve(requestedModel) { if let targetConfig = ModelConfig.resolve(requestedModel) {
if modelManager.currentModel?.id != targetConfig.id { if modelManager.currentModel?.id != targetConfig.id {
print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)") print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
ConversationSessionCache.shared.invalidateAll() TokenPrefixCache.shared.invalidateAll()
await modelManager.loadModel(targetConfig) await modelManager.loadModel(targetConfig)
} }
} }
@@ -187,7 +187,7 @@ final class APIServer {
if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId, if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
let config = ModelConfig.resolve(lastModelId) { let config = ModelConfig.resolve(lastModelId) {
print("[APIServer] Reloading idle-unloaded model: \(config.repoId)") print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
ConversationSessionCache.shared.invalidateAll() TokenPrefixCache.shared.invalidateAll()
await modelManager.loadModel(config) await modelManager.loadModel(config)
} }
@@ -260,110 +260,80 @@ final class APIServer {
temperature: Float(temperature), temperature: Float(temperature),
topP: Float(topP) topP: Float(topP)
) )
// Feed all messages except the last as history, then send the last as the prompt
let chatMessages = preparedPrompt.chatMessages
let allButLast = Array(chatMessages.dropLast())
let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "")
let historySignatures = Array(preparedPrompt.messageSignatures.dropLast())
let currentModelId = modelManager.currentModel?.id ?? modelName let currentModelId = modelManager.currentModel?.id ?? modelName
let lease = ConversationSessionCache.shared.checkoutSession( let engine = InferenceEngine(container: container)
modelId: currentModelId, let preparedInference: InferenceEngine.PreparedInference
instructions: preparedPrompt.instructions, do {
historySignatures: historySignatures, preparedInference = try await engine.prepare(preparedPrompt.userInput)
requestMessageCount: chatMessages.count, } catch {
estimatedPromptTokens: estimatedPromptTokens, LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
estimatedBytes: preparedPrompt.estimatedBytes sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
) return
let session: ChatSession
if let reusableSession = lease.session {
print("[APIServer] Reusing cached session (\(allButLast.count) history messages)")
session = reusableSession
session.generateParameters = generateParams
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
} else {
print("[APIServer] Creating fresh session")
ConversationSessionCache.shared.markSessionBuild(entryId: lease.entryId)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .sessionBuild)
// Use `instructions:` for system/tool prompt (matches internal chat pattern).
// Only conversation turns go in `history:` this avoids replaying the
// large tool prompt as history on every new session.
let instr = preparedPrompt.instructions.isEmpty ? nil : preparedPrompt.instructions
if !allButLast.isEmpty {
session = ChatSession(
container,
instructions: instr,
history: allButLast,
generateParameters: generateParams,
additionalContext: preparedPrompt.additionalContext
)
} else {
session = ChatSession(
container,
instructions: instr,
generateParameters: generateParams,
additionalContext: preparedPrompt.additionalContext
)
}
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
} }
// Extract images from the last message only (ChatSession.streamDetails takes images separately) // Vision requests stay uncached until image fingerprinting lands.
let lastImages = lastMessage.images let cacheKey = preparedInference.hasImages ? nil : preparedInference.tokens
let lease = cacheKey.map { TokenPrefixCache.shared.lookup(cacheKey: $0, modelId: currentModelId) }
?? TokenPrefixCache.CacheLease(entryId: UUID(), kvCache: nil, matchedTokenCount: 0, isHit: false)
let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) LiveCounters.shared.recordPrefillReuse(
requestId: requestId,
matchedPromptTokens: lease.matchedTokenCount,
promptTokenCount: preparedInference.tokens.count
)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
let cancellation = CancellationToken()
let streamHandle: InferenceEngine.StreamHandle
do {
streamHandle = try await engine.stream(
InferenceEngine.InferenceRequest(
input: preparedInference.lmInput,
tokens: preparedInference.tokens,
parameters: generateParams,
cachedKV: lease.kvCache,
cachedTokenCount: lease.matchedTokenCount
),
cancellation: cancellation
)
} catch {
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
return
}
let result: GenerationOutcome
if isStream { if isStream {
result = await handleStreamingResponse( result = await handleStreamingResponse(
connection: connection, connection: connection,
requestId: requestId, requestId: requestId,
cacheEntryId: lease.entryId, cancellation: cancellation,
session: session, stream: streamHandle.stream,
prompt: lastMessage.content,
images: lastImages,
tools: request.tools, tools: request.tools,
created: created, created: created,
modelName: modelName, modelName: modelName
isQwen: isQwen
) )
} else { } else {
result = await handleNonStreamingResponse( result = await handleNonStreamingResponse(
connection: connection, connection: connection,
requestId: requestId, requestId: requestId,
cacheEntryId: lease.entryId, stream: streamHandle.stream,
session: session,
prompt: lastMessage.content,
images: lastImages,
tools: request.tools, tools: request.tools,
created: created, created: created,
modelName: modelName, modelName: modelName
isQwen: isQwen
) )
} }
if result.succeeded { if let cacheKey,
var cachedSignatures = preparedPrompt.messageSignatures result.succeeded || result.cancelled {
if let assistantHistoryText = result.assistantHistoryText { Self.storePromptCache(
cachedSignatures.append( streamHandle.workingCache,
Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: []) promptTokenCount: preparedInference.tokens.count,
)
}
ConversationSessionCache.shared.completeRequest(
entryId: lease.entryId, entryId: lease.entryId,
session: session, cacheKey: cacheKey,
requestMessageSignatures: cachedSignatures, modelId: currentModelId
requestMessageCount: cachedSignatures.count,
estimatedPromptTokens: estimatedPromptTokens,
estimatedBytes: preparedPrompt.estimatedBytes,
promptTokens: result.promptTokens,
completionTokens: result.completionTokens
) )
} else {
ConversationSessionCache.shared.abandonRequest(entryId: lease.entryId)
} }
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens) LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens)
@@ -375,53 +345,20 @@ final class APIServer {
private func handleNonStreamingResponse( private func handleNonStreamingResponse(
connection: NWConnection, connection: NWConnection,
requestId: String, requestId: String,
cacheEntryId: UUID, stream: AsyncStream<Generation>,
session: ChatSession,
prompt: String,
images: [UserInput.Image],
tools: [APIToolDefinition]?, tools: [APIToolDefinition]?,
created: Int, created: Int,
modelName: String, modelName: String
isQwen: Bool ) async -> GenerationOutcome {
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
do { do {
var fullText = "" let outcome = await Self.collectGenerationOutcome(
var promptTokens = 0 stream: stream,
var completionTokens = 0 requestId: requestId,
var frameworkToolCalls: [MLXLMCommon.ToolCall] = [] cancellation: nil
let stream = session.streamDetails(
to: prompt,
images: images,
videos: []
) )
for try await generation in stream {
switch generation {
case .chunk(let text):
fullText += text
completionTokens += 1
LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
case .info(let info):
promptTokens = info.promptTokenCount
completionTokens = info.generationTokenCount
ConversationSessionCache.shared.markGenerating(
entryId: cacheEntryId,
promptTokens: promptTokens,
completionTokens: completionTokens
)
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
if info.tokensPerSecond > 0 {
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
}
case .toolCall(let call):
frameworkToolCalls.append(call)
}
}
let resolved = Self.resolveAssistantResponse( let resolved = Self.resolveAssistantResponse(
fullText: fullText, fullText: outcome.fullText,
frameworkToolCalls: frameworkToolCalls, frameworkToolCalls: outcome.frameworkToolCalls,
tools: tools tools: tools
) )
@@ -442,24 +379,26 @@ final class APIServer {
) )
], ],
usage: APIUsageInfo( usage: APIUsageInfo(
prompt_tokens: promptTokens, prompt_tokens: outcome.promptTokens,
completion_tokens: completionTokens, completion_tokens: outcome.completionTokens,
total_tokens: promptTokens + completionTokens total_tokens: outcome.promptTokens + outcome.completionTokens
) )
) )
if let json = try? JSONEncoder().encode(response) { if let json = try? JSONEncoder().encode(response) {
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}") sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
} }
let assistantHistoryText = Self.normalizedAssistantHistoryContent( return GenerationOutcome(
content: resolved.content, promptTokens: outcome.promptTokens,
toolCalls: resolved.toolCalls, completionTokens: outcome.completionTokens,
isQwen: isQwen fullText: outcome.fullText,
frameworkToolCalls: outcome.frameworkToolCalls,
succeeded: true,
cancelled: false
) )
return (promptTokens, completionTokens, assistantHistoryText, true)
} catch { } catch {
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#) sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
return (0, 0, nil, false) return GenerationOutcome(promptTokens: 0, completionTokens: 0, fullText: "", frameworkToolCalls: [], succeeded: false, cancelled: false)
} }
} }
@@ -468,15 +407,12 @@ final class APIServer {
private func handleStreamingResponse( private func handleStreamingResponse(
connection: NWConnection, connection: NWConnection,
requestId: String, requestId: String,
cacheEntryId: UUID, cancellation: CancellationToken,
session: ChatSession, stream: AsyncStream<Generation>,
prompt: String,
images: [UserInput.Image],
tools: [APIToolDefinition]?, tools: [APIToolDefinition]?,
created: Int, created: Int,
modelName: String, modelName: String
isQwen: Bool ) async -> GenerationOutcome {
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
// Send SSE headers // Send SSE headers
let header = [ let header = [
"HTTP/1.1 200 OK", "HTTP/1.1 200 OK",
@@ -489,55 +425,34 @@ final class APIServer {
].joined(separator: "\r\n") ].joined(separator: "\r\n")
await Self.sendData(connection: connection, data: header.data(using: .utf8)!) await Self.sendData(connection: connection, data: header.data(using: .utf8)!)
connection.stateUpdateHandler = { state in
switch state {
case .cancelled, .failed:
cancellation.cancel()
default:
break
}
}
// Send initial role chunk let encoder = StreamingSSEEncoder(requestId: requestId, created: created, modelName: modelName)
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk( await Self.sendData(connection: connection, data: encoder.encodeRoleDelta("assistant"))
id: requestId,
object: "chat.completion.chunk",
created: created,
model: modelName,
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: "assistant", content: nil, tool_calls: nil), finish_reason: nil)],
usage: nil
))
let hasTools = tools != nil && !(tools?.isEmpty ?? true) let result = await Self.runStreamingLoop(
connection: connection,
// Run the generation loop OFF MainActor. stream: stream,
// ChatSession and NWConnection don't need MainActor. cancellation: cancellation,
// Running on MainActor caused every token to compete with SwiftUI requestId: requestId,
// rendering, creating back-pressure that coalesced all output. encoder: encoder
let stream = session.streamDetails(
to: prompt,
images: images,
videos: []
) )
// Transfer non-Sendable values to the nonisolated loop.
// Safe because we don't touch session/images again until after the loop.
let result = await {
nonisolated(unsafe) let stream = stream
return await Self.runStreamingLoop(
connection: connection,
stream: stream,
requestId: requestId,
created: created,
modelName: modelName
)
}()
let (promptTokens, completionTokens, fullText, frameworkToolCalls, succeeded) = result if result.cancelled {
connection.cancel()
if promptTokens > 0 { return result
ConversationSessionCache.shared.markGenerating(
entryId: cacheEntryId,
promptTokens: promptTokens,
completionTokens: completionTokens
)
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
} }
let resolved = Self.resolveAssistantResponse( let resolved = Self.resolveAssistantResponse(
fullText: fullText, fullText: result.fullText,
frameworkToolCalls: frameworkToolCalls, frameworkToolCalls: result.frameworkToolCalls,
tools: tools tools: tools
) )
@@ -562,21 +477,16 @@ final class APIServer {
model: modelName, model: modelName,
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)], choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
usage: APIUsageInfo( usage: APIUsageInfo(
prompt_tokens: promptTokens, prompt_tokens: result.promptTokens,
completion_tokens: completionTokens, completion_tokens: result.completionTokens,
total_tokens: promptTokens + completionTokens total_tokens: result.promptTokens + result.completionTokens
) )
)) ))
// Send [DONE] and close // Send [DONE] and close
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!) await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
connection.cancel() connection.cancel()
let assistantHistoryText = Self.normalizedAssistantHistoryContent( return result
content: resolved.content,
toolCalls: resolved.toolCalls,
isQwen: isQwen
)
return (promptTokens, completionTokens, assistantHistoryText, succeeded)
} }
/// Run the token generation + SSE send loop entirely off MainActor. /// Run the token generation + SSE send loop entirely off MainActor.
@@ -584,54 +494,20 @@ final class APIServer {
/// multiple actor hops competing with SwiftUI, causing all output to batch. /// multiple actor hops competing with SwiftUI, causing all output to batch.
nonisolated private static func runStreamingLoop( nonisolated private static func runStreamingLoop(
connection: NWConnection, connection: NWConnection,
stream: AsyncThrowingStream<Generation, any Error>, stream: AsyncStream<Generation>,
cancellation: CancellationToken,
requestId: String, requestId: String,
created: Int, encoder: StreamingSSEEncoder
modelName: String ) async -> GenerationOutcome {
) async -> (Int, Int, String, [MLXLMCommon.ToolCall], Bool) { var outcome = await collectGenerationOutcome(
var promptTokens = 0 stream: stream,
var completionTokens = 0 requestId: requestId,
var fullText = "" cancellation: cancellation
var frameworkToolCalls: [MLXLMCommon.ToolCall] = [] ) { text in
await sendData(connection: connection, data: encoder.encodeContentDelta(text))
do {
for try await generation in stream {
switch generation {
case .chunk(let text):
completionTokens += 1
fullText += text
// Update live counters directly no MainActor hop needed
LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
// Send directly no MainActor hop.
await sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
id: requestId,
object: "chat.completion.chunk",
created: created,
model: modelName,
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: text, tool_calls: nil), finish_reason: nil)],
usage: nil
))
case .info(let info):
promptTokens = info.promptTokenCount
completionTokens = info.generationTokenCount
if info.tokensPerSecond > 0 {
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
}
case .toolCall(let call):
frameworkToolCalls.append(call)
}
}
} catch {
let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
await sendData(connection: connection, data: errorEvent.data(using: .utf8)!)
return (promptTokens, completionTokens, fullText, frameworkToolCalls, false)
} }
outcome.succeeded = !outcome.cancelled
return (promptTokens, completionTokens, fullText, frameworkToolCalls, true) return outcome
} }
/// Send an SSE event and wait for the protocol stack to process it. /// Send an SSE event and wait for the protocol stack to process it.
@@ -651,6 +527,88 @@ final class APIServer {
} }
} }
/// Drains a generation stream into a `GenerationOutcome`.
///
/// - Parameters:
///   - stream: The generation events to consume.
///   - requestId: Identifier forwarded to `LiveCounters` when prefill statistics arrive.
///   - cancellation: Optional token; it is polled once per received event, before the
///     event is processed. When it reports cancelled the loop stops and that event is dropped.
///   - onChunk: Optional async callback invoked with each text chunk, after counters are updated.
/// - Returns: The accumulated text, tool calls and token counts. `succeeded` is the
///   inverse of `cancelled`.
nonisolated private static func collectGenerationOutcome(
    stream: AsyncStream<Generation>,
    requestId: String,
    cancellation: CancellationToken?,
    onChunk: ((String) async -> Void)? = nil
) async -> GenerationOutcome {
    var outcome = GenerationOutcome(
        promptTokens: 0,
        completionTokens: 0,
        fullText: "",
        frameworkToolCalls: [],
        succeeded: true,
        cancelled: false
    )

    for await event in stream {
        // Cancellation is only observed between events, never mid-event.
        if cancellation?.isCancelled == true {
            outcome.cancelled = true
            break
        }

        switch event {
        case .chunk(let piece):
            // Each chunk is counted as one completion token until `.info` supplies the real count.
            outcome.completionTokens += 1
            outcome.fullText += piece
            LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: outcome.completionTokens)
            await onChunk?(piece)

        case .info(let stats):
            // The reported counts replace the per-chunk running estimate.
            outcome.promptTokens = stats.promptTokenCount
            outcome.completionTokens = stats.generationTokenCount
            LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: outcome.promptTokens)
            if stats.tokensPerSecond > 0 {
                LiveCounters.shared.tokenGenerated(
                    tokensPerSecond: stats.tokensPerSecond,
                    totalGenerated: outcome.completionTokens
                )
            }

        case .toolCall(let call):
            outcome.frameworkToolCalls.append(call)
        }
    }

    outcome.succeeded = !outcome.cancelled
    return outcome
}
/// Trims `cache` back to the prompt length and, if that succeeds, hands it to
/// `TokenPrefixCache` under `cacheKey` / `modelId`. Nothing is stored when trimming fails.
private static func storePromptCache(
    _ cache: [KVCache],
    promptTokenCount: Int,
    entryId: UUID,
    cacheKey: [Int],
    modelId: String
) {
    let trimmedToPrompt = trimGeneratedTokens(cache, promptTokenCount: promptTokenCount)
    if trimmedToPrompt {
        TokenPrefixCache.shared.store(
            entryId: entryId,
            kvCache: cache,
            cacheKey: cacheKey,
            modelId: modelId
        )
    }
}
/// Trims every layer of `cache` so that it holds no more than `promptTokenCount` tokens.
///
/// Layers whose `offset` is already at or below `promptTokenCount` are left alone.
/// All layers are validated before any of them is mutated, so a layer that needs
/// trimming but is not trimmable makes the call fail without touching the others.
///
/// - Returns: `true` when every layer ends up at or below `promptTokenCount`;
///   `false` when a layer that needs trimming is not trimmable, or when a layer
///   trimmed fewer tokens than requested.
private static func trimGeneratedTokens(_ cache: [KVCache], promptTokenCount: Int) -> Bool {
    // Pass 1: refuse up front if any over-long layer cannot be trimmed at all.
    let hasUntrimmableExcess = cache.contains { layer in
        layer.offset > promptTokenCount && !layer.isTrimmable
    }
    if hasUntrimmableExcess {
        return false
    }

    // Pass 2: drop the tokens beyond the prompt from each over-long layer.
    // NOTE(review): layers with offset < promptTokenCount pass through unchanged and
    // still report success — confirm callers never store a cache shorter than its key.
    for layer in cache {
        let excess = layer.offset - promptTokenCount
        guard excess > 0 else { continue }
        guard layer.trim(excess) == excess else {
            return false
        }
    }
    return true
}
// MARK: - HTTP helpers // MARK: - HTTP helpers
private func sendResponse( private func sendResponse(
@@ -787,6 +745,15 @@ final class APIServer {
} }
} }
/// Result of draining one generation stream, shared by the streaming and
/// non-streaming response handlers.
private struct GenerationOutcome {
/// Prompt token count taken from the stream's `.info` event (0 if none arrived).
var promptTokens: Int
/// Completion token count: incremented once per text chunk, then overwritten by
/// the `.info` event's `generationTokenCount` when it arrives.
var completionTokens: Int
/// Concatenation of every text chunk received.
var fullText: String
/// Tool calls emitted by the framework as `.toolCall` events, in arrival order.
var frameworkToolCalls: [MLXLMCommon.ToolCall]
/// `true` when the stream was consumed without cancellation; `false` on cancellation or error.
var succeeded: Bool
/// `true` when the cancellation token was observed as cancelled while consuming the stream.
var cancelled: Bool
}
// MARK: - HTTP request parser // MARK: - HTTP request parser
private struct HTTPRequest { private struct HTTPRequest {

View File

@@ -1,358 +0,0 @@
import Foundation
import MLXLMCommon
import os
/// Lifecycle phase of a cached API session. The raw value is the phase's
/// human-readable label.
enum APISessionPhase: String, Sendable {
    case idle = "Idle",
         sessionBuild = "Session Build",
         prefilling = "Prefilling",
         generating = "Generating"
}
/// Bounded cache of API chat sessions keyed by normalized conversation history.
/// The cache is internal-only and safe to sample from the monitor without involving MainActor.
/// Bounded cache of API chat sessions keyed by normalized conversation history.
/// The cache is internal-only and safe to sample from the monitor without involving MainActor.
///
/// All mutable state (`entries`, `totals`) is guarded by `lock`. Helpers whose names end in
/// `Locked` must only be called while the lock is held.
final class ConversationSessionCache: @unchecked Sendable {
    static let shared = ConversationSessionCache()

    private let lock = OSAllocatedUnfairLock()
    /// Maximum number of entries kept once a request completes.
    private let maxEntries = 8
    /// Maximum sum of `cachedTokenEstimate` across all entries.
    private let maxCachedTokens = 256_000
    /// Entries with no in-flight request are dropped after ten minutes without access.
    private let idleTTL: TimeInterval = 10 * 60
    private var entries: [UUID: Entry] = [:]
    private var totals = Totals()

    private init() {}

    /// Handle returned by `checkoutSession`. `session` is nil on a cache miss, in which
    /// case the caller builds a session and hands it back through `completeRequest`.
    struct Lease {
        let entryId: UUID
        let session: ChatSession?
        let reusedPromptTokens: Int
        let cacheHit: Bool
    }

    /// Read-only description of a single cache entry, produced by `snapshot()`.
    struct SessionSummary: Identifiable, Sendable {
        let id: UUID
        let modelId: String
        let phase: APISessionPhase
        let messageCount: Int
        let cachedTokenEstimate: Int
        let estimatedBytes: Int
        let inFlightRequests: Int
        let hitCount: Int
        let lastPromptTokens: Int
        let lastCompletionTokens: Int
        let lastReuseTokens: Int
        let createdAt: Date
        let lastAccessAt: Date
    }

    /// Aggregate view of the cache plus per-entry summaries, produced by `snapshot()`.
    struct Snapshot: Sendable {
        let totalEntries: Int
        let warmEntries: Int
        let activeEntries: Int
        let generatingEntries: Int
        let estimatedBytes: Int
        let cachedTokenEstimate: Int
        let totalHits: Int
        let totalMisses: Int
        let totalEvictions: Int
        let totalReusePromptTokens: Int
        let totalRebuildPromptTokens: Int
        let sessions: [SessionSummary]
    }

    /// Looks for an idle cached session whose model, instructions and history prefix match
    /// the incoming request, and leases it to the caller.
    ///
    /// On a hit the longest matching entry is marked in-flight and returned. On a miss a
    /// placeholder entry (no session yet) is registered so later phase updates and
    /// `completeRequest` / `abandonRequest` have something to attach to.
    func checkoutSession(
        modelId: String,
        instructions: String,
        historySignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int
    ) -> Lease {
        lock.lock()
        defer { lock.unlock() }

        let now = Date()
        pruneExpiredLocked(now: now)

        let instructionsHash = Self.stableHash(instructions)
        let bestMatch = entries.values
            .filter {
                $0.modelId == modelId
                    && $0.instructionsHash == instructionsHash
                    && $0.session != nil
                    && $0.inFlightRequests == 0
                    && Self.historyMatches(cached: $0.requestMessageSignatures, incoming: historySignatures)
            }
            .max { lhs, rhs in
                lhs.requestMessageSignatures.count < rhs.requestMessageSignatures.count
            }

        if var entry = bestMatch {
            entry.inFlightRequests += 1
            entry.lastAccessAt = now
            entry.phase = .prefilling
            // NOTE(review): this takes the larger of the cached size and the incoming prompt
            // estimate; confirm that `max` (rather than `min`) is the intended reuse figure.
            entry.lastReuseTokens = max(entry.cachedTokenEstimate, estimatedPromptTokens)
            entry.hitCount += 1
            entries[entry.id] = entry

            totals.totalHits += 1
            totals.totalReusePromptTokens += entry.lastReuseTokens

            return Lease(
                entryId: entry.id,
                session: entry.session,
                reusedPromptTokens: entry.lastReuseTokens,
                cacheHit: true
            )
        }

        // Miss: register a session-less placeholder that is already in flight.
        let entryId = UUID()
        entries[entryId] = Entry(
            id: entryId,
            modelId: modelId,
            instructionsHash: instructionsHash,
            requestMessageSignatures: historySignatures,
            messageCount: requestMessageCount,
            cachedTokenEstimate: estimatedPromptTokens,
            estimatedBytes: estimatedBytes,
            createdAt: now,
            lastAccessAt: now,
            inFlightRequests: 1,
            hitCount: 0,
            phase: .sessionBuild,
            lastPromptTokens: 0,
            lastCompletionTokens: 0,
            lastReuseTokens: 0,
            session: nil
        )
        totals.totalMisses += 1
        totals.totalRebuildPromptTokens += estimatedPromptTokens

        return Lease(entryId: entryId, session: nil, reusedPromptTokens: 0, cacheHit: false)
    }

    /// Marks the entry as building its session. No-op if the entry no longer exists.
    func markSessionBuild(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .sessionBuild)
    }

    /// Marks the entry as prefilling. No-op if the entry no longer exists.
    func markPrefilling(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .prefilling)
    }

    /// Marks the entry as generating and records the latest token counts.
    /// The cached-token estimate only ever grows here.
    func markGenerating(entryId: UUID, promptTokens: Int, completionTokens: Int) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.phase = .generating
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.cachedTokenEstimate = max(entry.cachedTokenEstimate, promptTokens + completionTokens)
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Stores the finished session on its entry, releases one in-flight slot, returns the
    /// entry to `.idle`, and then enforces the entry/token budget.
    /// No-op if the entry was removed in the meantime (e.g. by `invalidateAll`).
    func completeRequest(
        entryId: UUID,
        session: ChatSession,
        requestMessageSignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int,
        promptTokens: Int,
        completionTokens: Int
    ) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        let now = Date()
        entry.session = session
        entry.requestMessageSignatures = requestMessageSignatures
        entry.messageCount = requestMessageCount
        entry.cachedTokenEstimate = max(estimatedPromptTokens, promptTokens + completionTokens)
        entry.estimatedBytes = estimatedBytes
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.lastAccessAt = now
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
        entry.phase = .idle
        entries[entryId] = entry
        enforceBudgetLocked(now: now)
    }

    /// Releases one in-flight slot after a failed request. A placeholder that never
    /// received a session is removed outright; otherwise the entry returns to `.idle`.
    func abandonRequest(entryId: UUID) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
        if entry.session == nil && entry.inFlightRequests == 0 {
            entries.removeValue(forKey: entryId)
        } else {
            entry.phase = .idle
            entry.lastAccessAt = Date()
            entries[entryId] = entry
        }
    }

    /// Drops every entry and counts each one as an eviction. Hit/miss totals are kept.
    func invalidateAll() {
        lock.lock()
        defer { lock.unlock() }

        totals.totalEvictions += entries.count
        entries.removeAll()
    }

    /// Drops every entry and zeroes all running totals.
    func reset() {
        lock.lock()
        defer { lock.unlock() }

        entries.removeAll()
        totals = Totals()
    }

    /// Returns a point-in-time view of the cache. Expired entries are pruned first.
    /// Sessions are ordered by in-flight request count (descending), then most recent access.
    func snapshot() -> Snapshot {
        lock.lock()
        defer { lock.unlock() }

        pruneExpiredLocked(now: Date())
        let allEntries = Array(entries.values)

        let sessions = allEntries
            .sorted {
                if $0.inFlightRequests != $1.inFlightRequests {
                    return $0.inFlightRequests > $1.inFlightRequests
                }
                return $0.lastAccessAt > $1.lastAccessAt
            }
            .map {
                SessionSummary(
                    id: $0.id,
                    modelId: $0.modelId,
                    phase: $0.phase,
                    messageCount: $0.messageCount,
                    cachedTokenEstimate: $0.cachedTokenEstimate,
                    estimatedBytes: $0.estimatedBytes,
                    inFlightRequests: $0.inFlightRequests,
                    hitCount: $0.hitCount,
                    lastPromptTokens: $0.lastPromptTokens,
                    lastCompletionTokens: $0.lastCompletionTokens,
                    lastReuseTokens: $0.lastReuseTokens,
                    createdAt: $0.createdAt,
                    lastAccessAt: $0.lastAccessAt
                )
            }

        return Snapshot(
            totalEntries: allEntries.count,
            warmEntries: allEntries.filter { $0.session != nil }.count,
            activeEntries: allEntries.filter { $0.inFlightRequests > 0 }.count,
            generatingEntries: allEntries.filter { $0.phase == .generating }.count,
            estimatedBytes: allEntries.reduce(0) { $0 + $1.estimatedBytes },
            cachedTokenEstimate: allEntries.reduce(0) { $0 + $1.cachedTokenEstimate },
            totalHits: totals.totalHits,
            totalMisses: totals.totalMisses,
            totalEvictions: totals.totalEvictions,
            totalReusePromptTokens: totals.totalReusePromptTokens,
            totalRebuildPromptTokens: totals.totalRebuildPromptTokens,
            sessions: sessions
        )
    }

    /// Sets the phase of an entry and refreshes its access time. No-op for unknown ids.
    private func updatePhase(entryId: UUID, phase: APISessionPhase) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.phase = phase
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Removes entries that have no in-flight request and have been idle longer than
    /// `idleTTL`, counting each as an eviction. Caller must hold `lock`.
    private func pruneExpiredLocked(now: Date) {
        let expired = entries.values.filter {
            $0.inFlightRequests == 0 && now.timeIntervalSince($0.lastAccessAt) > idleTTL
        }
        guard !expired.isEmpty else { return }

        for entry in expired {
            entries.removeValue(forKey: entry.id)
        }
        totals.totalEvictions += expired.count
    }

    /// Evicts idle entries until both the entry-count and cached-token budgets are met,
    /// or until only in-flight entries remain. Caller must hold `lock`.
    private func enforceBudgetLocked(now: Date) {
        pruneExpiredLocked(now: now)

        // Keep a running total instead of re-summing every entry on each iteration.
        var cachedTokens = entries.values.reduce(0) { $0 + $1.cachedTokenEstimate }

        while entries.count > maxEntries || cachedTokens > maxCachedTokens {
            // Only the single best victim is needed, so pick the minimum rather than sorting.
            guard let victim = entries.values
                .filter({ $0.inFlightRequests == 0 })
                .min(by: evictionOrder)
            else {
                break
            }
            entries.removeValue(forKey: victim.id)
            cachedTokens -= victim.cachedTokenEstimate
            totals.totalEvictions += 1
        }
    }

    /// Eviction priority: least recently accessed first; on a tie the entry with the larger
    /// token estimate goes first; on a further tie the older entry goes first.
    private func evictionOrder(lhs: Entry, rhs: Entry) -> Bool {
        if lhs.lastAccessAt != rhs.lastAccessAt {
            return lhs.lastAccessAt < rhs.lastAccessAt
        }
        if lhs.cachedTokenEstimate != rhs.cachedTokenEstimate {
            return lhs.cachedTokenEstimate > rhs.cachedTokenEstimate
        }
        return lhs.createdAt < rhs.createdAt
    }

    /// A cached history matches when it is a prefix of the incoming history and the
    /// incoming history is at most one message longer.
    private static func historyMatches(cached: [UInt64], incoming: [UInt64]) -> Bool {
        guard cached.count <= incoming.count,
              incoming.count <= cached.count + 1 else { return false }
        return incoming.starts(with: cached)
    }

    /// 64-bit FNV-1a hash over the UTF-8 bytes of `text`. Unlike `Hasher`, the result
    /// does not vary between process launches.
    static func stableHash(_ text: String) -> UInt64 {
        let offsetBasis: UInt64 = 14_695_981_039_346_656_037
        let prime: UInt64 = 1_099_511_628_211
        return text.utf8.reduce(offsetBasis) { hash, byte in
            (hash ^ UInt64(byte)) &* prime
        }
    }

    /// Internal record for one cached (or pending) session.
    private struct Entry {
        let id: UUID
        let modelId: String
        let instructionsHash: UInt64
        var requestMessageSignatures: [UInt64]
        var messageCount: Int
        var cachedTokenEstimate: Int
        var estimatedBytes: Int
        let createdAt: Date
        var lastAccessAt: Date
        var inFlightRequests: Int
        var hitCount: Int
        var phase: APISessionPhase
        var lastPromptTokens: Int
        var lastCompletionTokens: Int
        var lastReuseTokens: Int
        /// Nil while the first request for this entry is still in flight.
        var session: ChatSession?
    }

    /// Running counters reported through `Snapshot`.
    private struct Totals {
        var totalHits: Int = 0
        var totalMisses: Int = 0
        var totalEvictions: Int = 0
        var totalReusePromptTokens: Int = 0
        var totalRebuildPromptTokens: Int = 0
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,736 @@
import Foundation
import XCTest
@testable import MLX_Server
final class APIServerRewriteTests: XCTestCase {
/// Sends the same non-streaming request twice and checks that the first call
/// populates `TokenPrefixCache` and the second call registers a cache hit with
/// a higher reused-prompt-token total in `LiveCounters`.
func testNonStreamingChatCompletionUsesStatelessServerPathAndCachesPrompt() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let userMessage = APIChatMessage(
        role: "user",
        content: .text("Reply with exactly one short word."),
        name: nil,
        tool_calls: nil,
        tool_call_id: nil
    )
    let request = APIChatCompletionRequest(
        model: "gemma",
        messages: [userMessage],
        temperature: 0,
        top_p: 1,
        max_tokens: 1,
        stream: false,
        stop: nil,
        tools: nil,
        tool_choice: nil,
        frequency_penalty: nil,
        presence_penalty: nil,
        n: nil
    )

    // Cold request: expect a well-formed single-choice assistant reply.
    let coldResponse = try await sendChatCompletion(request, port: harness.port)
    XCTAssertEqual(coldResponse.choices.count, 1)
    XCTAssertEqual(coldResponse.choices[0].message.role, "assistant")
    XCTAssertGreaterThan(coldResponse.usage.prompt_tokens, 0)
    XCTAssertGreaterThanOrEqual(coldResponse.usage.completion_tokens, 0)

    // Poll until the prompt shows up in the cache (up to 5 seconds).
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let cacheAfterCold = TokenPrefixCache.shared.snapshot()
    let countersAfterCold = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterCold.totalEntries, 0)

    // Warm request: the identical prompt should now be served from the cache.
    _ = try await sendChatCompletion(request, port: harness.port)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterCold.totalHits
    }
    let cacheAfterWarm = TokenPrefixCache.shared.snapshot()
    let countersAfterWarm = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterWarm.totalHits, cacheAfterCold.totalHits)
    XCTAssertGreaterThan(countersAfterWarm.totalCacheReusePromptTokens, countersAfterCold.totalCacheReusePromptTokens)
}
/// Streams a three-turn conversation in which every turn replays the whole
/// prior transcript plus one new user question, and checks that the second and
/// third turns register prefix-cache reuse.
///
/// After the first turn the test only waits for the cache to contain at least
/// one entry. After each later turn it requires both `TokenPrefixCache`
/// `totalHits` and `LiveCounters` `totalCacheReusePromptTokens` to have grown
/// relative to the snapshots taken after the previous turn.
func testStreamingChatCompletionReusesCacheAcrossThreeProgressivelyLongerTurns() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    // Plain-text message for the given role.
    func turn(_ role: String, _ text: String) -> APIChatMessage {
        APIChatMessage(role: role, content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
    }

    // All three turns share the same sampling settings; only the transcript grows.
    // NOTE(review): the assertions below expect finish reason "stop" with
    // max_tokens 3 — presumably the model ends its one-word answer within
    // that budget; confirm this is stable.
    func streamed(_ transcript: [APIChatMessage]) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: 3,
            stream: true,
            stop: nil,
            tools: nil,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Turn 1: a single user question.
    var transcript = [turn("user", "Answer in one word: what color is the sky on a clear day?")]
    let skyStream = try await sendStreamingChatCompletion(streamed(transcript), port: harness.port)
    XCTAssertEqual(skyStream.roleDeltaCount, 1)
    XCTAssertTrue(skyStream.sawDone)
    XCTAssertEqual(skyStream.finalFinishReason, "stop")
    XCTAssertGreaterThan(skyStream.usage?.prompt_tokens ?? 0, 0)
    XCTAssertFalse(skyStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let cacheAfterSky = TokenPrefixCache.shared.snapshot()
    let countersAfterSky = LiveCounters.shared.snapshot()

    // Turn 2: replay turn 1 plus the model's own answer, then ask about grass.
    transcript += [
        turn("assistant", skyStream.content),
        turn("user", "Answer in one word: what color is grass?")
    ]
    let grassStream = try await sendStreamingChatCompletion(streamed(transcript), port: harness.port)
    XCTAssertEqual(grassStream.roleDeltaCount, 1)
    XCTAssertTrue(grassStream.sawDone)
    XCTAssertEqual(grassStream.finalFinishReason, "stop")
    XCTAssertFalse(grassStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterSky.totalHits
    }
    let cacheAfterGrass = TokenPrefixCache.shared.snapshot()
    let countersAfterGrass = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterGrass.totalHits, cacheAfterSky.totalHits)
    XCTAssertGreaterThan(countersAfterGrass.totalCacheReusePromptTokens, countersAfterSky.totalCacheReusePromptTokens)

    // Turn 3: replay everything so far, then ask about snow.
    transcript += [
        turn("assistant", grassStream.content),
        turn("user", "Answer in one word: what color is snow?")
    ]
    let snowStream = try await sendStreamingChatCompletion(streamed(transcript), port: harness.port)
    XCTAssertEqual(snowStream.roleDeltaCount, 1)
    XCTAssertTrue(snowStream.sawDone)
    XCTAssertEqual(snowStream.finalFinishReason, "stop")
    XCTAssertFalse(snowStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterGrass.totalHits
    }
    let cacheAfterSnow = TokenPrefixCache.shared.snapshot()
    let countersAfterSnow = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterSnow.totalHits, cacheAfterGrass.totalHits)
    XCTAssertGreaterThan(countersAfterSnow.totalCacheReusePromptTokens, countersAfterGrass.totalCacheReusePromptTokens)
}
/// Checks that prefix-cache reuse survives a tool-call round trip.
///
/// Turn 1 asks for a weather tool call, turn 2 replays that call together with
/// a canned tool result and expects a direct text answer, and turn 3 adds a
/// follow-up user message. After turns 2 and 3 both `TokenPrefixCache`
/// `totalHits` and `LiveCounters` `totalCacheReusePromptTokens` must have
/// grown relative to the snapshots taken after the preceding turn.
func testStreamingChatCompletionReusesCacheAcrossToolBoundary() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }
    let tools = [mockWeatherTool]

    // Plain-text message for the given role.
    func textMessage(_ role: String, _ text: String) -> APIChatMessage {
        APIChatMessage(role: role, content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
    }

    // Streaming request over `transcript`; every turn advertises the same tool list.
    func streamed(_ transcript: [APIChatMessage], maxTokens: Int) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: maxTokens,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Turn 1: the model is expected to emit a weather tool call.
    var transcript = [textMessage("user", "You must call the weather tool for Berlin. Do not answer directly.")]
    let toolCallStream = try await sendStreamingChatCompletion(streamed(transcript, maxTokens: 48), port: harness.port)
    XCTAssertEqual(toolCallStream.roleDeltaCount, 1)
    XCTAssertTrue(toolCallStream.sawDone)
    XCTAssertEqual(toolCallStream.finalFinishReason, "tool_calls")
    let weatherCall = try XCTUnwrap(toolCallStream.toolCalls.first)
    XCTAssertEqual(weatherCall.function.name, "weather")
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let cacheAfterToolCall = TokenPrefixCache.shared.snapshot()
    let countersAfterToolCall = LiveCounters.shared.snapshot()

    // Turn 2: replay the tool call and feed back a canned tool result.
    transcript += [
        APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [weatherCall], tool_call_id: nil),
        APIChatMessage(role: "tool", content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"), name: nil, tool_calls: nil, tool_call_id: weatherCall.id)
    ]
    let directAnswerStream = try await sendStreamingChatCompletion(streamed(transcript, maxTokens: 16), port: harness.port)
    XCTAssertEqual(directAnswerStream.roleDeltaCount, 1)
    XCTAssertTrue(directAnswerStream.sawDone)
    XCTAssertEqual(directAnswerStream.finalFinishReason, "stop")
    XCTAssertTrue(directAnswerStream.toolCalls.isEmpty)
    XCTAssertFalse(directAnswerStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterToolCall.totalHits
    }
    let cacheAfterAnswer = TokenPrefixCache.shared.snapshot()
    let countersAfterAnswer = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterAnswer.totalHits, cacheAfterToolCall.totalHits)
    XCTAssertGreaterThan(countersAfterAnswer.totalCacheReusePromptTokens, countersAfterToolCall.totalCacheReusePromptTokens)

    // Turn 3: replay the answer and ask a follow-up.
    transcript += [
        textMessage("assistant", directAnswerStream.content),
        textMessage("user", "Now compress that answer to two words.")
    ]
    let followUpStream = try await sendStreamingChatCompletion(streamed(transcript, maxTokens: 8), port: harness.port)
    XCTAssertEqual(followUpStream.roleDeltaCount, 1)
    XCTAssertTrue(followUpStream.sawDone)
    XCTAssertEqual(followUpStream.finalFinishReason, "stop")
    XCTAssertFalse(followUpStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterAnswer.totalHits
    }
    let cacheAtEnd = TokenPrefixCache.shared.snapshot()
    let countersAtEnd = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAtEnd.totalHits, cacheAfterAnswer.totalHits)
    XCTAssertGreaterThan(countersAtEnd.totalCacheReusePromptTokens, countersAfterAnswer.totalCacheReusePromptTokens)
}
/// Checks prefix-cache reuse across two consecutive tool round trips
/// (Berlin, then Paris) inside one growing conversation.
///
/// The four requests are: Berlin tool call, Berlin answer, Paris tool call,
/// Paris answer. After every request except the first, both `TokenPrefixCache`
/// `totalHits` and `LiveCounters` `totalCacheReusePromptTokens` must have
/// grown relative to the snapshots taken after the previous request.
func testStreamingChatCompletionReusesCacheAcrossMultipleToolTurns() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }
    let tools = [mockWeatherTool]

    // Plain-text message for the given role.
    func textMessage(_ role: String, _ text: String) -> APIChatMessage {
        APIChatMessage(role: role, content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
    }

    // Assistant message that carries only a replayed tool call.
    func toolCallMessage(_ call: APIToolCall) -> APIChatMessage {
        APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [call], tool_call_id: nil)
    }

    // Streaming request over `transcript`; every turn advertises the same tool list.
    func streamed(_ transcript: [APIChatMessage], maxTokens: Int) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: transcript,
            temperature: 0,
            top_p: 1,
            max_tokens: maxTokens,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Request 1: ask for the Berlin tool call.
    var transcript = [textMessage("user", "Call the weather tool for Berlin. Do not answer directly.")]
    let berlinToolTurn = try await sendStreamingChatCompletion(streamed(transcript, maxTokens: 48), port: harness.port)
    XCTAssertEqual(berlinToolTurn.finalFinishReason, "tool_calls")
    let berlinToolCall = try XCTUnwrap(berlinToolTurn.toolCalls.first)
    XCTAssertEqual(berlinToolCall.function.name, "weather")
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let cacheAfterBerlinCall = TokenPrefixCache.shared.snapshot()
    let countersAfterBerlinCall = LiveCounters.shared.snapshot()

    // Request 2: replay the call with a canned Berlin result and expect text.
    transcript += [
        toolCallMessage(berlinToolCall),
        APIChatMessage(
            role: "tool",
            content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"),
            name: nil,
            tool_calls: nil,
            tool_call_id: berlinToolCall.id
        )
    ]
    let berlinAnswer = try await sendStreamingChatCompletion(streamed(transcript, maxTokens: 16), port: harness.port)
    XCTAssertEqual(berlinAnswer.finalFinishReason, "stop")
    XCTAssertFalse(berlinAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterBerlinCall.totalHits
    }
    let cacheAfterBerlinAnswer = TokenPrefixCache.shared.snapshot()
    let countersAfterBerlinAnswer = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterBerlinAnswer.totalHits, cacheAfterBerlinCall.totalHits)
    XCTAssertGreaterThan(countersAfterBerlinAnswer.totalCacheReusePromptTokens, countersAfterBerlinCall.totalCacheReusePromptTokens)

    // Request 3: replay the Berlin exchange and ask for a Paris tool call.
    transcript += [
        textMessage("assistant", berlinAnswer.content),
        textMessage("user", "Now call the weather tool for Paris. Do not answer directly.")
    ]
    let parisToolTurn = try await sendStreamingChatCompletion(streamed(transcript, maxTokens: 48), port: harness.port)
    XCTAssertEqual(parisToolTurn.finalFinishReason, "tool_calls")
    let parisToolCall = try XCTUnwrap(parisToolTurn.toolCalls.first)
    XCTAssertEqual(parisToolCall.function.name, "weather")
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterBerlinAnswer.totalHits
    }
    let cacheAfterParisCall = TokenPrefixCache.shared.snapshot()
    let countersAfterParisCall = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterParisCall.totalHits, cacheAfterBerlinAnswer.totalHits)
    XCTAssertGreaterThan(countersAfterParisCall.totalCacheReusePromptTokens, countersAfterBerlinAnswer.totalCacheReusePromptTokens)

    // Request 4: replay the Paris call with a canned result and expect text.
    transcript += [
        toolCallMessage(parisToolCall),
        APIChatMessage(role: "tool", content: .text("{\"city\":\"Paris\",\"temperature_c\":21,\"condition\":\"clear\"}"), name: nil, tool_calls: nil, tool_call_id: parisToolCall.id)
    ]
    let parisAnswer = try await sendStreamingChatCompletion(streamed(transcript, maxTokens: 16), port: harness.port)
    XCTAssertEqual(parisAnswer.finalFinishReason, "stop")
    XCTAssertTrue(parisAnswer.toolCalls.isEmpty)
    XCTAssertFalse(parisAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterParisCall.totalHits
    }
    let cacheAfterParisAnswer = TokenPrefixCache.shared.snapshot()
    let countersAfterParisAnswer = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterParisAnswer.totalHits, cacheAfterParisCall.totalHits)
    XCTAssertGreaterThan(countersAfterParisAnswer.totalCacheReusePromptTokens, countersAfterParisCall.totalCacheReusePromptTokens)
}
/// Checks that a streaming request abandoned by the client still leaves a
/// prefix-cache entry behind, and that a later request with the same prompt
/// reuses it.
///
/// The first request is cancelled after its first content chunk; the test then
/// waits for `TokenPrefixCache` `totalEntries` to grow. The second request is
/// non-streaming with the same messages, after which `totalHits` and
/// `LiveCounters` `totalCacheReusePromptTokens` must both have grown.
func testStreamingDisconnectStoresPromptCacheForReuse() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    // Both requests share this exact prompt so the second can match the first.
    let prompt = [
        APIChatMessage(role: "user", content: .text("Count from one to twenty with commas, using many tokens."), name: nil, tool_calls: nil, tool_call_id: nil)
    ]

    // Same sampling settings for both requests; only token budget and streaming differ.
    func completionRequest(maxTokens: Int, stream: Bool) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: prompt,
            temperature: 0,
            top_p: 1,
            max_tokens: maxTokens,
            stream: stream,
            stop: nil,
            tools: nil,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Phase 1: start streaming, then drop the connection mid-response.
    let baselineCache = TokenPrefixCache.shared.snapshot()
    try await cancelStreamingChatCompletionAfterFirstContent(
        completionRequest(maxTokens: 64, stream: true),
        port: harness.port
    )
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > baselineCache.totalEntries
    }
    let cacheAfterDisconnect = TokenPrefixCache.shared.snapshot()
    let countersAfterDisconnect = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterDisconnect.totalEntries, baselineCache.totalEntries)

    // Phase 2: the same prompt, non-streaming, should now hit the cache.
    _ = try await sendChatCompletion(
        completionRequest(maxTokens: 8, stream: false),
        port: harness.port
    )
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterDisconnect.totalHits
    }
    let cacheAtEnd = TokenPrefixCache.shared.snapshot()
    let countersAtEnd = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAtEnd.totalHits, cacheAfterDisconnect.totalHits)
    XCTAssertGreaterThan(countersAtEnd.totalCacheReusePromptTokens, countersAfterDisconnect.totalCacheReusePromptTokens)
}
/// Checks the ordering of streamed events for a tool-call response: exactly one
/// role event, which comes first; at least one tool-call event; and a final
/// event that is last in the list, has finish reason `"tool_calls"`, and
/// carries usage.
func testStreamingToolCallChunksArriveInOpenAICompatibleOrder() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let toolCallRequest = APIChatCompletionRequest(
        model: "gemma",
        messages: [
            APIChatMessage(role: "user", content: .text("Call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
        ],
        temperature: 0,
        top_p: 1,
        max_tokens: 48,
        stream: true,
        stop: nil,
        tools: [mockWeatherTool],
        tool_choice: nil,
        frequency_penalty: nil,
        presence_penalty: nil,
        n: nil
    )
    let detailed = try await sendStreamingChatCompletionDetailed(toolCallRequest, port: harness.port)
    XCTAssertTrue(detailed.sawDone)
    XCTAssertFalse(detailed.events.isEmpty)

    // The stream must open with the assistant role event.
    let openingEvent = try XCTUnwrap(detailed.events.first)
    XCTAssertEqual(openingEvent.kind, .role)
    XCTAssertEqual(openingEvent.role, "assistant")

    // Positions of every tool-call event within the recorded list.
    let toolCallPositions = detailed.events.indices.filter { detailed.events[$0].kind == .toolCall }
    XCTAssertFalse(toolCallPositions.isEmpty)

    // The final event must be the last recorded event, after all tool-call events.
    let finalPosition = try XCTUnwrap(detailed.events.lastIndex(where: { $0.kind == .final }))
    XCTAssertEqual(finalPosition, detailed.events.count - 1)
    for position in toolCallPositions {
        XCTAssertLessThan(position, finalPosition)
    }

    let closingEvent = detailed.events[finalPosition]
    XCTAssertEqual(closingEvent.finishReason, "tool_calls")
    XCTAssertNotNil(closingEvent.usage)

    // The role must be announced exactly once.
    let roleEvents = detailed.events.filter { $0.kind == .role }
    XCTAssertEqual(roleEvents.count, 1)
}
/// Tool definition advertised to the model in the tool-calling tests: a
/// function named `weather` whose JSON-schema-style parameters declare one
/// required string property, `city`.
private var mockWeatherTool: APIToolDefinition {
    let weatherFunction = APIFunctionDefinition(
        name: "weather",
        description: "Look up weather for a city.",
        parameters: [
            "type": AnyCodable("object"),
            "properties": AnyCodable([
                "city": [
                    "type": "string",
                    "description": "City name"
                ]
            ]),
            "required": AnyCodable(["city"])
        ]
    )
    return APIToolDefinition(type: "function", function: weatherFunction)
}
/// Builds the fixture each end-to-end test runs against: resets the shared
/// counters and prefix cache, loads the "gemma" model, and starts an
/// `APIServer` on a randomly chosen local port.
///
/// - Returns: A harness holding the server, the model manager and the port.
/// - Throws: If the "gemma" config cannot be resolved, or if `waitUntil` throws.
private func makeHarness() async throws -> TestHarness {
    let manager = await MainActor.run { ModelManager() }
    let gemmaConfig = try XCTUnwrap(ModelConfig.resolve("gemma"))

    // Start from clean global state so hit/entry deltas belong to this test only.
    LiveCounters.shared.reset()
    TokenPrefixCache.shared.reset()

    await manager.loadModel(gemmaConfig)
    let modelIsReady = await MainActor.run { manager.isReady }
    XCTAssertTrue(modelIsReady)

    let apiServer = await MainActor.run { APIServer() }
    // NOTE(review): the port is picked at random without checking that it is
    // free, so a collision with another listener is possible.
    let listenPort = UInt16.random(in: 20_000...40_000)
    await MainActor.run {
        apiServer.start(modelManager: manager, port: Int(listenPort))
    }
    try await waitUntil(timeoutSeconds: 5) {
        await MainActor.run { apiServer.isRunning }
    }
    return TestHarness(server: apiServer, modelManager: manager, port: listenPort)
}
/// POSTs `request` as JSON to the local `/v1/chat/completions` endpoint and
/// decodes the complete (non-streamed) response body.
///
/// - Parameters:
///   - request: Chat completion request to encode as the POST body.
///   - port: Local port the server under test listens on.
/// - Returns: The decoded completion response.
/// - Throws: On encoding, transport or decoding errors, or when the response
///   is not an `HTTPURLResponse`. A non-200 status is recorded as a test
///   failure (with the body as message) but decoding is still attempted.
private func sendChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> APIChatCompletionResponse {
    var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
    post.httpMethod = "POST"
    post.setValue("application/json", forHTTPHeaderField: "Content-Type")
    post.httpBody = try JSONEncoder().encode(request)

    let (payload, rawResponse) = try await URLSession.shared.data(for: post)
    let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
    XCTAssertEqual(http.statusCode, 200, String(data: payload, encoding: .utf8) ?? "")
    return try JSONDecoder().decode(APIChatCompletionResponse.self, from: payload)
}
/// Sends a streaming chat completion and condenses the recorded events into a
/// `StreamingResult`: the number of role events, the concatenated content,
/// all tool calls in arrival order, and the finish reason and usage of the
/// last final event (nil when no final event was seen).
///
/// - Parameters:
///   - request: Chat completion request to send.
///   - port: Local port the server under test listens on.
/// - Throws: Whatever `sendStreamingChatCompletionDetailed` throws.
private func sendStreamingChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> StreamingResult {
    let detailed = try await sendStreamingChatCompletionDetailed(request, port: port)

    var roleEvents = 0
    var text = ""
    var calls: [APIToolCall] = []
    var closingEvent: StreamingEvent?
    for event in detailed.events {
        if event.kind == .role {
            roleEvents += 1
        }
        if let piece = event.content {
            text += piece
        }
        calls.append(contentsOf: event.toolCalls)
        // Later final events overwrite earlier ones, so the last one wins.
        if event.kind == .final {
            closingEvent = event
        }
    }

    return StreamingResult(
        roleDeltaCount: roleEvents,
        content: text,
        toolCalls: calls,
        finalFinishReason: closingEvent?.finishReason,
        usage: closingEvent?.usage,
        sawDone: detailed.sawDone
    )
}
/// POSTs `request` to the local `/v1/chat/completions` endpoint, reads the
/// response line by line, and records one `StreamingEvent` per interesting
/// field of each decoded chunk.
///
/// Only lines starting with `data: ` are considered. Reading stops at the
/// `[DONE]` payload. For each chunk, the first choice may contribute up to four
/// events, in this order: role (only when the delta role is "assistant"),
/// content, tool call, and final (when a finish reason is present).
///
/// - Parameters:
///   - request: Chat completion request to encode as the POST body.
///   - port: Local port the server under test listens on.
/// - Returns: The recorded events plus whether `[DONE]` was seen. On a non-200
///   status a test failure is recorded and an empty result is returned.
/// - Throws: On encoding, transport or chunk-decoding errors, or when the
///   response is not an `HTTPURLResponse`.
private func sendStreamingChatCompletionDetailed(_ request: APIChatCompletionRequest, port: UInt16) async throws -> DetailedStreamingResult {
    var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
    post.httpMethod = "POST"
    post.setValue("application/json", forHTTPHeaderField: "Content-Type")
    post.httpBody = try JSONEncoder().encode(request)

    let (stream, rawResponse) = try await URLSession.shared.bytes(for: post)
    let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
    if http.statusCode != 200 {
        // Drain the error body so the failure message shows what the server said.
        var errorBody = ""
        for try await line in stream.lines {
            errorBody += line
        }
        XCTFail("Expected 200 response, got \(http.statusCode): \(errorBody)")
        return DetailedStreamingResult(events: [], sawDone: false)
    }

    let decoder = JSONDecoder()
    var recorded: [StreamingEvent] = []
    var reachedDone = false
    for try await line in stream.lines {
        guard line.hasPrefix("data: ") else { continue }
        // Strip the 6-character "data: " prefix.
        let payload = String(line.dropFirst(6))
        if payload == "[DONE]" {
            reachedDone = true
            break
        }
        let chunk = try decoder.decode(APIChatCompletionChunk.self, from: Data(payload.utf8))
        let choice = chunk.choices.first
        if let role = choice?.delta.role, role == "assistant" {
            recorded.append(StreamingEvent(kind: .role, role: role, content: nil, toolCalls: [], finishReason: nil, usage: nil))
        }
        if let text = choice?.delta.content {
            recorded.append(StreamingEvent(kind: .content, role: nil, content: text, toolCalls: [], finishReason: nil, usage: nil))
        }
        if let calls = choice?.delta.tool_calls {
            recorded.append(StreamingEvent(kind: .toolCall, role: nil, content: nil, toolCalls: calls, finishReason: nil, usage: nil))
        }
        if let reason = choice?.finish_reason {
            recorded.append(StreamingEvent(kind: .final, role: nil, content: nil, toolCalls: [], finishReason: reason, usage: chunk.usage))
        }
    }
    return DetailedStreamingResult(events: recorded, sawDone: reachedDone)
}
/// Starts a streaming chat completion, waits until the first non-empty content
/// chunk has arrived, and then abruptly tears the connection down to simulate
/// a client that disconnects mid-stream.
///
/// The request runs on a dedicated ephemeral `URLSession` inside a child
/// `Task`. After the first content chunk the reader stops consuming by
/// sleeping, so the response is still in flight when the session is
/// invalidated.
///
/// Teardown (invalidate the session, cancel the task, await its completion)
/// always runs, even if waiting for the first chunk throws; the wait error is
/// rethrown afterwards. Errors raised inside the reader task are intentionally
/// ignored, because cancellation makes the task fail by design.
///
/// - Parameters:
///   - request: Chat completion request to send; callers pass a streaming one.
///   - port: Local port the server under test listens on.
/// - Throws: On request-encoding errors, or whatever `waitUntil` throws.
private func cancelStreamingChatCompletionAfterFirstContent(_ request: APIChatCompletionRequest, port: UInt16) async throws {
    let url = URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!
    var urlRequest = URLRequest(url: url)
    urlRequest.httpMethod = "POST"
    urlRequest.setValue("application/json", forHTTPHeaderField: "Content-Type")
    urlRequest.httpBody = try JSONEncoder().encode(request)

    let observer = StreamCancellationObserver()
    // A private session so invalidating it cannot affect other requests.
    let session = URLSession(configuration: .ephemeral)
    let task = Task {
        let (bytes, response) = try await session.bytes(for: urlRequest)
        let httpResponse = try XCTUnwrap(response as? HTTPURLResponse)
        XCTAssertEqual(httpResponse.statusCode, 200)
        for try await line in bytes.lines {
            guard line.hasPrefix("data: ") else { continue }
            let payload = String(line.dropFirst(6))
            if payload == "[DONE]" {
                break
            }
            guard let data = payload.data(using: .utf8) else { continue }
            let chunk = try JSONDecoder().decode(APIChatCompletionChunk.self, from: data)
            if let deltaContent = chunk.choices.first?.delta.content, !deltaContent.isEmpty {
                await observer.markFirstContentSeen()
                // Stop reading while the response is still open; the sleep is
                // cut short by the cancellation issued below.
                try await Task.sleep(nanoseconds: 30_000_000_000)
            }
        }
    }

    // Remember a failed wait instead of returning early, so the session and
    // the reader task are always cleaned up.
    var waitError: Error?
    do {
        try await waitUntil(timeoutSeconds: 10) {
            await observer.hasSeenFirstContent
        }
    } catch {
        waitError = error
    }

    session.invalidateAndCancel()
    task.cancel()
    _ = try? await task.value

    if let waitError = waitError {
        throw waitError
    }
}
/// Polls `condition` until it returns true or the timeout elapses.
///
/// The condition is evaluated once per interval while the deadline has not
/// passed, and one last time after it has. That final check prevents a
/// spurious failure when the condition became true during the last sleep, and
/// guarantees at least one evaluation even for a zero or negative timeout.
///
/// On timeout a test failure is recorded with `XCTFail`; the function then
/// returns normally, so the caller keeps running.
///
/// - Parameters:
///   - timeoutSeconds: How long to keep polling, in seconds.
///   - intervalNanoseconds: Pause between polls; defaults to 100 ms.
///   - condition: Asynchronous predicate to poll.
/// - Throws: Only if `Task.sleep` throws, i.e. the surrounding task was cancelled.
private func waitUntil(
    timeoutSeconds: TimeInterval,
    intervalNanoseconds: UInt64 = 100_000_000,
    condition: @escaping () async -> Bool
) async throws {
    let deadline = Date().addingTimeInterval(timeoutSeconds)
    while Date() < deadline {
        if await condition() {
            return
        }
        try await Task.sleep(nanoseconds: intervalNanoseconds)
    }
    // Final check: the condition may have flipped during the last sleep, or the
    // loop may never have run at all.
    if await condition() {
        return
    }
    XCTFail("Condition not met before timeout")
}
}
/// Actor-isolated one-way flag recording that the streaming reader has received
/// its first content chunk. The flag starts false and can only be set to true.
private actor StreamCancellationObserver {
    /// True once `markFirstContentSeen()` has been called.
    private(set) var hasSeenFirstContent = false

    /// Records that the first content chunk has arrived.
    func markFirstContentSeen() {
        hasSeenFirstContent = true
    }
}
/// Outcome of reading one streamed chat-completion response event by event.
private struct DetailedStreamingResult {
/// Events recorded from the decoded stream chunks, in arrival order.
let events: [StreamingEvent]
/// True when the `[DONE]` payload was read before the stream ended.
let sawDone: Bool
}
/// One observation extracted from a streamed chat-completion chunk.
/// Which of the optional fields is populated depends on `kind`.
private struct StreamingEvent {
/// The part of a chunk this event was recorded for.
enum Kind {
/// The chunk's delta carried a role.
case role
/// The chunk's delta carried content text.
case content
/// The chunk's delta carried tool calls.
case toolCall
/// The chunk's choice carried a finish reason.
case final
}
/// What this event represents.
let kind: Kind
/// Role from the delta; set for `.role` events.
let role: String?
/// Content text from the delta; set for `.content` events.
let content: String?
/// Tool calls from the delta; non-empty only for `.toolCall` events.
let toolCalls: [APIToolCall]
/// Finish reason from the choice; set for `.final` events.
let finishReason: String?
/// Usage attached to the chunk; only passed along for `.final` events and may still be nil.
let usage: APIUsageInfo?
}
/// Condensed summary of a streamed chat-completion response.
private struct StreamingResult {
/// Number of role events seen in the stream.
let roleDeltaCount: Int
/// All content pieces concatenated in arrival order.
let content: String
/// All tool calls from the stream, in arrival order.
let toolCalls: [APIToolCall]
/// Finish reason of the last final event, or nil when none was seen.
let finalFinishReason: String?
/// Usage of the last final event, or nil when none was seen or it carried none.
let usage: APIUsageInfo?
/// True when the `[DONE]` payload was read before the stream ended.
let sawDone: Bool
}
/// Bundle of the objects a single end-to-end test runs against.
private struct TestHarness {
/// Server under test.
let server: APIServer
/// Model manager the server was started with.
let modelManager: ModelManager
/// Local port passed to the server when it was started.
let port: UInt16
/// Tears the fixture down.
///
/// Stopping the server and unloading the model are scheduled in an
/// unstructured main-actor `Task` that this method does not wait for; the
/// shared prefix cache is reset synchronously, right away.
/// NOTE(review): because the Task is fire-and-forget, the cache reset can run
/// before the server has actually stopped — confirm that a request still in
/// flight cannot repopulate the cache afterwards.
func stop() {
Task { @MainActor in
server.stop()
modelManager.unloadModel()
}
TokenPrefixCache.shared.reset()
}
}

View File

@@ -2572,14 +2572,18 @@ Validation note: `PromptBuilder.swift` is now covered by both shaping-parity uni
### Phase 3: Integration
7. [x] **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
8. **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested.
Validation note: `APIServer.swift` now routes the API path through `PromptBuilder`, `InferenceEngine`, `TokenPrefixCache`, and `StreamingSSEEncoder`, and the full repository test workflow is green. Image-bearing requests intentionally bypass prefix-cache reuse for now until image fingerprinting is implemented.
### Phase 4: Statistics & Monitoring
9. **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer.
10. [x] **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
11. [x] **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly and `MonitorView.swift` has been rebuilt around current system state and prefix-cache visibility rather than session-era charts. The dashboard now exposes cache match quality from matched-vs-rebuilt prompt token counters, but it still does not expose TTFT, cache match depth, or vision timing because those `LiveCounters` signals have not been implemented yet.
### Phase 5: Advanced Cache Matching