feat: better hardening with unit tests and end-to-end tests

This commit is contained in:
2026-03-20 10:27:39 +01:00
parent e40a2f3c45
commit aadcc308a5
7 changed files with 1395 additions and 1326 deletions

View File

@@ -46,6 +46,7 @@
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */; };
C34F02550C584BB2547F0F6C /* ChatDocumentPackage.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */; };
CBA88529F8BE7BD0518994AD /* SceneSelectionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5B5ABDEB6F5C54856EB1A9E /* SceneSelectionView.swift */; };
CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */; };
CFEE79815DFB80E51FE3745A /* SceneStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = C234359924C542F07ED926A2 /* SceneStore.swift */; };
D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
@@ -53,7 +54,6 @@
E199D0BB09B61AC128AB093A /* CancellationToken.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3489501F2F8E1BA382347CFA /* CancellationToken.swift */; };
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */; };
EC4FC68608DDFA6A3DF133CC /* InferenceEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */; };
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */; };
F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
@@ -114,6 +114,7 @@
DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; };
E1E62624B6F285479CB33041 /* PromptBuilder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PromptBuilder.swift; sourceTree = "<group>"; };
E35452B166893B25E765FF70 /* InferenceStats.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceStats.swift; sourceTree = "<group>"; };
E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServerRewriteTests.swift; sourceTree = "<group>"; };
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoderTests.swift; sourceTree = "<group>"; };
E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; };
E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
@@ -121,7 +122,6 @@
F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
F4CE2D594F7433C76169151A /* MLXServerTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = MLXServerTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CancellationTokenTests.swift; sourceTree = "<group>"; };
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConversationSessionCache.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -172,6 +172,7 @@
154AF0C071A7DC02EB5F6F49 /* Server */ = {
isa = PBXGroup;
children = (
E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */,
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */,
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */,
D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */,
@@ -263,7 +264,6 @@
F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
3D08828E16B17EF02C14243E /* APIServer.swift */,
3489501F2F8E1BA382347CFA /* CancellationToken.swift */,
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */,
7C1A89C076E717F87A60397D /* ImageDecoder.swift */,
02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */,
E1E62624B6F285479CB33041 /* PromptBuilder.swift */,
@@ -379,6 +379,7 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */,
962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */,
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */,
8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */,
@@ -406,7 +407,6 @@
85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */,
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
A146BBA70CFBEC505BDCDF0D /* ImageDecoder.swift in Sources */,

View File

@@ -24,11 +24,15 @@ final class LiveCounters: @unchecked Sendable {
private var _isGenerating: Bool = false
private var _contextMax: Int = 0
private var _currentPhaseElapsed: TimeInterval = 0
private var _currentCacheMatchedPromptTokens: Int = 0
private var _currentCacheRebuiltPromptTokens: Int = 0
// Cumulative
private var _totalRequests: Int = 0
private var _totalPromptTokens: Int = 0
private var _totalGenerationTokens: Int = 0
private var _totalCacheReusePromptTokens: Int = 0
private var _totalCacheRebuildPromptTokens: Int = 0
private var _totalPreparingDuration: TimeInterval = 0
private var _totalSessionBuildDuration: TimeInterval = 0
private var _totalPrefillDuration: TimeInterval = 0
@@ -90,6 +94,26 @@ final class LiveCounters: @unchecked Sendable {
lock.unlock()
}
/// Records how much of a request's prompt was reused from cache versus rebuilt.
///
/// Updates both the cumulative reuse/rebuild totals and the per-request state,
/// then refreshes the "current" aggregate figures. Does nothing when `requestId`
/// is not present in `requestPhases`.
///
/// - Parameters:
///   - requestId: Identifier of a request tracked in `requestPhases`.
///   - matchedPromptTokens: Number of prompt tokens reported as matched. Clamped to
///     the range `0...promptTokenCount`.
///   - promptTokenCount: Total number of prompt tokens for the request. Negative
///     values are treated as zero.
func recordPrefillReuse(requestId: String, matchedPromptTokens: Int, promptTokenCount: Int) {
    lock.lock()
    // `defer` releases the lock on every exit path, including the early return below.
    defer { lock.unlock() }

    guard var state = requestPhases[requestId] else { return }

    // Keep the invariant matched + rebuilt == prompt size: a match count larger than
    // the prompt (or a negative prompt size) would otherwise inflate the reuse totals
    // and the derived match-quality percentages.
    let promptTotal = max(0, promptTokenCount)
    let matched = min(max(0, matchedPromptTokens), promptTotal)
    let rebuilt = promptTotal - matched

    _totalCacheReusePromptTokens += matched
    _totalCacheRebuildPromptTokens += rebuilt

    state.matchedPromptTokens = matched
    state.rebuiltPromptTokens = rebuilt
    requestPhases[requestId] = state

    refreshCurrentCachePromptStatsLocked()
}
func requestCompleted(requestId: String, generationTokens: Int) {
let now = Date()
lock.lock()
@@ -108,6 +132,7 @@ final class LiveCounters: @unchecked Sendable {
_isGenerating = _generatingRequests > 0
}
refreshCurrentPhaseElapsed(now: now)
refreshCurrentCachePromptStatsLocked()
lock.unlock()
}
@@ -126,9 +151,13 @@ final class LiveCounters: @unchecked Sendable {
_isGenerating = false
_contextMax = 0
_currentPhaseElapsed = 0
_currentCacheMatchedPromptTokens = 0
_currentCacheRebuiltPromptTokens = 0
_totalRequests = 0
_totalPromptTokens = 0
_totalGenerationTokens = 0
_totalCacheReusePromptTokens = 0
_totalCacheRebuildPromptTokens = 0
_totalPreparingDuration = 0
_totalSessionBuildDuration = 0
_totalPrefillDuration = 0
@@ -154,9 +183,13 @@ final class LiveCounters: @unchecked Sendable {
isGenerating: _isGenerating,
contextMax: _contextMax,
currentPhaseElapsed: _currentPhaseElapsed,
currentCacheMatchedPromptTokens: _currentCacheMatchedPromptTokens,
currentCacheRebuiltPromptTokens: _currentCacheRebuiltPromptTokens,
totalRequests: _totalRequests,
totalPromptTokens: _totalPromptTokens,
totalGenerationTokens: _totalGenerationTokens,
totalCacheReusePromptTokens: _totalCacheReusePromptTokens,
totalCacheRebuildPromptTokens: _totalCacheRebuildPromptTokens,
totalPreparingDuration: _totalPreparingDuration,
totalSessionBuildDuration: _totalSessionBuildDuration,
totalPrefillDuration: _totalPrefillDuration,
@@ -179,9 +212,13 @@ final class LiveCounters: @unchecked Sendable {
let isGenerating: Bool
let contextMax: Int
let currentPhaseElapsed: TimeInterval
let currentCacheMatchedPromptTokens: Int
let currentCacheRebuiltPromptTokens: Int
let totalRequests: Int
let totalPromptTokens: Int
let totalGenerationTokens: Int
let totalCacheReusePromptTokens: Int
let totalCacheRebuildPromptTokens: Int
let totalPreparingDuration: TimeInterval
let totalSessionBuildDuration: TimeInterval
let totalPrefillDuration: TimeInterval
@@ -231,9 +268,16 @@ final class LiveCounters: @unchecked Sendable {
_currentPhaseElapsed = requestPhases.values.map { now.timeIntervalSince($0.phaseStartedAt) }.max() ?? 0
}
/// Recomputes the "current" matched and rebuilt prompt-token figures by summing
/// the per-request values over every entry in `requestPhases`.
///
/// The visible call sites invoke this while `lock` is held; it takes no lock itself.
private func refreshCurrentCachePromptStatsLocked() {
    var matchedSum = 0
    var rebuiltSum = 0
    for tracked in requestPhases.values {
        matchedSum += tracked.matchedPromptTokens
        rebuiltSum += tracked.rebuiltPromptTokens
    }
    _currentCacheMatchedPromptTokens = matchedSum
    _currentCacheRebuiltPromptTokens = rebuiltSum
}
/// Per-request bookkeeping stored in `requestPhases`, keyed by request id.
private struct RequestState {
// Phase the request is currently in.
var phase: RequestPhase
// Start time of the current phase; used to compute the elapsed phase time.
var phaseStartedAt: Date
// Prompt tokens matched from cache for this request; set by `recordPrefillReuse`.
var matchedPromptTokens: Int = 0
// Prompt tokens that were not matched and had to be rebuilt; set by `recordPrefillReuse`.
var rebuiltPromptTokens: Int = 0
}
enum RequestPhase {
@@ -264,17 +308,20 @@ final class InferenceStats {
var contextUsed: Int = 0
var contextMax: Int = 0
var currentPhaseElapsed: TimeInterval = 0
var currentCacheMatchedPromptTokens: Int = 0
var currentCacheRebuiltPromptTokens: Int = 0
// MARK: - Cumulative counters
var totalRequests: Int = 0
var totalPromptTokens: Int = 0
var totalGenerationTokens: Int = 0
var totalCacheReusePromptTokens: Int = 0
var totalCacheRebuildPromptTokens: Int = 0
var totalCacheHits: Int = 0
var totalCacheMisses: Int = 0
var totalCacheEvictions: Int = 0
var totalCacheReusePromptTokens: Int = 0
var totalCacheRebuildPromptTokens: Int = 0
var cacheHitRatePercent: Double = 0
var totalPreparingDuration: TimeInterval = 0
var totalSessionBuildDuration: TimeInterval = 0
var totalPrefillDuration: TimeInterval = 0
@@ -283,12 +330,11 @@ final class InferenceStats {
// MARK: - Cache state
var cacheEntryCount: Int = 0
var warmCacheEntryCount: Int = 0
var activeCacheEntryCount: Int = 0
var generatingCacheEntryCount: Int = 0
var cacheEstimatedBytes: Int = 0
var cacheEstimatedTokens: Int = 0
var cachedSessions: [ConversationSessionCache.SessionSummary] = []
var cacheMemoryBudgetBytes: Int = 0
var cacheMemoryUsagePercent: Double = 0
var cachedEntries: [TokenPrefixCache.EntrySummary] = []
// MARK: - Time series data (ring buffers for charts)
@@ -302,13 +348,14 @@ final class InferenceStats {
private(set) var promptTokenHistory: [DataPoint] = []
private(set) var generationTokenHistory: [DataPoint] = []
private(set) var cacheEntryHistory: [DataPoint] = []
private(set) var activeSessionHistory: [DataPoint] = []
private(set) var cacheFootprintHistory: [DataPoint] = []
private(set) var cacheReuseHistory: [DataPoint] = []
private(set) var cacheRebuildHistory: [DataPoint] = []
private(set) var cacheHitRateHistory: [DataPoint] = []
private(set) var cacheMemoryPressureHistory: [DataPoint] = []
private(set) var currentPhaseElapsedHistory: [DataPoint] = []
private(set) var prefillDurationHistory: [DataPoint] = []
private(set) var sessionBuildDurationHistory: [DataPoint] = []
private(set) var cacheReusePromptHistory: [DataPoint] = []
private(set) var cacheRebuildPromptHistory: [DataPoint] = []
private(set) var cacheMatchQualityHistory: [DataPoint] = []
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
@@ -316,10 +363,9 @@ final class InferenceStats {
private var sampleTimer: Timer?
private var lastGenerationTokenCount: Int = 0
private var lastPromptTokenCount: Int = 0
private var lastCacheReuseTokenCount: Int = 0
private var lastCacheRebuildTokenCount: Int = 0
private var lastPrefillDuration: TimeInterval = 0
private var lastSessionBuildDuration: TimeInterval = 0
private var lastCacheReusePromptTokenCount: Int = 0
private var lastCacheRebuildPromptTokenCount: Int = 0
func startSampling() {
guard sampleTimer == nil else { return }
@@ -338,7 +384,7 @@ final class InferenceStats {
private func recordSample() {
// Pull live values from the thread-safe counters
let snap = LiveCounters.shared.snapshot()
let cache = ConversationSessionCache.shared.snapshot()
let cache = TokenPrefixCache.shared.snapshot()
activeRequests = snap.activeRequests
preparingRequests = snap.preparingRequests
@@ -353,9 +399,13 @@ final class InferenceStats {
contextMax = snap.contextMax
contextUsed = snap.promptTokens + snap.generationTokens
currentPhaseElapsed = snap.currentPhaseElapsed
currentCacheMatchedPromptTokens = snap.currentCacheMatchedPromptTokens
currentCacheRebuiltPromptTokens = snap.currentCacheRebuiltPromptTokens
totalRequests = snap.totalRequests
totalPromptTokens = snap.totalPromptTokens
totalGenerationTokens = snap.totalGenerationTokens
totalCacheReusePromptTokens = snap.totalCacheReusePromptTokens
totalCacheRebuildPromptTokens = snap.totalCacheRebuildPromptTokens
totalPreparingDuration = snap.totalPreparingDuration
totalSessionBuildDuration = snap.totalSessionBuildDuration
totalPrefillDuration = snap.totalPrefillDuration
@@ -363,41 +413,41 @@ final class InferenceStats {
totalCacheHits = cache.totalHits
totalCacheMisses = cache.totalMisses
totalCacheEvictions = cache.totalEvictions
totalCacheReusePromptTokens = cache.totalReusePromptTokens
totalCacheRebuildPromptTokens = cache.totalRebuildPromptTokens
cacheHitRatePercent = cache.hitRate
cacheEntryCount = cache.totalEntries
warmCacheEntryCount = cache.warmEntries
activeCacheEntryCount = cache.activeEntries
generatingCacheEntryCount = cache.generatingEntries
cacheEstimatedBytes = cache.estimatedBytes
cacheEstimatedTokens = cache.cachedTokenEstimate
cachedSessions = cache.sessions
cacheEstimatedTokens = cache.totalCachedTokens
cacheMemoryBudgetBytes = cache.memoryBudgetBytes
cacheMemoryUsagePercent = cache.memoryUsagePercent
cachedEntries = cache.entries
let now = Date.now
let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration
let cacheReusePromptDelta = snap.totalCacheReusePromptTokens - lastCacheReusePromptTokenCount
let cacheRebuildPromptDelta = snap.totalCacheRebuildPromptTokens - lastCacheRebuildPromptTokenCount
let cacheMatchQualityDelta = cacheReusePromptDelta + cacheRebuildPromptDelta > 0
? (Double(cacheReusePromptDelta) / Double(cacheReusePromptDelta + cacheRebuildPromptDelta)) * 100
: 0
lastGenerationTokenCount = snap.totalGenerationTokens
lastPromptTokenCount = snap.totalPromptTokens
lastCacheReuseTokenCount = cache.totalReusePromptTokens
lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
lastPrefillDuration = snap.totalPrefillDuration
lastSessionBuildDuration = snap.totalSessionBuildDuration
lastCacheReusePromptTokenCount = snap.totalCacheReusePromptTokens
lastCacheRebuildPromptTokenCount = snap.totalCacheRebuildPromptTokens
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries)))
activeSessionHistory.append(DataPoint(timestamp: now, value: Double(cache.activeEntries)))
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
cacheHitRateHistory.append(DataPoint(timestamp: now, value: cache.hitRate))
cacheMemoryPressureHistory.append(DataPoint(timestamp: now, value: cache.memoryUsagePercent))
currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta))
cacheReusePromptHistory.append(DataPoint(timestamp: now, value: Double(cacheReusePromptDelta)))
cacheRebuildPromptHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildPromptDelta)))
cacheMatchQualityHistory.append(DataPoint(timestamp: now, value: cacheMatchQualityDelta))
if tokenRateHistory.count > Self.maxHistoryPoints {
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -411,17 +461,14 @@ final class InferenceStats {
if cacheEntryHistory.count > Self.maxHistoryPoints {
cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints)
}
if activeSessionHistory.count > Self.maxHistoryPoints {
activeSessionHistory.removeFirst(activeSessionHistory.count - Self.maxHistoryPoints)
}
if cacheFootprintHistory.count > Self.maxHistoryPoints {
cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints)
}
if cacheReuseHistory.count > Self.maxHistoryPoints {
cacheReuseHistory.removeFirst(cacheReuseHistory.count - Self.maxHistoryPoints)
if cacheHitRateHistory.count > Self.maxHistoryPoints {
cacheHitRateHistory.removeFirst(cacheHitRateHistory.count - Self.maxHistoryPoints)
}
if cacheRebuildHistory.count > Self.maxHistoryPoints {
cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
if cacheMemoryPressureHistory.count > Self.maxHistoryPoints {
cacheMemoryPressureHistory.removeFirst(cacheMemoryPressureHistory.count - Self.maxHistoryPoints)
}
if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
@@ -429,14 +476,20 @@ final class InferenceStats {
if prefillDurationHistory.count > Self.maxHistoryPoints {
prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
}
if sessionBuildDurationHistory.count > Self.maxHistoryPoints {
sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints)
if cacheReusePromptHistory.count > Self.maxHistoryPoints {
cacheReusePromptHistory.removeFirst(cacheReusePromptHistory.count - Self.maxHistoryPoints)
}
if cacheRebuildPromptHistory.count > Self.maxHistoryPoints {
cacheRebuildPromptHistory.removeFirst(cacheRebuildPromptHistory.count - Self.maxHistoryPoints)
}
if cacheMatchQualityHistory.count > Self.maxHistoryPoints {
cacheMatchQualityHistory.removeFirst(cacheMatchQualityHistory.count - Self.maxHistoryPoints)
}
}
func reset() {
LiveCounters.shared.reset()
ConversationSessionCache.shared.reset()
TokenPrefixCache.shared.reset()
activeRequests = 0
preparingRequests = 0
sessionBuildRequests = 0
@@ -450,9 +503,13 @@ final class InferenceStats {
contextUsed = 0
contextMax = 0
currentPhaseElapsed = 0
currentCacheMatchedPromptTokens = 0
currentCacheRebuiltPromptTokens = 0
totalRequests = 0
totalPromptTokens = 0
totalGenerationTokens = 0
totalCacheReusePromptTokens = 0
totalCacheRebuildPromptTokens = 0
totalPreparingDuration = 0
totalSessionBuildDuration = 0
totalPrefillDuration = 0
@@ -460,31 +517,41 @@ final class InferenceStats {
totalCacheHits = 0
totalCacheMisses = 0
totalCacheEvictions = 0
totalCacheReusePromptTokens = 0
totalCacheRebuildPromptTokens = 0
cacheHitRatePercent = 0
cacheEntryCount = 0
warmCacheEntryCount = 0
activeCacheEntryCount = 0
generatingCacheEntryCount = 0
cacheEstimatedBytes = 0
cacheEstimatedTokens = 0
cachedSessions.removeAll()
cacheMemoryBudgetBytes = 0
cacheMemoryUsagePercent = 0
cachedEntries.removeAll()
tokenRateHistory.removeAll()
promptTokenHistory.removeAll()
generationTokenHistory.removeAll()
cacheEntryHistory.removeAll()
activeSessionHistory.removeAll()
cacheFootprintHistory.removeAll()
cacheReuseHistory.removeAll()
cacheRebuildHistory.removeAll()
cacheHitRateHistory.removeAll()
cacheMemoryPressureHistory.removeAll()
currentPhaseElapsedHistory.removeAll()
prefillDurationHistory.removeAll()
sessionBuildDurationHistory.removeAll()
cacheReusePromptHistory.removeAll()
cacheRebuildPromptHistory.removeAll()
cacheMatchQualityHistory.removeAll()
lastGenerationTokenCount = 0
lastPromptTokenCount = 0
lastCacheReuseTokenCount = 0
lastCacheRebuildTokenCount = 0
lastPrefillDuration = 0
lastSessionBuildDuration = 0
lastCacheReusePromptTokenCount = 0
lastCacheRebuildPromptTokenCount = 0
}
/// Share of the current requests' prompt tokens that were matched from cache,
/// as a percentage in `0...100`. Returns 0 when no matched or rebuilt tokens
/// are currently recorded.
var currentCacheMatchQualityPercent: Double {
    let matched = currentCacheMatchedPromptTokens
    let combined = matched + currentCacheRebuiltPromptTokens
    if combined <= 0 {
        return 0
    }
    return (Double(matched) / Double(combined)) * 100
}
/// Share of all recorded prompt tokens that were reused from cache rather than
/// rebuilt, as a percentage in `0...100`. Returns 0 when both cumulative
/// counters are zero.
var totalCacheMatchQualityPercent: Double {
    let reused = totalCacheReusePromptTokens
    let combined = reused + totalCacheRebuildPromptTokens
    if combined <= 0 {
        return 0
    }
    return (Double(reused) / Double(combined)) * 100
}
}

View File

@@ -63,7 +63,7 @@ final class APIServer {
listener?.cancel()
listener = nil
isRunning = false
ConversationSessionCache.shared.invalidateAll()
TokenPrefixCache.shared.invalidateAll()
inferenceStats.stopSampling()
}
@@ -176,7 +176,7 @@ final class APIServer {
if let targetConfig = ModelConfig.resolve(requestedModel) {
if modelManager.currentModel?.id != targetConfig.id {
print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
ConversationSessionCache.shared.invalidateAll()
TokenPrefixCache.shared.invalidateAll()
await modelManager.loadModel(targetConfig)
}
}
@@ -187,7 +187,7 @@ final class APIServer {
if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
let config = ModelConfig.resolve(lastModelId) {
print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
ConversationSessionCache.shared.invalidateAll()
TokenPrefixCache.shared.invalidateAll()
await modelManager.loadModel(config)
}
@@ -260,110 +260,80 @@ final class APIServer {
temperature: Float(temperature),
topP: Float(topP)
)
// Feed all messages except the last as history, then send the last as the prompt
let chatMessages = preparedPrompt.chatMessages
let allButLast = Array(chatMessages.dropLast())
let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "")
let historySignatures = Array(preparedPrompt.messageSignatures.dropLast())
let currentModelId = modelManager.currentModel?.id ?? modelName
let lease = ConversationSessionCache.shared.checkoutSession(
modelId: currentModelId,
instructions: preparedPrompt.instructions,
historySignatures: historySignatures,
requestMessageCount: chatMessages.count,
estimatedPromptTokens: estimatedPromptTokens,
estimatedBytes: preparedPrompt.estimatedBytes
)
let session: ChatSession
if let reusableSession = lease.session {
print("[APIServer] Reusing cached session (\(allButLast.count) history messages)")
session = reusableSession
session.generateParameters = generateParams
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
} else {
print("[APIServer] Creating fresh session")
ConversationSessionCache.shared.markSessionBuild(entryId: lease.entryId)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .sessionBuild)
// Use `instructions:` for system/tool prompt (matches internal chat pattern).
// Only conversation turns go in `history:`; this avoids replaying the
// large tool prompt as history on every new session.
let instr = preparedPrompt.instructions.isEmpty ? nil : preparedPrompt.instructions
if !allButLast.isEmpty {
session = ChatSession(
container,
instructions: instr,
history: allButLast,
generateParameters: generateParams,
additionalContext: preparedPrompt.additionalContext
)
} else {
session = ChatSession(
container,
instructions: instr,
generateParameters: generateParams,
additionalContext: preparedPrompt.additionalContext
)
}
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
let engine = InferenceEngine(container: container)
let preparedInference: InferenceEngine.PreparedInference
do {
preparedInference = try await engine.prepare(preparedPrompt.userInput)
} catch {
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
return
}
// Extract images from the last message only (ChatSession.streamDetails takes images separately)
let lastImages = lastMessage.images
// Vision requests stay uncached until image fingerprinting lands.
let cacheKey = preparedInference.hasImages ? nil : preparedInference.tokens
let lease = cacheKey.map { TokenPrefixCache.shared.lookup(cacheKey: $0, modelId: currentModelId) }
?? TokenPrefixCache.CacheLease(entryId: UUID(), kvCache: nil, matchedTokenCount: 0, isHit: false)
let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool)
LiveCounters.shared.recordPrefillReuse(
requestId: requestId,
matchedPromptTokens: lease.matchedTokenCount,
promptTokenCount: preparedInference.tokens.count
)
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
let cancellation = CancellationToken()
let streamHandle: InferenceEngine.StreamHandle
do {
streamHandle = try await engine.stream(
InferenceEngine.InferenceRequest(
input: preparedInference.lmInput,
tokens: preparedInference.tokens,
parameters: generateParams,
cachedKV: lease.kvCache,
cachedTokenCount: lease.matchedTokenCount
),
cancellation: cancellation
)
} catch {
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
return
}
let result: GenerationOutcome
if isStream {
result = await handleStreamingResponse(
connection: connection,
requestId: requestId,
cacheEntryId: lease.entryId,
session: session,
prompt: lastMessage.content,
images: lastImages,
cancellation: cancellation,
stream: streamHandle.stream,
tools: request.tools,
created: created,
modelName: modelName,
isQwen: isQwen
modelName: modelName
)
} else {
result = await handleNonStreamingResponse(
connection: connection,
requestId: requestId,
cacheEntryId: lease.entryId,
session: session,
prompt: lastMessage.content,
images: lastImages,
stream: streamHandle.stream,
tools: request.tools,
created: created,
modelName: modelName,
isQwen: isQwen
modelName: modelName
)
}
if result.succeeded {
var cachedSignatures = preparedPrompt.messageSignatures
if let assistantHistoryText = result.assistantHistoryText {
cachedSignatures.append(
Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: [])
)
}
ConversationSessionCache.shared.completeRequest(
if let cacheKey,
result.succeeded || result.cancelled {
Self.storePromptCache(
streamHandle.workingCache,
promptTokenCount: preparedInference.tokens.count,
entryId: lease.entryId,
session: session,
requestMessageSignatures: cachedSignatures,
requestMessageCount: cachedSignatures.count,
estimatedPromptTokens: estimatedPromptTokens,
estimatedBytes: preparedPrompt.estimatedBytes,
promptTokens: result.promptTokens,
completionTokens: result.completionTokens
cacheKey: cacheKey,
modelId: currentModelId
)
} else {
ConversationSessionCache.shared.abandonRequest(entryId: lease.entryId)
}
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens)
@@ -375,53 +345,20 @@ final class APIServer {
private func handleNonStreamingResponse(
connection: NWConnection,
requestId: String,
cacheEntryId: UUID,
session: ChatSession,
prompt: String,
images: [UserInput.Image],
stream: AsyncStream<Generation>,
tools: [APIToolDefinition]?,
created: Int,
modelName: String,
isQwen: Bool
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
modelName: String
) async -> GenerationOutcome {
do {
var fullText = ""
var promptTokens = 0
var completionTokens = 0
var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
let stream = session.streamDetails(
to: prompt,
images: images,
videos: []
let outcome = await Self.collectGenerationOutcome(
stream: stream,
requestId: requestId,
cancellation: nil
)
for try await generation in stream {
switch generation {
case .chunk(let text):
fullText += text
completionTokens += 1
LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
case .info(let info):
promptTokens = info.promptTokenCount
completionTokens = info.generationTokenCount
ConversationSessionCache.shared.markGenerating(
entryId: cacheEntryId,
promptTokens: promptTokens,
completionTokens: completionTokens
)
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
if info.tokensPerSecond > 0 {
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
}
case .toolCall(let call):
frameworkToolCalls.append(call)
}
}
let resolved = Self.resolveAssistantResponse(
fullText: fullText,
frameworkToolCalls: frameworkToolCalls,
fullText: outcome.fullText,
frameworkToolCalls: outcome.frameworkToolCalls,
tools: tools
)
@@ -442,24 +379,26 @@ final class APIServer {
)
],
usage: APIUsageInfo(
prompt_tokens: promptTokens,
completion_tokens: completionTokens,
total_tokens: promptTokens + completionTokens
prompt_tokens: outcome.promptTokens,
completion_tokens: outcome.completionTokens,
total_tokens: outcome.promptTokens + outcome.completionTokens
)
)
if let json = try? JSONEncoder().encode(response) {
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
}
let assistantHistoryText = Self.normalizedAssistantHistoryContent(
content: resolved.content,
toolCalls: resolved.toolCalls,
isQwen: isQwen
return GenerationOutcome(
promptTokens: outcome.promptTokens,
completionTokens: outcome.completionTokens,
fullText: outcome.fullText,
frameworkToolCalls: outcome.frameworkToolCalls,
succeeded: true,
cancelled: false
)
return (promptTokens, completionTokens, assistantHistoryText, true)
} catch {
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
return (0, 0, nil, false)
return GenerationOutcome(promptTokens: 0, completionTokens: 0, fullText: "", frameworkToolCalls: [], succeeded: false, cancelled: false)
}
}
@@ -468,15 +407,12 @@ final class APIServer {
private func handleStreamingResponse(
connection: NWConnection,
requestId: String,
cacheEntryId: UUID,
session: ChatSession,
prompt: String,
images: [UserInput.Image],
cancellation: CancellationToken,
stream: AsyncStream<Generation>,
tools: [APIToolDefinition]?,
created: Int,
modelName: String,
isQwen: Bool
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
modelName: String
) async -> GenerationOutcome {
// Send SSE headers
let header = [
"HTTP/1.1 200 OK",
@@ -489,55 +425,34 @@ final class APIServer {
].joined(separator: "\r\n")
await Self.sendData(connection: connection, data: header.data(using: .utf8)!)
connection.stateUpdateHandler = { state in
switch state {
case .cancelled, .failed:
cancellation.cancel()
default:
break
}
}
// Send initial role chunk
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
id: requestId,
object: "chat.completion.chunk",
created: created,
model: modelName,
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: "assistant", content: nil, tool_calls: nil), finish_reason: nil)],
usage: nil
))
let encoder = StreamingSSEEncoder(requestId: requestId, created: created, modelName: modelName)
await Self.sendData(connection: connection, data: encoder.encodeRoleDelta("assistant"))
let hasTools = tools != nil && !(tools?.isEmpty ?? true)
// Run the generation loop OFF MainActor.
// ChatSession and NWConnection don't need MainActor.
// Running on MainActor caused every token to compete with SwiftUI
// rendering, creating back-pressure that coalesced all output.
let stream = session.streamDetails(
to: prompt,
images: images,
videos: []
let result = await Self.runStreamingLoop(
connection: connection,
stream: stream,
cancellation: cancellation,
requestId: requestId,
encoder: encoder
)
// Transfer non-Sendable values to the nonisolated loop.
// Safe because we don't touch session/images again until after the loop.
let result = await {
nonisolated(unsafe) let stream = stream
return await Self.runStreamingLoop(
connection: connection,
stream: stream,
requestId: requestId,
created: created,
modelName: modelName
)
}()
let (promptTokens, completionTokens, fullText, frameworkToolCalls, succeeded) = result
if promptTokens > 0 {
ConversationSessionCache.shared.markGenerating(
entryId: cacheEntryId,
promptTokens: promptTokens,
completionTokens: completionTokens
)
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
if result.cancelled {
connection.cancel()
return result
}
let resolved = Self.resolveAssistantResponse(
fullText: fullText,
frameworkToolCalls: frameworkToolCalls,
fullText: result.fullText,
frameworkToolCalls: result.frameworkToolCalls,
tools: tools
)
@@ -562,21 +477,16 @@ final class APIServer {
model: modelName,
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
usage: APIUsageInfo(
prompt_tokens: promptTokens,
completion_tokens: completionTokens,
total_tokens: promptTokens + completionTokens
prompt_tokens: result.promptTokens,
completion_tokens: result.completionTokens,
total_tokens: result.promptTokens + result.completionTokens
)
))
// Send [DONE] and close
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
connection.cancel()
let assistantHistoryText = Self.normalizedAssistantHistoryContent(
content: resolved.content,
toolCalls: resolved.toolCalls,
isQwen: isQwen
)
return (promptTokens, completionTokens, assistantHistoryText, succeeded)
return result
}
/// Run the token generation + SSE send loop entirely off MainActor.
@@ -584,54 +494,20 @@ final class APIServer {
/// multiple actor hops competing with SwiftUI, causing all output to batch.
nonisolated private static func runStreamingLoop(
connection: NWConnection,
stream: AsyncThrowingStream<Generation, any Error>,
stream: AsyncStream<Generation>,
cancellation: CancellationToken,
requestId: String,
created: Int,
modelName: String
) async -> (Int, Int, String, [MLXLMCommon.ToolCall], Bool) {
var promptTokens = 0
var completionTokens = 0
var fullText = ""
var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
do {
for try await generation in stream {
switch generation {
case .chunk(let text):
completionTokens += 1
fullText += text
// Update live counters directly no MainActor hop needed
LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
// Send directly no MainActor hop.
await sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
id: requestId,
object: "chat.completion.chunk",
created: created,
model: modelName,
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: text, tool_calls: nil), finish_reason: nil)],
usage: nil
))
case .info(let info):
promptTokens = info.promptTokenCount
completionTokens = info.generationTokenCount
if info.tokensPerSecond > 0 {
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
}
case .toolCall(let call):
frameworkToolCalls.append(call)
}
}
} catch {
let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
await sendData(connection: connection, data: errorEvent.data(using: .utf8)!)
return (promptTokens, completionTokens, fullText, frameworkToolCalls, false)
encoder: StreamingSSEEncoder
) async -> GenerationOutcome {
var outcome = await collectGenerationOutcome(
stream: stream,
requestId: requestId,
cancellation: cancellation
) { text in
await sendData(connection: connection, data: encoder.encodeContentDelta(text))
}
return (promptTokens, completionTokens, fullText, frameworkToolCalls, true)
outcome.succeeded = !outcome.cancelled
return outcome
}
/// Send an SSE event and wait for the protocol stack to process it.
@@ -651,6 +527,88 @@ final class APIServer {
}
}
/// Drains a generation stream and gathers token counts, the concatenated text,
/// and any tool calls into a single `GenerationOutcome`.
///
/// - Parameters:
///   - stream: Generation events to consume.
///   - requestId: Forwarded to `LiveCounters` when an `.info` event arrives.
///   - cancellation: Optional token polled once per received event. When it
///     reports cancelled, the event that was just received is discarded and
///     consumption stops.
///   - onChunk: Optional async callback awaited with each text chunk, after the
///     chunk has been counted and appended.
/// - Returns: The accumulated outcome; `succeeded` is always `!cancelled`.
nonisolated private static func collectGenerationOutcome(
    stream: AsyncStream<Generation>,
    requestId: String,
    cancellation: CancellationToken?,
    onChunk: ((String) async -> Void)? = nil
) async -> GenerationOutcome {
    var outcome = GenerationOutcome(
        promptTokens: 0,
        completionTokens: 0,
        fullText: "",
        frameworkToolCalls: [],
        succeeded: true,
        cancelled: false
    )

    for await event in stream {
        // Poll the token before touching the event so a cancelled request
        // neither counts nor forwards anything further.
        if cancellation?.isCancelled == true {
            outcome.cancelled = true
            break
        }

        switch event {
        case .chunk(let piece):
            outcome.completionTokens += 1
            outcome.fullText += piece
            LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: outcome.completionTokens)
            await onChunk?(piece)

        case .info(let stats):
            // The info event carries the authoritative counts and replaces the
            // per-chunk running tally.
            outcome.promptTokens = stats.promptTokenCount
            outcome.completionTokens = stats.generationTokenCount
            LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: outcome.promptTokens)
            if stats.tokensPerSecond > 0 {
                LiveCounters.shared.tokenGenerated(
                    tokensPerSecond: stats.tokensPerSecond,
                    totalGenerated: outcome.completionTokens
                )
            }

        case .toolCall(let invocation):
            outcome.frameworkToolCalls.append(invocation)
        }
    }

    outcome.succeeded = !outcome.cancelled
    return outcome
}
/// Trims `cache` back to the prompt length and, only if that succeeds, stores
/// it in `TokenPrefixCache.shared`.
///
/// - Parameters:
///   - cache: KV cache layers to trim and store.
///   - promptTokenCount: Number of tokens each layer is trimmed back to.
///   - entryId: Identifier passed through to the prefix cache.
///   - cacheKey: Key passed through to the prefix cache.
///   - modelId: Model identifier passed through to the prefix cache.
private static func storePromptCache(
    _ cache: [KVCache],
    promptTokenCount: Int,
    entryId: UUID,
    cacheKey: [Int],
    modelId: String
) {
    let trimmedToPrompt = trimGeneratedTokens(cache, promptTokenCount: promptTokenCount)
    if trimmedToPrompt {
        // Nothing is stored when any layer could not be trimmed.
        TokenPrefixCache.shared.store(
            entryId: entryId,
            kvCache: cache,
            cacheKey: cacheKey,
            modelId: modelId
        )
    }
}
/// Trims every layer whose offset exceeds `promptTokenCount` back down to it.
///
/// Layers are processed in order and trimmed in place. The walk stops at the
/// first layer that is not trimmable or that trims fewer tokens than requested;
/// layers visited before that point stay trimmed.
///
/// - Returns: `true` when every layer is at or below `promptTokenCount`
///   afterwards, `false` as soon as one layer cannot be fully trimmed.
private static func trimGeneratedTokens(_ cache: [KVCache], promptTokenCount: Int) -> Bool {
    for layer in cache {
        let surplus = layer.offset - promptTokenCount
        if surplus <= 0 {
            // Already at or below the prompt length; leave this layer alone.
            continue
        }
        if !layer.isTrimmable {
            return false
        }
        if layer.trim(surplus) != surplus {
            return false
        }
    }
    return true
}
// MARK: - HTTP helpers
private func sendResponse(
@@ -787,6 +745,15 @@ final class APIServer {
}
}
/// Result of consuming one generation stream: token counts, the text produced,
/// any tool calls, and how the run ended.
private struct GenerationOutcome {
    /// Prompt token count taken from the stream's `.info` event; 0 if none was seen.
    var promptTokens: Int
    /// Completion token count: incremented per chunk, then replaced by the
    /// `.info` event's `generationTokenCount` when that event arrives.
    var completionTokens: Int
    /// Concatenation of every text chunk received.
    var fullText: String
    /// Tool calls emitted by the stream, in arrival order.
    var frameworkToolCalls: [MLXLMCommon.ToolCall]
    /// `false` when the run was cancelled or the caller reports a failure.
    var succeeded: Bool
    /// `true` when the cancellation token was observed as cancelled mid-stream.
    var cancelled: Bool
}
// MARK: - HTTP request parser
private struct HTTPRequest {

View File

@@ -1,358 +0,0 @@
import Foundation
import MLXLMCommon
import os
/// Phase recorded on a `ConversationSessionCache` entry and exposed through its
/// session summaries. Raw values are capitalized labels
/// (presumably shown in the monitor UI — confirm against the consumer).
enum APISessionPhase: String, Sendable {
    /// No request in progress; set when a request completes or is abandoned.
    case idle = "Idle"
    /// Initial phase of an entry created on a cache miss.
    case sessionBuild = "Session Build"
    /// Set when an existing entry is checked out on a cache hit.
    case prefilling = "Prefilling"
    /// Set by `markGenerating` once token counts are being reported.
    case generating = "Generating"
}
/// Bounded cache of API chat sessions keyed by normalized conversation history.
/// The cache is internal-only and safe to sample from the monitor without involving MainActor.
///
/// All mutable state (`entries`, `totals`) is guarded by `lock`. Every public
/// method takes the lock on entry and releases it via `defer`, so each exit path
/// unlocks exactly once. Helpers suffixed `Locked` must only be called while
/// the lock is already held.
final class ConversationSessionCache: @unchecked Sendable {
    static let shared = ConversationSessionCache()

    private let lock = OSAllocatedUnfairLock()
    /// Maximum number of entries kept once a request completes.
    private let maxEntries = 8
    /// Maximum sum of `cachedTokenEstimate` across entries once a request completes.
    private let maxCachedTokens = 256_000
    /// Idle entries (no in-flight requests) older than this are pruned.
    private let idleTTL: TimeInterval = 10 * 60
    private var entries: [UUID: Entry] = [:]
    private var totals = Totals()

    private init() {}

    /// Handle returned by `checkoutSession`.
    struct Lease {
        /// Identifier of the entry reserved for this request.
        let entryId: UUID
        /// The cached session on a hit; `nil` on a miss.
        let session: ChatSession?
        /// Reuse token figure recorded for this checkout; 0 on a miss.
        let reusedPromptTokens: Int
        /// Whether an existing entry was matched.
        let cacheHit: Bool
    }

    /// Read-only view of a single entry, as produced by `snapshot()`.
    struct SessionSummary: Identifiable, Sendable {
        let id: UUID
        let modelId: String
        let phase: APISessionPhase
        let messageCount: Int
        let cachedTokenEstimate: Int
        let estimatedBytes: Int
        let inFlightRequests: Int
        let hitCount: Int
        let lastPromptTokens: Int
        let lastCompletionTokens: Int
        let lastReuseTokens: Int
        let createdAt: Date
        let lastAccessAt: Date
    }

    /// Aggregate view of the cache, as produced by `snapshot()`.
    struct Snapshot: Sendable {
        let totalEntries: Int
        /// Entries that currently hold a session.
        let warmEntries: Int
        /// Entries with at least one in-flight request.
        let activeEntries: Int
        /// Entries whose phase is `.generating`.
        let generatingEntries: Int
        let estimatedBytes: Int
        let cachedTokenEstimate: Int
        let totalHits: Int
        let totalMisses: Int
        let totalEvictions: Int
        let totalReusePromptTokens: Int
        let totalRebuildPromptTokens: Int
        /// Per-entry summaries, busiest first, then most recently accessed first.
        let sessions: [SessionSummary]
    }

    /// Reserves an entry for a request, reusing a cached session when one matches.
    ///
    /// A cached entry matches when it has the same model and instructions hash,
    /// holds a session, has no in-flight request, and its recorded history is a
    /// prefix of `historySignatures` that is at most one message shorter. Among
    /// matches, the one with the longest recorded history wins. On a miss a new
    /// session-less entry is created in the `.sessionBuild` phase.
    ///
    /// Expired idle entries are pruned first. The caller must later balance this
    /// call with `completeRequest` or `abandonRequest`.
    func checkoutSession(
        modelId: String,
        instructions: String,
        historySignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int
    ) -> Lease {
        lock.lock()
        defer { lock.unlock() }

        let now = Date()
        pruneExpiredLocked(now: now)

        let instructionsHash = Self.stableHash(instructions)
        let match = entries
            .values
            .filter {
                $0.modelId == modelId
                    && $0.instructionsHash == instructionsHash
                    && $0.session != nil
                    && $0.inFlightRequests == 0
                    && Self.historyMatches(cached: $0.requestMessageSignatures, incoming: historySignatures)
            }
            .max { lhs, rhs in
                lhs.requestMessageSignatures.count < rhs.requestMessageSignatures.count
            }

        if var entry = match {
            entry.inFlightRequests += 1
            entry.lastAccessAt = now
            entry.phase = .prefilling
            // NOTE(review): this takes the larger of the cached estimate and the
            // incoming prompt estimate; confirm `max` (rather than `min`) is the
            // intended definition of "reused" tokens.
            entry.lastReuseTokens = max(entry.cachedTokenEstimate, estimatedPromptTokens)
            entry.hitCount += 1
            entries[entry.id] = entry

            totals.totalHits += 1
            totals.totalReusePromptTokens += entry.lastReuseTokens

            return Lease(
                entryId: entry.id,
                session: entry.session,
                reusedPromptTokens: entry.lastReuseTokens,
                cacheHit: true
            )
        }

        let entryId = UUID()
        entries[entryId] = Entry(
            id: entryId,
            modelId: modelId,
            instructionsHash: instructionsHash,
            requestMessageSignatures: historySignatures,
            messageCount: requestMessageCount,
            cachedTokenEstimate: estimatedPromptTokens,
            estimatedBytes: estimatedBytes,
            createdAt: now,
            lastAccessAt: now,
            inFlightRequests: 1,
            hitCount: 0,
            phase: .sessionBuild,
            lastPromptTokens: 0,
            lastCompletionTokens: 0,
            lastReuseTokens: 0,
            session: nil
        )
        totals.totalMisses += 1
        totals.totalRebuildPromptTokens += estimatedPromptTokens

        return Lease(entryId: entryId, session: nil, reusedPromptTokens: 0, cacheHit: false)
    }

    /// Marks the entry as building its session. No-op for unknown ids.
    func markSessionBuild(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .sessionBuild)
    }

    /// Marks the entry as prefilling. No-op for unknown ids.
    func markPrefilling(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .prefilling)
    }

    /// Marks the entry as generating and records the latest token counts.
    /// The cached-token estimate only ever grows here. No-op for unknown ids.
    func markGenerating(entryId: UUID, promptTokens: Int, completionTokens: Int) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.phase = .generating
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.cachedTokenEstimate = max(entry.cachedTokenEstimate, promptTokens + completionTokens)
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Finishes a request: stores the session and updated history on the entry,
    /// releases one in-flight count, returns the entry to `.idle`, and then
    /// enforces the entry/token budget. No-op if the entry no longer exists.
    func completeRequest(
        entryId: UUID,
        session: ChatSession,
        requestMessageSignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int,
        promptTokens: Int,
        completionTokens: Int
    ) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        let now = Date()
        entry.session = session
        entry.requestMessageSignatures = requestMessageSignatures
        entry.messageCount = requestMessageCount
        entry.cachedTokenEstimate = max(estimatedPromptTokens, promptTokens + completionTokens)
        entry.estimatedBytes = estimatedBytes
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.lastAccessAt = now
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
        entry.phase = .idle
        entries[entryId] = entry
        enforceBudgetLocked(now: now)
    }

    /// Releases one in-flight count without storing a session. An entry that
    /// never received a session and has no remaining requests is removed;
    /// otherwise it returns to `.idle`. No-op for unknown ids.
    func abandonRequest(entryId: UUID) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
        if entry.session == nil && entry.inFlightRequests == 0 {
            entries.removeValue(forKey: entryId)
        } else {
            entry.phase = .idle
            entry.lastAccessAt = Date()
            entries[entryId] = entry
        }
    }

    /// Drops every entry, counting each one as an eviction. Totals are kept.
    func invalidateAll() {
        lock.lock()
        defer { lock.unlock() }

        totals.totalEvictions += entries.count
        entries.removeAll()
    }

    /// Drops every entry and zeroes all counters.
    func reset() {
        lock.lock()
        defer { lock.unlock() }

        entries.removeAll()
        totals = Totals()
    }

    /// Returns a consistent copy of the cache state. Expired idle entries are
    /// pruned first, so taking a snapshot can itself evict entries.
    func snapshot() -> Snapshot {
        lock.lock()
        defer { lock.unlock() }

        pruneExpiredLocked(now: Date())

        let allEntries = Array(entries.values)
        let sessions = allEntries
            .sorted {
                if $0.inFlightRequests != $1.inFlightRequests {
                    return $0.inFlightRequests > $1.inFlightRequests
                }
                return $0.lastAccessAt > $1.lastAccessAt
            }
            .map {
                SessionSummary(
                    id: $0.id,
                    modelId: $0.modelId,
                    phase: $0.phase,
                    messageCount: $0.messageCount,
                    cachedTokenEstimate: $0.cachedTokenEstimate,
                    estimatedBytes: $0.estimatedBytes,
                    inFlightRequests: $0.inFlightRequests,
                    hitCount: $0.hitCount,
                    lastPromptTokens: $0.lastPromptTokens,
                    lastCompletionTokens: $0.lastCompletionTokens,
                    lastReuseTokens: $0.lastReuseTokens,
                    createdAt: $0.createdAt,
                    lastAccessAt: $0.lastAccessAt
                )
            }

        return Snapshot(
            totalEntries: allEntries.count,
            warmEntries: allEntries.filter { $0.session != nil }.count,
            activeEntries: allEntries.filter { $0.inFlightRequests > 0 }.count,
            generatingEntries: allEntries.filter { $0.phase == .generating }.count,
            estimatedBytes: allEntries.reduce(0) { $0 + $1.estimatedBytes },
            cachedTokenEstimate: allEntries.reduce(0) { $0 + $1.cachedTokenEstimate },
            totalHits: totals.totalHits,
            totalMisses: totals.totalMisses,
            totalEvictions: totals.totalEvictions,
            totalReusePromptTokens: totals.totalReusePromptTokens,
            totalRebuildPromptTokens: totals.totalRebuildPromptTokens,
            sessions: sessions
        )
    }

    /// Sets the phase and refreshes the access time. No-op for unknown ids.
    private func updatePhase(entryId: UUID, phase: APISessionPhase) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.phase = phase
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Removes entries with no in-flight request whose last access is older
    /// than `idleTTL`, counting each as an eviction. Caller must hold `lock`.
    private func pruneExpiredLocked(now: Date) {
        let expired = entries.values.filter {
            $0.inFlightRequests == 0 && now.timeIntervalSince($0.lastAccessAt) > idleTTL
        }
        guard !expired.isEmpty else { return }
        for entry in expired {
            entries.removeValue(forKey: entry.id)
        }
        totals.totalEvictions += expired.count
    }

    /// Evicts idle entries until both the entry-count and cached-token limits
    /// are met, or until only in-flight entries remain. Caller must hold `lock`.
    private func enforceBudgetLocked(now: Date) {
        pruneExpiredLocked(now: now)

        func totalCachedTokens() -> Int {
            entries.values.reduce(0) { $0 + $1.cachedTokenEstimate }
        }

        while entries.count > maxEntries || totalCachedTokens() > maxCachedTokens {
            // Only the single best victim is needed, so a linear minimum scan
            // replaces sorting every candidate on each pass.
            guard let victim = entries.values
                .filter({ $0.inFlightRequests == 0 })
                .min(by: evictionOrder)
            else {
                // Everything left is in flight; nothing can be evicted.
                break
            }
            entries.removeValue(forKey: victim.id)
            totals.totalEvictions += 1
        }
    }

    /// Ordering for eviction: least recently accessed first, then the larger
    /// cached-token estimate, then the older creation time.
    private func evictionOrder(lhs: Entry, rhs: Entry) -> Bool {
        if lhs.lastAccessAt != rhs.lastAccessAt {
            return lhs.lastAccessAt < rhs.lastAccessAt
        }
        if lhs.cachedTokenEstimate != rhs.cachedTokenEstimate {
            return lhs.cachedTokenEstimate > rhs.cachedTokenEstimate
        }
        return lhs.createdAt < rhs.createdAt
    }

    /// Returns `true` when `cached` is a prefix of `incoming` and `incoming`
    /// has at most one more element than `cached`.
    private static func historyMatches(cached: [UInt64], incoming: [UInt64]) -> Bool {
        guard cached.count <= incoming.count,
              incoming.count <= cached.count + 1 else { return false }
        return incoming.starts(with: cached)
    }

    /// 64-bit FNV-1a hash over the UTF-8 bytes of `text`. Unlike `Hasher`, the
    /// result depends only on the input bytes.
    static func stableHash(_ text: String) -> UInt64 {
        var hash: UInt64 = 14_695_981_039_346_656_037
        for byte in text.utf8 {
            hash ^= UInt64(byte)
            hash &*= 1_099_511_628_211
        }
        return hash
    }

    /// Internal record for one cached conversation.
    private struct Entry {
        let id: UUID
        let modelId: String
        let instructionsHash: UInt64
        var requestMessageSignatures: [UInt64]
        var messageCount: Int
        var cachedTokenEstimate: Int
        var estimatedBytes: Int
        let createdAt: Date
        var lastAccessAt: Date
        var inFlightRequests: Int
        var hitCount: Int
        var phase: APISessionPhase
        var lastPromptTokens: Int
        var lastCompletionTokens: Int
        var lastReuseTokens: Int
        var session: ChatSession?
    }

    /// Running counters; cleared by `reset()` but not by `invalidateAll()`.
    private struct Totals {
        var totalHits: Int = 0
        var totalMisses: Int = 0
        var totalEvictions: Int = 0
        var totalReusePromptTokens: Int = 0
        var totalRebuildPromptTokens: Int = 0
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,736 @@
import Foundation
import XCTest
@testable import MLX_Server
final class APIServerRewriteTests: XCTestCase {
/// Sends the same non-streaming request twice. The first response must create
/// a prefix-cache entry; the second must register a cache hit and increase the
/// reused-prompt-token counter.
func testNonStreamingChatCompletionUsesStatelessServerPathAndCachesPrompt() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let prompt = APIChatMessage(
        role: "user",
        content: .text("Reply with exactly one short word."),
        name: nil,
        tool_calls: nil,
        tool_call_id: nil
    )
    let request = APIChatCompletionRequest(
        model: "gemma",
        messages: [prompt],
        temperature: 0,
        top_p: 1,
        max_tokens: 1,
        stream: false,
        stop: nil,
        tools: nil,
        tool_choice: nil,
        frequency_penalty: nil,
        presence_penalty: nil,
        n: nil
    )

    // First pass: basic response shape, then wait for the cache to be populated.
    let initialResponse = try await sendChatCompletion(request, port: harness.port)
    XCTAssertEqual(initialResponse.choices.count, 1)
    XCTAssertEqual(initialResponse.choices[0].message.role, "assistant")
    XCTAssertGreaterThan(initialResponse.usage.prompt_tokens, 0)
    XCTAssertGreaterThanOrEqual(initialResponse.usage.completion_tokens, 0)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let cacheBefore = TokenPrefixCache.shared.snapshot()
    let countersBefore = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheBefore.totalEntries, 0)

    // Second pass: the identical request must hit the cache.
    _ = try await sendChatCompletion(request, port: harness.port)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheBefore.totalHits
    }
    let cacheAfter = TokenPrefixCache.shared.snapshot()
    let countersAfter = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfter.totalHits, cacheBefore.totalHits)
    XCTAssertGreaterThan(countersAfter.totalCacheReusePromptTokens, countersBefore.totalCacheReusePromptTokens)
}
/// Streams three turns of one conversation, each extending the previous
/// history, and checks that turns two and three each register a prefix-cache
/// hit and increase the reused-prompt-token counter.
func testStreamingChatCompletionReusesCacheAcrossThreeProgressivelyLongerTurns() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    // Local builders: every turn uses the same sampling settings and only the
    // message history differs.
    func message(_ role: String, _ text: String) -> APIChatMessage {
        APIChatMessage(role: role, content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
    }
    func streamingRequest(_ history: [APIChatMessage]) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: history,
            temperature: 0,
            top_p: 1,
            max_tokens: 3,
            stream: true,
            stop: nil,
            tools: nil,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Turn 1: populates the cache.
    let turnOneHistory = [
        message("user", "Answer in one word: what color is the sky on a clear day?")
    ]
    let firstStream = try await sendStreamingChatCompletion(streamingRequest(turnOneHistory), port: harness.port)
    XCTAssertEqual(firstStream.roleDeltaCount, 1)
    XCTAssertTrue(firstStream.sawDone)
    XCTAssertEqual(firstStream.finalFinishReason, "stop")
    XCTAssertGreaterThan(firstStream.usage?.prompt_tokens ?? 0, 0)
    XCTAssertFalse(firstStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let firstSnapshot = TokenPrefixCache.shared.snapshot()
    let firstLiveSnapshot = LiveCounters.shared.snapshot()

    // Turn 2: previous history plus the assistant reply and a new question.
    let turnTwoHistory = turnOneHistory + [
        message("assistant", firstStream.content),
        message("user", "Answer in one word: what color is grass?")
    ]
    let secondStream = try await sendStreamingChatCompletion(streamingRequest(turnTwoHistory), port: harness.port)
    XCTAssertEqual(secondStream.roleDeltaCount, 1)
    XCTAssertTrue(secondStream.sawDone)
    XCTAssertEqual(secondStream.finalFinishReason, "stop")
    XCTAssertFalse(secondStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
    }
    let secondSnapshot = TokenPrefixCache.shared.snapshot()
    let secondLiveSnapshot = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
    XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)

    // Turn 3: extends the history once more.
    let turnThreeHistory = turnTwoHistory + [
        message("assistant", secondStream.content),
        message("user", "Answer in one word: what color is snow?")
    ]
    let thirdStream = try await sendStreamingChatCompletion(streamingRequest(turnThreeHistory), port: harness.port)
    XCTAssertEqual(thirdStream.roleDeltaCount, 1)
    XCTAssertTrue(thirdStream.sawDone)
    XCTAssertEqual(thirdStream.finalFinishReason, "stop")
    XCTAssertFalse(thirdStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > secondSnapshot.totalHits
    }
    let thirdSnapshot = TokenPrefixCache.shared.snapshot()
    let thirdLiveSnapshot = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(thirdSnapshot.totalHits, secondSnapshot.totalHits)
    XCTAssertGreaterThan(thirdLiveSnapshot.totalCacheReusePromptTokens, secondLiveSnapshot.totalCacheReusePromptTokens)
}
/// Streams a tool-call turn, then the answer that follows the tool result,
/// then a plain follow-up, and checks that the second and third requests each
/// register a prefix-cache hit and increase the reused-prompt-token counter.
func testStreamingChatCompletionReusesCacheAcrossToolBoundary() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let tools = [mockWeatherTool]

    // Local builders: requests differ only in history and token budget.
    func textMessage(_ role: String, _ text: String) -> APIChatMessage {
        APIChatMessage(role: role, content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
    }
    func streamingRequest(_ history: [APIChatMessage], maxTokens: Int) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: history,
            temperature: 0,
            top_p: 1,
            max_tokens: maxTokens,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Turn 1: the model is expected to emit a weather tool call.
    let openingHistory = [
        textMessage("user", "You must call the weather tool for Berlin. Do not answer directly.")
    ]
    let toolCallStream = try await sendStreamingChatCompletion(
        streamingRequest(openingHistory, maxTokens: 48),
        port: harness.port
    )
    XCTAssertEqual(toolCallStream.roleDeltaCount, 1)
    XCTAssertTrue(toolCallStream.sawDone)
    XCTAssertEqual(toolCallStream.finalFinishReason, "tool_calls")
    let toolCall = try XCTUnwrap(toolCallStream.toolCalls.first)
    XCTAssertEqual(toolCall.function.name, "weather")

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let afterToolCallSnapshot = TokenPrefixCache.shared.snapshot()
    let afterToolCallLiveSnapshot = LiveCounters.shared.snapshot()

    // Turn 2: replay the tool call and feed the tool result back.
    let toolResultHistory = openingHistory + [
        APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [toolCall], tool_call_id: nil),
        APIChatMessage(
            role: "tool",
            content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"),
            name: nil,
            tool_calls: nil,
            tool_call_id: toolCall.id
        )
    ]
    let directAnswerStream = try await sendStreamingChatCompletion(
        streamingRequest(toolResultHistory, maxTokens: 16),
        port: harness.port
    )
    XCTAssertEqual(directAnswerStream.roleDeltaCount, 1)
    XCTAssertTrue(directAnswerStream.sawDone)
    XCTAssertEqual(directAnswerStream.finalFinishReason, "stop")
    XCTAssertTrue(directAnswerStream.toolCalls.isEmpty)
    XCTAssertFalse(directAnswerStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > afterToolCallSnapshot.totalHits
    }
    let afterDirectAnswerSnapshot = TokenPrefixCache.shared.snapshot()
    let afterDirectAnswerLiveSnapshot = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(afterDirectAnswerSnapshot.totalHits, afterToolCallSnapshot.totalHits)
    XCTAssertGreaterThan(afterDirectAnswerLiveSnapshot.totalCacheReusePromptTokens, afterToolCallLiveSnapshot.totalCacheReusePromptTokens)

    // Turn 3: a plain follow-up on top of the tool exchange.
    let followUpHistory = toolResultHistory + [
        textMessage("assistant", directAnswerStream.content),
        textMessage("user", "Now compress that answer to two words.")
    ]
    let thirdStream = try await sendStreamingChatCompletion(
        streamingRequest(followUpHistory, maxTokens: 8),
        port: harness.port
    )
    XCTAssertEqual(thirdStream.roleDeltaCount, 1)
    XCTAssertTrue(thirdStream.sawDone)
    XCTAssertEqual(thirdStream.finalFinishReason, "stop")
    XCTAssertFalse(thirdStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > afterDirectAnswerSnapshot.totalHits
    }
    let finalSnapshot = TokenPrefixCache.shared.snapshot()
    let finalLiveSnapshot = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(finalSnapshot.totalHits, afterDirectAnswerSnapshot.totalHits)
    XCTAssertGreaterThan(finalLiveSnapshot.totalCacheReusePromptTokens, afterDirectAnswerLiveSnapshot.totalCacheReusePromptTokens)
}
/// Streams two full tool round-trips (Berlin, then Paris) within one growing
/// conversation and checks that every request after the first registers a
/// prefix-cache hit and increases the reused-prompt-token counter.
func testStreamingChatCompletionReusesCacheAcrossMultipleToolTurns() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let tools = [mockWeatherTool]

    // Local builders: requests differ only in history and token budget.
    func textMessage(_ role: String, _ text: String) -> APIChatMessage {
        APIChatMessage(role: role, content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
    }
    func streamingRequest(_ history: [APIChatMessage], maxTokens: Int) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: history,
            temperature: 0,
            top_p: 1,
            max_tokens: maxTokens,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Request 1: Berlin tool call.
    let berlinPromptHistory = [
        textMessage("user", "Call the weather tool for Berlin. Do not answer directly.")
    ]
    let firstToolTurn = try await sendStreamingChatCompletion(
        streamingRequest(berlinPromptHistory, maxTokens: 48),
        port: harness.port
    )
    XCTAssertEqual(firstToolTurn.finalFinishReason, "tool_calls")
    let berlinToolCall = try XCTUnwrap(firstToolTurn.toolCalls.first)
    XCTAssertEqual(berlinToolCall.function.name, "weather")

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let firstSnapshot = TokenPrefixCache.shared.snapshot()
    let firstLiveSnapshot = LiveCounters.shared.snapshot()

    // Request 2: Berlin tool result fed back, expecting a direct answer.
    let berlinToolResult = APIChatMessage(
        role: "tool",
        content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"),
        name: nil,
        tool_calls: nil,
        tool_call_id: berlinToolCall.id
    )
    let berlinResultHistory = berlinPromptHistory + [
        APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [berlinToolCall], tool_call_id: nil),
        berlinToolResult
    ]
    let berlinAnswer = try await sendStreamingChatCompletion(
        streamingRequest(berlinResultHistory, maxTokens: 16),
        port: harness.port
    )
    XCTAssertEqual(berlinAnswer.finalFinishReason, "stop")
    XCTAssertFalse(berlinAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
    }
    let secondSnapshot = TokenPrefixCache.shared.snapshot()
    let secondLiveSnapshot = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
    XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)

    // Request 3: Paris tool call on top of the Berlin exchange.
    let parisPromptHistory = berlinResultHistory + [
        textMessage("assistant", berlinAnswer.content),
        textMessage("user", "Now call the weather tool for Paris. Do not answer directly.")
    ]
    let secondToolTurn = try await sendStreamingChatCompletion(
        streamingRequest(parisPromptHistory, maxTokens: 48),
        port: harness.port
    )
    XCTAssertEqual(secondToolTurn.finalFinishReason, "tool_calls")
    let parisToolCall = try XCTUnwrap(secondToolTurn.toolCalls.first)
    XCTAssertEqual(parisToolCall.function.name, "weather")

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > secondSnapshot.totalHits
    }
    let thirdSnapshot = TokenPrefixCache.shared.snapshot()
    let thirdLiveSnapshot = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(thirdSnapshot.totalHits, secondSnapshot.totalHits)
    XCTAssertGreaterThan(thirdLiveSnapshot.totalCacheReusePromptTokens, secondLiveSnapshot.totalCacheReusePromptTokens)

    // Request 4: Paris tool result fed back, expecting a direct answer.
    let parisResultHistory = parisPromptHistory + [
        APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [parisToolCall], tool_call_id: nil),
        APIChatMessage(
            role: "tool",
            content: .text("{\"city\":\"Paris\",\"temperature_c\":21,\"condition\":\"clear\"}"),
            name: nil,
            tool_calls: nil,
            tool_call_id: parisToolCall.id
        )
    ]
    let parisAnswer = try await sendStreamingChatCompletion(
        streamingRequest(parisResultHistory, maxTokens: 16),
        port: harness.port
    )
    XCTAssertEqual(parisAnswer.finalFinishReason, "stop")
    XCTAssertTrue(parisAnswer.toolCalls.isEmpty)
    XCTAssertFalse(parisAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > thirdSnapshot.totalHits
    }
    let fourthSnapshot = TokenPrefixCache.shared.snapshot()
    let fourthLiveSnapshot = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(fourthSnapshot.totalHits, thirdSnapshot.totalHits)
    XCTAssertGreaterThan(fourthLiveSnapshot.totalCacheReusePromptTokens, thirdLiveSnapshot.totalCacheReusePromptTokens)
}
/// Checks that abandoning a streaming completion mid-response still leaves a
/// new `TokenPrefixCache` entry behind, and that replaying the same prompt
/// afterwards registers a cache hit and increases the reused-prompt-token counter.
func testStreamingDisconnectStoresPromptCacheForReuse() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let countingPrompt = APIChatMessage(role: "user", content: .text("Count from one to twenty with commas, using many tokens."), name: nil, tool_calls: nil, tool_call_id: nil)
    let request = APIChatCompletionRequest(
        model: "gemma",
        messages: [countingPrompt],
        temperature: 0,
        top_p: 1,
        max_tokens: 64,
        stream: true,
        stop: nil,
        tools: nil,
        tool_choice: nil,
        frequency_penalty: nil,
        presence_penalty: nil,
        n: nil
    )

    // Phase 1: start streaming, then drop the connection once content arrives.
    let baseline = TokenPrefixCache.shared.snapshot()
    try await cancelStreamingChatCompletionAfterFirstContent(request, port: harness.port)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > baseline.totalEntries
    }
    let cacheAfterDisconnect = TokenPrefixCache.shared.snapshot()
    let countersAfterDisconnect = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterDisconnect.totalEntries, baseline.totalEntries)

    // Phase 2: replay the same messages without streaming and with a smaller
    // token budget; only `max_tokens` and `stream` differ from the first request.
    let replay = APIChatCompletionRequest(
        model: request.model,
        messages: request.messages,
        temperature: request.temperature,
        top_p: request.top_p,
        max_tokens: 8,
        stream: false,
        stop: request.stop,
        tools: request.tools,
        tool_choice: request.tool_choice,
        frequency_penalty: request.frequency_penalty,
        presence_penalty: request.presence_penalty,
        n: request.n
    )
    _ = try await sendChatCompletion(replay, port: harness.port)
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterDisconnect.totalHits
    }

    let cacheAfterReplay = TokenPrefixCache.shared.snapshot()
    let countersAfterReplay = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterReplay.totalHits, cacheAfterDisconnect.totalHits)
    XCTAssertGreaterThan(countersAfterReplay.totalCacheReusePromptTokens, countersAfterDisconnect.totalCacheReusePromptTokens)
}
/// Checks the ordering of streamed chunks for a tool-calling request: exactly
/// one assistant role event and it comes first, at least one tool-call event,
/// and a single closing event with finish reason `tool_calls` and usage data
/// that is the last event recorded before `[DONE]`.
func testStreamingToolCallChunksArriveInOpenAICompatibleOrder() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let toolRequest = APIChatCompletionRequest(
        model: "gemma",
        messages: [
            APIChatMessage(role: "user", content: .text("Call the weather tool for Berlin. Do not answer directly."), name: nil, tool_calls: nil, tool_call_id: nil)
        ],
        temperature: 0,
        top_p: 1,
        max_tokens: 48,
        stream: true,
        stop: nil,
        tools: [mockWeatherTool],
        tool_choice: nil,
        frequency_penalty: nil,
        presence_penalty: nil,
        n: nil
    )
    let stream = try await sendStreamingChatCompletionDetailed(toolRequest, port: harness.port)
    let events = stream.events

    XCTAssertTrue(stream.sawDone)
    XCTAssertFalse(events.isEmpty)

    // The stream must open with the assistant role delta.
    let openingEvent = try XCTUnwrap(events.first)
    XCTAssertEqual(openingEvent.kind, .role)
    XCTAssertEqual(openingEvent.role, "assistant")

    let toolCallPositions = events.indices.filter { events[$0].kind == .toolCall }
    XCTAssertFalse(toolCallPositions.isEmpty)

    // The closing event must be the very last one, after every tool-call event.
    let closingPosition = try XCTUnwrap(events.lastIndex(where: { $0.kind == .final }))
    XCTAssertEqual(closingPosition, events.count - 1)
    for position in toolCallPositions {
        XCTAssertLessThan(position, closingPosition)
    }

    let closingEvent = events[closingPosition]
    XCTAssertEqual(closingEvent.finishReason, "tool_calls")
    XCTAssertNotNil(closingEvent.usage)

    let roleEvents = events.filter { $0.kind == .role }
    XCTAssertEqual(roleEvents.count, 1)
}
/// Tool definition for a function named `weather` whose parameter schema is an
/// object with a single required string property, `city`.
private var mockWeatherTool: APIToolDefinition {
    let weatherFunction = APIFunctionDefinition(
        name: "weather",
        description: "Look up weather for a city.",
        parameters: [
            "type": AnyCodable("object"),
            "properties": AnyCodable([
                "city": [
                    "type": "string",
                    "description": "City name"
                ]
            ]),
            "required": AnyCodable(["city"])
        ]
    )
    return APIToolDefinition(type: "function", function: weatherFunction)
}
/// Builds a running test fixture: resets the shared counters and prefix cache,
/// loads the "gemma" model, and starts an `APIServer` on a randomly chosen
/// port in 20000...40000, waiting up to five seconds for it to report running.
///
/// - Returns: A `TestHarness` holding the server, the model manager and the port.
/// - Throws: If the "gemma" config cannot be resolved, or if waiting is interrupted.
private func makeHarness() async throws -> TestHarness {
    let manager = await MainActor.run { ModelManager() }
    let gemmaConfig = try XCTUnwrap(ModelConfig.resolve("gemma"))

    // Start every test from zeroed counters and an empty cache.
    LiveCounters.shared.reset()
    TokenPrefixCache.shared.reset()

    await manager.loadModel(gemmaConfig)
    let modelReady = await MainActor.run { manager.isReady }
    XCTAssertTrue(modelReady)

    let apiServer = await MainActor.run { APIServer() }
    let listenPort = UInt16.random(in: 20_000...40_000)
    await MainActor.run { apiServer.start(modelManager: manager, port: Int(listenPort)) }
    try await waitUntil(timeoutSeconds: 5) {
        await MainActor.run { apiServer.isRunning }
    }

    return TestHarness(server: apiServer, modelManager: manager, port: listenPort)
}
/// POSTs `request` as JSON to the local `/v1/chat/completions` endpoint and
/// decodes the whole response body.
///
/// Records a test failure (with the response body as the message) when the
/// status code is not 200, then still attempts to decode the body.
///
/// - Parameters:
///   - request: The completion request to encode as the HTTP body.
///   - port: Port on 127.0.0.1 to connect to.
/// - Returns: The decoded `APIChatCompletionResponse`.
/// - Throws: Encoding, transport, unwrap, or decoding errors.
private func sendChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> APIChatCompletionResponse {
    let endpoint = URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!
    var post = URLRequest(url: endpoint)
    post.httpMethod = "POST"
    post.setValue("application/json", forHTTPHeaderField: "Content-Type")
    post.httpBody = try JSONEncoder().encode(request)

    let (payload, rawResponse) = try await URLSession.shared.data(for: post)
    let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
    let bodyText = String(data: payload, encoding: .utf8) ?? ""
    XCTAssertEqual(http.statusCode, 200, bodyText)

    return try JSONDecoder().decode(APIChatCompletionResponse.self, from: payload)
}
/// Sends a streaming completion and collapses the per-chunk events into one
/// summary: the number of role events, the concatenated content, all tool
/// calls in arrival order, and the finish reason and usage taken from the
/// last `.final` event (both `nil` when no such event was seen).
///
/// - Parameters:
///   - request: The completion request to send.
///   - port: Port on 127.0.0.1 to connect to.
/// - Returns: The aggregated `StreamingResult`.
private func sendStreamingChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> StreamingResult {
    let detailed = try await sendStreamingChatCompletionDetailed(request, port: port)
    let events = detailed.events
    let closingEvent = events.last(where: { $0.kind == .final })

    let roleEvents = events.filter { $0.kind == .role }
    let contentPieces = events.compactMap { $0.content }
    let allToolCalls = events.flatMap { $0.toolCalls }

    return StreamingResult(
        roleDeltaCount: roleEvents.count,
        content: contentPieces.joined(),
        toolCalls: allToolCalls,
        finalFinishReason: closingEvent?.finishReason,
        usage: closingEvent?.usage,
        sawDone: detailed.sawDone
    )
}
/// POSTs `request` to the local `/v1/chat/completions` endpoint and reads the
/// response line by line as server-sent events.
///
/// Only lines starting with `data: ` are considered. Each decoded chunk can
/// contribute up to four events, appended in this order: role (only when the
/// delta role is "assistant"), content, tool call, and final (when a finish
/// reason is present). Reading stops at the `[DONE]` payload.
///
/// On a non-200 status the body is collected into the failure message and an
/// empty result is returned.
///
/// - Parameters:
///   - request: The completion request to encode as the HTTP body.
///   - port: Port on 127.0.0.1 to connect to.
/// - Returns: The recorded events and whether `[DONE]` was seen.
/// - Throws: Encoding, transport, unwrap, or chunk-decoding errors.
private func sendStreamingChatCompletionDetailed(_ request: APIChatCompletionRequest, port: UInt16) async throws -> DetailedStreamingResult {
    let endpoint = URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!
    var post = URLRequest(url: endpoint)
    post.httpMethod = "POST"
    post.setValue("application/json", forHTTPHeaderField: "Content-Type")
    post.httpBody = try JSONEncoder().encode(request)

    let (stream, rawResponse) = try await URLSession.shared.bytes(for: post)
    let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
    if http.statusCode != 200 {
        var errorBody = ""
        for try await line in stream.lines {
            errorBody.append(line)
        }
        XCTFail("Expected 200 response, got \(http.statusCode): \(errorBody)")
        return DetailedStreamingResult(events: [], sawDone: false)
    }

    let dataPrefix = "data: "
    let decoder = JSONDecoder()
    var recorded: [StreamingEvent] = []
    var reachedDone = false

    for try await line in stream.lines {
        guard line.hasPrefix(dataPrefix) else { continue }
        let payload = String(line.dropFirst(dataPrefix.count))
        if payload == "[DONE]" {
            reachedDone = true
            break
        }

        let chunk = try decoder.decode(APIChatCompletionChunk.self, from: Data(payload.utf8))
        // A chunk without choices contributes no events.
        guard let choice = chunk.choices.first else { continue }

        if let role = choice.delta.role, role == "assistant" {
            recorded.append(StreamingEvent(kind: .role, role: role, content: nil, toolCalls: [], finishReason: nil, usage: nil))
        }
        if let text = choice.delta.content {
            recorded.append(StreamingEvent(kind: .content, role: nil, content: text, toolCalls: [], finishReason: nil, usage: nil))
        }
        if let calls = choice.delta.tool_calls {
            recorded.append(StreamingEvent(kind: .toolCall, role: nil, content: nil, toolCalls: calls, finishReason: nil, usage: nil))
        }
        if let reason = choice.finish_reason {
            recorded.append(StreamingEvent(kind: .final, role: nil, content: nil, toolCalls: [], finishReason: reason, usage: chunk.usage))
        }
    }

    return DetailedStreamingResult(events: recorded, sawDone: reachedDone)
}
/// Starts a streaming completion on a dedicated ephemeral session, waits (up
/// to ten seconds) until the first non-empty content delta has been read, and
/// then tears the connection down by invalidating the session and cancelling
/// the reading task.
///
/// - Parameters:
///   - request: The completion request to encode as the HTTP body.
///   - port: Port on 127.0.0.1 to connect to.
/// - Throws: Encoding errors, or an error from the wait being interrupted.
///   Errors raised inside the reading task are discarded.
private func cancelStreamingChatCompletionAfterFirstContent(_ request: APIChatCompletionRequest, port: UInt16) async throws {
    let endpoint = URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!
    var post = URLRequest(url: endpoint)
    post.httpMethod = "POST"
    post.setValue("application/json", forHTTPHeaderField: "Content-Type")
    post.httpBody = try JSONEncoder().encode(request)

    let firstContentObserver = StreamCancellationObserver()
    let ephemeralSession = URLSession(configuration: .ephemeral)

    let reader = Task {
        let (stream, rawResponse) = try await ephemeralSession.bytes(for: post)
        let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
        XCTAssertEqual(http.statusCode, 200)

        let decoder = JSONDecoder()
        for try await line in stream.lines {
            guard line.hasPrefix("data: ") else { continue }
            let payload = String(line.dropFirst(6))
            if payload == "[DONE]" {
                break
            }
            let chunk = try decoder.decode(APIChatCompletionChunk.self, from: Data(payload.utf8))
            guard let text = chunk.choices.first?.delta.content, !text.isEmpty else { continue }

            await firstContentObserver.markFirstContentSeen()
            // Stop reading while the stream is still open; the caller cancels
            // this task long before the 30 seconds elapse.
            try await Task.sleep(nanoseconds: 30_000_000_000)
        }
    }

    try await waitUntil(timeoutSeconds: 10) {
        await firstContentObserver.hasSeenFirstContent
    }

    ephemeralSession.invalidateAndCancel()
    reader.cancel()
    // Cancellation surfaces as a thrown error from the task; it is expected.
    _ = try? await reader.value
}
/// Polls `condition` until it returns `true` or `timeoutSeconds` has elapsed.
///
/// The condition is evaluated before each sleep and once more after the
/// deadline has passed, so a condition that became true during the final
/// sleep interval (or a zero/negative timeout) is still honoured instead of
/// being reported as a timeout.
///
/// - Parameters:
///   - timeoutSeconds: Maximum wall-clock time to keep polling.
///   - intervalNanoseconds: Pause between two evaluations of `condition`.
///   - file: Source file to attribute a timeout failure to; defaults to the caller.
///   - line: Source line to attribute a timeout failure to; defaults to the caller.
///   - condition: Asynchronous predicate to poll.
/// - Throws: Whatever `Task.sleep` throws, i.e. when the surrounding task is cancelled.
///   A timeout does not throw; it records a test failure and returns.
private func waitUntil(
    timeoutSeconds: TimeInterval,
    intervalNanoseconds: UInt64 = 100_000_000,
    file: StaticString = #filePath,
    line: UInt = #line,
    condition: @escaping () async -> Bool
) async throws {
    let deadline = Date().addingTimeInterval(timeoutSeconds)
    while Date() < deadline {
        if await condition() {
            return
        }
        try await Task.sleep(nanoseconds: intervalNanoseconds)
    }
    // Last chance: the state may have changed while we were sleeping past the deadline.
    if await condition() {
        return
    }
    XCTFail("Condition not met before timeout", file: file, line: line)
}
}
/// Actor-isolated one-way flag recording that the first content delta of a
/// stream has been observed. The flag starts `false` and is never cleared.
private actor StreamCancellationObserver {
    /// `true` once `markFirstContentSeen()` has been called.
    private(set) var hasSeenFirstContent = false

    /// Sets the flag.
    func markFirstContentSeen() {
        hasSeenFirstContent = true
    }
}
/// Raw outcome of one streaming chat completion, as returned by
/// `sendStreamingChatCompletionDetailed`.
private struct DetailedStreamingResult {
/// Events in the order they were recorded while reading the stream.
let events: [StreamingEvent]
/// `true` when the `[DONE]` payload was read before the stream ended.
let sawDone: Bool
}
/// One observation extracted from a streamed completion chunk. A single chunk
/// may produce several events; only the fields relevant to `kind` are filled.
private struct StreamingEvent {
/// Which part of a chunk this event was derived from.
enum Kind {
/// The chunk's delta carried the "assistant" role.
case role
/// The chunk's delta carried content text.
case content
/// The chunk's delta carried tool calls.
case toolCall
/// The chunk's first choice carried a finish reason.
case final
}
let kind: Kind
/// Role string for `.role` events; `nil` otherwise.
let role: String?
/// Content text for `.content` events; `nil` otherwise.
let content: String?
/// Tool calls for `.toolCall` events; empty otherwise.
let toolCalls: [APIToolCall]
/// Finish reason for `.final` events; `nil` otherwise.
let finishReason: String?
/// The chunk's usage value for `.final` events (may itself be `nil`); `nil` otherwise.
let usage: APIUsageInfo?
}
/// Aggregated view of a streamed completion, built by
/// `sendStreamingChatCompletion` from the individual `StreamingEvent`s.
private struct StreamingResult {
/// Number of `.role` events seen.
let roleDeltaCount: Int
/// All content pieces joined in arrival order.
let content: String
/// All tool calls from every event, in arrival order.
let toolCalls: [APIToolCall]
/// Finish reason of the last `.final` event, or `nil` when there was none.
let finalFinishReason: String?
/// Usage of the last `.final` event, or `nil` when absent.
let usage: APIUsageInfo?
/// `true` when the `[DONE]` payload was read.
let sawDone: Bool
}
/// Bundle of the objects a test needs to talk to a locally running API server,
/// plus a teardown helper.
private struct TestHarness {
    let server: APIServer
    let modelManager: ModelManager
    /// Port the server was started on.
    let port: UInt16

    /// Schedules server shutdown and model unload on the main actor, then
    /// resets the shared prefix cache.
    ///
    /// The main-actor work is fire-and-forget: this method returns without
    /// waiting for it, so the cache reset is not ordered after the shutdown.
    func stop() {
        let (runningServer, loadedManager) = (server, modelManager)
        Task { @MainActor in
            runningServer.stop()
            loadedManager.unloadModel()
        }
        TokenPrefixCache.shared.reset()
    }
}

View File

@@ -2572,14 +2572,18 @@ Validation note: `PromptBuilder.swift` is now covered by both shaping-parity uni
### Phase 3: Integration
7. **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
7. [x] **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
8. **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested.
Validation note: `APIServer.swift` now routes the API path through `PromptBuilder`, `InferenceEngine`, `TokenPrefixCache`, and `StreamingSSEEncoder`, and the full repository test workflow is green. Image-bearing requests intentionally bypass prefix-cache reuse for now until image fingerprinting is implemented.
### Phase 4: Statistics & Monitoring
9. **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer.
10. **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
11. **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
10. [x] **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
11. [x] **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly and `MonitorView.swift` has been rebuilt around current system state and prefix-cache visibility rather than session-era charts. The dashboard now exposes cache match quality from matched-vs-rebuilt prompt token counters, but it still does not expose TTFT, cache match depth, or vision timing because those `LiveCounters` signals have not been implemented yet.
### Phase 5: Advanced Cache Matching