feat: better hardening with unit tests and end-to-end tests
This commit is contained in:
@@ -46,6 +46,7 @@
|
|||||||
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */; };
|
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2DC8C86D397B1FCA08E07CBD /* DownloadModalView.swift */; };
|
||||||
C34F02550C584BB2547F0F6C /* ChatDocumentPackage.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */; };
|
C34F02550C584BB2547F0F6C /* ChatDocumentPackage.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */; };
|
||||||
CBA88529F8BE7BD0518994AD /* SceneSelectionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5B5ABDEB6F5C54856EB1A9E /* SceneSelectionView.swift */; };
|
CBA88529F8BE7BD0518994AD /* SceneSelectionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5B5ABDEB6F5C54856EB1A9E /* SceneSelectionView.swift */; };
|
||||||
|
CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */; };
|
||||||
CFEE79815DFB80E51FE3745A /* SceneStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = C234359924C542F07ED926A2 /* SceneStore.swift */; };
|
CFEE79815DFB80E51FE3745A /* SceneStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = C234359924C542F07ED926A2 /* SceneStore.swift */; };
|
||||||
D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
|
D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
|
||||||
D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
|
D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
|
||||||
@@ -53,7 +54,6 @@
|
|||||||
E199D0BB09B61AC128AB093A /* CancellationToken.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3489501F2F8E1BA382347CFA /* CancellationToken.swift */; };
|
E199D0BB09B61AC128AB093A /* CancellationToken.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3489501F2F8E1BA382347CFA /* CancellationToken.swift */; };
|
||||||
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */; };
|
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */; };
|
||||||
EC4FC68608DDFA6A3DF133CC /* InferenceEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */; };
|
EC4FC68608DDFA6A3DF133CC /* InferenceEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */; };
|
||||||
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */; };
|
|
||||||
F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
|
F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
|
||||||
FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
|
FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
|
||||||
FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
|
FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
|
||||||
@@ -114,6 +114,7 @@
|
|||||||
DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; };
|
DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; };
|
||||||
E1E62624B6F285479CB33041 /* PromptBuilder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PromptBuilder.swift; sourceTree = "<group>"; };
|
E1E62624B6F285479CB33041 /* PromptBuilder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PromptBuilder.swift; sourceTree = "<group>"; };
|
||||||
E35452B166893B25E765FF70 /* InferenceStats.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceStats.swift; sourceTree = "<group>"; };
|
E35452B166893B25E765FF70 /* InferenceStats.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceStats.swift; sourceTree = "<group>"; };
|
||||||
|
E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServerRewriteTests.swift; sourceTree = "<group>"; };
|
||||||
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoderTests.swift; sourceTree = "<group>"; };
|
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoderTests.swift; sourceTree = "<group>"; };
|
||||||
E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; };
|
E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; };
|
||||||
E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
|
E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
|
||||||
@@ -121,7 +122,6 @@
|
|||||||
F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
|
F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
|
||||||
F4CE2D594F7433C76169151A /* MLXServerTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = MLXServerTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
F4CE2D594F7433C76169151A /* MLXServerTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = MLXServerTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CancellationTokenTests.swift; sourceTree = "<group>"; };
|
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CancellationTokenTests.swift; sourceTree = "<group>"; };
|
||||||
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConversationSessionCache.swift; sourceTree = "<group>"; };
|
|
||||||
/* End PBXFileReference section */
|
/* End PBXFileReference section */
|
||||||
|
|
||||||
/* Begin PBXFrameworksBuildPhase section */
|
/* Begin PBXFrameworksBuildPhase section */
|
||||||
@@ -172,6 +172,7 @@
|
|||||||
154AF0C071A7DC02EB5F6F49 /* Server */ = {
|
154AF0C071A7DC02EB5F6F49 /* Server */ = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
|
E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */,
|
||||||
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */,
|
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */,
|
||||||
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */,
|
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */,
|
||||||
D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */,
|
D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */,
|
||||||
@@ -263,7 +264,6 @@
|
|||||||
F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
|
F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
|
||||||
3D08828E16B17EF02C14243E /* APIServer.swift */,
|
3D08828E16B17EF02C14243E /* APIServer.swift */,
|
||||||
3489501F2F8E1BA382347CFA /* CancellationToken.swift */,
|
3489501F2F8E1BA382347CFA /* CancellationToken.swift */,
|
||||||
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */,
|
|
||||||
7C1A89C076E717F87A60397D /* ImageDecoder.swift */,
|
7C1A89C076E717F87A60397D /* ImageDecoder.swift */,
|
||||||
02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */,
|
02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */,
|
||||||
E1E62624B6F285479CB33041 /* PromptBuilder.swift */,
|
E1E62624B6F285479CB33041 /* PromptBuilder.swift */,
|
||||||
@@ -379,6 +379,7 @@
|
|||||||
isa = PBXSourcesBuildPhase;
|
isa = PBXSourcesBuildPhase;
|
||||||
buildActionMask = 2147483647;
|
buildActionMask = 2147483647;
|
||||||
files = (
|
files = (
|
||||||
|
CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */,
|
||||||
962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */,
|
962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */,
|
||||||
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */,
|
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */,
|
||||||
8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */,
|
8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */,
|
||||||
@@ -406,7 +407,6 @@
|
|||||||
85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
|
85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
|
||||||
B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
|
B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
|
||||||
5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
|
5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
|
||||||
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */,
|
|
||||||
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
|
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
|
||||||
4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
|
4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
|
||||||
A146BBA70CFBEC505BDCDF0D /* ImageDecoder.swift in Sources */,
|
A146BBA70CFBEC505BDCDF0D /* ImageDecoder.swift in Sources */,
|
||||||
|
|||||||
@@ -24,11 +24,15 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
private var _isGenerating: Bool = false
|
private var _isGenerating: Bool = false
|
||||||
private var _contextMax: Int = 0
|
private var _contextMax: Int = 0
|
||||||
private var _currentPhaseElapsed: TimeInterval = 0
|
private var _currentPhaseElapsed: TimeInterval = 0
|
||||||
|
private var _currentCacheMatchedPromptTokens: Int = 0
|
||||||
|
private var _currentCacheRebuiltPromptTokens: Int = 0
|
||||||
|
|
||||||
// Cumulative
|
// Cumulative
|
||||||
private var _totalRequests: Int = 0
|
private var _totalRequests: Int = 0
|
||||||
private var _totalPromptTokens: Int = 0
|
private var _totalPromptTokens: Int = 0
|
||||||
private var _totalGenerationTokens: Int = 0
|
private var _totalGenerationTokens: Int = 0
|
||||||
|
private var _totalCacheReusePromptTokens: Int = 0
|
||||||
|
private var _totalCacheRebuildPromptTokens: Int = 0
|
||||||
private var _totalPreparingDuration: TimeInterval = 0
|
private var _totalPreparingDuration: TimeInterval = 0
|
||||||
private var _totalSessionBuildDuration: TimeInterval = 0
|
private var _totalSessionBuildDuration: TimeInterval = 0
|
||||||
private var _totalPrefillDuration: TimeInterval = 0
|
private var _totalPrefillDuration: TimeInterval = 0
|
||||||
@@ -90,6 +94,26 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func recordPrefillReuse(requestId: String, matchedPromptTokens: Int, promptTokenCount: Int) {
|
||||||
|
lock.lock()
|
||||||
|
guard var state = requestPhases[requestId] else {
|
||||||
|
lock.unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
let matched = max(0, matchedPromptTokens)
|
||||||
|
let rebuilt = max(0, promptTokenCount - matched)
|
||||||
|
|
||||||
|
_totalCacheReusePromptTokens += matched
|
||||||
|
_totalCacheRebuildPromptTokens += rebuilt
|
||||||
|
|
||||||
|
state.matchedPromptTokens = matched
|
||||||
|
state.rebuiltPromptTokens = rebuilt
|
||||||
|
requestPhases[requestId] = state
|
||||||
|
refreshCurrentCachePromptStatsLocked()
|
||||||
|
lock.unlock()
|
||||||
|
}
|
||||||
|
|
||||||
func requestCompleted(requestId: String, generationTokens: Int) {
|
func requestCompleted(requestId: String, generationTokens: Int) {
|
||||||
let now = Date()
|
let now = Date()
|
||||||
lock.lock()
|
lock.lock()
|
||||||
@@ -108,6 +132,7 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
_isGenerating = _generatingRequests > 0
|
_isGenerating = _generatingRequests > 0
|
||||||
}
|
}
|
||||||
refreshCurrentPhaseElapsed(now: now)
|
refreshCurrentPhaseElapsed(now: now)
|
||||||
|
refreshCurrentCachePromptStatsLocked()
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -126,9 +151,13 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
_isGenerating = false
|
_isGenerating = false
|
||||||
_contextMax = 0
|
_contextMax = 0
|
||||||
_currentPhaseElapsed = 0
|
_currentPhaseElapsed = 0
|
||||||
|
_currentCacheMatchedPromptTokens = 0
|
||||||
|
_currentCacheRebuiltPromptTokens = 0
|
||||||
_totalRequests = 0
|
_totalRequests = 0
|
||||||
_totalPromptTokens = 0
|
_totalPromptTokens = 0
|
||||||
_totalGenerationTokens = 0
|
_totalGenerationTokens = 0
|
||||||
|
_totalCacheReusePromptTokens = 0
|
||||||
|
_totalCacheRebuildPromptTokens = 0
|
||||||
_totalPreparingDuration = 0
|
_totalPreparingDuration = 0
|
||||||
_totalSessionBuildDuration = 0
|
_totalSessionBuildDuration = 0
|
||||||
_totalPrefillDuration = 0
|
_totalPrefillDuration = 0
|
||||||
@@ -154,9 +183,13 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
isGenerating: _isGenerating,
|
isGenerating: _isGenerating,
|
||||||
contextMax: _contextMax,
|
contextMax: _contextMax,
|
||||||
currentPhaseElapsed: _currentPhaseElapsed,
|
currentPhaseElapsed: _currentPhaseElapsed,
|
||||||
|
currentCacheMatchedPromptTokens: _currentCacheMatchedPromptTokens,
|
||||||
|
currentCacheRebuiltPromptTokens: _currentCacheRebuiltPromptTokens,
|
||||||
totalRequests: _totalRequests,
|
totalRequests: _totalRequests,
|
||||||
totalPromptTokens: _totalPromptTokens,
|
totalPromptTokens: _totalPromptTokens,
|
||||||
totalGenerationTokens: _totalGenerationTokens,
|
totalGenerationTokens: _totalGenerationTokens,
|
||||||
|
totalCacheReusePromptTokens: _totalCacheReusePromptTokens,
|
||||||
|
totalCacheRebuildPromptTokens: _totalCacheRebuildPromptTokens,
|
||||||
totalPreparingDuration: _totalPreparingDuration,
|
totalPreparingDuration: _totalPreparingDuration,
|
||||||
totalSessionBuildDuration: _totalSessionBuildDuration,
|
totalSessionBuildDuration: _totalSessionBuildDuration,
|
||||||
totalPrefillDuration: _totalPrefillDuration,
|
totalPrefillDuration: _totalPrefillDuration,
|
||||||
@@ -179,9 +212,13 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
let isGenerating: Bool
|
let isGenerating: Bool
|
||||||
let contextMax: Int
|
let contextMax: Int
|
||||||
let currentPhaseElapsed: TimeInterval
|
let currentPhaseElapsed: TimeInterval
|
||||||
|
let currentCacheMatchedPromptTokens: Int
|
||||||
|
let currentCacheRebuiltPromptTokens: Int
|
||||||
let totalRequests: Int
|
let totalRequests: Int
|
||||||
let totalPromptTokens: Int
|
let totalPromptTokens: Int
|
||||||
let totalGenerationTokens: Int
|
let totalGenerationTokens: Int
|
||||||
|
let totalCacheReusePromptTokens: Int
|
||||||
|
let totalCacheRebuildPromptTokens: Int
|
||||||
let totalPreparingDuration: TimeInterval
|
let totalPreparingDuration: TimeInterval
|
||||||
let totalSessionBuildDuration: TimeInterval
|
let totalSessionBuildDuration: TimeInterval
|
||||||
let totalPrefillDuration: TimeInterval
|
let totalPrefillDuration: TimeInterval
|
||||||
@@ -231,9 +268,16 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
_currentPhaseElapsed = requestPhases.values.map { now.timeIntervalSince($0.phaseStartedAt) }.max() ?? 0
|
_currentPhaseElapsed = requestPhases.values.map { now.timeIntervalSince($0.phaseStartedAt) }.max() ?? 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private func refreshCurrentCachePromptStatsLocked() {
|
||||||
|
_currentCacheMatchedPromptTokens = requestPhases.values.reduce(0) { $0 + $1.matchedPromptTokens }
|
||||||
|
_currentCacheRebuiltPromptTokens = requestPhases.values.reduce(0) { $0 + $1.rebuiltPromptTokens }
|
||||||
|
}
|
||||||
|
|
||||||
private struct RequestState {
|
private struct RequestState {
|
||||||
var phase: RequestPhase
|
var phase: RequestPhase
|
||||||
var phaseStartedAt: Date
|
var phaseStartedAt: Date
|
||||||
|
var matchedPromptTokens: Int = 0
|
||||||
|
var rebuiltPromptTokens: Int = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
enum RequestPhase {
|
enum RequestPhase {
|
||||||
@@ -264,17 +308,20 @@ final class InferenceStats {
|
|||||||
var contextUsed: Int = 0
|
var contextUsed: Int = 0
|
||||||
var contextMax: Int = 0
|
var contextMax: Int = 0
|
||||||
var currentPhaseElapsed: TimeInterval = 0
|
var currentPhaseElapsed: TimeInterval = 0
|
||||||
|
var currentCacheMatchedPromptTokens: Int = 0
|
||||||
|
var currentCacheRebuiltPromptTokens: Int = 0
|
||||||
|
|
||||||
// MARK: - Cumulative counters
|
// MARK: - Cumulative counters
|
||||||
|
|
||||||
var totalRequests: Int = 0
|
var totalRequests: Int = 0
|
||||||
var totalPromptTokens: Int = 0
|
var totalPromptTokens: Int = 0
|
||||||
var totalGenerationTokens: Int = 0
|
var totalGenerationTokens: Int = 0
|
||||||
|
var totalCacheReusePromptTokens: Int = 0
|
||||||
|
var totalCacheRebuildPromptTokens: Int = 0
|
||||||
var totalCacheHits: Int = 0
|
var totalCacheHits: Int = 0
|
||||||
var totalCacheMisses: Int = 0
|
var totalCacheMisses: Int = 0
|
||||||
var totalCacheEvictions: Int = 0
|
var totalCacheEvictions: Int = 0
|
||||||
var totalCacheReusePromptTokens: Int = 0
|
var cacheHitRatePercent: Double = 0
|
||||||
var totalCacheRebuildPromptTokens: Int = 0
|
|
||||||
var totalPreparingDuration: TimeInterval = 0
|
var totalPreparingDuration: TimeInterval = 0
|
||||||
var totalSessionBuildDuration: TimeInterval = 0
|
var totalSessionBuildDuration: TimeInterval = 0
|
||||||
var totalPrefillDuration: TimeInterval = 0
|
var totalPrefillDuration: TimeInterval = 0
|
||||||
@@ -283,12 +330,11 @@ final class InferenceStats {
|
|||||||
// MARK: - Cache state
|
// MARK: - Cache state
|
||||||
|
|
||||||
var cacheEntryCount: Int = 0
|
var cacheEntryCount: Int = 0
|
||||||
var warmCacheEntryCount: Int = 0
|
|
||||||
var activeCacheEntryCount: Int = 0
|
|
||||||
var generatingCacheEntryCount: Int = 0
|
|
||||||
var cacheEstimatedBytes: Int = 0
|
var cacheEstimatedBytes: Int = 0
|
||||||
var cacheEstimatedTokens: Int = 0
|
var cacheEstimatedTokens: Int = 0
|
||||||
var cachedSessions: [ConversationSessionCache.SessionSummary] = []
|
var cacheMemoryBudgetBytes: Int = 0
|
||||||
|
var cacheMemoryUsagePercent: Double = 0
|
||||||
|
var cachedEntries: [TokenPrefixCache.EntrySummary] = []
|
||||||
|
|
||||||
// MARK: - Time series data (ring buffers for charts)
|
// MARK: - Time series data (ring buffers for charts)
|
||||||
|
|
||||||
@@ -302,13 +348,14 @@ final class InferenceStats {
|
|||||||
private(set) var promptTokenHistory: [DataPoint] = []
|
private(set) var promptTokenHistory: [DataPoint] = []
|
||||||
private(set) var generationTokenHistory: [DataPoint] = []
|
private(set) var generationTokenHistory: [DataPoint] = []
|
||||||
private(set) var cacheEntryHistory: [DataPoint] = []
|
private(set) var cacheEntryHistory: [DataPoint] = []
|
||||||
private(set) var activeSessionHistory: [DataPoint] = []
|
|
||||||
private(set) var cacheFootprintHistory: [DataPoint] = []
|
private(set) var cacheFootprintHistory: [DataPoint] = []
|
||||||
private(set) var cacheReuseHistory: [DataPoint] = []
|
private(set) var cacheHitRateHistory: [DataPoint] = []
|
||||||
private(set) var cacheRebuildHistory: [DataPoint] = []
|
private(set) var cacheMemoryPressureHistory: [DataPoint] = []
|
||||||
private(set) var currentPhaseElapsedHistory: [DataPoint] = []
|
private(set) var currentPhaseElapsedHistory: [DataPoint] = []
|
||||||
private(set) var prefillDurationHistory: [DataPoint] = []
|
private(set) var prefillDurationHistory: [DataPoint] = []
|
||||||
private(set) var sessionBuildDurationHistory: [DataPoint] = []
|
private(set) var cacheReusePromptHistory: [DataPoint] = []
|
||||||
|
private(set) var cacheRebuildPromptHistory: [DataPoint] = []
|
||||||
|
private(set) var cacheMatchQualityHistory: [DataPoint] = []
|
||||||
|
|
||||||
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
|
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
|
||||||
|
|
||||||
@@ -316,10 +363,9 @@ final class InferenceStats {
|
|||||||
private var sampleTimer: Timer?
|
private var sampleTimer: Timer?
|
||||||
private var lastGenerationTokenCount: Int = 0
|
private var lastGenerationTokenCount: Int = 0
|
||||||
private var lastPromptTokenCount: Int = 0
|
private var lastPromptTokenCount: Int = 0
|
||||||
private var lastCacheReuseTokenCount: Int = 0
|
|
||||||
private var lastCacheRebuildTokenCount: Int = 0
|
|
||||||
private var lastPrefillDuration: TimeInterval = 0
|
private var lastPrefillDuration: TimeInterval = 0
|
||||||
private var lastSessionBuildDuration: TimeInterval = 0
|
private var lastCacheReusePromptTokenCount: Int = 0
|
||||||
|
private var lastCacheRebuildPromptTokenCount: Int = 0
|
||||||
|
|
||||||
func startSampling() {
|
func startSampling() {
|
||||||
guard sampleTimer == nil else { return }
|
guard sampleTimer == nil else { return }
|
||||||
@@ -338,7 +384,7 @@ final class InferenceStats {
|
|||||||
private func recordSample() {
|
private func recordSample() {
|
||||||
// Pull live values from the thread-safe counters
|
// Pull live values from the thread-safe counters
|
||||||
let snap = LiveCounters.shared.snapshot()
|
let snap = LiveCounters.shared.snapshot()
|
||||||
let cache = ConversationSessionCache.shared.snapshot()
|
let cache = TokenPrefixCache.shared.snapshot()
|
||||||
|
|
||||||
activeRequests = snap.activeRequests
|
activeRequests = snap.activeRequests
|
||||||
preparingRequests = snap.preparingRequests
|
preparingRequests = snap.preparingRequests
|
||||||
@@ -353,9 +399,13 @@ final class InferenceStats {
|
|||||||
contextMax = snap.contextMax
|
contextMax = snap.contextMax
|
||||||
contextUsed = snap.promptTokens + snap.generationTokens
|
contextUsed = snap.promptTokens + snap.generationTokens
|
||||||
currentPhaseElapsed = snap.currentPhaseElapsed
|
currentPhaseElapsed = snap.currentPhaseElapsed
|
||||||
|
currentCacheMatchedPromptTokens = snap.currentCacheMatchedPromptTokens
|
||||||
|
currentCacheRebuiltPromptTokens = snap.currentCacheRebuiltPromptTokens
|
||||||
totalRequests = snap.totalRequests
|
totalRequests = snap.totalRequests
|
||||||
totalPromptTokens = snap.totalPromptTokens
|
totalPromptTokens = snap.totalPromptTokens
|
||||||
totalGenerationTokens = snap.totalGenerationTokens
|
totalGenerationTokens = snap.totalGenerationTokens
|
||||||
|
totalCacheReusePromptTokens = snap.totalCacheReusePromptTokens
|
||||||
|
totalCacheRebuildPromptTokens = snap.totalCacheRebuildPromptTokens
|
||||||
totalPreparingDuration = snap.totalPreparingDuration
|
totalPreparingDuration = snap.totalPreparingDuration
|
||||||
totalSessionBuildDuration = snap.totalSessionBuildDuration
|
totalSessionBuildDuration = snap.totalSessionBuildDuration
|
||||||
totalPrefillDuration = snap.totalPrefillDuration
|
totalPrefillDuration = snap.totalPrefillDuration
|
||||||
@@ -363,41 +413,41 @@ final class InferenceStats {
|
|||||||
totalCacheHits = cache.totalHits
|
totalCacheHits = cache.totalHits
|
||||||
totalCacheMisses = cache.totalMisses
|
totalCacheMisses = cache.totalMisses
|
||||||
totalCacheEvictions = cache.totalEvictions
|
totalCacheEvictions = cache.totalEvictions
|
||||||
totalCacheReusePromptTokens = cache.totalReusePromptTokens
|
cacheHitRatePercent = cache.hitRate
|
||||||
totalCacheRebuildPromptTokens = cache.totalRebuildPromptTokens
|
|
||||||
cacheEntryCount = cache.totalEntries
|
cacheEntryCount = cache.totalEntries
|
||||||
warmCacheEntryCount = cache.warmEntries
|
|
||||||
activeCacheEntryCount = cache.activeEntries
|
|
||||||
generatingCacheEntryCount = cache.generatingEntries
|
|
||||||
cacheEstimatedBytes = cache.estimatedBytes
|
cacheEstimatedBytes = cache.estimatedBytes
|
||||||
cacheEstimatedTokens = cache.cachedTokenEstimate
|
cacheEstimatedTokens = cache.totalCachedTokens
|
||||||
cachedSessions = cache.sessions
|
cacheMemoryBudgetBytes = cache.memoryBudgetBytes
|
||||||
|
cacheMemoryUsagePercent = cache.memoryUsagePercent
|
||||||
|
cachedEntries = cache.entries
|
||||||
|
|
||||||
let now = Date.now
|
let now = Date.now
|
||||||
let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
|
let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
|
||||||
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
|
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
|
||||||
let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
|
|
||||||
let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
|
|
||||||
let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
|
let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
|
||||||
let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration
|
let cacheReusePromptDelta = snap.totalCacheReusePromptTokens - lastCacheReusePromptTokenCount
|
||||||
|
let cacheRebuildPromptDelta = snap.totalCacheRebuildPromptTokens - lastCacheRebuildPromptTokenCount
|
||||||
|
let cacheMatchQualityDelta = cacheReusePromptDelta + cacheRebuildPromptDelta > 0
|
||||||
|
? (Double(cacheReusePromptDelta) / Double(cacheReusePromptDelta + cacheRebuildPromptDelta)) * 100
|
||||||
|
: 0
|
||||||
lastGenerationTokenCount = snap.totalGenerationTokens
|
lastGenerationTokenCount = snap.totalGenerationTokens
|
||||||
lastPromptTokenCount = snap.totalPromptTokens
|
lastPromptTokenCount = snap.totalPromptTokens
|
||||||
lastCacheReuseTokenCount = cache.totalReusePromptTokens
|
|
||||||
lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
|
|
||||||
lastPrefillDuration = snap.totalPrefillDuration
|
lastPrefillDuration = snap.totalPrefillDuration
|
||||||
lastSessionBuildDuration = snap.totalSessionBuildDuration
|
lastCacheReusePromptTokenCount = snap.totalCacheReusePromptTokens
|
||||||
|
lastCacheRebuildPromptTokenCount = snap.totalCacheRebuildPromptTokens
|
||||||
|
|
||||||
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
|
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
|
||||||
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
|
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
|
||||||
promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
|
promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
|
||||||
cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries)))
|
cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries)))
|
||||||
activeSessionHistory.append(DataPoint(timestamp: now, value: Double(cache.activeEntries)))
|
|
||||||
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
|
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
|
||||||
cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
|
cacheHitRateHistory.append(DataPoint(timestamp: now, value: cache.hitRate))
|
||||||
cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
|
cacheMemoryPressureHistory.append(DataPoint(timestamp: now, value: cache.memoryUsagePercent))
|
||||||
currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
|
currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
|
||||||
prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
|
prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
|
||||||
sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta))
|
cacheReusePromptHistory.append(DataPoint(timestamp: now, value: Double(cacheReusePromptDelta)))
|
||||||
|
cacheRebuildPromptHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildPromptDelta)))
|
||||||
|
cacheMatchQualityHistory.append(DataPoint(timestamp: now, value: cacheMatchQualityDelta))
|
||||||
|
|
||||||
if tokenRateHistory.count > Self.maxHistoryPoints {
|
if tokenRateHistory.count > Self.maxHistoryPoints {
|
||||||
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
|
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
|
||||||
@@ -411,17 +461,14 @@ final class InferenceStats {
|
|||||||
if cacheEntryHistory.count > Self.maxHistoryPoints {
|
if cacheEntryHistory.count > Self.maxHistoryPoints {
|
||||||
cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints)
|
cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints)
|
||||||
}
|
}
|
||||||
if activeSessionHistory.count > Self.maxHistoryPoints {
|
|
||||||
activeSessionHistory.removeFirst(activeSessionHistory.count - Self.maxHistoryPoints)
|
|
||||||
}
|
|
||||||
if cacheFootprintHistory.count > Self.maxHistoryPoints {
|
if cacheFootprintHistory.count > Self.maxHistoryPoints {
|
||||||
cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints)
|
cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints)
|
||||||
}
|
}
|
||||||
if cacheReuseHistory.count > Self.maxHistoryPoints {
|
if cacheHitRateHistory.count > Self.maxHistoryPoints {
|
||||||
cacheReuseHistory.removeFirst(cacheReuseHistory.count - Self.maxHistoryPoints)
|
cacheHitRateHistory.removeFirst(cacheHitRateHistory.count - Self.maxHistoryPoints)
|
||||||
}
|
}
|
||||||
if cacheRebuildHistory.count > Self.maxHistoryPoints {
|
if cacheMemoryPressureHistory.count > Self.maxHistoryPoints {
|
||||||
cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
|
cacheMemoryPressureHistory.removeFirst(cacheMemoryPressureHistory.count - Self.maxHistoryPoints)
|
||||||
}
|
}
|
||||||
if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
|
if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
|
||||||
currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
|
currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
|
||||||
@@ -429,14 +476,20 @@ final class InferenceStats {
|
|||||||
if prefillDurationHistory.count > Self.maxHistoryPoints {
|
if prefillDurationHistory.count > Self.maxHistoryPoints {
|
||||||
prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
|
prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
|
||||||
}
|
}
|
||||||
if sessionBuildDurationHistory.count > Self.maxHistoryPoints {
|
if cacheReusePromptHistory.count > Self.maxHistoryPoints {
|
||||||
sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints)
|
cacheReusePromptHistory.removeFirst(cacheReusePromptHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
|
if cacheRebuildPromptHistory.count > Self.maxHistoryPoints {
|
||||||
|
cacheRebuildPromptHistory.removeFirst(cacheRebuildPromptHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
|
if cacheMatchQualityHistory.count > Self.maxHistoryPoints {
|
||||||
|
cacheMatchQualityHistory.removeFirst(cacheMatchQualityHistory.count - Self.maxHistoryPoints)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func reset() {
|
func reset() {
|
||||||
LiveCounters.shared.reset()
|
LiveCounters.shared.reset()
|
||||||
ConversationSessionCache.shared.reset()
|
TokenPrefixCache.shared.reset()
|
||||||
activeRequests = 0
|
activeRequests = 0
|
||||||
preparingRequests = 0
|
preparingRequests = 0
|
||||||
sessionBuildRequests = 0
|
sessionBuildRequests = 0
|
||||||
@@ -450,9 +503,13 @@ final class InferenceStats {
|
|||||||
contextUsed = 0
|
contextUsed = 0
|
||||||
contextMax = 0
|
contextMax = 0
|
||||||
currentPhaseElapsed = 0
|
currentPhaseElapsed = 0
|
||||||
|
currentCacheMatchedPromptTokens = 0
|
||||||
|
currentCacheRebuiltPromptTokens = 0
|
||||||
totalRequests = 0
|
totalRequests = 0
|
||||||
totalPromptTokens = 0
|
totalPromptTokens = 0
|
||||||
totalGenerationTokens = 0
|
totalGenerationTokens = 0
|
||||||
|
totalCacheReusePromptTokens = 0
|
||||||
|
totalCacheRebuildPromptTokens = 0
|
||||||
totalPreparingDuration = 0
|
totalPreparingDuration = 0
|
||||||
totalSessionBuildDuration = 0
|
totalSessionBuildDuration = 0
|
||||||
totalPrefillDuration = 0
|
totalPrefillDuration = 0
|
||||||
@@ -460,31 +517,41 @@ final class InferenceStats {
|
|||||||
totalCacheHits = 0
|
totalCacheHits = 0
|
||||||
totalCacheMisses = 0
|
totalCacheMisses = 0
|
||||||
totalCacheEvictions = 0
|
totalCacheEvictions = 0
|
||||||
totalCacheReusePromptTokens = 0
|
cacheHitRatePercent = 0
|
||||||
totalCacheRebuildPromptTokens = 0
|
|
||||||
cacheEntryCount = 0
|
cacheEntryCount = 0
|
||||||
warmCacheEntryCount = 0
|
|
||||||
activeCacheEntryCount = 0
|
|
||||||
generatingCacheEntryCount = 0
|
|
||||||
cacheEstimatedBytes = 0
|
cacheEstimatedBytes = 0
|
||||||
cacheEstimatedTokens = 0
|
cacheEstimatedTokens = 0
|
||||||
cachedSessions.removeAll()
|
cacheMemoryBudgetBytes = 0
|
||||||
|
cacheMemoryUsagePercent = 0
|
||||||
|
cachedEntries.removeAll()
|
||||||
tokenRateHistory.removeAll()
|
tokenRateHistory.removeAll()
|
||||||
promptTokenHistory.removeAll()
|
promptTokenHistory.removeAll()
|
||||||
generationTokenHistory.removeAll()
|
generationTokenHistory.removeAll()
|
||||||
cacheEntryHistory.removeAll()
|
cacheEntryHistory.removeAll()
|
||||||
activeSessionHistory.removeAll()
|
|
||||||
cacheFootprintHistory.removeAll()
|
cacheFootprintHistory.removeAll()
|
||||||
cacheReuseHistory.removeAll()
|
cacheHitRateHistory.removeAll()
|
||||||
cacheRebuildHistory.removeAll()
|
cacheMemoryPressureHistory.removeAll()
|
||||||
currentPhaseElapsedHistory.removeAll()
|
currentPhaseElapsedHistory.removeAll()
|
||||||
prefillDurationHistory.removeAll()
|
prefillDurationHistory.removeAll()
|
||||||
sessionBuildDurationHistory.removeAll()
|
cacheReusePromptHistory.removeAll()
|
||||||
|
cacheRebuildPromptHistory.removeAll()
|
||||||
|
cacheMatchQualityHistory.removeAll()
|
||||||
lastGenerationTokenCount = 0
|
lastGenerationTokenCount = 0
|
||||||
lastPromptTokenCount = 0
|
lastPromptTokenCount = 0
|
||||||
lastCacheReuseTokenCount = 0
|
|
||||||
lastCacheRebuildTokenCount = 0
|
|
||||||
lastPrefillDuration = 0
|
lastPrefillDuration = 0
|
||||||
lastSessionBuildDuration = 0
|
lastCacheReusePromptTokenCount = 0
|
||||||
|
lastCacheRebuildPromptTokenCount = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
var currentCacheMatchQualityPercent: Double {
|
||||||
|
let total = currentCacheMatchedPromptTokens + currentCacheRebuiltPromptTokens
|
||||||
|
guard total > 0 else { return 0 }
|
||||||
|
return (Double(currentCacheMatchedPromptTokens) / Double(total)) * 100
|
||||||
|
}
|
||||||
|
|
||||||
|
var totalCacheMatchQualityPercent: Double {
|
||||||
|
let total = totalCacheReusePromptTokens + totalCacheRebuildPromptTokens
|
||||||
|
guard total > 0 else { return 0 }
|
||||||
|
return (Double(totalCacheReusePromptTokens) / Double(total)) * 100
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ final class APIServer {
|
|||||||
listener?.cancel()
|
listener?.cancel()
|
||||||
listener = nil
|
listener = nil
|
||||||
isRunning = false
|
isRunning = false
|
||||||
ConversationSessionCache.shared.invalidateAll()
|
TokenPrefixCache.shared.invalidateAll()
|
||||||
inferenceStats.stopSampling()
|
inferenceStats.stopSampling()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -176,7 +176,7 @@ final class APIServer {
|
|||||||
if let targetConfig = ModelConfig.resolve(requestedModel) {
|
if let targetConfig = ModelConfig.resolve(requestedModel) {
|
||||||
if modelManager.currentModel?.id != targetConfig.id {
|
if modelManager.currentModel?.id != targetConfig.id {
|
||||||
print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
|
print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
|
||||||
ConversationSessionCache.shared.invalidateAll()
|
TokenPrefixCache.shared.invalidateAll()
|
||||||
await modelManager.loadModel(targetConfig)
|
await modelManager.loadModel(targetConfig)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -187,7 +187,7 @@ final class APIServer {
|
|||||||
if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
|
if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
|
||||||
let config = ModelConfig.resolve(lastModelId) {
|
let config = ModelConfig.resolve(lastModelId) {
|
||||||
print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
|
print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
|
||||||
ConversationSessionCache.shared.invalidateAll()
|
TokenPrefixCache.shared.invalidateAll()
|
||||||
await modelManager.loadModel(config)
|
await modelManager.loadModel(config)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,110 +260,80 @@ final class APIServer {
|
|||||||
temperature: Float(temperature),
|
temperature: Float(temperature),
|
||||||
topP: Float(topP)
|
topP: Float(topP)
|
||||||
)
|
)
|
||||||
|
|
||||||
// Feed all messages except the last as history, then send the last as the prompt
|
|
||||||
let chatMessages = preparedPrompt.chatMessages
|
|
||||||
let allButLast = Array(chatMessages.dropLast())
|
|
||||||
let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "")
|
|
||||||
|
|
||||||
let historySignatures = Array(preparedPrompt.messageSignatures.dropLast())
|
|
||||||
let currentModelId = modelManager.currentModel?.id ?? modelName
|
let currentModelId = modelManager.currentModel?.id ?? modelName
|
||||||
let lease = ConversationSessionCache.shared.checkoutSession(
|
let engine = InferenceEngine(container: container)
|
||||||
modelId: currentModelId,
|
let preparedInference: InferenceEngine.PreparedInference
|
||||||
instructions: preparedPrompt.instructions,
|
do {
|
||||||
historySignatures: historySignatures,
|
preparedInference = try await engine.prepare(preparedPrompt.userInput)
|
||||||
requestMessageCount: chatMessages.count,
|
} catch {
|
||||||
estimatedPromptTokens: estimatedPromptTokens,
|
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
|
||||||
estimatedBytes: preparedPrompt.estimatedBytes
|
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
|
||||||
)
|
return
|
||||||
|
|
||||||
let session: ChatSession
|
|
||||||
if let reusableSession = lease.session {
|
|
||||||
print("[APIServer] Reusing cached session (\(allButLast.count) history messages)")
|
|
||||||
session = reusableSession
|
|
||||||
session.generateParameters = generateParams
|
|
||||||
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
|
|
||||||
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
|
|
||||||
} else {
|
|
||||||
print("[APIServer] Creating fresh session")
|
|
||||||
ConversationSessionCache.shared.markSessionBuild(entryId: lease.entryId)
|
|
||||||
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .sessionBuild)
|
|
||||||
// Use `instructions:` for system/tool prompt (matches internal chat pattern).
|
|
||||||
// Only conversation turns go in `history:` — this avoids replaying the
|
|
||||||
// large tool prompt as history on every new session.
|
|
||||||
let instr = preparedPrompt.instructions.isEmpty ? nil : preparedPrompt.instructions
|
|
||||||
if !allButLast.isEmpty {
|
|
||||||
session = ChatSession(
|
|
||||||
container,
|
|
||||||
instructions: instr,
|
|
||||||
history: allButLast,
|
|
||||||
generateParameters: generateParams,
|
|
||||||
additionalContext: preparedPrompt.additionalContext
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
session = ChatSession(
|
|
||||||
container,
|
|
||||||
instructions: instr,
|
|
||||||
generateParameters: generateParams,
|
|
||||||
additionalContext: preparedPrompt.additionalContext
|
|
||||||
)
|
|
||||||
}
|
|
||||||
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
|
|
||||||
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract images from the last message only (ChatSession.streamDetails takes images separately)
|
// Vision requests stay uncached until image fingerprinting lands.
|
||||||
let lastImages = lastMessage.images
|
let cacheKey = preparedInference.hasImages ? nil : preparedInference.tokens
|
||||||
|
let lease = cacheKey.map { TokenPrefixCache.shared.lookup(cacheKey: $0, modelId: currentModelId) }
|
||||||
|
?? TokenPrefixCache.CacheLease(entryId: UUID(), kvCache: nil, matchedTokenCount: 0, isHit: false)
|
||||||
|
|
||||||
let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool)
|
LiveCounters.shared.recordPrefillReuse(
|
||||||
|
requestId: requestId,
|
||||||
|
matchedPromptTokens: lease.matchedTokenCount,
|
||||||
|
promptTokenCount: preparedInference.tokens.count
|
||||||
|
)
|
||||||
|
|
||||||
|
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
|
||||||
|
|
||||||
|
let cancellation = CancellationToken()
|
||||||
|
let streamHandle: InferenceEngine.StreamHandle
|
||||||
|
do {
|
||||||
|
streamHandle = try await engine.stream(
|
||||||
|
InferenceEngine.InferenceRequest(
|
||||||
|
input: preparedInference.lmInput,
|
||||||
|
tokens: preparedInference.tokens,
|
||||||
|
parameters: generateParams,
|
||||||
|
cachedKV: lease.kvCache,
|
||||||
|
cachedTokenCount: lease.matchedTokenCount
|
||||||
|
),
|
||||||
|
cancellation: cancellation
|
||||||
|
)
|
||||||
|
} catch {
|
||||||
|
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
|
||||||
|
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
let result: GenerationOutcome
|
||||||
if isStream {
|
if isStream {
|
||||||
result = await handleStreamingResponse(
|
result = await handleStreamingResponse(
|
||||||
connection: connection,
|
connection: connection,
|
||||||
requestId: requestId,
|
requestId: requestId,
|
||||||
cacheEntryId: lease.entryId,
|
cancellation: cancellation,
|
||||||
session: session,
|
stream: streamHandle.stream,
|
||||||
prompt: lastMessage.content,
|
|
||||||
images: lastImages,
|
|
||||||
tools: request.tools,
|
tools: request.tools,
|
||||||
created: created,
|
created: created,
|
||||||
modelName: modelName,
|
modelName: modelName
|
||||||
isQwen: isQwen
|
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
result = await handleNonStreamingResponse(
|
result = await handleNonStreamingResponse(
|
||||||
connection: connection,
|
connection: connection,
|
||||||
requestId: requestId,
|
requestId: requestId,
|
||||||
cacheEntryId: lease.entryId,
|
stream: streamHandle.stream,
|
||||||
session: session,
|
|
||||||
prompt: lastMessage.content,
|
|
||||||
images: lastImages,
|
|
||||||
tools: request.tools,
|
tools: request.tools,
|
||||||
created: created,
|
created: created,
|
||||||
modelName: modelName,
|
modelName: modelName
|
||||||
isQwen: isQwen
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
if result.succeeded {
|
if let cacheKey,
|
||||||
var cachedSignatures = preparedPrompt.messageSignatures
|
result.succeeded || result.cancelled {
|
||||||
if let assistantHistoryText = result.assistantHistoryText {
|
Self.storePromptCache(
|
||||||
cachedSignatures.append(
|
streamHandle.workingCache,
|
||||||
Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: [])
|
promptTokenCount: preparedInference.tokens.count,
|
||||||
)
|
|
||||||
}
|
|
||||||
ConversationSessionCache.shared.completeRequest(
|
|
||||||
entryId: lease.entryId,
|
entryId: lease.entryId,
|
||||||
session: session,
|
cacheKey: cacheKey,
|
||||||
requestMessageSignatures: cachedSignatures,
|
modelId: currentModelId
|
||||||
requestMessageCount: cachedSignatures.count,
|
|
||||||
estimatedPromptTokens: estimatedPromptTokens,
|
|
||||||
estimatedBytes: preparedPrompt.estimatedBytes,
|
|
||||||
promptTokens: result.promptTokens,
|
|
||||||
completionTokens: result.completionTokens
|
|
||||||
)
|
)
|
||||||
} else {
|
|
||||||
ConversationSessionCache.shared.abandonRequest(entryId: lease.entryId)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens)
|
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens)
|
||||||
@@ -375,53 +345,20 @@ final class APIServer {
|
|||||||
private func handleNonStreamingResponse(
|
private func handleNonStreamingResponse(
|
||||||
connection: NWConnection,
|
connection: NWConnection,
|
||||||
requestId: String,
|
requestId: String,
|
||||||
cacheEntryId: UUID,
|
stream: AsyncStream<Generation>,
|
||||||
session: ChatSession,
|
|
||||||
prompt: String,
|
|
||||||
images: [UserInput.Image],
|
|
||||||
tools: [APIToolDefinition]?,
|
tools: [APIToolDefinition]?,
|
||||||
created: Int,
|
created: Int,
|
||||||
modelName: String,
|
modelName: String
|
||||||
isQwen: Bool
|
) async -> GenerationOutcome {
|
||||||
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
|
|
||||||
do {
|
do {
|
||||||
var fullText = ""
|
let outcome = await Self.collectGenerationOutcome(
|
||||||
var promptTokens = 0
|
stream: stream,
|
||||||
var completionTokens = 0
|
requestId: requestId,
|
||||||
var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
|
cancellation: nil
|
||||||
|
|
||||||
let stream = session.streamDetails(
|
|
||||||
to: prompt,
|
|
||||||
images: images,
|
|
||||||
videos: []
|
|
||||||
)
|
)
|
||||||
|
|
||||||
for try await generation in stream {
|
|
||||||
switch generation {
|
|
||||||
case .chunk(let text):
|
|
||||||
fullText += text
|
|
||||||
completionTokens += 1
|
|
||||||
LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
|
|
||||||
case .info(let info):
|
|
||||||
promptTokens = info.promptTokenCount
|
|
||||||
completionTokens = info.generationTokenCount
|
|
||||||
ConversationSessionCache.shared.markGenerating(
|
|
||||||
entryId: cacheEntryId,
|
|
||||||
promptTokens: promptTokens,
|
|
||||||
completionTokens: completionTokens
|
|
||||||
)
|
|
||||||
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
|
|
||||||
if info.tokensPerSecond > 0 {
|
|
||||||
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
|
|
||||||
}
|
|
||||||
case .toolCall(let call):
|
|
||||||
frameworkToolCalls.append(call)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let resolved = Self.resolveAssistantResponse(
|
let resolved = Self.resolveAssistantResponse(
|
||||||
fullText: fullText,
|
fullText: outcome.fullText,
|
||||||
frameworkToolCalls: frameworkToolCalls,
|
frameworkToolCalls: outcome.frameworkToolCalls,
|
||||||
tools: tools
|
tools: tools
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -442,24 +379,26 @@ final class APIServer {
|
|||||||
)
|
)
|
||||||
],
|
],
|
||||||
usage: APIUsageInfo(
|
usage: APIUsageInfo(
|
||||||
prompt_tokens: promptTokens,
|
prompt_tokens: outcome.promptTokens,
|
||||||
completion_tokens: completionTokens,
|
completion_tokens: outcome.completionTokens,
|
||||||
total_tokens: promptTokens + completionTokens
|
total_tokens: outcome.promptTokens + outcome.completionTokens
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if let json = try? JSONEncoder().encode(response) {
|
if let json = try? JSONEncoder().encode(response) {
|
||||||
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
|
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
|
||||||
}
|
}
|
||||||
let assistantHistoryText = Self.normalizedAssistantHistoryContent(
|
return GenerationOutcome(
|
||||||
content: resolved.content,
|
promptTokens: outcome.promptTokens,
|
||||||
toolCalls: resolved.toolCalls,
|
completionTokens: outcome.completionTokens,
|
||||||
isQwen: isQwen
|
fullText: outcome.fullText,
|
||||||
|
frameworkToolCalls: outcome.frameworkToolCalls,
|
||||||
|
succeeded: true,
|
||||||
|
cancelled: false
|
||||||
)
|
)
|
||||||
return (promptTokens, completionTokens, assistantHistoryText, true)
|
|
||||||
} catch {
|
} catch {
|
||||||
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
|
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
|
||||||
return (0, 0, nil, false)
|
return GenerationOutcome(promptTokens: 0, completionTokens: 0, fullText: "", frameworkToolCalls: [], succeeded: false, cancelled: false)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -468,15 +407,12 @@ final class APIServer {
|
|||||||
private func handleStreamingResponse(
|
private func handleStreamingResponse(
|
||||||
connection: NWConnection,
|
connection: NWConnection,
|
||||||
requestId: String,
|
requestId: String,
|
||||||
cacheEntryId: UUID,
|
cancellation: CancellationToken,
|
||||||
session: ChatSession,
|
stream: AsyncStream<Generation>,
|
||||||
prompt: String,
|
|
||||||
images: [UserInput.Image],
|
|
||||||
tools: [APIToolDefinition]?,
|
tools: [APIToolDefinition]?,
|
||||||
created: Int,
|
created: Int,
|
||||||
modelName: String,
|
modelName: String
|
||||||
isQwen: Bool
|
) async -> GenerationOutcome {
|
||||||
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
|
|
||||||
// Send SSE headers
|
// Send SSE headers
|
||||||
let header = [
|
let header = [
|
||||||
"HTTP/1.1 200 OK",
|
"HTTP/1.1 200 OK",
|
||||||
@@ -489,55 +425,34 @@ final class APIServer {
|
|||||||
].joined(separator: "\r\n")
|
].joined(separator: "\r\n")
|
||||||
|
|
||||||
await Self.sendData(connection: connection, data: header.data(using: .utf8)!)
|
await Self.sendData(connection: connection, data: header.data(using: .utf8)!)
|
||||||
|
connection.stateUpdateHandler = { state in
|
||||||
|
switch state {
|
||||||
|
case .cancelled, .failed:
|
||||||
|
cancellation.cancel()
|
||||||
|
default:
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Send initial role chunk
|
let encoder = StreamingSSEEncoder(requestId: requestId, created: created, modelName: modelName)
|
||||||
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
|
await Self.sendData(connection: connection, data: encoder.encodeRoleDelta("assistant"))
|
||||||
id: requestId,
|
|
||||||
object: "chat.completion.chunk",
|
|
||||||
created: created,
|
|
||||||
model: modelName,
|
|
||||||
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: "assistant", content: nil, tool_calls: nil), finish_reason: nil)],
|
|
||||||
usage: nil
|
|
||||||
))
|
|
||||||
|
|
||||||
let hasTools = tools != nil && !(tools?.isEmpty ?? true)
|
let result = await Self.runStreamingLoop(
|
||||||
|
connection: connection,
|
||||||
// Run the generation loop OFF MainActor.
|
stream: stream,
|
||||||
// ChatSession and NWConnection don't need MainActor.
|
cancellation: cancellation,
|
||||||
// Running on MainActor caused every token to compete with SwiftUI
|
requestId: requestId,
|
||||||
// rendering, creating back-pressure that coalesced all output.
|
encoder: encoder
|
||||||
let stream = session.streamDetails(
|
|
||||||
to: prompt,
|
|
||||||
images: images,
|
|
||||||
videos: []
|
|
||||||
)
|
)
|
||||||
// Transfer non-Sendable values to the nonisolated loop.
|
|
||||||
// Safe because we don't touch session/images again until after the loop.
|
|
||||||
let result = await {
|
|
||||||
nonisolated(unsafe) let stream = stream
|
|
||||||
return await Self.runStreamingLoop(
|
|
||||||
connection: connection,
|
|
||||||
stream: stream,
|
|
||||||
requestId: requestId,
|
|
||||||
created: created,
|
|
||||||
modelName: modelName
|
|
||||||
)
|
|
||||||
}()
|
|
||||||
|
|
||||||
let (promptTokens, completionTokens, fullText, frameworkToolCalls, succeeded) = result
|
if result.cancelled {
|
||||||
|
connection.cancel()
|
||||||
if promptTokens > 0 {
|
return result
|
||||||
ConversationSessionCache.shared.markGenerating(
|
|
||||||
entryId: cacheEntryId,
|
|
||||||
promptTokens: promptTokens,
|
|
||||||
completionTokens: completionTokens
|
|
||||||
)
|
|
||||||
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let resolved = Self.resolveAssistantResponse(
|
let resolved = Self.resolveAssistantResponse(
|
||||||
fullText: fullText,
|
fullText: result.fullText,
|
||||||
frameworkToolCalls: frameworkToolCalls,
|
frameworkToolCalls: result.frameworkToolCalls,
|
||||||
tools: tools
|
tools: tools
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -562,21 +477,16 @@ final class APIServer {
|
|||||||
model: modelName,
|
model: modelName,
|
||||||
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
|
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
|
||||||
usage: APIUsageInfo(
|
usage: APIUsageInfo(
|
||||||
prompt_tokens: promptTokens,
|
prompt_tokens: result.promptTokens,
|
||||||
completion_tokens: completionTokens,
|
completion_tokens: result.completionTokens,
|
||||||
total_tokens: promptTokens + completionTokens
|
total_tokens: result.promptTokens + result.completionTokens
|
||||||
)
|
)
|
||||||
))
|
))
|
||||||
|
|
||||||
// Send [DONE] and close
|
// Send [DONE] and close
|
||||||
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
|
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
|
||||||
connection.cancel()
|
connection.cancel()
|
||||||
let assistantHistoryText = Self.normalizedAssistantHistoryContent(
|
return result
|
||||||
content: resolved.content,
|
|
||||||
toolCalls: resolved.toolCalls,
|
|
||||||
isQwen: isQwen
|
|
||||||
)
|
|
||||||
return (promptTokens, completionTokens, assistantHistoryText, succeeded)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the token generation + SSE send loop entirely off MainActor.
|
/// Run the token generation + SSE send loop entirely off MainActor.
|
||||||
@@ -584,54 +494,20 @@ final class APIServer {
|
|||||||
/// multiple actor hops competing with SwiftUI, causing all output to batch.
|
/// multiple actor hops competing with SwiftUI, causing all output to batch.
|
||||||
nonisolated private static func runStreamingLoop(
|
nonisolated private static func runStreamingLoop(
|
||||||
connection: NWConnection,
|
connection: NWConnection,
|
||||||
stream: AsyncThrowingStream<Generation, any Error>,
|
stream: AsyncStream<Generation>,
|
||||||
|
cancellation: CancellationToken,
|
||||||
requestId: String,
|
requestId: String,
|
||||||
created: Int,
|
encoder: StreamingSSEEncoder
|
||||||
modelName: String
|
) async -> GenerationOutcome {
|
||||||
) async -> (Int, Int, String, [MLXLMCommon.ToolCall], Bool) {
|
var outcome = await collectGenerationOutcome(
|
||||||
var promptTokens = 0
|
stream: stream,
|
||||||
var completionTokens = 0
|
requestId: requestId,
|
||||||
var fullText = ""
|
cancellation: cancellation
|
||||||
var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
|
) { text in
|
||||||
|
await sendData(connection: connection, data: encoder.encodeContentDelta(text))
|
||||||
do {
|
|
||||||
for try await generation in stream {
|
|
||||||
switch generation {
|
|
||||||
case .chunk(let text):
|
|
||||||
completionTokens += 1
|
|
||||||
fullText += text
|
|
||||||
|
|
||||||
// Update live counters directly — no MainActor hop needed
|
|
||||||
LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
|
|
||||||
|
|
||||||
// Send directly — no MainActor hop.
|
|
||||||
await sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
|
|
||||||
id: requestId,
|
|
||||||
object: "chat.completion.chunk",
|
|
||||||
created: created,
|
|
||||||
model: modelName,
|
|
||||||
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: text, tool_calls: nil), finish_reason: nil)],
|
|
||||||
usage: nil
|
|
||||||
))
|
|
||||||
|
|
||||||
case .info(let info):
|
|
||||||
promptTokens = info.promptTokenCount
|
|
||||||
completionTokens = info.generationTokenCount
|
|
||||||
if info.tokensPerSecond > 0 {
|
|
||||||
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
|
|
||||||
}
|
|
||||||
|
|
||||||
case .toolCall(let call):
|
|
||||||
frameworkToolCalls.append(call)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
|
|
||||||
await sendData(connection: connection, data: errorEvent.data(using: .utf8)!)
|
|
||||||
return (promptTokens, completionTokens, fullText, frameworkToolCalls, false)
|
|
||||||
}
|
}
|
||||||
|
outcome.succeeded = !outcome.cancelled
|
||||||
return (promptTokens, completionTokens, fullText, frameworkToolCalls, true)
|
return outcome
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Send an SSE event and wait for the protocol stack to process it.
|
/// Send an SSE event and wait for the protocol stack to process it.
|
||||||
@@ -651,6 +527,88 @@ final class APIServer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
nonisolated private static func collectGenerationOutcome(
|
||||||
|
stream: AsyncStream<Generation>,
|
||||||
|
requestId: String,
|
||||||
|
cancellation: CancellationToken?,
|
||||||
|
onChunk: ((String) async -> Void)? = nil
|
||||||
|
) async -> GenerationOutcome {
|
||||||
|
var promptTokens = 0
|
||||||
|
var completionTokens = 0
|
||||||
|
var fullText = ""
|
||||||
|
var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
|
||||||
|
var cancelled = false
|
||||||
|
|
||||||
|
for await generation in stream {
|
||||||
|
if let cancellation, cancellation.isCancelled {
|
||||||
|
cancelled = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
switch generation {
|
||||||
|
case .chunk(let text):
|
||||||
|
completionTokens += 1
|
||||||
|
fullText += text
|
||||||
|
LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
|
||||||
|
if let onChunk {
|
||||||
|
await onChunk(text)
|
||||||
|
}
|
||||||
|
case .info(let info):
|
||||||
|
promptTokens = info.promptTokenCount
|
||||||
|
completionTokens = info.generationTokenCount
|
||||||
|
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
|
||||||
|
if info.tokensPerSecond > 0 {
|
||||||
|
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
|
||||||
|
}
|
||||||
|
case .toolCall(let call):
|
||||||
|
frameworkToolCalls.append(call)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return GenerationOutcome(
|
||||||
|
promptTokens: promptTokens,
|
||||||
|
completionTokens: completionTokens,
|
||||||
|
fullText: fullText,
|
||||||
|
frameworkToolCalls: frameworkToolCalls,
|
||||||
|
succeeded: !cancelled,
|
||||||
|
cancelled: cancelled
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
private static func storePromptCache(
|
||||||
|
_ cache: [KVCache],
|
||||||
|
promptTokenCount: Int,
|
||||||
|
entryId: UUID,
|
||||||
|
cacheKey: [Int],
|
||||||
|
modelId: String
|
||||||
|
) {
|
||||||
|
guard trimGeneratedTokens(cache, promptTokenCount: promptTokenCount) else {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
TokenPrefixCache.shared.store(
|
||||||
|
entryId: entryId,
|
||||||
|
kvCache: cache,
|
||||||
|
cacheKey: cacheKey,
|
||||||
|
modelId: modelId
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
private static func trimGeneratedTokens(_ cache: [KVCache], promptTokenCount: Int) -> Bool {
|
||||||
|
for layer in cache {
|
||||||
|
let excess = layer.offset - promptTokenCount
|
||||||
|
guard excess <= 0 || layer.isTrimmable else {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if excess > 0 {
|
||||||
|
let trimmed = layer.trim(excess)
|
||||||
|
guard trimmed == excess else {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
// MARK: - HTTP helpers
|
// MARK: - HTTP helpers
|
||||||
|
|
||||||
private func sendResponse(
|
private func sendResponse(
|
||||||
@@ -787,6 +745,15 @@ final class APIServer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private struct GenerationOutcome {
|
||||||
|
var promptTokens: Int
|
||||||
|
var completionTokens: Int
|
||||||
|
var fullText: String
|
||||||
|
var frameworkToolCalls: [MLXLMCommon.ToolCall]
|
||||||
|
var succeeded: Bool
|
||||||
|
var cancelled: Bool
|
||||||
|
}
|
||||||
|
|
||||||
// MARK: - HTTP request parser
|
// MARK: - HTTP request parser
|
||||||
|
|
||||||
private struct HTTPRequest {
|
private struct HTTPRequest {
|
||||||
|
|||||||
@@ -1,358 +0,0 @@
|
|||||||
import Foundation
|
|
||||||
import MLXLMCommon
|
|
||||||
import os
|
|
||||||
|
|
||||||
enum APISessionPhase: String, Sendable {
|
|
||||||
case idle = "Idle"
|
|
||||||
case sessionBuild = "Session Build"
|
|
||||||
case prefilling = "Prefilling"
|
|
||||||
case generating = "Generating"
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Bounded cache of API chat sessions keyed by normalized conversation history.
/// The cache is internal-only and safe to sample from the monitor without involving MainActor.
///
/// Every access to `entries` and `totals` happens while `lock` is held. Helpers
/// whose names end in `Locked` do not take the lock themselves and must only be
/// called from a section that already holds it.
final class ConversationSessionCache: @unchecked Sendable {
    static let shared = ConversationSessionCache()

    private let lock = OSAllocatedUnfairLock()

    /// Entry-count ceiling applied by `enforceBudgetLocked(now:)`.
    private let maxEntries = 8
    /// Ceiling on the summed `cachedTokenEstimate` of all entries.
    private let maxCachedTokens = 256_000
    /// Entries with no in-flight request are pruned once idle for longer than this (10 minutes).
    private let idleTTL: TimeInterval = 10 * 60

    private var entries: [UUID: Entry] = [:]
    private var totals = Totals()

    private init() {}

    /// Result of `checkoutSession`. On a miss `session` is nil, `reusedPromptTokens`
    /// is 0 and `cacheHit` is false; `entryId` identifies the placeholder entry that
    /// was created for the request.
    struct Lease {
        let entryId: UUID
        let session: ChatSession?
        let reusedPromptTokens: Int
        let cacheHit: Bool
    }

    /// Read-only copy of one entry's bookkeeping, as exposed through `Snapshot`.
    struct SessionSummary: Identifiable, Sendable {
        let id: UUID
        let modelId: String
        let phase: APISessionPhase
        let messageCount: Int
        let cachedTokenEstimate: Int
        let estimatedBytes: Int
        let inFlightRequests: Int
        let hitCount: Int
        let lastPromptTokens: Int
        let lastCompletionTokens: Int
        let lastReuseTokens: Int
        let createdAt: Date
        let lastAccessAt: Date
    }

    /// Point-in-time aggregate view of the cache plus its lifetime counters.
    struct Snapshot: Sendable {
        let totalEntries: Int
        /// Entries that currently hold a session.
        let warmEntries: Int
        /// Entries with at least one in-flight request.
        let activeEntries: Int
        /// Entries whose phase is `.generating`.
        let generatingEntries: Int
        let estimatedBytes: Int
        let cachedTokenEstimate: Int
        let totalHits: Int
        let totalMisses: Int
        let totalEvictions: Int
        let totalReusePromptTokens: Int
        let totalRebuildPromptTokens: Int
        /// Per-entry summaries, in-flight-heavy entries first, then most recently accessed first.
        let sessions: [SessionSummary]
    }

    /// Looks up a reusable session for the request, or registers a placeholder entry.
    ///
    /// A cached entry qualifies when it has the same `modelId`, the same instructions
    /// hash, holds a session, has no in-flight request, and its stored signatures are
    /// a prefix of `historySignatures` (see `historyMatches`). Among qualifying entries
    /// the one with the most stored signatures wins.
    ///
    /// - Returns: A hit lease carrying the cached session, or a miss lease (`session == nil`)
    ///   for a newly inserted entry with one in-flight request in phase `.sessionBuild`.
    func checkoutSession(
        modelId: String,
        instructions: String,
        historySignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int
    ) -> Lease {
        lock.lock()
        // Released on every exit path, including the early return for a hit.
        defer { lock.unlock() }

        let now = Date()
        pruneExpiredLocked(now: now)

        let instructionsHash = Self.stableHash(instructions)
        let candidates = entries.values.filter { candidate in
            candidate.modelId == modelId
                && candidate.instructionsHash == instructionsHash
                && candidate.session != nil
                && candidate.inFlightRequests == 0
                && Self.historyMatches(cached: candidate.requestMessageSignatures, incoming: historySignatures)
        }
        let longestMatch = candidates.max {
            $0.requestMessageSignatures.count < $1.requestMessageSignatures.count
        }

        if var entry = longestMatch {
            entry.inFlightRequests += 1
            entry.lastAccessAt = now
            entry.phase = .prefilling
            // NOTE(review): `max` reports the larger of the cached estimate and the new
            // prompt estimate as "reused"; if the intent is "tokens actually served from
            // cache", `min` may be what was meant — confirm before changing the metric.
            entry.lastReuseTokens = max(entry.cachedTokenEstimate, estimatedPromptTokens)
            entry.hitCount += 1
            entries[entry.id] = entry

            totals.totalHits += 1
            totals.totalReusePromptTokens += entry.lastReuseTokens

            return Lease(
                entryId: entry.id,
                session: entry.session,
                reusedPromptTokens: entry.lastReuseTokens,
                cacheHit: true
            )
        }

        // Miss: register a session-less entry so the request is tracked while it builds.
        let entryId = UUID()
        entries[entryId] = Entry(
            id: entryId,
            modelId: modelId,
            instructionsHash: instructionsHash,
            requestMessageSignatures: historySignatures,
            messageCount: requestMessageCount,
            cachedTokenEstimate: estimatedPromptTokens,
            estimatedBytes: estimatedBytes,
            createdAt: now,
            lastAccessAt: now,
            inFlightRequests: 1,
            hitCount: 0,
            phase: .sessionBuild,
            lastPromptTokens: 0,
            lastCompletionTokens: 0,
            lastReuseTokens: 0,
            session: nil
        )
        totals.totalMisses += 1
        totals.totalRebuildPromptTokens += estimatedPromptTokens

        return Lease(entryId: entryId, session: nil, reusedPromptTokens: 0, cacheHit: false)
    }

    /// Sets the entry's phase to `.sessionBuild`; no-op for an unknown `entryId`.
    func markSessionBuild(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .sessionBuild)
    }

    /// Sets the entry's phase to `.prefilling`; no-op for an unknown `entryId`.
    func markPrefilling(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .prefilling)
    }

    /// Moves the entry to `.generating` and records the latest token counts.
    /// The cached-token estimate only ever grows here. No-op for an unknown `entryId`.
    func markGenerating(entryId: UUID, promptTokens: Int, completionTokens: Int) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.phase = .generating
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.cachedTokenEstimate = max(entry.cachedTokenEstimate, promptTokens + completionTokens)
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Stores the finished request's session and statistics on its entry, releases one
    /// in-flight slot, marks the entry `.idle`, then enforces the cache budgets.
    /// No-op (including no budget enforcement) when `entryId` is no longer present.
    func completeRequest(
        entryId: UUID,
        session: ChatSession,
        requestMessageSignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int,
        promptTokens: Int,
        completionTokens: Int
    ) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        let now = Date()
        entry.session = session
        entry.requestMessageSignatures = requestMessageSignatures
        entry.messageCount = requestMessageCount
        entry.cachedTokenEstimate = max(estimatedPromptTokens, promptTokens + completionTokens)
        entry.estimatedBytes = estimatedBytes
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.lastAccessAt = now
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
        entry.phase = .idle
        entries[entryId] = entry

        enforceBudgetLocked(now: now)
    }

    /// Releases one in-flight slot without storing a session. An entry that ends up with
    /// no session and no in-flight request is removed (not counted as an eviction);
    /// otherwise it is marked `.idle` and its access time is refreshed.
    func abandonRequest(entryId: UUID) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)

        if entry.session == nil && entry.inFlightRequests == 0 {
            entries.removeValue(forKey: entryId)
            return
        }
        entry.phase = .idle
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Drops every entry (in-flight ones included) and counts each as an eviction.
    func invalidateAll() {
        lock.lock()
        defer { lock.unlock() }

        totals.totalEvictions += entries.count
        entries.removeAll()
    }

    /// Drops every entry and zeroes the lifetime counters.
    func reset() {
        lock.lock()
        defer { lock.unlock() }

        entries.removeAll()
        totals = Totals()
    }

    /// Returns an aggregate view of the cache. Expired idle entries are pruned first,
    /// so taking a snapshot can itself change the cache contents and eviction count.
    func snapshot() -> Snapshot {
        lock.lock()
        defer { lock.unlock() }

        pruneExpiredLocked(now: Date())

        let allEntries = Array(entries.values)
        let orderedEntries = allEntries.sorted { lhs, rhs in
            // Busiest first; ties broken by most recent access.
            if lhs.inFlightRequests != rhs.inFlightRequests {
                return lhs.inFlightRequests > rhs.inFlightRequests
            }
            return lhs.lastAccessAt > rhs.lastAccessAt
        }
        let summaries = orderedEntries.map { entry in
            SessionSummary(
                id: entry.id,
                modelId: entry.modelId,
                phase: entry.phase,
                messageCount: entry.messageCount,
                cachedTokenEstimate: entry.cachedTokenEstimate,
                estimatedBytes: entry.estimatedBytes,
                inFlightRequests: entry.inFlightRequests,
                hitCount: entry.hitCount,
                lastPromptTokens: entry.lastPromptTokens,
                lastCompletionTokens: entry.lastCompletionTokens,
                lastReuseTokens: entry.lastReuseTokens,
                createdAt: entry.createdAt,
                lastAccessAt: entry.lastAccessAt
            )
        }

        return Snapshot(
            totalEntries: allEntries.count,
            warmEntries: allEntries.filter { $0.session != nil }.count,
            activeEntries: allEntries.filter { $0.inFlightRequests > 0 }.count,
            generatingEntries: allEntries.filter { $0.phase == .generating }.count,
            estimatedBytes: allEntries.reduce(0) { $0 + $1.estimatedBytes },
            cachedTokenEstimate: allEntries.reduce(0) { $0 + $1.cachedTokenEstimate },
            totalHits: totals.totalHits,
            totalMisses: totals.totalMisses,
            totalEvictions: totals.totalEvictions,
            totalReusePromptTokens: totals.totalReusePromptTokens,
            totalRebuildPromptTokens: totals.totalRebuildPromptTokens,
            sessions: summaries
        )
    }

    /// Sets `phase` on the entry and refreshes its access time; no-op for an unknown id.
    private func updatePhase(entryId: UUID, phase: APISessionPhase) {
        lock.lock()
        defer { lock.unlock() }

        guard var entry = entries[entryId] else { return }
        entry.phase = phase
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Removes entries that have no in-flight request and have been idle longer than
    /// `idleTTL`, counting each as an eviction. Caller must hold `lock`.
    private func pruneExpiredLocked(now: Date) {
        let expiredIds = entries.values
            .filter { $0.inFlightRequests == 0 && now.timeIntervalSince($0.lastAccessAt) > idleTTL }
            .map { $0.id }
        guard !expiredIds.isEmpty else { return }

        for id in expiredIds {
            entries.removeValue(forKey: id)
        }
        totals.totalEvictions += expiredIds.count
    }

    /// Prunes expired entries, then evicts entries without in-flight requests (in
    /// `evictionOrder`) until both the entry-count and cached-token budgets are met or
    /// nothing evictable remains. Caller must hold `lock`.
    private func enforceBudgetLocked(now: Date) {
        pruneExpiredLocked(now: now)

        // Track the token total incrementally instead of re-summing after each eviction.
        var cachedTokens = entries.values.reduce(0) { $0 + $1.cachedTokenEstimate }

        while entries.count > maxEntries || cachedTokens > maxCachedTokens {
            // `min(by:)` picks the same victim as sorting and taking the first element,
            // without sorting the whole candidate list on every iteration.
            let victim = entries.values
                .filter { $0.inFlightRequests == 0 }
                .min(by: evictionOrder)
            guard let victim else {
                // Only in-flight entries remain; the budget stays exceeded for now.
                break
            }
            entries.removeValue(forKey: victim.id)
            cachedTokens -= victim.cachedTokenEstimate
            totals.totalEvictions += 1
        }
    }

    /// Strict ordering of eviction priority: `true` when `lhs` should be evicted before
    /// `rhs`. Oldest access first, then larger token estimate, then oldest creation.
    private func evictionOrder(lhs: Entry, rhs: Entry) -> Bool {
        guard lhs.lastAccessAt == rhs.lastAccessAt else {
            return lhs.lastAccessAt < rhs.lastAccessAt
        }
        guard lhs.cachedTokenEstimate == rhs.cachedTokenEstimate else {
            return lhs.cachedTokenEstimate > rhs.cachedTokenEstimate
        }
        return lhs.createdAt < rhs.createdAt
    }

    /// `true` when `incoming` has the same length as `cached` or exactly one more element,
    /// and begins with every element of `cached` in order.
    private static func historyMatches(cached: [UInt64], incoming: [UInt64]) -> Bool {
        let extraMessages = incoming.count - cached.count
        guard extraMessages == 0 || extraMessages == 1 else { return false }
        return incoming.starts(with: cached)
    }

    /// 64-bit FNV-1a hash over the UTF-8 bytes of `text`. Depends only on the bytes,
    /// so equal strings always produce equal values.
    static func stableHash(_ text: String) -> UInt64 {
        let offsetBasis: UInt64 = 14_695_981_039_346_656_037
        let prime: UInt64 = 1_099_511_628_211
        return text.utf8.reduce(offsetBasis) { hash, byte in
            // XOR the byte in, then multiply with wrap-around.
            (hash ^ UInt64(byte)) &* prime
        }
    }

    /// Internal per-conversation record; `session` is nil until a request completes.
    private struct Entry {
        let id: UUID
        let modelId: String
        let instructionsHash: UInt64
        var requestMessageSignatures: [UInt64]
        var messageCount: Int
        var cachedTokenEstimate: Int
        var estimatedBytes: Int
        let createdAt: Date
        var lastAccessAt: Date
        var inFlightRequests: Int
        var hitCount: Int
        var phase: APISessionPhase
        var lastPromptTokens: Int
        var lastCompletionTokens: Int
        var lastReuseTokens: Int
        var session: ChatSession?
    }

    /// Lifetime counters; cleared only by `reset()`.
    private struct Totals {
        var totalHits: Int = 0
        var totalMisses: Int = 0
        var totalEvictions: Int = 0
        var totalReusePromptTokens: Int = 0
        var totalRebuildPromptTokens: Int = 0
    }
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
736
MLXServerTests/Server/APIServerRewriteTests.swift
Normal file
736
MLXServerTests/Server/APIServerRewriteTests.swift
Normal file
@@ -0,0 +1,736 @@
|
|||||||
|
import Foundation
|
||||||
|
import XCTest
|
||||||
|
@testable import MLX_Server
|
||||||
|
|
||||||
|
final class APIServerRewriteTests: XCTestCase {
|
||||||
|
/// Sends the same non-streaming request twice and checks that the first call populates
/// `TokenPrefixCache` while the second registers a cache hit and additional reused
/// prompt tokens in `LiveCounters`.
func testNonStreamingChatCompletionUsesStatelessServerPathAndCachesPrompt() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let request = APIChatCompletionRequest(
        model: "gemma",
        messages: [
            APIChatMessage(role: "user", content: .text("Reply with exactly one short word."), name: nil, tool_calls: nil, tool_call_id: nil)
        ],
        temperature: 0,
        top_p: 1,
        max_tokens: 1,
        stream: false,
        stop: nil,
        tools: nil,
        tool_choice: nil,
        frequency_penalty: nil,
        presence_penalty: nil,
        n: nil
    )

    // First call: response shape and usage accounting.
    let firstResponse = try await sendChatCompletion(request, port: harness.port)
    XCTAssertEqual(firstResponse.choices.count, 1)
    // Unwrap instead of indexing so an empty `choices` array is reported as a test
    // failure rather than trapping the whole test process.
    let firstChoice = try XCTUnwrap(firstResponse.choices.first)
    XCTAssertEqual(firstChoice.message.role, "assistant")
    XCTAssertGreaterThan(firstResponse.usage.prompt_tokens, 0)
    XCTAssertGreaterThanOrEqual(firstResponse.usage.completion_tokens, 0)

    // The cache is polled rather than read once; wait until an entry is visible.
    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let firstSnapshot = TokenPrefixCache.shared.snapshot()
    let firstLiveSnapshot = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(firstSnapshot.totalEntries, 0)

    // Second, identical call: must be served with a cache hit.
    _ = try await sendChatCompletion(request, port: harness.port)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
    }
    let secondSnapshot = TokenPrefixCache.shared.snapshot()
    let secondLiveSnapshot = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
    XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)
}
|
||||||
|
|
||||||
|
/// Streams three turns of one conversation, each request repeating the previous history
/// plus the prior assistant reply and a new user question, and checks that turns two and
/// three each add a `TokenPrefixCache` hit and more reused prompt tokens.
func testStreamingChatCompletionReusesCacheAcrossThreeProgressivelyLongerTurns() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    // Plain text message with the given role.
    func turn(_ role: String, _ text: String) -> APIChatMessage {
        APIChatMessage(role: role, content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
    }
    // Every request in this test differs only in its message history.
    func streamingRequest(_ messages: [APIChatMessage]) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: messages,
            temperature: 0,
            top_p: 1,
            max_tokens: 3,
            stream: true,
            stop: nil,
            tools: nil,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Turn 1: a single user question.
    var conversation = [turn("user", "Answer in one word: what color is the sky on a clear day?")]
    let skyStream = try await sendStreamingChatCompletion(streamingRequest(conversation), port: harness.port)
    XCTAssertEqual(skyStream.roleDeltaCount, 1)
    XCTAssertTrue(skyStream.sawDone)
    XCTAssertEqual(skyStream.finalFinishReason, "stop")
    XCTAssertGreaterThan(skyStream.usage?.prompt_tokens ?? 0, 0)
    XCTAssertFalse(skyStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let cacheAfterFirst = TokenPrefixCache.shared.snapshot()
    let liveAfterFirst = LiveCounters.shared.snapshot()

    // Turn 2: history + first answer + second question.
    conversation += [
        turn("assistant", skyStream.content),
        turn("user", "Answer in one word: what color is grass?")
    ]
    let grassStream = try await sendStreamingChatCompletion(streamingRequest(conversation), port: harness.port)
    XCTAssertEqual(grassStream.roleDeltaCount, 1)
    XCTAssertTrue(grassStream.sawDone)
    XCTAssertEqual(grassStream.finalFinishReason, "stop")
    XCTAssertFalse(grassStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterFirst.totalHits
    }
    let cacheAfterSecond = TokenPrefixCache.shared.snapshot()
    let liveAfterSecond = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterSecond.totalHits, cacheAfterFirst.totalHits)
    XCTAssertGreaterThan(liveAfterSecond.totalCacheReusePromptTokens, liveAfterFirst.totalCacheReusePromptTokens)

    // Turn 3: history + second answer + third question.
    conversation += [
        turn("assistant", grassStream.content),
        turn("user", "Answer in one word: what color is snow?")
    ]
    let snowStream = try await sendStreamingChatCompletion(streamingRequest(conversation), port: harness.port)
    XCTAssertEqual(snowStream.roleDeltaCount, 1)
    XCTAssertTrue(snowStream.sawDone)
    XCTAssertEqual(snowStream.finalFinishReason, "stop")
    XCTAssertFalse(snowStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterSecond.totalHits
    }
    let cacheAfterThird = TokenPrefixCache.shared.snapshot()
    let liveAfterThird = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterThird.totalHits, cacheAfterSecond.totalHits)
    XCTAssertGreaterThan(liveAfterThird.totalCacheReusePromptTokens, liveAfterSecond.totalCacheReusePromptTokens)
}
|
||||||
|
|
||||||
|
/// Drives a tool-call turn, the follow-up turn that supplies the tool result, and one
/// further user turn, checking that the second and third requests each add a
/// `TokenPrefixCache` hit and more reused prompt tokens.
func testStreamingChatCompletionReusesCacheAcrossToolBoundary() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let tools = [mockWeatherTool]

    // Plain text message with the given role.
    func turn(_ role: String, _ text: String) -> APIChatMessage {
        APIChatMessage(role: role, content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
    }
    // Requests differ only in history and token limit; the tool list is always attached.
    func streamingRequest(_ messages: [APIChatMessage], maxTokens: Int) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: messages,
            temperature: 0,
            top_p: 1,
            max_tokens: maxTokens,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Turn 1: the model is expected to answer with a tool call.
    var conversation = [turn("user", "You must call the weather tool for Berlin. Do not answer directly.")]
    let toolCallStream = try await sendStreamingChatCompletion(streamingRequest(conversation, maxTokens: 48), port: harness.port)
    XCTAssertEqual(toolCallStream.roleDeltaCount, 1)
    XCTAssertTrue(toolCallStream.sawDone)
    XCTAssertEqual(toolCallStream.finalFinishReason, "tool_calls")
    let toolCall = try XCTUnwrap(toolCallStream.toolCalls.first)
    XCTAssertEqual(toolCall.function.name, "weather")

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let cacheAfterToolCall = TokenPrefixCache.shared.snapshot()
    let liveAfterToolCall = LiveCounters.shared.snapshot()

    // Turn 2: replay the tool call and hand back the tool result.
    conversation += [
        APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [toolCall], tool_call_id: nil),
        APIChatMessage(role: "tool", content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"), name: nil, tool_calls: nil, tool_call_id: toolCall.id)
    ]
    let directAnswerStream = try await sendStreamingChatCompletion(streamingRequest(conversation, maxTokens: 16), port: harness.port)
    XCTAssertEqual(directAnswerStream.roleDeltaCount, 1)
    XCTAssertTrue(directAnswerStream.sawDone)
    XCTAssertEqual(directAnswerStream.finalFinishReason, "stop")
    XCTAssertTrue(directAnswerStream.toolCalls.isEmpty)
    XCTAssertFalse(directAnswerStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterToolCall.totalHits
    }
    let cacheAfterDirectAnswer = TokenPrefixCache.shared.snapshot()
    let liveAfterDirectAnswer = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterDirectAnswer.totalHits, cacheAfterToolCall.totalHits)
    XCTAssertGreaterThan(liveAfterDirectAnswer.totalCacheReusePromptTokens, liveAfterToolCall.totalCacheReusePromptTokens)

    // Turn 3: ordinary follow-up after the tool exchange.
    conversation += [
        turn("assistant", directAnswerStream.content),
        turn("user", "Now compress that answer to two words.")
    ]
    let followUpStream = try await sendStreamingChatCompletion(streamingRequest(conversation, maxTokens: 8), port: harness.port)
    XCTAssertEqual(followUpStream.roleDeltaCount, 1)
    XCTAssertTrue(followUpStream.sawDone)
    XCTAssertEqual(followUpStream.finalFinishReason, "stop")
    XCTAssertFalse(followUpStream.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterDirectAnswer.totalHits
    }
    let cacheAtEnd = TokenPrefixCache.shared.snapshot()
    let liveAtEnd = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAtEnd.totalHits, cacheAfterDirectAnswer.totalHits)
    XCTAssertGreaterThan(liveAtEnd.totalCacheReusePromptTokens, liveAfterDirectAnswer.totalCacheReusePromptTokens)
}
|
||||||
|
|
||||||
|
/// Runs two complete tool round-trips (Berlin, then Paris) inside one growing conversation
/// and checks that each of the three requests after the first adds a `TokenPrefixCache`
/// hit and more reused prompt tokens.
func testStreamingChatCompletionReusesCacheAcrossMultipleToolTurns() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let tools = [mockWeatherTool]

    // Plain text message with the given role.
    func turn(_ role: String, _ text: String) -> APIChatMessage {
        APIChatMessage(role: role, content: .text(text), name: nil, tool_calls: nil, tool_call_id: nil)
    }
    // Requests differ only in history and token limit; the tool list is always attached.
    func streamingRequest(_ messages: [APIChatMessage], maxTokens: Int) -> APIChatCompletionRequest {
        APIChatCompletionRequest(
            model: "gemma",
            messages: messages,
            temperature: 0,
            top_p: 1,
            max_tokens: maxTokens,
            stream: true,
            stop: nil,
            tools: tools,
            tool_choice: nil,
            frequency_penalty: nil,
            presence_penalty: nil,
            n: nil
        )
    }

    // Request 1: Berlin tool call.
    var conversation = [turn("user", "Call the weather tool for Berlin. Do not answer directly.")]
    let berlinToolTurn = try await sendStreamingChatCompletion(streamingRequest(conversation, maxTokens: 48), port: harness.port)
    XCTAssertEqual(berlinToolTurn.finalFinishReason, "tool_calls")
    let berlinToolCall = try XCTUnwrap(berlinToolTurn.toolCalls.first)
    XCTAssertEqual(berlinToolCall.function.name, "weather")

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > 0
    }
    let cacheAfterFirst = TokenPrefixCache.shared.snapshot()
    let liveAfterFirst = LiveCounters.shared.snapshot()

    // Request 2: Berlin tool result -> direct answer.
    let berlinToolResult = APIChatMessage(
        role: "tool",
        content: .text("{\"city\":\"Berlin\",\"temperature_c\":19,\"condition\":\"sunny\"}"),
        name: nil,
        tool_calls: nil,
        tool_call_id: berlinToolCall.id
    )
    conversation += [
        APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [berlinToolCall], tool_call_id: nil),
        berlinToolResult
    ]
    let berlinAnswer = try await sendStreamingChatCompletion(streamingRequest(conversation, maxTokens: 16), port: harness.port)
    XCTAssertEqual(berlinAnswer.finalFinishReason, "stop")
    XCTAssertFalse(berlinAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterFirst.totalHits
    }
    let cacheAfterSecond = TokenPrefixCache.shared.snapshot()
    let liveAfterSecond = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterSecond.totalHits, cacheAfterFirst.totalHits)
    XCTAssertGreaterThan(liveAfterSecond.totalCacheReusePromptTokens, liveAfterFirst.totalCacheReusePromptTokens)

    // Request 3: Paris tool call on top of the Berlin exchange.
    conversation += [
        turn("assistant", berlinAnswer.content),
        turn("user", "Now call the weather tool for Paris. Do not answer directly.")
    ]
    let parisToolTurn = try await sendStreamingChatCompletion(streamingRequest(conversation, maxTokens: 48), port: harness.port)
    XCTAssertEqual(parisToolTurn.finalFinishReason, "tool_calls")
    let parisToolCall = try XCTUnwrap(parisToolTurn.toolCalls.first)
    XCTAssertEqual(parisToolCall.function.name, "weather")

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterSecond.totalHits
    }
    let cacheAfterThird = TokenPrefixCache.shared.snapshot()
    let liveAfterThird = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterThird.totalHits, cacheAfterSecond.totalHits)
    XCTAssertGreaterThan(liveAfterThird.totalCacheReusePromptTokens, liveAfterSecond.totalCacheReusePromptTokens)

    // Request 4: Paris tool result -> direct answer.
    conversation += [
        APIChatMessage(role: "assistant", content: nil, name: nil, tool_calls: [parisToolCall], tool_call_id: nil),
        APIChatMessage(role: "tool", content: .text("{\"city\":\"Paris\",\"temperature_c\":21,\"condition\":\"clear\"}"), name: nil, tool_calls: nil, tool_call_id: parisToolCall.id)
    ]
    let parisAnswer = try await sendStreamingChatCompletion(streamingRequest(conversation, maxTokens: 16), port: harness.port)
    XCTAssertEqual(parisAnswer.finalFinishReason, "stop")
    XCTAssertTrue(parisAnswer.toolCalls.isEmpty)
    XCTAssertFalse(parisAnswer.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterThird.totalHits
    }
    let cacheAfterFourth = TokenPrefixCache.shared.snapshot()
    let liveAfterFourth = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterFourth.totalHits, cacheAfterThird.totalHits)
    XCTAssertGreaterThan(liveAfterFourth.totalCacheReusePromptTokens, liveAfterThird.totalCacheReusePromptTokens)
}
|
||||||
|
|
||||||
|
/// Checks that a streaming request abandoned by the client after its first
/// content delta still adds a prefix-cache entry, and that replaying the same
/// prompt afterwards registers a cache hit and more reused prompt tokens.
func testStreamingDisconnectStoresPromptCacheForReuse() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let countingPrompt = APIChatMessage(
        role: "user",
        content: .text("Count from one to twenty with commas, using many tokens."),
        name: nil,
        tool_calls: nil,
        tool_call_id: nil
    )
    let request = APIChatCompletionRequest(
        model: "gemma",
        messages: [countingPrompt],
        temperature: 0,
        top_p: 1,
        max_tokens: 64,
        stream: true,
        stop: nil,
        tools: nil,
        tool_choice: nil,
        frequency_penalty: nil,
        presence_penalty: nil,
        n: nil
    )

    // Phase 1: start streaming, drop the connection, and wait for a new cache entry.
    let baselineCache = TokenPrefixCache.shared.snapshot()
    try await cancelStreamingChatCompletionAfterFirstContent(request, port: harness.port)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalEntries > baselineCache.totalEntries
    }
    let cacheAfterDisconnect = TokenPrefixCache.shared.snapshot()
    let countersAfterDisconnect = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterDisconnect.totalEntries, baselineCache.totalEntries)

    // Phase 2: replay the same prompt without streaming and with a smaller token budget.
    let replayRequest = APIChatCompletionRequest(
        model: request.model,
        messages: request.messages,
        temperature: request.temperature,
        top_p: request.top_p,
        max_tokens: 8,
        stream: false,
        stop: request.stop,
        tools: request.tools,
        tool_choice: request.tool_choice,
        frequency_penalty: request.frequency_penalty,
        presence_penalty: request.presence_penalty,
        n: request.n
    )
    _ = try await sendChatCompletion(replayRequest, port: harness.port)

    try await waitUntil(timeoutSeconds: 5) {
        TokenPrefixCache.shared.snapshot().totalHits > cacheAfterDisconnect.totalHits
    }
    let cacheAfterReplay = TokenPrefixCache.shared.snapshot()
    let countersAfterReplay = LiveCounters.shared.snapshot()
    XCTAssertGreaterThan(cacheAfterReplay.totalHits, cacheAfterDisconnect.totalHits)
    XCTAssertGreaterThan(
        countersAfterReplay.totalCacheReusePromptTokens,
        countersAfterDisconnect.totalCacheReusePromptTokens
    )
}
|
||||||
|
|
||||||
|
/// Checks the ordering of streamed events for a tool-calling request: exactly
/// one assistant role event, which comes first; at least one tool-call event;
/// and a single trailing final event with finish reason `tool_calls` and usage
/// attached, followed by the `[DONE]` sentinel.
func testStreamingToolCallChunksArriveInOpenAICompatibleOrder() async throws {
    let harness = try await makeHarness()
    defer { harness.stop() }

    let weatherPrompt = APIChatMessage(
        role: "user",
        content: .text("Call the weather tool for Berlin. Do not answer directly."),
        name: nil,
        tool_calls: nil,
        tool_call_id: nil
    )
    let toolRequest = APIChatCompletionRequest(
        model: "gemma",
        messages: [weatherPrompt],
        temperature: 0,
        top_p: 1,
        max_tokens: 48,
        stream: true,
        stop: nil,
        tools: [mockWeatherTool],
        tool_choice: nil,
        frequency_penalty: nil,
        presence_penalty: nil,
        n: nil
    )
    let detailed = try await sendStreamingChatCompletionDetailed(toolRequest, port: harness.port)
    let events = detailed.events

    XCTAssertTrue(detailed.sawDone)
    XCTAssertFalse(events.isEmpty)

    // The stream must open with the assistant role event.
    let openingEvent = try XCTUnwrap(events.first)
    XCTAssertEqual(openingEvent.kind, .role)
    XCTAssertEqual(openingEvent.role, "assistant")

    // At least one tool-call event must be present.
    let toolPositions = events.indices.filter { events[$0].kind == .toolCall }
    XCTAssertFalse(toolPositions.isEmpty)

    // The final event must be the last recorded event.
    let finalPosition = try XCTUnwrap(events.lastIndex(where: { $0.kind == .final }))
    XCTAssertEqual(finalPosition, events.count - 1)

    // Every tool-call event precedes the final event.
    toolPositions.forEach { position in
        XCTAssertLessThan(position, finalPosition)
    }

    let closingEvent = events[finalPosition]
    XCTAssertEqual(closingEvent.finishReason, "tool_calls")
    XCTAssertNotNil(closingEvent.usage)

    // The role must be announced exactly once.
    XCTAssertEqual(events.filter { $0.kind == .role }.count, 1)
}
|
||||||
|
|
||||||
|
/// Tool definition for a `weather` function whose parameter schema is an
/// object with a single required string property, `city`.
private var mockWeatherTool: APIToolDefinition {
    let weatherFunction = APIFunctionDefinition(
        name: "weather",
        description: "Look up weather for a city.",
        parameters: [
            "type": AnyCodable("object"),
            "properties": AnyCodable([
                "city": [
                    "type": "string",
                    "description": "City name"
                ]
            ]),
            "required": AnyCodable(["city"])
        ]
    )
    return APIToolDefinition(type: "function", function: weatherFunction)
}
|
||||||
|
|
||||||
|
/// Builds the fixture used by the end-to-end tests.
///
/// Resets `LiveCounters` and `TokenPrefixCache`, loads the `gemma` model into
/// a fresh `ModelManager`, starts an `APIServer` on a randomly chosen port in
/// 20000...40000, and waits up to five seconds for the server to report that
/// it is running.
///
/// - Returns: A harness holding the server, the model manager, and the port.
/// - Throws: If the `gemma` configuration cannot be resolved, or if the
///   polling sleep is interrupted.
private func makeHarness() async throws -> TestHarness {
    let manager = await MainActor.run { ModelManager() }
    let gemmaConfig = try XCTUnwrap(ModelConfig.resolve("gemma"))

    // Clear shared state first so snapshot deltas in the tests are meaningful.
    LiveCounters.shared.reset()
    TokenPrefixCache.shared.reset()

    await manager.loadModel(gemmaConfig)
    let modelIsReady = await MainActor.run { manager.isReady }
    XCTAssertTrue(modelIsReady)

    let apiServer = await MainActor.run { APIServer() }
    // NOTE(review): the port is picked at random without probing, so a clash
    // with a port already in use is possible.
    let chosenPort = UInt16.random(in: 20_000...40_000)
    await MainActor.run {
        apiServer.start(modelManager: manager, port: Int(chosenPort))
    }

    try await waitUntil(timeoutSeconds: 5) {
        await MainActor.run { apiServer.isRunning }
    }

    return TestHarness(server: apiServer, modelManager: manager, port: chosenPort)
}
|
||||||
|
|
||||||
|
/// POSTs `request` as JSON to the local `/v1/chat/completions` endpoint and
/// decodes the complete (non-streamed) response body.
///
/// Records a test failure, with the response body as its message, when the
/// status code is not 200; decoding is still attempted afterwards.
///
/// - Parameters:
///   - request: The chat-completion request to encode as the body.
///   - port: Port of the server on 127.0.0.1.
/// - Returns: The decoded chat-completion response.
/// - Throws: Encoding, transport, unwrap, or decoding errors.
private func sendChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> APIChatCompletionResponse {
    var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
    post.httpMethod = "POST"
    post.setValue("application/json", forHTTPHeaderField: "Content-Type")
    post.httpBody = try JSONEncoder().encode(request)

    let (body, rawResponse) = try await URLSession.shared.data(for: post)
    let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
    XCTAssertEqual(http.statusCode, 200, String(data: body, encoding: .utf8) ?? "")

    return try JSONDecoder().decode(APIChatCompletionResponse.self, from: body)
}
|
||||||
|
|
||||||
|
/// Runs a streaming chat completion and collapses the recorded events into a
/// single summary.
///
/// Content deltas are concatenated in arrival order, tool calls from all
/// tool-call events are flattened into one list, and the finish reason and
/// usage are taken from the last final event (both `nil` if there was none).
///
/// - Parameters:
///   - request: The chat-completion request to send.
///   - port: Port of the server on 127.0.0.1.
/// - Returns: The aggregated view of the stream.
/// - Throws: Whatever `sendStreamingChatCompletionDetailed` throws.
private func sendStreamingChatCompletion(_ request: APIChatCompletionRequest, port: UInt16) async throws -> StreamingResult {
    let detailed = try await sendStreamingChatCompletionDetailed(request, port: port)
    let events = detailed.events
    let closingEvent = events.last(where: { $0.kind == .final })

    return StreamingResult(
        roleDeltaCount: events.filter { $0.kind == .role }.count,
        content: events.compactMap { $0.content }.joined(),
        toolCalls: events.flatMap { $0.toolCalls },
        finalFinishReason: closingEvent?.finishReason,
        usage: closingEvent?.usage,
        sawDone: detailed.sawDone
    )
}
|
||||||
|
|
||||||
|
/// POSTs `request` to the local `/v1/chat/completions` endpoint and records
/// the streamed response as an ordered list of events.
///
/// Only lines starting with `data: ` are considered. Reading stops at the
/// `[DONE]` payload. Each decoded chunk may add up to four events, always in
/// this order: role (only when the role is `assistant`), content, tool call,
/// final. Only the first choice of each chunk is inspected.
///
/// On a non-200 status the body is drained, a test failure is recorded, and
/// an empty result is returned.
///
/// - Parameters:
///   - request: The chat-completion request to encode as the body.
///   - port: Port of the server on 127.0.0.1.
/// - Returns: The recorded events and whether `[DONE]` was observed.
/// - Throws: Encoding, transport, unwrap, or chunk-decoding errors.
private func sendStreamingChatCompletionDetailed(_ request: APIChatCompletionRequest, port: UInt16) async throws -> DetailedStreamingResult {
    var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
    post.httpMethod = "POST"
    post.setValue("application/json", forHTTPHeaderField: "Content-Type")
    post.httpBody = try JSONEncoder().encode(request)

    let (stream, rawResponse) = try await URLSession.shared.bytes(for: post)
    let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)

    if http.statusCode != 200 {
        // Collect the error body so the failure message shows what the server said.
        var errorBody = ""
        for try await line in stream.lines {
            errorBody += line
        }
        XCTFail("Expected 200 response, got \(http.statusCode): \(errorBody)")
        return DetailedStreamingResult(events: [], sawDone: false)
    }

    let decoder = JSONDecoder()
    let dataPrefix = "data: "
    var recorded: [StreamingEvent] = []
    var reachedDone = false

    for try await line in stream.lines {
        guard line.hasPrefix(dataPrefix) else { continue }

        let payload = String(line.dropFirst(dataPrefix.count))
        if payload == "[DONE]" {
            reachedDone = true
            break
        }

        let chunk = try decoder.decode(APIChatCompletionChunk.self, from: Data(payload.utf8))
        let choice = chunk.choices.first

        if let role = choice?.delta.role, role == "assistant" {
            recorded.append(StreamingEvent(kind: .role, role: role, content: nil, toolCalls: [], finishReason: nil, usage: nil))
        }
        if let text = choice?.delta.content {
            recorded.append(StreamingEvent(kind: .content, role: nil, content: text, toolCalls: [], finishReason: nil, usage: nil))
        }
        if let calls = choice?.delta.tool_calls {
            recorded.append(StreamingEvent(kind: .toolCall, role: nil, content: nil, toolCalls: calls, finishReason: nil, usage: nil))
        }
        if let reason = choice?.finish_reason {
            // Usage is read from the chunk itself, not from the choice.
            recorded.append(StreamingEvent(kind: .final, role: nil, content: nil, toolCalls: [], finishReason: reason, usage: chunk.usage))
        }
    }

    return DetailedStreamingResult(events: recorded, sawDone: reachedDone)
}
|
||||||
|
|
||||||
|
/// Starts a streaming chat completion on a dedicated ephemeral session and
/// drops the connection once the first non-empty content delta has arrived.
///
/// A background task reads the stream. On the first non-empty content delta
/// it flags the observer and then sleeps for 30 seconds, so it stops consuming
/// the stream while the connection is still open. The caller waits up to
/// 10 seconds for the flag, then invalidates the session and cancels the task.
/// Any error the task ends with is discarded.
///
/// - Parameters:
///   - request: The chat-completion request to send.
///   - port: Port of the server on 127.0.0.1.
/// - Throws: Request-encoding errors, or an interrupted polling sleep.
private func cancelStreamingChatCompletionAfterFirstContent(_ request: APIChatCompletionRequest, port: UInt16) async throws {
    var post = URLRequest(url: URL(string: "http://127.0.0.1:\(port)/v1/chat/completions")!)
    post.httpMethod = "POST"
    post.setValue("application/json", forHTTPHeaderField: "Content-Type")
    post.httpBody = try JSONEncoder().encode(request)

    let observer = StreamCancellationObserver()
    let session = URLSession(configuration: .ephemeral)

    let reader = Task {
        let (stream, rawResponse) = try await session.bytes(for: post)
        let http = try XCTUnwrap(rawResponse as? HTTPURLResponse)
        XCTAssertEqual(http.statusCode, 200)

        let decoder = JSONDecoder()
        let dataPrefix = "data: "

        for try await line in stream.lines {
            guard line.hasPrefix(dataPrefix) else { continue }

            let payload = String(line.dropFirst(dataPrefix.count))
            if payload == "[DONE]" {
                break
            }

            let chunk = try decoder.decode(APIChatCompletionChunk.self, from: Data(payload.utf8))
            guard let text = chunk.choices.first?.delta.content, !text.isEmpty else { continue }

            await observer.markFirstContentSeen()
            // Stop reading with the connection still open until the caller cancels us.
            try await Task.sleep(nanoseconds: 30_000_000_000)
        }
    }

    try await waitUntil(timeoutSeconds: 10) {
        await observer.hasSeenFirstContent
    }

    session.invalidateAndCancel()
    reader.cancel()
    // The reader is expected to finish with an error here, so the result is ignored.
    _ = try? await reader.value
}
|
||||||
|
|
||||||
|
/// Polls `condition` until it returns `true` or the timeout elapses.
///
/// The condition is evaluated before each sleep and once more after the
/// deadline has passed. The last check matters because the final sleep can
/// carry the loop past the deadline; without it, a condition that became true
/// during that sleep would be reported as a timeout. It also means the
/// condition is evaluated at least once when `timeoutSeconds` is zero or
/// negative.
///
/// On timeout a test failure is recorded with `XCTFail`; the function then
/// returns normally and does not throw.
///
/// - Parameters:
///   - timeoutSeconds: Maximum time to keep polling, measured with `Date`.
///   - intervalNanoseconds: Pause between evaluations (default 100 ms).
///   - condition: Async predicate to evaluate.
/// - Throws: Only if `Task.sleep` throws, for example on task cancellation.
private func waitUntil(
    timeoutSeconds: TimeInterval,
    intervalNanoseconds: UInt64 = 100_000_000,
    condition: @escaping () async -> Bool
) async throws {
    let deadline = Date().addingTimeInterval(timeoutSeconds)

    while Date() < deadline {
        if await condition() {
            return
        }
        try await Task.sleep(nanoseconds: intervalNanoseconds)
    }

    // Last chance: the condition may have become true during the final sleep.
    if await condition() {
        return
    }

    XCTFail("Condition not met before timeout")
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Actor-isolated one-way flag recording that the first content delta of a
/// stream has been seen.
private actor StreamCancellationObserver {
    /// `true` once `markFirstContentSeen()` has been called; never reset.
    private(set) var hasSeenFirstContent = false

    /// Sets the flag.
    func markFirstContentSeen() {
        hasSeenFirstContent = true
    }
}
|
||||||
|
|
||||||
|
/// Event-level record of one streamed chat completion.
private struct DetailedStreamingResult {
    /// Events in the order they were recorded while reading the stream.
    let events: [StreamingEvent]
    /// Whether the `[DONE]` payload was observed.
    let sawDone: Bool
}
|
||||||
|
|
||||||
|
/// One event recorded from a streamed chat-completion chunk. Which optional
/// fields are populated depends on `kind`.
private struct StreamingEvent {
    /// What the event represents.
    enum Kind {
        case role, content, toolCall, final
    }

    /// Discriminator for the fields below.
    let kind: Kind
    /// Role string; set for `.role` events.
    let role: String?
    /// Content delta text; set for `.content` events.
    let content: String?
    /// Tool-call deltas; non-empty only for `.toolCall` events.
    let toolCalls: [APIToolCall]
    /// Finish reason; set for `.final` events.
    let finishReason: String?
    /// Usage taken from the chunk that carried the finish reason, if it had any.
    let usage: APIUsageInfo?
}
|
||||||
|
|
||||||
|
/// Aggregated summary of one streamed chat completion.
private struct StreamingResult {
    /// Number of role events recorded.
    let roleDeltaCount: Int
    /// All content deltas concatenated in arrival order.
    let content: String
    /// Tool calls from every tool-call event, flattened.
    let toolCalls: [APIToolCall]
    /// Finish reason of the last final event, or `nil` if there was none.
    let finalFinishReason: String?
    /// Usage of the last final event, or `nil` if absent.
    let usage: APIUsageInfo?
    /// Whether the `[DONE]` payload was observed.
    let sawDone: Bool
}
|
||||||
|
|
||||||
|
/// Bundles the server, model manager, and port that one end-to-end test runs against.
private struct TestHarness {
    /// The API server under test.
    let server: APIServer
    /// The model manager the server was started with.
    let modelManager: ModelManager
    /// Port the server was asked to listen on.
    let port: UInt16

    /// Tears the fixture down.
    ///
    /// Server shutdown and model unloading are scheduled on the main actor in
    /// an unstructured task that is not awaited, so they may still be pending
    /// when this method returns. The prefix cache is reset synchronously.
    func stop() {
        Task { @MainActor [server, modelManager] in
            server.stop()
            modelManager.unloadModel()
        }
        TokenPrefixCache.shared.reset()
    }
}
|
||||||
@@ -2572,14 +2572,18 @@ Validation note: `PromptBuilder.swift` is now covered by both shaping-parity uni
|
|||||||
|
|
||||||
### Phase 3: Integration
|
### Phase 3: Integration
|
||||||
|
|
||||||
7. **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
|
7. [x] **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
|
||||||
8. **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested.
|
8. **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested.
|
||||||
|
|
||||||
|
Validation note: `APIServer.swift` now routes the API path through `PromptBuilder`, `InferenceEngine`, `TokenPrefixCache`, and `StreamingSSEEncoder`, and the full repository test workflow is green. Image-bearing requests intentionally bypass prefix-cache reuse for now until image fingerprinting is implemented.
|
||||||
|
|
||||||
### Phase 4: Statistics & Monitoring
|
### Phase 4: Statistics & Monitoring
|
||||||
|
|
||||||
9. **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer.
|
9. **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer.
|
||||||
10. **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
|
10. [x] **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
|
||||||
11. **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
|
11. [x] **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
|
||||||
|
|
||||||
|
Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly and `MonitorView.swift` has been rebuilt around current system state and prefix-cache visibility rather than session-era charts. The dashboard now exposes cache match quality from matched-vs-rebuilt prompt token counters, but it still does not expose TTFT, cache match depth, or vision timing because those `LiveCounters` signals have not been implemented yet.
|
||||||
|
|
||||||
### Phase 5: Advanced Cache Matching
|
### Phase 5: Advanced Cache Matching
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user