feat: implement more of phase 4 (TTFT, prefill speed, cache match depth, vision prepare time, and disconnect metrics)

This commit is contained in:
2026-03-20 10:40:51 +01:00
parent aadcc308a5
commit ee34fd5e84
7 changed files with 341 additions and 54 deletions

View File

@@ -28,6 +28,7 @@
5C1E8FE1C521914CEF98D3AA /* ChatMessagesView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */; }; 5C1E8FE1C521914CEF98D3AA /* ChatMessagesView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */; };
621B7E4382199AC1378F5F9C /* StatusBarView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B0EAB35D7130D56B9E7484BA /* StatusBarView.swift */; }; 621B7E4382199AC1378F5F9C /* StatusBarView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B0EAB35D7130D56B9E7484BA /* StatusBarView.swift */; };
67262C5E24739F1FE0011439 /* StreamingSSEEncoder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 615F8A7C9ABCADEB215D31BD /* StreamingSSEEncoder.swift */; }; 67262C5E24739F1FE0011439 /* StreamingSSEEncoder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 615F8A7C9ABCADEB215D31BD /* StreamingSSEEncoder.swift */; };
67B815DC3304BF4B2E9974A8 /* LiveCountersTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7E7DF9F68C10C718844B7B01 /* LiveCountersTests.swift */; };
6828CCA8B78AB40906F87CAB /* LocalModelResolver.swift in Sources */ = {isa = PBXBuildFile; fileRef = D733A0D1D4AC25DDDA6C8684 /* LocalModelResolver.swift */; }; 6828CCA8B78AB40906F87CAB /* LocalModelResolver.swift in Sources */ = {isa = PBXBuildFile; fileRef = D733A0D1D4AC25DDDA6C8684 /* LocalModelResolver.swift */; };
741692862DB1F13EA0B2D14D /* TokenPrefixCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1962D530BEABCC7F1E8E0ED1 /* TokenPrefixCache.swift */; }; 741692862DB1F13EA0B2D14D /* TokenPrefixCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1962D530BEABCC7F1E8E0ED1 /* TokenPrefixCache.swift */; };
7CD765C1E2F9F4D7504C8D09 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B629DA084A9A40E54F8EA5FA /* Assets.xcassets */; }; 7CD765C1E2F9F4D7504C8D09 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B629DA084A9A40E54F8EA5FA /* Assets.xcassets */; };
@@ -95,6 +96,7 @@
6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatDocumentPackage.swift; sourceTree = "<group>"; }; 6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatDocumentPackage.swift; sourceTree = "<group>"; };
6EE59189918D06B8D2F588FC /* MLXServer.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MLXServer.app; sourceTree = BUILT_PRODUCTS_DIR; }; 6EE59189918D06B8D2F588FC /* MLXServer.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MLXServer.app; sourceTree = BUILT_PRODUCTS_DIR; };
7C1A89C076E717F87A60397D /* ImageDecoder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoder.swift; sourceTree = "<group>"; }; 7C1A89C076E717F87A60397D /* ImageDecoder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoder.swift; sourceTree = "<group>"; };
7E7DF9F68C10C718844B7B01 /* LiveCountersTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LiveCountersTests.swift; sourceTree = "<group>"; };
922CBDC9206737BD04AF2874 /* ModelManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelManager.swift; sourceTree = "<group>"; }; 922CBDC9206737BD04AF2874 /* ModelManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelManager.swift; sourceTree = "<group>"; };
944C699FBB76C734C9DF2F2E /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; 944C699FBB76C734C9DF2F2E /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
A4B359324B5FD8D106C74338 /* ChatMessage.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessage.swift; sourceTree = "<group>"; }; A4B359324B5FD8D106C74338 /* ChatMessage.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessage.swift; sourceTree = "<group>"; };
@@ -175,6 +177,7 @@
E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */, E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */,
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */, FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */,
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */, E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */,
7E7DF9F68C10C718844B7B01 /* LiveCountersTests.swift */,
D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */, D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */,
5F9426FA5A4AC55F8D9C080E /* PromptBuilderTests.swift */, 5F9426FA5A4AC55F8D9C080E /* PromptBuilderTests.swift */,
49C383DD5224F3420EB98DB2 /* StreamingSSEEncoderTests.swift */, 49C383DD5224F3420EB98DB2 /* StreamingSSEEncoderTests.swift */,
@@ -382,6 +385,7 @@
CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */, CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */,
962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */, 962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */,
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */, E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */,
67B815DC3304BF4B2E9974A8 /* LiveCountersTests.swift in Sources */,
8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */, 8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */,
1FE8C624898960ECCE39C0D4 /* PromptBuilderTests.swift in Sources */, 1FE8C624898960ECCE39C0D4 /* PromptBuilderTests.swift in Sources */,
FE4405F66873C75CD6FA19A5 /* StreamingSSEEncoderTests.swift in Sources */, FE4405F66873C75CD6FA19A5 /* StreamingSSEEncoderTests.swift in Sources */,

View File

@@ -20,12 +20,16 @@ final class LiveCounters: @unchecked Sendable {
private var _promptTokens: Int = 0 private var _promptTokens: Int = 0
private var _generationTokens: Int = 0 private var _generationTokens: Int = 0
private var _tokensPerSecond: Double = 0 private var _tokensPerSecond: Double = 0
private var _prefillTokensPerSecond: Double = 0
private var _timeToFirstToken: TimeInterval = 0
private var _isPrefilling: Bool = false private var _isPrefilling: Bool = false
private var _isGenerating: Bool = false private var _isGenerating: Bool = false
private var _contextMax: Int = 0 private var _contextMax: Int = 0
private var _currentPhaseElapsed: TimeInterval = 0 private var _currentPhaseElapsed: TimeInterval = 0
private var _currentCacheMatchedPromptTokens: Int = 0 private var _currentCacheMatchedPromptTokens: Int = 0
private var _currentCacheRebuiltPromptTokens: Int = 0 private var _currentCacheRebuiltPromptTokens: Int = 0
private var _cacheMatchDepth: Int = 0
private var _visionEncoderTime: TimeInterval = 0
// Cumulative // Cumulative
private var _totalRequests: Int = 0 private var _totalRequests: Int = 0
@@ -37,6 +41,8 @@ final class LiveCounters: @unchecked Sendable {
private var _totalSessionBuildDuration: TimeInterval = 0 private var _totalSessionBuildDuration: TimeInterval = 0
private var _totalPrefillDuration: TimeInterval = 0 private var _totalPrefillDuration: TimeInterval = 0
private var _totalGenerationDuration: TimeInterval = 0 private var _totalGenerationDuration: TimeInterval = 0
private var _totalVisionEncoderDuration: TimeInterval = 0
private var _totalDisconnects: Int = 0
func requestStarted(requestId: String, contextLength: Int) { func requestStarted(requestId: String, contextLength: Int) {
let now = Date() let now = Date()
@@ -49,8 +55,16 @@ final class LiveCounters: @unchecked Sendable {
_promptTokens = 0 _promptTokens = 0
_generationTokens = 0 _generationTokens = 0
_tokensPerSecond = 0 _tokensPerSecond = 0
_prefillTokensPerSecond = 0
_timeToFirstToken = 0
_contextMax = contextLength _contextMax = contextLength
requestPhases[requestId] = RequestState(phase: .preparing, phaseStartedAt: now) _cacheMatchDepth = 0
_visionEncoderTime = 0
requestPhases[requestId] = RequestState(
phase: .preparing,
phaseStartedAt: now,
requestStartedAt: now
)
refreshCurrentPhaseElapsed(now: now) refreshCurrentPhaseElapsed(now: now)
lock.unlock() lock.unlock()
} }
@@ -61,9 +75,24 @@ final class LiveCounters: @unchecked Sendable {
if let current = requestPhases[requestId] { if let current = requestPhases[requestId] {
decrementCount(for: current.phase) decrementCount(for: current.phase)
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt)) accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
requestPhases[requestId] = RequestState(
phase: phase,
phaseStartedAt: now,
requestStartedAt: current.requestStartedAt,
matchedPromptTokens: current.matchedPromptTokens,
rebuiltPromptTokens: current.rebuiltPromptTokens,
hasRecordedFirstToken: current.hasRecordedFirstToken,
disconnectRecorded: current.disconnectRecorded,
visionEncoderTime: current.visionEncoderTime
)
} else {
requestPhases[requestId] = RequestState(
phase: phase,
phaseStartedAt: now,
requestStartedAt: now
)
} }
incrementCount(for: phase) incrementCount(for: phase)
requestPhases[requestId] = RequestState(phase: phase, phaseStartedAt: now)
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0 _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
_isGenerating = _generatingRequests > 0 _isGenerating = _generatingRequests > 0
refreshCurrentPhaseElapsed(now: now) refreshCurrentPhaseElapsed(now: now)
@@ -74,11 +103,19 @@ final class LiveCounters: @unchecked Sendable {
let now = Date() let now = Date()
lock.lock() lock.lock()
if let current = requestPhases[requestId] { if let current = requestPhases[requestId] {
let prefillElapsed = max(now.timeIntervalSince(current.phaseStartedAt), 0)
_prefillTokensPerSecond = prefillElapsed > 0
? Double(promptTokens) / prefillElapsed
: 0
decrementCount(for: current.phase) decrementCount(for: current.phase)
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt)) accumulateDuration(for: current.phase, elapsed: prefillElapsed)
} }
incrementCount(for: .generating) incrementCount(for: .generating)
requestPhases[requestId] = RequestState(phase: .generating, phaseStartedAt: now) if var state = requestPhases[requestId] {
state.phase = .generating
state.phaseStartedAt = now
requestPhases[requestId] = state
}
_promptTokens = promptTokens _promptTokens = promptTokens
_totalPromptTokens += promptTokens _totalPromptTokens += promptTokens
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0 _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
@@ -87,6 +124,20 @@ final class LiveCounters: @unchecked Sendable {
lock.unlock() lock.unlock()
} }
/// Records the time-to-first-token for `requestId`, at most once per request.
///
/// The latency is measured from the request's `requestStartedAt` to the moment
/// this method is entered, clamped so it is never negative. If the request is
/// not tracked, or a first token was already recorded for it, nothing changes.
func firstTokenGenerated(requestId: String) {
    // Capture the timestamp before taking the lock so lock wait time is not counted.
    let firstTokenAt = Date()
    lock.lock()
    defer { lock.unlock() }

    guard var tracked = requestPhases[requestId] else { return }
    if tracked.hasRecordedFirstToken { return }

    tracked.hasRecordedFirstToken = true
    requestPhases[requestId] = tracked
    _timeToFirstToken = max(firstTokenAt.timeIntervalSince(tracked.requestStartedAt), 0)
}
func tokenGenerated(tokensPerSecond: Double, totalGenerated: Int) { func tokenGenerated(tokensPerSecond: Double, totalGenerated: Int) {
lock.lock() lock.lock()
_generationTokens = totalGenerated _generationTokens = totalGenerated
@@ -106,6 +157,7 @@ final class LiveCounters: @unchecked Sendable {
_totalCacheReusePromptTokens += matched _totalCacheReusePromptTokens += matched
_totalCacheRebuildPromptTokens += rebuilt _totalCacheRebuildPromptTokens += rebuilt
_cacheMatchDepth = matched
state.matchedPromptTokens = matched state.matchedPromptTokens = matched
state.rebuiltPromptTokens = rebuilt state.rebuiltPromptTokens = rebuilt
@@ -114,6 +166,34 @@ final class LiveCounters: @unchecked Sendable {
lock.unlock() lock.unlock()
} }
/// Stores the vision-processing duration reported for `requestId`.
///
/// Negative durations are clamped to zero. The value becomes the current
/// `_visionEncoderTime`, is added to `_totalVisionEncoderDuration`, and is kept
/// on the request's own state. Untracked request ids are ignored.
func visionProcessingCompleted(requestId: String, duration: TimeInterval) {
    let nonNegativeDuration = max(duration, 0)
    lock.lock()
    defer { lock.unlock() }

    guard var tracked = requestPhases[requestId] else { return }

    tracked.visionEncoderTime = nonNegativeDuration
    requestPhases[requestId] = tracked
    _visionEncoderTime = nonNegativeDuration
    _totalVisionEncoderDuration += nonNegativeDuration
}
/// Counts a client disconnect for `requestId`, at most once per request.
///
/// The per-request `disconnectRecorded` flag keeps repeated notifications for
/// the same request from inflating `_totalDisconnects`. Untracked request ids
/// are ignored.
func disconnectDetected(requestId: String) {
    lock.lock()
    defer { lock.unlock() }

    guard var tracked = requestPhases[requestId] else { return }
    if tracked.disconnectRecorded { return }

    tracked.disconnectRecorded = true
    requestPhases[requestId] = tracked
    _totalDisconnects += 1
}
func requestCompleted(requestId: String, generationTokens: Int) { func requestCompleted(requestId: String, generationTokens: Int) {
let now = Date() let now = Date()
lock.lock() lock.lock()
@@ -147,12 +227,16 @@ final class LiveCounters: @unchecked Sendable {
_promptTokens = 0 _promptTokens = 0
_generationTokens = 0 _generationTokens = 0
_tokensPerSecond = 0 _tokensPerSecond = 0
_prefillTokensPerSecond = 0
_timeToFirstToken = 0
_isPrefilling = false _isPrefilling = false
_isGenerating = false _isGenerating = false
_contextMax = 0 _contextMax = 0
_currentPhaseElapsed = 0 _currentPhaseElapsed = 0
_currentCacheMatchedPromptTokens = 0 _currentCacheMatchedPromptTokens = 0
_currentCacheRebuiltPromptTokens = 0 _currentCacheRebuiltPromptTokens = 0
_cacheMatchDepth = 0
_visionEncoderTime = 0
_totalRequests = 0 _totalRequests = 0
_totalPromptTokens = 0 _totalPromptTokens = 0
_totalGenerationTokens = 0 _totalGenerationTokens = 0
@@ -162,6 +246,8 @@ final class LiveCounters: @unchecked Sendable {
_totalSessionBuildDuration = 0 _totalSessionBuildDuration = 0
_totalPrefillDuration = 0 _totalPrefillDuration = 0
_totalGenerationDuration = 0 _totalGenerationDuration = 0
_totalVisionEncoderDuration = 0
_totalDisconnects = 0
lock.unlock() lock.unlock()
} }
@@ -179,12 +265,16 @@ final class LiveCounters: @unchecked Sendable {
promptTokens: _promptTokens, promptTokens: _promptTokens,
generationTokens: _generationTokens, generationTokens: _generationTokens,
tokensPerSecond: _tokensPerSecond, tokensPerSecond: _tokensPerSecond,
prefillTokensPerSecond: _prefillTokensPerSecond,
timeToFirstToken: _timeToFirstToken,
isPrefilling: _isPrefilling, isPrefilling: _isPrefilling,
isGenerating: _isGenerating, isGenerating: _isGenerating,
contextMax: _contextMax, contextMax: _contextMax,
currentPhaseElapsed: _currentPhaseElapsed, currentPhaseElapsed: _currentPhaseElapsed,
currentCacheMatchedPromptTokens: _currentCacheMatchedPromptTokens, currentCacheMatchedPromptTokens: _currentCacheMatchedPromptTokens,
currentCacheRebuiltPromptTokens: _currentCacheRebuiltPromptTokens, currentCacheRebuiltPromptTokens: _currentCacheRebuiltPromptTokens,
cacheMatchDepth: _cacheMatchDepth,
visionEncoderTime: _visionEncoderTime,
totalRequests: _totalRequests, totalRequests: _totalRequests,
totalPromptTokens: _totalPromptTokens, totalPromptTokens: _totalPromptTokens,
totalGenerationTokens: _totalGenerationTokens, totalGenerationTokens: _totalGenerationTokens,
@@ -193,7 +283,9 @@ final class LiveCounters: @unchecked Sendable {
totalPreparingDuration: _totalPreparingDuration, totalPreparingDuration: _totalPreparingDuration,
totalSessionBuildDuration: _totalSessionBuildDuration, totalSessionBuildDuration: _totalSessionBuildDuration,
totalPrefillDuration: _totalPrefillDuration, totalPrefillDuration: _totalPrefillDuration,
totalGenerationDuration: _totalGenerationDuration totalGenerationDuration: _totalGenerationDuration,
totalVisionEncoderDuration: _totalVisionEncoderDuration,
totalDisconnects: _totalDisconnects
) )
lock.unlock() lock.unlock()
return s return s
@@ -208,12 +300,16 @@ final class LiveCounters: @unchecked Sendable {
let promptTokens: Int let promptTokens: Int
let generationTokens: Int let generationTokens: Int
let tokensPerSecond: Double let tokensPerSecond: Double
let prefillTokensPerSecond: Double
let timeToFirstToken: TimeInterval
let isPrefilling: Bool let isPrefilling: Bool
let isGenerating: Bool let isGenerating: Bool
let contextMax: Int let contextMax: Int
let currentPhaseElapsed: TimeInterval let currentPhaseElapsed: TimeInterval
let currentCacheMatchedPromptTokens: Int let currentCacheMatchedPromptTokens: Int
let currentCacheRebuiltPromptTokens: Int let currentCacheRebuiltPromptTokens: Int
let cacheMatchDepth: Int
let visionEncoderTime: TimeInterval
let totalRequests: Int let totalRequests: Int
let totalPromptTokens: Int let totalPromptTokens: Int
let totalGenerationTokens: Int let totalGenerationTokens: Int
@@ -223,6 +319,8 @@ final class LiveCounters: @unchecked Sendable {
let totalSessionBuildDuration: TimeInterval let totalSessionBuildDuration: TimeInterval
let totalPrefillDuration: TimeInterval let totalPrefillDuration: TimeInterval
let totalGenerationDuration: TimeInterval let totalGenerationDuration: TimeInterval
let totalVisionEncoderDuration: TimeInterval
let totalDisconnects: Int
} }
private func incrementCount(for phase: RequestPhase) { private func incrementCount(for phase: RequestPhase) {
@@ -276,8 +374,12 @@ final class LiveCounters: @unchecked Sendable {
// Per-request bookkeeping entry stored in `requestPhases`, keyed by request id.
private struct RequestState {   private struct RequestState {
// Phase the request is currently counted under.
var phase: RequestPhase 	var phase: RequestPhase
// When the current phase began; used to compute the elapsed time of that phase.
var phaseStartedAt: Date 	var phaseStartedAt: Date
// When the request was first registered; time-to-first-token is measured from here.
var requestStartedAt: Date
// Prompt tokens reported as matched / rebuilt for this request by the cache-stats update.
var matchedPromptTokens: Int = 0 	var matchedPromptTokens: Int = 0
var rebuiltPromptTokens: Int = 0 	var rebuiltPromptTokens: Int = 0
// Set once by firstTokenGenerated so the first-token latency is recorded only once.
var hasRecordedFirstToken: Bool = false
// Set once by disconnectDetected so a request adds at most one to the disconnect total.
var disconnectRecorded: Bool = false
// Clamped duration last stored by visionProcessingCompleted for this request.
var visionEncoderTime: TimeInterval = 0
} 	}
enum RequestPhase { enum RequestPhase {
@@ -305,11 +407,15 @@ final class InferenceStats {
var isGenerating: Bool = false var isGenerating: Bool = false
var isPrefilling: Bool = false var isPrefilling: Bool = false
var currentTokensPerSecond: Double = 0 var currentTokensPerSecond: Double = 0
var prefillTokensPerSecond: Double = 0
var timeToFirstToken: TimeInterval = 0
var contextUsed: Int = 0 var contextUsed: Int = 0
var contextMax: Int = 0 var contextMax: Int = 0
var currentPhaseElapsed: TimeInterval = 0 var currentPhaseElapsed: TimeInterval = 0
var currentCacheMatchedPromptTokens: Int = 0 var currentCacheMatchedPromptTokens: Int = 0
var currentCacheRebuiltPromptTokens: Int = 0 var currentCacheRebuiltPromptTokens: Int = 0
var cacheMatchDepth: Int = 0
var visionEncoderTime: TimeInterval = 0
// MARK: - Cumulative counters // MARK: - Cumulative counters
@@ -326,6 +432,8 @@ final class InferenceStats {
var totalSessionBuildDuration: TimeInterval = 0 var totalSessionBuildDuration: TimeInterval = 0
var totalPrefillDuration: TimeInterval = 0 var totalPrefillDuration: TimeInterval = 0
var totalGenerationDuration: TimeInterval = 0 var totalGenerationDuration: TimeInterval = 0
var totalVisionEncoderDuration: TimeInterval = 0
var totalDisconnects: Int = 0
// MARK: - Cache state // MARK: - Cache state
@@ -356,6 +464,10 @@ final class InferenceStats {
private(set) var cacheReusePromptHistory: [DataPoint] = [] private(set) var cacheReusePromptHistory: [DataPoint] = []
private(set) var cacheRebuildPromptHistory: [DataPoint] = [] private(set) var cacheRebuildPromptHistory: [DataPoint] = []
private(set) var cacheMatchQualityHistory: [DataPoint] = [] private(set) var cacheMatchQualityHistory: [DataPoint] = []
private(set) var ttftHistory: [DataPoint] = []
private(set) var prefillSpeedHistory: [DataPoint] = []
private(set) var cacheMatchDepthHistory: [DataPoint] = []
private(set) var visionTimeHistory: [DataPoint] = []
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
@@ -394,6 +506,8 @@ final class InferenceStats {
currentPromptTokens = snap.promptTokens currentPromptTokens = snap.promptTokens
currentGenerationTokens = snap.generationTokens currentGenerationTokens = snap.generationTokens
currentTokensPerSecond = snap.tokensPerSecond currentTokensPerSecond = snap.tokensPerSecond
prefillTokensPerSecond = snap.prefillTokensPerSecond
timeToFirstToken = snap.timeToFirstToken
isPrefilling = snap.isPrefilling isPrefilling = snap.isPrefilling
isGenerating = snap.isGenerating isGenerating = snap.isGenerating
contextMax = snap.contextMax contextMax = snap.contextMax
@@ -401,6 +515,8 @@ final class InferenceStats {
currentPhaseElapsed = snap.currentPhaseElapsed currentPhaseElapsed = snap.currentPhaseElapsed
currentCacheMatchedPromptTokens = snap.currentCacheMatchedPromptTokens currentCacheMatchedPromptTokens = snap.currentCacheMatchedPromptTokens
currentCacheRebuiltPromptTokens = snap.currentCacheRebuiltPromptTokens currentCacheRebuiltPromptTokens = snap.currentCacheRebuiltPromptTokens
cacheMatchDepth = snap.cacheMatchDepth
visionEncoderTime = snap.visionEncoderTime
totalRequests = snap.totalRequests totalRequests = snap.totalRequests
totalPromptTokens = snap.totalPromptTokens totalPromptTokens = snap.totalPromptTokens
totalGenerationTokens = snap.totalGenerationTokens totalGenerationTokens = snap.totalGenerationTokens
@@ -410,6 +526,8 @@ final class InferenceStats {
totalSessionBuildDuration = snap.totalSessionBuildDuration totalSessionBuildDuration = snap.totalSessionBuildDuration
totalPrefillDuration = snap.totalPrefillDuration totalPrefillDuration = snap.totalPrefillDuration
totalGenerationDuration = snap.totalGenerationDuration totalGenerationDuration = snap.totalGenerationDuration
totalVisionEncoderDuration = snap.totalVisionEncoderDuration
totalDisconnects = snap.totalDisconnects
totalCacheHits = cache.totalHits totalCacheHits = cache.totalHits
totalCacheMisses = cache.totalMisses totalCacheMisses = cache.totalMisses
totalCacheEvictions = cache.totalEvictions totalCacheEvictions = cache.totalEvictions
@@ -448,6 +566,10 @@ final class InferenceStats {
cacheReusePromptHistory.append(DataPoint(timestamp: now, value: Double(cacheReusePromptDelta))) cacheReusePromptHistory.append(DataPoint(timestamp: now, value: Double(cacheReusePromptDelta)))
cacheRebuildPromptHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildPromptDelta))) cacheRebuildPromptHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildPromptDelta)))
cacheMatchQualityHistory.append(DataPoint(timestamp: now, value: cacheMatchQualityDelta)) cacheMatchQualityHistory.append(DataPoint(timestamp: now, value: cacheMatchQualityDelta))
ttftHistory.append(DataPoint(timestamp: now, value: snap.timeToFirstToken * 1_000))
prefillSpeedHistory.append(DataPoint(timestamp: now, value: snap.prefillTokensPerSecond))
cacheMatchDepthHistory.append(DataPoint(timestamp: now, value: Double(snap.cacheMatchDepth)))
visionTimeHistory.append(DataPoint(timestamp: now, value: snap.visionEncoderTime * 1_000))
if tokenRateHistory.count > Self.maxHistoryPoints { if tokenRateHistory.count > Self.maxHistoryPoints {
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints) tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -485,6 +607,18 @@ final class InferenceStats {
if cacheMatchQualityHistory.count > Self.maxHistoryPoints { if cacheMatchQualityHistory.count > Self.maxHistoryPoints {
cacheMatchQualityHistory.removeFirst(cacheMatchQualityHistory.count - Self.maxHistoryPoints) cacheMatchQualityHistory.removeFirst(cacheMatchQualityHistory.count - Self.maxHistoryPoints)
} }
if ttftHistory.count > Self.maxHistoryPoints {
ttftHistory.removeFirst(ttftHistory.count - Self.maxHistoryPoints)
}
if prefillSpeedHistory.count > Self.maxHistoryPoints {
prefillSpeedHistory.removeFirst(prefillSpeedHistory.count - Self.maxHistoryPoints)
}
if cacheMatchDepthHistory.count > Self.maxHistoryPoints {
cacheMatchDepthHistory.removeFirst(cacheMatchDepthHistory.count - Self.maxHistoryPoints)
}
if visionTimeHistory.count > Self.maxHistoryPoints {
visionTimeHistory.removeFirst(visionTimeHistory.count - Self.maxHistoryPoints)
}
} }
func reset() { func reset() {
@@ -500,11 +634,15 @@ final class InferenceStats {
isGenerating = false isGenerating = false
isPrefilling = false isPrefilling = false
currentTokensPerSecond = 0 currentTokensPerSecond = 0
prefillTokensPerSecond = 0
timeToFirstToken = 0
contextUsed = 0 contextUsed = 0
contextMax = 0 contextMax = 0
currentPhaseElapsed = 0 currentPhaseElapsed = 0
currentCacheMatchedPromptTokens = 0 currentCacheMatchedPromptTokens = 0
currentCacheRebuiltPromptTokens = 0 currentCacheRebuiltPromptTokens = 0
cacheMatchDepth = 0
visionEncoderTime = 0
totalRequests = 0 totalRequests = 0
totalPromptTokens = 0 totalPromptTokens = 0
totalGenerationTokens = 0 totalGenerationTokens = 0
@@ -514,6 +652,8 @@ final class InferenceStats {
totalSessionBuildDuration = 0 totalSessionBuildDuration = 0
totalPrefillDuration = 0 totalPrefillDuration = 0
totalGenerationDuration = 0 totalGenerationDuration = 0
totalVisionEncoderDuration = 0
totalDisconnects = 0
totalCacheHits = 0 totalCacheHits = 0
totalCacheMisses = 0 totalCacheMisses = 0
totalCacheEvictions = 0 totalCacheEvictions = 0
@@ -536,6 +676,10 @@ final class InferenceStats {
cacheReusePromptHistory.removeAll() cacheReusePromptHistory.removeAll()
cacheRebuildPromptHistory.removeAll() cacheRebuildPromptHistory.removeAll()
cacheMatchQualityHistory.removeAll() cacheMatchQualityHistory.removeAll()
ttftHistory.removeAll()
prefillSpeedHistory.removeAll()
cacheMatchDepthHistory.removeAll()
visionTimeHistory.removeAll()
lastGenerationTokenCount = 0 lastGenerationTokenCount = 0
lastPromptTokenCount = 0 lastPromptTokenCount = 0
lastPrefillDuration = 0 lastPrefillDuration = 0

View File

@@ -264,7 +264,14 @@ final class APIServer {
let engine = InferenceEngine(container: container) let engine = InferenceEngine(container: container)
let preparedInference: InferenceEngine.PreparedInference let preparedInference: InferenceEngine.PreparedInference
do { do {
let prepareStartedAt = Date()
preparedInference = try await engine.prepare(preparedPrompt.userInput) preparedInference = try await engine.prepare(preparedPrompt.userInput)
if preparedPrompt.containsImages {
LiveCounters.shared.visionProcessingCompleted(
requestId: requestId,
duration: Date().timeIntervalSince(prepareStartedAt)
)
}
} catch { } catch {
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0) LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#) sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
@@ -428,6 +435,7 @@ final class APIServer {
connection.stateUpdateHandler = { state in connection.stateUpdateHandler = { state in
switch state { switch state {
case .cancelled, .failed: case .cancelled, .failed:
LiveCounters.shared.disconnectDetected(requestId: requestId)
cancellation.cancel() cancellation.cancel()
default: default:
break break
@@ -538,6 +546,7 @@ final class APIServer {
var fullText = "" var fullText = ""
var frameworkToolCalls: [MLXLMCommon.ToolCall] = [] var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
var cancelled = false var cancelled = false
var sawFirstChunk = false
for await generation in stream { for await generation in stream {
if let cancellation, cancellation.isCancelled { if let cancellation, cancellation.isCancelled {
@@ -547,6 +556,10 @@ final class APIServer {
switch generation { switch generation {
case .chunk(let text): case .chunk(let text):
if !sawFirstChunk {
sawFirstChunk = true
LiveCounters.shared.firstTokenGenerated(requestId: requestId)
}
completionTokens += 1 completionTokens += 1
fullText += text fullText += text
LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens) LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)

View File

@@ -43,13 +43,27 @@ struct MonitorView: View {
color: .blue color: .blue
) )
metricCard( metricCard(
title: "Prefill Reuse", title: "Cache Match",
value: formatTokenCount(stats.totalCacheReusePromptTokens), value: formatTokenCount(stats.cacheMatchDepth),
detail: stats.currentCacheMatchedPromptTokens > 0 detail: stats.currentCacheMatchedPromptTokens > 0
? String(format: "%.0f%% match now", stats.currentCacheMatchQualityPercent) ? String(format: "%.0f%% match now", stats.currentCacheMatchQualityPercent)
: String(format: "%.0f%% total quality", stats.totalCacheMatchQualityPercent), : String(format: "%.0f%% total quality", stats.totalCacheMatchQualityPercent),
color: .teal color: .teal
) )
metricCard(
title: "TTFT",
value: formatMilliseconds(stats.timeToFirstToken * 1_000),
detail: stats.isGenerating ? "time to first token" : "last completed request",
color: .cyan
)
metricCard(
title: "Prefill Speed",
value: stats.prefillTokensPerSecond > 0
? String(format: "%.1f tok/s", stats.prefillTokensPerSecond)
: "0 tok/s",
detail: formatTokenCount(stats.currentPromptTokens) + " prompt tokens",
color: .blue
)
metricCard( metricCard(
title: "Context", title: "Context",
value: formatTokenCount(stats.contextUsed), value: formatTokenCount(stats.contextUsed),
@@ -68,13 +82,23 @@ struct MonitorView: View {
detail: "\(stats.currentGenerationTokens) output tokens", detail: "\(stats.currentGenerationTokens) output tokens",
color: .green color: .green
) )
metricCard(
title: "Disconnects",
value: "\(stats.totalDisconnects)",
detail: stats.totalDisconnects == 0 ? "none detected" : "streams cancelled by clients",
color: .red
)
} }
LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) { LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
latencyChart
prefillSpeedChart
throughputChart throughputChart
phaseChart cacheMatchChart
cacheChart
memoryChart memoryChart
if hasVisionSamples {
visionChart
}
} }
cumulativeSection cumulativeSection
@@ -140,48 +164,33 @@ struct MonitorView: View {
} }
} }
private var phaseChart: some View { private var latencyChart: some View {
chartCard(title: "Phase Timing") { chartCard(title: "Time To First Token") {
Chart { Chart {
ForEach(stats.currentPhaseElapsedHistory) { point in ForEach(stats.ttftHistory) { point in
LineMark( LineMark(
x: .value("Time", point.timestamp), x: .value("Time", point.timestamp),
y: .value("Age", point.value) y: .value("TTFT", point.value)
) )
.foregroundStyle(systemStateColor) .foregroundStyle(.cyan)
.interpolationMethod(.monotone) .interpolationMethod(.monotone)
} }
ForEach(stats.prefillDurationHistory) { point in
BarMark(
x: .value("Time", point.timestamp),
y: .value("Prefill", point.value)
)
.foregroundStyle(.blue.opacity(0.45))
}
} }
.chartXAxis { timeAxis } .chartXAxis { timeAxis }
.chartYAxis { leadingValueAxis } .chartYAxis { leadingValueAxis }
.frame(height: 180) .frame(height: 180)
} footer: { } footer: {
legendRow(items: [("Active phase age", systemStateColor), ("Prefill completions", .blue)]) legendRow(items: [("TTFT ms", .cyan)])
} }
} }
private var cacheChart: some View { private var prefillSpeedChart: some View {
chartCard(title: "Cache Match Quality") { chartCard(title: "Prefill Speed") {
Chart { Chart {
ForEach(stats.cacheMatchQualityHistory) { point in ForEach(stats.prefillSpeedHistory) { point in
LineMark( LineMark(
x: .value("Time", point.timestamp), x: .value("Time", point.timestamp),
y: .value("Match Quality", point.value) y: .value("Prefill Speed", point.value)
)
.foregroundStyle(.teal)
.interpolationMethod(.monotone)
}
ForEach(stats.cacheHitRateHistory) { point in
LineMark(
x: .value("Time", point.timestamp),
y: .value("Hit Rate", point.value)
) )
.foregroundStyle(.blue) .foregroundStyle(.blue)
.interpolationMethod(.monotone) .interpolationMethod(.monotone)
@@ -191,7 +200,41 @@ struct MonitorView: View {
.chartYAxis { leadingValueAxis } .chartYAxis { leadingValueAxis }
.frame(height: 180) .frame(height: 180)
} footer: { } footer: {
legendRow(items: [("Match quality %", .teal), ("Hit rate %", .blue)]) legendRow(items: [("Prompt tok/s", .blue)])
}
}
/// Chart card titled "Cache Match Depth" that overlays three series from `stats`:
/// bars for `cacheReusePromptHistory`, bars for `cacheRebuildPromptHistory`, and a
/// line for `cacheMatchDepthHistory`, all plotted against each point's `timestamp`.
/// NOTE(review): the values are presumably prompt-token counts — confirm against InferenceStats.
private var cacheMatchChart: some View {
chartCard(title: "Cache Match Depth") {
Chart {
// Series 1: "Cached" bars in translucent teal.
ForEach(stats.cacheReusePromptHistory) { point in
BarMark(
x: .value("Time", point.timestamp),
y: .value("Cached", point.value)
)
.foregroundStyle(.teal.opacity(0.7))
}
// Series 2: "Prefilled" bars in translucent orange.
// NOTE(review): bars sharing a timestamp with series 1 are presumably stacked by
// Swift Charts rather than overlapped — confirm this is the intended reading.
ForEach(stats.cacheRebuildPromptHistory) { point in
BarMark(
x: .value("Time", point.timestamp),
y: .value("Prefilled", point.value)
)
.foregroundStyle(.orange.opacity(0.65))
}
// Series 3: "Depth" drawn as a monotone-interpolated blue line over the bars.
ForEach(stats.cacheMatchDepthHistory) { point in
LineMark(
x: .value("Time", point.timestamp),
y: .value("Depth", point.value)
)
.foregroundStyle(.blue)
.interpolationMethod(.monotone)
}
}
// Axis builders and the fixed 180-point height are the same ones used by the other chart cards.
.chartXAxis { timeAxis }
.chartYAxis { leadingValueAxis }
.frame(height: 180)
} footer: {
// Legend colors mirror the three series above.
legendRow(items: [("Cached", .teal), ("Prefilled", .orange), ("Matched depth", .blue)])
} }
} }
@@ -230,6 +273,25 @@ struct MonitorView: View {
} }
} }
/// Chart card titled "Vision Prepare Time": one purple bar per entry in
/// `stats.visionTimeHistory`, positioned by the entry's `timestamp` and sized by its `value`.
private var visionChart: some View {
    chartCard(title: "Vision Prepare Time") {
        // Data-driven initializer: yields one mark per history entry.
        Chart(stats.visionTimeHistory) { sample in
            BarMark(
                x: .value("Time", sample.timestamp),
                y: .value("Vision", sample.value)
            )
            .foregroundStyle(.purple.opacity(0.8))
        }
        .chartXAxis { timeAxis }
        .chartYAxis { leadingValueAxis }
        .frame(height: 180)
    } footer: {
        legendRow(items: [("Prepare ms", .purple)])
    }
}
private var cumulativeSection: some View { private var cumulativeSection: some View {
VStack(alignment: .leading, spacing: 10) { VStack(alignment: .leading, spacing: 10) {
Text("Totals") Text("Totals")
@@ -240,11 +302,13 @@ struct MonitorView: View {
compactTile(title: "Prompt Tokens", value: formatTokenCount(stats.totalPromptTokens), color: .blue) compactTile(title: "Prompt Tokens", value: formatTokenCount(stats.totalPromptTokens), color: .blue)
compactTile(title: "Generated Tokens", value: formatTokenCount(stats.totalGenerationTokens), color: .green) compactTile(title: "Generated Tokens", value: formatTokenCount(stats.totalGenerationTokens), color: .green)
compactTile(title: "Cache Evictions", value: "\(stats.totalCacheEvictions)", color: .red) compactTile(title: "Cache Evictions", value: "\(stats.totalCacheEvictions)", color: .red)
compactTile(title: "Reused Prefill", value: formatTokenCount(stats.totalCacheReusePromptTokens), color: .teal) compactTile(title: "Tokens From Cache", value: formatTokenCount(stats.totalCacheReusePromptTokens), color: .teal)
compactTile(title: "Rebuilt Prefill", value: formatTokenCount(stats.totalCacheRebuildPromptTokens), color: .orange) compactTile(title: "Tokens Prefilled", value: formatTokenCount(stats.totalCacheRebuildPromptTokens), color: .orange)
compactTile(title: "Match Quality", value: String(format: "%.0f%%", stats.totalCacheMatchQualityPercent), color: .teal) compactTile(title: "Match Quality", value: String(format: "%.0f%%", stats.totalCacheMatchQualityPercent), color: .teal)
compactTile(title: "Prefill Time", value: String(format: "%.1fs", stats.totalPrefillDuration), color: .blue) compactTile(title: "Prefill Time", value: String(format: "%.1fs", stats.totalPrefillDuration), color: .blue)
compactTile(title: "Generation Time", value: String(format: "%.1fs", stats.totalGenerationDuration), color: .green) compactTile(title: "Generation Time", value: String(format: "%.1fs", stats.totalGenerationDuration), color: .green)
compactTile(title: "Vision Time", value: String(format: "%.1fs", stats.totalVisionEncoderDuration), color: .purple)
compactTile(title: "Disconnects", value: "\(stats.totalDisconnects)", color: .red)
compactTile(title: "Cache Budget", value: formatByteCount(stats.cacheMemoryBudgetBytes), color: .orange) compactTile(title: "Cache Budget", value: formatByteCount(stats.cacheMemoryBudgetBytes), color: .orange)
} }
} }
@@ -451,4 +515,16 @@ struct MonitorView: View {
if minutes < 60 { return "\(minutes)m ago" } if minutes < 60 { return "\(minutes)m ago" }
return "\(minutes / 60)h ago" return "\(minutes / 60)h ago"
} }
/// `true` when at least one entry in `stats.visionTimeHistory` has a value above zero.
private var hasVisionSamples: Bool {
    for sample in stats.visionTimeHistory where sample.value > 0 {
        return true
    }
    return false
}
/// Formats a duration given in milliseconds as a short display string.
///
/// - Parameter value: Duration in milliseconds.
/// - Returns: `"0 ms"` when `value` is not greater than zero (this includes NaN),
///   seconds with two decimals (e.g. `"1.50s"`) once the value rounds to one second
///   or more, and whole milliseconds (e.g. `"250 ms"`) otherwise.
private func formatMilliseconds(_ value: Double) -> String {
    guard value > 0 else { return "0 ms" }
    // Pick the unit from the rounded value: `%.0f` would otherwise print inputs in
    // [999.5, 1000) as "1000 ms" instead of switching to seconds.
    if value.rounded() >= 1_000 {
        return String(format: "%.2fs", value / 1_000)
    }
    return String(format: "%.0f ms", value)
}
} }

View File

@@ -36,6 +36,8 @@ final class APIServerRewriteTests: XCTestCase {
let firstSnapshot = TokenPrefixCache.shared.snapshot() let firstSnapshot = TokenPrefixCache.shared.snapshot()
let firstLiveSnapshot = LiveCounters.shared.snapshot() let firstLiveSnapshot = LiveCounters.shared.snapshot()
XCTAssertGreaterThan(firstSnapshot.totalEntries, 0) XCTAssertGreaterThan(firstSnapshot.totalEntries, 0)
XCTAssertGreaterThan(firstLiveSnapshot.prefillTokensPerSecond, 0)
XCTAssertGreaterThan(firstLiveSnapshot.timeToFirstToken, 0)
_ = try await sendChatCompletion(request, port: harness.port) _ = try await sendChatCompletion(request, port: harness.port)
@@ -46,6 +48,7 @@ final class APIServerRewriteTests: XCTestCase {
let secondLiveSnapshot = LiveCounters.shared.snapshot() let secondLiveSnapshot = LiveCounters.shared.snapshot()
XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits) XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens) XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)
XCTAssertGreaterThan(secondLiveSnapshot.cacheMatchDepth, 0)
} }
func testStreamingChatCompletionReusesCacheAcrossThreeProgressivelyLongerTurns() async throws { func testStreamingChatCompletionReusesCacheAcrossThreeProgressivelyLongerTurns() async throws {
@@ -429,6 +432,7 @@ final class APIServerRewriteTests: XCTestCase {
let afterDisconnectSnapshot = TokenPrefixCache.shared.snapshot() let afterDisconnectSnapshot = TokenPrefixCache.shared.snapshot()
let afterDisconnectLiveSnapshot = LiveCounters.shared.snapshot() let afterDisconnectLiveSnapshot = LiveCounters.shared.snapshot()
XCTAssertGreaterThan(afterDisconnectSnapshot.totalEntries, initialSnapshot.totalEntries) XCTAssertGreaterThan(afterDisconnectSnapshot.totalEntries, initialSnapshot.totalEntries)
XCTAssertGreaterThan(afterDisconnectLiveSnapshot.totalDisconnects, 0)
_ = try await sendChatCompletion( _ = try await sendChatCompletion(
APIChatCompletionRequest( APIChatCompletionRequest(

View File

@@ -0,0 +1,46 @@
import Foundation
import XCTest
@testable import MLX_Server
/// Unit tests for the per-request metrics exposed through `LiveCounters.shared.snapshot()`.
final class LiveCountersTests: XCTestCase {
    /// Resets the shared counters after every test.
    override func tearDown() {
        LiveCounters.shared.reset()
        super.tearDown()
    }

    /// Drives one request through prefill, first token, and a duplicated disconnect,
    /// then checks the in-flight snapshot and the totals after completion.
    func testTracksRequestMetricsAndDeduplicatesDisconnects() {
        let counters = LiveCounters.shared
        let id = "req-1"

        // Start from a clean slate, then report the request lifecycle in order.
        counters.reset()
        counters.requestStarted(requestId: id, contextLength: 8_192)
        counters.requestPhaseChanged(requestId: id, phase: .prefilling)
        counters.recordPrefillReuse(requestId: id, matchedPromptTokens: 40, promptTokenCount: 64)
        counters.visionProcessingCompleted(requestId: id, duration: 0.25)

        // Short pauses before the prefill and first-token events; the timing
        // assertions below require values greater than zero.
        Thread.sleep(forTimeInterval: 0.01)
        counters.prefillCompleted(requestId: id, promptTokens: 64)
        Thread.sleep(forTimeInterval: 0.01)
        counters.firstTokenGenerated(requestId: id)
        counters.tokenGenerated(tokensPerSecond: 12.5, totalGenerated: 3)

        // The same disconnect is reported twice; it is expected to count once.
        counters.disconnectDetected(requestId: id)
        counters.disconnectDetected(requestId: id)

        let live = counters.snapshot()
        XCTAssertEqual(live.cacheMatchDepth, 40)
        XCTAssertEqual(live.currentCacheMatchedPromptTokens, 40)
        // 64 prompt tokens with 40 matched leaves 24 expected as rebuilt.
        XCTAssertEqual(live.currentCacheRebuiltPromptTokens, 24)
        XCTAssertEqual(live.visionEncoderTime, 0.25, accuracy: 0.0001)
        XCTAssertGreaterThan(live.prefillTokensPerSecond, 0)
        XCTAssertGreaterThan(live.timeToFirstToken, 0)
        XCTAssertEqual(live.totalDisconnects, 1)

        counters.requestCompleted(requestId: id, generationTokens: 3)

        let finished = counters.snapshot()
        XCTAssertEqual(finished.totalPromptTokens, 64)
        XCTAssertEqual(finished.totalGenerationTokens, 3)
        XCTAssertEqual(finished.totalVisionEncoderDuration, 0.25, accuracy: 0.0001)
        XCTAssertEqual(finished.totalDisconnects, 1)
    }
}

View File

@@ -2573,17 +2573,17 @@ Validation note: `PromptBuilder.swift` is now covered by both shaping-parity uni
### Phase 3: Integration ### Phase 3: Integration
7. [x] **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder. 7. [x] **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
8. **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested. 8. [x] **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested.
Validation note: `APIServer.swift` now routes the API path through `PromptBuilder`, `InferenceEngine`, `TokenPrefixCache`, and `StreamingSSEEncoder`, and the full repository test workflow is green. Image-bearing requests intentionally bypass prefix-cache reuse for now until image fingerprinting is implemented. Validation note: `APIServer.swift` now routes the API path through `PromptBuilder`, `InferenceEngine`, `TokenPrefixCache`, and `StreamingSSEEncoder`, and the full repository test workflow is green. Image-bearing requests intentionally bypass prefix-cache reuse for now until image fingerprinting is implemented.
### Phase 4: Statistics & Monitoring ### Phase 4: Statistics & Monitoring
9. **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer. 9. [x] **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer.
10. [x] **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot(). 10. [x] **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
11. [x] **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list. 11. [x] **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly and `MonitorView.swift` has been rebuilt around current system state and prefix-cache visibility rather than session-era charts. The dashboard now exposes cache match quality from matched-vs-rebuilt prompt token counters, but it still does not expose TTFT, cache match depth, or vision timing because those `LiveCounters` signals have not been implemented yet. Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly and `MonitorView.swift` now surfaces TTFT, prefill speed, cache match depth, cache memory pressure, disconnect totals, and vision prepare time from `LiveCounters`. Match-type hit breakdown is still open because it depends on the advanced cache matching work in Phase 5.
### Phase 5: Advanced Cache Matching ### Phase 5: Advanced Cache Matching
@@ -2608,10 +2608,10 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
### Cache Correctness ### Cache Correctness
- [ ] Cold start: no cache entries → fresh generation works - [x] Cold start: no cache entries → fresh generation works
- [ ] Second identical request → full cache hit, zero prefill tokens - [ ] Second identical request → full cache hit, zero prefill tokens
- [ ] Conversation continuation (add 1 message) → partial cache hit - [ ] Conversation continuation (add 1 message) → partial cache hit
- [ ] Conversation continuation (add 2+ messages, e.g. tool-use flow) → partial cache hit (not a miss!) - [x] Conversation continuation (add 2+ messages, e.g. tool-use flow) → partial cache hit (not a miss!)
- [ ] Same system prompt, different user message → system prompt prefix cached and reused - [ ] Same system prompt, different user message → system prompt prefix cached and reused
- [ ] Different system prompt → no false cache hit - [ ] Different system prompt → no false cache hit
- [ ] Model swap → cache invalidated, fresh generation works - [ ] Model swap → cache invalidated, fresh generation works
@@ -2622,31 +2622,31 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
- [ ] Memory budget computed correctly from Metal device - [ ] Memory budget computed correctly from Metal device
- [x] Entries evicted under memory pressure (oldest first) - [x] Entries evicted under memory pressure (oldest first)
- [x] Expired entries pruned after 30 min idle - [x] Expired entries pruned after 30 min idle
- [ ] Trie nodes cleaned up when entries are evicted (no memory leak) - [x] Trie nodes cleaned up when entries are evicted (no memory leak)
- [ ] `snapshot()` reports accurate memory usage and hit rates - [ ] `snapshot()` reports accurate memory usage and hit rates
### Disconnect Handling ### Disconnect Handling
- [ ] Client disconnects mid-stream → generation stops within ~200ms - [ ] Client disconnects mid-stream → generation stops within ~200ms
- [ ] Partial KV cache from disconnected request is still stored for reuse - [x] Partial KV cache from disconnected request is still stored for reuse
- [ ] No Metal assertion failures on disconnect - [ ] No Metal assertion failures on disconnect
### Streaming ### Streaming
- [ ] SSE JSON is valid and parseable by standard clients - [x] SSE JSON is valid and parseable by standard clients
- [x] `StreamingSSEEncoder` output matches `JSONEncoder` output byte-for-byte (for content deltas) - [x] `StreamingSSEEncoder` output matches `JSONEncoder` output byte-for-byte (for content deltas)
- [ ] Role delta sent once at stream start - [x] Role delta sent once at stream start
- [ ] Tool call chunks sent correctly - [x] Tool call chunks sent correctly
- [ ] Final chunk has finish_reason and usage stats - [x] Final chunk has finish_reason and usage stats
- [ ] `data: [DONE]` sent at end - [x] `data: [DONE]` sent at end
### Tool Use ### Tool Use
- [ ] Gemma tool_code blocks parsed correctly - [ ] Gemma tool_code blocks parsed correctly
- [ ] Qwen `<tool_call>` tags parsed correctly - [ ] Qwen `<tool_call>` tags parsed correctly
- [ ] Framework `ToolCall` events handled correctly - [ ] Framework `ToolCall` events handled correctly
- [ ] Tool results round-trip correctly (user sends tool result → model sees it in context) - [x] Tool results round-trip correctly (user sends tool result → model sees it in context)
- [ ] finish_reason is "tool_calls" when tools are invoked - [x] finish_reason is "tool_calls" when tools are invoked
### Vision-Language Models ### Vision-Language Models
@@ -2694,7 +2694,7 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
### Thinking Mode ### Thinking Mode
- [ ] `enable_thinking: false` passed through to template correctly - [x] `enable_thinking: false` passed through to template correctly
- [ ] Thinking mode on: `<think>` blocks appear in output - [ ] Thinking mode on: `<think>` blocks appear in output
- [ ] Thinking mode off: no `<think>` blocks - [ ] Thinking mode off: no `<think>` blocks
@@ -2702,7 +2702,7 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
- [ ] `GET /health` → `{"status":"ok"}` - [ ] `GET /health` → `{"status":"ok"}`
- [ ] `GET /v1/models` → model list with context windows - [ ] `GET /v1/models` → model list with context windows
- [ ] Non-streaming `POST /v1/chat/completions` → full response - [x] Non-streaming `POST /v1/chat/completions` → full response
- [ ] Streaming `POST /v1/chat/completions` → SSE stream - [x] Streaming `POST /v1/chat/completions` → SSE stream
- [ ] Model field in request triggers model swap - [ ] Model field in request triggers model swap
- [ ] UI chat (ChatViewModel) completely unaffected - [ ] UI chat (ChatViewModel) completely unaffected