feat: more visibility of prefilling

This commit is contained in:
2026-03-19 11:36:46 +01:00
parent 49bd165ce7
commit 577fdf8950
3 changed files with 290 additions and 109 deletions

View File

@@ -9,7 +9,7 @@ final class LiveCounters: @unchecked Sendable {
static let shared = LiveCounters() static let shared = LiveCounters()
private let lock = OSAllocatedUnfairLock() private let lock = OSAllocatedUnfairLock()
private var requestPhases: [String: RequestPhase] = [:] private var requestPhases: [String: RequestState] = [:]
// Current request // Current request
private var _activeRequests: Int = 0 private var _activeRequests: Int = 0
@@ -23,13 +23,19 @@ final class LiveCounters: @unchecked Sendable {
private var _isPrefilling: Bool = false private var _isPrefilling: Bool = false
private var _isGenerating: Bool = false private var _isGenerating: Bool = false
private var _contextMax: Int = 0 private var _contextMax: Int = 0
private var _currentPhaseElapsed: TimeInterval = 0
// Cumulative // Cumulative
private var _totalRequests: Int = 0 private var _totalRequests: Int = 0
private var _totalPromptTokens: Int = 0 private var _totalPromptTokens: Int = 0
private var _totalGenerationTokens: Int = 0 private var _totalGenerationTokens: Int = 0
private var _totalPreparingDuration: TimeInterval = 0
private var _totalSessionBuildDuration: TimeInterval = 0
private var _totalPrefillDuration: TimeInterval = 0
private var _totalGenerationDuration: TimeInterval = 0
func requestStarted(requestId: String, contextLength: Int) { func requestStarted(requestId: String, contextLength: Int) {
let now = Date()
lock.lock() lock.lock()
_activeRequests += 1 _activeRequests += 1
_preparingRequests += 1 _preparingRequests += 1
@@ -40,33 +46,40 @@ final class LiveCounters: @unchecked Sendable {
_generationTokens = 0 _generationTokens = 0
_tokensPerSecond = 0 _tokensPerSecond = 0
_contextMax = contextLength _contextMax = contextLength
requestPhases[requestId] = .preparing requestPhases[requestId] = RequestState(phase: .preparing, phaseStartedAt: now)
refreshCurrentPhaseElapsed(now: now)
lock.unlock() lock.unlock()
} }
func requestPhaseChanged(requestId: String, phase: RequestPhase) { func requestPhaseChanged(requestId: String, phase: RequestPhase) {
let now = Date()
lock.lock() lock.lock()
if let current = requestPhases[requestId] { if let current = requestPhases[requestId] {
decrementCount(for: current) decrementCount(for: current.phase)
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
} }
incrementCount(for: phase) incrementCount(for: phase)
requestPhases[requestId] = phase requestPhases[requestId] = RequestState(phase: phase, phaseStartedAt: now)
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0 _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
_isGenerating = _generatingRequests > 0 _isGenerating = _generatingRequests > 0
refreshCurrentPhaseElapsed(now: now)
lock.unlock() lock.unlock()
} }
func prefillCompleted(requestId: String, promptTokens: Int) { func prefillCompleted(requestId: String, promptTokens: Int) {
let now = Date()
lock.lock() lock.lock()
if let current = requestPhases[requestId] { if let current = requestPhases[requestId] {
decrementCount(for: current) decrementCount(for: current.phase)
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
} }
incrementCount(for: .generating) incrementCount(for: .generating)
requestPhases[requestId] = .generating requestPhases[requestId] = RequestState(phase: .generating, phaseStartedAt: now)
_promptTokens = promptTokens _promptTokens = promptTokens
_totalPromptTokens += promptTokens _totalPromptTokens += promptTokens
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0 _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
_isGenerating = _generatingRequests > 0 _isGenerating = _generatingRequests > 0
refreshCurrentPhaseElapsed(now: now)
lock.unlock() lock.unlock()
} }
@@ -78,9 +91,11 @@ final class LiveCounters: @unchecked Sendable {
} }
func requestCompleted(requestId: String, generationTokens: Int) { func requestCompleted(requestId: String, generationTokens: Int) {
let now = Date()
lock.lock() lock.lock()
if let current = requestPhases.removeValue(forKey: requestId) { if let current = requestPhases.removeValue(forKey: requestId) {
decrementCount(for: current) decrementCount(for: current.phase)
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
} }
_activeRequests = max(0, _activeRequests - 1) _activeRequests = max(0, _activeRequests - 1)
_totalGenerationTokens += generationTokens _totalGenerationTokens += generationTokens
@@ -92,6 +107,7 @@ final class LiveCounters: @unchecked Sendable {
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0 _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
_isGenerating = _generatingRequests > 0 _isGenerating = _generatingRequests > 0
} }
refreshCurrentPhaseElapsed(now: now)
lock.unlock() lock.unlock()
} }
@@ -109,15 +125,22 @@ final class LiveCounters: @unchecked Sendable {
_isPrefilling = false _isPrefilling = false
_isGenerating = false _isGenerating = false
_contextMax = 0 _contextMax = 0
_currentPhaseElapsed = 0
_totalRequests = 0 _totalRequests = 0
_totalPromptTokens = 0 _totalPromptTokens = 0
_totalGenerationTokens = 0 _totalGenerationTokens = 0
_totalPreparingDuration = 0
_totalSessionBuildDuration = 0
_totalPrefillDuration = 0
_totalGenerationDuration = 0
lock.unlock() lock.unlock()
} }
/// Atomic snapshot for the UI timer. /// Atomic snapshot for the UI timer.
func snapshot() -> Snapshot { func snapshot() -> Snapshot {
let now = Date()
lock.lock() lock.lock()
refreshCurrentPhaseElapsed(now: now)
let s = Snapshot( let s = Snapshot(
activeRequests: _activeRequests, activeRequests: _activeRequests,
preparingRequests: _preparingRequests, preparingRequests: _preparingRequests,
@@ -130,9 +153,14 @@ final class LiveCounters: @unchecked Sendable {
isPrefilling: _isPrefilling, isPrefilling: _isPrefilling,
isGenerating: _isGenerating, isGenerating: _isGenerating,
contextMax: _contextMax, contextMax: _contextMax,
currentPhaseElapsed: _currentPhaseElapsed,
totalRequests: _totalRequests, totalRequests: _totalRequests,
totalPromptTokens: _totalPromptTokens, totalPromptTokens: _totalPromptTokens,
totalGenerationTokens: _totalGenerationTokens totalGenerationTokens: _totalGenerationTokens,
totalPreparingDuration: _totalPreparingDuration,
totalSessionBuildDuration: _totalSessionBuildDuration,
totalPrefillDuration: _totalPrefillDuration,
totalGenerationDuration: _totalGenerationDuration
) )
lock.unlock() lock.unlock()
return s return s
@@ -150,9 +178,14 @@ final class LiveCounters: @unchecked Sendable {
let isPrefilling: Bool let isPrefilling: Bool
let isGenerating: Bool let isGenerating: Bool
let contextMax: Int let contextMax: Int
let currentPhaseElapsed: TimeInterval
let totalRequests: Int let totalRequests: Int
let totalPromptTokens: Int let totalPromptTokens: Int
let totalGenerationTokens: Int let totalGenerationTokens: Int
let totalPreparingDuration: TimeInterval
let totalSessionBuildDuration: TimeInterval
let totalPrefillDuration: TimeInterval
let totalGenerationDuration: TimeInterval
} }
private func incrementCount(for phase: RequestPhase) { private func incrementCount(for phase: RequestPhase) {
@@ -181,6 +214,28 @@ final class LiveCounters: @unchecked Sendable {
} }
} }
/// Adds the time a request spent in `phase` to that phase's cumulative total.
///
/// The callers visible in this file compute `elapsed` as the difference between
/// two `Date()` readings. `Date` follows the wall clock, so that difference can
/// be negative when the system clock is adjusted backwards between the readings.
/// Negative or non-finite samples are counted as zero so the cumulative totals
/// never decrease and never become NaN.
///
/// - Parameters:
///   - phase: The phase the request is leaving.
///   - elapsed: Seconds the request spent in `phase`.
private func accumulateDuration(for phase: RequestPhase, elapsed: TimeInterval) {
    let credited: TimeInterval = elapsed.isFinite ? max(0, elapsed) : 0
    switch phase {
    case .preparing:
        _totalPreparingDuration += credited
    case .sessionBuild:
        _totalSessionBuildDuration += credited
    case .prefilling:
        _totalPrefillDuration += credited
    case .generating:
        _totalGenerationDuration += credited
    }
}
/// Recomputes `_currentPhaseElapsed` as the largest phase age, in seconds
/// measured at `now`, among all tracked requests. Stores 0 when no request
/// is tracked.
///
/// - Parameter now: The instant against which each phase start is measured.
private func refreshCurrentPhaseElapsed(now: Date) {
    var longestAge: TimeInterval?
    for state in requestPhases.values {
        let age = now.timeIntervalSince(state.phaseStartedAt)
        if let best = longestAge, best >= age {
            continue
        }
        longestAge = age
    }
    _currentPhaseElapsed = longestAge ?? 0
}
/// Per-request bookkeeping: the phase a request is in and when it entered it.
private struct RequestState {
    /// Phase the request is currently in.
    var phase: RequestPhase
    /// Timestamp recorded when the request entered `phase`.
    var phaseStartedAt: Date

    /// Creates the state for a request that entered `phase` at `phaseStartedAt`.
    init(phase: RequestPhase, phaseStartedAt: Date) {
        self.phase = phase
        self.phaseStartedAt = phaseStartedAt
    }
}
enum RequestPhase { enum RequestPhase {
case preparing case preparing
case sessionBuild case sessionBuild
@@ -208,6 +263,7 @@ final class InferenceStats {
var currentTokensPerSecond: Double = 0 var currentTokensPerSecond: Double = 0
var contextUsed: Int = 0 var contextUsed: Int = 0
var contextMax: Int = 0 var contextMax: Int = 0
var currentPhaseElapsed: TimeInterval = 0
// MARK: - Cumulative counters // MARK: - Cumulative counters
@@ -219,6 +275,10 @@ final class InferenceStats {
var totalCacheEvictions: Int = 0 var totalCacheEvictions: Int = 0
var totalCacheReusePromptTokens: Int = 0 var totalCacheReusePromptTokens: Int = 0
var totalCacheRebuildPromptTokens: Int = 0 var totalCacheRebuildPromptTokens: Int = 0
var totalPreparingDuration: TimeInterval = 0
var totalSessionBuildDuration: TimeInterval = 0
var totalPrefillDuration: TimeInterval = 0
var totalGenerationDuration: TimeInterval = 0
// MARK: - Cache state // MARK: - Cache state
@@ -246,6 +306,9 @@ final class InferenceStats {
private(set) var cacheFootprintHistory: [DataPoint] = [] private(set) var cacheFootprintHistory: [DataPoint] = []
private(set) var cacheReuseHistory: [DataPoint] = [] private(set) var cacheReuseHistory: [DataPoint] = []
private(set) var cacheRebuildHistory: [DataPoint] = [] private(set) var cacheRebuildHistory: [DataPoint] = []
private(set) var currentPhaseElapsedHistory: [DataPoint] = []
private(set) var prefillDurationHistory: [DataPoint] = []
private(set) var sessionBuildDurationHistory: [DataPoint] = []
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
@@ -255,6 +318,8 @@ final class InferenceStats {
private var lastPromptTokenCount: Int = 0 private var lastPromptTokenCount: Int = 0
private var lastCacheReuseTokenCount: Int = 0 private var lastCacheReuseTokenCount: Int = 0
private var lastCacheRebuildTokenCount: Int = 0 private var lastCacheRebuildTokenCount: Int = 0
private var lastPrefillDuration: TimeInterval = 0
private var lastSessionBuildDuration: TimeInterval = 0
func startSampling() { func startSampling() {
guard sampleTimer == nil else { return } guard sampleTimer == nil else { return }
@@ -287,9 +352,14 @@ final class InferenceStats {
isGenerating = snap.isGenerating isGenerating = snap.isGenerating
contextMax = snap.contextMax contextMax = snap.contextMax
contextUsed = snap.promptTokens + snap.generationTokens contextUsed = snap.promptTokens + snap.generationTokens
currentPhaseElapsed = snap.currentPhaseElapsed
totalRequests = snap.totalRequests totalRequests = snap.totalRequests
totalPromptTokens = snap.totalPromptTokens totalPromptTokens = snap.totalPromptTokens
totalGenerationTokens = snap.totalGenerationTokens totalGenerationTokens = snap.totalGenerationTokens
totalPreparingDuration = snap.totalPreparingDuration
totalSessionBuildDuration = snap.totalSessionBuildDuration
totalPrefillDuration = snap.totalPrefillDuration
totalGenerationDuration = snap.totalGenerationDuration
totalCacheHits = cache.totalHits totalCacheHits = cache.totalHits
totalCacheMisses = cache.totalMisses totalCacheMisses = cache.totalMisses
totalCacheEvictions = cache.totalEvictions totalCacheEvictions = cache.totalEvictions
@@ -308,10 +378,14 @@ final class InferenceStats {
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration
lastGenerationTokenCount = snap.totalGenerationTokens lastGenerationTokenCount = snap.totalGenerationTokens
lastPromptTokenCount = snap.totalPromptTokens lastPromptTokenCount = snap.totalPromptTokens
lastCacheReuseTokenCount = cache.totalReusePromptTokens lastCacheReuseTokenCount = cache.totalReusePromptTokens
lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
lastPrefillDuration = snap.totalPrefillDuration
lastSessionBuildDuration = snap.totalSessionBuildDuration
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond)) tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta))) generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
@@ -321,6 +395,9 @@ final class InferenceStats {
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes))) cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta))) cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta))) cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta))
if tokenRateHistory.count > Self.maxHistoryPoints { if tokenRateHistory.count > Self.maxHistoryPoints {
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints) tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -346,6 +423,15 @@ final class InferenceStats {
if cacheRebuildHistory.count > Self.maxHistoryPoints { if cacheRebuildHistory.count > Self.maxHistoryPoints {
cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints) cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
} }
if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
}
if prefillDurationHistory.count > Self.maxHistoryPoints {
prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
}
if sessionBuildDurationHistory.count > Self.maxHistoryPoints {
sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints)
}
} }
func reset() { func reset() {
@@ -363,9 +449,14 @@ final class InferenceStats {
currentTokensPerSecond = 0 currentTokensPerSecond = 0
contextUsed = 0 contextUsed = 0
contextMax = 0 contextMax = 0
currentPhaseElapsed = 0
totalRequests = 0 totalRequests = 0
totalPromptTokens = 0 totalPromptTokens = 0
totalGenerationTokens = 0 totalGenerationTokens = 0
totalPreparingDuration = 0
totalSessionBuildDuration = 0
totalPrefillDuration = 0
totalGenerationDuration = 0
totalCacheHits = 0 totalCacheHits = 0
totalCacheMisses = 0 totalCacheMisses = 0
totalCacheEvictions = 0 totalCacheEvictions = 0
@@ -386,9 +477,14 @@ final class InferenceStats {
cacheFootprintHistory.removeAll() cacheFootprintHistory.removeAll()
cacheReuseHistory.removeAll() cacheReuseHistory.removeAll()
cacheRebuildHistory.removeAll() cacheRebuildHistory.removeAll()
currentPhaseElapsedHistory.removeAll()
prefillDurationHistory.removeAll()
sessionBuildDurationHistory.removeAll()
lastGenerationTokenCount = 0 lastGenerationTokenCount = 0
lastPromptTokenCount = 0 lastPromptTokenCount = 0
lastCacheReuseTokenCount = 0 lastCacheReuseTokenCount = 0
lastCacheRebuildTokenCount = 0 lastCacheRebuildTokenCount = 0
lastPrefillDuration = 0
lastSessionBuildDuration = 0
} }
} }

View File

@@ -393,7 +393,7 @@ final class APIServer {
// Extract images from the last message only (ChatSession.streamDetails takes images separately) // Extract images from the last message only (ChatSession.streamDetails takes images separately)
let lastImages = lastMessage.images let lastImages = lastMessage.images
let result: (promptTokens: Int, completionTokens: Int, succeeded: Bool) let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool)
if isStream { if isStream {
result = await handleStreamingResponse( result = await handleStreamingResponse(
@@ -405,7 +405,8 @@ final class APIServer {
images: lastImages, images: lastImages,
tools: request.tools, tools: request.tools,
created: created, created: created,
modelName: modelName modelName: modelName,
isQwen: isQwen
) )
} else { } else {
result = await handleNonStreamingResponse( result = await handleNonStreamingResponse(
@@ -417,16 +418,23 @@ final class APIServer {
images: lastImages, images: lastImages,
tools: request.tools, tools: request.tools,
created: created, created: created,
modelName: modelName modelName: modelName,
isQwen: isQwen
) )
} }
if result.succeeded { if result.succeeded {
var cachedSignatures = messageSignatures
if let assistantHistoryText = result.assistantHistoryText {
cachedSignatures.append(
Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: [])
)
}
ConversationSessionCache.shared.completeRequest( ConversationSessionCache.shared.completeRequest(
entryId: lease.entryId, entryId: lease.entryId,
session: session, session: session,
requestMessageSignatures: messageSignatures, requestMessageSignatures: cachedSignatures,
requestMessageCount: chatMessages.count, requestMessageCount: cachedSignatures.count,
estimatedPromptTokens: estimatedPromptTokens, estimatedPromptTokens: estimatedPromptTokens,
estimatedBytes: estimatedBytes, estimatedBytes: estimatedBytes,
promptTokens: result.promptTokens, promptTokens: result.promptTokens,
@@ -473,8 +481,9 @@ final class APIServer {
images: [UserInput.Image], images: [UserInput.Image],
tools: [APIToolDefinition]?, tools: [APIToolDefinition]?,
created: Int, created: Int,
modelName: String modelName: String,
) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) { isQwen: Bool
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
do { do {
var fullText = "" var fullText = ""
var promptTokens = 0 var promptTokens = 0
@@ -510,48 +519,11 @@ final class APIServer {
} }
} }
// Parse tool calls: first check framework-detected ones, then our own text parser let resolved = Self.resolveAssistantResponse(
var finishReason = "stop" fullText: fullText,
var responseContent: String? = fullText frameworkToolCalls: frameworkToolCalls,
var apiToolCalls: [APIToolCall]? = nil tools: tools
)
if !frameworkToolCalls.isEmpty {
// Framework natively detected tool calls (e.g. Qwen)
finishReason = "tool_calls"
apiToolCalls = frameworkToolCalls.enumerated().map { i, tc in
let argsJSON: String
let argsDict = tc.function.arguments.mapValues { $0.anyValue }
if let data = try? JSONSerialization.data(withJSONObject: argsDict),
let str = String(data: data, encoding: .utf8) {
argsJSON = str
} else {
argsJSON = "{}"
}
let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
return APIToolCall(
index: i,
id: callId,
type: "function",
function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
)
}
responseContent = fullText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty ? nil : fullText
} else if let tools, !tools.isEmpty {
// Try our own text parser (e.g. Gemma tool_code blocks)
let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
if !parsedCalls.isEmpty {
finishReason = "tool_calls"
apiToolCalls = parsedCalls.enumerated().map { i, tc in
APIToolCall(
index: i,
id: tc.id,
type: "function",
function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
)
}
responseContent = cleanText.isEmpty ? nil : cleanText
}
}
let response = APIChatCompletionResponse( let response = APIChatCompletionResponse(
id: requestId, id: requestId,
@@ -563,10 +535,10 @@ final class APIServer {
index: 0, index: 0,
message: APIChoiceMessage( message: APIChoiceMessage(
role: "assistant", role: "assistant",
content: responseContent, content: resolved.content,
tool_calls: apiToolCalls tool_calls: resolved.toolCalls
), ),
finish_reason: finishReason finish_reason: resolved.finishReason
) )
], ],
usage: APIUsageInfo( usage: APIUsageInfo(
@@ -579,10 +551,15 @@ final class APIServer {
if let json = try? JSONEncoder().encode(response) { if let json = try? JSONEncoder().encode(response) {
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}") sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
} }
return (promptTokens, completionTokens, true) let assistantHistoryText = Self.normalizedAssistantHistoryContent(
content: resolved.content,
toolCalls: resolved.toolCalls,
isQwen: isQwen
)
return (promptTokens, completionTokens, assistantHistoryText, true)
} catch { } catch {
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#) sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
return (0, 0, false) return (0, 0, nil, false)
} }
} }
@@ -597,8 +574,9 @@ final class APIServer {
images: [UserInput.Image], images: [UserInput.Image],
tools: [APIToolDefinition]?, tools: [APIToolDefinition]?,
created: Int, created: Int,
modelName: String modelName: String,
) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) { isQwen: Bool
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
// Send SSE headers // Send SSE headers
let header = [ let header = [
"HTTP/1.1 200 OK", "HTTP/1.1 200 OK",
@@ -657,50 +635,14 @@ final class APIServer {
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens) LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
} }
// Stats were already updated by LiveCounters inside the loop let resolved = Self.resolveAssistantResponse(
fullText: fullText,
frameworkToolCalls: frameworkToolCalls,
tools: tools
)
// Post-generation: handle tool calls (framework-detected or text-parsed) if let toolCalls = resolved.toolCalls {
var finishReason = "stop" for apiToolCall in toolCalls {
if !frameworkToolCalls.isEmpty {
finishReason = "tool_calls"
for (i, tc) in frameworkToolCalls.enumerated() {
let argsDict = tc.function.arguments.mapValues { $0.anyValue }
let argsJSON: String
if let data = try? JSONSerialization.data(withJSONObject: argsDict),
let str = String(data: data, encoding: .utf8) {
argsJSON = str
} else {
argsJSON = "{}"
}
let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
let apiToolCall = APIToolCall(
index: i,
id: callId,
type: "function",
function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
)
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
id: requestId,
object: "chat.completion.chunk",
created: created,
model: modelName,
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: [apiToolCall]), finish_reason: nil)],
usage: nil
))
}
} else if hasTools {
let (_, parsed) = ToolCallParser.parse(text: fullText, tools: tools)
if !parsed.isEmpty {
finishReason = "tool_calls"
}
for (i, tc) in parsed.enumerated() {
let apiToolCall = APIToolCall(
index: i,
id: tc.id,
type: "function",
function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
)
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk( await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
id: requestId, id: requestId,
object: "chat.completion.chunk", object: "chat.completion.chunk",
@@ -718,7 +660,7 @@ final class APIServer {
object: "chat.completion.chunk", object: "chat.completion.chunk",
created: created, created: created,
model: modelName, model: modelName,
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: finishReason)], choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
usage: APIUsageInfo( usage: APIUsageInfo(
prompt_tokens: promptTokens, prompt_tokens: promptTokens,
completion_tokens: completionTokens, completion_tokens: completionTokens,
@@ -729,7 +671,12 @@ final class APIServer {
// Send [DONE] and close // Send [DONE] and close
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!) await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
connection.cancel() connection.cancel()
return (promptTokens, completionTokens, succeeded) let assistantHistoryText = Self.normalizedAssistantHistoryContent(
content: resolved.content,
toolCalls: resolved.toolCalls,
isQwen: isQwen
)
return (promptTokens, completionTokens, assistantHistoryText, succeeded)
} }
/// Run the token generation + SSE send loop entirely off MainActor. /// Run the token generation + SSE send loop entirely off MainActor.
@@ -876,6 +823,68 @@ final class APIServer {
return hash return hash
} }
/// Builds the assistant text used for the cached message signature.
///
/// The trimmed `content` comes first. When `toolCalls` is non-empty, the calls
/// are formatted with the Qwen or Gemma formatter (chosen by `isQwen`) and
/// appended on a new line.
///
/// - Parameters:
///   - content: Assistant text content, if any; surrounding whitespace is trimmed.
///   - toolCalls: Tool calls returned with the response, if any.
///   - isQwen: Selects `formatQwenToolCalls` when true, `formatGemmaToolCalls` otherwise.
/// - Returns: The combined text, or `nil` when the result is empty.
private static func normalizedAssistantHistoryContent(
    content: String?,
    toolCalls: [APIToolCall]?,
    isQwen: Bool
) -> String? {
    let trimmedContent = (content ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
    var segments: [String] = trimmedContent.isEmpty ? [] : [trimmedContent]

    if let toolCalls, !toolCalls.isEmpty {
        let renderedCalls: String
        if isQwen {
            renderedCalls = ToolPromptBuilder.formatQwenToolCalls(toolCalls)
        } else {
            renderedCalls = ToolPromptBuilder.formatGemmaToolCalls(toolCalls)
        }
        segments.append(renderedCalls)
    }

    let combined = segments.joined(separator: "\n")
    guard !combined.isEmpty else { return nil }
    return combined
}
/// Converts raw model output into the assistant-message fields of an API response.
///
/// Tool calls come from `frameworkToolCalls` when the framework reported any.
/// Otherwise, when `tools` is non-empty, `ToolCallParser.parse` is run over
/// `fullText` and its cleaned text replaces the content if it found calls.
///
/// - Parameters:
///   - fullText: The complete generated text.
///   - frameworkToolCalls: Tool calls detected by the framework during generation.
///   - tools: Tool definitions supplied with the request, if any.
/// - Returns: `content` (nil when blank), `toolCalls` (nil when none were found),
///   and `finishReason` (`"tool_calls"` when any call was found, else `"stop"`).
private static func resolveAssistantResponse(
    fullText: String,
    frameworkToolCalls: [MLXLMCommon.ToolCall],
    tools: [APIToolDefinition]?
) -> (content: String?, toolCalls: [APIToolCall]?, finishReason: String) {
    if !frameworkToolCalls.isEmpty {
        let calls = frameworkToolCalls.enumerated().map { index, call in
            APIToolCall(
                index: index,
                id: frameworkToolCallID(index: index, functionName: call.function.name),
                type: "function",
                function: APIFunctionCall(
                    name: call.function.name,
                    arguments: encodedToolArguments(call.function.arguments.mapValues { $0.anyValue })
                )
            )
        }
        return (nonBlankContent(fullText), calls, "tool_calls")
    }

    if let tools, !tools.isEmpty {
        let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
        if !parsedCalls.isEmpty {
            let calls = parsedCalls.enumerated().map { index, call in
                APIToolCall(
                    index: index,
                    id: call.id,
                    type: "function",
                    function: APIFunctionCall(name: call.name, arguments: call.arguments)
                )
            }
            // Apply the same blank-text rule as the other branches so that
            // whitespace left over after stripping tool calls is not sent as content.
            return (nonBlankContent(cleanText), calls, "tool_calls")
        }
    }

    return (nonBlankContent(fullText), nil, "stop")
}

/// Returns `text` unchanged, or `nil` when it holds only whitespace and newlines.
private static func nonBlankContent(_ text: String) -> String? {
    text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty ? nil : text
}

/// Serializes tool-call arguments to a JSON string, falling back to `"{}"`.
///
/// `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception,
/// which `try?` cannot intercept, when given an object that is not valid JSON
/// (for example one containing NaN). Validity is therefore checked first.
private static func encodedToolArguments(_ arguments: Any) -> String {
    guard JSONSerialization.isValidJSONObject(arguments),
          let data = try? JSONSerialization.data(withJSONObject: arguments),
          let json = String(data: data, encoding: .utf8) else {
        return "{}"
    }
    return json
}

/// Builds the `call_<index>_<8 digits>` identifier for a framework-detected tool call.
///
/// Uses `magnitude` rather than `abs(_:)` because `abs(Int.min)` traps on overflow.
/// For every other hash value the result is the same.
private static func frameworkToolCallID(index: Int, functionName: String) -> String {
    let suffix = Int(functionName.hashValue.magnitude % 100_000_000)
    return String(format: "call_%d_%08d", index, suffix)
}
} }
private struct DecodedImage { private struct DecodedImage {

View File

@@ -17,6 +17,7 @@ struct MonitorView: View {
LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) { LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
tokenRateChart tokenRateChart
tokenThroughputChart tokenThroughputChart
phaseActivityChart
cacheReuseChart cacheReuseChart
cacheFootprintChart cacheFootprintChart
cacheSessionChart cacheSessionChart
@@ -90,6 +91,9 @@ struct MonitorView: View {
phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue) phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue)
phaseChip(title: "Generating", count: stats.generatingRequests, color: .green) phaseChip(title: "Generating", count: stats.generatingRequests, color: .green)
phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange) phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange)
if stats.activeRequests > 0 {
phaseChip(title: phaseAgeLabel, count: Int(stats.currentPhaseElapsed.rounded()), color: activityColor)
}
} }
} }
.padding(12) .padding(12)
@@ -161,6 +165,71 @@ struct MonitorView: View {
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10)) .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
} }
// "Phase Activity" card: one chart that overlays a line of the sampled phase
// age with bars for prefill and session-build duration samples, followed by
// a three-entry legend.
@ViewBuilder
private var phaseActivityChart: some View {
VStack(alignment: .leading, spacing: 6) {
Text("Phase Activity")
.font(.caption.bold())
.foregroundStyle(.secondary)
Chart {
// Line series: `stats.currentPhaseElapsedHistory`, drawn in `activityColor`.
ForEach(stats.currentPhaseElapsedHistory) { point in
LineMark(
x: .value("Time", point.timestamp),
y: .value("Active s", point.value)
)
.foregroundStyle(activityColor)
.interpolationMethod(.monotone)
}
// Bar series: `stats.prefillDurationHistory`, semi-transparent blue.
ForEach(stats.prefillDurationHistory) { point in
BarMark(
x: .value("Time", point.timestamp),
y: .value("Prefill done", point.value)
)
.foregroundStyle(.blue.opacity(0.45))
}
// Bar series: `stats.sessionBuildDurationHistory`, semi-transparent purple.
ForEach(stats.sessionBuildDurationHistory) { point in
BarMark(
x: .value("Time", point.timestamp),
y: .value("Build done", point.value)
)
.foregroundStyle(.purple.opacity(0.45))
}
}
// X axis: grid lines at 30-second strides; no value labels are emitted.
.chartXAxis {
AxisMarks(values: .stride(by: .second, count: 30)) { _ in
AxisGridLine()
}
}
// Y axis: on the leading edge, labelled with values formatted to zero decimals.
.chartYAxis {
AxisMarks(position: .leading) { value in
AxisGridLine()
AxisValueLabel {
if let v = value.as(Double.self) {
Text(String(format: "%.0f", v))
.font(.caption2.monospacedDigit())
}
}
}
}
.frame(height: 150)
// Legend: colors match the three series above.
HStack(spacing: 12) {
Label("Active phase age", systemImage: "circle.fill")
.font(.caption2)
.foregroundStyle(activityColor)
Label("Prefill completed", systemImage: "circle.fill")
.font(.caption2)
.foregroundStyle(.blue)
Label("Session build completed", systemImage: "circle.fill")
.font(.caption2)
.foregroundStyle(.purple)
}
}
.padding(12)
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
@ViewBuilder @ViewBuilder
private var cacheReuseChart: some View { private var cacheReuseChart: some View {
VStack(alignment: .leading, spacing: 6) { VStack(alignment: .leading, spacing: 6) {
@@ -717,6 +786,13 @@ struct MonitorView: View {
return Double(stats.totalCacheHits) / Double(total) return Double(stats.totalCacheHits) / Double(total)
} }
/// Title for the phase-age chip, chosen from the first non-zero request count
/// in this order: generating, prefilling, session build. Falls back to
/// "Preparing s" when all three counts are zero.
///
/// NOTE(review): the chip pairs this title with `stats.currentPhaseElapsed`.
/// With several requests in different phases, confirm that value refers to
/// the same phase as the title chosen here.
private var phaseAgeLabel: String {
    let title: String
    if stats.generatingRequests > 0 {
        title = "Generating s"
    } else if stats.prefillingRequests > 0 {
        title = "Prefill s"
    } else if stats.sessionBuildRequests > 0 {
        title = "Build s"
    } else {
        title = "Preparing s"
    }
    return title
}
private func maxContextRatio(for tokens: Int) -> Double { private func maxContextRatio(for tokens: Int) -> Double {
let maxContext = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0) let maxContext = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0)
guard maxContext > 0 else { return 0 } guard maxContext > 0 else { return 0 }