feat: more visibility of prefilling
This commit is contained in:
@@ -9,7 +9,7 @@ final class LiveCounters: @unchecked Sendable {
|
||||
static let shared = LiveCounters()
|
||||
|
||||
private let lock = OSAllocatedUnfairLock()
|
||||
private var requestPhases: [String: RequestPhase] = [:]
|
||||
private var requestPhases: [String: RequestState] = [:]
|
||||
|
||||
// Current request
|
||||
private var _activeRequests: Int = 0
|
||||
@@ -23,13 +23,19 @@ final class LiveCounters: @unchecked Sendable {
|
||||
private var _isPrefilling: Bool = false
|
||||
private var _isGenerating: Bool = false
|
||||
private var _contextMax: Int = 0
|
||||
private var _currentPhaseElapsed: TimeInterval = 0
|
||||
|
||||
// Cumulative
|
||||
private var _totalRequests: Int = 0
|
||||
private var _totalPromptTokens: Int = 0
|
||||
private var _totalGenerationTokens: Int = 0
|
||||
private var _totalPreparingDuration: TimeInterval = 0
|
||||
private var _totalSessionBuildDuration: TimeInterval = 0
|
||||
private var _totalPrefillDuration: TimeInterval = 0
|
||||
private var _totalGenerationDuration: TimeInterval = 0
|
||||
|
||||
func requestStarted(requestId: String, contextLength: Int) {
|
||||
let now = Date()
|
||||
lock.lock()
|
||||
_activeRequests += 1
|
||||
_preparingRequests += 1
|
||||
@@ -40,33 +46,40 @@ final class LiveCounters: @unchecked Sendable {
|
||||
_generationTokens = 0
|
||||
_tokensPerSecond = 0
|
||||
_contextMax = contextLength
|
||||
requestPhases[requestId] = .preparing
|
||||
requestPhases[requestId] = RequestState(phase: .preparing, phaseStartedAt: now)
|
||||
refreshCurrentPhaseElapsed(now: now)
|
||||
lock.unlock()
|
||||
}
|
||||
|
||||
func requestPhaseChanged(requestId: String, phase: RequestPhase) {
|
||||
let now = Date()
|
||||
lock.lock()
|
||||
if let current = requestPhases[requestId] {
|
||||
decrementCount(for: current)
|
||||
decrementCount(for: current.phase)
|
||||
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
|
||||
}
|
||||
incrementCount(for: phase)
|
||||
requestPhases[requestId] = phase
|
||||
requestPhases[requestId] = RequestState(phase: phase, phaseStartedAt: now)
|
||||
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
||||
_isGenerating = _generatingRequests > 0
|
||||
refreshCurrentPhaseElapsed(now: now)
|
||||
lock.unlock()
|
||||
}
|
||||
|
||||
func prefillCompleted(requestId: String, promptTokens: Int) {
|
||||
let now = Date()
|
||||
lock.lock()
|
||||
if let current = requestPhases[requestId] {
|
||||
decrementCount(for: current)
|
||||
decrementCount(for: current.phase)
|
||||
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
|
||||
}
|
||||
incrementCount(for: .generating)
|
||||
requestPhases[requestId] = .generating
|
||||
requestPhases[requestId] = RequestState(phase: .generating, phaseStartedAt: now)
|
||||
_promptTokens = promptTokens
|
||||
_totalPromptTokens += promptTokens
|
||||
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
||||
_isGenerating = _generatingRequests > 0
|
||||
refreshCurrentPhaseElapsed(now: now)
|
||||
lock.unlock()
|
||||
}
|
||||
|
||||
@@ -78,9 +91,11 @@ final class LiveCounters: @unchecked Sendable {
|
||||
}
|
||||
|
||||
func requestCompleted(requestId: String, generationTokens: Int) {
|
||||
let now = Date()
|
||||
lock.lock()
|
||||
if let current = requestPhases.removeValue(forKey: requestId) {
|
||||
decrementCount(for: current)
|
||||
decrementCount(for: current.phase)
|
||||
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
|
||||
}
|
||||
_activeRequests = max(0, _activeRequests - 1)
|
||||
_totalGenerationTokens += generationTokens
|
||||
@@ -92,6 +107,7 @@ final class LiveCounters: @unchecked Sendable {
|
||||
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
||||
_isGenerating = _generatingRequests > 0
|
||||
}
|
||||
refreshCurrentPhaseElapsed(now: now)
|
||||
lock.unlock()
|
||||
}
|
||||
|
||||
@@ -109,15 +125,22 @@ final class LiveCounters: @unchecked Sendable {
|
||||
_isPrefilling = false
|
||||
_isGenerating = false
|
||||
_contextMax = 0
|
||||
_currentPhaseElapsed = 0
|
||||
_totalRequests = 0
|
||||
_totalPromptTokens = 0
|
||||
_totalGenerationTokens = 0
|
||||
_totalPreparingDuration = 0
|
||||
_totalSessionBuildDuration = 0
|
||||
_totalPrefillDuration = 0
|
||||
_totalGenerationDuration = 0
|
||||
lock.unlock()
|
||||
}
|
||||
|
||||
/// Atomic snapshot for the UI timer.
|
||||
func snapshot() -> Snapshot {
|
||||
let now = Date()
|
||||
lock.lock()
|
||||
refreshCurrentPhaseElapsed(now: now)
|
||||
let s = Snapshot(
|
||||
activeRequests: _activeRequests,
|
||||
preparingRequests: _preparingRequests,
|
||||
@@ -130,9 +153,14 @@ final class LiveCounters: @unchecked Sendable {
|
||||
isPrefilling: _isPrefilling,
|
||||
isGenerating: _isGenerating,
|
||||
contextMax: _contextMax,
|
||||
currentPhaseElapsed: _currentPhaseElapsed,
|
||||
totalRequests: _totalRequests,
|
||||
totalPromptTokens: _totalPromptTokens,
|
||||
totalGenerationTokens: _totalGenerationTokens
|
||||
totalGenerationTokens: _totalGenerationTokens,
|
||||
totalPreparingDuration: _totalPreparingDuration,
|
||||
totalSessionBuildDuration: _totalSessionBuildDuration,
|
||||
totalPrefillDuration: _totalPrefillDuration,
|
||||
totalGenerationDuration: _totalGenerationDuration
|
||||
)
|
||||
lock.unlock()
|
||||
return s
|
||||
@@ -150,9 +178,14 @@ final class LiveCounters: @unchecked Sendable {
|
||||
let isPrefilling: Bool
|
||||
let isGenerating: Bool
|
||||
let contextMax: Int
|
||||
let currentPhaseElapsed: TimeInterval
|
||||
let totalRequests: Int
|
||||
let totalPromptTokens: Int
|
||||
let totalGenerationTokens: Int
|
||||
let totalPreparingDuration: TimeInterval
|
||||
let totalSessionBuildDuration: TimeInterval
|
||||
let totalPrefillDuration: TimeInterval
|
||||
let totalGenerationDuration: TimeInterval
|
||||
}
|
||||
|
||||
private func incrementCount(for phase: RequestPhase) {
|
||||
@@ -181,6 +214,28 @@ final class LiveCounters: @unchecked Sendable {
|
||||
}
|
||||
}
|
||||
|
||||
private func accumulateDuration(for phase: RequestPhase, elapsed: TimeInterval) {
|
||||
switch phase {
|
||||
case .preparing:
|
||||
_totalPreparingDuration += elapsed
|
||||
case .sessionBuild:
|
||||
_totalSessionBuildDuration += elapsed
|
||||
case .prefilling:
|
||||
_totalPrefillDuration += elapsed
|
||||
case .generating:
|
||||
_totalGenerationDuration += elapsed
|
||||
}
|
||||
}
|
||||
|
||||
private func refreshCurrentPhaseElapsed(now: Date) {
|
||||
_currentPhaseElapsed = requestPhases.values.map { now.timeIntervalSince($0.phaseStartedAt) }.max() ?? 0
|
||||
}
|
||||
|
||||
private struct RequestState {
|
||||
var phase: RequestPhase
|
||||
var phaseStartedAt: Date
|
||||
}
|
||||
|
||||
enum RequestPhase {
|
||||
case preparing
|
||||
case sessionBuild
|
||||
@@ -208,6 +263,7 @@ final class InferenceStats {
|
||||
var currentTokensPerSecond: Double = 0
|
||||
var contextUsed: Int = 0
|
||||
var contextMax: Int = 0
|
||||
var currentPhaseElapsed: TimeInterval = 0
|
||||
|
||||
// MARK: - Cumulative counters
|
||||
|
||||
@@ -219,6 +275,10 @@ final class InferenceStats {
|
||||
var totalCacheEvictions: Int = 0
|
||||
var totalCacheReusePromptTokens: Int = 0
|
||||
var totalCacheRebuildPromptTokens: Int = 0
|
||||
var totalPreparingDuration: TimeInterval = 0
|
||||
var totalSessionBuildDuration: TimeInterval = 0
|
||||
var totalPrefillDuration: TimeInterval = 0
|
||||
var totalGenerationDuration: TimeInterval = 0
|
||||
|
||||
// MARK: - Cache state
|
||||
|
||||
@@ -246,6 +306,9 @@ final class InferenceStats {
|
||||
private(set) var cacheFootprintHistory: [DataPoint] = []
|
||||
private(set) var cacheReuseHistory: [DataPoint] = []
|
||||
private(set) var cacheRebuildHistory: [DataPoint] = []
|
||||
private(set) var currentPhaseElapsedHistory: [DataPoint] = []
|
||||
private(set) var prefillDurationHistory: [DataPoint] = []
|
||||
private(set) var sessionBuildDurationHistory: [DataPoint] = []
|
||||
|
||||
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
|
||||
|
||||
@@ -255,6 +318,8 @@ final class InferenceStats {
|
||||
private var lastPromptTokenCount: Int = 0
|
||||
private var lastCacheReuseTokenCount: Int = 0
|
||||
private var lastCacheRebuildTokenCount: Int = 0
|
||||
private var lastPrefillDuration: TimeInterval = 0
|
||||
private var lastSessionBuildDuration: TimeInterval = 0
|
||||
|
||||
func startSampling() {
|
||||
guard sampleTimer == nil else { return }
|
||||
@@ -287,9 +352,14 @@ final class InferenceStats {
|
||||
isGenerating = snap.isGenerating
|
||||
contextMax = snap.contextMax
|
||||
contextUsed = snap.promptTokens + snap.generationTokens
|
||||
currentPhaseElapsed = snap.currentPhaseElapsed
|
||||
totalRequests = snap.totalRequests
|
||||
totalPromptTokens = snap.totalPromptTokens
|
||||
totalGenerationTokens = snap.totalGenerationTokens
|
||||
totalPreparingDuration = snap.totalPreparingDuration
|
||||
totalSessionBuildDuration = snap.totalSessionBuildDuration
|
||||
totalPrefillDuration = snap.totalPrefillDuration
|
||||
totalGenerationDuration = snap.totalGenerationDuration
|
||||
totalCacheHits = cache.totalHits
|
||||
totalCacheMisses = cache.totalMisses
|
||||
totalCacheEvictions = cache.totalEvictions
|
||||
@@ -308,10 +378,14 @@ final class InferenceStats {
|
||||
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
|
||||
let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
|
||||
let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
|
||||
let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
|
||||
let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration
|
||||
lastGenerationTokenCount = snap.totalGenerationTokens
|
||||
lastPromptTokenCount = snap.totalPromptTokens
|
||||
lastCacheReuseTokenCount = cache.totalReusePromptTokens
|
||||
lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
|
||||
lastPrefillDuration = snap.totalPrefillDuration
|
||||
lastSessionBuildDuration = snap.totalSessionBuildDuration
|
||||
|
||||
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
|
||||
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
|
||||
@@ -321,6 +395,9 @@ final class InferenceStats {
|
||||
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
|
||||
cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
|
||||
cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
|
||||
currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
|
||||
prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
|
||||
sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta))
|
||||
|
||||
if tokenRateHistory.count > Self.maxHistoryPoints {
|
||||
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
|
||||
@@ -346,6 +423,15 @@ final class InferenceStats {
|
||||
if cacheRebuildHistory.count > Self.maxHistoryPoints {
|
||||
cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
|
||||
}
|
||||
if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
|
||||
currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
|
||||
}
|
||||
if prefillDurationHistory.count > Self.maxHistoryPoints {
|
||||
prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
|
||||
}
|
||||
if sessionBuildDurationHistory.count > Self.maxHistoryPoints {
|
||||
sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints)
|
||||
}
|
||||
}
|
||||
|
||||
func reset() {
|
||||
@@ -363,9 +449,14 @@ final class InferenceStats {
|
||||
currentTokensPerSecond = 0
|
||||
contextUsed = 0
|
||||
contextMax = 0
|
||||
currentPhaseElapsed = 0
|
||||
totalRequests = 0
|
||||
totalPromptTokens = 0
|
||||
totalGenerationTokens = 0
|
||||
totalPreparingDuration = 0
|
||||
totalSessionBuildDuration = 0
|
||||
totalPrefillDuration = 0
|
||||
totalGenerationDuration = 0
|
||||
totalCacheHits = 0
|
||||
totalCacheMisses = 0
|
||||
totalCacheEvictions = 0
|
||||
@@ -386,9 +477,14 @@ final class InferenceStats {
|
||||
cacheFootprintHistory.removeAll()
|
||||
cacheReuseHistory.removeAll()
|
||||
cacheRebuildHistory.removeAll()
|
||||
currentPhaseElapsedHistory.removeAll()
|
||||
prefillDurationHistory.removeAll()
|
||||
sessionBuildDurationHistory.removeAll()
|
||||
lastGenerationTokenCount = 0
|
||||
lastPromptTokenCount = 0
|
||||
lastCacheReuseTokenCount = 0
|
||||
lastCacheRebuildTokenCount = 0
|
||||
lastPrefillDuration = 0
|
||||
lastSessionBuildDuration = 0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -393,7 +393,7 @@ final class APIServer {
|
||||
// Extract images from the last message only (ChatSession.streamDetails takes images separately)
|
||||
let lastImages = lastMessage.images
|
||||
|
||||
let result: (promptTokens: Int, completionTokens: Int, succeeded: Bool)
|
||||
let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool)
|
||||
|
||||
if isStream {
|
||||
result = await handleStreamingResponse(
|
||||
@@ -405,7 +405,8 @@ final class APIServer {
|
||||
images: lastImages,
|
||||
tools: request.tools,
|
||||
created: created,
|
||||
modelName: modelName
|
||||
modelName: modelName,
|
||||
isQwen: isQwen
|
||||
)
|
||||
} else {
|
||||
result = await handleNonStreamingResponse(
|
||||
@@ -417,16 +418,23 @@ final class APIServer {
|
||||
images: lastImages,
|
||||
tools: request.tools,
|
||||
created: created,
|
||||
modelName: modelName
|
||||
modelName: modelName,
|
||||
isQwen: isQwen
|
||||
)
|
||||
}
|
||||
|
||||
if result.succeeded {
|
||||
var cachedSignatures = messageSignatures
|
||||
if let assistantHistoryText = result.assistantHistoryText {
|
||||
cachedSignatures.append(
|
||||
Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: [])
|
||||
)
|
||||
}
|
||||
ConversationSessionCache.shared.completeRequest(
|
||||
entryId: lease.entryId,
|
||||
session: session,
|
||||
requestMessageSignatures: messageSignatures,
|
||||
requestMessageCount: chatMessages.count,
|
||||
requestMessageSignatures: cachedSignatures,
|
||||
requestMessageCount: cachedSignatures.count,
|
||||
estimatedPromptTokens: estimatedPromptTokens,
|
||||
estimatedBytes: estimatedBytes,
|
||||
promptTokens: result.promptTokens,
|
||||
@@ -473,8 +481,9 @@ final class APIServer {
|
||||
images: [UserInput.Image],
|
||||
tools: [APIToolDefinition]?,
|
||||
created: Int,
|
||||
modelName: String
|
||||
) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
|
||||
modelName: String,
|
||||
isQwen: Bool
|
||||
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
|
||||
do {
|
||||
var fullText = ""
|
||||
var promptTokens = 0
|
||||
@@ -510,48 +519,11 @@ final class APIServer {
|
||||
}
|
||||
}
|
||||
|
||||
// Parse tool calls: first check framework-detected ones, then our own text parser
|
||||
var finishReason = "stop"
|
||||
var responseContent: String? = fullText
|
||||
var apiToolCalls: [APIToolCall]? = nil
|
||||
|
||||
if !frameworkToolCalls.isEmpty {
|
||||
// Framework natively detected tool calls (e.g. Qwen)
|
||||
finishReason = "tool_calls"
|
||||
apiToolCalls = frameworkToolCalls.enumerated().map { i, tc in
|
||||
let argsJSON: String
|
||||
let argsDict = tc.function.arguments.mapValues { $0.anyValue }
|
||||
if let data = try? JSONSerialization.data(withJSONObject: argsDict),
|
||||
let str = String(data: data, encoding: .utf8) {
|
||||
argsJSON = str
|
||||
} else {
|
||||
argsJSON = "{}"
|
||||
}
|
||||
let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
|
||||
return APIToolCall(
|
||||
index: i,
|
||||
id: callId,
|
||||
type: "function",
|
||||
function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
|
||||
let resolved = Self.resolveAssistantResponse(
|
||||
fullText: fullText,
|
||||
frameworkToolCalls: frameworkToolCalls,
|
||||
tools: tools
|
||||
)
|
||||
}
|
||||
responseContent = fullText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty ? nil : fullText
|
||||
} else if let tools, !tools.isEmpty {
|
||||
// Try our own text parser (e.g. Gemma tool_code blocks)
|
||||
let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
|
||||
if !parsedCalls.isEmpty {
|
||||
finishReason = "tool_calls"
|
||||
apiToolCalls = parsedCalls.enumerated().map { i, tc in
|
||||
APIToolCall(
|
||||
index: i,
|
||||
id: tc.id,
|
||||
type: "function",
|
||||
function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
|
||||
)
|
||||
}
|
||||
responseContent = cleanText.isEmpty ? nil : cleanText
|
||||
}
|
||||
}
|
||||
|
||||
let response = APIChatCompletionResponse(
|
||||
id: requestId,
|
||||
@@ -563,10 +535,10 @@ final class APIServer {
|
||||
index: 0,
|
||||
message: APIChoiceMessage(
|
||||
role: "assistant",
|
||||
content: responseContent,
|
||||
tool_calls: apiToolCalls
|
||||
content: resolved.content,
|
||||
tool_calls: resolved.toolCalls
|
||||
),
|
||||
finish_reason: finishReason
|
||||
finish_reason: resolved.finishReason
|
||||
)
|
||||
],
|
||||
usage: APIUsageInfo(
|
||||
@@ -579,10 +551,15 @@ final class APIServer {
|
||||
if let json = try? JSONEncoder().encode(response) {
|
||||
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
|
||||
}
|
||||
return (promptTokens, completionTokens, true)
|
||||
let assistantHistoryText = Self.normalizedAssistantHistoryContent(
|
||||
content: resolved.content,
|
||||
toolCalls: resolved.toolCalls,
|
||||
isQwen: isQwen
|
||||
)
|
||||
return (promptTokens, completionTokens, assistantHistoryText, true)
|
||||
} catch {
|
||||
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
|
||||
return (0, 0, false)
|
||||
return (0, 0, nil, false)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -597,8 +574,9 @@ final class APIServer {
|
||||
images: [UserInput.Image],
|
||||
tools: [APIToolDefinition]?,
|
||||
created: Int,
|
||||
modelName: String
|
||||
) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
|
||||
modelName: String,
|
||||
isQwen: Bool
|
||||
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
|
||||
// Send SSE headers
|
||||
let header = [
|
||||
"HTTP/1.1 200 OK",
|
||||
@@ -657,50 +635,14 @@ final class APIServer {
|
||||
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
|
||||
}
|
||||
|
||||
// Stats were already updated by LiveCounters inside the loop
|
||||
|
||||
// Post-generation: handle tool calls (framework-detected or text-parsed)
|
||||
var finishReason = "stop"
|
||||
|
||||
if !frameworkToolCalls.isEmpty {
|
||||
finishReason = "tool_calls"
|
||||
for (i, tc) in frameworkToolCalls.enumerated() {
|
||||
let argsDict = tc.function.arguments.mapValues { $0.anyValue }
|
||||
let argsJSON: String
|
||||
if let data = try? JSONSerialization.data(withJSONObject: argsDict),
|
||||
let str = String(data: data, encoding: .utf8) {
|
||||
argsJSON = str
|
||||
} else {
|
||||
argsJSON = "{}"
|
||||
}
|
||||
let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
|
||||
let apiToolCall = APIToolCall(
|
||||
index: i,
|
||||
id: callId,
|
||||
type: "function",
|
||||
function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
|
||||
)
|
||||
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
|
||||
id: requestId,
|
||||
object: "chat.completion.chunk",
|
||||
created: created,
|
||||
model: modelName,
|
||||
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: [apiToolCall]), finish_reason: nil)],
|
||||
usage: nil
|
||||
))
|
||||
}
|
||||
} else if hasTools {
|
||||
let (_, parsed) = ToolCallParser.parse(text: fullText, tools: tools)
|
||||
if !parsed.isEmpty {
|
||||
finishReason = "tool_calls"
|
||||
}
|
||||
for (i, tc) in parsed.enumerated() {
|
||||
let apiToolCall = APIToolCall(
|
||||
index: i,
|
||||
id: tc.id,
|
||||
type: "function",
|
||||
function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
|
||||
let resolved = Self.resolveAssistantResponse(
|
||||
fullText: fullText,
|
||||
frameworkToolCalls: frameworkToolCalls,
|
||||
tools: tools
|
||||
)
|
||||
|
||||
if let toolCalls = resolved.toolCalls {
|
||||
for apiToolCall in toolCalls {
|
||||
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
|
||||
id: requestId,
|
||||
object: "chat.completion.chunk",
|
||||
@@ -718,7 +660,7 @@ final class APIServer {
|
||||
object: "chat.completion.chunk",
|
||||
created: created,
|
||||
model: modelName,
|
||||
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: finishReason)],
|
||||
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
|
||||
usage: APIUsageInfo(
|
||||
prompt_tokens: promptTokens,
|
||||
completion_tokens: completionTokens,
|
||||
@@ -729,7 +671,12 @@ final class APIServer {
|
||||
// Send [DONE] and close
|
||||
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
|
||||
connection.cancel()
|
||||
return (promptTokens, completionTokens, succeeded)
|
||||
let assistantHistoryText = Self.normalizedAssistantHistoryContent(
|
||||
content: resolved.content,
|
||||
toolCalls: resolved.toolCalls,
|
||||
isQwen: isQwen
|
||||
)
|
||||
return (promptTokens, completionTokens, assistantHistoryText, succeeded)
|
||||
}
|
||||
|
||||
/// Run the token generation + SSE send loop entirely off MainActor.
|
||||
@@ -876,6 +823,68 @@ final class APIServer {
|
||||
|
||||
return hash
|
||||
}
|
||||
|
||||
private static func normalizedAssistantHistoryContent(
|
||||
content: String?,
|
||||
toolCalls: [APIToolCall]?,
|
||||
isQwen: Bool
|
||||
) -> String? {
|
||||
var text = content?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
|
||||
if let toolCalls, !toolCalls.isEmpty {
|
||||
let formattedCalls = isQwen
|
||||
? ToolPromptBuilder.formatQwenToolCalls(toolCalls)
|
||||
: ToolPromptBuilder.formatGemmaToolCalls(toolCalls)
|
||||
text = text.isEmpty ? formattedCalls : text + "\n" + formattedCalls
|
||||
}
|
||||
return text.isEmpty ? nil : text
|
||||
}
|
||||
|
||||
private static func resolveAssistantResponse(
|
||||
fullText: String,
|
||||
frameworkToolCalls: [MLXLMCommon.ToolCall],
|
||||
tools: [APIToolDefinition]?
|
||||
) -> (content: String?, toolCalls: [APIToolCall]?, finishReason: String) {
|
||||
var finishReason = "stop"
|
||||
var responseContent: String? = fullText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty ? nil : fullText
|
||||
var apiToolCalls: [APIToolCall]? = nil
|
||||
|
||||
if !frameworkToolCalls.isEmpty {
|
||||
finishReason = "tool_calls"
|
||||
apiToolCalls = frameworkToolCalls.enumerated().map { i, tc in
|
||||
let argsJSON: String
|
||||
let argsDict = tc.function.arguments.mapValues { $0.anyValue }
|
||||
if let data = try? JSONSerialization.data(withJSONObject: argsDict),
|
||||
let str = String(data: data, encoding: .utf8) {
|
||||
argsJSON = str
|
||||
} else {
|
||||
argsJSON = "{}"
|
||||
}
|
||||
let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
|
||||
return APIToolCall(
|
||||
index: i,
|
||||
id: callId,
|
||||
type: "function",
|
||||
function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
|
||||
)
|
||||
}
|
||||
} else if let tools, !tools.isEmpty {
|
||||
let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
|
||||
if !parsedCalls.isEmpty {
|
||||
finishReason = "tool_calls"
|
||||
apiToolCalls = parsedCalls.enumerated().map { i, tc in
|
||||
APIToolCall(
|
||||
index: i,
|
||||
id: tc.id,
|
||||
type: "function",
|
||||
function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
|
||||
)
|
||||
}
|
||||
responseContent = cleanText.isEmpty ? nil : cleanText
|
||||
}
|
||||
}
|
||||
|
||||
return (responseContent, apiToolCalls, finishReason)
|
||||
}
|
||||
}
|
||||
|
||||
private struct DecodedImage {
|
||||
|
||||
@@ -17,6 +17,7 @@ struct MonitorView: View {
|
||||
LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
|
||||
tokenRateChart
|
||||
tokenThroughputChart
|
||||
phaseActivityChart
|
||||
cacheReuseChart
|
||||
cacheFootprintChart
|
||||
cacheSessionChart
|
||||
@@ -90,6 +91,9 @@ struct MonitorView: View {
|
||||
phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue)
|
||||
phaseChip(title: "Generating", count: stats.generatingRequests, color: .green)
|
||||
phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange)
|
||||
if stats.activeRequests > 0 {
|
||||
phaseChip(title: phaseAgeLabel, count: Int(stats.currentPhaseElapsed.rounded()), color: activityColor)
|
||||
}
|
||||
}
|
||||
}
|
||||
.padding(12)
|
||||
@@ -161,6 +165,71 @@ struct MonitorView: View {
|
||||
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
||||
}
|
||||
|
||||
@ViewBuilder
|
||||
private var phaseActivityChart: some View {
|
||||
VStack(alignment: .leading, spacing: 6) {
|
||||
Text("Phase Activity")
|
||||
.font(.caption.bold())
|
||||
.foregroundStyle(.secondary)
|
||||
|
||||
Chart {
|
||||
ForEach(stats.currentPhaseElapsedHistory) { point in
|
||||
LineMark(
|
||||
x: .value("Time", point.timestamp),
|
||||
y: .value("Active s", point.value)
|
||||
)
|
||||
.foregroundStyle(activityColor)
|
||||
.interpolationMethod(.monotone)
|
||||
}
|
||||
ForEach(stats.prefillDurationHistory) { point in
|
||||
BarMark(
|
||||
x: .value("Time", point.timestamp),
|
||||
y: .value("Prefill done", point.value)
|
||||
)
|
||||
.foregroundStyle(.blue.opacity(0.45))
|
||||
}
|
||||
ForEach(stats.sessionBuildDurationHistory) { point in
|
||||
BarMark(
|
||||
x: .value("Time", point.timestamp),
|
||||
y: .value("Build done", point.value)
|
||||
)
|
||||
.foregroundStyle(.purple.opacity(0.45))
|
||||
}
|
||||
}
|
||||
.chartXAxis {
|
||||
AxisMarks(values: .stride(by: .second, count: 30)) { _ in
|
||||
AxisGridLine()
|
||||
}
|
||||
}
|
||||
.chartYAxis {
|
||||
AxisMarks(position: .leading) { value in
|
||||
AxisGridLine()
|
||||
AxisValueLabel {
|
||||
if let v = value.as(Double.self) {
|
||||
Text(String(format: "%.0f", v))
|
||||
.font(.caption2.monospacedDigit())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
.frame(height: 150)
|
||||
|
||||
HStack(spacing: 12) {
|
||||
Label("Active phase age", systemImage: "circle.fill")
|
||||
.font(.caption2)
|
||||
.foregroundStyle(activityColor)
|
||||
Label("Prefill completed", systemImage: "circle.fill")
|
||||
.font(.caption2)
|
||||
.foregroundStyle(.blue)
|
||||
Label("Session build completed", systemImage: "circle.fill")
|
||||
.font(.caption2)
|
||||
.foregroundStyle(.purple)
|
||||
}
|
||||
}
|
||||
.padding(12)
|
||||
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
||||
}
|
||||
|
||||
@ViewBuilder
|
||||
private var cacheReuseChart: some View {
|
||||
VStack(alignment: .leading, spacing: 6) {
|
||||
@@ -717,6 +786,13 @@ struct MonitorView: View {
|
||||
return Double(stats.totalCacheHits) / Double(total)
|
||||
}
|
||||
|
||||
private var phaseAgeLabel: String {
|
||||
if stats.generatingRequests > 0 { return "Generating s" }
|
||||
if stats.prefillingRequests > 0 { return "Prefill s" }
|
||||
if stats.sessionBuildRequests > 0 { return "Build s" }
|
||||
return "Preparing s"
|
||||
}
|
||||
|
||||
private func maxContextRatio(for tokens: Int) -> Double {
|
||||
let maxContext = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0)
|
||||
guard maxContext > 0 else { return 0 }
|
||||
|
||||
Reference in New Issue
Block a user