feat: more visibility of prefilling
This commit is contained in:
@@ -9,7 +9,7 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
static let shared = LiveCounters()
|
static let shared = LiveCounters()
|
||||||
|
|
||||||
private let lock = OSAllocatedUnfairLock()
|
private let lock = OSAllocatedUnfairLock()
|
||||||
private var requestPhases: [String: RequestPhase] = [:]
|
private var requestPhases: [String: RequestState] = [:]
|
||||||
|
|
||||||
// Current request
|
// Current request
|
||||||
private var _activeRequests: Int = 0
|
private var _activeRequests: Int = 0
|
||||||
@@ -23,13 +23,19 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
private var _isPrefilling: Bool = false
|
private var _isPrefilling: Bool = false
|
||||||
private var _isGenerating: Bool = false
|
private var _isGenerating: Bool = false
|
||||||
private var _contextMax: Int = 0
|
private var _contextMax: Int = 0
|
||||||
|
private var _currentPhaseElapsed: TimeInterval = 0
|
||||||
|
|
||||||
// Cumulative
|
// Cumulative
|
||||||
private var _totalRequests: Int = 0
|
private var _totalRequests: Int = 0
|
||||||
private var _totalPromptTokens: Int = 0
|
private var _totalPromptTokens: Int = 0
|
||||||
private var _totalGenerationTokens: Int = 0
|
private var _totalGenerationTokens: Int = 0
|
||||||
|
private var _totalPreparingDuration: TimeInterval = 0
|
||||||
|
private var _totalSessionBuildDuration: TimeInterval = 0
|
||||||
|
private var _totalPrefillDuration: TimeInterval = 0
|
||||||
|
private var _totalGenerationDuration: TimeInterval = 0
|
||||||
|
|
||||||
func requestStarted(requestId: String, contextLength: Int) {
|
func requestStarted(requestId: String, contextLength: Int) {
|
||||||
|
let now = Date()
|
||||||
lock.lock()
|
lock.lock()
|
||||||
_activeRequests += 1
|
_activeRequests += 1
|
||||||
_preparingRequests += 1
|
_preparingRequests += 1
|
||||||
@@ -40,33 +46,40 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
_generationTokens = 0
|
_generationTokens = 0
|
||||||
_tokensPerSecond = 0
|
_tokensPerSecond = 0
|
||||||
_contextMax = contextLength
|
_contextMax = contextLength
|
||||||
requestPhases[requestId] = .preparing
|
requestPhases[requestId] = RequestState(phase: .preparing, phaseStartedAt: now)
|
||||||
|
refreshCurrentPhaseElapsed(now: now)
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func requestPhaseChanged(requestId: String, phase: RequestPhase) {
|
func requestPhaseChanged(requestId: String, phase: RequestPhase) {
|
||||||
|
let now = Date()
|
||||||
lock.lock()
|
lock.lock()
|
||||||
if let current = requestPhases[requestId] {
|
if let current = requestPhases[requestId] {
|
||||||
decrementCount(for: current)
|
decrementCount(for: current.phase)
|
||||||
|
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
|
||||||
}
|
}
|
||||||
incrementCount(for: phase)
|
incrementCount(for: phase)
|
||||||
requestPhases[requestId] = phase
|
requestPhases[requestId] = RequestState(phase: phase, phaseStartedAt: now)
|
||||||
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
||||||
_isGenerating = _generatingRequests > 0
|
_isGenerating = _generatingRequests > 0
|
||||||
|
refreshCurrentPhaseElapsed(now: now)
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func prefillCompleted(requestId: String, promptTokens: Int) {
|
func prefillCompleted(requestId: String, promptTokens: Int) {
|
||||||
|
let now = Date()
|
||||||
lock.lock()
|
lock.lock()
|
||||||
if let current = requestPhases[requestId] {
|
if let current = requestPhases[requestId] {
|
||||||
decrementCount(for: current)
|
decrementCount(for: current.phase)
|
||||||
|
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
|
||||||
}
|
}
|
||||||
incrementCount(for: .generating)
|
incrementCount(for: .generating)
|
||||||
requestPhases[requestId] = .generating
|
requestPhases[requestId] = RequestState(phase: .generating, phaseStartedAt: now)
|
||||||
_promptTokens = promptTokens
|
_promptTokens = promptTokens
|
||||||
_totalPromptTokens += promptTokens
|
_totalPromptTokens += promptTokens
|
||||||
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
||||||
_isGenerating = _generatingRequests > 0
|
_isGenerating = _generatingRequests > 0
|
||||||
|
refreshCurrentPhaseElapsed(now: now)
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -78,9 +91,11 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func requestCompleted(requestId: String, generationTokens: Int) {
|
func requestCompleted(requestId: String, generationTokens: Int) {
|
||||||
|
let now = Date()
|
||||||
lock.lock()
|
lock.lock()
|
||||||
if let current = requestPhases.removeValue(forKey: requestId) {
|
if let current = requestPhases.removeValue(forKey: requestId) {
|
||||||
decrementCount(for: current)
|
decrementCount(for: current.phase)
|
||||||
|
accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
|
||||||
}
|
}
|
||||||
_activeRequests = max(0, _activeRequests - 1)
|
_activeRequests = max(0, _activeRequests - 1)
|
||||||
_totalGenerationTokens += generationTokens
|
_totalGenerationTokens += generationTokens
|
||||||
@@ -92,6 +107,7 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
||||||
_isGenerating = _generatingRequests > 0
|
_isGenerating = _generatingRequests > 0
|
||||||
}
|
}
|
||||||
|
refreshCurrentPhaseElapsed(now: now)
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -109,15 +125,22 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
_isPrefilling = false
|
_isPrefilling = false
|
||||||
_isGenerating = false
|
_isGenerating = false
|
||||||
_contextMax = 0
|
_contextMax = 0
|
||||||
|
_currentPhaseElapsed = 0
|
||||||
_totalRequests = 0
|
_totalRequests = 0
|
||||||
_totalPromptTokens = 0
|
_totalPromptTokens = 0
|
||||||
_totalGenerationTokens = 0
|
_totalGenerationTokens = 0
|
||||||
|
_totalPreparingDuration = 0
|
||||||
|
_totalSessionBuildDuration = 0
|
||||||
|
_totalPrefillDuration = 0
|
||||||
|
_totalGenerationDuration = 0
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Atomic snapshot for the UI timer.
|
/// Atomic snapshot for the UI timer.
|
||||||
func snapshot() -> Snapshot {
|
func snapshot() -> Snapshot {
|
||||||
|
let now = Date()
|
||||||
lock.lock()
|
lock.lock()
|
||||||
|
refreshCurrentPhaseElapsed(now: now)
|
||||||
let s = Snapshot(
|
let s = Snapshot(
|
||||||
activeRequests: _activeRequests,
|
activeRequests: _activeRequests,
|
||||||
preparingRequests: _preparingRequests,
|
preparingRequests: _preparingRequests,
|
||||||
@@ -130,9 +153,14 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
isPrefilling: _isPrefilling,
|
isPrefilling: _isPrefilling,
|
||||||
isGenerating: _isGenerating,
|
isGenerating: _isGenerating,
|
||||||
contextMax: _contextMax,
|
contextMax: _contextMax,
|
||||||
|
currentPhaseElapsed: _currentPhaseElapsed,
|
||||||
totalRequests: _totalRequests,
|
totalRequests: _totalRequests,
|
||||||
totalPromptTokens: _totalPromptTokens,
|
totalPromptTokens: _totalPromptTokens,
|
||||||
totalGenerationTokens: _totalGenerationTokens
|
totalGenerationTokens: _totalGenerationTokens,
|
||||||
|
totalPreparingDuration: _totalPreparingDuration,
|
||||||
|
totalSessionBuildDuration: _totalSessionBuildDuration,
|
||||||
|
totalPrefillDuration: _totalPrefillDuration,
|
||||||
|
totalGenerationDuration: _totalGenerationDuration
|
||||||
)
|
)
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
return s
|
return s
|
||||||
@@ -150,9 +178,14 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
let isPrefilling: Bool
|
let isPrefilling: Bool
|
||||||
let isGenerating: Bool
|
let isGenerating: Bool
|
||||||
let contextMax: Int
|
let contextMax: Int
|
||||||
|
let currentPhaseElapsed: TimeInterval
|
||||||
let totalRequests: Int
|
let totalRequests: Int
|
||||||
let totalPromptTokens: Int
|
let totalPromptTokens: Int
|
||||||
let totalGenerationTokens: Int
|
let totalGenerationTokens: Int
|
||||||
|
let totalPreparingDuration: TimeInterval
|
||||||
|
let totalSessionBuildDuration: TimeInterval
|
||||||
|
let totalPrefillDuration: TimeInterval
|
||||||
|
let totalGenerationDuration: TimeInterval
|
||||||
}
|
}
|
||||||
|
|
||||||
private func incrementCount(for phase: RequestPhase) {
|
private func incrementCount(for phase: RequestPhase) {
|
||||||
@@ -181,6 +214,28 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Adds the time a request spent in `phase` to that phase's cumulative total.
///
/// Every call site visible in this file invokes this while holding `lock`,
/// immediately after decrementing the live count for the phase being left.
///
/// - Parameters:
///   - phase: The phase the request is leaving.
///   - elapsed: Seconds spent in `phase`. Callers compute it as the difference
///     between two `Date()` readings.
private func accumulateDuration(for phase: RequestPhase, elapsed: TimeInterval) {
    // `Date()` is wall-clock time, so a backwards clock adjustment between the
    // two readings produces a negative interval. Adding it would make a
    // cumulative total shrink (and the per-sample deltas derived from it go
    // negative), so non-positive and NaN intervals are ignored.
    guard elapsed > 0 else { return }

    switch phase {
    case .preparing:
        _totalPreparingDuration += elapsed
    case .sessionBuild:
        _totalSessionBuildDuration += elapsed
    case .prefilling:
        _totalPrefillDuration += elapsed
    case .generating:
        _totalGenerationDuration += elapsed
    }
}
|
||||||
|
|
||||||
|
/// Recomputes `_currentPhaseElapsed` as the age, in seconds, of the
/// longest-running current phase among all tracked requests.
///
/// The value is `0` when no request is being tracked. Every call site visible
/// in this file invokes this while holding `lock`.
///
/// - Parameter now: The instant against which phase start times are measured.
private func refreshCurrentPhaseElapsed(now: Date) {
    // The largest elapsed interval belongs to the phase that started earliest.
    guard let earliestStart = requestPhases.values.map(\.phaseStartedAt).min() else {
        _currentPhaseElapsed = 0
        return
    }
    _currentPhaseElapsed = now.timeIntervalSince(earliestStart)
}
|
||||||
|
|
||||||
|
/// Per-request bookkeeping stored in `requestPhases`: which phase a request is
/// in and when it entered that phase.
private struct RequestState {
    /// The phase the request is currently in.
    var phase: RequestPhase

    /// The `Date()` reading taken when the request entered `phase`; used to
    /// compute how long the phase has lasted.
    var phaseStartedAt: Date
}
|
||||||
|
|
||||||
enum RequestPhase {
|
enum RequestPhase {
|
||||||
case preparing
|
case preparing
|
||||||
case sessionBuild
|
case sessionBuild
|
||||||
@@ -208,6 +263,7 @@ final class InferenceStats {
|
|||||||
var currentTokensPerSecond: Double = 0
|
var currentTokensPerSecond: Double = 0
|
||||||
var contextUsed: Int = 0
|
var contextUsed: Int = 0
|
||||||
var contextMax: Int = 0
|
var contextMax: Int = 0
|
||||||
|
var currentPhaseElapsed: TimeInterval = 0
|
||||||
|
|
||||||
// MARK: - Cumulative counters
|
// MARK: - Cumulative counters
|
||||||
|
|
||||||
@@ -219,6 +275,10 @@ final class InferenceStats {
|
|||||||
var totalCacheEvictions: Int = 0
|
var totalCacheEvictions: Int = 0
|
||||||
var totalCacheReusePromptTokens: Int = 0
|
var totalCacheReusePromptTokens: Int = 0
|
||||||
var totalCacheRebuildPromptTokens: Int = 0
|
var totalCacheRebuildPromptTokens: Int = 0
|
||||||
|
var totalPreparingDuration: TimeInterval = 0
|
||||||
|
var totalSessionBuildDuration: TimeInterval = 0
|
||||||
|
var totalPrefillDuration: TimeInterval = 0
|
||||||
|
var totalGenerationDuration: TimeInterval = 0
|
||||||
|
|
||||||
// MARK: - Cache state
|
// MARK: - Cache state
|
||||||
|
|
||||||
@@ -246,6 +306,9 @@ final class InferenceStats {
|
|||||||
private(set) var cacheFootprintHistory: [DataPoint] = []
|
private(set) var cacheFootprintHistory: [DataPoint] = []
|
||||||
private(set) var cacheReuseHistory: [DataPoint] = []
|
private(set) var cacheReuseHistory: [DataPoint] = []
|
||||||
private(set) var cacheRebuildHistory: [DataPoint] = []
|
private(set) var cacheRebuildHistory: [DataPoint] = []
|
||||||
|
private(set) var currentPhaseElapsedHistory: [DataPoint] = []
|
||||||
|
private(set) var prefillDurationHistory: [DataPoint] = []
|
||||||
|
private(set) var sessionBuildDurationHistory: [DataPoint] = []
|
||||||
|
|
||||||
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
|
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
|
||||||
|
|
||||||
@@ -255,6 +318,8 @@ final class InferenceStats {
|
|||||||
private var lastPromptTokenCount: Int = 0
|
private var lastPromptTokenCount: Int = 0
|
||||||
private var lastCacheReuseTokenCount: Int = 0
|
private var lastCacheReuseTokenCount: Int = 0
|
||||||
private var lastCacheRebuildTokenCount: Int = 0
|
private var lastCacheRebuildTokenCount: Int = 0
|
||||||
|
private var lastPrefillDuration: TimeInterval = 0
|
||||||
|
private var lastSessionBuildDuration: TimeInterval = 0
|
||||||
|
|
||||||
func startSampling() {
|
func startSampling() {
|
||||||
guard sampleTimer == nil else { return }
|
guard sampleTimer == nil else { return }
|
||||||
@@ -287,9 +352,14 @@ final class InferenceStats {
|
|||||||
isGenerating = snap.isGenerating
|
isGenerating = snap.isGenerating
|
||||||
contextMax = snap.contextMax
|
contextMax = snap.contextMax
|
||||||
contextUsed = snap.promptTokens + snap.generationTokens
|
contextUsed = snap.promptTokens + snap.generationTokens
|
||||||
|
currentPhaseElapsed = snap.currentPhaseElapsed
|
||||||
totalRequests = snap.totalRequests
|
totalRequests = snap.totalRequests
|
||||||
totalPromptTokens = snap.totalPromptTokens
|
totalPromptTokens = snap.totalPromptTokens
|
||||||
totalGenerationTokens = snap.totalGenerationTokens
|
totalGenerationTokens = snap.totalGenerationTokens
|
||||||
|
totalPreparingDuration = snap.totalPreparingDuration
|
||||||
|
totalSessionBuildDuration = snap.totalSessionBuildDuration
|
||||||
|
totalPrefillDuration = snap.totalPrefillDuration
|
||||||
|
totalGenerationDuration = snap.totalGenerationDuration
|
||||||
totalCacheHits = cache.totalHits
|
totalCacheHits = cache.totalHits
|
||||||
totalCacheMisses = cache.totalMisses
|
totalCacheMisses = cache.totalMisses
|
||||||
totalCacheEvictions = cache.totalEvictions
|
totalCacheEvictions = cache.totalEvictions
|
||||||
@@ -308,10 +378,14 @@ final class InferenceStats {
|
|||||||
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
|
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
|
||||||
let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
|
let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
|
||||||
let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
|
let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
|
||||||
|
let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
|
||||||
|
let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration
|
||||||
lastGenerationTokenCount = snap.totalGenerationTokens
|
lastGenerationTokenCount = snap.totalGenerationTokens
|
||||||
lastPromptTokenCount = snap.totalPromptTokens
|
lastPromptTokenCount = snap.totalPromptTokens
|
||||||
lastCacheReuseTokenCount = cache.totalReusePromptTokens
|
lastCacheReuseTokenCount = cache.totalReusePromptTokens
|
||||||
lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
|
lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
|
||||||
|
lastPrefillDuration = snap.totalPrefillDuration
|
||||||
|
lastSessionBuildDuration = snap.totalSessionBuildDuration
|
||||||
|
|
||||||
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
|
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
|
||||||
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
|
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
|
||||||
@@ -321,6 +395,9 @@ final class InferenceStats {
|
|||||||
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
|
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
|
||||||
cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
|
cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
|
||||||
cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
|
cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
|
||||||
|
currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
|
||||||
|
prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
|
||||||
|
sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta))
|
||||||
|
|
||||||
if tokenRateHistory.count > Self.maxHistoryPoints {
|
if tokenRateHistory.count > Self.maxHistoryPoints {
|
||||||
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
|
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
|
||||||
@@ -346,6 +423,15 @@ final class InferenceStats {
|
|||||||
if cacheRebuildHistory.count > Self.maxHistoryPoints {
|
if cacheRebuildHistory.count > Self.maxHistoryPoints {
|
||||||
cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
|
cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
|
||||||
}
|
}
|
||||||
|
if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
|
||||||
|
currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
|
if prefillDurationHistory.count > Self.maxHistoryPoints {
|
||||||
|
prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
|
if sessionBuildDurationHistory.count > Self.maxHistoryPoints {
|
||||||
|
sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func reset() {
|
func reset() {
|
||||||
@@ -363,9 +449,14 @@ final class InferenceStats {
|
|||||||
currentTokensPerSecond = 0
|
currentTokensPerSecond = 0
|
||||||
contextUsed = 0
|
contextUsed = 0
|
||||||
contextMax = 0
|
contextMax = 0
|
||||||
|
currentPhaseElapsed = 0
|
||||||
totalRequests = 0
|
totalRequests = 0
|
||||||
totalPromptTokens = 0
|
totalPromptTokens = 0
|
||||||
totalGenerationTokens = 0
|
totalGenerationTokens = 0
|
||||||
|
totalPreparingDuration = 0
|
||||||
|
totalSessionBuildDuration = 0
|
||||||
|
totalPrefillDuration = 0
|
||||||
|
totalGenerationDuration = 0
|
||||||
totalCacheHits = 0
|
totalCacheHits = 0
|
||||||
totalCacheMisses = 0
|
totalCacheMisses = 0
|
||||||
totalCacheEvictions = 0
|
totalCacheEvictions = 0
|
||||||
@@ -386,9 +477,14 @@ final class InferenceStats {
|
|||||||
cacheFootprintHistory.removeAll()
|
cacheFootprintHistory.removeAll()
|
||||||
cacheReuseHistory.removeAll()
|
cacheReuseHistory.removeAll()
|
||||||
cacheRebuildHistory.removeAll()
|
cacheRebuildHistory.removeAll()
|
||||||
|
currentPhaseElapsedHistory.removeAll()
|
||||||
|
prefillDurationHistory.removeAll()
|
||||||
|
sessionBuildDurationHistory.removeAll()
|
||||||
lastGenerationTokenCount = 0
|
lastGenerationTokenCount = 0
|
||||||
lastPromptTokenCount = 0
|
lastPromptTokenCount = 0
|
||||||
lastCacheReuseTokenCount = 0
|
lastCacheReuseTokenCount = 0
|
||||||
lastCacheRebuildTokenCount = 0
|
lastCacheRebuildTokenCount = 0
|
||||||
|
lastPrefillDuration = 0
|
||||||
|
lastSessionBuildDuration = 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -393,7 +393,7 @@ final class APIServer {
|
|||||||
// Extract images from the last message only (ChatSession.streamDetails takes images separately)
|
// Extract images from the last message only (ChatSession.streamDetails takes images separately)
|
||||||
let lastImages = lastMessage.images
|
let lastImages = lastMessage.images
|
||||||
|
|
||||||
let result: (promptTokens: Int, completionTokens: Int, succeeded: Bool)
|
let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool)
|
||||||
|
|
||||||
if isStream {
|
if isStream {
|
||||||
result = await handleStreamingResponse(
|
result = await handleStreamingResponse(
|
||||||
@@ -405,7 +405,8 @@ final class APIServer {
|
|||||||
images: lastImages,
|
images: lastImages,
|
||||||
tools: request.tools,
|
tools: request.tools,
|
||||||
created: created,
|
created: created,
|
||||||
modelName: modelName
|
modelName: modelName,
|
||||||
|
isQwen: isQwen
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
result = await handleNonStreamingResponse(
|
result = await handleNonStreamingResponse(
|
||||||
@@ -417,16 +418,23 @@ final class APIServer {
|
|||||||
images: lastImages,
|
images: lastImages,
|
||||||
tools: request.tools,
|
tools: request.tools,
|
||||||
created: created,
|
created: created,
|
||||||
modelName: modelName
|
modelName: modelName,
|
||||||
|
isQwen: isQwen
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
if result.succeeded {
|
if result.succeeded {
|
||||||
|
var cachedSignatures = messageSignatures
|
||||||
|
if let assistantHistoryText = result.assistantHistoryText {
|
||||||
|
cachedSignatures.append(
|
||||||
|
Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: [])
|
||||||
|
)
|
||||||
|
}
|
||||||
ConversationSessionCache.shared.completeRequest(
|
ConversationSessionCache.shared.completeRequest(
|
||||||
entryId: lease.entryId,
|
entryId: lease.entryId,
|
||||||
session: session,
|
session: session,
|
||||||
requestMessageSignatures: messageSignatures,
|
requestMessageSignatures: cachedSignatures,
|
||||||
requestMessageCount: chatMessages.count,
|
requestMessageCount: cachedSignatures.count,
|
||||||
estimatedPromptTokens: estimatedPromptTokens,
|
estimatedPromptTokens: estimatedPromptTokens,
|
||||||
estimatedBytes: estimatedBytes,
|
estimatedBytes: estimatedBytes,
|
||||||
promptTokens: result.promptTokens,
|
promptTokens: result.promptTokens,
|
||||||
@@ -473,8 +481,9 @@ final class APIServer {
|
|||||||
images: [UserInput.Image],
|
images: [UserInput.Image],
|
||||||
tools: [APIToolDefinition]?,
|
tools: [APIToolDefinition]?,
|
||||||
created: Int,
|
created: Int,
|
||||||
modelName: String
|
modelName: String,
|
||||||
) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
|
isQwen: Bool
|
||||||
|
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
|
||||||
do {
|
do {
|
||||||
var fullText = ""
|
var fullText = ""
|
||||||
var promptTokens = 0
|
var promptTokens = 0
|
||||||
@@ -510,48 +519,11 @@ final class APIServer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse tool calls: first check framework-detected ones, then our own text parser
|
let resolved = Self.resolveAssistantResponse(
|
||||||
var finishReason = "stop"
|
fullText: fullText,
|
||||||
var responseContent: String? = fullText
|
frameworkToolCalls: frameworkToolCalls,
|
||||||
var apiToolCalls: [APIToolCall]? = nil
|
tools: tools
|
||||||
|
)
|
||||||
if !frameworkToolCalls.isEmpty {
|
|
||||||
// Framework natively detected tool calls (e.g. Qwen)
|
|
||||||
finishReason = "tool_calls"
|
|
||||||
apiToolCalls = frameworkToolCalls.enumerated().map { i, tc in
|
|
||||||
let argsJSON: String
|
|
||||||
let argsDict = tc.function.arguments.mapValues { $0.anyValue }
|
|
||||||
if let data = try? JSONSerialization.data(withJSONObject: argsDict),
|
|
||||||
let str = String(data: data, encoding: .utf8) {
|
|
||||||
argsJSON = str
|
|
||||||
} else {
|
|
||||||
argsJSON = "{}"
|
|
||||||
}
|
|
||||||
let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
|
|
||||||
return APIToolCall(
|
|
||||||
index: i,
|
|
||||||
id: callId,
|
|
||||||
type: "function",
|
|
||||||
function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
responseContent = fullText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty ? nil : fullText
|
|
||||||
} else if let tools, !tools.isEmpty {
|
|
||||||
// Try our own text parser (e.g. Gemma tool_code blocks)
|
|
||||||
let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
|
|
||||||
if !parsedCalls.isEmpty {
|
|
||||||
finishReason = "tool_calls"
|
|
||||||
apiToolCalls = parsedCalls.enumerated().map { i, tc in
|
|
||||||
APIToolCall(
|
|
||||||
index: i,
|
|
||||||
id: tc.id,
|
|
||||||
type: "function",
|
|
||||||
function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
responseContent = cleanText.isEmpty ? nil : cleanText
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let response = APIChatCompletionResponse(
|
let response = APIChatCompletionResponse(
|
||||||
id: requestId,
|
id: requestId,
|
||||||
@@ -563,10 +535,10 @@ final class APIServer {
|
|||||||
index: 0,
|
index: 0,
|
||||||
message: APIChoiceMessage(
|
message: APIChoiceMessage(
|
||||||
role: "assistant",
|
role: "assistant",
|
||||||
content: responseContent,
|
content: resolved.content,
|
||||||
tool_calls: apiToolCalls
|
tool_calls: resolved.toolCalls
|
||||||
),
|
),
|
||||||
finish_reason: finishReason
|
finish_reason: resolved.finishReason
|
||||||
)
|
)
|
||||||
],
|
],
|
||||||
usage: APIUsageInfo(
|
usage: APIUsageInfo(
|
||||||
@@ -579,10 +551,15 @@ final class APIServer {
|
|||||||
if let json = try? JSONEncoder().encode(response) {
|
if let json = try? JSONEncoder().encode(response) {
|
||||||
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
|
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
|
||||||
}
|
}
|
||||||
return (promptTokens, completionTokens, true)
|
let assistantHistoryText = Self.normalizedAssistantHistoryContent(
|
||||||
|
content: resolved.content,
|
||||||
|
toolCalls: resolved.toolCalls,
|
||||||
|
isQwen: isQwen
|
||||||
|
)
|
||||||
|
return (promptTokens, completionTokens, assistantHistoryText, true)
|
||||||
} catch {
|
} catch {
|
||||||
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
|
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
|
||||||
return (0, 0, false)
|
return (0, 0, nil, false)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -597,8 +574,9 @@ final class APIServer {
|
|||||||
images: [UserInput.Image],
|
images: [UserInput.Image],
|
||||||
tools: [APIToolDefinition]?,
|
tools: [APIToolDefinition]?,
|
||||||
created: Int,
|
created: Int,
|
||||||
modelName: String
|
modelName: String,
|
||||||
) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
|
isQwen: Bool
|
||||||
|
) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
|
||||||
// Send SSE headers
|
// Send SSE headers
|
||||||
let header = [
|
let header = [
|
||||||
"HTTP/1.1 200 OK",
|
"HTTP/1.1 200 OK",
|
||||||
@@ -657,50 +635,14 @@ final class APIServer {
|
|||||||
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
|
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stats were already updated by LiveCounters inside the loop
|
let resolved = Self.resolveAssistantResponse(
|
||||||
|
fullText: fullText,
|
||||||
|
frameworkToolCalls: frameworkToolCalls,
|
||||||
|
tools: tools
|
||||||
|
)
|
||||||
|
|
||||||
// Post-generation: handle tool calls (framework-detected or text-parsed)
|
if let toolCalls = resolved.toolCalls {
|
||||||
var finishReason = "stop"
|
for apiToolCall in toolCalls {
|
||||||
|
|
||||||
if !frameworkToolCalls.isEmpty {
|
|
||||||
finishReason = "tool_calls"
|
|
||||||
for (i, tc) in frameworkToolCalls.enumerated() {
|
|
||||||
let argsDict = tc.function.arguments.mapValues { $0.anyValue }
|
|
||||||
let argsJSON: String
|
|
||||||
if let data = try? JSONSerialization.data(withJSONObject: argsDict),
|
|
||||||
let str = String(data: data, encoding: .utf8) {
|
|
||||||
argsJSON = str
|
|
||||||
} else {
|
|
||||||
argsJSON = "{}"
|
|
||||||
}
|
|
||||||
let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
|
|
||||||
let apiToolCall = APIToolCall(
|
|
||||||
index: i,
|
|
||||||
id: callId,
|
|
||||||
type: "function",
|
|
||||||
function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
|
|
||||||
)
|
|
||||||
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
|
|
||||||
id: requestId,
|
|
||||||
object: "chat.completion.chunk",
|
|
||||||
created: created,
|
|
||||||
model: modelName,
|
|
||||||
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: [apiToolCall]), finish_reason: nil)],
|
|
||||||
usage: nil
|
|
||||||
))
|
|
||||||
}
|
|
||||||
} else if hasTools {
|
|
||||||
let (_, parsed) = ToolCallParser.parse(text: fullText, tools: tools)
|
|
||||||
if !parsed.isEmpty {
|
|
||||||
finishReason = "tool_calls"
|
|
||||||
}
|
|
||||||
for (i, tc) in parsed.enumerated() {
|
|
||||||
let apiToolCall = APIToolCall(
|
|
||||||
index: i,
|
|
||||||
id: tc.id,
|
|
||||||
type: "function",
|
|
||||||
function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
|
|
||||||
)
|
|
||||||
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
|
await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
|
||||||
id: requestId,
|
id: requestId,
|
||||||
object: "chat.completion.chunk",
|
object: "chat.completion.chunk",
|
||||||
@@ -718,7 +660,7 @@ final class APIServer {
|
|||||||
object: "chat.completion.chunk",
|
object: "chat.completion.chunk",
|
||||||
created: created,
|
created: created,
|
||||||
model: modelName,
|
model: modelName,
|
||||||
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: finishReason)],
|
choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
|
||||||
usage: APIUsageInfo(
|
usage: APIUsageInfo(
|
||||||
prompt_tokens: promptTokens,
|
prompt_tokens: promptTokens,
|
||||||
completion_tokens: completionTokens,
|
completion_tokens: completionTokens,
|
||||||
@@ -729,7 +671,12 @@ final class APIServer {
|
|||||||
// Send [DONE] and close
|
// Send [DONE] and close
|
||||||
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
|
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
|
||||||
connection.cancel()
|
connection.cancel()
|
||||||
return (promptTokens, completionTokens, succeeded)
|
let assistantHistoryText = Self.normalizedAssistantHistoryContent(
|
||||||
|
content: resolved.content,
|
||||||
|
toolCalls: resolved.toolCalls,
|
||||||
|
isQwen: isQwen
|
||||||
|
)
|
||||||
|
return (promptTokens, completionTokens, assistantHistoryText, succeeded)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the token generation + SSE send loop entirely off MainActor.
|
/// Run the token generation + SSE send loop entirely off MainActor.
|
||||||
@@ -876,6 +823,68 @@ final class APIServer {
|
|||||||
|
|
||||||
return hash
|
return hash
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds the text that represents an assistant turn in the cached
/// conversation history.
///
/// The visible content is trimmed of surrounding whitespace; when tool calls
/// are present their formatted form is appended on a new line.
///
/// - Parameters:
///   - content: The assistant's visible reply, if any.
///   - toolCalls: Tool calls emitted with the reply, if any.
///   - isQwen: Selects the Qwen tool-call formatter when `true`, the Gemma
///     formatter otherwise.
/// - Returns: The combined text, or `nil` when it would be empty.
private static func normalizedAssistantHistoryContent(
    content: String?,
    toolCalls: [APIToolCall]?,
    isQwen: Bool
) -> String? {
    var segments: [String] = []

    let trimmedContent = (content ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
    if !trimmedContent.isEmpty {
        segments.append(trimmedContent)
    }

    if let toolCalls, !toolCalls.isEmpty {
        let renderedCalls: String
        if isQwen {
            renderedCalls = ToolPromptBuilder.formatQwenToolCalls(toolCalls)
        } else {
            renderedCalls = ToolPromptBuilder.formatGemmaToolCalls(toolCalls)
        }
        segments.append(renderedCalls)
    }

    let combined = segments.joined(separator: "\n")
    return combined.isEmpty ? nil : combined
}
|
||||||
|
|
||||||
|
/// Turns raw generation output into the content, tool calls and finish reason
/// reported to the API client.
///
/// Tool calls detected by the framework take precedence. Otherwise, when the
/// request declared tools, `ToolCallParser` is run over the generated text.
///
/// - Parameters:
///   - fullText: The complete generated text.
///   - frameworkToolCalls: Tool calls already detected by the framework.
///   - tools: Tool definitions supplied with the request, if any.
/// - Returns: `content` is `nil` when there is no non-whitespace text to show;
///   `toolCalls` is `nil` when no tool call was found; `finishReason` is
///   `"tool_calls"` when tool calls are returned and `"stop"` otherwise.
private static func resolveAssistantResponse(
    fullText: String,
    frameworkToolCalls: [MLXLMCommon.ToolCall],
    tools: [APIToolDefinition]?
) -> (content: String?, toolCalls: [APIToolCall]?, finishReason: String) {
    let isBlank = fullText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
    let visibleText: String? = isBlank ? nil : fullText

    if !frameworkToolCalls.isEmpty {
        let calls: [APIToolCall] = frameworkToolCalls.enumerated().map { index, call -> APIToolCall in
            let argsDict = call.function.arguments.mapValues { $0.anyValue }

            // `JSONSerialization.data(withJSONObject:)` raises an Objective-C
            // exception (which `try?` cannot catch) for objects that are not
            // valid JSON, so validate first and fall back to an empty object.
            let argsJSON: String
            if JSONSerialization.isValidJSONObject(argsDict),
               let data = try? JSONSerialization.data(withJSONObject: argsDict),
               let encoded = String(data: data, encoding: .utf8) {
                argsJSON = encoded
            } else {
                argsJSON = "{}"
            }

            // `abs(_:)` traps on `Int.min`; `magnitude` is defined for every
            // hash value and yields the same digits for all other inputs.
            let nameDigest = Int(call.function.name.hashValue.magnitude % 100_000_000)
            let callId = String(format: "call_%d_%08d", index, nameDigest)

            return APIToolCall(
                index: index,
                id: callId,
                type: "function",
                function: APIFunctionCall(name: call.function.name, arguments: argsJSON)
            )
        }
        return (visibleText, calls, "tool_calls")
    }

    if let tools, !tools.isEmpty {
        let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
        if !parsedCalls.isEmpty {
            let calls: [APIToolCall] = parsedCalls.enumerated().map { index, call -> APIToolCall in
                APIToolCall(
                    index: index,
                    id: call.id,
                    type: "function",
                    function: APIFunctionCall(name: call.name, arguments: call.arguments)
                )
            }
            // The parser returns the text with the tool-call markup removed.
            return (cleanText.isEmpty ? nil : cleanText, calls, "tool_calls")
        }
    }

    return (visibleText, nil, "stop")
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private struct DecodedImage {
|
private struct DecodedImage {
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ struct MonitorView: View {
|
|||||||
LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
|
LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
|
||||||
tokenRateChart
|
tokenRateChart
|
||||||
tokenThroughputChart
|
tokenThroughputChart
|
||||||
|
phaseActivityChart
|
||||||
cacheReuseChart
|
cacheReuseChart
|
||||||
cacheFootprintChart
|
cacheFootprintChart
|
||||||
cacheSessionChart
|
cacheSessionChart
|
||||||
@@ -90,6 +91,9 @@ struct MonitorView: View {
|
|||||||
phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue)
|
phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue)
|
||||||
phaseChip(title: "Generating", count: stats.generatingRequests, color: .green)
|
phaseChip(title: "Generating", count: stats.generatingRequests, color: .green)
|
||||||
phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange)
|
phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange)
|
||||||
|
if stats.activeRequests > 0 {
|
||||||
|
phaseChip(title: phaseAgeLabel, count: Int(stats.currentPhaseElapsed.rounded()), color: activityColor)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.padding(12)
|
.padding(12)
|
||||||
@@ -161,6 +165,71 @@ struct MonitorView: View {
|
|||||||
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// "Phase Activity" card: a chart of phase timing histories plus a colour legend.
///
/// The chart overlays three series read from `stats`:
/// - `currentPhaseElapsedHistory` as a monotone-interpolated line in `activityColor`,
/// - `prefillDurationHistory` as translucent blue bars,
/// - `sessionBuildDurationHistory` as translucent purple bars.
///
/// NOTE(review): the series labels ("Active s") suggest the values are in
/// seconds — confirm against the producer of these histories.
@ViewBuilder
private var phaseActivityChart: some View {
    VStack(alignment: .leading, spacing: 6) {
        Text("Phase Activity")
            .font(.caption.bold())
            .foregroundStyle(.secondary)

        Chart {
            // Age of the currently active phase.
            ForEach(stats.currentPhaseElapsedHistory) { sample in
                LineMark(
                    x: .value("Time", sample.timestamp),
                    y: .value("Active s", sample.value)
                )
                .foregroundStyle(activityColor)
                .interpolationMethod(.monotone)
            }
            // Prefill duration samples.
            ForEach(stats.prefillDurationHistory) { sample in
                BarMark(
                    x: .value("Time", sample.timestamp),
                    y: .value("Prefill done", sample.value)
                )
                .foregroundStyle(.blue.opacity(0.45))
            }
            // Session-build duration samples.
            ForEach(stats.sessionBuildDurationHistory) { sample in
                BarMark(
                    x: .value("Time", sample.timestamp),
                    y: .value("Build done", sample.value)
                )
                .foregroundStyle(.purple.opacity(0.45))
            }
        }
        .chartXAxis {
            // Grid lines every 30 seconds; no labels on the time axis.
            AxisMarks(values: .stride(by: .second, count: 30)) { _ in
                AxisGridLine()
            }
        }
        .chartYAxis {
            AxisMarks(position: .leading) { axisValue in
                AxisGridLine()
                AxisValueLabel {
                    if let seconds = axisValue.as(Double.self) {
                        Text(String(format: "%.0f", seconds))
                            .font(.caption2.monospacedDigit())
                    }
                }
            }
        }
        .frame(height: 150)

        // Legend matching the three series above.
        HStack(spacing: 12) {
            Label("Active phase age", systemImage: "circle.fill")
                .font(.caption2)
                .foregroundStyle(activityColor)
            Label("Prefill completed", systemImage: "circle.fill")
                .font(.caption2)
                .foregroundStyle(.blue)
            Label("Session build completed", systemImage: "circle.fill")
                .font(.caption2)
                .foregroundStyle(.purple)
        }
    }
    .padding(12)
    .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
|
||||||
|
|
||||||
@ViewBuilder
|
@ViewBuilder
|
||||||
private var cacheReuseChart: some View {
|
private var cacheReuseChart: some View {
|
||||||
VStack(alignment: .leading, spacing: 6) {
|
VStack(alignment: .leading, spacing: 6) {
|
||||||
@@ -717,6 +786,13 @@ struct MonitorView: View {
|
|||||||
return Double(stats.totalCacheHits) / Double(total)
|
return Double(stats.totalCacheHits) / Double(total)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Title for the phase-age chip, chosen from the request counters in `stats`.
///
/// The first non-zero counter wins, checked in this order: generating,
/// prefilling, session build. When all three are zero the label falls back
/// to "Preparing s".
private var phaseAgeLabel: String {
    // Right-associative conditional chain: later counters are only read when
    // the earlier ones are zero, matching a sequence of early returns.
    return stats.generatingRequests > 0 ? "Generating s"
        : stats.prefillingRequests > 0 ? "Prefill s"
        : stats.sessionBuildRequests > 0 ? "Build s"
        : "Preparing s"
}
|
||||||
|
|
||||||
private func maxContextRatio(for tokens: Int) -> Double {
|
private func maxContextRatio(for tokens: Int) -> Double {
|
||||||
let maxContext = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0)
|
let maxContext = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0)
|
||||||
guard maxContext > 0 else { return 0 }
|
guard maxContext > 0 else { return 0 }
|
||||||
|
|||||||
Reference in New Issue
Block a user