feat: more visibility of prefilling

2026-03-19 11:36:46 +01:00
parent 49bd165ce7
commit 577fdf8950
3 changed files with 290 additions and 109 deletions
--- a/MLXServer/Models/InferenceStats.swift
+++ b/MLXServer/Models/InferenceStats.swift
@@ -9,7 +9,7 @@ final class LiveCounters: @unchecked Sendable {
    static let shared = LiveCounters()
    private let lock = OSAllocatedUnfairLock()
-    private var requestPhases: [String: RequestPhase] = [:]
+    private var requestPhases: [String: RequestState] = [:]
    // Current request
    private var _activeRequests: Int = 0
@@ -23,13 +23,19 @@ final class LiveCounters: @unchecked Sendable {
    private var _isPrefilling: Bool = false
    private var _isGenerating: Bool = false
    private var _contextMax: Int = 0
    private var _currentPhaseElapsed: TimeInterval = 0
    // Cumulative
    private var _totalRequests: Int = 0
    private var _totalPromptTokens: Int = 0
    private var _totalGenerationTokens: Int = 0
    private var _totalPreparingDuration: TimeInterval = 0
    private var _totalSessionBuildDuration: TimeInterval = 0
    private var _totalPrefillDuration: TimeInterval = 0
    private var _totalGenerationDuration: TimeInterval = 0
    func requestStarted(requestId: String, contextLength: Int) {
        let now = Date()
        lock.lock()
        _activeRequests += 1
        _preparingRequests += 1
@@ -40,33 +46,40 @@ final class LiveCounters: @unchecked Sendable {
        _generationTokens = 0
        _tokensPerSecond = 0
        _contextMax = contextLength
-        requestPhases[requestId] = .preparing
+        requestPhases[requestId] = RequestState(phase: .preparing, phaseStartedAt: now)
        refreshCurrentPhaseElapsed(now: now)
        lock.unlock()
    }
    /// Records that the request identified by `requestId` has entered `phase`.
    ///
    /// All work happens between `lock.lock()` and `lock.unlock()`. If the request is already
    /// present in `requestPhases`, `decrementCount(for:)` is called for its current phase and the
    /// time since that phase started is passed to `accumulateDuration(for:elapsed:)`. The request's
    /// entry is then replaced with the new phase, stamped with the current time.
    ///
    /// - Parameters:
    ///   - requestId: Key of the request in `requestPhases`.
    ///   - phase: The phase the request is entering.
    func requestPhaseChanged(requestId: String, phase: RequestPhase) {
        let now = Date()
        lock.lock()
        if let current = requestPhases[requestId] {
-            decrementCount(for: current)
+            decrementCount(for: current.phase)
            accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
        }
        // Runs whether or not the request was previously tracked.
        incrementCount(for: phase)
-        requestPhases[requestId] = phase
+        requestPhases[requestId] = RequestState(phase: phase, phaseStartedAt: now)
        // "Prefilling" is reported while any request is preparing, building a session, or prefilling.
        _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
        _isGenerating = _generatingRequests > 0
        refreshCurrentPhaseElapsed(now: now)
        lock.unlock()
    }
    /// Records that prefill finished for `requestId` and moves the request to `.generating`.
    ///
    /// All work happens between `lock.lock()` and `lock.unlock()`. If the request is already
    /// present in `requestPhases`, `decrementCount(for:)` is called for its current phase and the
    /// time since that phase started is passed to `accumulateDuration(for:elapsed:)`. The prompt
    /// token count is stored as the current value and added to the cumulative total.
    ///
    /// - Parameters:
    ///   - requestId: Key of the request in `requestPhases`.
    ///   - promptTokens: Prompt token count reported for this request.
    func prefillCompleted(requestId: String, promptTokens: Int) {
        let now = Date()
        lock.lock()
        if let current = requestPhases[requestId] {
-            decrementCount(for: current)
+            decrementCount(for: current.phase)
            accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
        }
        // Runs whether or not the request was previously tracked.
        incrementCount(for: .generating)
-        requestPhases[requestId] = .generating
+        requestPhases[requestId] = RequestState(phase: .generating, phaseStartedAt: now)
        _promptTokens = promptTokens
        _totalPromptTokens += promptTokens
        // "Prefilling" is reported while any request is preparing, building a session, or prefilling.
        _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
        _isGenerating = _generatingRequests > 0
        refreshCurrentPhaseElapsed(now: now)
        lock.unlock()
    }
@@ -78,9 +91,11 @@ final class LiveCounters: @unchecked Sendable {
    }
    func requestCompleted(requestId: String, generationTokens: Int) {
        let now = Date()
        lock.lock()
        if let current = requestPhases.removeValue(forKey: requestId) {
-            decrementCount(for: current)
+            decrementCount(for: current.phase)
            accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
        }
        _activeRequests = max(0, _activeRequests - 1)
        _totalGenerationTokens += generationTokens
@@ -92,6 +107,7 @@ final class LiveCounters: @unchecked Sendable {
            _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
            _isGenerating = _generatingRequests > 0
        }
        refreshCurrentPhaseElapsed(now: now)
        lock.unlock()
    }
@@ -109,15 +125,22 @@ final class LiveCounters: @unchecked Sendable {
        _isPrefilling = false
        _isGenerating = false
        _contextMax = 0
        _currentPhaseElapsed = 0
        _totalRequests = 0
        _totalPromptTokens = 0
        _totalGenerationTokens = 0
        _totalPreparingDuration = 0
        _totalSessionBuildDuration = 0
        _totalPrefillDuration = 0
        _totalGenerationDuration = 0
        lock.unlock()
    }
    /// Atomic snapshot for the UI timer.
    func snapshot() -> Snapshot {
        let now = Date()
        lock.lock()
        refreshCurrentPhaseElapsed(now: now)
        let s = Snapshot(
            activeRequests: _activeRequests,
            preparingRequests: _preparingRequests,
@@ -130,9 +153,14 @@ final class LiveCounters: @unchecked Sendable {
            isPrefilling: _isPrefilling,
            isGenerating: _isGenerating,
            contextMax: _contextMax,
            currentPhaseElapsed: _currentPhaseElapsed,
            totalRequests: _totalRequests,
            totalPromptTokens: _totalPromptTokens,
-            totalGenerationTokens: _totalGenerationTokens
+            totalGenerationTokens: _totalGenerationTokens,
            totalPreparingDuration: _totalPreparingDuration,
            totalSessionBuildDuration: _totalSessionBuildDuration,
            totalPrefillDuration: _totalPrefillDuration,
            totalGenerationDuration: _totalGenerationDuration
        )
        lock.unlock()
        return s
@@ -150,9 +178,14 @@ final class LiveCounters: @unchecked Sendable {
        let isPrefilling: Bool
        let isGenerating: Bool
        let contextMax: Int
        let currentPhaseElapsed: TimeInterval
        let totalRequests: Int
        let totalPromptTokens: Int
        let totalGenerationTokens: Int
        let totalPreparingDuration: TimeInterval
        let totalSessionBuildDuration: TimeInterval
        let totalPrefillDuration: TimeInterval
        let totalGenerationDuration: TimeInterval
    }
    private func incrementCount(for phase: RequestPhase) {
@@ -181,6 +214,28 @@ final class LiveCounters: @unchecked Sendable {
        }
    }
    /// Adds the time a request spent in `phase` to that phase's cumulative duration.
    ///
    /// The callers visible in this file invoke it while holding `lock`, at the moment a
    /// request leaves `phase`.
    ///
    /// - Parameters:
    ///   - phase: The phase the request is leaving.
    ///   - elapsed: Seconds spent in that phase. Negative values are treated as zero.
    private func accumulateDuration(for phase: RequestPhase, elapsed: TimeInterval) {
        // `elapsed` is computed from wall-clock `Date` values, so a backwards clock
        // adjustment can make it negative. Clamp so cumulative totals never decrease
        // (the sampler derives per-interval deltas from them).
        let duration = max(0, elapsed)
        switch phase {
        case .preparing:
            _totalPreparingDuration += duration
        case .sessionBuild:
            _totalSessionBuildDuration += duration
        case .prefilling:
            _totalPrefillDuration += duration
        case .generating:
            _totalGenerationDuration += duration
        }
    }
    /// Recomputes `_currentPhaseElapsed` as the largest time, in seconds, that any tracked
    /// request has spent in its current phase, or 0 when no requests are tracked.
    ///
    /// - Parameter now: The instant against which each phase start time is measured.
    private func refreshCurrentPhaseElapsed(now: Date) {
        var longestAge: TimeInterval?
        for state in requestPhases.values {
            let age = now.timeIntervalSince(state.phaseStartedAt)
            if let best = longestAge, best >= age {
                continue
            }
            longestAge = age
        }
        _currentPhaseElapsed = longestAge ?? 0
    }
    /// Per-request tracking record: the phase a request is currently in and when it entered it.
    ///
    /// Values are replaced wholesale on every phase transition rather than mutated in place,
    /// so both fields are immutable.
    private struct RequestState {
        /// The phase the request is currently in.
        let phase: RequestPhase
        /// Wall-clock time at which the request entered `phase`.
        let phaseStartedAt: Date
    }
    enum RequestPhase {
        case preparing
        case sessionBuild
@@ -208,6 +263,7 @@ final class InferenceStats {
    var currentTokensPerSecond: Double = 0
    var contextUsed: Int = 0
    var contextMax: Int = 0
    var currentPhaseElapsed: TimeInterval = 0
    // MARK: - Cumulative counters
@@ -219,6 +275,10 @@ final class InferenceStats {
    var totalCacheEvictions: Int = 0
    var totalCacheReusePromptTokens: Int = 0
    var totalCacheRebuildPromptTokens: Int = 0
    var totalPreparingDuration: TimeInterval = 0
    var totalSessionBuildDuration: TimeInterval = 0
    var totalPrefillDuration: TimeInterval = 0
    var totalGenerationDuration: TimeInterval = 0
    // MARK: - Cache state
@@ -246,6 +306,9 @@ final class InferenceStats {
    private(set) var cacheFootprintHistory: [DataPoint] = []
    private(set) var cacheReuseHistory: [DataPoint] = []
    private(set) var cacheRebuildHistory: [DataPoint] = []
    private(set) var currentPhaseElapsedHistory: [DataPoint] = []
    private(set) var prefillDurationHistory: [DataPoint] = []
    private(set) var sessionBuildDurationHistory: [DataPoint] = []
    private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
@@ -255,6 +318,8 @@ final class InferenceStats {
    private var lastPromptTokenCount: Int = 0
    private var lastCacheReuseTokenCount: Int = 0
    private var lastCacheRebuildTokenCount: Int = 0
    private var lastPrefillDuration: TimeInterval = 0
    private var lastSessionBuildDuration: TimeInterval = 0
    func startSampling() {
        guard sampleTimer == nil else { return }
@@ -287,9 +352,14 @@ final class InferenceStats {
        isGenerating = snap.isGenerating
        contextMax = snap.contextMax
        contextUsed = snap.promptTokens + snap.generationTokens
        currentPhaseElapsed = snap.currentPhaseElapsed
        totalRequests = snap.totalRequests
        totalPromptTokens = snap.totalPromptTokens
        totalGenerationTokens = snap.totalGenerationTokens
        totalPreparingDuration = snap.totalPreparingDuration
        totalSessionBuildDuration = snap.totalSessionBuildDuration
        totalPrefillDuration = snap.totalPrefillDuration
        totalGenerationDuration = snap.totalGenerationDuration
        totalCacheHits = cache.totalHits
        totalCacheMisses = cache.totalMisses
        totalCacheEvictions = cache.totalEvictions
@@ -308,10 +378,14 @@ final class InferenceStats {
        let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
        let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
        let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
        let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
        let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration
        lastGenerationTokenCount = snap.totalGenerationTokens
        lastPromptTokenCount = snap.totalPromptTokens
        lastCacheReuseTokenCount = cache.totalReusePromptTokens
        lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
        lastPrefillDuration = snap.totalPrefillDuration
        lastSessionBuildDuration = snap.totalSessionBuildDuration
        tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
        generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
@@ -321,6 +395,9 @@ final class InferenceStats {
        cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
        cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
        cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
        currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
        prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
        sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta))
        if tokenRateHistory.count > Self.maxHistoryPoints {
            tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -346,6 +423,15 @@ final class InferenceStats {
        if cacheRebuildHistory.count > Self.maxHistoryPoints {
            cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
        }
        if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
            currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
        }
        if prefillDurationHistory.count > Self.maxHistoryPoints {
            prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
        }
        if sessionBuildDurationHistory.count > Self.maxHistoryPoints {
            sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints)
        }
    }
    func reset() {
@@ -363,9 +449,14 @@ final class InferenceStats {
        currentTokensPerSecond = 0
        contextUsed = 0
        contextMax = 0
        currentPhaseElapsed = 0
        totalRequests = 0
        totalPromptTokens = 0
        totalGenerationTokens = 0
        totalPreparingDuration = 0
        totalSessionBuildDuration = 0
        totalPrefillDuration = 0
        totalGenerationDuration = 0
        totalCacheHits = 0
        totalCacheMisses = 0
        totalCacheEvictions = 0
@@ -386,9 +477,14 @@ final class InferenceStats {
        cacheFootprintHistory.removeAll()
        cacheReuseHistory.removeAll()
        cacheRebuildHistory.removeAll()
        currentPhaseElapsedHistory.removeAll()
        prefillDurationHistory.removeAll()
        sessionBuildDurationHistory.removeAll()
        lastGenerationTokenCount = 0
        lastPromptTokenCount = 0
        lastCacheReuseTokenCount = 0
        lastCacheRebuildTokenCount = 0
        lastPrefillDuration = 0
        lastSessionBuildDuration = 0
    }
 }
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -393,7 +393,7 @@ final class APIServer {
        // Extract images from the last message only (ChatSession.streamDetails takes images separately)
        let lastImages = lastMessage.images
-        let result: (promptTokens: Int, completionTokens: Int, succeeded: Bool)
+        let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool)
        if isStream {
            result = await handleStreamingResponse(
@@ -405,7 +405,8 @@ final class APIServer {
                images: lastImages,
                tools: request.tools,
                created: created,
-                modelName: modelName
+                modelName: modelName,
                isQwen: isQwen
            )
        } else {
            result = await handleNonStreamingResponse(
@@ -417,16 +418,23 @@ final class APIServer {
                images: lastImages,
                tools: request.tools,
                created: created,
-                modelName: modelName
+                modelName: modelName,
                isQwen: isQwen
            )
        }
        if result.succeeded {
            var cachedSignatures = messageSignatures
            if let assistantHistoryText = result.assistantHistoryText {
                cachedSignatures.append(
                    Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: [])
                )
            }
            ConversationSessionCache.shared.completeRequest(
                entryId: lease.entryId,
                session: session,
-                requestMessageSignatures: messageSignatures,
+                requestMessageSignatures: cachedSignatures,
-                requestMessageCount: chatMessages.count,
+                requestMessageCount: cachedSignatures.count,
                estimatedPromptTokens: estimatedPromptTokens,
                estimatedBytes: estimatedBytes,
                promptTokens: result.promptTokens,
@@ -473,8 +481,9 @@ final class APIServer {
        images: [UserInput.Image],
        tools: [APIToolDefinition]?,
        created: Int,
-        modelName: String
+        modelName: String,
-    ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
+        isQwen: Bool
    ) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
        do {
            var fullText = ""
            var promptTokens = 0
@@ -510,48 +519,11 @@ final class APIServer {
                }
            }
-            // Parse tool calls: first check framework-detected ones, then our own text parser
+            let resolved = Self.resolveAssistantResponse(
-            var finishReason = "stop"
+                fullText: fullText,
-            var responseContent: String? = fullText
+                frameworkToolCalls: frameworkToolCalls,
-            var apiToolCalls: [APIToolCall]? = nil
+                tools: tools
-
+            )
            if !frameworkToolCalls.isEmpty {
                // Framework natively detected tool calls (e.g. Qwen)
                finishReason = "tool_calls"
                apiToolCalls = frameworkToolCalls.enumerated().map { i, tc in
                    let argsJSON: String
                    let argsDict = tc.function.arguments.mapValues { $0.anyValue }
                    if let data = try? JSONSerialization.data(withJSONObject: argsDict),
                       let str = String(data: data, encoding: .utf8) {
                        argsJSON = str
                    } else {
                        argsJSON = "{}"
                    }
                    let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
                    return APIToolCall(
                        index: i,
                        id: callId,
                        type: "function",
                        function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
                    )
                }
                responseContent = fullText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty ? nil : fullText
            } else if let tools, !tools.isEmpty {
                // Try our own text parser (e.g. Gemma tool_code blocks)
                let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
                if !parsedCalls.isEmpty {
                    finishReason = "tool_calls"
                    apiToolCalls = parsedCalls.enumerated().map { i, tc in
                        APIToolCall(
                            index: i,
                            id: tc.id,
                            type: "function",
                            function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
                        )
                    }
                    responseContent = cleanText.isEmpty ? nil : cleanText
                }
            }
            let response = APIChatCompletionResponse(
                id: requestId,
@@ -563,10 +535,10 @@ final class APIServer {
                        index: 0,
                        message: APIChoiceMessage(
                            role: "assistant",
-                            content: responseContent,
+                            content: resolved.content,
-                            tool_calls: apiToolCalls
+                            tool_calls: resolved.toolCalls
                        ),
-                        finish_reason: finishReason
+                        finish_reason: resolved.finishReason
                    )
                ],
                usage: APIUsageInfo(
@@ -579,10 +551,15 @@ final class APIServer {
            if let json = try? JSONEncoder().encode(response) {
                sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
            }
-            return (promptTokens, completionTokens, true)
+            let assistantHistoryText = Self.normalizedAssistantHistoryContent(
                content: resolved.content,
                toolCalls: resolved.toolCalls,
                isQwen: isQwen
            )
            return (promptTokens, completionTokens, assistantHistoryText, true)
        } catch {
            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
-            return (0, 0, false)
+            return (0, 0, nil, false)
        }
    }
@@ -597,8 +574,9 @@ final class APIServer {
        images: [UserInput.Image],
        tools: [APIToolDefinition]?,
        created: Int,
-        modelName: String
+        modelName: String,
-    ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
+        isQwen: Bool
    ) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
        // Send SSE headers
        let header = [
            "HTTP/1.1 200 OK",
@@ -657,50 +635,14 @@ final class APIServer {
            LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
        }
-        // Stats were already updated by LiveCounters inside the loop
+        let resolved = Self.resolveAssistantResponse(
            fullText: fullText,
            frameworkToolCalls: frameworkToolCalls,
            tools: tools
        )
-        // Post-generation: handle tool calls (framework-detected or text-parsed)
+        if let toolCalls = resolved.toolCalls {
-        var finishReason = "stop"
+            for apiToolCall in toolCalls {
        if !frameworkToolCalls.isEmpty {
            finishReason = "tool_calls"
            for (i, tc) in frameworkToolCalls.enumerated() {
                let argsDict = tc.function.arguments.mapValues { $0.anyValue }
                let argsJSON: String
                if let data = try? JSONSerialization.data(withJSONObject: argsDict),
                   let str = String(data: data, encoding: .utf8) {
                    argsJSON = str
                } else {
                    argsJSON = "{}"
                }
                let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
                let apiToolCall = APIToolCall(
                    index: i,
                    id: callId,
                    type: "function",
                    function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
                )
                await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
                    id: requestId,
                    object: "chat.completion.chunk",
                    created: created,
                    model: modelName,
                    choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: [apiToolCall]), finish_reason: nil)],
                    usage: nil
                ))
            }
        } else if hasTools {
            let (_, parsed) = ToolCallParser.parse(text: fullText, tools: tools)
            if !parsed.isEmpty {
                finishReason = "tool_calls"
            }
            for (i, tc) in parsed.enumerated() {
                let apiToolCall = APIToolCall(
                    index: i,
                    id: tc.id,
                    type: "function",
                    function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
                )
                await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
                    id: requestId,
                    object: "chat.completion.chunk",
@@ -718,7 +660,7 @@ final class APIServer {
            object: "chat.completion.chunk",
            created: created,
            model: modelName,
-            choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: finishReason)],
+            choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
            usage: APIUsageInfo(
                prompt_tokens: promptTokens,
                completion_tokens: completionTokens,
@@ -729,7 +671,12 @@ final class APIServer {
        // Send [DONE] and close
        await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
        connection.cancel()
-        return (promptTokens, completionTokens, succeeded)
+        let assistantHistoryText = Self.normalizedAssistantHistoryContent(
            content: resolved.content,
            toolCalls: resolved.toolCalls,
            isQwen: isQwen
        )
        return (promptTokens, completionTokens, assistantHistoryText, succeeded)
    }
    /// Run the token generation + SSE send loop entirely off MainActor.
@@ -876,6 +823,68 @@ final class APIServer {
        return hash
    }
    /// Builds the assistant text to record in conversation history for a completed response.
    ///
    /// The content is trimmed of surrounding whitespace and newlines. When tool calls are
    /// present they are rendered with the Qwen or Gemma formatter (chosen by `isQwen`) and
    /// appended on a new line, or used alone when the trimmed content is empty.
    ///
    /// - Parameters:
    ///   - content: The assistant's textual content, if any.
    ///   - toolCalls: Tool calls emitted with the response, if any.
    ///   - isQwen: Selects `formatQwenToolCalls` when `true`, `formatGemmaToolCalls` otherwise.
    /// - Returns: The combined text, or `nil` when it is empty.
    private static func normalizedAssistantHistoryContent(
        content: String?,
        toolCalls: [APIToolCall]?,
        isQwen: Bool
    ) -> String? {
        let trimmedContent = (content ?? "").trimmingCharacters(in: .whitespacesAndNewlines)

        // Without tool calls the history entry is just the trimmed content.
        guard let toolCalls, !toolCalls.isEmpty else {
            return trimmedContent.isEmpty ? nil : trimmedContent
        }

        let renderedCalls: String
        if isQwen {
            renderedCalls = ToolPromptBuilder.formatQwenToolCalls(toolCalls)
        } else {
            renderedCalls = ToolPromptBuilder.formatGemmaToolCalls(toolCalls)
        }

        let combined: String
        if trimmedContent.isEmpty {
            combined = renderedCalls
        } else {
            combined = trimmedContent + "\n" + renderedCalls
        }
        return combined.isEmpty ? nil : combined
    }
    /// Resolves raw model output into the assistant message fields of a chat completion.
    ///
    /// Tool calls reported by the framework take precedence. Otherwise, when `tools` is
    /// non-empty, `ToolCallParser.parse` is run over `fullText` and any calls it finds are used.
    ///
    /// - Parameters:
    ///   - fullText: The complete generated text.
    ///   - frameworkToolCalls: Tool calls detected by the framework during generation.
    ///   - tools: Tool definitions supplied with the request, if any.
    /// - Returns: A tuple where
    ///   - `content` is `fullText`, or `nil` when it is only whitespace; when the text parser
    ///     finds calls it is the parser's cleaned text, or `nil` when that is empty.
    ///   - `toolCalls` is `nil` when no tool calls were found.
    ///   - `finishReason` is `"tool_calls"` when tool calls were found and `"stop"` otherwise.
    private static func resolveAssistantResponse(
        fullText: String,
        frameworkToolCalls: [MLXLMCommon.ToolCall],
        tools: [APIToolDefinition]?
    ) -> (content: String?, toolCalls: [APIToolCall]?, finishReason: String) {
        let isBlank = fullText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
        let rawContent: String? = isBlank ? nil : fullText

        if !frameworkToolCalls.isEmpty {
            let calls: [APIToolCall] = frameworkToolCalls.enumerated().map { index, call -> APIToolCall in
                let argsObject = call.function.arguments.mapValues { $0.anyValue }
                let argsJSON: String
                // `data(withJSONObject:)` raises an Objective-C exception (not a Swift error)
                // for objects that are not valid JSON, which `try?` cannot catch; validate first.
                if JSONSerialization.isValidJSONObject(argsObject),
                   let data = try? JSONSerialization.data(withJSONObject: argsObject),
                   let encoded = String(data: data, encoding: .utf8) {
                    argsJSON = encoded
                } else {
                    argsJSON = "{}"
                }
                // `magnitude` rather than `abs`: `abs(Int.min)` overflows and traps.
                let suffix = Int(call.function.name.hashValue.magnitude % 100_000_000)
                return APIToolCall(
                    index: index,
                    id: String(format: "call_%d_%08d", index, suffix),
                    type: "function",
                    function: APIFunctionCall(name: call.function.name, arguments: argsJSON)
                )
            }
            return (rawContent, calls, "tool_calls")
        }

        if let tools, !tools.isEmpty {
            let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
            if !parsedCalls.isEmpty {
                let calls: [APIToolCall] = parsedCalls.enumerated().map { index, call -> APIToolCall in
                    APIToolCall(
                        index: index,
                        id: call.id,
                        type: "function",
                        function: APIFunctionCall(name: call.name, arguments: call.arguments)
                    )
                }
                return (cleanText.isEmpty ? nil : cleanText, calls, "tool_calls")
            }
        }

        return (rawContent, nil, "stop")
    }
 }
 private struct DecodedImage {
--- a/MLXServer/Views/MonitorView.swift
+++ b/MLXServer/Views/MonitorView.swift
@@ -17,6 +17,7 @@ struct MonitorView: View {
                LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
                    tokenRateChart
                    tokenThroughputChart
                    phaseActivityChart
                    cacheReuseChart
                    cacheFootprintChart
                    cacheSessionChart
@@ -90,6 +91,9 @@ struct MonitorView: View {
                phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue)
                phaseChip(title: "Generating", count: stats.generatingRequests, color: .green)
                phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange)
                if stats.activeRequests > 0 {
                    phaseChip(title: phaseAgeLabel, count: Int(stats.currentPhaseElapsed.rounded()), color: activityColor)
                }
            }
        }
        .padding(12)
@@ -161,6 +165,71 @@ struct MonitorView: View {
        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
    }
    /// "Phase Activity" card: a chart of the current phase age over time, overlaid with bars
    /// for the prefill and session-build duration histories, followed by a colour legend.
    @ViewBuilder
    private var phaseActivityChart: some View {
        VStack(alignment: .leading, spacing: 6) {
            Text("Phase Activity")
                .font(.caption.bold())
                .foregroundStyle(.secondary)
            Chart {
                // Line: sampled age of the current phase, tinted with the current activity colour.
                ForEach(stats.currentPhaseElapsedHistory) { point in
                    LineMark(
                        x: .value("Time", point.timestamp),
                        y: .value("Active s", point.value)
                    )
                    .foregroundStyle(activityColor)
                    .interpolationMethod(.monotone)
                }
                // Bars: values from the prefill duration history.
                ForEach(stats.prefillDurationHistory) { point in
                    BarMark(
                        x: .value("Time", point.timestamp),
                        y: .value("Prefill done", point.value)
                    )
                    .foregroundStyle(.blue.opacity(0.45))
                }
                // Bars: values from the session-build duration history.
                ForEach(stats.sessionBuildDurationHistory) { point in
                    BarMark(
                        x: .value("Time", point.timestamp),
                        y: .value("Build done", point.value)
                    )
                    .foregroundStyle(.purple.opacity(0.45))
                }
            }
            // X axis: grid lines every 30 seconds, no value labels.
            .chartXAxis {
                AxisMarks(values: .stride(by: .second, count: 30)) { _ in
                    AxisGridLine()
                }
            }
            // Y axis: leading edge, labels formatted with no decimal places.
            .chartYAxis {
                AxisMarks(position: .leading) { value in
                    AxisGridLine()
                    AxisValueLabel {
                        if let v = value.as(Double.self) {
                            Text(String(format: "%.0f", v))
                                .font(.caption2.monospacedDigit())
                        }
                    }
                }
            }
            .frame(height: 150)
            // Legend; colours mirror the three series above.
            HStack(spacing: 12) {
                Label("Active phase age", systemImage: "circle.fill")
                    .font(.caption2)
                    .foregroundStyle(activityColor)
                Label("Prefill completed", systemImage: "circle.fill")
                    .font(.caption2)
                    .foregroundStyle(.blue)
                Label("Session build completed", systemImage: "circle.fill")
                    .font(.caption2)
                    .foregroundStyle(.purple)
            }
        }
        .padding(12)
        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
    }
    @ViewBuilder
    private var cacheReuseChart: some View {
        VStack(alignment: .leading, spacing: 6) {
@@ -717,6 +786,13 @@ struct MonitorView: View {
        return Double(stats.totalCacheHits) / Double(total)
    }
    /// Title for the phase-age chip, chosen from the first non-zero request count in the
    /// order generating, prefilling, session build; falls back to the preparing label.
    private var phaseAgeLabel: String {
        let label: String
        if stats.generatingRequests > 0 {
            label = "Generating s"
        } else if stats.prefillingRequests > 0 {
            label = "Prefill s"
        } else if stats.sessionBuildRequests > 0 {
            label = "Build s"
        } else {
            label = "Preparing s"
        }
        return label
    }
    private func maxContextRatio(for tokens: Int) -> Double {
        let maxContext = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0)
        guard maxContext > 0 else { return 0 }