feat: more visibility of prefilling

2026-03-19 11:36:46 +01:00
parent 49bd165ce7
commit 577fdf8950
3 changed files with 290 additions and 109 deletions
--- a/MLXServer/Models/InferenceStats.swift
+++ b/MLXServer/Models/InferenceStats.swift
@@ -9,7 +9,7 @@ final class LiveCounters: @unchecked Sendable {
    static let shared = LiveCounters()

    private let lock = OSAllocatedUnfairLock()
-    private var requestPhases: [String: RequestPhase] = [:]
+    private var requestPhases: [String: RequestState] = [:]

    // Current request
    private var _activeRequests: Int = 0
@@ -23,13 +23,19 @@ final class LiveCounters: @unchecked Sendable {
    private var _isPrefilling: Bool = false
    private var _isGenerating: Bool = false
    private var _contextMax: Int = 0
+    private var _currentPhaseElapsed: TimeInterval = 0

    // Cumulative
    private var _totalRequests: Int = 0
    private var _totalPromptTokens: Int = 0
    private var _totalGenerationTokens: Int = 0
+    private var _totalPreparingDuration: TimeInterval = 0
+    private var _totalSessionBuildDuration: TimeInterval = 0
+    private var _totalPrefillDuration: TimeInterval = 0
+    private var _totalGenerationDuration: TimeInterval = 0

    func requestStarted(requestId: String, contextLength: Int) {
+        let now = Date()
        lock.lock()
        _activeRequests += 1
        _preparingRequests += 1
@@ -40,33 +46,40 @@ final class LiveCounters: @unchecked Sendable {
        _generationTokens = 0
        _tokensPerSecond = 0
        _contextMax = contextLength
-        requestPhases[requestId] = .preparing
+        requestPhases[requestId] = RequestState(phase: .preparing, phaseStartedAt: now)
+        refreshCurrentPhaseElapsed(now: now)
        lock.unlock()
    }

    func requestPhaseChanged(requestId: String, phase: RequestPhase) {
+        let now = Date()
        lock.lock()
        if let current = requestPhases[requestId] {
-            decrementCount(for: current)
+            decrementCount(for: current.phase)
+            accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
        }
        incrementCount(for: phase)
-        requestPhases[requestId] = phase
+        requestPhases[requestId] = RequestState(phase: phase, phaseStartedAt: now)
        _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
        _isGenerating = _generatingRequests > 0
+        refreshCurrentPhaseElapsed(now: now)
        lock.unlock()
    }

    func prefillCompleted(requestId: String, promptTokens: Int) {
+        let now = Date()
        lock.lock()
        if let current = requestPhases[requestId] {
-            decrementCount(for: current)
+            decrementCount(for: current.phase)
+            accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
        }
        incrementCount(for: .generating)
-        requestPhases[requestId] = .generating
+        requestPhases[requestId] = RequestState(phase: .generating, phaseStartedAt: now)
        _promptTokens = promptTokens
        _totalPromptTokens += promptTokens
        _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
        _isGenerating = _generatingRequests > 0
+        refreshCurrentPhaseElapsed(now: now)
        lock.unlock()
    }

@@ -78,9 +91,11 @@ final class LiveCounters: @unchecked Sendable {
    }

    func requestCompleted(requestId: String, generationTokens: Int) {
+        let now = Date()
        lock.lock()
        if let current = requestPhases.removeValue(forKey: requestId) {
-            decrementCount(for: current)
+            decrementCount(for: current.phase)
+            accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
        }
        _activeRequests = max(0, _activeRequests - 1)
        _totalGenerationTokens += generationTokens
@@ -92,6 +107,7 @@ final class LiveCounters: @unchecked Sendable {
            _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
            _isGenerating = _generatingRequests > 0
        }
+        refreshCurrentPhaseElapsed(now: now)
        lock.unlock()
    }

@@ -109,15 +125,22 @@ final class LiveCounters: @unchecked Sendable {
        _isPrefilling = false
        _isGenerating = false
        _contextMax = 0
+        _currentPhaseElapsed = 0
        _totalRequests = 0
        _totalPromptTokens = 0
        _totalGenerationTokens = 0
+        _totalPreparingDuration = 0
+        _totalSessionBuildDuration = 0
+        _totalPrefillDuration = 0
+        _totalGenerationDuration = 0
        lock.unlock()
    }

    /// Atomic snapshot for the UI timer.
    func snapshot() -> Snapshot {
+        let now = Date()
        lock.lock()
+        refreshCurrentPhaseElapsed(now: now)
        let s = Snapshot(
            activeRequests: _activeRequests,
            preparingRequests: _preparingRequests,
@@ -130,9 +153,14 @@ final class LiveCounters: @unchecked Sendable {
            isPrefilling: _isPrefilling,
            isGenerating: _isGenerating,
            contextMax: _contextMax,
+            currentPhaseElapsed: _currentPhaseElapsed,
            totalRequests: _totalRequests,
            totalPromptTokens: _totalPromptTokens,
-            totalGenerationTokens: _totalGenerationTokens
+            totalGenerationTokens: _totalGenerationTokens,
+            totalPreparingDuration: _totalPreparingDuration,
+            totalSessionBuildDuration: _totalSessionBuildDuration,
+            totalPrefillDuration: _totalPrefillDuration,
+            totalGenerationDuration: _totalGenerationDuration
        )
        lock.unlock()
        return s
@@ -150,9 +178,14 @@ final class LiveCounters: @unchecked Sendable {
        let isPrefilling: Bool
        let isGenerating: Bool
        let contextMax: Int
+        let currentPhaseElapsed: TimeInterval
        let totalRequests: Int
        let totalPromptTokens: Int
        let totalGenerationTokens: Int
+        let totalPreparingDuration: TimeInterval
+        let totalSessionBuildDuration: TimeInterval
+        let totalPrefillDuration: TimeInterval
+        let totalGenerationDuration: TimeInterval
    }

    private func incrementCount(for phase: RequestPhase) {
@@ -181,6 +214,28 @@ final class LiveCounters: @unchecked Sendable {
        }
    }

+    private func accumulateDuration(for phase: RequestPhase, elapsed: TimeInterval) {
+        switch phase {
+        case .preparing:
+            _totalPreparingDuration += elapsed
+        case .sessionBuild:
+            _totalSessionBuildDuration += elapsed
+        case .prefilling:
+            _totalPrefillDuration += elapsed
+        case .generating:
+            _totalGenerationDuration += elapsed
+        }
+    }
+
+    private func refreshCurrentPhaseElapsed(now: Date) {
+        _currentPhaseElapsed = requestPhases.values.map { now.timeIntervalSince($0.phaseStartedAt) }.max() ?? 0
+    }
+
+    private struct RequestState {
+        var phase: RequestPhase
+        var phaseStartedAt: Date
+    }
+
    enum RequestPhase {
        case preparing
        case sessionBuild
@@ -208,6 +263,7 @@ final class InferenceStats {
    var currentTokensPerSecond: Double = 0
    var contextUsed: Int = 0
    var contextMax: Int = 0
+    var currentPhaseElapsed: TimeInterval = 0

    // MARK: - Cumulative counters

@@ -219,6 +275,10 @@ final class InferenceStats {
    var totalCacheEvictions: Int = 0
    var totalCacheReusePromptTokens: Int = 0
    var totalCacheRebuildPromptTokens: Int = 0
+    var totalPreparingDuration: TimeInterval = 0
+    var totalSessionBuildDuration: TimeInterval = 0
+    var totalPrefillDuration: TimeInterval = 0
+    var totalGenerationDuration: TimeInterval = 0

    // MARK: - Cache state

@@ -246,6 +306,9 @@ final class InferenceStats {
    private(set) var cacheFootprintHistory: [DataPoint] = []
    private(set) var cacheReuseHistory: [DataPoint] = []
    private(set) var cacheRebuildHistory: [DataPoint] = []
+    private(set) var currentPhaseElapsedHistory: [DataPoint] = []
+    private(set) var prefillDurationHistory: [DataPoint] = []
+    private(set) var sessionBuildDurationHistory: [DataPoint] = []

    private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz

@@ -255,6 +318,8 @@ final class InferenceStats {
    private var lastPromptTokenCount: Int = 0
    private var lastCacheReuseTokenCount: Int = 0
    private var lastCacheRebuildTokenCount: Int = 0
+    private var lastPrefillDuration: TimeInterval = 0
+    private var lastSessionBuildDuration: TimeInterval = 0

    func startSampling() {
        guard sampleTimer == nil else { return }
@@ -287,9 +352,14 @@ final class InferenceStats {
        isGenerating = snap.isGenerating
        contextMax = snap.contextMax
        contextUsed = snap.promptTokens + snap.generationTokens
+        currentPhaseElapsed = snap.currentPhaseElapsed
        totalRequests = snap.totalRequests
        totalPromptTokens = snap.totalPromptTokens
        totalGenerationTokens = snap.totalGenerationTokens
+        totalPreparingDuration = snap.totalPreparingDuration
+        totalSessionBuildDuration = snap.totalSessionBuildDuration
+        totalPrefillDuration = snap.totalPrefillDuration
+        totalGenerationDuration = snap.totalGenerationDuration
        totalCacheHits = cache.totalHits
        totalCacheMisses = cache.totalMisses
        totalCacheEvictions = cache.totalEvictions
@@ -308,10 +378,14 @@ final class InferenceStats {
        let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
        let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
        let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
+        let prefillDurationDelta = snap.totalPrefillDuration - lastPrefillDuration
+        let sessionBuildDurationDelta = snap.totalSessionBuildDuration - lastSessionBuildDuration
        lastGenerationTokenCount = snap.totalGenerationTokens
        lastPromptTokenCount = snap.totalPromptTokens
        lastCacheReuseTokenCount = cache.totalReusePromptTokens
        lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
+        lastPrefillDuration = snap.totalPrefillDuration
+        lastSessionBuildDuration = snap.totalSessionBuildDuration

        tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
        generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
@@ -321,6 +395,9 @@ final class InferenceStats {
        cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
        cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
        cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
+        currentPhaseElapsedHistory.append(DataPoint(timestamp: now, value: snap.currentPhaseElapsed))
+        prefillDurationHistory.append(DataPoint(timestamp: now, value: prefillDurationDelta))
+        sessionBuildDurationHistory.append(DataPoint(timestamp: now, value: sessionBuildDurationDelta))

        if tokenRateHistory.count > Self.maxHistoryPoints {
            tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -346,6 +423,15 @@ final class InferenceStats {
        if cacheRebuildHistory.count > Self.maxHistoryPoints {
            cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
        }
+        if currentPhaseElapsedHistory.count > Self.maxHistoryPoints {
+            currentPhaseElapsedHistory.removeFirst(currentPhaseElapsedHistory.count - Self.maxHistoryPoints)
+        }
+        if prefillDurationHistory.count > Self.maxHistoryPoints {
+            prefillDurationHistory.removeFirst(prefillDurationHistory.count - Self.maxHistoryPoints)
+        }
+        if sessionBuildDurationHistory.count > Self.maxHistoryPoints {
+            sessionBuildDurationHistory.removeFirst(sessionBuildDurationHistory.count - Self.maxHistoryPoints)
+        }
    }

    func reset() {
@@ -363,9 +449,14 @@ final class InferenceStats {
        currentTokensPerSecond = 0
        contextUsed = 0
        contextMax = 0
+        currentPhaseElapsed = 0
        totalRequests = 0
        totalPromptTokens = 0
        totalGenerationTokens = 0
+        totalPreparingDuration = 0
+        totalSessionBuildDuration = 0
+        totalPrefillDuration = 0
+        totalGenerationDuration = 0
        totalCacheHits = 0
        totalCacheMisses = 0
        totalCacheEvictions = 0
@@ -386,9 +477,14 @@ final class InferenceStats {
        cacheFootprintHistory.removeAll()
        cacheReuseHistory.removeAll()
        cacheRebuildHistory.removeAll()
+        currentPhaseElapsedHistory.removeAll()
+        prefillDurationHistory.removeAll()
+        sessionBuildDurationHistory.removeAll()
        lastGenerationTokenCount = 0
        lastPromptTokenCount = 0
        lastCacheReuseTokenCount = 0
        lastCacheRebuildTokenCount = 0
+        lastPrefillDuration = 0
+        lastSessionBuildDuration = 0
    }
 }
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -393,7 +393,7 @@ final class APIServer {
        // Extract images from the last message only (ChatSession.streamDetails takes images separately)
        let lastImages = lastMessage.images

-        let result: (promptTokens: Int, completionTokens: Int, succeeded: Bool)
+        let result: (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool)

        if isStream {
            result = await handleStreamingResponse(
@@ -405,7 +405,8 @@ final class APIServer {
                images: lastImages,
                tools: request.tools,
                created: created,
-                modelName: modelName
+                modelName: modelName,
+                isQwen: isQwen
            )
        } else {
            result = await handleNonStreamingResponse(
@@ -417,16 +418,23 @@ final class APIServer {
                images: lastImages,
                tools: request.tools,
                created: created,
-                modelName: modelName
+                modelName: modelName,
+                isQwen: isQwen
            )
        }

        if result.succeeded {
+            var cachedSignatures = messageSignatures
+            if let assistantHistoryText = result.assistantHistoryText {
+                cachedSignatures.append(
+                    Self.messageSignature(role: .assistant, content: assistantHistoryText, imageURLs: [])
+                )
+            }
            ConversationSessionCache.shared.completeRequest(
                entryId: lease.entryId,
                session: session,
-                requestMessageSignatures: messageSignatures,
-                requestMessageCount: chatMessages.count,
+                requestMessageSignatures: cachedSignatures,
+                requestMessageCount: cachedSignatures.count,
                estimatedPromptTokens: estimatedPromptTokens,
                estimatedBytes: estimatedBytes,
                promptTokens: result.promptTokens,
@@ -473,8 +481,9 @@ final class APIServer {
        images: [UserInput.Image],
        tools: [APIToolDefinition]?,
        created: Int,
-        modelName: String
-    ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
+        modelName: String,
+        isQwen: Bool
+    ) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
        do {
            var fullText = ""
            var promptTokens = 0
@@ -510,48 +519,11 @@ final class APIServer {
                }
            }

-            // Parse tool calls: first check framework-detected ones, then our own text parser
-            var finishReason = "stop"
-            var responseContent: String? = fullText
-            var apiToolCalls: [APIToolCall]? = nil
-
-            if !frameworkToolCalls.isEmpty {
-                // Framework natively detected tool calls (e.g. Qwen)
-                finishReason = "tool_calls"
-                apiToolCalls = frameworkToolCalls.enumerated().map { i, tc in
-                    let argsJSON: String
-                    let argsDict = tc.function.arguments.mapValues { $0.anyValue }
-                    if let data = try? JSONSerialization.data(withJSONObject: argsDict),
-                       let str = String(data: data, encoding: .utf8) {
-                        argsJSON = str
-                    } else {
-                        argsJSON = "{}"
-                    }
-                    let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
-                    return APIToolCall(
-                        index: i,
-                        id: callId,
-                        type: "function",
-                        function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
-                    )
-                }
-                responseContent = fullText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty ? nil : fullText
-            } else if let tools, !tools.isEmpty {
-                // Try our own text parser (e.g. Gemma tool_code blocks)
-                let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
-                if !parsedCalls.isEmpty {
-                    finishReason = "tool_calls"
-                    apiToolCalls = parsedCalls.enumerated().map { i, tc in
-                        APIToolCall(
-                            index: i,
-                            id: tc.id,
-                            type: "function",
-                            function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
-                        )
-                    }
-                    responseContent = cleanText.isEmpty ? nil : cleanText
-                }
-            }
+            let resolved = Self.resolveAssistantResponse(
+                fullText: fullText,
+                frameworkToolCalls: frameworkToolCalls,
+                tools: tools
+            )

            let response = APIChatCompletionResponse(
                id: requestId,
@@ -563,10 +535,10 @@ final class APIServer {
                        index: 0,
                        message: APIChoiceMessage(
                            role: "assistant",
-                            content: responseContent,
-                            tool_calls: apiToolCalls
+                            content: resolved.content,
+                            tool_calls: resolved.toolCalls
                        ),
-                        finish_reason: finishReason
+                        finish_reason: resolved.finishReason
                    )
                ],
                usage: APIUsageInfo(
@@ -579,10 +551,15 @@ final class APIServer {
            if let json = try? JSONEncoder().encode(response) {
                sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
            }
-            return (promptTokens, completionTokens, true)
+            let assistantHistoryText = Self.normalizedAssistantHistoryContent(
+                content: resolved.content,
+                toolCalls: resolved.toolCalls,
+                isQwen: isQwen
+            )
+            return (promptTokens, completionTokens, assistantHistoryText, true)
        } catch {
            sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
-            return (0, 0, false)
+            return (0, 0, nil, false)
        }
    }

@@ -597,8 +574,9 @@ final class APIServer {
        images: [UserInput.Image],
        tools: [APIToolDefinition]?,
        created: Int,
-        modelName: String
-    ) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
+        modelName: String,
+        isQwen: Bool
+    ) async -> (promptTokens: Int, completionTokens: Int, assistantHistoryText: String?, succeeded: Bool) {
        // Send SSE headers
        let header = [
            "HTTP/1.1 200 OK",
@@ -657,50 +635,14 @@ final class APIServer {
            LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
        }

-        // Stats were already updated by LiveCounters inside the loop
+        let resolved = Self.resolveAssistantResponse(
+            fullText: fullText,
+            frameworkToolCalls: frameworkToolCalls,
+            tools: tools
+        )

-        // Post-generation: handle tool calls (framework-detected or text-parsed)
-        var finishReason = "stop"
-
-        if !frameworkToolCalls.isEmpty {
-            finishReason = "tool_calls"
-            for (i, tc) in frameworkToolCalls.enumerated() {
-                let argsDict = tc.function.arguments.mapValues { $0.anyValue }
-                let argsJSON: String
-                if let data = try? JSONSerialization.data(withJSONObject: argsDict),
-                   let str = String(data: data, encoding: .utf8) {
-                    argsJSON = str
-                } else {
-                    argsJSON = "{}"
-                }
-                let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
-                let apiToolCall = APIToolCall(
-                    index: i,
-                    id: callId,
-                    type: "function",
-                    function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
-                )
-                await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
-                    id: requestId,
-                    object: "chat.completion.chunk",
-                    created: created,
-                    model: modelName,
-                    choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: [apiToolCall]), finish_reason: nil)],
-                    usage: nil
-                ))
-            }
-        } else if hasTools {
-            let (_, parsed) = ToolCallParser.parse(text: fullText, tools: tools)
-            if !parsed.isEmpty {
-                finishReason = "tool_calls"
-            }
-            for (i, tc) in parsed.enumerated() {
-                let apiToolCall = APIToolCall(
-                    index: i,
-                    id: tc.id,
-                    type: "function",
-                    function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
-                )
+        if let toolCalls = resolved.toolCalls {
+            for apiToolCall in toolCalls {
                await Self.sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
                    id: requestId,
                    object: "chat.completion.chunk",
@@ -718,7 +660,7 @@ final class APIServer {
            object: "chat.completion.chunk",
            created: created,
            model: modelName,
-            choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: finishReason)],
+            choices: [APIStreamChoice(index: 0, delta: APIDeltaMessage(role: nil, content: nil, tool_calls: nil), finish_reason: resolved.finishReason)],
            usage: APIUsageInfo(
                prompt_tokens: promptTokens,
                completion_tokens: completionTokens,
@@ -729,7 +671,12 @@ final class APIServer {
        // Send [DONE] and close
        await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
        connection.cancel()
-        return (promptTokens, completionTokens, succeeded)
+        let assistantHistoryText = Self.normalizedAssistantHistoryContent(
+            content: resolved.content,
+            toolCalls: resolved.toolCalls,
+            isQwen: isQwen
+        )
+        return (promptTokens, completionTokens, assistantHistoryText, succeeded)
    }

    /// Run the token generation + SSE send loop entirely off MainActor.
@@ -876,6 +823,68 @@ final class APIServer {

        return hash
    }
+
+    private static func normalizedAssistantHistoryContent(
+        content: String?,
+        toolCalls: [APIToolCall]?,
+        isQwen: Bool
+    ) -> String? {
+        var text = content?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
+        if let toolCalls, !toolCalls.isEmpty {
+            let formattedCalls = isQwen
+                ? ToolPromptBuilder.formatQwenToolCalls(toolCalls)
+                : ToolPromptBuilder.formatGemmaToolCalls(toolCalls)
+            text = text.isEmpty ? formattedCalls : text + "\n" + formattedCalls
+        }
+        return text.isEmpty ? nil : text
+    }
+
+    private static func resolveAssistantResponse(
+        fullText: String,
+        frameworkToolCalls: [MLXLMCommon.ToolCall],
+        tools: [APIToolDefinition]?
+    ) -> (content: String?, toolCalls: [APIToolCall]?, finishReason: String) {
+        var finishReason = "stop"
+        var responseContent: String? = fullText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty ? nil : fullText
+        var apiToolCalls: [APIToolCall]? = nil
+
+        if !frameworkToolCalls.isEmpty {
+            finishReason = "tool_calls"
+            apiToolCalls = frameworkToolCalls.enumerated().map { i, tc in
+                let argsJSON: String
+                let argsDict = tc.function.arguments.mapValues { $0.anyValue }
+                if let data = try? JSONSerialization.data(withJSONObject: argsDict),
+                   let str = String(data: data, encoding: .utf8) {
+                    argsJSON = str
+                } else {
+                    argsJSON = "{}"
+                }
+                let callId = String(format: "call_%d_%08d", i, abs(tc.function.name.hashValue) % 100_000_000)
+                return APIToolCall(
+                    index: i,
+                    id: callId,
+                    type: "function",
+                    function: APIFunctionCall(name: tc.function.name, arguments: argsJSON)
+                )
+            }
+        } else if let tools, !tools.isEmpty {
+            let (cleanText, parsedCalls) = ToolCallParser.parse(text: fullText, tools: tools)
+            if !parsedCalls.isEmpty {
+                finishReason = "tool_calls"
+                apiToolCalls = parsedCalls.enumerated().map { i, tc in
+                    APIToolCall(
+                        index: i,
+                        id: tc.id,
+                        type: "function",
+                        function: APIFunctionCall(name: tc.name, arguments: tc.arguments)
+                    )
+                }
+                responseContent = cleanText.isEmpty ? nil : cleanText
+            }
+        }
+
+        return (responseContent, apiToolCalls, finishReason)
+    }
 }

 private struct DecodedImage {
--- a/MLXServer/Views/MonitorView.swift
+++ b/MLXServer/Views/MonitorView.swift
@@ -17,6 +17,7 @@ struct MonitorView: View {
                LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
                    tokenRateChart
                    tokenThroughputChart
+                    phaseActivityChart
                    cacheReuseChart
                    cacheFootprintChart
                    cacheSessionChart
@@ -90,6 +91,9 @@ struct MonitorView: View {
                phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue)
                phaseChip(title: "Generating", count: stats.generatingRequests, color: .green)
                phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange)
+                if stats.activeRequests > 0 {
+                    phaseChip(title: phaseAgeLabel, count: Int(stats.currentPhaseElapsed.rounded()), color: activityColor)
+                }
            }
        }
        .padding(12)
@@ -161,6 +165,71 @@ struct MonitorView: View {
        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
    }

+    @ViewBuilder
+    private var phaseActivityChart: some View {
+        VStack(alignment: .leading, spacing: 6) {
+            Text("Phase Activity")
+                .font(.caption.bold())
+                .foregroundStyle(.secondary)
+
+            Chart {
+                ForEach(stats.currentPhaseElapsedHistory) { point in
+                    LineMark(
+                        x: .value("Time", point.timestamp),
+                        y: .value("Active s", point.value)
+                    )
+                    .foregroundStyle(activityColor)
+                    .interpolationMethod(.monotone)
+                }
+                ForEach(stats.prefillDurationHistory) { point in
+                    BarMark(
+                        x: .value("Time", point.timestamp),
+                        y: .value("Prefill done", point.value)
+                    )
+                    .foregroundStyle(.blue.opacity(0.45))
+                }
+                ForEach(stats.sessionBuildDurationHistory) { point in
+                    BarMark(
+                        x: .value("Time", point.timestamp),
+                        y: .value("Build done", point.value)
+                    )
+                    .foregroundStyle(.purple.opacity(0.45))
+                }
+            }
+            .chartXAxis {
+                AxisMarks(values: .stride(by: .second, count: 30)) { _ in
+                    AxisGridLine()
+                }
+            }
+            .chartYAxis {
+                AxisMarks(position: .leading) { value in
+                    AxisGridLine()
+                    AxisValueLabel {
+                        if let v = value.as(Double.self) {
+                            Text(String(format: "%.0f", v))
+                                .font(.caption2.monospacedDigit())
+                        }
+                    }
+                }
+            }
+            .frame(height: 150)
+
+            HStack(spacing: 12) {
+                Label("Active phase age", systemImage: "circle.fill")
+                    .font(.caption2)
+                    .foregroundStyle(activityColor)
+                Label("Prefill completed", systemImage: "circle.fill")
+                    .font(.caption2)
+                    .foregroundStyle(.blue)
+                Label("Session build completed", systemImage: "circle.fill")
+                    .font(.caption2)
+                    .foregroundStyle(.purple)
+            }
+        }
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
    @ViewBuilder
    private var cacheReuseChart: some View {
        VStack(alignment: .leading, spacing: 6) {
@@ -717,6 +786,13 @@ struct MonitorView: View {
        return Double(stats.totalCacheHits) / Double(total)
    }

+    private var phaseAgeLabel: String {
+        if stats.generatingRequests > 0 { return "Generating s" }
+        if stats.prefillingRequests > 0 { return "Prefill s" }
+        if stats.sessionBuildRequests > 0 { return "Build s" }
+        return "Preparing s"
+    }
+
    private func maxContextRatio(for tokens: Int) -> Double {
        let maxContext = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0)
        guard maxContext > 0 else { return 0 }