feat: inference visualisation

This commit is contained in:
2026-03-17 19:30:09 +01:00
parent 5313b7175e
commit 1a67311874
6 changed files with 629 additions and 83 deletions

View File

@@ -11,6 +11,7 @@
165E8AB6ADAE1D59B1A86420 /* Preferences.swift in Sources */ = {isa = PBXBuildFile; fileRef = 145B888FBDD4F931512C5473 /* Preferences.swift */; }; 165E8AB6ADAE1D59B1A86420 /* Preferences.swift in Sources */ = {isa = PBXBuildFile; fileRef = 145B888FBDD4F931512C5473 /* Preferences.swift */; };
189362AAE2CDE5D4B3428334 /* ToolCallParser.swift in Sources */ = {isa = PBXBuildFile; fileRef = E73B165A1822729C907791AE /* ToolCallParser.swift */; }; 189362AAE2CDE5D4B3428334 /* ToolCallParser.swift in Sources */ = {isa = PBXBuildFile; fileRef = E73B165A1822729C907791AE /* ToolCallParser.swift */; };
2CAAF7129F7CC45200FA9F6B /* ModelPickerView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C3C3A76C02AF70A9D8F868FC /* ModelPickerView.swift */; }; 2CAAF7129F7CC45200FA9F6B /* ModelPickerView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C3C3A76C02AF70A9D8F868FC /* ModelPickerView.swift */; };
2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */ = {isa = PBXBuildFile; fileRef = E35452B166893B25E765FF70 /* InferenceStats.swift */; };
4CB13DC1AC7A500DDBB443EC /* ChatInputView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */; }; 4CB13DC1AC7A500DDBB443EC /* ChatInputView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */; };
50B6861FF8610B3ED4FFAD9D /* MLXServerApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = C67742651DB486871CEF1612 /* MLXServerApp.swift */; }; 50B6861FF8610B3ED4FFAD9D /* MLXServerApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = C67742651DB486871CEF1612 /* MLXServerApp.swift */; };
50DD129CCF2843482DEC3B96 /* APIServer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3D08828E16B17EF02C14243E /* APIServer.swift */; }; 50DD129CCF2843482DEC3B96 /* APIServer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3D08828E16B17EF02C14243E /* APIServer.swift */; };
@@ -22,6 +23,7 @@
80646C5066BF79BC76E1D9D7 /* ModelConfig.swift in Sources */ = {isa = PBXBuildFile; fileRef = 38DFC212AF4359A45FBE22BA /* ModelConfig.swift */; }; 80646C5066BF79BC76E1D9D7 /* ModelConfig.swift in Sources */ = {isa = PBXBuildFile; fileRef = 38DFC212AF4359A45FBE22BA /* ModelConfig.swift */; };
84D32315B418B5243E017350 /* ToolPromptBuilder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */; }; 84D32315B418B5243E017350 /* ToolPromptBuilder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */; };
945474365D0B3E961811909A /* MLXVLM in Frameworks */ = {isa = PBXBuildFile; productRef = D5E8E1C2DD8D8AABB4306193 /* MLXVLM */; }; 945474365D0B3E961811909A /* MLXVLM in Frameworks */ = {isa = PBXBuildFile; productRef = D5E8E1C2DD8D8AABB4306193 /* MLXVLM */; };
B1D9BC407DB7DB1489230C20 /* MonitorView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4239CFF94B819C35A8D4D617 /* MonitorView.swift */; };
B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = B8BD93859F0291F1A3E09DA5 /* ChatViewModel.swift */; }; B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = B8BD93859F0291F1A3E09DA5 /* ChatViewModel.swift */; };
B6D3662995B885C102876B4A /* MLXLMCommon in Frameworks */ = {isa = PBXBuildFile; productRef = 9090667D4134056AE66DC2F1 /* MLXLMCommon */; }; B6D3662995B885C102876B4A /* MLXLMCommon in Frameworks */ = {isa = PBXBuildFile; productRef = 9090667D4134056AE66DC2F1 /* MLXLMCommon */; };
D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; }; D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
@@ -38,6 +40,7 @@
3AF462805202797F61422AEE /* MLXServer.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = MLXServer.entitlements; sourceTree = "<group>"; }; 3AF462805202797F61422AEE /* MLXServer.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = MLXServer.entitlements; sourceTree = "<group>"; };
3D08828E16B17EF02C14243E /* APIServer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServer.swift; sourceTree = "<group>"; }; 3D08828E16B17EF02C14243E /* APIServer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServer.swift; sourceTree = "<group>"; };
4147321383E94E9F17A0154E /* SettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsView.swift; sourceTree = "<group>"; }; 4147321383E94E9F17A0154E /* SettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsView.swift; sourceTree = "<group>"; };
4239CFF94B819C35A8D4D617 /* MonitorView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MonitorView.swift; sourceTree = "<group>"; };
6EE59189918D06B8D2F588FC /* MLXServer.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MLXServer.app; sourceTree = BUILT_PRODUCTS_DIR; }; 6EE59189918D06B8D2F588FC /* MLXServer.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MLXServer.app; sourceTree = BUILT_PRODUCTS_DIR; };
922CBDC9206737BD04AF2874 /* ModelManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelManager.swift; sourceTree = "<group>"; }; 922CBDC9206737BD04AF2874 /* ModelManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelManager.swift; sourceTree = "<group>"; };
944C699FBB76C734C9DF2F2E /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; 944C699FBB76C734C9DF2F2E /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
@@ -49,6 +52,7 @@
C67742651DB486871CEF1612 /* MLXServerApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MLXServerApp.swift; sourceTree = "<group>"; }; C67742651DB486871CEF1612 /* MLXServerApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MLXServerApp.swift; sourceTree = "<group>"; };
D733A0D1D4AC25DDDA6C8684 /* LocalModelResolver.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LocalModelResolver.swift; sourceTree = "<group>"; }; D733A0D1D4AC25DDDA6C8684 /* LocalModelResolver.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LocalModelResolver.swift; sourceTree = "<group>"; };
DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; }; DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; };
E35452B166893B25E765FF70 /* InferenceStats.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceStats.swift; sourceTree = "<group>"; };
E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; }; E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; };
E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; }; E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; }; F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
@@ -108,6 +112,7 @@
E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */, E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */,
DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */, DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */,
C3C3A76C02AF70A9D8F868FC /* ModelPickerView.swift */, C3C3A76C02AF70A9D8F868FC /* ModelPickerView.swift */,
4239CFF94B819C35A8D4D617 /* MonitorView.swift */,
4147321383E94E9F17A0154E /* SettingsView.swift */, 4147321383E94E9F17A0154E /* SettingsView.swift */,
B0EAB35D7130D56B9E7484BA /* StatusBarView.swift */, B0EAB35D7130D56B9E7484BA /* StatusBarView.swift */,
); );
@@ -118,6 +123,7 @@
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
A4B359324B5FD8D106C74338 /* ChatMessage.swift */, A4B359324B5FD8D106C74338 /* ChatMessage.swift */,
E35452B166893B25E765FF70 /* InferenceStats.swift */,
38DFC212AF4359A45FBE22BA /* ModelConfig.swift */, 38DFC212AF4359A45FBE22BA /* ModelConfig.swift */,
); );
path = Models; path = Models;
@@ -234,11 +240,13 @@
5C1E8FE1C521914CEF98D3AA /* ChatMessagesView.swift in Sources */, 5C1E8FE1C521914CEF98D3AA /* ChatMessagesView.swift in Sources */,
B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */, B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
5946258F1DE88CE904584E0B /* ContentView.swift in Sources */, 5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */,
6828CCA8B78AB40906F87CAB /* LocalModelResolver.swift in Sources */, 6828CCA8B78AB40906F87CAB /* LocalModelResolver.swift in Sources */,
50B6861FF8610B3ED4FFAD9D /* MLXServerApp.swift in Sources */, 50B6861FF8610B3ED4FFAD9D /* MLXServerApp.swift in Sources */,
80646C5066BF79BC76E1D9D7 /* ModelConfig.swift in Sources */, 80646C5066BF79BC76E1D9D7 /* ModelConfig.swift in Sources */,
0168AEE16009097901363E16 /* ModelManager.swift in Sources */, 0168AEE16009097901363E16 /* ModelManager.swift in Sources */,
2CAAF7129F7CC45200FA9F6B /* ModelPickerView.swift in Sources */, 2CAAF7129F7CC45200FA9F6B /* ModelPickerView.swift in Sources */,
B1D9BC407DB7DB1489230C20 /* MonitorView.swift in Sources */,
165E8AB6ADAE1D59B1A86420 /* Preferences.swift in Sources */, 165E8AB6ADAE1D59B1A86420 /* Preferences.swift in Sources */,
D666A311788375E8A061C832 /* SettingsView.swift in Sources */, D666A311788375E8A061C832 /* SettingsView.swift in Sources */,
621B7E4382199AC1378F5F9C /* StatusBarView.swift in Sources */, 621B7E4382199AC1378F5F9C /* StatusBarView.swift in Sources */,

View File

@@ -4,86 +4,110 @@ struct ContentView: View {
@Environment(ModelManager.self) private var modelManager @Environment(ModelManager.self) private var modelManager
@State private var chatVM: ChatViewModel? @State private var chatVM: ChatViewModel?
@State private var showLoadError = false @State private var showLoadError = false
@State private var showMonitor = false
var body: some View { var body: some View {
Group { mainContent
if let chatVM { .navigationTitle(modelManager.currentModel?.displayName ?? "MLX Server")
ChatView(viewModel: chatVM) .onAppear {
} else { if chatVM == nil {
ProgressView("Initializing…") chatVM = ChatViewModel(modelManager: modelManager)
} // Auto-start API server if configured
} if Preferences.apiAutoStart {
.navigationTitle(modelManager.currentModel?.displayName ?? "MLX Server") chatVM?.startAPIServer()
.onAppear {
if chatVM == nil {
chatVM = ChatViewModel(modelManager: modelManager)
// Auto-start API server if configured
if Preferences.apiAutoStart {
chatVM?.startAPIServer()
}
}
}
.onChange(of: modelManager.currentModel) {
chatVM?.resetSession()
// Persist last used model
if let id = modelManager.currentModel?.id {
Preferences.lastModelId = id
}
}
.onChange(of: modelManager.errorMessage) {
showLoadError = modelManager.errorMessage != nil
}
.alert("Model Error", isPresented: $showLoadError) {
Button("Retry") {
if let config = modelManager.currentModel ?? ModelConfig.availableModels.first {
Task { await modelManager.loadModel(config) }
}
}
Button("Cancel", role: .cancel) {
modelManager.errorMessage = nil
}
} message: {
Text(modelManager.errorMessage ?? "Unknown error loading model.")
}
.toolbar {
ToolbarItem(placement: .principal) {
ModelPickerView()
}
ToolbarItemGroup(placement: .primaryAction) {
// API server toggle
Button {
if let chatVM {
if chatVM.apiServer.isRunning {
chatVM.stopAPIServer()
} else {
chatVM.startAPIServer()
}
} }
} label: {
// Running solid globe (green tint), click to stop
// Stopped slashed globe, click to start
Label(
chatVM?.apiServer.isRunning == true ? "Stop API" : "Start API",
systemImage: chatVM?.apiServer.isRunning == true ? "network" : "network.slash"
)
.foregroundStyle(chatVM?.apiServer.isRunning == true ? .green : .secondary)
} }
.help(chatVM?.apiServer.isRunning == true ? "API server running on port \(Preferences.apiPort) — click to stop" : "Click to start API server")
// New conversation
Button {
chatVM?.newConversation()
} label: {
Label("New Chat", systemImage: "plus.message")
}
.keyboardShortcut("n", modifiers: .command)
} }
.onChange(of: modelManager.currentModel) {
chatVM?.resetSession()
// Persist last used model
if let id = modelManager.currentModel?.id {
Preferences.lastModelId = id
}
}
.onChange(of: modelManager.errorMessage) {
showLoadError = modelManager.errorMessage != nil
}
.alert("Model Error", isPresented: $showLoadError) {
Button("Retry") {
if let config = modelManager.currentModel ?? ModelConfig.availableModels.first {
Task { await modelManager.loadModel(config) }
}
}
Button("Cancel", role: .cancel) {
modelManager.errorMessage = nil
}
} message: {
Text(modelManager.errorMessage ?? "Unknown error loading model.")
}
.toolbar {
ToolbarItem(placement: .principal) {
ModelPickerView()
}
ToolbarItemGroup(placement: .primaryAction) {
toolbarButtons
}
}
// Cmd+1/2/3 model switching
.background {
modelSwitchShortcuts
}
}
@ViewBuilder
private var mainContent: some View {
if let chatVM {
if showMonitor {
MonitorView(stats: chatVM.apiServer.inferenceStats)
} else {
ChatView(viewModel: chatVM)
}
} else {
ProgressView("Initializing…")
} }
// Cmd+1/2/3 model switching }
.background {
modelSwitchShortcuts @ViewBuilder
private var toolbarButtons: some View {
// API server toggle
let isRunning = chatVM?.apiServer.isRunning == true
Button {
if let chatVM {
if chatVM.apiServer.isRunning {
chatVM.stopAPIServer()
} else {
chatVM.startAPIServer()
}
}
} label: {
Label(
isRunning ? "Stop API" : "Start API",
systemImage: isRunning ? "network" : "network.slash"
)
.foregroundStyle(isRunning ? .green : .secondary)
} }
.help(isRunning ? "API server running on port \(Preferences.apiPort) — click to stop" : "Click to start API server")
// Monitor toggle
Button {
showMonitor.toggle()
} label: {
Label(
showMonitor ? "Chat" : "Monitor",
systemImage: showMonitor ? "bubble.left.and.text.bubble.right" : "chart.xyaxis.line"
)
.foregroundStyle(showMonitor ? Color.accentColor : Color.secondary)
}
.help(showMonitor ? "Switch to chat" : "Show inference monitor")
.keyboardShortcut("m", modifiers: [.command, .shift])
// New conversation
Button {
chatVM?.newConversation()
} label: {
Label("New Chat", systemImage: "plus.message")
}
.keyboardShortcut("n", modifiers: .command)
} }
@ViewBuilder @ViewBuilder

View File

@@ -0,0 +1,141 @@
import Foundation
/// Lightweight stats collector for inference activity visualization.
///
/// The API server reports request lifecycle events through `requestStarted`,
/// `prefillCompleted`, `tokenGenerated` and `requestCompleted`. While sampling
/// is running, a 1 Hz timer copies the counters into bounded histories that
/// back the monitor charts.
///
/// All mutations happen on @MainActor to avoid locks.
@Observable
@MainActor
final class InferenceStats {
    // MARK: - Current request state

    var activeRequests: Int = 0
    var currentPromptTokens: Int = 0
    var currentGenerationTokens: Int = 0
    var isGenerating: Bool = false
    var isPrefilling: Bool = false
    var currentTokensPerSecond: Double = 0
    var contextUsed: Int = 0
    var contextMax: Int = 0

    // MARK: - Cumulative counters

    var totalRequests: Int = 0
    var totalPromptTokens: Int = 0
    var totalGenerationTokens: Int = 0

    // MARK: - Time series data (ring buffers for charts)

    /// One sampled value on a chart's time axis.
    struct DataPoint: Identifiable {
        let id = UUID()
        let timestamp: Date
        let value: Double
    }

    private(set) var tokenRateHistory: [DataPoint] = []
    private(set) var promptTokenHistory: [DataPoint] = []
    private(set) var generationTokenHistory: [DataPoint] = []

    private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
    private static let sampleInterval: TimeInterval = 1.0
    /// Minimum time since the first token before a rate is derived locally;
    /// avoids absurd spikes from dividing by a near-zero interval.
    private static let minimumRateWindow: TimeInterval = 0.1

    // Periodic sampling
    private var sampleTimer: Timer?
    private var lastGenerationTokenCount: Int = 0
    private var lastPromptTokenCount: Int = 0

    // Locally measured generation speed, used when the caller reports no rate.
    private var firstTokenTime: Date?
    private var firstTokenCount: Int = 0

    /// Starts the 1 Hz sampling timer. Does nothing if sampling is already running.
    func startSampling() {
        guard sampleTimer == nil else { return }
        let timer = Timer(timeInterval: Self.sampleInterval, repeats: true) { [weak self] _ in
            Task { @MainActor in
                self?.recordSample()
            }
        }
        // `.common` keeps the charts updating while the run loop is in an
        // event-tracking mode (e.g. during scrolling), where default-mode
        // timers do not fire.
        RunLoop.main.add(timer, forMode: .common)
        sampleTimer = timer
    }

    /// Stops the sampling timer; collected history is kept.
    func stopSampling() {
        sampleTimer?.invalidate()
        sampleTimer = nil
    }

    /// Appends one point to each history and trims them to `maxHistoryPoints`.
    ///
    /// The throughput series record the change in the cumulative counters since
    /// the previous sample. Generation tokens are added to their counter in
    /// `requestCompleted`, so they show up in the sample following completion.
    private func recordSample() {
        let now = Date.now

        let generatedSinceLast = totalGenerationTokens - lastGenerationTokenCount
        let promptSinceLast = totalPromptTokens - lastPromptTokenCount
        lastGenerationTokenCount = totalGenerationTokens
        lastPromptTokenCount = totalPromptTokens

        let ratePoint = DataPoint(timestamp: now, value: currentTokensPerSecond)
        let generationPoint = DataPoint(timestamp: now, value: Double(generatedSinceLast))
        let promptPoint = DataPoint(timestamp: now, value: Double(promptSinceLast))

        Self.append(ratePoint, to: &tokenRateHistory)
        Self.append(generationPoint, to: &generationTokenHistory)
        Self.append(promptPoint, to: &promptTokenHistory)
    }

    /// Appends `point` and drops the oldest entries beyond `maxHistoryPoints`.
    private static func append(_ point: DataPoint, to history: inout [DataPoint]) {
        history.append(point)
        let overflow = history.count - maxHistoryPoints
        if overflow > 0 {
            history.removeFirst(overflow)
        }
    }

    // MARK: - Event recording (called from APIServer)

    /// Records the start of a request and resets the per-request state.
    /// - Parameter contextLength: Context capacity used as the gauge maximum.
    func requestStarted(contextLength: Int) {
        activeRequests += 1
        totalRequests += 1
        isPrefilling = true
        isGenerating = false
        currentPromptTokens = 0
        currentGenerationTokens = 0
        currentTokensPerSecond = 0
        contextMax = contextLength
        contextUsed = 0
        firstTokenTime = nil
        firstTokenCount = 0
    }

    /// Records the prompt size and adds it to the cumulative prompt counter.
    /// - Parameter promptTokens: Number of prompt tokens for the current request.
    func prefillCompleted(promptTokens: Int) {
        isPrefilling = false
        isGenerating = true
        currentPromptTokens = promptTokens
        totalPromptTokens += promptTokens
        // Keep tokens generated so far: the prompt count can be reported after
        // generation has already produced output.
        contextUsed = promptTokens + currentGenerationTokens
    }

    /// Records generation progress for the current request.
    /// - Parameters:
    ///   - tokensPerSecond: Rate reported by the caller. A value of zero or less
    ///     means "unknown"; the rate is then derived from the time elapsed
    ///     since the first token instead of being overwritten with zero.
    ///   - totalGenerated: Tokens generated so far in the current request.
    func tokenGenerated(tokensPerSecond: Double, totalGenerated: Int) {
        let now = Date.now

        // Output exists, so the prefill phase is over even if
        // `prefillCompleted` has not been reported yet.
        if isPrefilling {
            isPrefilling = false
            isGenerating = true
        }

        currentGenerationTokens = totalGenerated
        contextUsed = currentPromptTokens + totalGenerated

        if firstTokenTime == nil {
            firstTokenTime = now
            firstTokenCount = totalGenerated
        }

        if tokensPerSecond > 0 {
            currentTokensPerSecond = tokensPerSecond
        } else if let start = firstTokenTime {
            let elapsed = now.timeIntervalSince(start)
            let produced = totalGenerated - firstTokenCount
            if elapsed >= Self.minimumRateWindow, produced > 0 {
                currentTokensPerSecond = Double(produced) / elapsed
            }
        }
    }

    /// Records the end of a request (successful or failed).
    /// - Parameters:
    ///   - promptTokens: Not used here; prompt tokens are counted in
    ///     `prefillCompleted`.
    ///   - generationTokens: Tokens to add to the cumulative generation counter.
    func requestCompleted(promptTokens: Int, generationTokens: Int) {
        // Clamp so an unmatched completion cannot drive the count negative.
        activeRequests = max(0, activeRequests - 1)
        totalGenerationTokens += generationTokens
        if activeRequests == 0 {
            isGenerating = false
            isPrefilling = false
            currentTokensPerSecond = 0
        }
    }

    /// Clears all live state, cumulative counters and chart history.
    /// The sampling timer is left untouched.
    func reset() {
        activeRequests = 0
        currentPromptTokens = 0
        currentGenerationTokens = 0
        isGenerating = false
        isPrefilling = false
        currentTokensPerSecond = 0
        contextUsed = 0
        contextMax = 0
        totalRequests = 0
        totalPromptTokens = 0
        totalGenerationTokens = 0
        tokenRateHistory.removeAll()
        promptTokenHistory.removeAll()
        generationTokenHistory.removeAll()
        lastGenerationTokenCount = 0
        lastPromptTokenCount = 0
        firstTokenTime = nil
        firstTokenCount = 0
    }
}

View File

@@ -11,6 +11,7 @@ final class APIServer {
var isRunning = false var isRunning = false
var port: Int = 1234 var port: Int = 1234
var requestCount: Int = 0 var requestCount: Int = 0
let inferenceStats = InferenceStats()
private var listener: NWListener? private var listener: NWListener?
private var modelManager: ModelManager? private var modelManager: ModelManager?
@@ -54,6 +55,7 @@ final class APIServer {
} }
listener?.start(queue: .global(qos: .userInitiated)) listener?.start(queue: .global(qos: .userInitiated))
inferenceStats.startSampling()
} catch { } catch {
print("[APIServer] Failed to start: \(error)") print("[APIServer] Failed to start: \(error)")
} }
@@ -66,6 +68,7 @@ final class APIServer {
cachedSession = nil cachedSession = nil
cachedMessages = nil cachedMessages = nil
cachedModelId = nil cachedModelId = nil
inferenceStats.stopSampling()
} }
// MARK: - Connection handling // MARK: - Connection handling
@@ -341,6 +344,8 @@ final class APIServer {
// Extract images from the last message only (ChatSession.streamDetails takes images separately) // Extract images from the last message only (ChatSession.streamDetails takes images separately)
let lastImages = lastMessage.images let lastImages = lastMessage.images
inferenceStats.requestStarted(contextLength: contextLength)
if isStream { if isStream {
await handleStreamingResponse( await handleStreamingResponse(
connection: connection, connection: connection,
@@ -421,14 +426,22 @@ final class APIServer {
switch generation { switch generation {
case .chunk(let text): case .chunk(let text):
fullText += text fullText += text
completionTokens += 1
inferenceStats.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
case .info(let info): case .info(let info):
promptTokens = info.promptTokenCount promptTokens = info.promptTokenCount
completionTokens = info.generationTokenCount completionTokens = info.generationTokenCount
inferenceStats.prefillCompleted(promptTokens: promptTokens)
if info.tokensPerSecond > 0 {
inferenceStats.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
}
case .toolCall(let call): case .toolCall(let call):
frameworkToolCalls.append(call) frameworkToolCalls.append(call)
} }
} }
inferenceStats.requestCompleted(promptTokens: promptTokens, generationTokens: completionTokens)
// Parse tool calls: first check framework-detected ones, then our own text parser // Parse tool calls: first check framework-detected ones, then our own text parser
var finishReason = "stop" var finishReason = "stop"
var responseContent: String? = fullText var responseContent: String? = fullText
@@ -499,6 +512,7 @@ final class APIServer {
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}") sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
} }
} catch { } catch {
inferenceStats.requestCompleted(promptTokens: 0, generationTokens: 0)
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#) sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
} }
} }
@@ -564,6 +578,7 @@ final class APIServer {
case .chunk(let text): case .chunk(let text):
completionTokens += 1 completionTokens += 1
fullText += text fullText += text
inferenceStats.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
if !bufferForTools { if !bufferForTools {
sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk( sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
@@ -579,12 +594,17 @@ final class APIServer {
case .info(let info): case .info(let info):
promptTokens = info.promptTokenCount promptTokens = info.promptTokenCount
completionTokens = info.generationTokenCount completionTokens = info.generationTokenCount
inferenceStats.prefillCompleted(promptTokens: promptTokens)
if info.tokensPerSecond > 0 {
inferenceStats.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
}
case .toolCall(let call): case .toolCall(let call):
frameworkToolCalls.append(call) frameworkToolCalls.append(call)
} }
} }
} catch { } catch {
inferenceStats.requestCompleted(promptTokens: promptTokens, generationTokens: completionTokens)
let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n" let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
connection.send(content: errorEvent.data(using: .utf8), completion: .contentProcessed({ _ in })) connection.send(content: errorEvent.data(using: .utf8), completion: .contentProcessed({ _ in }))
} }
@@ -687,6 +707,8 @@ final class APIServer {
) )
)) ))
inferenceStats.requestCompleted(promptTokens: promptTokens, generationTokens: completionTokens)
// Send [DONE] and close // Send [DONE] and close
let done = "data: [DONE]\n\n" let done = "data: [DONE]\n\n"
connection.send(content: done.data(using: .utf8), completion: .contentProcessed({ _ in connection.send(content: done.data(using: .utf8), completion: .contentProcessed({ _ in

View File

@@ -7,7 +7,7 @@ struct ChatMessagesView: View {
var body: some View { var body: some View {
ScrollViewReader { proxy in ScrollViewReader { proxy in
ScrollView { ScrollView {
LazyVStack(alignment: .leading, spacing: 12) { VStack(alignment: .leading, spacing: 12) {
if viewModel.conversation.messages.isEmpty { if viewModel.conversation.messages.isEmpty {
emptyState emptyState
} else { } else {
@@ -16,14 +16,20 @@ struct ChatMessagesView: View {
.id(message.id) .id(message.id)
} }
} }
Color.clear
.frame(height: 1)
.id("bottom")
} }
.padding() .padding()
} }
.onChange(of: viewModel.conversation.messages.last?.content) { .onChange(of: viewModel.conversation.messages.last?.content) {
scrollToBottom(proxy: proxy) // During streaming, scroll without animation to avoid overlapping animations
proxy.scrollTo("bottom", anchor: .bottom)
} }
.onChange(of: viewModel.conversation.messages.count) { .onChange(of: viewModel.conversation.messages.count) {
scrollToBottom(proxy: proxy) withAnimation(.easeOut(duration: 0.2)) {
proxy.scrollTo("bottom", anchor: .bottom)
}
} }
} }
} }
@@ -47,13 +53,6 @@ struct ChatMessagesView: View {
.frame(maxWidth: .infinity, minHeight: 300) .frame(maxWidth: .infinity, minHeight: 300)
} }
private func scrollToBottom(proxy: ScrollViewProxy) {
if let lastId = viewModel.conversation.messages.last?.id {
withAnimation(.easeOut(duration: 0.2)) {
proxy.scrollTo(lastId, anchor: .bottom)
}
}
}
} }
struct MessageBubbleView: View { struct MessageBubbleView: View {

View File

@@ -0,0 +1,352 @@
import Charts
import MLX
import SwiftUI
/// Real-time inference monitoring dashboard, shown in place of the chat UI.
///
/// Lays out, top to bottom: a live status strip, two charts (generation speed
/// and token throughput), a row of cards (context usage, GPU memory, request
/// counts) and a cumulative token summary. All figures are read from `stats`.
struct MonitorView: View {
    let stats: InferenceStats
    @Environment(ModelManager.self) private var modelManager

    /// Divisor used to show MLX memory byte counts as "MB".
    private static let bytesPerMegabyte: Double = 1_048_576

    var body: some View {
        ScrollView {
            VStack(spacing: 20) {
                // Live status header
                liveStatusSection

                // Charts
                HStack(alignment: .top, spacing: 16) {
                    tokenRateChart
                    tokenThroughputChart
                }

                // Gauges row
                HStack(spacing: 16) {
                    contextGauge
                    gpuMemoryGauge
                    requestsCard
                }

                // Cumulative stats
                cumulativeSection
            }
            .padding(20)
        }
        .frame(maxWidth: .infinity, maxHeight: .infinity)
        .background(.background)
    }

    // MARK: - Live Status

    /// Coarse server state shown in the status strip.
    private enum Activity {
        case prefilling, generating, processing, idle

        var label: String {
            switch self {
            case .prefilling: return "Prefilling"
            case .generating: return "Generating"
            case .processing: return "Processing"
            case .idle: return "Idle"
            }
        }

        var color: Color {
            switch self {
            case .prefilling: return .blue
            case .generating: return .green
            case .processing: return .orange
            case .idle: return .secondary
            }
        }
    }

    /// Current activity; prefilling takes precedence over generating, which
    /// takes precedence over merely having active requests.
    private var activity: Activity {
        if stats.isPrefilling { return .prefilling }
        if stats.isGenerating { return .generating }
        if stats.activeRequests > 0 { return .processing }
        return .idle
    }

    private var activityColor: Color { activity.color }

    private var activityLabel: String { activity.label }

    /// Status dot and label, the live tok/s figure while generating, and the
    /// current request's prompt/generation token counts.
    private var liveStatusSection: some View {
        HStack(spacing: 16) {
            // Activity indicator
            HStack(spacing: 8) {
                Circle()
                    .fill(activityColor)
                    .frame(width: 10, height: 10)
                    .overlay {
                        // Halo ring around the dot while work is in flight
                        if stats.isGenerating || stats.isPrefilling {
                            Circle()
                                .stroke(activityColor.opacity(0.5), lineWidth: 2)
                                .scaleEffect(1.8)
                                .opacity(0.6)
                        }
                    }
                Text(activityLabel)
                    .font(.headline)
            }

            Spacer()

            if stats.isGenerating {
                Text(String(format: "%.1f tok/s", stats.currentTokensPerSecond))
                    .font(.title2.monospacedDigit().bold())
                    .foregroundStyle(.green)
            }

            if stats.currentPromptTokens > 0 {
                HStack(spacing: 4) {
                    Image(systemName: "arrow.down.circle.fill")
                        .foregroundStyle(.blue)
                    Text("\(stats.currentPromptTokens)")
                        .monospacedDigit()
                    Image(systemName: "arrow.up.circle.fill")
                        .foregroundStyle(.orange)
                    Text("\(stats.currentGenerationTokens)")
                        .monospacedDigit()
                }
                .font(.callout)
            }
        }
        .monitorCard()
    }

    // MARK: - Token Rate Chart

    /// Line-plus-area chart of the sampled generation speed.
    private var tokenRateChart: some View {
        VStack(alignment: .leading, spacing: 6) {
            sectionTitle("Generation Speed (tok/s)")

            Chart(stats.tokenRateHistory) { sample in
                LineMark(
                    x: .value("Time", sample.timestamp),
                    y: .value("tok/s", sample.value)
                )
                .foregroundStyle(.green)
                .interpolationMethod(.monotone)

                AreaMark(
                    x: .value("Time", sample.timestamp),
                    y: .value("tok/s", sample.value)
                )
                .foregroundStyle(.green.opacity(0.1))
                .interpolationMethod(.monotone)
            }
            .monitorChartAxes()
            // One unit of headroom above the highest sample
            .chartYScale(domain: 0...(maxTokenRate + 1))
            .frame(height: 150)
        }
        .monitorCard()
    }

    /// Highest sampled rate, or 10 when there is no history yet.
    private var maxTokenRate: Double {
        stats.tokenRateHistory.lazy.map(\.value).max() ?? 10
    }

    // MARK: - Token Throughput Chart

    /// Bar chart of prompt and generation tokens per sample, with a legend.
    private var tokenThroughputChart: some View {
        VStack(alignment: .leading, spacing: 6) {
            sectionTitle("Token Throughput (/sec)")

            Chart {
                throughputBars(stats.promptTokenHistory, tint: .blue)
                throughputBars(stats.generationTokenHistory, tint: .orange)
            }
            .monitorChartAxes()
            .frame(height: 150)

            // Legend
            HStack(spacing: 12) {
                Label("Prompt", systemImage: "circle.fill")
                    .font(.caption2)
                    .foregroundStyle(.blue)
                Label("Generation", systemImage: "circle.fill")
                    .font(.caption2)
                    .foregroundStyle(.orange)
            }
        }
        .monitorCard()
    }

    /// One bar per sample of `history`, drawn in `tint` at 70% opacity.
    private func throughputBars(_ history: [InferenceStats.DataPoint], tint: Color) -> some ChartContent {
        ForEach(history) { sample in
            BarMark(
                x: .value("Time", sample.timestamp),
                y: .value("Tokens", sample.value)
            )
            .foregroundStyle(tint.opacity(0.7))
        }
    }

    // MARK: - Context Gauge

    /// Circular gauge of context usage against the larger of the request's
    /// reported limit and the current model's context length.
    private var contextGauge: some View {
        let limit = max(stats.contextMax, modelManager.currentModel?.contextLength ?? 0)
        let used = stats.contextUsed
        // Guard against division by zero before any limit is known
        let ratio = limit > 0 ? Double(used) / Double(limit) : 0

        return VStack(spacing: 8) {
            sectionTitle("Context")

            Gauge(value: ratio) {
                EmptyView()
            } currentValueLabel: {
                Text(formatTokenCount(used))
                    .font(.title3.monospacedDigit().bold())
            } minimumValueLabel: {
                Text("0")
                    .font(.caption2)
            } maximumValueLabel: {
                Text(formatTokenCount(limit))
                    .font(.caption2)
            }
            .gaugeStyle(.accessoryCircular)
            .scaleEffect(1.3)
            .tint(contextGradient(ratio: ratio))

            Text("\(Int(ratio * 100))%")
                .font(.caption.monospacedDigit())
                .foregroundStyle(.secondary)
        }
        .frame(maxWidth: .infinity)
        .monitorCard()
    }

    /// Gauge tint: red above 90% usage, orange above 70%, blue otherwise.
    private func contextGradient(ratio: Double) -> Color {
        switch ratio {
        case let r where r > 0.9: return .red
        case let r where r > 0.7: return .orange
        default: return .blue
        }
    }

    // MARK: - GPU Memory Gauge

    /// Active MLX GPU memory, plus the peak when it is non-zero.
    private var gpuMemoryGauge: some View {
        let activeMB = Double(MLX.GPU.activeMemory) / Self.bytesPerMegabyte
        let peakMB = Double(MLX.GPU.peakMemory) / Self.bytesPerMegabyte

        return VStack(spacing: 8) {
            sectionTitle("GPU Memory")

            Text(String(format: "%.0f MB", activeMB))
                .font(.title3.monospacedDigit().bold())

            if peakMB > 0 {
                Text(String(format: "Peak: %.0f MB", peakMB))
                    .font(.caption2.monospacedDigit())
                    .foregroundStyle(.tertiary)
            }
        }
        .frame(maxWidth: .infinity)
        .monitorCard()
    }

    // MARK: - Requests Card

    /// Total request count with a line for the number currently active.
    private var requestsCard: some View {
        VStack(spacing: 8) {
            sectionTitle("Requests")

            Text("\(stats.totalRequests)")
                .font(.title3.monospacedDigit().bold())

            if stats.activeRequests > 0 {
                Text("\(stats.activeRequests) active")
                    .font(.caption2)
                    .foregroundStyle(.green)
            } else {
                Text("none active")
                    .font(.caption2)
                    .foregroundStyle(.tertiary)
            }
        }
        .frame(maxWidth: .infinity)
        .monitorCard()
    }

    // MARK: - Cumulative

    /// Cumulative prompt, generated and combined token totals.
    private var cumulativeSection: some View {
        HStack(spacing: 24) {
            cumulativeStat("Total Prompt Tokens", count: stats.totalPromptTokens, tint: .blue)
            cumulativeStat("Total Generated Tokens", count: stats.totalGenerationTokens, tint: .orange)
            cumulativeStat("Total Tokens", count: stats.totalPromptTokens + stats.totalGenerationTokens)
        }
        .frame(maxWidth: .infinity)
        .monitorCard()
    }

    /// A captioned figure; the figure is tinted only when `tint` is given.
    private func cumulativeStat(_ title: LocalizedStringKey, count: Int, tint: Color? = nil) -> some View {
        VStack(spacing: 2) {
            Text(title)
                .font(.caption2)
                .foregroundStyle(.secondary)

            let figure = Text(formatTokenCount(count))
                .font(.callout.monospacedDigit().bold())
            if let tint {
                figure.foregroundStyle(tint)
            } else {
                figure
            }
        }
    }

    // MARK: - Helpers

    /// Small bold secondary-colored heading used at the top of each card.
    private func sectionTitle(_ title: LocalizedStringKey) -> some View {
        Text(title)
            .font(.caption.bold())
            .foregroundStyle(.secondary)
    }

    /// Abbreviates a token count: "1.5M" from one million, "1.5k" from one
    /// thousand, the plain number below that.
    private func formatTokenCount(_ count: Int) -> String {
        switch count {
        case 1_000_000...:
            return String(format: "%.1fM", Double(count) / 1_000_000)
        case 1_000...:
            return String(format: "%.1fk", Double(count) / 1_000)
        default:
            return "\(count)"
        }
    }
}

private extension View {
    /// Pads the view and places it on the dashboard's rounded material card.
    func monitorCard() -> some View {
        padding(12)
            .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
    }

    /// Axis styling shared by the monitor charts: a grid line every 30 seconds
    /// on the x axis, and leading whole-number labels on the y axis.
    func monitorChartAxes() -> some View {
        chartXAxis {
            AxisMarks(values: .stride(by: .second, count: 30)) { _ in
                AxisGridLine()
            }
        }
        .chartYAxis {
            AxisMarks(position: .leading) { value in
                AxisGridLine()
                AxisValueLabel {
                    if let v = value.as(Double.self) {
                        Text(String(format: "%.0f", v))
                            .font(.caption2.monospacedDigit())
                    }
                }
            }
        }
    }
}