diff --git a/MLXServer.xcodeproj/project.pbxproj b/MLXServer.xcodeproj/project.pbxproj
index ceb748b..2c00ae2 100644
--- a/MLXServer.xcodeproj/project.pbxproj
+++ b/MLXServer.xcodeproj/project.pbxproj
@@ -11,6 +11,7 @@
 		165E8AB6ADAE1D59B1A86420 /* Preferences.swift in Sources */ = {isa = PBXBuildFile; fileRef = 145B888FBDD4F931512C5473 /* Preferences.swift */; };
 		189362AAE2CDE5D4B3428334 /* ToolCallParser.swift in Sources */ = {isa = PBXBuildFile; fileRef = E73B165A1822729C907791AE /* ToolCallParser.swift */; };
 		2CAAF7129F7CC45200FA9F6B /* ModelPickerView.swift in Sources */ = {isa = PBXBuildFile; fileRef = C3C3A76C02AF70A9D8F868FC /* ModelPickerView.swift */; };
+		2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */ = {isa = PBXBuildFile; fileRef = E35452B166893B25E765FF70 /* InferenceStats.swift */; };
 		4CB13DC1AC7A500DDBB443EC /* ChatInputView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */; };
 		50B6861FF8610B3ED4FFAD9D /* MLXServerApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = C67742651DB486871CEF1612 /* MLXServerApp.swift */; };
 		50DD129CCF2843482DEC3B96 /* APIServer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3D08828E16B17EF02C14243E /* APIServer.swift */; };
@@ -22,6 +23,7 @@
 		80646C5066BF79BC76E1D9D7 /* ModelConfig.swift in Sources */ = {isa = PBXBuildFile; fileRef = 38DFC212AF4359A45FBE22BA /* ModelConfig.swift */; };
 		84D32315B418B5243E017350 /* ToolPromptBuilder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */; };
 		945474365D0B3E961811909A /* MLXVLM in Frameworks */ = {isa = PBXBuildFile; productRef = D5E8E1C2DD8D8AABB4306193 /* MLXVLM */; };
+		B1D9BC407DB7DB1489230C20 /* MonitorView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4239CFF94B819C35A8D4D617 /* MonitorView.swift */; };
 		B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = B8BD93859F0291F1A3E09DA5 /* ChatViewModel.swift */; };
 		B6D3662995B885C102876B4A /* MLXLMCommon in Frameworks */ = {isa = PBXBuildFile; productRef = 9090667D4134056AE66DC2F1 /* MLXLMCommon */; };
 		D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
@@ -38,6 +40,7 @@
 		3AF462805202797F61422AEE /* MLXServer.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = MLXServer.entitlements; sourceTree = "<group>"; };
 		3D08828E16B17EF02C14243E /* APIServer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServer.swift; sourceTree = "<group>"; };
 		4147321383E94E9F17A0154E /* SettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsView.swift; sourceTree = "<group>"; };
+		4239CFF94B819C35A8D4D617 /* MonitorView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MonitorView.swift; sourceTree = "<group>"; };
 		6EE59189918D06B8D2F588FC /* MLXServer.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MLXServer.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		922CBDC9206737BD04AF2874 /* ModelManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelManager.swift; sourceTree = "<group>"; };
 		944C699FBB76C734C9DF2F2E /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
@@ -49,6 +52,7 @@
 		C67742651DB486871CEF1612 /* MLXServerApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MLXServerApp.swift; sourceTree = "<group>"; };
 		D733A0D1D4AC25DDDA6C8684 /* LocalModelResolver.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LocalModelResolver.swift; sourceTree = "<group>"; };
 		DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessagesView.swift; sourceTree = "<group>"; };
+		E35452B166893B25E765FF70 /* InferenceStats.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceStats.swift; sourceTree = "<group>"; };
 		E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatInputView.swift; sourceTree = "<group>"; };
 		E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
 		F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
@@ -108,6 +112,7 @@
 				E5E6AD02CDF23BDAB64700A7 /* ChatInputView.swift */,
 				DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */,
 				C3C3A76C02AF70A9D8F868FC /* ModelPickerView.swift */,
+				4239CFF94B819C35A8D4D617 /* MonitorView.swift */,
 				4147321383E94E9F17A0154E /* SettingsView.swift */,
 				B0EAB35D7130D56B9E7484BA /* StatusBarView.swift */,
 			);
@@ -118,6 +123,7 @@
 			isa = PBXGroup;
 			children = (
 				A4B359324B5FD8D106C74338 /* ChatMessage.swift */,
+				E35452B166893B25E765FF70 /* InferenceStats.swift */,
 				38DFC212AF4359A45FBE22BA /* ModelConfig.swift */,
 			);
 			path = Models;
@@ -234,11 +240,13 @@
 				5C1E8FE1C521914CEF98D3AA /* ChatMessagesView.swift in Sources */,
 				B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
 				5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
+				2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */,
 				6828CCA8B78AB40906F87CAB /* LocalModelResolver.swift in Sources */,
 				50B6861FF8610B3ED4FFAD9D /* MLXServerApp.swift in Sources */,
 				80646C5066BF79BC76E1D9D7 /* ModelConfig.swift in Sources */,
 				0168AEE16009097901363E16 /* ModelManager.swift in Sources */,
 				2CAAF7129F7CC45200FA9F6B /* ModelPickerView.swift in Sources */,
+				B1D9BC407DB7DB1489230C20 /* MonitorView.swift in Sources */,
 				165E8AB6ADAE1D59B1A86420 /* Preferences.swift in Sources */,
 				D666A311788375E8A061C832 /* SettingsView.swift in Sources */,
 				621B7E4382199AC1378F5F9C /* StatusBarView.swift in Sources */,
diff --git a/MLXServer/ContentView.swift b/MLXServer/ContentView.swift
index 62ced89..4b1d67a 100644
--- a/MLXServer/ContentView.swift
+++ b/MLXServer/ContentView.swift
@@ -4,86 +4,110 @@ struct ContentView: View {
     @Environment(ModelManager.self) private var modelManager
     @State private var chatVM: ChatViewModel?
     @State private var showLoadError = false
+    @State private var showMonitor = false
 
     var body: some View {
-        Group {
-            if let chatVM {
-                ChatView(viewModel: chatVM)
-            } else {
-                ProgressView("Initializing…")
-            }
-        }
-        .navigationTitle(modelManager.currentModel?.displayName ?? "MLX Server")
-        .onAppear {
-            if chatVM == nil {
-                chatVM = ChatViewModel(modelManager: modelManager)
-                // Auto-start API server if configured
-                if Preferences.apiAutoStart {
-                    chatVM?.startAPIServer()
-                }
-            }
-        }
-        .onChange(of: modelManager.currentModel) {
-            chatVM?.resetSession()
-            // Persist last used model
-            if let id = modelManager.currentModel?.id {
-                Preferences.lastModelId = id
-            }
-        }
-        .onChange(of: modelManager.errorMessage) {
-            showLoadError = modelManager.errorMessage != nil
-        }
-        .alert("Model Error", isPresented: $showLoadError) {
-            Button("Retry") {
-                if let config = modelManager.currentModel ?? ModelConfig.availableModels.first {
-                    Task { await modelManager.loadModel(config) }
-                }
-            }
-            Button("Cancel", role: .cancel) {
-                modelManager.errorMessage = nil
-            }
-        } message: {
-            Text(modelManager.errorMessage ?? "Unknown error loading model.")
-        }
-        .toolbar {
-            ToolbarItem(placement: .principal) {
-                ModelPickerView()
-            }
-
-            ToolbarItemGroup(placement: .primaryAction) {
-                // API server toggle
-                Button {
-                    if let chatVM {
-                        if chatVM.apiServer.isRunning {
-                            chatVM.stopAPIServer()
-                        } else {
-                            chatVM.startAPIServer()
-                        }
+        mainContent
+            .navigationTitle(modelManager.currentModel?.displayName ?? "MLX Server")
+            .onAppear {
+                if chatVM == nil {
+                    chatVM = ChatViewModel(modelManager: modelManager)
+                    // Auto-start API server if configured
+                    if Preferences.apiAutoStart {
+                        chatVM?.startAPIServer()
                     }
-                } label: {
-                    // Running → solid globe (green tint), click to stop
-                    // Stopped → slashed globe, click to start
-                    Label(
-                        chatVM?.apiServer.isRunning == true ? "Stop API" : "Start API",
-                        systemImage: chatVM?.apiServer.isRunning == true ? "network" : "network.slash"
-                    )
-                    .foregroundStyle(chatVM?.apiServer.isRunning == true ? .green : .secondary)
                 }
-                .help(chatVM?.apiServer.isRunning == true ? "API server running on port \(Preferences.apiPort) — click to stop" : "Click to start API server")
-
-                // New conversation
-                Button {
-                    chatVM?.newConversation()
-                } label: {
-                    Label("New Chat", systemImage: "plus.message")
-                }
-                .keyboardShortcut("n", modifiers: .command)
             }
+            .onChange(of: modelManager.currentModel) {
+                chatVM?.resetSession()
+                // Persist last used model
+                if let id = modelManager.currentModel?.id {
+                    Preferences.lastModelId = id
+                }
+            }
+            .onChange(of: modelManager.errorMessage) {
+                showLoadError = modelManager.errorMessage != nil
+            }
+            .alert("Model Error", isPresented: $showLoadError) {
+                Button("Retry") {
+                    if let config = modelManager.currentModel ?? ModelConfig.availableModels.first {
+                        Task { await modelManager.loadModel(config) }
+                    }
+                }
+                Button("Cancel", role: .cancel) {
+                    modelManager.errorMessage = nil
+                }
+            } message: {
+                Text(modelManager.errorMessage ?? "Unknown error loading model.")
+            }
+            .toolbar {
+                ToolbarItem(placement: .principal) {
+                    ModelPickerView()
+                }
+                ToolbarItemGroup(placement: .primaryAction) {
+                    toolbarButtons
+                }
+            }
+            // Cmd+1/2/3 model switching
+            .background {
+                modelSwitchShortcuts
+            }
+    }
+
+    @ViewBuilder
+    private var mainContent: some View {
+        if let chatVM {
+            if showMonitor {
+                MonitorView(stats: chatVM.apiServer.inferenceStats)
+            } else {
+                ChatView(viewModel: chatVM)
+            }
+        } else {
+            ProgressView("Initializing…")
         }
-        // Cmd+1/2/3 model switching
-        .background {
-            modelSwitchShortcuts
+    }
+
+    @ViewBuilder
+    private var toolbarButtons: some View {
+        // API server toggle
+        let isRunning = chatVM?.apiServer.isRunning == true
+        Button {
+            if let chatVM {
+                if chatVM.apiServer.isRunning {
+                    chatVM.stopAPIServer()
+                } else {
+                    chatVM.startAPIServer()
+                }
+            }
+        } label: {
+            Label(
+                isRunning ? "Stop API" : "Start API",
+                systemImage: isRunning ? "network" : "network.slash"
+            )
+            .foregroundStyle(isRunning ? .green : .secondary)
         }
+        .help(isRunning ? "API server running on port \(Preferences.apiPort) — click to stop" : "Click to start API server")
+
+        // Monitor toggle
+        Button {
+            showMonitor.toggle()
+        } label: {
+            Label(
+                showMonitor ? "Chat" : "Monitor",
+                systemImage: showMonitor ? "bubble.left.and.text.bubble.right" : "chart.xyaxis.line"
+            )
+            .foregroundStyle(showMonitor ? Color.accentColor : Color.secondary)
+        }
+        .help(showMonitor ? "Switch to chat" : "Show inference monitor")
+        .keyboardShortcut("m", modifiers: [.command, .shift])
+
+        // New conversation
+        Button {
+            chatVM?.newConversation()
+        } label: {
+            Label("New Chat", systemImage: "plus.message")
+        }
+        .keyboardShortcut("n", modifiers: .command)
     }
 
     @ViewBuilder
diff --git a/MLXServer/Models/InferenceStats.swift b/MLXServer/Models/InferenceStats.swift
new file mode 100644
index 0000000..5efc6a5
--- /dev/null
+++ b/MLXServer/Models/InferenceStats.swift
@@ -0,0 +1,211 @@
+import Foundation
+
+/// Lightweight stats collector for inference activity visualization.
+///
+/// Callers report request lifecycle events (`requestStarted`, `prefillCompleted`,
+/// `tokenGenerated`, `requestCompleted`); while sampling is active, a 1 Hz timer
+/// turns the counters into bounded time series for charts.
+/// All mutations happen on @MainActor to avoid locks.
+///
+/// The `current*` fields and the phase flags are single-valued: they describe the
+/// most recently reported request, so overlapping requests share them.
+@Observable
+@MainActor
+final class InferenceStats {
+    // MARK: - Current request state
+
+    /// Requests that have started but not yet completed.
+    var activeRequests: Int = 0
+    /// Prompt size of the current request; 0 until `prefillCompleted` reports it.
+    var currentPromptTokens: Int = 0
+    /// Tokens generated so far by the current request.
+    var currentGenerationTokens: Int = 0
+    var isGenerating: Bool = false
+    var isPrefilling: Bool = false
+    /// Latest generation speed: reported by the caller, or estimated from wall-clock time.
+    var currentTokensPerSecond: Double = 0
+    /// Prompt plus generated tokens of the current request.
+    var contextUsed: Int = 0
+    /// Context length passed to the latest `requestStarted`.
+    var contextMax: Int = 0
+
+    // MARK: - Cumulative counters
+
+    var totalRequests: Int = 0
+    var totalPromptTokens: Int = 0
+    var totalGenerationTokens: Int = 0
+
+    // MARK: - Time series data (ring buffers for charts)
+
+    /// One chart sample.
+    struct DataPoint: Identifiable {
+        let id = UUID()
+        let timestamp: Date
+        let value: Double
+    }
+
+    private(set) var tokenRateHistory: [DataPoint] = []
+    private(set) var promptTokenHistory: [DataPoint] = []
+    private(set) var generationTokenHistory: [DataPoint] = []
+
+    private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
+
+    // MARK: - Periodic sampling
+
+    private var sampleTimer: Timer?
+    /// Generated-token count (completed plus in-flight) seen by the previous sample.
+    private var lastGenerationTokenCount: Int = 0
+    /// `totalPromptTokens` seen by the previous sample.
+    private var lastPromptTokenCount: Int = 0
+    /// When the current request reported its first token; `nil` before that.
+    private var generationStartTime: Date?
+
+    /// Starts the 1 Hz sampling timer. Does nothing if it is already running.
+    func startSampling() {
+        guard sampleTimer == nil else { return }
+        sampleTimer = Timer.scheduledTimer(withTimeInterval: 1.0, repeats: true) { [weak self] _ in
+            // The timer block is not actor-isolated, so hop to the main actor explicitly.
+            Task { @MainActor in
+                self?.recordSample()
+            }
+        }
+    }
+
+    /// Stops the sampling timer; recorded history is kept.
+    func stopSampling() {
+        sampleTimer?.invalidate()
+        sampleTimer = nil
+    }
+
+    /// Appends one sample to every history series.
+    private func recordSample() {
+        let now = Date.now
+
+        // Count the in-flight request too, so generation shows up while it streams
+        // instead of as one spike when `requestCompleted` folds it into the total.
+        let inFlight = activeRequests > 0 ? currentGenerationTokens : 0
+        let generatedSoFar = totalGenerationTokens + inFlight
+        // The running count can step back (e.g. a failed request completes with 0
+        // tokens); clamp so that never becomes a negative bar.
+        let genDelta = max(0, generatedSoFar - lastGenerationTokenCount)
+        let promptDelta = totalPromptTokens - lastPromptTokenCount
+        lastGenerationTokenCount = generatedSoFar
+        lastPromptTokenCount = totalPromptTokens
+
+        Self.append(DataPoint(timestamp: now, value: currentTokensPerSecond), to: &tokenRateHistory)
+        Self.append(DataPoint(timestamp: now, value: Double(genDelta)), to: &generationTokenHistory)
+        Self.append(DataPoint(timestamp: now, value: Double(promptDelta)), to: &promptTokenHistory)
+    }
+
+    /// Appends `point` and drops the oldest entries beyond `maxHistoryPoints`.
+    private static func append(_ point: DataPoint, to history: inout [DataPoint]) {
+        history.append(point)
+        let overflow = history.count - maxHistoryPoints
+        if overflow > 0 {
+            history.removeFirst(overflow)
+        }
+    }
+
+    // MARK: - Event recording (called from APIServer)
+
+    /// Registers a new request and resets the per-request fields.
+    /// - Parameter contextLength: Context size for this request, exposed as `contextMax`.
+    func requestStarted(contextLength: Int) {
+        activeRequests += 1
+        totalRequests += 1
+        isPrefilling = true
+        isGenerating = false
+        currentPromptTokens = 0
+        currentGenerationTokens = 0
+        currentTokensPerSecond = 0
+        contextMax = contextLength
+        contextUsed = 0
+        generationStartTime = nil
+    }
+
+    /// Records the prompt size and marks the request as generating.
+    ///
+    /// Callers may report this after tokens have already been counted, so the
+    /// tokens generated so far stay included in `contextUsed`.
+    /// - Parameter promptTokens: Number of prompt tokens processed for this request.
+    func prefillCompleted(promptTokens: Int) {
+        isPrefilling = false
+        isGenerating = true
+        currentPromptTokens = promptTokens
+        totalPromptTokens += promptTokens
+        contextUsed = promptTokens + currentGenerationTokens
+    }
+
+    /// Records generation progress for the current request.
+    ///
+    /// The first token after `requestStarted` ends the prefill phase, so the
+    /// monitor shows "generating" while tokens stream rather than only once the
+    /// prompt size is reported.
+    /// - Parameters:
+    ///   - tokensPerSecond: Measured speed. A non-positive value means "unknown";
+    ///     the speed is then estimated from the time elapsed since the first token.
+    ///   - totalGenerated: Tokens generated so far by this request.
+    func tokenGenerated(tokensPerSecond: Double, totalGenerated: Int) {
+        let now = Date.now
+        let firstTokenTime = generationStartTime ?? now
+        if generationStartTime == nil {
+            generationStartTime = now
+        }
+
+        if isPrefilling {
+            isPrefilling = false
+            isGenerating = true
+        }
+
+        currentGenerationTokens = totalGenerated
+        contextUsed = currentPromptTokens + totalGenerated
+
+        if tokensPerSecond > 0 {
+            currentTokensPerSecond = tokensPerSecond
+        } else {
+            // Tokens after the first one, divided by the time they took to arrive.
+            let elapsed = now.timeIntervalSince(firstTokenTime)
+            if elapsed > 0, totalGenerated > 1 {
+                currentTokensPerSecond = Double(totalGenerated - 1) / elapsed
+            }
+        }
+    }
+
+    /// Marks one request as finished and adds its generated tokens to the total.
+    ///
+    /// NOTE(review): each call decrements `activeRequests` and adds to the total, so
+    /// callers should report every request exactly once — confirm the error paths do.
+    /// - Parameters:
+    ///   - promptTokens: Not used here; prompt tokens are accumulated by `prefillCompleted`.
+    ///   - generationTokens: Tokens generated by the finished request.
+    func requestCompleted(promptTokens: Int, generationTokens: Int) {
+        activeRequests = max(0, activeRequests - 1)
+        totalGenerationTokens += generationTokens
+        if activeRequests == 0 {
+            isGenerating = false
+            isPrefilling = false
+            currentTokensPerSecond = 0
+        }
+    }
+
+    /// Clears all counters and history. The sampling timer is left as it is.
+    func reset() {
+        activeRequests = 0
+        currentPromptTokens = 0
+        currentGenerationTokens = 0
+        isGenerating = false
+        isPrefilling = false
+        currentTokensPerSecond = 0
+        contextUsed = 0
+        contextMax = 0
+        totalRequests = 0
+        totalPromptTokens = 0
+        totalGenerationTokens = 0
+        tokenRateHistory.removeAll()
+        promptTokenHistory.removeAll()
+        generationTokenHistory.removeAll()
+        lastGenerationTokenCount = 0
+        lastPromptTokenCount = 0
+        generationStartTime = nil
+    }
+}
diff --git a/MLXServer/Server/APIServer.swift b/MLXServer/Server/APIServer.swift
index 5944842..17e7d07 100644
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -11,6 +11,7 @@ final class APIServer {
     var isRunning = false
     var port: Int = 1234
     var requestCount: Int = 0
+    let inferenceStats = InferenceStats()
 
     private var listener: NWListener?
     private var modelManager: ModelManager?
@@ -54,6 +55,7 @@ final class APIServer {
             }
 
             listener?.start(queue: .global(qos: .userInitiated))
+            inferenceStats.startSampling()
         } catch {
             print("[APIServer] Failed to start: \(error)")
         }
@@ -66,6 +68,7 @@ final class APIServer {
         cachedSession = nil
         cachedMessages = nil
         cachedModelId = nil
+        inferenceStats.stopSampling()
     }
 
     // MARK: - Connection handling
@@ -341,6 +344,8 @@ final class APIServer {
         // Extract images from the last message only (ChatSession.streamDetails takes images separately)
         let lastImages = lastMessage.images
 
+        inferenceStats.requestStarted(contextLength: contextLength)
+
         if isStream {
             await handleStreamingResponse(
                 connection: connection,
@@ -421,14 +426,22 @@ final class APIServer {
                 switch generation {
                 case .chunk(let text):
                     fullText += text
+                    completionTokens += 1
+                    inferenceStats.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
                 case .info(let info):
                     promptTokens = info.promptTokenCount
                     completionTokens = info.generationTokenCount
+                    inferenceStats.prefillCompleted(promptTokens: promptTokens)
+                    if info.tokensPerSecond > 0 {
+                        inferenceStats.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
+                    }
                 case .toolCall(let call):
                     frameworkToolCalls.append(call)
                 }
             }
 
+            inferenceStats.requestCompleted(promptTokens: promptTokens, generationTokens: completionTokens)
+
             // Parse tool calls: first check framework-detected ones, then our own text parser
             var finishReason = "stop"
             var responseContent: String? = fullText
@@ -499,6 +512,7 @@ final class APIServer {
                 sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
             }
         } catch {
+            inferenceStats.requestCompleted(promptTokens: 0, generationTokens: 0)
             sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
         }
     }
@@ -564,6 +578,7 @@ final class APIServer {
                 case .chunk(let text):
                     completionTokens += 1
                     fullText += text
+                    inferenceStats.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
 
                     if !bufferForTools {
                         sendSSEEvent(connection: connection, chunk: APIChatCompletionChunk(
@@ -579,12 +594,17 @@ final class APIServer {
                 case .info(let info):
                     promptTokens = info.promptTokenCount
                     completionTokens = info.generationTokenCount
+                    inferenceStats.prefillCompleted(promptTokens: promptTokens)
+                    if info.tokensPerSecond > 0 {
+                        inferenceStats.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
+                    }
 
                 case .toolCall(let call):
                     frameworkToolCalls.append(call)
                 }
             }
         } catch {
+            inferenceStats.requestCompleted(promptTokens: promptTokens, generationTokens: completionTokens)
             let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
             connection.send(content: errorEvent.data(using: .utf8), completion: .contentProcessed({ _ in }))
         }
@@ -687,6 +707,8 @@ final class APIServer {
             )
         ))
 
+        inferenceStats.requestCompleted(promptTokens: promptTokens, generationTokens: completionTokens)
+
         // Send [DONE] and close
         let done = "data: [DONE]\n\n"
         connection.send(content: done.data(using: .utf8), completion: .contentProcessed({ _ in
diff --git a/MLXServer/Views/ChatMessagesView.swift b/MLXServer/Views/ChatMessagesView.swift
index 8f22d0e..d7476ec 100644
--- a/MLXServer/Views/ChatMessagesView.swift
+++ b/MLXServer/Views/ChatMessagesView.swift
@@ -7,7 +7,7 @@ struct ChatMessagesView: View {
     var body: some View {
         ScrollViewReader { proxy in
             ScrollView {
-                LazyVStack(alignment: .leading, spacing: 12) {
+                VStack(alignment: .leading, spacing: 12) {
                     if viewModel.conversation.messages.isEmpty {
                         emptyState
                     } else {
@@ -16,14 +16,20 @@ struct ChatMessagesView: View {
                                 .id(message.id)
                         }
                     }
+                    Color.clear
+                        .frame(height: 1)
+                        .id("bottom")
                 }
                 .padding()
             }
             .onChange(of: viewModel.conversation.messages.last?.content) {
-                scrollToBottom(proxy: proxy)
+                // During streaming, scroll without animation to avoid overlapping animations
+                proxy.scrollTo("bottom", anchor: .bottom)
             }
             .onChange(of: viewModel.conversation.messages.count) {
-                scrollToBottom(proxy: proxy)
+                withAnimation(.easeOut(duration: 0.2)) {
+                    proxy.scrollTo("bottom", anchor: .bottom)
+                }
             }
         }
     }
@@ -47,13 +53,6 @@ struct ChatMessagesView: View {
         .frame(maxWidth: .infinity, minHeight: 300)
     }
 
-    private func scrollToBottom(proxy: ScrollViewProxy) {
-        if let lastId = viewModel.conversation.messages.last?.id {
-            withAnimation(.easeOut(duration: 0.2)) {
-                proxy.scrollTo(lastId, anchor: .bottom)
-            }
-        }
-    }
 }
 
 struct MessageBubbleView: View {
diff --git a/MLXServer/Views/MonitorView.swift b/MLXServer/Views/MonitorView.swift
new file mode 100644
index 0000000..505110b
--- /dev/null
+++ b/MLXServer/Views/MonitorView.swift
@@ -0,0 +1,352 @@
+import Charts
+import MLX
+import SwiftUI
+
+/// Real-time inference monitoring dashboard, shown in place of the chat UI.
+struct MonitorView: View {
+    let stats: InferenceStats
+    @Environment(ModelManager.self) private var modelManager
+
+    var body: some View {
+        ScrollView {
+            VStack(spacing: 20) {
+                // Live status header: activity dot and label, current speed, per-request token counts
+                liveStatusSection
+
+                // Charts: generation speed and token throughput, side by side
+                HStack(alignment: .top, spacing: 16) {
+                    tokenRateChart
+                    tokenThroughputChart
+                }
+
+                // Gauges row: context, GPU memory, requests
+                HStack(spacing: 16) {
+                    contextGauge
+                    gpuMemoryGauge
+                    requestsCard
+                }
+
+                // Cumulative stats
+                cumulativeSection
+            }
+            .padding(20)
+        }
+        .frame(maxWidth: .infinity, maxHeight: .infinity) // take all space offered by the parent
+        .background(.background)
+    }
+
+    // MARK: - Live Status
+    /// Header card: status dot with label, tok/s while generating, and prompt/generated token counts.
+    @ViewBuilder
+    private var liveStatusSection: some View {
+        HStack(spacing: 16) {
+            // Activity indicator: dot tinted by `activityColor`, followed by `activityLabel`
+            HStack(spacing: 8) {
+                Circle()
+                    .fill(activityColor)
+                    .frame(width: 10, height: 10)
+                    .overlay {
+                        if stats.isGenerating || stats.isPrefilling { // halo ring only during prefill or generation
+                            Circle()
+                                .stroke(activityColor.opacity(0.5), lineWidth: 2)
+                                .scaleEffect(1.8)
+                                .opacity(0.6)
+                        }
+                    }
+
+                Text(activityLabel)
+                    .font(.headline)
+            }
+
+            Spacer()
+
+            if stats.isGenerating { // speed readout is shown only while generating
+                Text(String(format: "%.1f tok/s", stats.currentTokensPerSecond))
+                    .font(.title2.monospacedDigit().bold())
+                    .foregroundStyle(.green)
+            }
+
+            if stats.currentPromptTokens > 0 { // counters appear once a prompt size has been recorded
+                HStack(spacing: 4) {
+                    Image(systemName: "arrow.down.circle.fill") // prompt tokens
+                        .foregroundStyle(.blue)
+                    Text("\(stats.currentPromptTokens)")
+                        .monospacedDigit()
+                    Image(systemName: "arrow.up.circle.fill") // generated tokens
+                        .foregroundStyle(.orange)
+                    Text("\(stats.currentGenerationTokens)")
+                        .monospacedDigit()
+                }
+                .font(.callout)
+            }
+        }
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    /// Tint for the status dot; phases are checked in priority order.
+    private var activityColor: Color {
+        guard !stats.isPrefilling else { return .blue }
+        guard !stats.isGenerating else { return .green }
+        return stats.activeRequests > 0 ? .orange : .secondary
+    }
+
+    /// Text shown next to the status dot; phases are checked in priority order.
+    private var activityLabel: String {
+        guard !stats.isPrefilling else { return "Prefilling" }
+        guard !stats.isGenerating else { return "Generating" }
+        return stats.activeRequests > 0 ? "Processing" : "Idle"
+    }
+
+    // MARK: - Token Rate Chart
+    /// Card with a line-plus-area chart of `stats.tokenRateHistory` (generation speed over time).
+    @ViewBuilder
+    private var tokenRateChart: some View {
+        VStack(alignment: .leading, spacing: 6) {
+            Text("Generation Speed (tok/s)")
+                .font(.caption.bold())
+                .foregroundStyle(.secondary)
+
+            Chart(stats.tokenRateHistory) { point in
+                LineMark(
+                    x: .value("Time", point.timestamp),
+                    y: .value("tok/s", point.value)
+                )
+                .foregroundStyle(.green)
+                .interpolationMethod(.monotone)
+
+                AreaMark( // faint fill under the line, same data
+                    x: .value("Time", point.timestamp),
+                    y: .value("tok/s", point.value)
+                )
+                .foregroundStyle(.green.opacity(0.1))
+                .interpolationMethod(.monotone)
+            }
+            .chartXAxis {
+                AxisMarks(values: .stride(by: .second, count: 30)) { _ in // grid line every 30 s, no labels
+                    AxisGridLine()
+                }
+            }
+            .chartYAxis {
+                AxisMarks(position: .leading) { value in
+                    AxisGridLine()
+                    AxisValueLabel {
+                        if let v = value.as(Double.self) {
+                            Text(String(format: "%.0f", v))
+                                .font(.caption2.monospacedDigit())
+                        }
+                    }
+                }
+            }
+            .chartYScale(domain: 0...(maxTokenRate + 1)) // +1 adds headroom and keeps the domain non-empty when all samples are 0
+            .frame(height: 150)
+        }
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    /// Highest sampled generation speed, or 10 when no samples exist yet.
+    /// Feeds the upper bound of the rate chart's Y scale.
+    private var maxTokenRate: Double { stats.tokenRateHistory.max(by: { $0.value < $1.value })?.value ?? 10 }
+
+    // MARK: - Token Throughput Chart
+    /// Card with a bar chart of `stats.promptTokenHistory` (blue) and `stats.generationTokenHistory` (orange).
+    @ViewBuilder
+    private var tokenThroughputChart: some View {
+        VStack(alignment: .leading, spacing: 6) {
+            Text("Token Throughput (/sec)")
+                .font(.caption.bold())
+                .foregroundStyle(.secondary)
+
+            Chart {
+                ForEach(stats.promptTokenHistory) { point in
+                    BarMark(
+                        x: .value("Time", point.timestamp),
+                        y: .value("Tokens", point.value)
+                    )
+                    .foregroundStyle(.blue.opacity(0.7))
+                }
+                ForEach(stats.generationTokenHistory) { point in
+                    BarMark(
+                        x: .value("Time", point.timestamp),
+                        y: .value("Tokens", point.value)
+                    )
+                    .foregroundStyle(.orange.opacity(0.7))
+                }
+            }
+            .chartXAxis {
+                AxisMarks(values: .stride(by: .second, count: 30)) { _ in // grid line every 30 s, no labels
+                    AxisGridLine()
+                }
+            }
+            .chartYAxis {
+                AxisMarks(position: .leading) { value in
+                    AxisGridLine()
+                    AxisValueLabel {
+                        if let v = value.as(Double.self) {
+                            Text(String(format: "%.0f", v))
+                                .font(.caption2.monospacedDigit())
+                        }
+                    }
+                }
+            }
+            .frame(height: 150)
+
+            // Legend, built by hand with the same colors as the two bar series
+            HStack(spacing: 12) {
+                Label("Prompt", systemImage: "circle.fill")
+                    .font(.caption2)
+                    .foregroundStyle(.blue)
+                Label("Generation", systemImage: "circle.fill")
+                    .font(.caption2)
+                    .foregroundStyle(.orange)
+            }
+        }
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    // MARK: - Context Gauge
+
+    /// Card with a circular gauge of context usage: used tokens in the centre,
+    /// 0 and the context limit at the ends, and the percentage underneath.
+    @ViewBuilder
+    private var contextGauge: some View {
+        // The limit is the larger of the value reported by the stats and the
+        // current model's context length (0 when no model is set).
+        let modelLimit = modelManager.currentModel?.contextLength ?? 0
+        let capacity = max(stats.contextMax, modelLimit)
+        let usedTokens = stats.contextUsed
+        // Guard against dividing by zero while no limit is known.
+        let fillFraction = capacity > 0 ? Double(usedTokens) / Double(capacity) : 0
+
+        VStack(spacing: 8) {
+            Text("Context")
+                .font(.caption.bold())
+                .foregroundStyle(.secondary)
+
+            Gauge(value: fillFraction) {
+                EmptyView()
+            } currentValueLabel: {
+                Text(formatTokenCount(usedTokens))
+                    .font(.title3.monospacedDigit().bold())
+            } minimumValueLabel: {
+                Text("0")
+                    .font(.caption2)
+            } maximumValueLabel: {
+                Text(formatTokenCount(capacity))
+                    .font(.caption2)
+            }
+            .gaugeStyle(.accessoryCircular)
+            .scaleEffect(1.3)
+            .tint(contextGradient(ratio: fillFraction))
+
+            Text("\(Int(fillFraction * 100))%")
+                .font(.caption.monospacedDigit())
+                .foregroundStyle(.secondary)
+        }
+        .frame(maxWidth: .infinity)
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    /// Picks the gauge tint for a context-usage ratio.
+    ///
+    /// - Parameter ratio: Used context divided by the context limit.
+    /// - Returns: `.red` above 0.9, `.orange` above 0.7, `.blue` otherwise.
+    private func contextGradient(ratio: Double) -> Color {
+        switch ratio {
+        case let level where level > 0.9:
+            return .red
+        case let level where level > 0.7:
+            return .orange
+        default:
+            return .blue
+        }
+    }
+
+    // MARK: - GPU Memory Gauge
+
+    /// Card showing `MLX.GPU.activeMemory` in MB and, when it is non-zero,
+    /// `MLX.GPU.peakMemory` beneath it.
+    @ViewBuilder
+    private var gpuMemoryGauge: some View {
+        // Both readings are treated as byte counts and scaled to MB.
+        let bytesPerMB: Double = 1_048_576
+        let activeMB = Double(MLX.GPU.activeMemory) / bytesPerMB
+        let peakMB = Double(MLX.GPU.peakMemory) / bytesPerMB
+
+        VStack(spacing: 8) {
+            Text("GPU Memory")
+                .font(.caption.bold())
+                .foregroundStyle(.secondary)
+
+            Text(String(format: "%.0f MB", activeMB))
+                .font(.title3.monospacedDigit().bold())
+
+            // Hide the peak line while no peak has been recorded.
+            if peakMB > 0 {
+                Text(String(format: "Peak: %.0f MB", peakMB))
+                    .font(.caption2.monospacedDigit())
+                    .foregroundStyle(.tertiary)
+            }
+        }
+        .frame(maxWidth: .infinity)
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    // MARK: - Requests Card
+
+    /// Card showing the total request count and how many requests are
+    /// currently active.
+    @ViewBuilder
+    private var requestsCard: some View {
+        let activeCount = stats.activeRequests
+
+        VStack(spacing: 8) {
+            Text("Requests")
+                .font(.caption.bold())
+                .foregroundStyle(.secondary)
+
+            Text("\(stats.totalRequests)")
+                .font(.title3.monospacedDigit().bold())
+
+            // Green count while work is in flight, a muted placeholder otherwise.
+            if activeCount > 0 {
+                Text("\(activeCount) active")
+                    .font(.caption2)
+                    .foregroundStyle(.green)
+            } else {
+                Text("none active")
+                    .font(.caption2)
+                    .foregroundStyle(.tertiary)
+            }
+        }
+        .frame(maxWidth: .infinity)
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    // MARK: - Cumulative
+
+    /// Row of cumulative totals: prompt tokens, generated tokens, and their sum.
+    @ViewBuilder
+    private var cumulativeSection: some View {
+        let promptTotal = stats.totalPromptTokens
+        let generatedTotal = stats.totalGenerationTokens
+
+        HStack(spacing: 24) {
+            // Prompt tokens (blue, matching the prompt series colour).
+            VStack(spacing: 2) {
+                Text("Total Prompt Tokens")
+                    .font(.caption2)
+                    .foregroundStyle(.secondary)
+                Text(formatTokenCount(promptTotal))
+                    .font(.callout.monospacedDigit().bold())
+                    .foregroundStyle(.blue)
+            }
+
+            // Generated tokens (orange, matching the generation series colour).
+            VStack(spacing: 2) {
+                Text("Total Generated Tokens")
+                    .font(.caption2)
+                    .foregroundStyle(.secondary)
+                Text(formatTokenCount(generatedTotal))
+                    .font(.callout.monospacedDigit().bold())
+                    .foregroundStyle(.orange)
+            }
+
+            // Combined total, left in the default foreground style.
+            VStack(spacing: 2) {
+                Text("Total Tokens")
+                    .font(.caption2)
+                    .foregroundStyle(.secondary)
+                Text(formatTokenCount(promptTotal + generatedTotal))
+                    .font(.callout.monospacedDigit().bold())
+            }
+        }
+        .frame(maxWidth: .infinity)
+        .padding(12)
+        .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
+    }
+
+    // MARK: - Helpers
+
+    /// Formats a token count compactly for display.
+    ///
+    /// Counts below 1,000 are shown verbatim, thousands with one decimal and a
+    /// `k` suffix, and millions with one decimal and an `M` suffix.
+    ///
+    /// - Parameter count: The number of tokens to format.
+    /// - Returns: A short string such as `"512"`, `"1.5k"`, or `"2.3M"`.
+    private func formatTokenCount(_ count: Int) -> String {
+        // Switch to millions at 999,950 rather than 1,000,000: from that point
+        // the thousands value rounds to "1000.0k" at one decimal place, whereas
+        // the millions value rounds cleanly to "1.0M".
+        let millionsThreshold = 999_950
+        if count >= millionsThreshold {
+            return String(format: "%.1fM", Double(count) / 1_000_000)
+        } else if count >= 1_000 {
+            return String(format: "%.1fk", Double(count) / 1_000)
+        }
+        return "\(count)"
+    }
+}