diff --git a/MLXServer.xcodeproj/project.pbxproj b/MLXServer.xcodeproj/project.pbxproj
index bc287bd..98b6e78 100644
--- a/MLXServer.xcodeproj/project.pbxproj
+++ b/MLXServer.xcodeproj/project.pbxproj
@@ -28,6 +28,7 @@
 		5C1E8FE1C521914CEF98D3AA /* ChatMessagesView.swift in Sources */ = {isa = PBXBuildFile; fileRef = DB1A5E8B1C9F2BC4D262C53A /* ChatMessagesView.swift */; };
 		621B7E4382199AC1378F5F9C /* StatusBarView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B0EAB35D7130D56B9E7484BA /* StatusBarView.swift */; };
 		67262C5E24739F1FE0011439 /* StreamingSSEEncoder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 615F8A7C9ABCADEB215D31BD /* StreamingSSEEncoder.swift */; };
+		67B815DC3304BF4B2E9974A8 /* LiveCountersTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7E7DF9F68C10C718844B7B01 /* LiveCountersTests.swift */; };
 		6828CCA8B78AB40906F87CAB /* LocalModelResolver.swift in Sources */ = {isa = PBXBuildFile; fileRef = D733A0D1D4AC25DDDA6C8684 /* LocalModelResolver.swift */; };
 		741692862DB1F13EA0B2D14D /* TokenPrefixCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1962D530BEABCC7F1E8E0ED1 /* TokenPrefixCache.swift */; };
 		7CD765C1E2F9F4D7504C8D09 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B629DA084A9A40E54F8EA5FA /* Assets.xcassets */; };
@@ -95,6 +96,7 @@
 		6B3AA91D2C7842D7366F9A41 /* ChatDocumentPackage.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatDocumentPackage.swift; sourceTree = "<group>"; };
 		6EE59189918D06B8D2F588FC /* MLXServer.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MLXServer.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		7C1A89C076E717F87A60397D /* ImageDecoder.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImageDecoder.swift; sourceTree = "<group>"; };
+		7E7DF9F68C10C718844B7B01 /* LiveCountersTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LiveCountersTests.swift; sourceTree = "<group>"; };
 		922CBDC9206737BD04AF2874 /* ModelManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelManager.swift; sourceTree = "<group>"; };
 		944C699FBB76C734C9DF2F2E /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
 		A4B359324B5FD8D106C74338 /* ChatMessage.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatMessage.swift; sourceTree = "<group>"; };
@@ -175,6 +177,7 @@
 				E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */,
 				FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */,
 				E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */,
+				7E7DF9F68C10C718844B7B01 /* LiveCountersTests.swift */,
 				D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */,
 				5F9426FA5A4AC55F8D9C080E /* PromptBuilderTests.swift */,
 				49C383DD5224F3420EB98DB2 /* StreamingSSEEncoderTests.swift */,
@@ -382,6 +385,7 @@
 				CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */,
 				962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */,
 				E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */,
+				67B815DC3304BF4B2E9974A8 /* LiveCountersTests.swift in Sources */,
 				8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */,
 				1FE8C624898960ECCE39C0D4 /* PromptBuilderTests.swift in Sources */,
 				FE4405F66873C75CD6FA19A5 /* StreamingSSEEncoderTests.swift in Sources */,
diff --git a/MLXServer/Models/InferenceStats.swift b/MLXServer/Models/InferenceStats.swift
index 28b93eb..335587b 100644
--- a/MLXServer/Models/InferenceStats.swift
+++ b/MLXServer/Models/InferenceStats.swift
@@ -20,12 +20,16 @@ final class LiveCounters: @unchecked Sendable {
     private var _promptTokens: Int = 0
     private var _generationTokens: Int = 0
     private var _tokensPerSecond: Double = 0
+    private var _prefillTokensPerSecond: Double = 0
+    private var _timeToFirstToken: TimeInterval = 0
     private var _isPrefilling: Bool = false
     private var _isGenerating: Bool = false
     private var _contextMax: Int = 0
     private var _currentPhaseElapsed: TimeInterval = 0
     private var _currentCacheMatchedPromptTokens: Int = 0
     private var _currentCacheRebuiltPromptTokens: Int = 0
+    private var _cacheMatchDepth: Int = 0
+    private var _visionEncoderTime: TimeInterval = 0
 
     // Cumulative
     private var _totalRequests: Int = 0
@@ -37,6 +41,8 @@ final class LiveCounters: @unchecked Sendable {
     private var _totalSessionBuildDuration: TimeInterval = 0
     private var _totalPrefillDuration: TimeInterval = 0
     private var _totalGenerationDuration: TimeInterval = 0
+    private var _totalVisionEncoderDuration: TimeInterval = 0
+    private var _totalDisconnects: Int = 0
 
     func requestStarted(requestId: String, contextLength: Int) {
         let now = Date()
@@ -49,8 +55,16 @@ final class LiveCounters: @unchecked Sendable {
         _promptTokens = 0
         _generationTokens = 0
         _tokensPerSecond = 0
+        _prefillTokensPerSecond = 0
+        _timeToFirstToken = 0
         _contextMax = contextLength
-        requestPhases[requestId] = RequestState(phase: .preparing, phaseStartedAt: now)
+        _cacheMatchDepth = 0
+        _visionEncoderTime = 0
+        requestPhases[requestId] = RequestState(
+            phase: .preparing,
+            phaseStartedAt: now,
+            requestStartedAt: now
+        )
         refreshCurrentPhaseElapsed(now: now)
         lock.unlock()
     }
@@ -61,9 +75,24 @@ final class LiveCounters: @unchecked Sendable {
         if let current = requestPhases[requestId] {
             decrementCount(for: current.phase)
             accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
+            requestPhases[requestId] = RequestState(
+                phase: phase,
+                phaseStartedAt: now,
+                requestStartedAt: current.requestStartedAt,
+                matchedPromptTokens: current.matchedPromptTokens,
+                rebuiltPromptTokens: current.rebuiltPromptTokens,
+                hasRecordedFirstToken: current.hasRecordedFirstToken,
+                disconnectRecorded: current.disconnectRecorded,
+                visionEncoderTime: current.visionEncoderTime
+            )
+        } else {
+            requestPhases[requestId] = RequestState(
+                phase: phase,
+                phaseStartedAt: now,
+                requestStartedAt: now
+            )
         }
         incrementCount(for: phase)
-        requestPhases[requestId] = RequestState(phase: phase, phaseStartedAt: now)
         _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
         _isGenerating = _generatingRequests > 0
         refreshCurrentPhaseElapsed(now: now)
@@ -74,11 +103,19 @@ final class LiveCounters: @unchecked Sendable {
         let now = Date()
         lock.lock()
         if let current = requestPhases[requestId] {
+            let prefillElapsed = max(now.timeIntervalSince(current.phaseStartedAt), 0)
+            _prefillTokensPerSecond = prefillElapsed > 0
+                ? Double(promptTokens) / prefillElapsed
+                : 0
             decrementCount(for: current.phase)
-            accumulateDuration(for: current.phase, elapsed: now.timeIntervalSince(current.phaseStartedAt))
+            accumulateDuration(for: current.phase, elapsed: prefillElapsed)
         }
         incrementCount(for: .generating)
-        requestPhases[requestId] = RequestState(phase: .generating, phaseStartedAt: now)
+        if var state = requestPhases[requestId] {
+            state.phase = .generating
+            state.phaseStartedAt = now
+            requestPhases[requestId] = state
+        }
         _promptTokens = promptTokens
         _totalPromptTokens += promptTokens
         _isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
@@ -87,6 +124,20 @@ final class LiveCounters: @unchecked Sendable {
         lock.unlock()
     }
 
+    func firstTokenGenerated(requestId: String) {
+        let now = Date()
+        lock.lock()
+        guard var state = requestPhases[requestId], !state.hasRecordedFirstToken else {
+            lock.unlock()
+            return
+        }
+
+        state.hasRecordedFirstToken = true
+        requestPhases[requestId] = state
+        _timeToFirstToken = max(now.timeIntervalSince(state.requestStartedAt), 0)
+        lock.unlock()
+    }
+
     func tokenGenerated(tokensPerSecond: Double, totalGenerated: Int) {
         lock.lock()
         _generationTokens = totalGenerated
@@ -106,6 +157,7 @@ final class LiveCounters: @unchecked Sendable {
 
         _totalCacheReusePromptTokens += matched
         _totalCacheRebuildPromptTokens += rebuilt
+        _cacheMatchDepth = matched
 
         state.matchedPromptTokens = matched
         state.rebuiltPromptTokens = rebuilt
@@ -114,6 +166,34 @@ final class LiveCounters: @unchecked Sendable {
         lock.unlock()
     }
 
+    func visionProcessingCompleted(requestId: String, duration: TimeInterval) {
+        let clampedDuration = max(duration, 0)
+        lock.lock()
+        guard var state = requestPhases[requestId] else {
+            lock.unlock()
+            return
+        }
+
+        _visionEncoderTime = clampedDuration
+        _totalVisionEncoderDuration += clampedDuration
+        state.visionEncoderTime = clampedDuration
+        requestPhases[requestId] = state
+        lock.unlock()
+    }
+
+    func disconnectDetected(requestId: String) {
+        lock.lock()
+        guard var state = requestPhases[requestId], !state.disconnectRecorded else {
+            lock.unlock()
+            return
+        }
+
+        state.disconnectRecorded = true
+        requestPhases[requestId] = state
+        _totalDisconnects += 1
+        lock.unlock()
+    }
+
     func requestCompleted(requestId: String, generationTokens: Int) {
         let now = Date()
         lock.lock()
@@ -147,12 +227,16 @@ final class LiveCounters: @unchecked Sendable {
         _promptTokens = 0
         _generationTokens = 0
         _tokensPerSecond = 0
+        _prefillTokensPerSecond = 0
+        _timeToFirstToken = 0
         _isPrefilling = false
         _isGenerating = false
         _contextMax = 0
         _currentPhaseElapsed = 0
         _currentCacheMatchedPromptTokens = 0
         _currentCacheRebuiltPromptTokens = 0
+        _cacheMatchDepth = 0
+        _visionEncoderTime = 0
         _totalRequests = 0
         _totalPromptTokens = 0
         _totalGenerationTokens = 0
@@ -162,6 +246,8 @@ final class LiveCounters: @unchecked Sendable {
         _totalSessionBuildDuration = 0
         _totalPrefillDuration = 0
         _totalGenerationDuration = 0
+        _totalVisionEncoderDuration = 0
+        _totalDisconnects = 0
         lock.unlock()
     }
 
@@ -179,12 +265,16 @@ final class LiveCounters: @unchecked Sendable {
             promptTokens: _promptTokens,
             generationTokens: _generationTokens,
             tokensPerSecond: _tokensPerSecond,
+            prefillTokensPerSecond: _prefillTokensPerSecond,
+            timeToFirstToken: _timeToFirstToken,
             isPrefilling: _isPrefilling,
             isGenerating: _isGenerating,
             contextMax: _contextMax,
             currentPhaseElapsed: _currentPhaseElapsed,
             currentCacheMatchedPromptTokens: _currentCacheMatchedPromptTokens,
             currentCacheRebuiltPromptTokens: _currentCacheRebuiltPromptTokens,
+            cacheMatchDepth: _cacheMatchDepth,
+            visionEncoderTime: _visionEncoderTime,
             totalRequests: _totalRequests,
             totalPromptTokens: _totalPromptTokens,
             totalGenerationTokens: _totalGenerationTokens,
@@ -193,7 +283,9 @@ final class LiveCounters: @unchecked Sendable {
             totalPreparingDuration: _totalPreparingDuration,
             totalSessionBuildDuration: _totalSessionBuildDuration,
             totalPrefillDuration: _totalPrefillDuration,
-            totalGenerationDuration: _totalGenerationDuration
+            totalGenerationDuration: _totalGenerationDuration,
+            totalVisionEncoderDuration: _totalVisionEncoderDuration,
+            totalDisconnects: _totalDisconnects
         )
         lock.unlock()
         return s
@@ -208,12 +300,16 @@ final class LiveCounters: @unchecked Sendable {
         let promptTokens: Int
         let generationTokens: Int
         let tokensPerSecond: Double
+        let prefillTokensPerSecond: Double
+        let timeToFirstToken: TimeInterval
         let isPrefilling: Bool
         let isGenerating: Bool
         let contextMax: Int
         let currentPhaseElapsed: TimeInterval
         let currentCacheMatchedPromptTokens: Int
         let currentCacheRebuiltPromptTokens: Int
+        let cacheMatchDepth: Int
+        let visionEncoderTime: TimeInterval
         let totalRequests: Int
         let totalPromptTokens: Int
         let totalGenerationTokens: Int
@@ -223,6 +319,8 @@ final class LiveCounters: @unchecked Sendable {
         let totalSessionBuildDuration: TimeInterval
         let totalPrefillDuration: TimeInterval
         let totalGenerationDuration: TimeInterval
+        let totalVisionEncoderDuration: TimeInterval
+        let totalDisconnects: Int
     }
 
     private func incrementCount(for phase: RequestPhase) {
@@ -276,8 +374,12 @@ final class LiveCounters: @unchecked Sendable {
     private struct RequestState {
         var phase: RequestPhase
         var phaseStartedAt: Date
+        var requestStartedAt: Date
         var matchedPromptTokens: Int = 0
         var rebuiltPromptTokens: Int = 0
+        var hasRecordedFirstToken: Bool = false
+        var disconnectRecorded: Bool = false
+        var visionEncoderTime: TimeInterval = 0
     }
 
     enum RequestPhase {
@@ -305,11 +407,15 @@ final class InferenceStats {
     var isGenerating: Bool = false
     var isPrefilling: Bool = false
     var currentTokensPerSecond: Double = 0
+    var prefillTokensPerSecond: Double = 0
+    var timeToFirstToken: TimeInterval = 0
     var contextUsed: Int = 0
     var contextMax: Int = 0
     var currentPhaseElapsed: TimeInterval = 0
     var currentCacheMatchedPromptTokens: Int = 0
     var currentCacheRebuiltPromptTokens: Int = 0
+    var cacheMatchDepth: Int = 0
+    var visionEncoderTime: TimeInterval = 0
 
     // MARK: - Cumulative counters
 
@@ -326,6 +432,8 @@ final class InferenceStats {
     var totalSessionBuildDuration: TimeInterval = 0
     var totalPrefillDuration: TimeInterval = 0
     var totalGenerationDuration: TimeInterval = 0
+    var totalVisionEncoderDuration: TimeInterval = 0
+    var totalDisconnects: Int = 0
 
     // MARK: - Cache state
 
@@ -356,6 +464,10 @@ final class InferenceStats {
     private(set) var cacheReusePromptHistory: [DataPoint] = []
     private(set) var cacheRebuildPromptHistory: [DataPoint] = []
     private(set) var cacheMatchQualityHistory: [DataPoint] = []
+    private(set) var ttftHistory: [DataPoint] = []
+    private(set) var prefillSpeedHistory: [DataPoint] = []
+    private(set) var cacheMatchDepthHistory: [DataPoint] = []
+    private(set) var visionTimeHistory: [DataPoint] = []
 
     private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
 
@@ -394,6 +506,8 @@ final class InferenceStats {
         currentPromptTokens = snap.promptTokens
         currentGenerationTokens = snap.generationTokens
         currentTokensPerSecond = snap.tokensPerSecond
+        prefillTokensPerSecond = snap.prefillTokensPerSecond
+        timeToFirstToken = snap.timeToFirstToken
         isPrefilling = snap.isPrefilling
         isGenerating = snap.isGenerating
         contextMax = snap.contextMax
@@ -401,6 +515,8 @@ final class InferenceStats {
         currentPhaseElapsed = snap.currentPhaseElapsed
         currentCacheMatchedPromptTokens = snap.currentCacheMatchedPromptTokens
         currentCacheRebuiltPromptTokens = snap.currentCacheRebuiltPromptTokens
+        cacheMatchDepth = snap.cacheMatchDepth
+        visionEncoderTime = snap.visionEncoderTime
         totalRequests = snap.totalRequests
         totalPromptTokens = snap.totalPromptTokens
         totalGenerationTokens = snap.totalGenerationTokens
@@ -410,6 +526,8 @@ final class InferenceStats {
         totalSessionBuildDuration = snap.totalSessionBuildDuration
         totalPrefillDuration = snap.totalPrefillDuration
         totalGenerationDuration = snap.totalGenerationDuration
+        totalVisionEncoderDuration = snap.totalVisionEncoderDuration
+        totalDisconnects = snap.totalDisconnects
         totalCacheHits = cache.totalHits
         totalCacheMisses = cache.totalMisses
         totalCacheEvictions = cache.totalEvictions
@@ -448,6 +566,10 @@ final class InferenceStats {
         cacheReusePromptHistory.append(DataPoint(timestamp: now, value: Double(cacheReusePromptDelta)))
         cacheRebuildPromptHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildPromptDelta)))
         cacheMatchQualityHistory.append(DataPoint(timestamp: now, value: cacheMatchQualityDelta))
+        ttftHistory.append(DataPoint(timestamp: now, value: snap.timeToFirstToken * 1_000))
+        prefillSpeedHistory.append(DataPoint(timestamp: now, value: snap.prefillTokensPerSecond))
+        cacheMatchDepthHistory.append(DataPoint(timestamp: now, value: Double(snap.cacheMatchDepth)))
+        visionTimeHistory.append(DataPoint(timestamp: now, value: snap.visionEncoderTime * 1_000))
 
         if tokenRateHistory.count > Self.maxHistoryPoints {
             tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
@@ -485,6 +607,18 @@ final class InferenceStats {
         if cacheMatchQualityHistory.count > Self.maxHistoryPoints {
             cacheMatchQualityHistory.removeFirst(cacheMatchQualityHistory.count - Self.maxHistoryPoints)
         }
+        if ttftHistory.count > Self.maxHistoryPoints {
+            ttftHistory.removeFirst(ttftHistory.count - Self.maxHistoryPoints)
+        }
+        if prefillSpeedHistory.count > Self.maxHistoryPoints {
+            prefillSpeedHistory.removeFirst(prefillSpeedHistory.count - Self.maxHistoryPoints)
+        }
+        if cacheMatchDepthHistory.count > Self.maxHistoryPoints {
+            cacheMatchDepthHistory.removeFirst(cacheMatchDepthHistory.count - Self.maxHistoryPoints)
+        }
+        if visionTimeHistory.count > Self.maxHistoryPoints {
+            visionTimeHistory.removeFirst(visionTimeHistory.count - Self.maxHistoryPoints)
+        }
     }
 
     func reset() {
@@ -500,11 +634,15 @@ final class InferenceStats {
         isGenerating = false
         isPrefilling = false
         currentTokensPerSecond = 0
+        prefillTokensPerSecond = 0
+        timeToFirstToken = 0
         contextUsed = 0
         contextMax = 0
         currentPhaseElapsed = 0
         currentCacheMatchedPromptTokens = 0
         currentCacheRebuiltPromptTokens = 0
+        cacheMatchDepth = 0
+        visionEncoderTime = 0
         totalRequests = 0
         totalPromptTokens = 0
         totalGenerationTokens = 0
@@ -514,6 +652,8 @@ final class InferenceStats {
         totalSessionBuildDuration = 0
         totalPrefillDuration = 0
         totalGenerationDuration = 0
+        totalVisionEncoderDuration = 0
+        totalDisconnects = 0
         totalCacheHits = 0
         totalCacheMisses = 0
         totalCacheEvictions = 0
@@ -536,6 +676,10 @@ final class InferenceStats {
         cacheReusePromptHistory.removeAll()
         cacheRebuildPromptHistory.removeAll()
         cacheMatchQualityHistory.removeAll()
+        ttftHistory.removeAll()
+        prefillSpeedHistory.removeAll()
+        cacheMatchDepthHistory.removeAll()
+        visionTimeHistory.removeAll()
         lastGenerationTokenCount = 0
         lastPromptTokenCount = 0
         lastPrefillDuration = 0
diff --git a/MLXServer/Server/APIServer.swift b/MLXServer/Server/APIServer.swift
index b82bef4..bf03794 100644
--- a/MLXServer/Server/APIServer.swift
+++ b/MLXServer/Server/APIServer.swift
@@ -264,7 +264,14 @@ final class APIServer {
         let engine = InferenceEngine(container: container)
         let preparedInference: InferenceEngine.PreparedInference
         do {
+            let prepareStartedAt = Date()
             preparedInference = try await engine.prepare(preparedPrompt.userInput)
+            if preparedPrompt.containsImages {
+                LiveCounters.shared.visionProcessingCompleted(
+                    requestId: requestId,
+                    duration: Date().timeIntervalSince(prepareStartedAt)
+                )
+            }
         } catch {
             LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
             sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
@@ -428,6 +435,7 @@ final class APIServer {
         connection.stateUpdateHandler = { state in
             switch state {
             case .cancelled, .failed:
+                LiveCounters.shared.disconnectDetected(requestId: requestId)
                 cancellation.cancel()
             default:
                 break
@@ -538,6 +546,7 @@ final class APIServer {
         var fullText = ""
         var frameworkToolCalls: [MLXLMCommon.ToolCall] = []
         var cancelled = false
+        var sawFirstChunk = false
 
         for await generation in stream {
             if let cancellation, cancellation.isCancelled {
@@ -547,6 +556,10 @@ final class APIServer {
 
             switch generation {
             case .chunk(let text):
+                if !sawFirstChunk {
+                    sawFirstChunk = true
+                    LiveCounters.shared.firstTokenGenerated(requestId: requestId)
+                }
                 completionTokens += 1
                 fullText += text
                 LiveCounters.shared.tokenGenerated(tokensPerSecond: 0, totalGenerated: completionTokens)
diff --git a/MLXServer/Views/MonitorView.swift b/MLXServer/Views/MonitorView.swift
index 49345ab..be1f7fd 100644
--- a/MLXServer/Views/MonitorView.swift
+++ b/MLXServer/Views/MonitorView.swift
@@ -43,13 +43,27 @@ struct MonitorView: View {
                         color: .blue
                     )
                     metricCard(
-                        title: "Prefill Reuse",
-                        value: formatTokenCount(stats.totalCacheReusePromptTokens),
+                        title: "Cache Match",
+                        value: formatTokenCount(stats.cacheMatchDepth),
                         detail: stats.currentCacheMatchedPromptTokens > 0
                             ? String(format: "%.0f%% match now", stats.currentCacheMatchQualityPercent)
                             : String(format: "%.0f%% total quality", stats.totalCacheMatchQualityPercent),
                         color: .teal
                     )
+                    metricCard(
+                        title: "TTFT",
+                        value: formatMilliseconds(stats.timeToFirstToken * 1_000),
+                        detail: stats.isGenerating ? "time to first token" : "last completed request",
+                        color: .cyan
+                    )
+                    metricCard(
+                        title: "Prefill Speed",
+                        value: stats.prefillTokensPerSecond > 0
+                            ? String(format: "%.1f tok/s", stats.prefillTokensPerSecond)
+                            : "0 tok/s",
+                        detail: formatTokenCount(stats.currentPromptTokens) + " prompt tokens",
+                        color: .blue
+                    )
                     metricCard(
                         title: "Context",
                         value: formatTokenCount(stats.contextUsed),
@@ -68,13 +82,23 @@ struct MonitorView: View {
                         detail: "\(stats.currentGenerationTokens) output tokens",
                         color: .green
                     )
+                    metricCard(
+                        title: "Disconnects",
+                        value: "\(stats.totalDisconnects)",
+                        detail: stats.totalDisconnects == 0 ? "none detected" : "streams cancelled by clients",
+                        color: .red
+                    )
                 }
 
                 LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
+                    latencyChart
+                    prefillSpeedChart
                     throughputChart
-                    phaseChart
-                    cacheChart
+                    cacheMatchChart
                     memoryChart
+                    if hasVisionSamples {
+                        visionChart
+                    }
                 }
 
                 cumulativeSection
@@ -140,48 +164,33 @@ struct MonitorView: View {
         }
     }
 
-    private var phaseChart: some View {
-        chartCard(title: "Phase Timing") {
+    private var latencyChart: some View {
+        chartCard(title: "Time To First Token") {
             Chart {
-                ForEach(stats.currentPhaseElapsedHistory) { point in
+                ForEach(stats.ttftHistory) { point in
                     LineMark(
                         x: .value("Time", point.timestamp),
-                        y: .value("Age", point.value)
+                        y: .value("TTFT", point.value)
                     )
-                    .foregroundStyle(systemStateColor)
+                    .foregroundStyle(.cyan)
                     .interpolationMethod(.monotone)
                 }
-                ForEach(stats.prefillDurationHistory) { point in
-                    BarMark(
-                        x: .value("Time", point.timestamp),
-                        y: .value("Prefill", point.value)
-                    )
-                    .foregroundStyle(.blue.opacity(0.45))
-                }
             }
             .chartXAxis { timeAxis }
             .chartYAxis { leadingValueAxis }
             .frame(height: 180)
         } footer: {
-            legendRow(items: [("Active phase age", systemStateColor), ("Prefill completions", .blue)])
+            legendRow(items: [("TTFT ms", .cyan)])
         }
     }
 
-    private var cacheChart: some View {
-        chartCard(title: "Cache Match Quality") {
+    private var prefillSpeedChart: some View {
+        chartCard(title: "Prefill Speed") {
             Chart {
-                ForEach(stats.cacheMatchQualityHistory) { point in
+                ForEach(stats.prefillSpeedHistory) { point in
                     LineMark(
                         x: .value("Time", point.timestamp),
-                        y: .value("Match Quality", point.value)
-                    )
-                    .foregroundStyle(.teal)
-                    .interpolationMethod(.monotone)
-                }
-                ForEach(stats.cacheHitRateHistory) { point in
-                    LineMark(
-                        x: .value("Time", point.timestamp),
-                        y: .value("Hit Rate", point.value)
+                        y: .value("Prefill Speed", point.value)
                     )
                     .foregroundStyle(.blue)
                     .interpolationMethod(.monotone)
@@ -191,7 +200,41 @@ struct MonitorView: View {
             .chartYAxis { leadingValueAxis }
             .frame(height: 180)
         } footer: {
-            legendRow(items: [("Match quality %", .teal), ("Hit rate %", .blue)])
+            legendRow(items: [("Prompt tok/s", .blue)])
+        }
+    }
+
+    private var cacheMatchChart: some View {
+        chartCard(title: "Cache Match Depth") {
+            Chart {
+                ForEach(stats.cacheReusePromptHistory) { point in
+                    BarMark(
+                        x: .value("Time", point.timestamp),
+                        y: .value("Cached", point.value)
+                    )
+                    .foregroundStyle(.teal.opacity(0.7))
+                }
+                ForEach(stats.cacheRebuildPromptHistory) { point in
+                    BarMark(
+                        x: .value("Time", point.timestamp),
+                        y: .value("Prefilled", point.value)
+                    )
+                    .foregroundStyle(.orange.opacity(0.65))
+                }
+                ForEach(stats.cacheMatchDepthHistory) { point in
+                    LineMark(
+                        x: .value("Time", point.timestamp),
+                        y: .value("Depth", point.value)
+                    )
+                    .foregroundStyle(.blue)
+                    .interpolationMethod(.monotone)
+                }
+            }
+            .chartXAxis { timeAxis }
+            .chartYAxis { leadingValueAxis }
+            .frame(height: 180)
+        } footer: {
+            legendRow(items: [("Cached", .teal), ("Prefilled", .orange), ("Matched depth", .blue)])
         }
     }
 
@@ -230,6 +273,25 @@ struct MonitorView: View {
         }
     }
 
+    private var visionChart: some View {
+        chartCard(title: "Vision Prepare Time") {
+            Chart {
+                ForEach(stats.visionTimeHistory) { point in
+                    BarMark(
+                        x: .value("Time", point.timestamp),
+                        y: .value("Vision", point.value)
+                    )
+                    .foregroundStyle(.purple.opacity(0.8))
+                }
+            }
+            .chartXAxis { timeAxis }
+            .chartYAxis { leadingValueAxis }
+            .frame(height: 180)
+        } footer: {
+            legendRow(items: [("Prepare ms", .purple)])
+        }
+    }
+
     private var cumulativeSection: some View {
         VStack(alignment: .leading, spacing: 10) {
             Text("Totals")
@@ -240,11 +302,13 @@ struct MonitorView: View {
                 compactTile(title: "Prompt Tokens", value: formatTokenCount(stats.totalPromptTokens), color: .blue)
                 compactTile(title: "Generated Tokens", value: formatTokenCount(stats.totalGenerationTokens), color: .green)
                 compactTile(title: "Cache Evictions", value: "\(stats.totalCacheEvictions)", color: .red)
-                compactTile(title: "Reused Prefill", value: formatTokenCount(stats.totalCacheReusePromptTokens), color: .teal)
-                compactTile(title: "Rebuilt Prefill", value: formatTokenCount(stats.totalCacheRebuildPromptTokens), color: .orange)
+                compactTile(title: "Tokens From Cache", value: formatTokenCount(stats.totalCacheReusePromptTokens), color: .teal)
+                compactTile(title: "Tokens Prefilled", value: formatTokenCount(stats.totalCacheRebuildPromptTokens), color: .orange)
                 compactTile(title: "Match Quality", value: String(format: "%.0f%%", stats.totalCacheMatchQualityPercent), color: .teal)
                 compactTile(title: "Prefill Time", value: String(format: "%.1fs", stats.totalPrefillDuration), color: .blue)
                 compactTile(title: "Generation Time", value: String(format: "%.1fs", stats.totalGenerationDuration), color: .green)
+                compactTile(title: "Vision Time", value: String(format: "%.1fs", stats.totalVisionEncoderDuration), color: .purple)
+                compactTile(title: "Disconnects", value: "\(stats.totalDisconnects)", color: .red)
                 compactTile(title: "Cache Budget", value: formatByteCount(stats.cacheMemoryBudgetBytes), color: .orange)
             }
         }
@@ -451,4 +515,16 @@ struct MonitorView: View {
         if minutes < 60 { return "\(minutes)m ago" }
         return "\(minutes / 60)h ago"
     }
+
+    private var hasVisionSamples: Bool {
+        stats.visionTimeHistory.contains { $0.value > 0 }
+    }
+
+    private func formatMilliseconds(_ value: Double) -> String {
+        guard value > 0 else { return "0 ms" }
+        if value >= 1_000 {
+            return String(format: "%.2fs", value / 1_000)
+        }
+        return String(format: "%.0f ms", value)
+    }
 }
diff --git a/MLXServerTests/Server/APIServerRewriteTests.swift b/MLXServerTests/Server/APIServerRewriteTests.swift
index b011057..37f0379 100644
--- a/MLXServerTests/Server/APIServerRewriteTests.swift
+++ b/MLXServerTests/Server/APIServerRewriteTests.swift
@@ -36,6 +36,8 @@ final class APIServerRewriteTests: XCTestCase {
         let firstSnapshot = TokenPrefixCache.shared.snapshot()
         let firstLiveSnapshot = LiveCounters.shared.snapshot()
         XCTAssertGreaterThan(firstSnapshot.totalEntries, 0)
+        XCTAssertGreaterThan(firstLiveSnapshot.prefillTokensPerSecond, 0)
+        XCTAssertGreaterThan(firstLiveSnapshot.timeToFirstToken, 0)
 
         _ = try await sendChatCompletion(request, port: harness.port)
 
@@ -46,6 +48,7 @@ final class APIServerRewriteTests: XCTestCase {
         let secondLiveSnapshot = LiveCounters.shared.snapshot()
         XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
         XCTAssertGreaterThan(secondLiveSnapshot.totalCacheReusePromptTokens, firstLiveSnapshot.totalCacheReusePromptTokens)
+        XCTAssertGreaterThan(secondLiveSnapshot.cacheMatchDepth, 0)
     }
 
     func testStreamingChatCompletionReusesCacheAcrossThreeProgressivelyLongerTurns() async throws {
@@ -429,6 +432,7 @@ final class APIServerRewriteTests: XCTestCase {
         let afterDisconnectSnapshot = TokenPrefixCache.shared.snapshot()
         let afterDisconnectLiveSnapshot = LiveCounters.shared.snapshot()
         XCTAssertGreaterThan(afterDisconnectSnapshot.totalEntries, initialSnapshot.totalEntries)
+        XCTAssertGreaterThan(afterDisconnectLiveSnapshot.totalDisconnects, 0)
 
         _ = try await sendChatCompletion(
             APIChatCompletionRequest(
diff --git a/MLXServerTests/Server/LiveCountersTests.swift b/MLXServerTests/Server/LiveCountersTests.swift
new file mode 100644
index 0000000..73a4caa
--- /dev/null
+++ b/MLXServerTests/Server/LiveCountersTests.swift
@@ -0,0 +1,46 @@
+import Foundation
+import XCTest
+@testable import MLX_Server
+
+final class LiveCountersTests: XCTestCase {
+    override func tearDown() {
+        LiveCounters.shared.reset()
+        super.tearDown()
+    }
+
+    func testTracksRequestMetricsAndDeduplicatesDisconnects() {
+        let requestId = "req-1"
+
+        LiveCounters.shared.reset()
+        LiveCounters.shared.requestStarted(requestId: requestId, contextLength: 8_192)
+        LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
+        LiveCounters.shared.recordPrefillReuse(requestId: requestId, matchedPromptTokens: 40, promptTokenCount: 64)
+        LiveCounters.shared.visionProcessingCompleted(requestId: requestId, duration: 0.25)
+
+        Thread.sleep(forTimeInterval: 0.01)
+        LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: 64)
+
+        Thread.sleep(forTimeInterval: 0.01)
+        LiveCounters.shared.firstTokenGenerated(requestId: requestId)
+        LiveCounters.shared.tokenGenerated(tokensPerSecond: 12.5, totalGenerated: 3)
+        LiveCounters.shared.disconnectDetected(requestId: requestId)
+        LiveCounters.shared.disconnectDetected(requestId: requestId)
+
+        let inFlight = LiveCounters.shared.snapshot()
+        XCTAssertEqual(inFlight.cacheMatchDepth, 40)
+        XCTAssertEqual(inFlight.currentCacheMatchedPromptTokens, 40)
+        XCTAssertEqual(inFlight.currentCacheRebuiltPromptTokens, 24)
+        XCTAssertEqual(inFlight.visionEncoderTime, 0.25, accuracy: 0.0001)
+        XCTAssertGreaterThan(inFlight.prefillTokensPerSecond, 0)
+        XCTAssertGreaterThan(inFlight.timeToFirstToken, 0)
+        XCTAssertEqual(inFlight.totalDisconnects, 1)
+
+        LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 3)
+
+        let completed = LiveCounters.shared.snapshot()
+        XCTAssertEqual(completed.totalPromptTokens, 64)
+        XCTAssertEqual(completed.totalGenerationTokens, 3)
+        XCTAssertEqual(completed.totalVisionEncoderDuration, 0.25, accuracy: 0.0001)
+        XCTAssertEqual(completed.totalDisconnects, 1)
+    }
+}
\ No newline at end of file
diff --git a/docs/session-cache-upgrade.md b/docs/session-cache-upgrade.md
index 161ab11..11202c5 100644
--- a/docs/session-cache-upgrade.md
+++ b/docs/session-cache-upgrade.md
@@ -2573,17 +2573,17 @@ Validation note: `PromptBuilder.swift` is now covered by both shaping-parity uni
 ### Phase 3: Integration
 
 7. [x] **`APIServer.swift` rewrite** — Wire everything together. Replace ChatSession with InferenceEngine, ConversationSessionCache with TokenPrefixCache, add PromptBuilder and StreamingSSEEncoder.
-8. **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested.
+8. [x] **Delete `ConversationSessionCache.swift`** — Only after APIServer is fully migrated and tested.
 
 Validation note: `APIServer.swift` now routes the API path through `PromptBuilder`, `InferenceEngine`, `TokenPrefixCache`, and `StreamingSSEEncoder`, and the full repository test workflow is green. Image-bearing requests intentionally bypass prefix-cache reuse for now until image fingerprinting is implemented.
 
 ### Phase 4: Statistics & Monitoring
 
-9. **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer.
+9. [x] **LiveCounters upgrade** — Add TTFT, prefill tok/s, cache match depth, vision time, disconnect tracking. Wire up new reporting calls in APIServer.
 10. [x] **InferenceStats upgrade** — Add new snapshot fields, new time-series histories. Switch from ConversationSessionCache.snapshot() to TokenPrefixCache.snapshot().
 11. [x] **MonitorView upgrade** — Add TTFT chart, prefill speed chart, cache match quality chart, cache memory budget chart. Update cache card and cumulative tiles. Add vision encoder time chart (conditional on VL model). Replace session list with cache entry list.
 
-Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly and `MonitorView.swift` has been rebuilt around current system state and prefix-cache visibility rather than session-era charts. The dashboard now exposes cache match quality from matched-vs-rebuilt prompt token counters, but it still does not expose TTFT, cache match depth, or vision timing because those `LiveCounters` signals have not been implemented yet.
+Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly and `MonitorView.swift` now surfaces TTFT, prefill speed, cache match depth, cache memory pressure, disconnect totals, and vision prepare time from `LiveCounters`. Match-type hit breakdown is still open because it depends on the advanced cache matching work in Phase 5.
 
 ### Phase 5: Advanced Cache Matching
 
@@ -2608,10 +2608,10 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
 
 ### Cache Correctness
 
-- [ ] Cold start: no cache entries → fresh generation works
+- [x] Cold start: no cache entries → fresh generation works
 - [ ] Second identical request → full cache hit, zero prefill tokens
 - [ ] Conversation continuation (add 1 message) → partial cache hit
-- [ ] Conversation continuation (add 2+ messages, e.g. tool-use flow) → partial cache hit (not a miss!)
+- [x] Conversation continuation (add 2+ messages, e.g. tool-use flow) → partial cache hit (not a miss!)
 - [ ] Same system prompt, different user message → system prompt prefix cached and reused
 - [ ] Different system prompt → no false cache hit
 - [ ] Model swap → cache invalidated, fresh generation works
@@ -2622,31 +2622,31 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
 - [ ] Memory budget computed correctly from Metal device
 - [x] Entries evicted under memory pressure (oldest first)
 - [x] Expired entries pruned after 30 min idle
-- [ ] Trie nodes cleaned up when entries are evicted (no memory leak)
+- [x] Trie nodes cleaned up when entries are evicted (no memory leak)
 - [ ] `snapshot()` reports accurate memory usage and hit rates
 
 ### Disconnect Handling
 
 - [ ] Client disconnects mid-stream → generation stops within ~200ms
-- [ ] Partial KV cache from disconnected request is still stored for reuse
+- [x] Partial KV cache from disconnected request is still stored for reuse
 - [ ] No Metal assertion failures on disconnect
 
 ### Streaming
 
-- [ ] SSE JSON is valid and parseable by standard clients
+- [x] SSE JSON is valid and parseable by standard clients
 - [x] `StreamingSSEEncoder` output matches `JSONEncoder` output byte-for-byte (for content deltas)
-- [ ] Role delta sent once at stream start
-- [ ] Tool call chunks sent correctly
-- [ ] Final chunk has finish_reason and usage stats
-- [ ] `data: [DONE]` sent at end
+- [x] Role delta sent once at stream start
+- [x] Tool call chunks sent correctly
+- [x] Final chunk has finish_reason and usage stats
+- [x] `data: [DONE]` sent at end
 
 ### Tool Use
 
 - [ ] Gemma tool_code blocks parsed correctly
 - [ ] Qwen `<tool_call>` tags parsed correctly
 - [ ] Framework `ToolCall` events handled correctly
-- [ ] Tool results round-trip correctly (user sends tool result → model sees it in context)
-- [ ] finish_reason is "tool_calls" when tools are invoked
+- [x] Tool results round-trip correctly (user sends tool result → model sees it in context)
+- [x] finish_reason is "tool_calls" when tools are invoked
 
 ### Vision-Language Models
 
@@ -2694,7 +2694,7 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
 
 ### Thinking Mode
 
-- [ ] `enable_thinking: false` passed through to template correctly
+- [x] `enable_thinking: false` passed through to template correctly
 - [ ] Thinking mode on: `<think>` blocks appear in output
 - [ ] Thinking mode off: no `<think>` blocks
 
@@ -2702,7 +2702,7 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
 
 - [ ] `GET /health` → `{"status":"ok"}`
 - [ ] `GET /v1/models` → model list with context windows
-- [ ] Non-streaming `POST /v1/chat/completions` → full response
-- [ ] Streaming `POST /v1/chat/completions` → SSE stream
+- [x] Non-streaming `POST /v1/chat/completions` → full response
+- [x] Streaming `POST /v1/chat/completions` → SSE stream
 - [ ] Model field in request triggers model swap
 - [ ] UI chat (ChatViewModel) completely unaffected