fix: more telemetry and tighter implementation of cache
This commit is contained in:
@@ -42,6 +42,7 @@
|
|||||||
D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
|
D666A311788375E8A061C832 /* SettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4147321383E94E9F17A0154E /* SettingsView.swift */; };
|
||||||
D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
|
D96DDE66F76FDDA642629E17 /* APIModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1A52E2C9964ADA9D841A89B /* APIModels.swift */; };
|
||||||
DF5C525DBD2E3153256951C1 /* SceneManagementWindow.swift in Sources */ = {isa = PBXBuildFile; fileRef = BA1592FD260014C4FBDB6995 /* SceneManagementWindow.swift */; };
|
DF5C525DBD2E3153256951C1 /* SceneManagementWindow.swift in Sources */ = {isa = PBXBuildFile; fileRef = BA1592FD260014C4FBDB6995 /* SceneManagementWindow.swift */; };
|
||||||
|
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */; };
|
||||||
F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
|
F546CE5955ED253D8A793D5E /* MarkdownUI in Frameworks */ = {isa = PBXBuildFile; productRef = A98257123539E9E738213BFA /* MarkdownUI */; };
|
||||||
FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
|
FAF7D4714AC6D02674920208 /* ChatMessage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A4B359324B5FD8D106C74338 /* ChatMessage.swift */; };
|
||||||
FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
|
FCD48F8C132A2B830A15EEB4 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = 3F5A4AC6DBAF7CA686ECA74E /* MLXLLM */; };
|
||||||
@@ -85,6 +86,7 @@
|
|||||||
E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
|
E73B165A1822729C907791AE /* ToolCallParser.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParser.swift; sourceTree = "<group>"; };
|
||||||
EF518FEBF3A38E830E3CE1A5 /* FocusedValues.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusedValues.swift; sourceTree = "<group>"; };
|
EF518FEBF3A38E830E3CE1A5 /* FocusedValues.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusedValues.swift; sourceTree = "<group>"; };
|
||||||
F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
|
F1A52E2C9964ADA9D841A89B /* APIModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIModels.swift; sourceTree = "<group>"; };
|
||||||
|
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConversationSessionCache.swift; sourceTree = "<group>"; };
|
||||||
/* End PBXFileReference section */
|
/* End PBXFileReference section */
|
||||||
|
|
||||||
/* Begin PBXFrameworksBuildPhase section */
|
/* Begin PBXFrameworksBuildPhase section */
|
||||||
@@ -203,6 +205,7 @@
|
|||||||
children = (
|
children = (
|
||||||
F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
|
F1A52E2C9964ADA9D841A89B /* APIModels.swift */,
|
||||||
3D08828E16B17EF02C14243E /* APIServer.swift */,
|
3D08828E16B17EF02C14243E /* APIServer.swift */,
|
||||||
|
FFBB16D3AF2E61D001FD6051 /* ConversationSessionCache.swift */,
|
||||||
E73B165A1822729C907791AE /* ToolCallParser.swift */,
|
E73B165A1822729C907791AE /* ToolCallParser.swift */,
|
||||||
16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */,
|
16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */,
|
||||||
);
|
);
|
||||||
@@ -306,6 +309,7 @@
|
|||||||
85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
|
85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */,
|
||||||
B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
|
B5AA6E3B4BE21676226B342B /* ChatViewModel.swift in Sources */,
|
||||||
5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
|
5946258F1DE88CE904584E0B /* ContentView.swift in Sources */,
|
||||||
|
F141B91A64F7DAD73CE2910A /* ConversationSessionCache.swift in Sources */,
|
||||||
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
|
C07A377244DCD67F4FE709FE /* DownloadModalView.swift in Sources */,
|
||||||
4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
|
4DC033E45880B2948B47DEB1 /* FocusedValues.swift in Sources */,
|
||||||
2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */,
|
2D08769282BD71C170DB0943 /* InferenceStats.swift in Sources */,
|
||||||
|
|||||||
@@ -9,9 +9,14 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
static let shared = LiveCounters()
|
static let shared = LiveCounters()
|
||||||
|
|
||||||
private let lock = OSAllocatedUnfairLock()
|
private let lock = OSAllocatedUnfairLock()
|
||||||
|
private var requestPhases: [String: RequestPhase] = [:]
|
||||||
|
|
||||||
// Current request
|
// Current request
|
||||||
private var _activeRequests: Int = 0
|
private var _activeRequests: Int = 0
|
||||||
|
private var _preparingRequests: Int = 0
|
||||||
|
private var _sessionBuildRequests: Int = 0
|
||||||
|
private var _prefillRequests: Int = 0
|
||||||
|
private var _generatingRequests: Int = 0
|
||||||
private var _promptTokens: Int = 0
|
private var _promptTokens: Int = 0
|
||||||
private var _generationTokens: Int = 0
|
private var _generationTokens: Int = 0
|
||||||
private var _tokensPerSecond: Double = 0
|
private var _tokensPerSecond: Double = 0
|
||||||
@@ -24,9 +29,10 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
private var _totalPromptTokens: Int = 0
|
private var _totalPromptTokens: Int = 0
|
||||||
private var _totalGenerationTokens: Int = 0
|
private var _totalGenerationTokens: Int = 0
|
||||||
|
|
||||||
func requestStarted(contextLength: Int) {
|
func requestStarted(requestId: String, contextLength: Int) {
|
||||||
lock.lock()
|
lock.lock()
|
||||||
_activeRequests += 1
|
_activeRequests += 1
|
||||||
|
_preparingRequests += 1
|
||||||
_totalRequests += 1
|
_totalRequests += 1
|
||||||
_isPrefilling = true
|
_isPrefilling = true
|
||||||
_isGenerating = false
|
_isGenerating = false
|
||||||
@@ -34,15 +40,33 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
_generationTokens = 0
|
_generationTokens = 0
|
||||||
_tokensPerSecond = 0
|
_tokensPerSecond = 0
|
||||||
_contextMax = contextLength
|
_contextMax = contextLength
|
||||||
|
requestPhases[requestId] = .preparing
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func prefillCompleted(promptTokens: Int) {
|
func requestPhaseChanged(requestId: String, phase: RequestPhase) {
|
||||||
lock.lock()
|
lock.lock()
|
||||||
_isPrefilling = false
|
if let current = requestPhases[requestId] {
|
||||||
_isGenerating = true
|
decrementCount(for: current)
|
||||||
|
}
|
||||||
|
incrementCount(for: phase)
|
||||||
|
requestPhases[requestId] = phase
|
||||||
|
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
||||||
|
_isGenerating = _generatingRequests > 0
|
||||||
|
lock.unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func prefillCompleted(requestId: String, promptTokens: Int) {
|
||||||
|
lock.lock()
|
||||||
|
if let current = requestPhases[requestId] {
|
||||||
|
decrementCount(for: current)
|
||||||
|
}
|
||||||
|
incrementCount(for: .generating)
|
||||||
|
requestPhases[requestId] = .generating
|
||||||
_promptTokens = promptTokens
|
_promptTokens = promptTokens
|
||||||
_totalPromptTokens += promptTokens
|
_totalPromptTokens += promptTokens
|
||||||
|
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
||||||
|
_isGenerating = _generatingRequests > 0
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,21 +77,32 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func requestCompleted(generationTokens: Int) {
|
func requestCompleted(requestId: String, generationTokens: Int) {
|
||||||
lock.lock()
|
lock.lock()
|
||||||
|
if let current = requestPhases.removeValue(forKey: requestId) {
|
||||||
|
decrementCount(for: current)
|
||||||
|
}
|
||||||
_activeRequests = max(0, _activeRequests - 1)
|
_activeRequests = max(0, _activeRequests - 1)
|
||||||
_totalGenerationTokens += generationTokens
|
_totalGenerationTokens += generationTokens
|
||||||
if _activeRequests == 0 {
|
if _activeRequests == 0 {
|
||||||
_isGenerating = false
|
_isGenerating = false
|
||||||
_isPrefilling = false
|
_isPrefilling = false
|
||||||
_tokensPerSecond = 0
|
_tokensPerSecond = 0
|
||||||
|
} else {
|
||||||
|
_isPrefilling = _prefillRequests > 0 || _sessionBuildRequests > 0 || _preparingRequests > 0
|
||||||
|
_isGenerating = _generatingRequests > 0
|
||||||
}
|
}
|
||||||
lock.unlock()
|
lock.unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func reset() {
|
func reset() {
|
||||||
lock.lock()
|
lock.lock()
|
||||||
|
requestPhases.removeAll()
|
||||||
_activeRequests = 0
|
_activeRequests = 0
|
||||||
|
_preparingRequests = 0
|
||||||
|
_sessionBuildRequests = 0
|
||||||
|
_prefillRequests = 0
|
||||||
|
_generatingRequests = 0
|
||||||
_promptTokens = 0
|
_promptTokens = 0
|
||||||
_generationTokens = 0
|
_generationTokens = 0
|
||||||
_tokensPerSecond = 0
|
_tokensPerSecond = 0
|
||||||
@@ -85,6 +120,10 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
lock.lock()
|
lock.lock()
|
||||||
let s = Snapshot(
|
let s = Snapshot(
|
||||||
activeRequests: _activeRequests,
|
activeRequests: _activeRequests,
|
||||||
|
preparingRequests: _preparingRequests,
|
||||||
|
sessionBuildRequests: _sessionBuildRequests,
|
||||||
|
prefillRequests: _prefillRequests,
|
||||||
|
generatingRequests: _generatingRequests,
|
||||||
promptTokens: _promptTokens,
|
promptTokens: _promptTokens,
|
||||||
generationTokens: _generationTokens,
|
generationTokens: _generationTokens,
|
||||||
tokensPerSecond: _tokensPerSecond,
|
tokensPerSecond: _tokensPerSecond,
|
||||||
@@ -101,6 +140,10 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
|
|
||||||
struct Snapshot {
|
struct Snapshot {
|
||||||
let activeRequests: Int
|
let activeRequests: Int
|
||||||
|
let preparingRequests: Int
|
||||||
|
let sessionBuildRequests: Int
|
||||||
|
let prefillRequests: Int
|
||||||
|
let generatingRequests: Int
|
||||||
let promptTokens: Int
|
let promptTokens: Int
|
||||||
let generationTokens: Int
|
let generationTokens: Int
|
||||||
let tokensPerSecond: Double
|
let tokensPerSecond: Double
|
||||||
@@ -111,6 +154,39 @@ final class LiveCounters: @unchecked Sendable {
|
|||||||
let totalPromptTokens: Int
|
let totalPromptTokens: Int
|
||||||
let totalGenerationTokens: Int
|
let totalGenerationTokens: Int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Adds one to the per-phase request counter that matches `phase`.
///
/// Performs no locking of its own; the visible call sites invoke it between
/// `lock.lock()` and `lock.unlock()`.
///
/// - Parameter phase: The phase whose counter should grow by one.
private func incrementCount(for phase: RequestPhase) {
    switch phase {
    case .preparing: _preparingRequests += 1
    case .sessionBuild: _sessionBuildRequests += 1
    case .prefilling: _prefillRequests += 1
    case .generating: _generatingRequests += 1
    }
}
|
||||||
|
|
||||||
|
/// Subtracts one from the per-phase request counter that matches `phase`.
///
/// The result is clamped at zero, so an unbalanced decrement can never drive a
/// counter negative. Performs no locking of its own; the visible call sites
/// invoke it between `lock.lock()` and `lock.unlock()`.
///
/// - Parameter phase: The phase whose counter should shrink by one.
private func decrementCount(for phase: RequestPhase) {
    switch phase {
    case .preparing: _preparingRequests = max(_preparingRequests - 1, 0)
    case .sessionBuild: _sessionBuildRequests = max(_sessionBuildRequests - 1, 0)
    case .prefilling: _prefillRequests = max(_prefillRequests - 1, 0)
    case .generating: _generatingRequests = max(_generatingRequests - 1, 0)
    }
}
|
||||||
|
|
||||||
|
/// Stage of a single in-flight request, tracked per request id.
///
/// Each case has its own counter (`_preparingRequests`, `_sessionBuildRequests`,
/// `_prefillRequests`, `_generatingRequests`). `requestStarted` registers a
/// request as `.preparing`, and `prefillCompleted` moves it to `.generating`.
enum RequestPhase {
    case preparing, sessionBuild, prefilling, generating
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// MARK: - Observable stats for the UI (polls LiveCounters at 1Hz)
|
// MARK: - Observable stats for the UI (polls LiveCounters at 1Hz)
|
||||||
@@ -121,6 +197,10 @@ final class InferenceStats {
|
|||||||
// MARK: - Current request state (refreshed from LiveCounters)
|
// MARK: - Current request state (refreshed from LiveCounters)
|
||||||
|
|
||||||
var activeRequests: Int = 0
|
var activeRequests: Int = 0
|
||||||
|
var preparingRequests: Int = 0
|
||||||
|
var sessionBuildRequests: Int = 0
|
||||||
|
var prefillingRequests: Int = 0
|
||||||
|
var generatingRequests: Int = 0
|
||||||
var currentPromptTokens: Int = 0
|
var currentPromptTokens: Int = 0
|
||||||
var currentGenerationTokens: Int = 0
|
var currentGenerationTokens: Int = 0
|
||||||
var isGenerating: Bool = false
|
var isGenerating: Bool = false
|
||||||
@@ -134,6 +214,21 @@ final class InferenceStats {
|
|||||||
var totalRequests: Int = 0
|
var totalRequests: Int = 0
|
||||||
var totalPromptTokens: Int = 0
|
var totalPromptTokens: Int = 0
|
||||||
var totalGenerationTokens: Int = 0
|
var totalGenerationTokens: Int = 0
|
||||||
|
var totalCacheHits: Int = 0
|
||||||
|
var totalCacheMisses: Int = 0
|
||||||
|
var totalCacheEvictions: Int = 0
|
||||||
|
var totalCacheReusePromptTokens: Int = 0
|
||||||
|
var totalCacheRebuildPromptTokens: Int = 0
|
||||||
|
|
||||||
|
// MARK: - Cache state
|
||||||
|
|
||||||
|
var cacheEntryCount: Int = 0
|
||||||
|
var warmCacheEntryCount: Int = 0
|
||||||
|
var activeCacheEntryCount: Int = 0
|
||||||
|
var generatingCacheEntryCount: Int = 0
|
||||||
|
var cacheEstimatedBytes: Int = 0
|
||||||
|
var cacheEstimatedTokens: Int = 0
|
||||||
|
var cachedSessions: [ConversationSessionCache.SessionSummary] = []
|
||||||
|
|
||||||
// MARK: - Time series data (ring buffers for charts)
|
// MARK: - Time series data (ring buffers for charts)
|
||||||
|
|
||||||
@@ -146,6 +241,11 @@ final class InferenceStats {
|
|||||||
private(set) var tokenRateHistory: [DataPoint] = []
|
private(set) var tokenRateHistory: [DataPoint] = []
|
||||||
private(set) var promptTokenHistory: [DataPoint] = []
|
private(set) var promptTokenHistory: [DataPoint] = []
|
||||||
private(set) var generationTokenHistory: [DataPoint] = []
|
private(set) var generationTokenHistory: [DataPoint] = []
|
||||||
|
private(set) var cacheEntryHistory: [DataPoint] = []
|
||||||
|
private(set) var activeSessionHistory: [DataPoint] = []
|
||||||
|
private(set) var cacheFootprintHistory: [DataPoint] = []
|
||||||
|
private(set) var cacheReuseHistory: [DataPoint] = []
|
||||||
|
private(set) var cacheRebuildHistory: [DataPoint] = []
|
||||||
|
|
||||||
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
|
private static let maxHistoryPoints = 120 // ~2 minutes at 1Hz
|
||||||
|
|
||||||
@@ -153,6 +253,8 @@ final class InferenceStats {
|
|||||||
private var sampleTimer: Timer?
|
private var sampleTimer: Timer?
|
||||||
private var lastGenerationTokenCount: Int = 0
|
private var lastGenerationTokenCount: Int = 0
|
||||||
private var lastPromptTokenCount: Int = 0
|
private var lastPromptTokenCount: Int = 0
|
||||||
|
private var lastCacheReuseTokenCount: Int = 0
|
||||||
|
private var lastCacheRebuildTokenCount: Int = 0
|
||||||
|
|
||||||
func startSampling() {
|
func startSampling() {
|
||||||
guard sampleTimer == nil else { return }
|
guard sampleTimer == nil else { return }
|
||||||
@@ -171,8 +273,13 @@ final class InferenceStats {
|
|||||||
private func recordSample() {
|
private func recordSample() {
|
||||||
// Pull live values from the thread-safe counters
|
// Pull live values from the thread-safe counters
|
||||||
let snap = LiveCounters.shared.snapshot()
|
let snap = LiveCounters.shared.snapshot()
|
||||||
|
let cache = ConversationSessionCache.shared.snapshot()
|
||||||
|
|
||||||
activeRequests = snap.activeRequests
|
activeRequests = snap.activeRequests
|
||||||
|
preparingRequests = snap.preparingRequests
|
||||||
|
sessionBuildRequests = snap.sessionBuildRequests
|
||||||
|
prefillingRequests = snap.prefillRequests
|
||||||
|
generatingRequests = snap.generatingRequests
|
||||||
currentPromptTokens = snap.promptTokens
|
currentPromptTokens = snap.promptTokens
|
||||||
currentGenerationTokens = snap.generationTokens
|
currentGenerationTokens = snap.generationTokens
|
||||||
currentTokensPerSecond = snap.tokensPerSecond
|
currentTokensPerSecond = snap.tokensPerSecond
|
||||||
@@ -183,16 +290,37 @@ final class InferenceStats {
|
|||||||
totalRequests = snap.totalRequests
|
totalRequests = snap.totalRequests
|
||||||
totalPromptTokens = snap.totalPromptTokens
|
totalPromptTokens = snap.totalPromptTokens
|
||||||
totalGenerationTokens = snap.totalGenerationTokens
|
totalGenerationTokens = snap.totalGenerationTokens
|
||||||
|
totalCacheHits = cache.totalHits
|
||||||
|
totalCacheMisses = cache.totalMisses
|
||||||
|
totalCacheEvictions = cache.totalEvictions
|
||||||
|
totalCacheReusePromptTokens = cache.totalReusePromptTokens
|
||||||
|
totalCacheRebuildPromptTokens = cache.totalRebuildPromptTokens
|
||||||
|
cacheEntryCount = cache.totalEntries
|
||||||
|
warmCacheEntryCount = cache.warmEntries
|
||||||
|
activeCacheEntryCount = cache.activeEntries
|
||||||
|
generatingCacheEntryCount = cache.generatingEntries
|
||||||
|
cacheEstimatedBytes = cache.estimatedBytes
|
||||||
|
cacheEstimatedTokens = cache.cachedTokenEstimate
|
||||||
|
cachedSessions = cache.sessions
|
||||||
|
|
||||||
let now = Date.now
|
let now = Date.now
|
||||||
let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
|
let genDelta = snap.totalGenerationTokens - lastGenerationTokenCount
|
||||||
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
|
let promptDelta = snap.totalPromptTokens - lastPromptTokenCount
|
||||||
|
let cacheReuseDelta = cache.totalReusePromptTokens - lastCacheReuseTokenCount
|
||||||
|
let cacheRebuildDelta = cache.totalRebuildPromptTokens - lastCacheRebuildTokenCount
|
||||||
lastGenerationTokenCount = snap.totalGenerationTokens
|
lastGenerationTokenCount = snap.totalGenerationTokens
|
||||||
lastPromptTokenCount = snap.totalPromptTokens
|
lastPromptTokenCount = snap.totalPromptTokens
|
||||||
|
lastCacheReuseTokenCount = cache.totalReusePromptTokens
|
||||||
|
lastCacheRebuildTokenCount = cache.totalRebuildPromptTokens
|
||||||
|
|
||||||
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
|
tokenRateHistory.append(DataPoint(timestamp: now, value: snap.tokensPerSecond))
|
||||||
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
|
generationTokenHistory.append(DataPoint(timestamp: now, value: Double(genDelta)))
|
||||||
promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
|
promptTokenHistory.append(DataPoint(timestamp: now, value: Double(promptDelta)))
|
||||||
|
cacheEntryHistory.append(DataPoint(timestamp: now, value: Double(cache.totalEntries)))
|
||||||
|
activeSessionHistory.append(DataPoint(timestamp: now, value: Double(cache.activeEntries)))
|
||||||
|
cacheFootprintHistory.append(DataPoint(timestamp: now, value: Double(cache.estimatedBytes)))
|
||||||
|
cacheReuseHistory.append(DataPoint(timestamp: now, value: Double(cacheReuseDelta)))
|
||||||
|
cacheRebuildHistory.append(DataPoint(timestamp: now, value: Double(cacheRebuildDelta)))
|
||||||
|
|
||||||
if tokenRateHistory.count > Self.maxHistoryPoints {
|
if tokenRateHistory.count > Self.maxHistoryPoints {
|
||||||
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
|
tokenRateHistory.removeFirst(tokenRateHistory.count - Self.maxHistoryPoints)
|
||||||
@@ -203,11 +331,31 @@ final class InferenceStats {
|
|||||||
if promptTokenHistory.count > Self.maxHistoryPoints {
|
if promptTokenHistory.count > Self.maxHistoryPoints {
|
||||||
promptTokenHistory.removeFirst(promptTokenHistory.count - Self.maxHistoryPoints)
|
promptTokenHistory.removeFirst(promptTokenHistory.count - Self.maxHistoryPoints)
|
||||||
}
|
}
|
||||||
|
if cacheEntryHistory.count > Self.maxHistoryPoints {
|
||||||
|
cacheEntryHistory.removeFirst(cacheEntryHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
|
if activeSessionHistory.count > Self.maxHistoryPoints {
|
||||||
|
activeSessionHistory.removeFirst(activeSessionHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
|
if cacheFootprintHistory.count > Self.maxHistoryPoints {
|
||||||
|
cacheFootprintHistory.removeFirst(cacheFootprintHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
|
if cacheReuseHistory.count > Self.maxHistoryPoints {
|
||||||
|
cacheReuseHistory.removeFirst(cacheReuseHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
|
if cacheRebuildHistory.count > Self.maxHistoryPoints {
|
||||||
|
cacheRebuildHistory.removeFirst(cacheRebuildHistory.count - Self.maxHistoryPoints)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func reset() {
|
func reset() {
|
||||||
LiveCounters.shared.reset()
|
LiveCounters.shared.reset()
|
||||||
|
ConversationSessionCache.shared.reset()
|
||||||
activeRequests = 0
|
activeRequests = 0
|
||||||
|
preparingRequests = 0
|
||||||
|
sessionBuildRequests = 0
|
||||||
|
prefillingRequests = 0
|
||||||
|
generatingRequests = 0
|
||||||
currentPromptTokens = 0
|
currentPromptTokens = 0
|
||||||
currentGenerationTokens = 0
|
currentGenerationTokens = 0
|
||||||
isGenerating = false
|
isGenerating = false
|
||||||
@@ -218,10 +366,29 @@ final class InferenceStats {
|
|||||||
totalRequests = 0
|
totalRequests = 0
|
||||||
totalPromptTokens = 0
|
totalPromptTokens = 0
|
||||||
totalGenerationTokens = 0
|
totalGenerationTokens = 0
|
||||||
|
totalCacheHits = 0
|
||||||
|
totalCacheMisses = 0
|
||||||
|
totalCacheEvictions = 0
|
||||||
|
totalCacheReusePromptTokens = 0
|
||||||
|
totalCacheRebuildPromptTokens = 0
|
||||||
|
cacheEntryCount = 0
|
||||||
|
warmCacheEntryCount = 0
|
||||||
|
activeCacheEntryCount = 0
|
||||||
|
generatingCacheEntryCount = 0
|
||||||
|
cacheEstimatedBytes = 0
|
||||||
|
cacheEstimatedTokens = 0
|
||||||
|
cachedSessions.removeAll()
|
||||||
tokenRateHistory.removeAll()
|
tokenRateHistory.removeAll()
|
||||||
promptTokenHistory.removeAll()
|
promptTokenHistory.removeAll()
|
||||||
generationTokenHistory.removeAll()
|
generationTokenHistory.removeAll()
|
||||||
|
cacheEntryHistory.removeAll()
|
||||||
|
activeSessionHistory.removeAll()
|
||||||
|
cacheFootprintHistory.removeAll()
|
||||||
|
cacheReuseHistory.removeAll()
|
||||||
|
cacheRebuildHistory.removeAll()
|
||||||
lastGenerationTokenCount = 0
|
lastGenerationTokenCount = 0
|
||||||
lastPromptTokenCount = 0
|
lastPromptTokenCount = 0
|
||||||
|
lastCacheReuseTokenCount = 0
|
||||||
|
lastCacheRebuildTokenCount = 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,6 +16,50 @@ struct APIToolDefinition: Codable {
|
|||||||
struct APIFunctionCall: Codable {
|
struct APIFunctionCall: Codable {
|
||||||
let name: String
|
let name: String
|
||||||
let arguments: String // JSON string
|
let arguments: String // JSON string
|
||||||
|
|
||||||
|
/// Creates a function call from an already-serialized argument payload.
///
/// - Parameters:
///   - name: Name of the function being invoked.
///   - arguments: The call arguments as a JSON string.
init(name: String, arguments: String) {
    self.name = name
    self.arguments = arguments
}

/// Decodes a function call, accepting `arguments` either as a JSON string or
/// as an inline JSON object or array.
///
/// Inline objects and arrays are re-serialized with sorted keys so that
/// `arguments` is always stored as a string. A null, missing, or otherwise
/// unsupported `arguments` value decodes as `"{}"`.
///
/// - Parameter decoder: The decoder to read from.
/// - Throws: The decoder's error when the keyed container or `name` cannot be
///   decoded. Problems with `arguments` never throw; they fall back to a default.
init(from decoder: Decoder) throws {
    let container = try decoder.container(keyedBy: CodingKeys.self)
    name = try container.decode(String.self, forKey: .name)

    if let argumentString = try? container.decode(String.self, forKey: .arguments) {
        arguments = argumentString
    } else if let argumentObject = try? container.decode([String: AnyCodable].self, forKey: .arguments) {
        arguments = Self.jsonString(from: argumentObject.mapValues(\.value), fallback: "{}")
    } else if let argumentArray = try? container.decode([AnyCodable].self, forKey: .arguments) {
        arguments = Self.jsonString(from: argumentArray.map(\.value), fallback: "[]")
    } else {
        // Null, missing, or a scalar that is neither string, object, nor array.
        arguments = "{}"
    }
}

/// Serializes `jsonObject` to a sorted-key JSON string, or returns `fallback`.
///
/// `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception,
/// which `try?` cannot catch, when the object is not valid JSON. The validity
/// check turns that crash into the fallback value.
private static func jsonString(from jsonObject: Any, fallback: String) -> String {
    guard JSONSerialization.isValidJSONObject(jsonObject),
          let data = try? JSONSerialization.data(withJSONObject: jsonObject, options: [.sortedKeys]),
          let string = String(data: data, encoding: .utf8) else {
        return fallback
    }
    return string
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct APIToolCall: Codable {
|
struct APIToolCall: Codable {
|
||||||
@@ -30,6 +74,14 @@ struct APIToolCall: Codable {
|
|||||||
self.type = type
|
self.type = type
|
||||||
self.function = function
|
self.function = function
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Decodes a tool call, substituting defaults for the optional envelope fields.
///
/// A missing or null `index` becomes `0`, a missing or null `id` becomes a
/// freshly generated `call_<lowercased uuid>`, and a missing or null `type`
/// becomes `"function"`. `function` is required.
///
/// - Parameter decoder: The decoder to read from.
/// - Throws: The decoder's error when the container, a present field of the
///   wrong type, or `function` cannot be decoded.
init(from decoder: Decoder) throws {
    let values = try decoder.container(keyedBy: CodingKeys.self)
    let decodedIndex = try values.decodeIfPresent(Int.self, forKey: .index)
    let decodedID = try values.decodeIfPresent(String.self, forKey: .id)
    let decodedType = try values.decodeIfPresent(String.self, forKey: .type)

    index = decodedIndex ?? 0
    id = decodedID ?? "call_\(UUID().uuidString.lowercased())"
    type = decodedType ?? "function"
    function = try values.decode(APIFunctionCall.self, forKey: .function)
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct APIImageURL: Codable {
|
struct APIImageURL: Codable {
|
||||||
|
|||||||
@@ -16,12 +16,6 @@ final class APIServer {
|
|||||||
private var listener: NWListener?
|
private var listener: NWListener?
|
||||||
private var modelManager: ModelManager?
|
private var modelManager: ModelManager?
|
||||||
|
|
||||||
// Persistent ChatSession for KV cache reuse across requests
|
|
||||||
private var cachedSession: ChatSession?
|
|
||||||
private var cachedMessages: [Chat.Message]?
|
|
||||||
private var cachedModelId: String?
|
|
||||||
private var cachedInstructions: String = ""
|
|
||||||
|
|
||||||
func start(modelManager: ModelManager, port: Int = 1234) {
|
func start(modelManager: ModelManager, port: Int = 1234) {
|
||||||
guard !isRunning else { return }
|
guard !isRunning else { return }
|
||||||
self.modelManager = modelManager
|
self.modelManager = modelManager
|
||||||
@@ -70,10 +64,7 @@ final class APIServer {
|
|||||||
listener?.cancel()
|
listener?.cancel()
|
||||||
listener = nil
|
listener = nil
|
||||||
isRunning = false
|
isRunning = false
|
||||||
cachedSession = nil
|
ConversationSessionCache.shared.invalidateAll()
|
||||||
cachedMessages = nil
|
|
||||||
cachedModelId = nil
|
|
||||||
cachedInstructions = ""
|
|
||||||
inferenceStats.stopSampling()
|
inferenceStats.stopSampling()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -186,10 +177,7 @@ final class APIServer {
|
|||||||
if let targetConfig = ModelConfig.resolve(requestedModel) {
|
if let targetConfig = ModelConfig.resolve(requestedModel) {
|
||||||
if modelManager.currentModel?.id != targetConfig.id {
|
if modelManager.currentModel?.id != targetConfig.id {
|
||||||
print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
|
print("[APIServer] Swapping model: \(modelManager.currentModel?.repoId ?? "none") -> \(targetConfig.repoId)")
|
||||||
cachedSession = nil
|
ConversationSessionCache.shared.invalidateAll()
|
||||||
cachedMessages = nil
|
|
||||||
cachedModelId = nil
|
|
||||||
cachedInstructions = ""
|
|
||||||
await modelManager.loadModel(targetConfig)
|
await modelManager.loadModel(targetConfig)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -200,10 +188,7 @@ final class APIServer {
|
|||||||
if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
|
if modelManager.modelContainer == nil, let lastModelId = Preferences.lastModelId,
|
||||||
let config = ModelConfig.resolve(lastModelId) {
|
let config = ModelConfig.resolve(lastModelId) {
|
||||||
print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
|
print("[APIServer] Reloading idle-unloaded model: \(config.repoId)")
|
||||||
cachedSession = nil
|
ConversationSessionCache.shared.invalidateAll()
|
||||||
cachedMessages = nil
|
|
||||||
cachedModelId = nil
|
|
||||||
cachedInstructions = ""
|
|
||||||
await modelManager.loadModel(config)
|
await modelManager.loadModel(config)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -233,9 +218,13 @@ final class APIServer {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LiveCounters.shared.requestStarted(requestId: requestId, contextLength: contextLength)
|
||||||
|
|
||||||
// Convert API messages to Chat.Message, extracting images from content parts
|
// Convert API messages to Chat.Message, extracting images from content parts
|
||||||
var chatMessages: [Chat.Message] = []
|
var chatMessages: [Chat.Message] = []
|
||||||
|
var messageSignatures: [UInt64] = []
|
||||||
var images: [UserInput.Image] = []
|
var images: [UserInput.Image] = []
|
||||||
|
var estimatedBytes = 0
|
||||||
let currentModelRepoId = currentModel?.repoId ?? modelName
|
let currentModelRepoId = currentModel?.repoId ?? modelName
|
||||||
|
|
||||||
// Build the instructions string (system prompt + tool definitions).
|
// Build the instructions string (system prompt + tool definitions).
|
||||||
@@ -259,8 +248,8 @@ final class APIServer {
|
|||||||
instructions += toolSystemPrompt
|
instructions += toolSystemPrompt
|
||||||
}
|
}
|
||||||
|
|
||||||
let toolsForInjection = request.tools
|
|
||||||
let isQwen = currentModelRepoId.lowercased().contains("qwen")
|
let isQwen = currentModelRepoId.lowercased().contains("qwen")
|
||||||
|
estimatedBytes += instructions.utf8.count
|
||||||
|
|
||||||
// Convert non-system messages to Chat.Message
|
// Convert non-system messages to Chat.Message
|
||||||
for msg in request.messages where msg.role != "system" {
|
for msg in request.messages where msg.role != "system" {
|
||||||
@@ -297,18 +286,25 @@ final class APIServer {
|
|||||||
// Extract base64 images from content parts
|
// Extract base64 images from content parts
|
||||||
let imageURLs = msg.content?.imageURLs ?? []
|
let imageURLs = msg.content?.imageURLs ?? []
|
||||||
var messageImages: [UserInput.Image] = []
|
var messageImages: [UserInput.Image] = []
|
||||||
|
var messageImageBytes = 0
|
||||||
for urlString in imageURLs {
|
for urlString in imageURLs {
|
||||||
if let image = decodeBase64Image(urlString) {
|
if let decoded = decodeBase64Image(urlString) {
|
||||||
messageImages.append(image)
|
messageImages.append(decoded.image)
|
||||||
|
messageImageBytes += decoded.estimatedBytes
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Attach images to this specific message
|
// Attach images to this specific message
|
||||||
chatMessages.append(Chat.Message(role: role, content: text, images: messageImages))
|
chatMessages.append(Chat.Message(role: role, content: text, images: messageImages))
|
||||||
|
messageSignatures.append(
|
||||||
|
Self.messageSignature(role: role, content: text, imageURLs: imageURLs)
|
||||||
|
)
|
||||||
|
estimatedBytes += text.utf8.count + messageImageBytes
|
||||||
images.append(contentsOf: messageImages)
|
images.append(contentsOf: messageImages)
|
||||||
}
|
}
|
||||||
|
|
||||||
if !images.isEmpty, currentModel?.supportsImages != true {
|
if !images.isEmpty, currentModel?.supportsImages != true {
|
||||||
|
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
|
||||||
sendResponse(
|
sendResponse(
|
||||||
connection: connection,
|
connection: connection,
|
||||||
status: 400,
|
status: 400,
|
||||||
@@ -318,18 +314,18 @@ final class APIServer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Context window check: estimate token count and reject if over limit
|
// Context window check: estimate token count and reject if over limit
|
||||||
|
let estimatedPromptTokens = (instructions.count + chatMessages.reduce(0) { $0 + $1.content.count }) * 10 / 35
|
||||||
if contextLength > 0 {
|
if contextLength > 0 {
|
||||||
let totalChars = chatMessages.reduce(0) { $0 + $1.content.count }
|
let needed = estimatedPromptTokens + maxTokens
|
||||||
let estimatedTokens = totalChars * 10 / 35 // ~3.5 chars per token
|
|
||||||
let needed = estimatedTokens + maxTokens
|
|
||||||
if needed > contextLength {
|
if needed > contextLength {
|
||||||
let errorBody = """
|
let errorBody = """
|
||||||
{"error":{"message":"This model's maximum context length is \(contextLength) tokens. \
|
{"error":{"message":"This model's maximum context length is \(contextLength) tokens. \
|
||||||
However, your messages resulted in approximately \(estimatedTokens) tokens and \
|
However, your messages resulted in approximately \(estimatedPromptTokens) tokens and \
|
||||||
\(maxTokens) tokens were requested for the completion (\(needed) total). \
|
\(maxTokens) tokens were requested for the completion (\(needed) total). \
|
||||||
Please reduce the length of the messages or completion.",\
|
Please reduce the length of the messages or completion.",\
|
||||||
"type":"invalid_request_error","code":"context_length_exceeded"}}
|
"type":"invalid_request_error","code":"context_length_exceeded"}}
|
||||||
"""
|
"""
|
||||||
|
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: 0)
|
||||||
sendResponse(connection: connection, status: 400, body: errorBody)
|
sendResponse(connection: connection, status: 400, body: errorBody)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -345,23 +341,28 @@ final class APIServer {
|
|||||||
let allButLast = Array(chatMessages.dropLast())
|
let allButLast = Array(chatMessages.dropLast())
|
||||||
let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "")
|
let lastMessage = chatMessages.last ?? Chat.Message(role: .user, content: "")
|
||||||
|
|
||||||
// KV cache reuse: check if the cached session's history matches
|
let historySignatures = Array(messageSignatures.dropLast())
|
||||||
let currentModelId = modelManager.currentModel?.id
|
let currentModelId = modelManager.currentModel?.id ?? modelName
|
||||||
let canReuse = cachedSession != nil
|
let lease = ConversationSessionCache.shared.checkoutSession(
|
||||||
&& cachedModelId == currentModelId
|
modelId: currentModelId,
|
||||||
&& cachedMessages != nil
|
instructions: instructions,
|
||||||
&& cachedInstructions == instructions
|
historySignatures: historySignatures,
|
||||||
&& messagesMatch(cachedMessages!, allButLast)
|
requestMessageCount: chatMessages.count,
|
||||||
|
estimatedPromptTokens: estimatedPromptTokens,
|
||||||
|
estimatedBytes: estimatedBytes
|
||||||
|
)
|
||||||
|
|
||||||
let session: ChatSession
|
let session: ChatSession
|
||||||
if canReuse {
|
if let reusableSession = lease.session {
|
||||||
print("[APIServer] Reusing cached session (\(allButLast.count) history messages)")
|
print("[APIServer] Reusing cached session (\(allButLast.count) history messages)")
|
||||||
session = cachedSession!
|
session = reusableSession
|
||||||
session.generateParameters = generateParams
|
session.generateParameters = generateParams
|
||||||
|
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
|
||||||
|
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
|
||||||
} else {
|
} else {
|
||||||
if cachedSession != nil {
|
print("[APIServer] Creating fresh session")
|
||||||
print("[APIServer] History diverged, creating fresh session")
|
ConversationSessionCache.shared.markSessionBuild(entryId: lease.entryId)
|
||||||
}
|
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .sessionBuild)
|
||||||
// Use `instructions:` for system/tool prompt (matches internal chat pattern).
|
// Use `instructions:` for system/tool prompt (matches internal chat pattern).
|
||||||
// Only conversation turns go in `history:` — this avoids replaying the
|
// Only conversation turns go in `history:` — this avoids replaying the
|
||||||
// large tool prompt as history on every new session.
|
// large tool prompt as history on every new session.
|
||||||
@@ -385,47 +386,62 @@ final class APIServer {
|
|||||||
additionalContext: thinkingContext
|
additionalContext: thinkingContext
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
ConversationSessionCache.shared.markPrefilling(entryId: lease.entryId)
|
||||||
|
LiveCounters.shared.requestPhaseChanged(requestId: requestId, phase: .prefilling)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract images from the last message only (ChatSession.streamDetails takes images separately)
|
// Extract images from the last message only (ChatSession.streamDetails takes images separately)
|
||||||
let lastImages = lastMessage.images
|
let lastImages = lastMessage.images
|
||||||
|
|
||||||
LiveCounters.shared.requestStarted(contextLength: contextLength)
|
let result: (promptTokens: Int, completionTokens: Int, succeeded: Bool)
|
||||||
|
|
||||||
if isStream {
|
if isStream {
|
||||||
await handleStreamingResponse(
|
result = await handleStreamingResponse(
|
||||||
connection: connection,
|
connection: connection,
|
||||||
|
requestId: requestId,
|
||||||
|
cacheEntryId: lease.entryId,
|
||||||
session: session,
|
session: session,
|
||||||
prompt: lastMessage.content,
|
prompt: lastMessage.content,
|
||||||
images: lastImages,
|
images: lastImages,
|
||||||
tools: request.tools,
|
tools: request.tools,
|
||||||
requestId: requestId,
|
|
||||||
created: created,
|
created: created,
|
||||||
modelName: modelName
|
modelName: modelName
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
await handleNonStreamingResponse(
|
result = await handleNonStreamingResponse(
|
||||||
connection: connection,
|
connection: connection,
|
||||||
|
requestId: requestId,
|
||||||
|
cacheEntryId: lease.entryId,
|
||||||
session: session,
|
session: session,
|
||||||
prompt: lastMessage.content,
|
prompt: lastMessage.content,
|
||||||
images: lastImages,
|
images: lastImages,
|
||||||
tools: request.tools,
|
tools: request.tools,
|
||||||
requestId: requestId,
|
|
||||||
created: created,
|
created: created,
|
||||||
modelName: modelName
|
modelName: modelName
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cache the session for reuse on next request
|
if result.succeeded {
|
||||||
// allButLast + lastMessage (user) + assistant response = new cached history
|
ConversationSessionCache.shared.completeRequest(
|
||||||
cachedSession = session
|
entryId: lease.entryId,
|
||||||
cachedMessages = chatMessages // full messages including the one just sent
|
session: session,
|
||||||
cachedModelId = currentModelId
|
requestMessageSignatures: messageSignatures,
|
||||||
cachedInstructions = instructions
|
requestMessageCount: chatMessages.count,
|
||||||
|
estimatedPromptTokens: estimatedPromptTokens,
|
||||||
|
estimatedBytes: estimatedBytes,
|
||||||
|
promptTokens: result.promptTokens,
|
||||||
|
completionTokens: result.completionTokens
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
ConversationSessionCache.shared.abandonRequest(entryId: lease.entryId)
|
||||||
|
}
|
||||||
|
|
||||||
|
LiveCounters.shared.requestCompleted(requestId: requestId, generationTokens: result.completionTokens)
|
||||||
|
modelManager.touchActivity()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decode a base64 data URI (data:image/png;base64,...) into a UserInput.Image.
|
/// Decode a base64 data URI (data:image/png;base64,...) into a UserInput.Image.
|
||||||
private func decodeBase64Image(_ urlString: String) -> UserInput.Image? {
|
private func decodeBase64Image(_ urlString: String) -> DecodedImage? {
|
||||||
// Handle data URIs: data:image/png;base64,<data>
|
// Handle data URIs: data:image/png;base64,<data>
|
||||||
let base64String: String
|
let base64String: String
|
||||||
if urlString.hasPrefix("data:") {
|
if urlString.hasPrefix("data:") {
|
||||||
@@ -442,21 +458,23 @@ final class APIServer {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return .ciImage(CIImage(cgImage: cgImage))
|
let estimatedBytes = max(data.count, cgImage.width * cgImage.height * 4)
|
||||||
|
return DecodedImage(image: .ciImage(CIImage(cgImage: cgImage)), estimatedBytes: estimatedBytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
// MARK: - Non-streaming response
|
// MARK: - Non-streaming response
|
||||||
|
|
||||||
private func handleNonStreamingResponse(
|
private func handleNonStreamingResponse(
|
||||||
connection: NWConnection,
|
connection: NWConnection,
|
||||||
|
requestId: String,
|
||||||
|
cacheEntryId: UUID,
|
||||||
session: ChatSession,
|
session: ChatSession,
|
||||||
prompt: String,
|
prompt: String,
|
||||||
images: [UserInput.Image],
|
images: [UserInput.Image],
|
||||||
tools: [APIToolDefinition]?,
|
tools: [APIToolDefinition]?,
|
||||||
requestId: String,
|
|
||||||
created: Int,
|
created: Int,
|
||||||
modelName: String
|
modelName: String
|
||||||
) async {
|
) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
|
||||||
do {
|
do {
|
||||||
var fullText = ""
|
var fullText = ""
|
||||||
var promptTokens = 0
|
var promptTokens = 0
|
||||||
@@ -478,7 +496,12 @@ final class APIServer {
|
|||||||
case .info(let info):
|
case .info(let info):
|
||||||
promptTokens = info.promptTokenCount
|
promptTokens = info.promptTokenCount
|
||||||
completionTokens = info.generationTokenCount
|
completionTokens = info.generationTokenCount
|
||||||
LiveCounters.shared.prefillCompleted(promptTokens: promptTokens)
|
ConversationSessionCache.shared.markGenerating(
|
||||||
|
entryId: cacheEntryId,
|
||||||
|
promptTokens: promptTokens,
|
||||||
|
completionTokens: completionTokens
|
||||||
|
)
|
||||||
|
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
|
||||||
if info.tokensPerSecond > 0 {
|
if info.tokensPerSecond > 0 {
|
||||||
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
|
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
|
||||||
}
|
}
|
||||||
@@ -487,9 +510,6 @@ final class APIServer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LiveCounters.shared.requestCompleted(generationTokens: completionTokens)
|
|
||||||
modelManager?.touchActivity()
|
|
||||||
|
|
||||||
// Parse tool calls: first check framework-detected ones, then our own text parser
|
// Parse tool calls: first check framework-detected ones, then our own text parser
|
||||||
var finishReason = "stop"
|
var finishReason = "stop"
|
||||||
var responseContent: String? = fullText
|
var responseContent: String? = fullText
|
||||||
@@ -559,10 +579,10 @@ final class APIServer {
|
|||||||
if let json = try? JSONEncoder().encode(response) {
|
if let json = try? JSONEncoder().encode(response) {
|
||||||
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
|
sendResponse(connection: connection, status: 200, body: String(data: json, encoding: .utf8) ?? "{}")
|
||||||
}
|
}
|
||||||
|
return (promptTokens, completionTokens, true)
|
||||||
} catch {
|
} catch {
|
||||||
LiveCounters.shared.requestCompleted(generationTokens: 0)
|
|
||||||
modelManager?.touchActivity()
|
|
||||||
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
|
sendResponse(connection: connection, status: 500, body: #"{"error":"\#(error.localizedDescription)"}"#)
|
||||||
|
return (0, 0, false)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -570,14 +590,15 @@ final class APIServer {
|
|||||||
|
|
||||||
private func handleStreamingResponse(
|
private func handleStreamingResponse(
|
||||||
connection: NWConnection,
|
connection: NWConnection,
|
||||||
|
requestId: String,
|
||||||
|
cacheEntryId: UUID,
|
||||||
session: ChatSession,
|
session: ChatSession,
|
||||||
prompt: String,
|
prompt: String,
|
||||||
images: [UserInput.Image],
|
images: [UserInput.Image],
|
||||||
tools: [APIToolDefinition]?,
|
tools: [APIToolDefinition]?,
|
||||||
requestId: String,
|
|
||||||
created: Int,
|
created: Int,
|
||||||
modelName: String
|
modelName: String
|
||||||
) async {
|
) async -> (promptTokens: Int, completionTokens: Int, succeeded: Bool) {
|
||||||
// Send SSE headers
|
// Send SSE headers
|
||||||
let header = [
|
let header = [
|
||||||
"HTTP/1.1 200 OK",
|
"HTTP/1.1 200 OK",
|
||||||
@@ -625,7 +646,16 @@ final class APIServer {
|
|||||||
)
|
)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
let (promptTokens, completionTokens, fullText, frameworkToolCalls) = result
|
let (promptTokens, completionTokens, fullText, frameworkToolCalls, succeeded) = result
|
||||||
|
|
||||||
|
if promptTokens > 0 {
|
||||||
|
ConversationSessionCache.shared.markGenerating(
|
||||||
|
entryId: cacheEntryId,
|
||||||
|
promptTokens: promptTokens,
|
||||||
|
completionTokens: completionTokens
|
||||||
|
)
|
||||||
|
LiveCounters.shared.prefillCompleted(requestId: requestId, promptTokens: promptTokens)
|
||||||
|
}
|
||||||
|
|
||||||
// Stats were already updated by LiveCounters inside the loop
|
// Stats were already updated by LiveCounters inside the loop
|
||||||
|
|
||||||
@@ -696,12 +726,10 @@ final class APIServer {
|
|||||||
)
|
)
|
||||||
))
|
))
|
||||||
|
|
||||||
LiveCounters.shared.requestCompleted(generationTokens: completionTokens)
|
|
||||||
modelManager?.touchActivity()
|
|
||||||
|
|
||||||
// Send [DONE] and close
|
// Send [DONE] and close
|
||||||
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
|
await Self.sendData(connection: connection, data: "data: [DONE]\n\n".data(using: .utf8)!)
|
||||||
connection.cancel()
|
connection.cancel()
|
||||||
|
return (promptTokens, completionTokens, succeeded)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the token generation + SSE send loop entirely off MainActor.
|
/// Run the token generation + SSE send loop entirely off MainActor.
|
||||||
@@ -713,7 +741,7 @@ final class APIServer {
|
|||||||
requestId: String,
|
requestId: String,
|
||||||
created: Int,
|
created: Int,
|
||||||
modelName: String
|
modelName: String
|
||||||
) async -> (Int, Int, String, [MLXLMCommon.ToolCall]) {
|
) async -> (Int, Int, String, [MLXLMCommon.ToolCall], Bool) {
|
||||||
var promptTokens = 0
|
var promptTokens = 0
|
||||||
var completionTokens = 0
|
var completionTokens = 0
|
||||||
var fullText = ""
|
var fullText = ""
|
||||||
@@ -742,7 +770,6 @@ final class APIServer {
|
|||||||
case .info(let info):
|
case .info(let info):
|
||||||
promptTokens = info.promptTokenCount
|
promptTokens = info.promptTokenCount
|
||||||
completionTokens = info.generationTokenCount
|
completionTokens = info.generationTokenCount
|
||||||
LiveCounters.shared.prefillCompleted(promptTokens: promptTokens)
|
|
||||||
if info.tokensPerSecond > 0 {
|
if info.tokensPerSecond > 0 {
|
||||||
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
|
LiveCounters.shared.tokenGenerated(tokensPerSecond: info.tokensPerSecond, totalGenerated: completionTokens)
|
||||||
}
|
}
|
||||||
@@ -754,9 +781,10 @@ final class APIServer {
|
|||||||
} catch {
|
} catch {
|
||||||
let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
|
let errorEvent = "data: {\"error\":\"\(error.localizedDescription)\"}\n\n"
|
||||||
await sendData(connection: connection, data: errorEvent.data(using: .utf8)!)
|
await sendData(connection: connection, data: errorEvent.data(using: .utf8)!)
|
||||||
|
return (promptTokens, completionTokens, fullText, frameworkToolCalls, false)
|
||||||
}
|
}
|
||||||
|
|
||||||
return (promptTokens, completionTokens, fullText, frameworkToolCalls)
|
return (promptTokens, completionTokens, fullText, frameworkToolCalls, true)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Send an SSE event and wait for the protocol stack to process it.
|
/// Send an SSE event and wait for the protocol stack to process it.
|
||||||
@@ -819,26 +847,42 @@ final class APIServer {
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if the cached session can be reused for the new history.
|
/// Computes a 64-bit FNV-1a signature over a message's role, text content and image URLs.
///
/// Every field is hashed with its UTF-8 byte length first, so field boundaries are
/// unambiguous: content `"a|b"` with no images can no longer produce the same byte
/// stream as content `"a"` with image URL `"b"`, which a plain `|` separator allowed.
/// Signatures are only meaningful when compared with other values produced by this function.
///
/// - Parameters:
///   - role: The chat role of the message.
///   - content: The message text.
///   - imageURLs: The image URL strings attached to the message, in request order.
/// - Returns: The signature of the message.
private static func messageSignature(role: Chat.Message.Role, content: String, imageURLs: [String]) -> UInt64 {
    var hash: UInt64 = 14_695_981_039_346_656_037 // FNV-1a 64-bit offset basis

    func mixByte(_ byte: UInt8) {
        hash ^= UInt64(byte)
        hash &*= 1_099_511_628_211 // FNV 64-bit prime; wrapping multiply is intentional
    }

    func mix(_ text: String) {
        let bytes = text.utf8
        // Length prefix (8 bytes, little-endian) keeps adjacent fields from bleeding into each other.
        var remaining = UInt64(bytes.count)
        for _ in 0..<8 {
            mixByte(UInt8(truncatingIfNeeded: remaining))
            remaining >>= 8
        }
        for byte in bytes {
            mixByte(byte)
        }
    }

    switch role {
    case .assistant:
        mix("assistant")
    case .system:
        mix("system")
    case .user:
        mix("user")
    @unknown default:
        mix("unknown")
    }
    mix(content)
    for imageURL in imageURLs {
        mix(imageURL)
    }

    return hash
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A decoded request image together with a rough size estimate, as returned by `decodeBase64Image`.
private struct DecodedImage {
|
||||||
|
/// The decoded image; appended to the images attached to the chat message.
let image: UserInput.Image
|
||||||
|
/// Estimated size in bytes. `decodeBase64Image` sets it to the larger of `data.count`
/// and `width * height * 4`; callers add it to the request's byte estimate.
let estimatedBytes: Int
|
||||||
|
}
|
||||||
|
|
||||||
// MARK: - HTTP request parser
|
// MARK: - HTTP request parser
|
||||||
|
|
||||||
private struct HTTPRequest {
|
private struct HTTPRequest {
|
||||||
|
|||||||
358
MLXServer/Server/ConversationSessionCache.swift
Normal file
358
MLXServer/Server/ConversationSessionCache.swift
Normal file
@@ -0,0 +1,358 @@
|
|||||||
|
import Foundation
|
||||||
|
import MLXLMCommon
|
||||||
|
import os
|
||||||
|
|
||||||
|
/// Phase of an API chat session as tracked by the session cache.
/// The raw value is the human-readable label for each phase.
enum APISessionPhase: String, Sendable {
    case idle = "Idle",
         sessionBuild = "Session Build",
         prefilling = "Prefilling",
         generating = "Generating"
}
|
||||||
|
|
||||||
|
/// Bounded, process-wide cache of API chat sessions.
///
/// Entries are matched by model id, a hash of the instructions, and per-message history
/// signatures. All mutable state is guarded by `lock`, so the cache can be sampled via
/// `snapshot()` without involving the main actor.
final class ConversationSessionCache: @unchecked Sendable {
    static let shared = ConversationSessionCache()

    /// Guards `entries` and `totals`. Methods suffixed `Locked` expect it to be held.
    private let lock = OSAllocatedUnfairLock()

    /// Maximum number of entries kept once a request completes.
    private let maxEntries = 8
    /// Maximum summed `cachedTokenEstimate` across all entries.
    private let maxCachedTokens = 256_000
    /// Entries with no in-flight request are pruned after this many seconds without access.
    private let idleTTL: TimeInterval = 10 * 60

    private var entries: [UUID: Entry] = [:]
    private var totals = Totals()

    private init() {}

    // MARK: - Public value types

    /// Result of `checkoutSession`: identifies the entry the request is charged to and,
    /// on a cache hit, carries the session to reuse.
    struct Lease {
        let entryId: UUID
        /// `nil` on a cache miss; the caller builds a session and passes it to `completeRequest`.
        let session: ChatSession?
        let reusedPromptTokens: Int
        let cacheHit: Bool
    }

    /// Immutable per-entry view exposed through `Snapshot.sessions`.
    struct SessionSummary: Identifiable, Sendable {
        let id: UUID
        let modelId: String
        let phase: APISessionPhase
        let messageCount: Int
        let cachedTokenEstimate: Int
        let estimatedBytes: Int
        let inFlightRequests: Int
        let hitCount: Int
        let lastPromptTokens: Int
        let lastCompletionTokens: Int
        let lastReuseTokens: Int
        let createdAt: Date
        let lastAccessAt: Date
    }

    /// Point-in-time aggregate of the cache plus lifetime counters.
    struct Snapshot: Sendable {
        let totalEntries: Int
        let warmEntries: Int
        let activeEntries: Int
        let generatingEntries: Int
        let estimatedBytes: Int
        let cachedTokenEstimate: Int
        let totalHits: Int
        let totalMisses: Int
        let totalEvictions: Int
        let totalReusePromptTokens: Int
        let totalRebuildPromptTokens: Int
        let sessions: [SessionSummary]
    }

    // MARK: - Request lifecycle

    /// Finds a reusable session for the request, or registers a placeholder entry on a miss.
    ///
    /// A cached entry qualifies when it has the same model id and instructions hash, holds a
    /// session, has no request in flight, and its stored signatures are a prefix of
    /// `historySignatures` with at most one extra incoming message. Among qualifying entries
    /// the one with the longest stored history wins.
    ///
    /// - Returns: A lease whose `entryId` must later be passed to either `completeRequest`
    ///   or `abandonRequest` so the in-flight count is released.
    func checkoutSession(
        modelId: String,
        instructions: String,
        historySignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int
    ) -> Lease {
        lock.lock()
        defer { lock.unlock() }

        let now = Date()
        pruneExpiredLocked(now: now)

        let instructionsHash = Self.stableHash(instructions)
        let bestMatch = entries.values
            .filter {
                $0.modelId == modelId
                    && $0.instructionsHash == instructionsHash
                    && $0.session != nil
                    && $0.inFlightRequests == 0
                    && Self.historyMatches(cached: $0.requestMessageSignatures, incoming: historySignatures)
            }
            .max { $0.requestMessageSignatures.count < $1.requestMessageSignatures.count }

        if var entry = bestMatch {
            entry.inFlightRequests += 1
            entry.lastAccessAt = now
            entry.phase = .prefilling
            // NOTE(review): `max` reports the larger of the cached estimate and the new prompt
            // estimate as "reused"; this may overstate reuse — confirm the intended metric.
            entry.lastReuseTokens = max(entry.cachedTokenEstimate, estimatedPromptTokens)
            entry.hitCount += 1
            entries[entry.id] = entry

            totals.totalHits += 1
            totals.totalReusePromptTokens += entry.lastReuseTokens

            return Lease(
                entryId: entry.id,
                session: entry.session,
                reusedPromptTokens: entry.lastReuseTokens,
                cacheHit: true
            )
        }

        // Miss: register a session-less placeholder so the request shows up in snapshots.
        let entryId = UUID()
        entries[entryId] = Entry(
            id: entryId,
            modelId: modelId,
            instructionsHash: instructionsHash,
            requestMessageSignatures: historySignatures,
            messageCount: requestMessageCount,
            cachedTokenEstimate: estimatedPromptTokens,
            estimatedBytes: estimatedBytes,
            createdAt: now,
            lastAccessAt: now,
            inFlightRequests: 1,
            hitCount: 0,
            phase: .sessionBuild,
            lastPromptTokens: 0,
            lastCompletionTokens: 0,
            lastReuseTokens: 0,
            session: nil
        )
        totals.totalMisses += 1
        totals.totalRebuildPromptTokens += estimatedPromptTokens

        return Lease(entryId: entryId, session: nil, reusedPromptTokens: 0, cacheHit: false)
    }

    /// Marks the entry as building a fresh session.
    func markSessionBuild(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .sessionBuild)
    }

    /// Marks the entry as prefilling the prompt.
    func markPrefilling(entryId: UUID) {
        updatePhase(entryId: entryId, phase: .prefilling)
    }

    /// Marks the entry as generating and records the latest token counts.
    /// The cached-token estimate only ever grows here.
    func markGenerating(entryId: UUID, promptTokens: Int, completionTokens: Int) {
        lock.lock()
        defer { lock.unlock() }
        guard var entry = entries[entryId] else { return }

        entry.phase = .generating
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.cachedTokenEstimate = max(entry.cachedTokenEstimate, promptTokens + completionTokens)
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Stores the session and request metadata after a successful request, releases the
    /// in-flight count, and then enforces the entry/token budget.
    ///
    /// Does nothing if the entry no longer exists (for example after `invalidateAll()`).
    func completeRequest(
        entryId: UUID,
        session: ChatSession,
        requestMessageSignatures: [UInt64],
        requestMessageCount: Int,
        estimatedPromptTokens: Int,
        estimatedBytes: Int,
        promptTokens: Int,
        completionTokens: Int
    ) {
        lock.lock()
        defer { lock.unlock() }
        guard var entry = entries[entryId] else { return }

        let now = Date()
        entry.session = session
        entry.requestMessageSignatures = requestMessageSignatures
        entry.messageCount = requestMessageCount
        entry.cachedTokenEstimate = max(estimatedPromptTokens, promptTokens + completionTokens)
        entry.estimatedBytes = estimatedBytes
        entry.lastPromptTokens = promptTokens
        entry.lastCompletionTokens = completionTokens
        entry.lastAccessAt = now
        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
        entry.phase = .idle
        entries[entryId] = entry

        enforceBudgetLocked(now: now)
    }

    /// Releases the in-flight count after a failed request.
    ///
    /// Once no request is in flight the entry is removed. A placeholder has nothing worth
    /// keeping, and a warm session may have been advanced by the failed turn while the stored
    /// signatures still describe the previous request, so reusing it could replay a diverged
    /// context. Dropping a warm session is counted as an eviction.
    func abandonRequest(entryId: UUID) {
        lock.lock()
        defer { lock.unlock() }
        guard var entry = entries[entryId] else { return }

        entry.inFlightRequests = max(0, entry.inFlightRequests - 1)
        guard entry.inFlightRequests == 0 else {
            entry.phase = .idle
            entry.lastAccessAt = Date()
            entries[entryId] = entry
            return
        }

        entries.removeValue(forKey: entryId)
        if entry.session != nil {
            totals.totalEvictions += 1
        }
    }

    // MARK: - Maintenance

    /// Removes every entry and counts each one as an eviction; lifetime totals are kept.
    func invalidateAll() {
        lock.lock()
        defer { lock.unlock() }
        totals.totalEvictions += entries.count
        entries.removeAll()
    }

    /// Removes every entry and zeroes the lifetime totals.
    func reset() {
        lock.lock()
        defer { lock.unlock() }
        entries.removeAll()
        totals = Totals()
    }

    /// Returns an aggregate view of the cache. Expired idle entries are pruned first.
    /// Sessions are ordered by in-flight count (descending), then most recent access.
    func snapshot() -> Snapshot {
        lock.lock()
        defer { lock.unlock() }

        pruneExpiredLocked(now: Date())
        let allEntries = Array(entries.values)

        let summaries = allEntries
            .sorted { lhs, rhs in
                if lhs.inFlightRequests != rhs.inFlightRequests {
                    return lhs.inFlightRequests > rhs.inFlightRequests
                }
                return lhs.lastAccessAt > rhs.lastAccessAt
            }
            .map { entry in
                SessionSummary(
                    id: entry.id,
                    modelId: entry.modelId,
                    phase: entry.phase,
                    messageCount: entry.messageCount,
                    cachedTokenEstimate: entry.cachedTokenEstimate,
                    estimatedBytes: entry.estimatedBytes,
                    inFlightRequests: entry.inFlightRequests,
                    hitCount: entry.hitCount,
                    lastPromptTokens: entry.lastPromptTokens,
                    lastCompletionTokens: entry.lastCompletionTokens,
                    lastReuseTokens: entry.lastReuseTokens,
                    createdAt: entry.createdAt,
                    lastAccessAt: entry.lastAccessAt
                )
            }

        return Snapshot(
            totalEntries: allEntries.count,
            warmEntries: allEntries.filter { $0.session != nil }.count,
            activeEntries: allEntries.filter { $0.inFlightRequests > 0 }.count,
            generatingEntries: allEntries.filter { $0.phase == .generating }.count,
            estimatedBytes: allEntries.reduce(0) { $0 + $1.estimatedBytes },
            cachedTokenEstimate: allEntries.reduce(0) { $0 + $1.cachedTokenEstimate },
            totalHits: totals.totalHits,
            totalMisses: totals.totalMisses,
            totalEvictions: totals.totalEvictions,
            totalReusePromptTokens: totals.totalReusePromptTokens,
            totalRebuildPromptTokens: totals.totalRebuildPromptTokens,
            sessions: summaries
        )
    }

    // MARK: - Private helpers

    /// Sets the phase of an entry and refreshes its access time; no-op for unknown ids.
    private func updatePhase(entryId: UUID, phase: APISessionPhase) {
        lock.lock()
        defer { lock.unlock() }
        guard var entry = entries[entryId] else { return }

        entry.phase = phase
        entry.lastAccessAt = Date()
        entries[entryId] = entry
    }

    /// Removes entries with no in-flight request whose last access is older than `idleTTL`,
    /// counting each as an eviction. Caller must hold `lock`.
    private func pruneExpiredLocked(now: Date) {
        let expiredIds = entries.values
            .filter { $0.inFlightRequests == 0 && now.timeIntervalSince($0.lastAccessAt) > idleTTL }
            .map(\.id)
        guard !expiredIds.isEmpty else { return }

        for id in expiredIds {
            entries.removeValue(forKey: id)
        }
        totals.totalEvictions += expiredIds.count
    }

    /// Evicts idle entries, in `evictionOrder`, until both the entry count and the summed
    /// token estimate are within budget or nothing evictable remains. Caller must hold `lock`.
    private func enforceBudgetLocked(now: Date) {
        pruneExpiredLocked(now: now)

        var cachedTokens = entries.values.reduce(0) { $0 + $1.cachedTokenEstimate }
        while entries.count > maxEntries || cachedTokens > maxCachedTokens {
            // Entries with a request in flight are never evicted.
            guard let victim = entries.values
                .filter({ $0.inFlightRequests == 0 })
                .min(by: evictionOrder)
            else {
                break
            }
            entries.removeValue(forKey: victim.id)
            cachedTokens -= victim.cachedTokenEstimate
            totals.totalEvictions += 1
        }
    }

    /// Ordering for eviction: least recently accessed first, then larger token estimate,
    /// then older creation time.
    private func evictionOrder(lhs: Entry, rhs: Entry) -> Bool {
        if lhs.lastAccessAt != rhs.lastAccessAt {
            return lhs.lastAccessAt < rhs.lastAccessAt
        }
        if lhs.cachedTokenEstimate != rhs.cachedTokenEstimate {
            return lhs.cachedTokenEstimate > rhs.cachedTokenEstimate
        }
        return lhs.createdAt < rhs.createdAt
    }

    /// Returns `true` when `cached` is a prefix of `incoming` and `incoming` has at most
    /// one additional element.
    private static func historyMatches(cached: [UInt64], incoming: [UInt64]) -> Bool {
        guard cached.count <= incoming.count,
              incoming.count <= cached.count + 1 else { return false }
        return incoming.starts(with: cached)
    }

    /// 64-bit FNV-1a hash of the UTF-8 bytes of `text`.
    static func stableHash(_ text: String) -> UInt64 {
        var hash: UInt64 = 14_695_981_039_346_656_037 // FNV-1a 64-bit offset basis
        for byte in text.utf8 {
            hash ^= UInt64(byte)
            hash &*= 1_099_511_628_211 // FNV 64-bit prime; wrapping multiply is intentional
        }
        return hash
    }

    // MARK: - Storage types

    /// Internal record for one cached (or in-progress) conversation.
    private struct Entry {
        let id: UUID
        let modelId: String
        let instructionsHash: UInt64
        var requestMessageSignatures: [UInt64]
        var messageCount: Int
        var cachedTokenEstimate: Int
        var estimatedBytes: Int
        let createdAt: Date
        var lastAccessAt: Date
        var inFlightRequests: Int
        var hitCount: Int
        var phase: APISessionPhase
        var lastPromptTokens: Int
        var lastCompletionTokens: Int
        var lastReuseTokens: Int
        /// `nil` until the first successful `completeRequest` for this entry.
        var session: ChatSession?
    }

    /// Lifetime counters; cleared only by `reset()`.
    private struct Totals {
        var totalHits: Int = 0
        var totalMisses: Int = 0
        var totalEvictions: Int = 0
        var totalReusePromptTokens: Int = 0
        var totalRebuildPromptTokens: Int = 0
    }
}
|
||||||
@@ -6,28 +6,31 @@ import SwiftUI
|
|||||||
struct MonitorView: View {
|
struct MonitorView: View {
|
||||||
let stats: InferenceStats
|
let stats: InferenceStats
|
||||||
@Environment(ModelManager.self) private var modelManager
|
@Environment(ModelManager.self) private var modelManager
|
||||||
|
private let chartColumns = [GridItem(.flexible(minimum: 260), spacing: 16), GridItem(.flexible(minimum: 260), spacing: 16)]
|
||||||
|
private let cardColumns = [GridItem(.flexible(minimum: 180), spacing: 16), GridItem(.flexible(minimum: 180), spacing: 16)]
|
||||||
|
|
||||||
var body: some View {
|
var body: some View {
|
||||||
ScrollView {
|
ScrollView {
|
||||||
VStack(spacing: 20) {
|
VStack(spacing: 20) {
|
||||||
// Live status header
|
|
||||||
liveStatusSection
|
liveStatusSection
|
||||||
|
|
||||||
// Charts
|
LazyVGrid(columns: chartColumns, alignment: .leading, spacing: 16) {
|
||||||
HStack(alignment: .top, spacing: 16) {
|
|
||||||
tokenRateChart
|
tokenRateChart
|
||||||
tokenThroughputChart
|
tokenThroughputChart
|
||||||
|
cacheReuseChart
|
||||||
|
cacheFootprintChart
|
||||||
|
cacheSessionChart
|
||||||
}
|
}
|
||||||
|
|
||||||
// Gauges row
|
LazyVGrid(columns: cardColumns, alignment: .leading, spacing: 16) {
|
||||||
HStack(spacing: 16) {
|
|
||||||
contextGauge
|
contextGauge
|
||||||
gpuMemoryGauge
|
gpuMemoryGauge
|
||||||
requestsCard
|
requestsCard
|
||||||
|
cacheCard
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cumulative stats
|
|
||||||
cumulativeSection
|
cumulativeSection
|
||||||
|
sessionSection
|
||||||
}
|
}
|
||||||
.padding(20)
|
.padding(20)
|
||||||
}
|
}
|
||||||
@@ -39,45 +42,54 @@ struct MonitorView: View {
|
|||||||
|
|
||||||
@ViewBuilder
|
@ViewBuilder
|
||||||
private var liveStatusSection: some View {
|
private var liveStatusSection: some View {
|
||||||
HStack(spacing: 16) {
|
VStack(alignment: .leading, spacing: 12) {
|
||||||
// Activity indicator
|
HStack(spacing: 16) {
|
||||||
HStack(spacing: 8) {
|
HStack(spacing: 8) {
|
||||||
Circle()
|
Circle()
|
||||||
.fill(activityColor)
|
.fill(activityColor)
|
||||||
.frame(width: 10, height: 10)
|
.frame(width: 10, height: 10)
|
||||||
.overlay {
|
.overlay {
|
||||||
if stats.isGenerating || stats.isPrefilling {
|
if stats.activeRequests > 0 {
|
||||||
Circle()
|
Circle()
|
||||||
.stroke(activityColor.opacity(0.5), lineWidth: 2)
|
.stroke(activityColor.opacity(0.5), lineWidth: 2)
|
||||||
.scaleEffect(1.8)
|
.scaleEffect(1.8)
|
||||||
.opacity(0.6)
|
.opacity(0.6)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
Text(activityLabel)
|
Text(activityLabel)
|
||||||
.font(.headline)
|
.font(.headline)
|
||||||
}
|
|
||||||
|
|
||||||
Spacer()
|
|
||||||
|
|
||||||
if stats.isGenerating {
|
|
||||||
Text(String(format: "%.1f tok/s", stats.currentTokensPerSecond))
|
|
||||||
.font(.title2.monospacedDigit().bold())
|
|
||||||
.foregroundStyle(.green)
|
|
||||||
}
|
|
||||||
|
|
||||||
if stats.currentPromptTokens > 0 {
|
|
||||||
HStack(spacing: 4) {
|
|
||||||
Image(systemName: "arrow.down.circle.fill")
|
|
||||||
.foregroundStyle(.blue)
|
|
||||||
Text("\(stats.currentPromptTokens)")
|
|
||||||
.monospacedDigit()
|
|
||||||
Image(systemName: "arrow.up.circle.fill")
|
|
||||||
.foregroundStyle(.orange)
|
|
||||||
Text("\(stats.currentGenerationTokens)")
|
|
||||||
.monospacedDigit()
|
|
||||||
}
|
}
|
||||||
.font(.callout)
|
|
||||||
|
Spacer()
|
||||||
|
|
||||||
|
if stats.isGenerating {
|
||||||
|
Text(String(format: "%.1f tok/s", stats.currentTokensPerSecond))
|
||||||
|
.font(.title2.monospacedDigit().bold())
|
||||||
|
.foregroundStyle(.green)
|
||||||
|
}
|
||||||
|
|
||||||
|
if stats.currentPromptTokens > 0 {
|
||||||
|
HStack(spacing: 4) {
|
||||||
|
Image(systemName: "arrow.down.circle.fill")
|
||||||
|
.foregroundStyle(.blue)
|
||||||
|
Text("\(stats.currentPromptTokens)")
|
||||||
|
.monospacedDigit()
|
||||||
|
Image(systemName: "arrow.up.circle.fill")
|
||||||
|
.foregroundStyle(.orange)
|
||||||
|
Text("\(stats.currentGenerationTokens)")
|
||||||
|
.monospacedDigit()
|
||||||
|
}
|
||||||
|
.font(.callout)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
HStack(spacing: 8) {
|
||||||
|
phaseChip(title: "Preparing", count: stats.preparingRequests, color: .secondary)
|
||||||
|
phaseChip(title: "Session Build", count: stats.sessionBuildRequests, color: .purple)
|
||||||
|
phaseChip(title: "Prefill", count: stats.prefillingRequests, color: .blue)
|
||||||
|
phaseChip(title: "Generating", count: stats.generatingRequests, color: .green)
|
||||||
|
phaseChip(title: "Cache Active", count: stats.activeCacheEntryCount, color: .orange)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.padding(12)
|
.padding(12)
|
||||||
@@ -85,15 +97,19 @@ struct MonitorView: View {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private var activityColor: Color {
|
private var activityColor: Color {
|
||||||
if stats.isPrefilling { return .blue }
|
|
||||||
if stats.isGenerating { return .green }
|
if stats.isGenerating { return .green }
|
||||||
|
if stats.prefillingRequests > 0 { return .blue }
|
||||||
|
if stats.sessionBuildRequests > 0 { return .purple }
|
||||||
|
if stats.preparingRequests > 0 { return .orange }
|
||||||
if stats.activeRequests > 0 { return .orange }
|
if stats.activeRequests > 0 { return .orange }
|
||||||
return .secondary
|
return .secondary
|
||||||
}
|
}
|
||||||
|
|
||||||
private var activityLabel: String {
|
private var activityLabel: String {
|
||||||
if stats.isPrefilling { return "Prefilling" }
|
|
||||||
if stats.isGenerating { return "Generating" }
|
if stats.isGenerating { return "Generating" }
|
||||||
|
if stats.prefillingRequests > 0 { return "Prefilling" }
|
||||||
|
if stats.sessionBuildRequests > 0 { return "Building Sessions" }
|
||||||
|
if stats.preparingRequests > 0 { return "Preparing Requests" }
|
||||||
if stats.activeRequests > 0 { return "Processing" }
|
if stats.activeRequests > 0 { return "Processing" }
|
||||||
return "Idle"
|
return "Idle"
|
||||||
}
|
}
|
||||||
@@ -145,6 +161,160 @@ struct MonitorView: View {
|
|||||||
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Bar chart card plotting `stats.cacheReuseHistory` (green) and
/// `stats.cacheRebuildHistory` (red) over time, with a two-entry legend.
@ViewBuilder
private var cacheReuseChart: some View {
    VStack(alignment: .leading, spacing: 6) {
        Text("Prefill Reuse (/sec)")
            .font(.caption.bold())
            .foregroundStyle(.secondary)

        Chart {
            // Reused samples.
            ForEach(stats.cacheReuseHistory) { sample in
                BarMark(x: .value("Time", sample.timestamp), y: .value("Tokens", sample.value))
                    .foregroundStyle(.green.opacity(0.75))
            }
            // Rebuilt samples.
            ForEach(stats.cacheRebuildHistory) { sample in
                BarMark(x: .value("Time", sample.timestamp), y: .value("Tokens", sample.value))
                    .foregroundStyle(.red.opacity(0.65))
            }
        }
        .chartXAxis {
            // Grid lines only, every 30 seconds; no time labels.
            AxisMarks(values: .stride(by: .second, count: 30)) { _ in
                AxisGridLine()
            }
        }
        .chartYAxis {
            AxisMarks(position: .leading) { mark in
                AxisGridLine()
                AxisValueLabel {
                    if let amount = mark.as(Double.self) {
                        Text(String(format: "%.0f", amount))
                            .font(.caption2.monospacedDigit())
                    }
                }
            }
        }
        .frame(height: 150)

        // Legend.
        HStack(spacing: 12) {
            Label("Reused", systemImage: "circle.fill")
                .font(.caption2)
                .foregroundStyle(.green)
            Label("Rebuilt", systemImage: "circle.fill")
                .font(.caption2)
                .foregroundStyle(.red)
        }
    }
    .padding(12)
    .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
|
||||||
|
|
||||||
|
/// Line-plus-area chart card plotting `stats.cacheFootprintHistory`, with
/// each sample's value divided by 1,048,576 so the Y axis reads in MB.
@ViewBuilder
private var cacheFootprintChart: some View {
    VStack(alignment: .leading, spacing: 6) {
        Text("Cache Footprint (est)")
            .font(.caption.bold())
            .foregroundStyle(.secondary)

        Chart {
            ForEach(stats.cacheFootprintHistory) { sample in
                // Outline of the footprint curve.
                LineMark(x: .value("Time", sample.timestamp), y: .value("MB", sample.value / 1_048_576))
                    .foregroundStyle(.orange)
                    .interpolationMethod(.monotone)

                // Faint fill beneath the curve.
                AreaMark(x: .value("Time", sample.timestamp), y: .value("MB", sample.value / 1_048_576))
                    .foregroundStyle(.orange.opacity(0.12))
                    .interpolationMethod(.monotone)
            }
        }
        .chartXAxis {
            // Grid lines only, every 30 seconds; no time labels.
            AxisMarks(values: .stride(by: .second, count: 30)) { _ in
                AxisGridLine()
            }
        }
        .chartYAxis {
            AxisMarks(position: .leading) { mark in
                AxisGridLine()
                AxisValueLabel {
                    if let megabytes = mark.as(Double.self) {
                        Text(String(format: "%.1f", megabytes))
                            .font(.caption2.monospacedDigit())
                    }
                }
            }
        }
        .frame(height: 150)
    }
    .padding(12)
    .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
|
||||||
|
|
||||||
|
/// Two-line chart card plotting `stats.cacheEntryHistory` (purple, "Cached")
/// and `stats.activeSessionHistory` (blue, "Active") over time, with a legend.
@ViewBuilder
private var cacheSessionChart: some View {
    VStack(alignment: .leading, spacing: 6) {
        Text("Cached Sessions")
            .font(.caption.bold())
            .foregroundStyle(.secondary)

        Chart {
            // Each group carries an explicit `series` value. Without one,
            // Swift Charts treats every LineMark in the chart as part of a
            // single line and draws a segment joining the last "Cached"
            // point to the first "Active" point.
            ForEach(stats.cacheEntryHistory) { point in
                LineMark(
                    x: .value("Time", point.timestamp),
                    y: .value("Cached", point.value),
                    series: .value("Series", "Cached")
                )
                .foregroundStyle(.purple)
                .interpolationMethod(.monotone)
            }
            ForEach(stats.activeSessionHistory) { point in
                LineMark(
                    x: .value("Time", point.timestamp),
                    y: .value("Active", point.value),
                    series: .value("Series", "Active")
                )
                .foregroundStyle(.blue)
                .interpolationMethod(.monotone)
            }
        }
        .chartXAxis {
            // Grid lines only, every 30 seconds; no time labels.
            AxisMarks(values: .stride(by: .second, count: 30)) { _ in
                AxisGridLine()
            }
        }
        .chartYAxis {
            AxisMarks(position: .leading) { value in
                AxisGridLine()
                AxisValueLabel {
                    if let v = value.as(Double.self) {
                        Text(String(format: "%.0f", v))
                            .font(.caption2.monospacedDigit())
                    }
                }
            }
        }
        .frame(height: 150)

        // Legend matching the static series colours above.
        HStack(spacing: 12) {
            Label("Cached", systemImage: "circle.fill")
                .font(.caption2)
                .foregroundStyle(.purple)
            Label("Active", systemImage: "circle.fill")
                .font(.caption2)
                .foregroundStyle(.blue)
        }
    }
    .padding(12)
    .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
|
||||||
|
|
||||||
private var maxTokenRate: Double {
|
private var maxTokenRate: Double {
|
||||||
stats.tokenRateHistory.map(\.value).max() ?? 10
|
stats.tokenRateHistory.map(\.value).max() ?? 10
|
||||||
}
|
}
|
||||||
@@ -303,35 +473,69 @@ struct MonitorView: View {
|
|||||||
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Summary card for the session cache: the entry count as a headline,
/// followed by labelled rows for warm and active entries, estimated
/// footprint, cached tokens and hit rate.
@ViewBuilder
private var cacheCard: some View {
    VStack(alignment: .leading, spacing: 8) {
        Text("Session Cache")
            .font(.caption.bold())
            .foregroundStyle(.secondary)

        Text("\(stats.cacheEntryCount)")
            .font(.title3.monospacedDigit().bold())

        // The caption font is applied once to every detail row.
        Group {
            LabeledContent("Warm") {
                Text("\(stats.warmCacheEntryCount)").monospacedDigit()
            }
            LabeledContent("Active") {
                Text("\(stats.activeCacheEntryCount)").monospacedDigit()
            }
            LabeledContent("Est. Footprint") {
                Text(formatByteCount(stats.cacheEstimatedBytes)).monospacedDigit()
            }
            LabeledContent("Cached Tokens") {
                Text(formatTokenCount(stats.cacheEstimatedTokens)).monospacedDigit()
            }
            LabeledContent("Hit Rate") {
                Text(String(format: "%.0f%%", cacheHitRate * 100)).monospacedDigit()
            }
        }
        .font(.caption)
    }
    .frame(maxWidth: .infinity, alignment: .leading)
    .padding(12)
    .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
|
||||||
|
|
||||||
// MARK: - Cumulative
|
// MARK: - Cumulative
|
||||||
|
|
||||||
@ViewBuilder
|
@ViewBuilder
|
||||||
private var cumulativeSection: some View {
|
private var cumulativeSection: some View {
|
||||||
HStack(spacing: 24) {
|
VStack(alignment: .leading, spacing: 10) {
|
||||||
VStack(spacing: 2) {
|
Text("Cumulative")
|
||||||
Text("Total Prompt Tokens")
|
.font(.caption.bold())
|
||||||
.font(.caption2)
|
.foregroundStyle(.secondary)
|
||||||
.foregroundStyle(.secondary)
|
|
||||||
Text(formatTokenCount(stats.totalPromptTokens))
|
|
||||||
.font(.callout.monospacedDigit().bold())
|
|
||||||
.foregroundStyle(.blue)
|
|
||||||
}
|
|
||||||
|
|
||||||
VStack(spacing: 2) {
|
LazyVGrid(columns: cardColumns, alignment: .leading, spacing: 12) {
|
||||||
Text("Total Generated Tokens")
|
statTile(title: "Prompt Tokens", value: formatTokenCount(stats.totalPromptTokens), color: .blue)
|
||||||
.font(.caption2)
|
statTile(title: "Generated Tokens", value: formatTokenCount(stats.totalGenerationTokens), color: .orange)
|
||||||
.foregroundStyle(.secondary)
|
statTile(title: "Cache Hits", value: "\(stats.totalCacheHits)", color: .green)
|
||||||
Text(formatTokenCount(stats.totalGenerationTokens))
|
statTile(title: "Cache Misses", value: "\(stats.totalCacheMisses)", color: .red)
|
||||||
.font(.callout.monospacedDigit().bold())
|
statTile(title: "Reused Prefill", value: formatTokenCount(stats.totalCacheReusePromptTokens), color: .green)
|
||||||
.foregroundStyle(.orange)
|
statTile(title: "Rebuilt Prefill", value: formatTokenCount(stats.totalCacheRebuildPromptTokens), color: .red)
|
||||||
}
|
statTile(title: "Evictions", value: "\(stats.totalCacheEvictions)", color: .secondary)
|
||||||
|
statTile(title: "Total Tokens", value: formatTokenCount(stats.totalPromptTokens + stats.totalGenerationTokens), color: .primary)
|
||||||
VStack(spacing: 2) {
|
|
||||||
Text("Total Tokens")
|
|
||||||
.font(.caption2)
|
|
||||||
.foregroundStyle(.secondary)
|
|
||||||
Text(formatTokenCount(stats.totalPromptTokens + stats.totalGenerationTokens))
|
|
||||||
.font(.callout.monospacedDigit().bold())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.frame(maxWidth: .infinity)
|
.frame(maxWidth: .infinity)
|
||||||
@@ -339,8 +543,129 @@ struct MonitorView: View {
|
|||||||
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
.background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Card listing one `sessionRow` per element of `stats.cachedSessions`,
/// or a placeholder message when that collection is empty.
@ViewBuilder
private var sessionSection: some View {
    let sessions = stats.cachedSessions

    VStack(alignment: .leading, spacing: 12) {
        // Header: title on the left, row count on the right.
        HStack {
            Text("Cached Chat Sessions")
                .font(.headline)
            Spacer()
            Text("\(sessions.count) visible")
                .font(.caption)
                .foregroundStyle(.secondary)
        }

        if sessions.isEmpty {
            Text("No cached sessions yet.")
                .font(.callout)
                .foregroundStyle(.secondary)
                .frame(maxWidth: .infinity, alignment: .leading)
        } else {
            ForEach(sessions) { entry in
                sessionRow(entry)
            }
        }
    }
    .frame(maxWidth: .infinity, alignment: .leading)
    .padding(12)
    .background(.regularMaterial, in: RoundedRectangle(cornerRadius: 10))
}
|
||||||
|
|
||||||
// MARK: - Helpers
|
// MARK: - Helpers
|
||||||
|
|
||||||
|
/// Capsule-shaped chip showing a coloured dot, a title and a count.
///
/// - Parameters:
///   - title: Text shown after the dot.
///   - count: Number shown after the title, in monospaced digits.
///   - color: Fill of the dot; also used at 12% opacity for the capsule.
@ViewBuilder
private func phaseChip(title: String, count: Int, color: Color) -> some View {
    HStack(spacing: 6) {
        Circle().fill(color).frame(width: 7, height: 7)
        Text(title)
        Text("\(count)").monospacedDigit()
    }
    .font(.caption)
    .padding(.horizontal, 8)
    .padding(.vertical, 4)
    .background(color.opacity(0.12), in: Capsule())
}
|
||||||
|
|
||||||
|
/// Full-width tile with a small secondary title above a bold value.
///
/// - Parameters:
///   - title: Caption shown on the first line.
///   - value: Pre-formatted text shown on the second line.
///   - color: Foreground style applied to `value`.
@ViewBuilder
private func statTile(title: String, value: String, color: Color) -> some View {
    VStack(alignment: .leading, spacing: 4) {
        Text(title).font(.caption2).foregroundStyle(.secondary)
        Text(value).font(.callout.monospacedDigit().bold()).foregroundStyle(color)
    }
    .frame(maxWidth: .infinity, alignment: .leading)
    .padding(10)
    .background(Color.primary.opacity(0.04), in: RoundedRectangle(cornerRadius: 8))
}
|
||||||
|
|
||||||
|
/// Detail row for one cached session: a header with model id and phase
/// badge, two rows of metrics, and a progress bar showing the session's
/// cached-token estimate relative to the maximum context.
///
/// - Parameter session: The summary to render.
@ViewBuilder
private func sessionRow(_ session: ConversationSessionCache.SessionSummary) -> some View {
    // Derived once; the same colour tints the dot, the badge and the bar.
    let phaseColor = color(for: session.phase)
    let ratio = maxContextRatio(for: session.cachedTokenEstimate)

    VStack(alignment: .leading, spacing: 10) {
        // Header.
        HStack(alignment: .firstTextBaseline) {
            HStack(spacing: 8) {
                Circle().fill(phaseColor).frame(width: 8, height: 8)
                Text(session.modelId)
                    .font(.callout.weight(.semibold))
                    .lineLimit(1)
            }
            Spacer()
            Text(session.phase.rawValue)
                .font(.caption.monospacedDigit())
                .padding(.horizontal, 8)
                .padding(.vertical, 4)
                .background(phaseColor.opacity(0.14), in: Capsule())
        }

        // Metrics, first row.
        HStack(spacing: 12) {
            sessionMetric("Msgs", "\(session.messageCount)")
            sessionMetric("Cached", formatTokenCount(session.cachedTokenEstimate))
            sessionMetric("Reuse", formatTokenCount(session.lastReuseTokens))
            sessionMetric("Footprint", formatByteCount(session.estimatedBytes))
            sessionMetric("Hits", "\(session.hitCount)")
            sessionMetric("Active", "\(session.inFlightRequests)")
        }

        // Metrics, second row.
        HStack(spacing: 12) {
            sessionMetric("Prompt", formatTokenCount(session.lastPromptTokens))
            sessionMetric("Completion", formatTokenCount(session.lastCompletionTokens))
            sessionMetric("Last Access", relativeTimeString(session.lastAccessAt))
        }

        ProgressView(value: ratio) {
            Text("Cached Context")
                .font(.caption2)
                .foregroundStyle(.secondary)
        } currentValueLabel: {
            Text("\(Int(ratio * 100))%")
                .font(.caption2.monospacedDigit())
                .foregroundStyle(.secondary)
        }
        .tint(phaseColor)
    }
    .padding(12)
    .background(Color.primary.opacity(0.035), in: RoundedRectangle(cornerRadius: 10))
}
|
||||||
|
|
||||||
|
/// Small two-line metric: a secondary caption above a bold value.
///
/// - Parameters:
///   - title: Caption shown on the first line.
///   - value: Pre-formatted text shown on the second line.
@ViewBuilder
private func sessionMetric(_ title: String, _ value: String) -> some View {
    VStack(alignment: .leading, spacing: 2) {
        Text(title).font(.caption2).foregroundStyle(.secondary)
        Text(value).font(.caption.monospacedDigit().bold())
    }
}
|
||||||
|
|
||||||
private func formatTokenCount(_ count: Int) -> String {
|
private func formatTokenCount(_ count: Int) -> String {
|
||||||
if count >= 1_000_000 {
|
if count >= 1_000_000 {
|
||||||
return String(format: "%.1fM", Double(count) / 1_000_000)
|
return String(format: "%.1fM", Double(count) / 1_000_000)
|
||||||
@@ -349,4 +674,52 @@ struct MonitorView: View {
|
|||||||
}
|
}
|
||||||
return "\(count)"
|
return "\(count)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Formats a byte count as a short string using binary (1024-based) units.
///
/// Output forms are `"<n> B"`, `"<n> KB"` (no decimals), `"<n.n> MB"` and
/// `"<n.nn> GB"`.
///
/// - Parameter count: Number of bytes.
/// - Returns: The formatted string.
private func formatByteCount(_ count: Int) -> String {
    let bytes = Double(count)
    let kilobytes = bytes / 1024
    let megabytes = kilobytes / 1024

    // Thresholds sit where the smaller unit would *round* to 1024, so a value
    // just under a unit boundary is promoted instead of printing as
    // "1024 KB" or "1024.0 MB".
    if megabytes >= 1023.95 {
        return String(format: "%.2f GB", megabytes / 1024)
    }
    if kilobytes >= 1023.5 {
        return String(format: "%.1f MB", megabytes)
    }
    if bytes >= 1024 {
        return String(format: "%.0f KB", kilobytes)
    }
    return "\(count) B"
}
|
||||||
|
|
||||||
|
/// Describes how long ago `date` was, relative to the current time, using
/// the largest whole unit among seconds, minutes and hours.
///
/// Dates in the future are treated as zero seconds ago.
///
/// - Parameter date: The moment to describe.
/// - Returns: `"<n>s"` under a minute, `"<n>m"` under an hour, otherwise `"<n>h"`.
private func relativeTimeString(_ date: Date) -> String {
    let elapsed = max(0, Int(Date.now.timeIntervalSince(date)))
    switch elapsed {
    case ..<60:
        return "\(elapsed)s"
    case ..<3_600:
        return "\(elapsed / 60)m"
    default:
        return "\(elapsed / 3_600)h"
    }
}
|
||||||
|
|
||||||
|
/// Maps a session phase to the colour used for its indicators.
///
/// - Parameter phase: The phase to look up.
/// - Returns: `.secondary` for idle, `.purple` for session build, `.blue`
///   for prefilling and `.green` for generating.
private func color(for phase: APISessionPhase) -> Color {
    // Exhaustive on purpose: a new phase should fail to compile here.
    switch phase {
    case .idle: .secondary
    case .sessionBuild: .purple
    case .prefilling: .blue
    case .generating: .green
    }
}
|
||||||
|
|
||||||
|
/// Fraction of cache lookups that were hits, from `stats.totalCacheHits`
/// and `stats.totalCacheMisses`; `0` when neither has been recorded.
private var cacheHitRate: Double {
    let hits = stats.totalCacheHits
    let lookups = hits + stats.totalCacheMisses
    // Avoid dividing by zero before any lookup has happened.
    return lookups > 0 ? Double(hits) / Double(lookups) : 0
}
|
||||||
|
|
||||||
|
/// Expresses `tokens` as a fraction of the largest known context size,
/// capped at `1`.
///
/// The context size is the larger of `stats.contextMax` and the current
/// model's `contextLength` (treated as `0` when no model is set).
///
/// - Parameter tokens: Token count to compare against the context size.
/// - Returns: `0` when no positive context size is known; otherwise
///   `tokens / contextSize`, never more than `1`.
private func maxContextRatio(for tokens: Int) -> Double {
    let modelLimit = modelManager.currentModel?.contextLength ?? 0
    let contextSize = max(stats.contextMax, modelLimit)
    if contextSize <= 0 {
        return 0
    }
    let fraction = Double(tokens) / Double(contextSize)
    return min(1, fraction)
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user