feat: finally nailed down phases 1-4
This commit is contained in:
@@ -29,14 +29,17 @@
|
||||
621B7E4382199AC1378F5F9C /* StatusBarView.swift in Sources */ = {isa = PBXBuildFile; fileRef = B0EAB35D7130D56B9E7484BA /* StatusBarView.swift */; };
|
||||
67262C5E24739F1FE0011439 /* StreamingSSEEncoder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 615F8A7C9ABCADEB215D31BD /* StreamingSSEEncoder.swift */; };
|
||||
67B815DC3304BF4B2E9974A8 /* LiveCountersTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7E7DF9F68C10C718844B7B01 /* LiveCountersTests.swift */; };
|
||||
67D0628F148FE3C2200E0AEF /* APIServerResponseResolutionTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 051FEC14CC76A677F79ACD21 /* APIServerResponseResolutionTests.swift */; };
|
||||
6828CCA8B78AB40906F87CAB /* LocalModelResolver.swift in Sources */ = {isa = PBXBuildFile; fileRef = D733A0D1D4AC25DDDA6C8684 /* LocalModelResolver.swift */; };
|
||||
741692862DB1F13EA0B2D14D /* TokenPrefixCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1962D530BEABCC7F1E8E0ED1 /* TokenPrefixCache.swift */; };
|
||||
7CD765C1E2F9F4D7504C8D09 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B629DA084A9A40E54F8EA5FA /* Assets.xcassets */; };
|
||||
80646C5066BF79BC76E1D9D7 /* ModelConfig.swift in Sources */ = {isa = PBXBuildFile; fileRef = 38DFC212AF4359A45FBE22BA /* ModelConfig.swift */; };
|
||||
834B49AA3E30A1FED549D057 /* ToolCallParserTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = B89226C9ED585A5296C54441 /* ToolCallParserTests.swift */; };
|
||||
84D32315B418B5243E017350 /* ToolPromptBuilder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16AE82A64D1D07AE3CD8D33A /* ToolPromptBuilder.swift */; };
|
||||
85FB1EB49D76A9F21E181346 /* ChatScene.swift in Sources */ = {isa = PBXBuildFile; fileRef = C04EE8E6418EC6E9B66999B0 /* ChatScene.swift */; };
|
||||
8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */; };
|
||||
945474365D0B3E961811909A /* MLXVLM in Frameworks */ = {isa = PBXBuildFile; productRef = D5E8E1C2DD8D8AABB4306193 /* MLXVLM */; };
|
||||
95A612524552AF5CC3B1AE62 /* ChatViewModelTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = B758F596F4F3E68793B045BB /* ChatViewModelTests.swift */; };
|
||||
962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */; };
|
||||
A146BBA70CFBEC505BDCDF0D /* ImageDecoder.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7C1A89C076E717F87A60397D /* ImageDecoder.swift */; };
|
||||
AA17474A72C7F4EFBD5C4925 /* PromptBuilder.swift in Sources */ = {isa = PBXBuildFile; fileRef = E1E62624B6F285479CB33041 /* PromptBuilder.swift */; };
|
||||
@@ -73,6 +76,7 @@
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
02EBDE0C72D1C5CE220E5B93 /* InferenceEngine.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InferenceEngine.swift; sourceTree = "<group>"; };
|
||||
051FEC14CC76A677F79ACD21 /* APIServerResponseResolutionTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = APIServerResponseResolutionTests.swift; sourceTree = "<group>"; };
|
||||
0F03A123A8908714A89315FE /* SceneCommands.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SceneCommands.swift; sourceTree = "<group>"; };
|
||||
145B888FBDD4F931512C5473 /* Preferences.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Preferences.swift; sourceTree = "<group>"; };
|
||||
1607BDDE53C575627DCC6896 /* ChatDocumentManifest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatDocumentManifest.swift; sourceTree = "<group>"; };
|
||||
@@ -103,6 +107,8 @@
|
||||
B0EAB35D7130D56B9E7484BA /* StatusBarView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StatusBarView.swift; sourceTree = "<group>"; };
|
||||
B5B5ABDEB6F5C54856EB1A9E /* SceneSelectionView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SceneSelectionView.swift; sourceTree = "<group>"; };
|
||||
B629DA084A9A40E54F8EA5FA /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
|
||||
B758F596F4F3E68793B045BB /* ChatViewModelTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatViewModelTests.swift; sourceTree = "<group>"; };
|
||||
B89226C9ED585A5296C54441 /* ToolCallParserTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ToolCallParserTests.swift; sourceTree = "<group>"; };
|
||||
B8BD93859F0291F1A3E09DA5 /* ChatViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatViewModel.swift; sourceTree = "<group>"; };
|
||||
BA1592FD260014C4FBDB6995 /* SceneManagementWindow.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SceneManagementWindow.swift; sourceTree = "<group>"; };
|
||||
C04EE8E6418EC6E9B66999B0 /* ChatScene.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatScene.swift; sourceTree = "<group>"; };
|
||||
@@ -174,14 +180,17 @@
|
||||
154AF0C071A7DC02EB5F6F49 /* Server */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
051FEC14CC76A677F79ACD21 /* APIServerResponseResolutionTests.swift */,
|
||||
E43535D68448F1752D91C3A9 /* APIServerRewriteTests.swift */,
|
||||
FEFF6168B2283FEC87B4BB8C /* CancellationTokenTests.swift */,
|
||||
B758F596F4F3E68793B045BB /* ChatViewModelTests.swift */,
|
||||
E4573DC9314915F4C7963B4E /* ImageDecoderTests.swift */,
|
||||
7E7DF9F68C10C718844B7B01 /* LiveCountersTests.swift */,
|
||||
D388BE00B42C06ED9D9905BF /* ModelBackedInferenceValidationTests.swift */,
|
||||
5F9426FA5A4AC55F8D9C080E /* PromptBuilderTests.swift */,
|
||||
49C383DD5224F3420EB98DB2 /* StreamingSSEEncoderTests.swift */,
|
||||
64B2EDD5D1881AC9E1E60913 /* TokenPrefixCacheTests.swift */,
|
||||
B89226C9ED585A5296C54441 /* ToolCallParserTests.swift */,
|
||||
);
|
||||
path = Server;
|
||||
sourceTree = "<group>";
|
||||
@@ -382,14 +391,17 @@
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
67D0628F148FE3C2200E0AEF /* APIServerResponseResolutionTests.swift in Sources */,
|
||||
CBC9DB0799C4ADF2DC9319DA /* APIServerRewriteTests.swift in Sources */,
|
||||
962083CCCC4AC848E0BBBC99 /* CancellationTokenTests.swift in Sources */,
|
||||
95A612524552AF5CC3B1AE62 /* ChatViewModelTests.swift in Sources */,
|
||||
E92B6656C251EDA246B8F582 /* ImageDecoderTests.swift in Sources */,
|
||||
67B815DC3304BF4B2E9974A8 /* LiveCountersTests.swift in Sources */,
|
||||
8E665E21CCCD87A907CEA78D /* ModelBackedInferenceValidationTests.swift in Sources */,
|
||||
1FE8C624898960ECCE39C0D4 /* PromptBuilderTests.swift in Sources */,
|
||||
FE4405F66873C75CD6FA19A5 /* StreamingSSEEncoderTests.swift in Sources */,
|
||||
221DEC86374902FCFD661A01 /* TokenPrefixCacheTests.swift in Sources */,
|
||||
834B49AA3E30A1FED549D057 /* ToolCallParserTests.swift in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
|
||||
@@ -728,7 +728,7 @@ final class APIServer {
|
||||
return text.isEmpty ? nil : text
|
||||
}
|
||||
|
||||
private static func resolveAssistantResponse(
|
||||
static func resolveAssistantResponse(
|
||||
fullText: String,
|
||||
frameworkToolCalls: [MLXLMCommon.ToolCall],
|
||||
tools: [APIToolDefinition]?
|
||||
|
||||
@@ -459,9 +459,17 @@ final class TokenPrefixCache: @unchecked Sendable {
|
||||
|
||||
private static func computeMemoryBudget() -> Int {
|
||||
guard let device = MTLCreateSystemDefaultDevice() else {
|
||||
return computeMemoryBudget(recommendedWorkingSetSize: nil)
|
||||
}
|
||||
return computeMemoryBudget(recommendedWorkingSetSize: Int(device.recommendedMaxWorkingSetSize))
|
||||
}
|
||||
|
||||
static func computeMemoryBudget(recommendedWorkingSetSize: Int?) -> Int {
|
||||
guard let recommendedWorkingSetSize else {
|
||||
return 512 * 1024 * 1024
|
||||
}
|
||||
let budget = Int(Double(device.recommendedMaxWorkingSetSize) * 0.20)
|
||||
|
||||
let budget = Int(Double(recommendedWorkingSetSize) * 0.20)
|
||||
return max(256 * 1024 * 1024, min(budget, 8 * 1024 * 1024 * 1024))
|
||||
}
|
||||
|
||||
|
||||
44
MLXServerTests/Server/APIServerResponseResolutionTests.swift
Normal file
44
MLXServerTests/Server/APIServerResponseResolutionTests.swift
Normal file
@@ -0,0 +1,44 @@
|
||||
import MLXLMCommon
|
||||
import XCTest
|
||||
@testable import MLX_Server
|
||||
|
||||
final class APIServerResponseResolutionTests: XCTestCase {
|
||||
@MainActor
|
||||
func testResolveAssistantResponseUsesFrameworkToolCalls() throws {
|
||||
let frameworkToolCalls = [
|
||||
ToolCall(function: ToolCall.Function(name: "weather", arguments: ["city": "Berlin"]))
|
||||
]
|
||||
|
||||
let resolved = APIServer.resolveAssistantResponse(
|
||||
fullText: "I will call the tool.",
|
||||
frameworkToolCalls: frameworkToolCalls,
|
||||
tools: [mockWeatherTool]
|
||||
)
|
||||
|
||||
XCTAssertEqual(resolved.finishReason, "tool_calls")
|
||||
XCTAssertEqual(resolved.content, "I will call the tool.")
|
||||
let toolCall = try XCTUnwrap(resolved.toolCalls?.first)
|
||||
XCTAssertEqual(toolCall.function.name, "weather")
|
||||
XCTAssertEqual(toolCall.function.arguments, #"{"city":"Berlin"}"#)
|
||||
}
|
||||
|
||||
private var mockWeatherTool: APIToolDefinition {
|
||||
APIToolDefinition(
|
||||
type: "function",
|
||||
function: APIFunctionDefinition(
|
||||
name: "weather",
|
||||
description: "Look up weather for a city.",
|
||||
parameters: [
|
||||
"type": AnyCodable("object"),
|
||||
"properties": AnyCodable([
|
||||
"city": [
|
||||
"type": "string",
|
||||
"description": "City name"
|
||||
]
|
||||
]),
|
||||
"required": AnyCodable(["city"])
|
||||
]
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -3,6 +3,61 @@ import XCTest
|
||||
@testable import MLX_Server
|
||||
|
||||
final class APIServerRewriteTests: XCTestCase {
|
||||
func testQwenNonStreamingChatCompletionCachesAndReusesPrompt() async throws {
|
||||
let harness = try await makeHarness(initialModelId: "qwen")
|
||||
defer { harness.stop() }
|
||||
|
||||
let lookups = LookupEventCollector()
|
||||
APIServer.debugLookupEventHandler = { event in
|
||||
Task {
|
||||
await lookups.record(event)
|
||||
}
|
||||
}
|
||||
defer {
|
||||
APIServer.debugLookupEventHandler = nil
|
||||
}
|
||||
|
||||
let request = APIChatCompletionRequest(
|
||||
model: "qwen",
|
||||
messages: [
|
||||
APIChatMessage(role: "user", content: .text("Reply with exactly one short word."), name: nil, tool_calls: nil, tool_call_id: nil)
|
||||
],
|
||||
temperature: 0,
|
||||
top_p: 1,
|
||||
max_tokens: 1,
|
||||
stream: false,
|
||||
stop: nil,
|
||||
tools: nil,
|
||||
tool_choice: nil,
|
||||
frequency_penalty: nil,
|
||||
presence_penalty: nil,
|
||||
n: nil
|
||||
)
|
||||
|
||||
let firstResponse = try await sendChatCompletion(request, port: harness.port)
|
||||
XCTAssertEqual(firstResponse.choices.count, 1)
|
||||
|
||||
try await waitUntil(timeoutSeconds: 5) {
|
||||
let snapshot = TokenPrefixCache.shared.snapshot()
|
||||
return snapshot.totalEntries > 0 && snapshot.entries.allSatisfy { $0.modelId == "qwen" }
|
||||
}
|
||||
|
||||
let firstSnapshot = TokenPrefixCache.shared.snapshot()
|
||||
_ = try await sendChatCompletion(request, port: harness.port)
|
||||
|
||||
try await waitUntil(timeoutSeconds: 5) {
|
||||
let events = await lookups.events()
|
||||
return events.count >= 2 && TokenPrefixCache.shared.snapshot().totalHits > firstSnapshot.totalHits
|
||||
}
|
||||
|
||||
let secondSnapshot = TokenPrefixCache.shared.snapshot()
|
||||
let events = await lookups.events()
|
||||
let secondLookup = try XCTUnwrap(events.last)
|
||||
XCTAssertGreaterThan(secondSnapshot.totalHits, firstSnapshot.totalHits)
|
||||
XCTAssertTrue(secondLookup.isHit)
|
||||
XCTAssertGreaterThan(secondLookup.matchedTokenCount, 0)
|
||||
}
|
||||
|
||||
func testHealthAndModelsEndpointsReturnExpectedPayloads() async throws {
|
||||
let harness = try await makeHarness()
|
||||
defer { harness.stop() }
|
||||
@@ -69,6 +124,16 @@ final class APIServerRewriteTests: XCTestCase {
|
||||
let harness = try await makeHarness()
|
||||
defer { harness.stop() }
|
||||
|
||||
let lookups = LookupEventCollector()
|
||||
APIServer.debugLookupEventHandler = { event in
|
||||
Task {
|
||||
await lookups.record(event)
|
||||
}
|
||||
}
|
||||
defer {
|
||||
APIServer.debugLookupEventHandler = nil
|
||||
}
|
||||
|
||||
let request = APIChatCompletionRequest(
|
||||
model: "gemma",
|
||||
messages: [
|
||||
@@ -89,10 +154,15 @@ final class APIServerRewriteTests: XCTestCase {
|
||||
_ = try await sendChatCompletion(request, port: harness.port)
|
||||
_ = try await sendChatCompletion(request, port: harness.port)
|
||||
|
||||
let live = LiveCounters.shared.snapshot()
|
||||
XCTAssertGreaterThan(live.currentCacheMatchedPromptTokens, 0)
|
||||
XCTAssertEqual(live.currentCacheMatchedPromptTokens, live.promptTokens)
|
||||
XCTAssertEqual(live.currentCacheRebuiltPromptTokens, 0)
|
||||
try await waitUntil(timeoutSeconds: 5) {
|
||||
let events = await lookups.events()
|
||||
return events.count >= 2
|
||||
}
|
||||
|
||||
let events = await lookups.events()
|
||||
let secondLookup = try XCTUnwrap(events.last)
|
||||
XCTAssertTrue(secondLookup.isHit)
|
||||
XCTAssertEqual(secondLookup.matchedTokenCount, secondLookup.promptTokenCount)
|
||||
}
|
||||
|
||||
func testSingleTurnContinuationProducesPartialCacheHit() async throws {
|
||||
@@ -365,6 +435,91 @@ final class APIServerRewriteTests: XCTestCase {
|
||||
XCTAssertEqual(live.currentCacheMatchedPromptTokens, 0)
|
||||
}
|
||||
|
||||
func testRequestModelFieldSwapsFromGemmaToQwenAndInvalidatesGemmaCache() async throws {
|
||||
let harness = try await makeHarness(initialModelId: "gemma")
|
||||
defer { harness.stop() }
|
||||
|
||||
let lookups = LookupEventCollector()
|
||||
APIServer.debugLookupEventHandler = { event in
|
||||
Task {
|
||||
await lookups.record(event)
|
||||
}
|
||||
}
|
||||
defer {
|
||||
APIServer.debugLookupEventHandler = nil
|
||||
}
|
||||
|
||||
let gemmaRequest = APIChatCompletionRequest(
|
||||
model: "gemma",
|
||||
messages: [
|
||||
APIChatMessage(role: "user", content: .text("Answer with one word: river."), name: nil, tool_calls: nil, tool_call_id: nil)
|
||||
],
|
||||
temperature: 0,
|
||||
top_p: 1,
|
||||
max_tokens: 2,
|
||||
stream: false,
|
||||
stop: nil,
|
||||
tools: nil,
|
||||
tool_choice: nil,
|
||||
frequency_penalty: nil,
|
||||
presence_penalty: nil,
|
||||
n: nil
|
||||
)
|
||||
|
||||
_ = try await sendChatCompletion(gemmaRequest, port: harness.port)
|
||||
try await waitUntil(timeoutSeconds: 5) {
|
||||
TokenPrefixCache.shared.snapshot().entries.contains(where: { $0.modelId == "gemma" })
|
||||
}
|
||||
|
||||
let qwenRequest = APIChatCompletionRequest(
|
||||
model: "qwen",
|
||||
messages: [
|
||||
APIChatMessage(role: "user", content: .text("Answer with one word: river."), name: nil, tool_calls: nil, tool_call_id: nil)
|
||||
],
|
||||
temperature: 0,
|
||||
top_p: 1,
|
||||
max_tokens: 2,
|
||||
stream: false,
|
||||
stop: nil,
|
||||
tools: nil,
|
||||
tool_choice: nil,
|
||||
frequency_penalty: nil,
|
||||
presence_penalty: nil,
|
||||
n: nil
|
||||
)
|
||||
|
||||
_ = try await sendChatCompletion(qwenRequest, port: harness.port)
|
||||
|
||||
try await waitUntil(timeoutSeconds: 5) {
|
||||
let snapshot = TokenPrefixCache.shared.snapshot()
|
||||
let modelId = await MainActor.run { harness.modelManager.currentModel?.id }
|
||||
return modelId == "qwen"
|
||||
&& !snapshot.entries.isEmpty
|
||||
&& snapshot.entries.allSatisfy { $0.modelId == "qwen" }
|
||||
}
|
||||
|
||||
let afterSwapSnapshot = TokenPrefixCache.shared.snapshot()
|
||||
let afterSwapEvents = await lookups.events()
|
||||
let firstQwenLookup = try XCTUnwrap(afterSwapEvents.last)
|
||||
XCTAssertTrue(afterSwapSnapshot.entries.allSatisfy { $0.modelId == "qwen" })
|
||||
XCTAssertFalse(firstQwenLookup.isHit)
|
||||
XCTAssertEqual(firstQwenLookup.matchedTokenCount, 0)
|
||||
|
||||
_ = try await sendChatCompletion(qwenRequest, port: harness.port)
|
||||
|
||||
try await waitUntil(timeoutSeconds: 5) {
|
||||
let events = await lookups.events()
|
||||
return events.count >= 3 && TokenPrefixCache.shared.snapshot().totalHits > afterSwapSnapshot.totalHits
|
||||
}
|
||||
|
||||
let finalSnapshot = TokenPrefixCache.shared.snapshot()
|
||||
let finalEvents = await lookups.events()
|
||||
let secondQwenLookup = try XCTUnwrap(finalEvents.last)
|
||||
XCTAssertGreaterThan(finalSnapshot.totalHits, afterSwapSnapshot.totalHits)
|
||||
XCTAssertTrue(secondQwenLookup.isHit)
|
||||
XCTAssertGreaterThan(secondQwenLookup.matchedTokenCount, 0)
|
||||
}
|
||||
|
||||
func testStreamingChatCompletionReusesCacheAcrossThreeProgressivelyLongerTurns() async throws {
|
||||
let harness = try await makeHarness()
|
||||
defer { harness.stop() }
|
||||
@@ -775,6 +930,130 @@ final class APIServerRewriteTests: XCTestCase {
|
||||
XCTAssertGreaterThan(finalLiveSnapshot.totalCacheReusePromptTokens, afterDisconnectLiveSnapshot.totalCacheReusePromptTokens)
|
||||
}
|
||||
|
||||
func testStreamingDisconnectStopsServerWorkWithinTwoHundredMilliseconds() async throws {
|
||||
let harness = try await makeHarness()
|
||||
defer { harness.stop() }
|
||||
|
||||
let request = APIChatCompletionRequest(
|
||||
model: "gemma",
|
||||
messages: [
|
||||
APIChatMessage(role: "user", content: .text("Count from one to fifty with commas, using many tokens."), name: nil, tool_calls: nil, tool_call_id: nil)
|
||||
],
|
||||
temperature: 0,
|
||||
top_p: 1,
|
||||
max_tokens: 128,
|
||||
stream: true,
|
||||
stop: nil,
|
||||
tools: nil,
|
||||
tool_choice: nil,
|
||||
frequency_penalty: nil,
|
||||
presence_penalty: nil,
|
||||
n: nil
|
||||
)
|
||||
|
||||
let url = URL(string: "http://127.0.0.1:\(harness.port)/v1/chat/completions")!
|
||||
var urlRequest = URLRequest(url: url)
|
||||
urlRequest.httpMethod = "POST"
|
||||
urlRequest.setValue("application/json", forHTTPHeaderField: "Content-Type")
|
||||
urlRequest.httpBody = try JSONEncoder().encode(request)
|
||||
|
||||
let observer = StreamCancellationObserver()
|
||||
let session = URLSession(configuration: .ephemeral)
|
||||
let baselineDisconnects = LiveCounters.shared.snapshot().totalDisconnects
|
||||
let task = Task {
|
||||
let (bytes, response) = try await session.bytes(for: urlRequest)
|
||||
let httpResponse = try XCTUnwrap(response as? HTTPURLResponse)
|
||||
XCTAssertEqual(httpResponse.statusCode, 200)
|
||||
|
||||
for try await line in bytes.lines {
|
||||
guard line.hasPrefix("data: ") else { continue }
|
||||
let payload = String(line.dropFirst(6))
|
||||
if payload == "[DONE]" {
|
||||
break
|
||||
}
|
||||
guard let data = payload.data(using: .utf8) else { continue }
|
||||
let chunk = try JSONDecoder().decode(APIChatCompletionChunk.self, from: data)
|
||||
if let deltaContent = chunk.choices.first?.delta.content, !deltaContent.isEmpty {
|
||||
await observer.markFirstContentSeen()
|
||||
try await Task.sleep(nanoseconds: 30_000_000_000)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try await waitUntil(timeoutSeconds: 10) {
|
||||
await observer.hasSeenFirstContent
|
||||
}
|
||||
|
||||
let disconnectStartedAt = Date()
|
||||
session.invalidateAndCancel()
|
||||
task.cancel()
|
||||
|
||||
try await waitUntil(timeoutSeconds: 5, intervalNanoseconds: 10_000_000) {
|
||||
let snapshot = LiveCounters.shared.snapshot()
|
||||
return snapshot.totalDisconnects > baselineDisconnects && snapshot.activeRequests == 0
|
||||
}
|
||||
|
||||
_ = try? await task.value
|
||||
let elapsed = Date().timeIntervalSince(disconnectStartedAt)
|
||||
XCTAssertLessThan(elapsed, 0.2)
|
||||
}
|
||||
|
||||
func testRepeatedStreamingDisconnectsDoNotBreakSubsequentGeneration() async throws {
|
||||
let harness = try await makeHarness()
|
||||
defer { harness.stop() }
|
||||
|
||||
let request = APIChatCompletionRequest(
|
||||
model: "gemma",
|
||||
messages: [
|
||||
APIChatMessage(role: "user", content: .text("Count from one to forty with commas, using many tokens."), name: nil, tool_calls: nil, tool_call_id: nil)
|
||||
],
|
||||
temperature: 0,
|
||||
top_p: 1,
|
||||
max_tokens: 96,
|
||||
stream: true,
|
||||
stop: nil,
|
||||
tools: nil,
|
||||
tool_choice: nil,
|
||||
frequency_penalty: nil,
|
||||
presence_penalty: nil,
|
||||
n: nil
|
||||
)
|
||||
|
||||
for expectedDisconnectCount in 1...3 {
|
||||
try await cancelStreamingChatCompletionAfterFirstContentAndWaitForServerDisconnect(
|
||||
request,
|
||||
port: harness.port,
|
||||
expectedDisconnectCount: expectedDisconnectCount
|
||||
)
|
||||
|
||||
let liveSnapshot = LiveCounters.shared.snapshot()
|
||||
XCTAssertEqual(liveSnapshot.totalDisconnects, expectedDisconnectCount)
|
||||
XCTAssertEqual(liveSnapshot.activeRequests, 0)
|
||||
}
|
||||
|
||||
let recoveryRequest = APIChatCompletionRequest(
|
||||
model: "gemma",
|
||||
messages: [
|
||||
APIChatMessage(role: "user", content: .text("Reply with exactly one short word."), name: nil, tool_calls: nil, tool_call_id: nil)
|
||||
],
|
||||
temperature: 0,
|
||||
top_p: 1,
|
||||
max_tokens: 2,
|
||||
stream: false,
|
||||
stop: nil,
|
||||
tools: nil,
|
||||
tool_choice: nil,
|
||||
frequency_penalty: nil,
|
||||
presence_penalty: nil,
|
||||
n: nil
|
||||
)
|
||||
|
||||
let response = try await sendChatCompletion(recoveryRequest, port: harness.port)
|
||||
XCTAssertEqual(response.choices.count, 1)
|
||||
XCTAssertEqual(response.choices[0].message.role, "assistant")
|
||||
XCTAssertFalse((response.choices[0].message.content ?? "").trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
|
||||
}
|
||||
|
||||
func testStreamingToolCallChunksArriveInOpenAICompatibleOrder() async throws {
|
||||
let harness = try await makeHarness()
|
||||
defer { harness.stop() }
|
||||
@@ -846,9 +1125,9 @@ final class APIServerRewriteTests: XCTestCase {
|
||||
)
|
||||
}
|
||||
|
||||
private func makeHarness() async throws -> TestHarness {
|
||||
private func makeHarness(initialModelId: String = "gemma") async throws -> TestHarness {
|
||||
let modelManager = await MainActor.run { ModelManager() }
|
||||
let config = try XCTUnwrap(ModelConfig.resolve("gemma"))
|
||||
let config = try XCTUnwrap(ModelConfig.resolve(initialModelId))
|
||||
|
||||
LiveCounters.shared.reset()
|
||||
TokenPrefixCache.shared.reset()
|
||||
@@ -994,6 +1273,19 @@ final class APIServerRewriteTests: XCTestCase {
|
||||
_ = try? await task.value
|
||||
}
|
||||
|
||||
private func cancelStreamingChatCompletionAfterFirstContentAndWaitForServerDisconnect(
|
||||
_ request: APIChatCompletionRequest,
|
||||
port: UInt16,
|
||||
expectedDisconnectCount: Int
|
||||
) async throws {
|
||||
try await cancelStreamingChatCompletionAfterFirstContent(request, port: port)
|
||||
|
||||
try await waitUntil(timeoutSeconds: 5, intervalNanoseconds: 10_000_000) {
|
||||
let snapshot = LiveCounters.shared.snapshot()
|
||||
return snapshot.totalDisconnects >= expectedDisconnectCount && snapshot.activeRequests == 0
|
||||
}
|
||||
}
|
||||
|
||||
private func waitUntil(
|
||||
timeoutSeconds: TimeInterval,
|
||||
intervalNanoseconds: UInt64 = 100_000_000,
|
||||
|
||||
46
MLXServerTests/Server/ChatViewModelTests.swift
Normal file
46
MLXServerTests/Server/ChatViewModelTests.swift
Normal file
@@ -0,0 +1,46 @@
|
||||
import XCTest
|
||||
@testable import MLX_Server
|
||||
|
||||
@MainActor
|
||||
final class ChatViewModelTests: XCTestCase {
|
||||
func testGemmaChatViewModelSendProducesAssistantReply() async throws {
|
||||
let modelManager = ModelManager()
|
||||
let config = try XCTUnwrap(ModelConfig.resolve("gemma"))
|
||||
await modelManager.loadModel(config)
|
||||
defer { modelManager.unloadModel() }
|
||||
|
||||
XCTAssertTrue(modelManager.isReady)
|
||||
|
||||
let viewModel = ChatViewModel(modelManager: modelManager)
|
||||
viewModel.inputText = "Say hello in one word."
|
||||
viewModel.send()
|
||||
|
||||
XCTAssertTrue(viewModel.isGenerating)
|
||||
|
||||
try await waitUntil(timeoutSeconds: 15) {
|
||||
!viewModel.isGenerating
|
||||
}
|
||||
|
||||
XCTAssertEqual(viewModel.conversation.messages.count, 2)
|
||||
XCTAssertEqual(viewModel.conversation.messages[0].role, .user)
|
||||
XCTAssertEqual(viewModel.conversation.messages[0].content, "Say hello in one word.")
|
||||
XCTAssertEqual(viewModel.conversation.messages[1].role, .assistant)
|
||||
XCTAssertFalse(viewModel.conversation.messages[1].sessionContent.isEmpty)
|
||||
XCTAssertGreaterThan(viewModel.promptTokens, 0)
|
||||
}
|
||||
|
||||
private func waitUntil(
|
||||
timeoutSeconds: TimeInterval,
|
||||
intervalNanoseconds: UInt64 = 100_000_000,
|
||||
condition: @escaping @MainActor () -> Bool
|
||||
) async throws {
|
||||
let deadline = Date().addingTimeInterval(timeoutSeconds)
|
||||
while Date() < deadline {
|
||||
if condition() {
|
||||
return
|
||||
}
|
||||
try await Task.sleep(nanoseconds: intervalNanoseconds)
|
||||
}
|
||||
XCTFail("Condition not met before timeout")
|
||||
}
|
||||
}
|
||||
@@ -209,4 +209,28 @@ final class TokenPrefixCacheTests: XCTestCase {
|
||||
XCTAssertEqual(snapshot.supersequenceHits, 0)
|
||||
XCTAssertEqual(snapshot.lcpHits, 0)
|
||||
}
|
||||
|
||||
func testComputeMemoryBudgetUsesFallbackWhenDeviceUnavailable() {
|
||||
let budget = TokenPrefixCache.computeMemoryBudget(recommendedWorkingSetSize: nil)
|
||||
|
||||
XCTAssertEqual(budget, 512 * 1024 * 1024)
|
||||
}
|
||||
|
||||
func testComputeMemoryBudgetClampsToMinimumFloor() {
|
||||
let budget = TokenPrefixCache.computeMemoryBudget(recommendedWorkingSetSize: 512 * 1024 * 1024)
|
||||
|
||||
XCTAssertEqual(budget, 256 * 1024 * 1024)
|
||||
}
|
||||
|
||||
func testComputeMemoryBudgetUsesTwentyPercentOfWorkingSet() {
|
||||
let budget = TokenPrefixCache.computeMemoryBudget(recommendedWorkingSetSize: 8 * 1024 * 1024 * 1024)
|
||||
|
||||
XCTAssertEqual(budget, Int(Double(8 * 1024 * 1024 * 1024) * 0.20))
|
||||
}
|
||||
|
||||
func testComputeMemoryBudgetClampsToMaximumCap() {
|
||||
let budget = TokenPrefixCache.computeMemoryBudget(recommendedWorkingSetSize: 80 * 1024 * 1024 * 1024)
|
||||
|
||||
XCTAssertEqual(budget, 8 * 1024 * 1024 * 1024)
|
||||
}
|
||||
}
|
||||
47
MLXServerTests/Server/ToolCallParserTests.swift
Normal file
47
MLXServerTests/Server/ToolCallParserTests.swift
Normal file
@@ -0,0 +1,47 @@
|
||||
import XCTest
|
||||
@testable import MLX_Server
|
||||
|
||||
final class ToolCallParserTests: XCTestCase {
|
||||
func testParseGemmaToolCodeBlockExtractsToolCallAndStripsFence() throws {
|
||||
let tools = [mockWeatherTool]
|
||||
let text = "Before\n```tool_code\nweather(city=\"Berlin\")\n```\nAfter"
|
||||
|
||||
let parsed = ToolCallParser.parse(text: text, tools: tools)
|
||||
|
||||
XCTAssertEqual(parsed.0, "Before\n\nAfter")
|
||||
let toolCall = try XCTUnwrap(parsed.1.first)
|
||||
XCTAssertEqual(toolCall.name, "weather")
|
||||
XCTAssertEqual(toolCall.arguments, #"{"city":"Berlin"}"#)
|
||||
}
|
||||
|
||||
func testParseQwenToolCallTagExtractsJSONPayloadAndStripsTag() throws {
|
||||
let text = "<tool_call>{\"name\":\"weather\",\"arguments\":{\"city\":\"Paris\"}}</tool_call>"
|
||||
|
||||
let parsed = ToolCallParser.parse(text: text, tools: [mockWeatherTool])
|
||||
|
||||
XCTAssertEqual(parsed.0, "")
|
||||
let toolCall = try XCTUnwrap(parsed.1.first)
|
||||
XCTAssertEqual(toolCall.name, "weather")
|
||||
XCTAssertEqual(toolCall.arguments, #"{"city":"Paris"}"#)
|
||||
}
|
||||
|
||||
private var mockWeatherTool: APIToolDefinition {
|
||||
APIToolDefinition(
|
||||
type: "function",
|
||||
function: APIFunctionDefinition(
|
||||
name: "weather",
|
||||
description: "Look up weather for a city.",
|
||||
parameters: [
|
||||
"type": AnyCodable("object"),
|
||||
"properties": AnyCodable([
|
||||
"city": [
|
||||
"type": "string",
|
||||
"description": "City name"
|
||||
]
|
||||
]),
|
||||
"required": AnyCodable(["city"])
|
||||
]
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -2614,12 +2614,12 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
|
||||
- [x] Conversation continuation (add 2+ messages, e.g. tool-use flow) → partial cache hit (not a miss!)
|
||||
- [x] Same system prompt, different user message → system prompt prefix cached and reused
|
||||
- [x] Different system prompt → no false cache hit
|
||||
- [ ] Model swap → cache invalidated, fresh generation works
|
||||
- [x] Model swap → cache invalidated, fresh generation works
|
||||
- [x] Idle unload + reload → cache invalidated, fresh generation works
|
||||
|
||||
### Memory Management
|
||||
|
||||
- [ ] Memory budget computed correctly from Metal device
|
||||
- [x] Memory budget computed correctly from Metal device
|
||||
- [x] Entries evicted under memory pressure (oldest first)
|
||||
- [x] Expired entries pruned after 30 min idle
|
||||
- [x] Trie nodes cleaned up when entries are evicted (no memory leak)
|
||||
@@ -2627,9 +2627,9 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
|
||||
|
||||
### Disconnect Handling
|
||||
|
||||
- [ ] Client disconnects mid-stream → generation stops within ~200ms
|
||||
- [x] Client disconnects mid-stream → generation stops within ~200ms
|
||||
- [x] Partial KV cache from disconnected request is still stored for reuse
|
||||
- [ ] No Metal assertion failures on disconnect
|
||||
- [x] No Metal assertion failures on disconnect
|
||||
|
||||
### Streaming
|
||||
|
||||
@@ -2642,9 +2642,9 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
|
||||
|
||||
### Tool Use
|
||||
|
||||
- [ ] Gemma tool_code blocks parsed correctly
|
||||
- [ ] Qwen `<tool_call>` tags parsed correctly
|
||||
- [ ] Framework `ToolCall` events handled correctly
|
||||
- [x] Gemma tool_code blocks parsed correctly
|
||||
- [x] Qwen `<tool_call>` tags parsed correctly
|
||||
- [x] Framework `ToolCall` events handled correctly
|
||||
- [x] Tool results round-trip correctly (user sends tool result → model sees it in context)
|
||||
- [x] finish_reason is "tool_calls" when tools are invoked
|
||||
|
||||
@@ -2700,9 +2700,9 @@ Validation note: `InferenceStats.swift` now samples `TokenPrefixCache` directly
|
||||
|
||||
### Compatibility
|
||||
|
||||
- [ ] `GET /health` → `{"status":"ok"}`
|
||||
- [ ] `GET /v1/models` → model list with context windows
|
||||
- [x] `GET /health` → `{"status":"ok"}`
|
||||
- [x] `GET /v1/models` → model list with context windows
|
||||
- [x] Non-streaming `POST /v1/chat/completions` → full response
|
||||
- [x] Streaming `POST /v1/chat/completions` → SSE stream
|
||||
- [ ] Model field in request triggers model swap
|
||||
- [ ] UI chat (ChatViewModel) completely unaffected
|
||||
- [x] Model field in request triggers model swap
|
||||
- [x] UI chat (ChatViewModel) completely unaffected
|
||||
|
||||
Reference in New Issue
Block a user