perf: batch CPU embedding inference and add A1-14c Apple GPU (EMLX) spec gap

This commit is contained in:
2026-05-29 14:43:39 +02:00
parent a1004d72bf
commit 744f7543d7
10 changed files with 275 additions and 75 deletions

View File

@@ -87,6 +87,8 @@ config {
debounce_persist: Duration = 5.seconds
-- Index file: {userData}/projects/{projectId}/embeddings.usearch
-- Key mapping is persisted alongside the embedding records
batch_size: Integer = 16 -- texts per batched inference run
sequence_length: Integer = 256 -- max tokens per input (truncated)
}
-- ─── Gating ─────────────────────────────────────────────────
@@ -224,6 +226,18 @@ invariant RealNeuralModel {
-- This is only achievable with the trained multilingual transformer model.
}
invariant NativeAcceleratedExecution {
-- Model execution MUST use the platform's native hardware acceleration
-- where available (GPU/Metal/Neural Engine on Apple Silicon, CUDA on
-- NVIDIA, etc.), and otherwise fall back to optimised native CPU execution.
-- Inference MUST be batched: batch_size inputs are run per compiled
-- inference pass, and each input is truncated to at most sequence_length
-- tokens, so (re)indexing many posts is not serialised one document at a
-- time.
-- Current implementation: Bumblebee + EXLA, which runs on the native CPU on
-- Apple Silicon (XLA has no Metal backend).
-- Known gap: the Apple GPU is therefore not yet used on Apple Silicon;
-- acceleration via EMLX/MLX is tracked as a follow-up (SPECGAPS A1-14c).
}
invariant ModelCaching {
-- Model files (~100 MB) downloaded from Hugging Face Hub on first use
-- Cached in app data directory, persists across sessions