perf: batch CPU embedding inference and add A1-14c Apple GPU (EMLX) spec gap

2026-05-29 14:43:39 +02:00
parent a1004d72bf
commit 744f7543d7
10 changed files with 275 additions and 75 deletions
--- a/config/config.exs
+++ b/config/config.exs
@@ -64,7 +64,11 @@ config :bds, :embeddings,
  backend: BDS.Embeddings.Backends.Neural,
  model_id: "Xenova/multilingual-e5-small",
  model_repo: "intfloat/multilingual-e5-small",
-  dimensions: 384
+  dimensions: 384,
+  # Inference is batched: each compiled run embeds batch_size texts, with each
+  # text truncated to sequence_length tokens. Tuning these trades throughput against memory.
+  batch_size: 16,
+  sequence_length: 256

 # Cache downloaded model files under the app data directory so they persist
 # across sessions (ModelCaching invariant). Overridden at runtime in prod.
--- a/config/test.exs
+++ b/config/test.exs
@@ -15,4 +15,6 @@ config :bds, :embeddings,
  backend: BDS.Embeddings.Backends.InApp,
  model_id: "Xenova/multilingual-e5-small",
  model_repo: "intfloat/multilingual-e5-small",
-  dimensions: 384
+  dimensions: 384,
+  batch_size: 16,
+  sequence_length: 256