fix: A1-14 real neural embeddings via Bumblebee multilingual-e5-small with Float32 BLOB vector cache

2026-05-29 14:04:51 +02:00
parent 489d787306
commit a1004d72bf
16 changed files with 310 additions and 21 deletions
--- a/lib/bds/embeddings/backends/in_app.ex
+++ b/lib/bds/embeddings/backends/in_app.ex
@@ -1,5 +1,13 @@
 defmodule BDS.Embeddings.Backends.InApp do
-  @moduledoc false
+  @moduledoc """
+  Deterministic lexical embedding stub.
+
+  This backend does NOT satisfy the `RealNeuralModel` invariant — it projects
+  stemmed tokens and bigrams into a sparse hashed vector. It exists only as an
+  offline, dependency-free fallback for tests and environments where the neural
+  model (see `BDS.Embeddings.Backends.Neural`) cannot be loaded. Production and
+  development use the neural backend.
+  """

  @behaviour BDS.Embeddings.Backend

--- a/lib/bds/embeddings/backends/neural.ex
+++ b/lib/bds/embeddings/backends/neural.ex
@@ -0,0 +1,104 @@
+defmodule BDS.Embeddings.Backends.Neural do
+  @moduledoc """
+  Real on-device neural embedding backend.
+
+  Implements the `RealNeuralModel` and `ModelCaching` invariants from
+  `specs/embedding.allium`: embeddings are produced by the actual
+  multilingual-e5-small transformer (the `intfloat/multilingual-e5-small`
+  weights behind the `Xenova/multilingual-e5-small` identifier) via
+  Bumblebee + EXLA, never by a lexical approximation.
+
+    * Lazy-loaded — the model pipeline is built on the first embedding
+      request, not at application startup.
+    * Model files (~100 MB) are downloaded from the Hugging Face Hub on
+      first use and cached on disk (Bumblebee cache dir), persisting across
+      sessions and project switches.
+    * Text preprocessing follows the e5 convention: every input is prefixed
+      with `"query: "` before tokenisation. The model's token hidden states
+      are then mean-pooled over the attention mask and the resulting vector
+      is L2-normalised. This is what makes cross-language semantic
+      similarity work.
+  """
+
+  @behaviour BDS.Embeddings.Backend
+
+  use GenServer
+
+  @query_prefix "query: "
+  @embed_timeout :timer.minutes(2)
+
+  @default_model_id "Xenova/multilingual-e5-small"
+  @default_model_repo "intfloat/multilingual-e5-small"
+  @default_dimensions 384
+
+  # Supervisor child specification: one child identified by this module and
+  # started through `start_link/1` with the options given here.
+  def child_spec(opts) do
+    start_mfa = {__MODULE__, :start_link, [opts]}
+    %{id: __MODULE__, start: start_mfa}
+  end
+
+  # Starts the backend process and registers it under the module name, which
+  # is the name `embed/2` sends its calls to.
+  def start_link(opts \\ []) do
+    server_opts = [name: __MODULE__]
+    GenServer.start_link(__MODULE__, opts, server_opts)
+  end
+
+  # Reports the model identifier and the vector size. Values found in the
+  # embeddings application config take precedence over the module defaults.
+  @impl BDS.Embeddings.Backend
+  def model_info do
+    settings = config()
+    model_id = Keyword.get(settings, :model_id, @default_model_id)
+    dimensions = Keyword.get(settings, :dimensions, @default_dimensions)
+
+    %{model_id: model_id, dimensions: dimensions}
+  end
+
+  # Prepends the query prefix to the text and asks the backend process for its
+  # embedding. An exit raised by the call (for example when the process is not
+  # running or the call times out) is returned as an error tuple instead of
+  # propagating to the caller.
+  @impl BDS.Embeddings.Backend
+  def embed(text, _opts) when is_binary(text) do
+    try do
+      request = {:embed, @query_prefix <> text}
+      GenServer.call(__MODULE__, request, @embed_timeout)
+    catch
+      :exit, reason -> {:error, {:embedding_backend_unavailable, reason}}
+    end
+  end
+
+  # The serving starts out unset; the start options are ignored. It is built
+  # later, when the first embed request is handled.
+  @impl GenServer
+  def init(_opts) do
+    initial_state = %{serving: nil}
+    {:ok, initial_state}
+  end
+
+  # Serves one embed request: makes sure a serving exists, runs the text
+  # through it and replies with the embedding as a flat list. A failure to
+  # build the serving is replied unchanged; any exception raised along the way
+  # is replied as `{:error, message}`. A freshly built serving is kept in the
+  # state only when the whole request succeeds — both error paths reply with
+  # the state the request started with.
+  @impl GenServer
+  def handle_call({:embed, text}, _from, state) do
+    try do
+      case ensure_serving(state) do
+        {:error, _reason} = failure ->
+          {:reply, failure, state}
+
+        {:ok, %{serving: serving} = ready_state} ->
+          %{embedding: tensor} = Nx.Serving.run(serving, text)
+          vector = Nx.to_flat_list(tensor)
+          {:reply, {:ok, vector}, ready_state}
+      end
+    rescue
+      exception ->
+        {:reply, {:error, Exception.message(exception)}, state}
+    end
+  end
+
+  # Returns `{:ok, state}` with a serving in place. A state whose serving is
+  # still `nil` triggers a build, and a build error is handed back unchanged;
+  # any other state is passed through untouched.
+  defp ensure_serving(%{serving: nil} = state) do
+    built = build_serving()
+
+    case built do
+      {:error, _reason} -> built
+      {:ok, serving} -> {:ok, Map.replace!(state, :serving, serving)}
+    end
+  end
+
+  defp ensure_serving(state) do
+    {:ok, state}
+  end
+
+  # Loads the model and the tokenizer from the configured Hugging Face
+  # repository (falling back to the module default) and wraps them in a
+  # text-embedding serving. Any result from either load that is not
+  # `{:ok, _}` is returned unchanged.
+  defp build_serving do
+    repo_name = Keyword.get(config(), :model_repo, @default_model_repo)
+    source = {:hf, repo_name}
+
+    serving_opts = [
+      output_pool: :mean_pooling,
+      output_attribute: :hidden_state,
+      embedding_processor: :l2_norm,
+      compile: [batch_size: 1, sequence_length: 512],
+      defn_options: [compiler: EXLA]
+    ]
+
+    with {:ok, loaded_model} <- Bumblebee.load_model(source),
+         {:ok, tokenizer} <- Bumblebee.load_tokenizer(source) do
+      {:ok, Bumblebee.Text.text_embedding(loaded_model, tokenizer, serving_opts)}
+    end
+  end
+
+  # Settings stored under `:embeddings` in the `:bds` application environment;
+  # an empty list when nothing is configured.
+  defp config do
+    Application.get_env(:bds, :embeddings, [])
+  end
+end