fix: A1-14 real neural embeddings via Bumblebee multilingual-e5-small with Float32 BLOB vector cache

This commit is contained in:
2026-05-29 14:04:51 +02:00
parent 489d787306
commit a1004d72bf
16 changed files with 310 additions and 21 deletions

View File

@@ -0,0 +1,39 @@
defmodule BDS.Embeddings.Backends.NeuralTest do
  # Not async: the tests below rewrite the global `:bds, :embeddings` app env.
  use ExUnit.Case, async: false

  alias BDS.Embeddings.Backends.Neural

  # Values the backend is expected to echo back from its configuration.
  @spec_model_id "Xenova/multilingual-e5-small"
  @spec_dimensions 384

  # Snapshot the `:embeddings` config up front and put it back (or remove it
  # again if it was unset) once the test has exited.
  setup do
    original = Application.get_env(:bds, :embeddings)

    on_exit(fn ->
      case original do
        nil -> Application.delete_env(:bds, :embeddings)
        config -> Application.put_env(:bds, :embeddings, config)
      end
    end)

    :ok
  end

  test "reports the configured spec model id and dimensions without loading the model" do
    config = [
      backend: Neural,
      model_id: @spec_model_id,
      model_repo: "intfloat/multilingual-e5-small",
      dimensions: @spec_dimensions
    ]

    Application.put_env(:bds, :embeddings, config)

    assert %{model_id: @spec_model_id, dimensions: @spec_dimensions} = Neural.model_info()
  end

  test "implements the embeddings backend behaviour" do
    # Each `@behaviour` declaration shows up as its own `:behaviour` entry in
    # the compiled module attributes, so collect and flatten all of them.
    attributes = Neural.module_info(:attributes)
    declared = attributes |> Keyword.get_values(:behaviour) |> List.flatten()

    assert BDS.Embeddings.Backend in declared
  end
end

View File

@@ -321,6 +321,36 @@ defmodule BDS.EmbeddingsTest do
assert BDS.Embeddings.index_path(project.id) =~ "/embeddings.usearch"
end
test "stored embedding vectors are packed Float32 BLOBs, not JSON text", %{project: project} do
  # Expected layout: 384 dimensions, 4 bytes per little-endian Float32
  # (VectorCacheInDb).
  dimensions = 384
  bytes_per_float = 4

  # Switch on the project's semantic_similarity_enabled flag.
  assert {:ok, _metadata} =
           BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})

  post_attrs = %{
    project_id: project.id,
    title: "Blob",
    content: "space rocket orbit mission galaxy",
    language: "en"
  }

  assert {:ok, draft} = BDS.Posts.create_post(post_attrs)
  assert {:ok, published} = BDS.Posts.publish_post(draft.id)
  assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)

  stored = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: published.id)
  vector = stored.vector

  # The column must hold raw bytes of exactly the packed size, and must not
  # look like the start of a JSON array.
  assert is_binary(vector)
  assert byte_size(vector) == dimensions * bytes_per_float
  refute match?(<<"[", _rest::binary>>, vector)

  # Unpacking as consecutive little-endian 32-bit floats yields one value per
  # dimension.
  components = for <<component::little-float-size(32) <- vector>>, do: component
  assert length(components) == dimensions

  # The packed vector still drives similarity queries.
  assert {:ok, scores} = BDS.Embeddings.compute_similarities(published.id, [published.id])
  assert is_map(scores)
end
test "reindex_all rebuilds stored embeddings for the whole project", %{project: project} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})