feat: more on embedding

This commit is contained in:
2026-04-24 08:07:34 +02:00
parent 88f966dae9
commit 36fa08ad1e
7 changed files with 135 additions and 39 deletions

View File

@@ -18,6 +18,11 @@ config :bds, :scripting,
job_timeout: :infinity,
job_max_reductions: :none
# Embedding settings for the :bds application.
# `backend` names the module used to produce embeddings; `model_id` and
# `dimensions` are the values the in-app backend reports from model_info/0.
config :bds, :embeddings,
backend: BDS.Embeddings.Backends.InApp,
model_id: "Xenova/multilingual-e5-small",
dimensions: 384
# Console logger settings: each line is rendered as time, metadata, level and
# message, and only the :request_id metadata key is listed for output.
config :logger, :console,
format: "$time $metadata[$level] $message\n",
metadata: [:request_id]

View File

@@ -10,12 +10,10 @@ defmodule BDS.Embeddings do
alias BDS.Projects
alias BDS.Repo
# Vector length used by vectorize/2 and returned by the attribute-based dimensions/0.
@dimensions 384
# NOTE(review): presumably the similarity cut-off for duplicate detection; its
# use is not visible in this excerpt — confirm against find_duplicates.
@duplicate_threshold 0.5
# Model identifier returned by the attribute-based model_id/0.
@model_id "Xenova/multilingual-e5-small"
# Returns the model identifier held in the @model_id module attribute.
def model_id do
  @model_id
end
# Returns the vector length held in the @dimensions module attribute.
def dimensions do
  @dimensions
end
# Returns the model identifier reported by the configured backend's model_info/0.
def model_id do
  info = configured_backend().model_info()
  info.model_id
end
# Returns the vector length reported by the configured backend's model_info/0.
def dimensions do
  info = configured_backend().model_info()
  info.dimensions
end
def sync_post(%Post{} = post) do
if enabled_for_project?(post.project_id) do
@@ -29,7 +27,7 @@ defmodule BDS.Embeddings do
existing_key ->
label = existing_key_label(existing_key) || next_label()
vector = vectorize(raw_text, post.language)
{:ok, vector} = embed_text(raw_text, post.language)
(existing_key || %Key{})
|> Key.changeset(%{
@@ -245,6 +243,11 @@ defmodule BDS.Embeddings do
# Label of an already stored key; nil when no key was passed in.
defp existing_key_label(nil) do
  nil
end

defp existing_key_label(%Key{label: stored_label}) do
  stored_label
end
# Looks up the backend module under the :embeddings key of the :bds
# application environment, falling back to the in-app backend when the
# environment or its :backend entry is absent.
defp configured_backend do
  settings = Application.get_env(:bds, :embeddings, [])
  Keyword.get(settings, :backend, BDS.Embeddings.Backends.InApp)
end
defp next_label do
Repo.one(from key in Key, select: max(key.label))
|> case do
@@ -277,40 +280,12 @@ defmodule BDS.Embeddings do
# Builds the text that gets embedded: the title and the content separated by a
# blank line. A nil (or false) title or content contributes an empty string.
defp compose_embedding_source(title, content) do
  heading = title || ""
  body = content || ""
  "#{heading}\n\n#{body}"
end
# Sends the text to the configured backend's embed/2, passing the language as
# an option and returning the backend's result unchanged.
# NOTE(review): the "query: " prefix looks like the input convention of E5-style
# models — confirm it is also wanted for other backends.
defp embed_text(raw_text, language) do
  backend = configured_backend()
  prefixed = "query: " <> raw_text
  backend.embed(prefixed, language: language)
end
# Returns the SHA-256 digest of `text` as a lowercase hexadecimal string.
defp hash_text(text) do
  digest = :crypto.hash(:sha256, text)
  Base.encode16(digest, case: :lower)
end
# Turns text into a vector of @dimensions floats: the text is stemmed and
# tokenized, every token and every adjacent token pair is hashed to a slot
# whose count is incremented, and the counts are passed through normalize/1.
defp vectorize(text, language) do
  words = text |> BDS.Search.stem(language) |> tokenize()

  # Adjacent word pairs, joined with "::", are counted alongside single words.
  pairs =
    words
    |> Enum.zip(Enum.drop(words, 1))
    |> Enum.map(fn {first, second} -> first <> "::" <> second end)

  buckets =
    for term <- words ++ pairs, reduce: :array.new(@dimensions, default: 0.0) do
      counts ->
        slot = :erlang.phash2(term, @dimensions)
        :array.set(slot, :array.get(slot, counts) + 1.0, counts)
    end

  buckets |> :array.to_list() |> normalize()
end
# Splits text into lowercase runs of alphanumeric characters, in order of
# appearance; everything else acts as a separator.
defp tokenize(text) do
  pattern = ~r/[[:alnum:]]+/u
  for [word] <- Regex.scan(pattern, String.downcase(text)), do: word
end
# Scales a list of numbers to Euclidean length 1. A vector whose length is
# zero (including the empty list) is returned untouched to avoid dividing by zero.
defp normalize(vector) do
  sum_of_squares =
    for component <- vector, reduce: 0.0 do
      total -> total + component * component
    end

  magnitude = :math.sqrt(sum_of_squares)

  if magnitude != 0.0 do
    Enum.map(vector, fn component -> component / magnitude end)
  else
    vector
  end
end
# Decodes a stored vector from its JSON text; a missing (nil) vector decodes
# to the empty list. Jason.decode!/1 raises on malformed input.
defp decode_vector(nil) do
  []
end

defp decode_vector(encoded) do
  Jason.decode!(encoded)
end

View File

@@ -0,0 +1,6 @@
# Behaviour that an embedding backend module implements. It declares the two
# callbacks below and contains no code of its own.
defmodule BDS.Embeddings.Backend do
@moduledoc false
# Describes the backend's model: a map with a string :model_id and a
# positive-integer :dimensions.
@callback model_info() :: %{model_id: String.t(), dimensions: pos_integer()}
# Embeds a string, given a keyword list of options. Returns {:ok, vector} with
# a list of numbers on success, or {:error, reason} on failure.
@callback embed(String.t(), keyword()) :: {:ok, [number()]} | {:error, term()}
end

View File

@@ -0,0 +1,60 @@
defmodule BDS.Embeddings.Backends.InApp do
  @moduledoc false

  # Embedding backend that runs entirely inside the application. Text is
  # stemmed, split into lowercase alphanumeric tokens and extended with
  # adjacent-token bigrams; every term is hashed into one of `dimensions`
  # slots, and the slot counts are scaled to unit length.

  @behaviour BDS.Embeddings.Backend

  # Reads :model_id and :dimensions from the :embeddings entry of the :bds
  # application environment, using built-in defaults for missing values.
  @impl true
  def model_info do
    settings = Application.get_env(:bds, :embeddings, [])
    model_id = Keyword.get(settings, :model_id, "Xenova/multilingual-e5-small")
    dimensions = Keyword.get(settings, :dimensions, 384)

    %{model_id: model_id, dimensions: dimensions}
  end

  # Embeds `text`, always returning {:ok, vector}. The optional :language entry
  # of `opts` is handed to BDS.Search.stem/2.
  @impl true
  def embed(text, opts) when is_binary(text) and is_list(opts) do
    language = Keyword.get(opts, :language)
    size = model_info().dimensions

    stemmed = BDS.Search.stem(text, language)
    terms = stemmed |> tokenize() |> weighted_terms()

    {:ok, terms |> project_to_vector(size) |> normalize()}
  end

  # Lowercases the text and extracts its runs of alphanumeric characters.
  defp tokenize(text) do
    lowered = String.downcase(text)

    ~r/[[:alnum:]]+/u
    |> Regex.scan(lowered)
    |> Enum.concat()
  end

  # Appends one "left::right" bigram for every pair of neighbouring tokens.
  defp weighted_terms(tokens) do
    neighbours = Enum.zip(tokens, Enum.drop(tokens, 1))
    tokens ++ Enum.map(neighbours, fn {left, right} -> left <> "::" <> right end)
  end

  # Counts, per hash slot, how many terms fall into it; the result is a list
  # of `dimensions` floats.
  defp project_to_vector(terms, dimensions) do
    empty = :array.new(dimensions, default: 0.0)

    filled =
      for term <- terms, reduce: empty do
        counts ->
          slot = :erlang.phash2(term, dimensions)
          :array.set(slot, :array.get(slot, counts) + 1.0, counts)
      end

    :array.to_list(filled)
  end

  # Divides every component by the vector's Euclidean length.
  defp normalize(vector) do
    sum_of_squares =
      for component <- vector, reduce: 0.0 do
        total -> total + component * component
      end

    rescale(vector, :math.sqrt(sum_of_squares))
  end

  # A zero-length vector is returned as-is so nothing is divided by zero.
  defp rescale(vector, magnitude) when magnitude == 0.0, do: vector

  defp rescale(vector, magnitude) do
    Enum.map(vector, fn component -> component / magnitude end)
  end
end

View File

@@ -547,6 +547,7 @@ defmodule BDS.Posts do
|> Post.changeset(attrs)
|> Repo.insert_or_update!()
|> tap(&Search.sync_post/1)
|> tap(&Embeddings.sync_post/1)
end
defp parse_post_status(status) when is_atom(status), do: status

View File

@@ -1,6 +1,20 @@
defmodule BDS.EmbeddingsTest do
use ExUnit.Case, async: false
defmodule FakeBackend do
  # Test backend: reports its own model id and hands the actual embedding work
  # to the in-app backend.
  @behaviour BDS.Embeddings.Backend

  @impl true
  def model_info, do: %{model_id: "fake/multilingual-e5-small", dimensions: 384}

  @impl true
  defdelegate embed(text, opts), to: BDS.Embeddings.Backends.InApp
end
setup do
:ok = Ecto.Adapters.SQL.Sandbox.checkout(BDS.Repo)
@@ -10,6 +24,18 @@ defmodule BDS.EmbeddingsTest do
on_exit(fn -> File.rm_rf(temp_dir) end)
{:ok, project} = BDS.Projects.create_project(%{name: "Embeddings", data_path: temp_dir})
previous_config = Application.get_env(:bds, :embeddings)
Application.put_env(:bds, :embeddings, backend: FakeBackend)
on_exit(fn ->
if previous_config == nil do
Application.delete_env(:bds, :embeddings)
else
Application.put_env(:bds, :embeddings, previous_config)
end
end)
%{project: project}
end
@@ -103,4 +129,25 @@ defmodule BDS.EmbeddingsTest do
assert {:ok, []} = BDS.Embeddings.find_duplicates(project.id)
assert {:ok, %{}} = BDS.Embeddings.compute_similarities(post.id, [post.id])
end
# Checks that BDS.Embeddings reports the model id and dimensions of the backend
# set in the :embeddings application environment (FakeBackend, installed by
# setup), and that a published post is returned by index_unindexed/1.
test "embeddings use the configured in-app backend module", %{project: project} do
# Turn semantic similarity on for the project before creating any post.
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
# These values are the ones FakeBackend.model_info/0 returns.
assert BDS.Embeddings.model_id() == "fake/multilingual-e5-small"
assert BDS.Embeddings.dimensions() == 384
assert {:ok, post} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Configured Backend",
content: "semantic runtime through the configured backend",
language: "en"
})
# `post` is rebound to the published version returned by publish_post/1.
assert {:ok, post} = BDS.Posts.publish_post(post.id)
# The published post's id must be among the ids reported as indexed.
assert {:ok, indexed} = BDS.Embeddings.index_unindexed(project.id)
assert post.id in indexed
end
end

View File

@@ -1,8 +1,6 @@
defmodule BDS.MaintenanceTest do
use ExUnit.Case, async: false
import Ecto.Query
alias BDS.Repo
setup do
@@ -22,6 +20,9 @@ defmodule BDS.MaintenanceTest do
project: project,
temp_dir: temp_dir
} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
posts_dir = Path.join([temp_dir, "posts", "2026", "04"])
File.mkdir_p!(posts_dir)
@@ -111,6 +112,7 @@ defmodule BDS.MaintenanceTest do
assert {:ok, posts} = BDS.Maintenance.rebuild_from_filesystem(project.id, "post")
assert length(posts) == 1
assert Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: "dispatch-post") != nil
assert {:ok, media_items} = BDS.Maintenance.rebuild_from_filesystem(project.id, "media")
assert length(media_items) == 1