diff --git a/config/config.exs b/config/config.exs index 8efaf7b..0a12341 100644 --- a/config/config.exs +++ b/config/config.exs @@ -18,6 +18,11 @@ config :bds, :scripting, job_timeout: :infinity, job_max_reductions: :none +config :bds, :embeddings, + backend: BDS.Embeddings.Backends.InApp, + model_id: "Xenova/multilingual-e5-small", + dimensions: 384 + config :logger, :console, format: "$time $metadata[$level] $message\n", metadata: [:request_id] diff --git a/lib/bds/embeddings.ex b/lib/bds/embeddings.ex index 6855d96..c1323c9 100644 --- a/lib/bds/embeddings.ex +++ b/lib/bds/embeddings.ex @@ -10,12 +10,10 @@ defmodule BDS.Embeddings do alias BDS.Projects alias BDS.Repo - @dimensions 384 @duplicate_threshold 0.5 - @model_id "Xenova/multilingual-e5-small" - def model_id, do: @model_id - def dimensions, do: @dimensions + def model_id, do: configured_backend().model_info().model_id + def dimensions, do: configured_backend().model_info().dimensions def sync_post(%Post{} = post) do if enabled_for_project?(post.project_id) do @@ -29,7 +27,7 @@ defmodule BDS.Embeddings do existing_key -> label = existing_key_label(existing_key) || next_label() - vector = vectorize(raw_text, post.language) + {:ok, vector} = embed_text(raw_text, post.language) (existing_key || %Key{}) |> Key.changeset(%{ @@ -245,6 +243,11 @@ defmodule BDS.Embeddings do defp existing_key_label(nil), do: nil defp existing_key_label(%Key{label: label}), do: label + defp configured_backend do + Application.get_env(:bds, :embeddings, []) + |> Keyword.get(:backend, BDS.Embeddings.Backends.InApp) + end + defp next_label do Repo.one(from key in Key, select: max(key.label)) |> case do @@ -277,40 +280,12 @@ defmodule BDS.Embeddings do defp compose_embedding_source(title, content), do: "#{title || ""}\n\n#{content || ""}" + defp embed_text(raw_text, language) do + configured_backend().embed("query: " <> raw_text, language: language) + end + defp hash_text(text), do: :crypto.hash(:sha256, text) |> Base.encode16(case: :lower) - defp vectorize(text, language) do - stemmed = BDS.Search.stem(text, language) - tokens = tokenize(stemmed) - bigrams = tokens |> Enum.chunk_every(2, 1, :discard) |> Enum.map(&Enum.join(&1, "::")) - weighted_tokens = tokens ++ bigrams - vector_array = :array.new(@dimensions, default: 0.0) - - vector = - Enum.reduce(weighted_tokens, vector_array, fn token, acc -> - index = :erlang.phash2(token, @dimensions) - :array.set(index, :array.get(index, acc) + 1.0, acc) - end) - |> :array.to_list() - - normalize(vector) - end - - defp tokenize(text) do - Regex.scan(~r/[[:alnum:]]+/u, String.downcase(text)) - |> List.flatten() - end - - defp normalize(vector) do - norm = :math.sqrt(Enum.reduce(vector, 0.0, fn value, acc -> acc + value * value end)) - - if norm == 0.0 do - vector - else - Enum.map(vector, &(&1 / norm)) - end - end - defp decode_vector(nil), do: [] defp decode_vector(vector), do: Jason.decode!(vector) diff --git a/lib/bds/embeddings/backend.ex b/lib/bds/embeddings/backend.ex new file mode 100644 index 0000000..b6471e3 --- /dev/null +++ b/lib/bds/embeddings/backend.ex @@ -0,0 +1,6 @@ +defmodule BDS.Embeddings.Backend do + @moduledoc false + + @callback model_info() :: %{model_id: String.t(), dimensions: pos_integer()} + @callback embed(String.t(), keyword()) :: {:ok, [number()]} | {:error, term()} +end diff --git a/lib/bds/embeddings/backends/in_app.ex b/lib/bds/embeddings/backends/in_app.ex new file mode 100644 index 0000000..e9bf768 --- /dev/null +++ b/lib/bds/embeddings/backends/in_app.ex @@ -0,0 +1,60 @@ +defmodule BDS.Embeddings.Backends.InApp do + @moduledoc false + + @behaviour BDS.Embeddings.Backend + + @impl true + def model_info do + config = Application.get_env(:bds, :embeddings, []) + + %{ + model_id: Keyword.get(config, :model_id, "Xenova/multilingual-e5-small"), + dimensions: Keyword.get(config, :dimensions, 384) + } + end + + @impl true + def embed(text, opts) when is_binary(text) and is_list(opts) do + language = Keyword.get(opts, :language) + dimensions = model_info().dimensions + + vector = + text + |> BDS.Search.stem(language) + |> tokenize() + |> weighted_terms() + |> project_to_vector(dimensions) + |> normalize() + + {:ok, vector} + end + + defp tokenize(text) do + Regex.scan(~r/[[:alnum:]]+/u, String.downcase(text)) + |> List.flatten() + end + + defp weighted_terms(tokens) do + bigrams = tokens |> Enum.chunk_every(2, 1, :discard) |> Enum.map(&Enum.join(&1, "::")) + tokens ++ bigrams + end + + defp project_to_vector(terms, dimensions) do + terms + |> Enum.reduce(:array.new(dimensions, default: 0.0), fn term, acc -> + index = :erlang.phash2(term, dimensions) + :array.set(index, :array.get(index, acc) + 1.0, acc) + end) + |> :array.to_list() + end + + defp normalize(vector) do + norm = :math.sqrt(Enum.reduce(vector, 0.0, fn value, acc -> acc + value * value end)) + + if norm == 0.0 do + vector + else + Enum.map(vector, &(&1 / norm)) + end + end +end \ No newline at end of file diff --git a/lib/bds/posts.ex b/lib/bds/posts.ex index 8971cca..452c6c2 100644 --- a/lib/bds/posts.ex +++ b/lib/bds/posts.ex @@ -547,6 +547,7 @@ defmodule BDS.Posts do |> Post.changeset(attrs) |> Repo.insert_or_update!() |> tap(&Search.sync_post/1) + |> tap(&Embeddings.sync_post/1) end defp parse_post_status(status) when is_atom(status), do: status diff --git a/test/bds/embeddings_test.exs b/test/bds/embeddings_test.exs index a78b2eb..67dd80c 100644 --- a/test/bds/embeddings_test.exs +++ b/test/bds/embeddings_test.exs @@ -1,6 +1,20 @@ defmodule BDS.EmbeddingsTest do use ExUnit.Case, async: false + defmodule FakeBackend do + @behaviour BDS.Embeddings.Backend + + @impl true + def model_info do + %{model_id: "fake/multilingual-e5-small", dimensions: 384} + end + + @impl true + def embed(text, opts) do + BDS.Embeddings.Backends.InApp.embed(text, opts) + end + end + setup do :ok = Ecto.Adapters.SQL.Sandbox.checkout(BDS.Repo) @@ -10,6 +24,18 @@ defmodule BDS.EmbeddingsTest do on_exit(fn -> File.rm_rf(temp_dir) end) {:ok, project} = BDS.Projects.create_project(%{name: "Embeddings", data_path: temp_dir}) + + previous_config = Application.get_env(:bds, :embeddings) + Application.put_env(:bds, :embeddings, backend: FakeBackend) + + on_exit(fn -> + if previous_config == nil do + Application.delete_env(:bds, :embeddings) + else + Application.put_env(:bds, :embeddings, previous_config) + end + end) + %{project: project} end @@ -103,4 +129,25 @@ defmodule BDS.EmbeddingsTest do assert {:ok, []} = BDS.Embeddings.find_duplicates(project.id) assert {:ok, %{}} = BDS.Embeddings.compute_similarities(post.id, [post.id]) end + + test "embeddings use the configured in-app backend module", %{project: project} do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + + assert BDS.Embeddings.model_id() == "fake/multilingual-e5-small" + assert BDS.Embeddings.dimensions() == 384 + + assert {:ok, post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Configured Backend", + content: "semantic runtime through the configured backend", + language: "en" + }) + + assert {:ok, post} = BDS.Posts.publish_post(post.id) + + assert {:ok, indexed} = BDS.Embeddings.index_unindexed(project.id) + assert post.id in indexed + end end diff --git a/test/bds/maintenance_test.exs b/test/bds/maintenance_test.exs index 9a1c014..8724fb2 100644 --- a/test/bds/maintenance_test.exs +++ b/test/bds/maintenance_test.exs @@ -1,8 +1,6 @@ defmodule BDS.MaintenanceTest do use ExUnit.Case, async: false - import Ecto.Query - alias BDS.Repo setup do @@ -22,6 +20,9 @@ defmodule BDS.MaintenanceTest do project: project, temp_dir: temp_dir } do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + posts_dir = Path.join([temp_dir, "posts", "2026", "04"]) File.mkdir_p!(posts_dir) @@ -111,6 +112,7 @@ defmodule BDS.MaintenanceTest do assert {:ok, posts} = BDS.Maintenance.rebuild_from_filesystem(project.id, "post") assert length(posts) == 1 + assert Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: "dispatch-post") != nil assert {:ok, media_items} = BDS.Maintenance.rebuild_from_filesystem(project.id, "media") assert length(media_items) == 1