From b2086a79c5a5735c06c8dc2242f17dec369ab5a9 Mon Sep 17 00:00:00 2001 From: Chili Palmer Date: Fri, 24 Apr 2026 09:48:52 +0200 Subject: [PATCH] feat: finalisation (hopefully) for embedding --- lib/bds/embeddings.ex | 246 ++++++++++++++++++++++++++++++---- lib/bds/embeddings/index.ex | 167 +++++++++++++++++++++++ lib/bds/maintenance.ex | 8 +- lib/bds/metadata.ex | 13 ++ test/bds/embeddings_test.exs | 217 ++++++++++++++++++++++++++++-- test/bds/maintenance_test.exs | 35 +++++ test/bds/metadata_test.exs | 22 +++ 7 files changed, 669 insertions(+), 39 deletions(-) create mode 100644 lib/bds/embeddings/index.ex diff --git a/lib/bds/embeddings.ex b/lib/bds/embeddings.ex index c1323c9..1d8672d 100644 --- a/lib/bds/embeddings.ex +++ b/lib/bds/embeddings.ex @@ -4,18 +4,111 @@ defmodule BDS.Embeddings do import Ecto.Query alias BDS.Embeddings.DismissedDuplicatePair + alias BDS.Embeddings.Index alias BDS.Embeddings.Key alias BDS.Metadata alias BDS.Posts.Post alias BDS.Projects alias BDS.Repo - @duplicate_threshold 0.5 + @duplicate_threshold 0.92 + @exact_match_score 0.999999 def model_id, do: configured_backend().model_info().model_id def dimensions, do: configured_backend().model_info().dimensions + def index_path(project_id), do: Index.path(project_id) + def reindex_all(project_id), do: rebuild_project(project_id) + + def get_indexing_progress(project_id) when is_binary(project_id) do + indexed = + Repo.one( + from key in Key, + where: key.project_id == ^project_id, + select: count(key.post_id, :distinct) + ) || 0 + + total = + Repo.one( + from post in Post, + where: post.project_id == ^project_id, + select: count(post.id) + ) || 0 + + {:ok, %{indexed: indexed, total: total}} + end def sync_post(%Post{} = post) do + sync_post(post, refresh_index: true) + end + + def sync_post(post_id) when is_binary(post_id) do + case Repo.get(Post, post_id) do + nil -> :ok + post -> sync_post(post) + end + end + + def rebuild_project(project_id) when is_binary(project_id) do + if enabled_for_project?(project_id) do + posts = + Repo.all(from post in Post, where: post.project_id == ^project_id, order_by: [asc: post.created_at, asc: post.slug]) + + post_ids = Enum.map(posts, & &1.id) + + Repo.delete_all( + from key in Key, + where: key.project_id == ^project_id and key.post_id not in ^post_ids + ) + + Enum.each(posts, &sync_post(&1, refresh_index: false)) + :ok = rebuild_snapshot(project_id) + {:ok, post_ids} + else + {:ok, []} + end + end + + def diff_reports(project_id) when is_binary(project_id) do + if enabled_for_project?(project_id) do + snapshot_entries = + case Index.read(project_id) do + {:ok, snapshot} -> Map.get(snapshot, "entries", %{}) + _other -> %{} + end + + keys_by_post = + Repo.all(from key in Key, where: key.project_id == ^project_id) + |> Map.new(&{&1.post_id, &1}) + + Repo.all(from post in Post, where: post.project_id == ^project_id) + |> Enum.flat_map(fn post -> + expected_hash = post_content_hash(post) + key = Map.get(keys_by_post, post.id) + snapshot_entry = Map.get(snapshot_entries, post.id) + + differences = + [ + diff_field("content_hash", key && key.content_hash, expected_hash), + diff_field( + "snapshot_content_hash", + snapshot_entry && snapshot_entry["content_hash"], + key && key.content_hash + ) + ] + |> Enum.reject(&is_nil/1) + + if differences == [] do + [] + else + [%{entity_type: "embedding", entity_id: post.id, differences: differences}] + end + end) + else + [] + end + end + + defp sync_post(%Post{} = post, opts) do if enabled_for_project?(post.project_id) do body = resolve_post_body(post) raw_text = compose_embedding_source(post.title, body) @@ -39,6 +132,10 @@ defmodule BDS.Embeddings do }) |> Repo.insert_or_update() + if Keyword.get(opts, :refresh_index, true) do + :ok = rebuild_snapshot(post.project_id) + end + :ok end else @@ -46,15 +143,22 @@ defmodule BDS.Embeddings do end end - def sync_post(post_id) when is_binary(post_id) do - case Repo.get(Post, post_id) do - nil -> :ok - post -> sync_post(post) - end - end - def remove_post(post_id) when is_binary(post_id) do + project_id = + case Repo.get_by(Key, post_id: post_id) do + %Key{project_id: project_id} -> project_id + nil -> case Repo.get(Post, post_id) do + %Post{project_id: project_id} -> project_id + nil -> nil + end + end + Repo.delete_all(from key in Key, where: key.post_id == ^post_id) + + if is_binary(project_id) and enabled_for_project?(project_id) do + :ok = rebuild_snapshot(project_id) + end + :ok end @@ -70,10 +174,16 @@ defmodule BDS.Embeddings do case Repo.get_by(Key, post_id: post.id, project_id: project_id) do %Key{content_hash: ^content_hash} -> :ok _other -> - :ok = sync_post(%{post | content: if(post.content in [nil, ""], do: body, else: post.content)}) + :ok = + sync_post( + %{post | content: if(post.content in [nil, ""], do: body, else: post.content)}, + refresh_index: false + ) end end) + :ok = rebuild_snapshot(project_id) + indexed = Repo.all(from key in Key, where: key.project_id == ^project_id, select: key.post_id) {:ok, indexed} @@ -88,10 +198,14 @@ defmodule BDS.Embeddings do {:error, :not_found} -> {:ok, []} {:ok, post, source_vector} -> similar = - Repo.all(from key in Key, where: key.project_id == ^post.project_id and key.post_id != ^post.id) - |> Enum.map(fn key -> %{post_id: key.post_id, score: cosine_similarity(source_vector, decode_vector(key.vector))} end) - |> Enum.sort_by(& &1.score, :desc) - |> Enum.take(max(limit, 0)) + case Index.neighbors(post.project_id, post.id, limit) do + {:ok, neighbors} -> neighbors + {:error, :missing} -> + Repo.all(from key in Key, where: key.project_id == ^post.project_id and key.post_id != ^post.id) + |> Enum.map(fn key -> %{post_id: key.post_id, score: cosine_similarity(source_vector, decode_vector(key.vector))} end) + |> Enum.sort_by(& &1.score, :desc) + |> Enum.take(max(limit, 0)) + end {:ok, similar} end @@ -150,22 +264,31 @@ defmodule BDS.Embeddings do def find_duplicates(project_id) when is_binary(project_id) do if enabled_for_project?(project_id) do dismissed = dismissed_pair_keys(project_id) - keys = Repo.all(from key in Key, where: key.project_id == ^project_id, order_by: [asc: key.post_id]) duplicates = - for left <- keys, - right <- keys, - left.post_id < right.post_id, - pair_key(left.post_id, right.post_id) not in dismissed, - similarity = cosine_similarity(decode_vector(left.vector), decode_vector(right.vector)), - similarity >= @duplicate_threshold do - %{ - post_id_a: left.post_id, - post_id_b: right.post_id, - score: similarity - } + case Index.duplicate_pairs(project_id, @duplicate_threshold) do + {:ok, pairs} -> + pairs + |> Enum.reject(fn pair -> pair_key(pair.post_id_a, pair.post_id_b) in dismissed end) + |> enrich_duplicate_pairs(project_id) + + {:error, :missing} -> + keys = Repo.all(from key in Key, where: key.project_id == ^project_id, order_by: [asc: key.post_id]) + + for left <- keys, + right <- keys, + left.post_id < right.post_id, + pair_key(left.post_id, right.post_id) not in dismissed, + similarity = cosine_similarity(decode_vector(left.vector), decode_vector(right.vector)), + similarity >= @duplicate_threshold do + %{ + post_id_a: left.post_id, + post_id_b: right.post_id, + score: similarity + } + end + |> enrich_duplicate_pairs(project_id) end - |> Enum.sort_by(& &1.score, :desc) {:ok, duplicates} else @@ -204,6 +327,26 @@ defmodule BDS.Embeddings do end end + def dismiss_duplicate_pairs(pair_ids) when is_list(pair_ids) do + pair_ids + |> Enum.filter(fn + {post_id_a, post_id_b} when is_binary(post_id_a) and is_binary(post_id_b) -> true + _other -> false + end) + |> Enum.map(fn {post_id_a, post_id_b} -> sort_pair(post_id_a, post_id_b) end) + |> Enum.uniq() + |> Enum.reduce_while({:ok, []}, fn {post_id_a, post_id_b}, {:ok, acc} -> + case dismiss_duplicate_pair(post_id_a, post_id_b) do + {:ok, saved_pair} -> {:cont, {:ok, [saved_pair | acc]}} + {:error, reason} -> {:halt, {:error, reason}} + end + end) + |> case do + {:ok, saved_pairs} -> {:ok, Enum.reverse(saved_pairs)} + {:error, reason} -> {:error, reason} + end + end + defp source_post_and_vector(post_id) do with {:ok, post} <- fetch_post(post_id) do if enabled_for_project?(post.project_id) do @@ -233,6 +376,37 @@ defmodule BDS.Embeddings do end end + defp enrich_duplicate_pairs(pairs, project_id) do + posts_by_id = + pairs + |> Enum.flat_map(&[&1.post_id_a, &1.post_id_b]) + |> Enum.uniq() + |> then(fn post_ids -> + Repo.all(from post in Post, where: post.project_id == ^project_id and post.id in ^post_ids) + |> Map.new(&{&1.id, &1}) + end) + + pairs + |> Enum.map(fn pair -> + post_a = Map.fetch!(posts_by_id, pair.post_id_a) + post_b = Map.fetch!(posts_by_id, pair.post_id_b) + exact_match = exact_duplicate_match?(pair.score, post_a, post_b) + + pair + |> Map.put(:title_a, post_a.title || "") + |> Map.put(:title_b, post_b.title || "") + |> Map.put(:similarity, pair.score) + |> Map.put(:exact_match, exact_match) + end) + |> Enum.sort_by(fn pair -> {not pair.exact_match, -pair.score, pair.post_id_a, pair.post_id_b} end) + end + + defp exact_duplicate_match?(score, %Post{} = post_a, %Post{} = post_b) do + score >= @exact_match_score and + (post_a.title || "") == (post_b.title || "") and + resolve_post_body(post_a) == resolve_post_body(post_b) + end + defp enabled_for_project?(project_id) do case Metadata.get_project_metadata(project_id) do {:ok, metadata} -> metadata.semantic_similarity_enabled == true @@ -280,10 +454,30 @@ defmodule BDS.Embeddings do defp compose_embedding_source(title, content), do: "#{title || ""}\n\n#{content || ""}" + defp post_content_hash(%Post{} = post) do + body = resolve_post_body(post) + hash_text(compose_embedding_source(post.title, body)) + end + defp embed_text(raw_text, language) do configured_backend().embed("query: " <> raw_text, language: language) end + defp rebuild_snapshot(project_id) do + Index.rebuild(project_id, model_id: model_id(), dimensions: dimensions()) + end + + defp diff_field(name, db_value, file_value) do + db_value = if(is_binary(db_value), do: db_value, else: db_value || "") + file_value = if(is_binary(file_value), do: file_value, else: file_value || "") + + if db_value == file_value do + nil + else + %{name: name, db_value: db_value, file_value: file_value} + end + end + defp hash_text(text), do: :crypto.hash(:sha256, text) |> Base.encode16(case: :lower) defp decode_vector(nil), do: [] diff --git a/lib/bds/embeddings/index.ex b/lib/bds/embeddings/index.ex new file mode 100644 index 0000000..7532a0f --- /dev/null +++ b/lib/bds/embeddings/index.ex @@ -0,0 +1,167 @@ +defmodule BDS.Embeddings.Index do + @moduledoc false + + import Ecto.Query + + alias BDS.Embeddings.Key + alias BDS.Projects + alias BDS.Repo + + @neighbor_limit 21 + + def path(project_id) when is_binary(project_id) do + project = Projects.get_project!(project_id) + Path.join(Projects.project_data_dir(project), "embeddings.usearch") + end + + def rebuild(project_id, opts) when is_binary(project_id) and is_list(opts) do + model_id = Keyword.fetch!(opts, :model_id) + dimensions = Keyword.fetch!(opts, :dimensions) + + keys = + Repo.all( + from key in Key, + where: key.project_id == ^project_id, + order_by: [asc: key.post_id] + ) + + entries = + keys + |> Enum.map(fn key -> + vector = decode_vector(key.vector) + + {key.post_id, + %{ + "label" => key.label, + "content_hash" => key.content_hash, + "neighbors" => neighbor_entries(keys, key, vector) + }} + end) + |> Map.new() + + payload = %{ + "project_id" => project_id, + "model_id" => model_id, + "dimensions" => dimensions, + "updated_at" => System.system_time(:second), + "entries" => entries + } + + write_snapshot(path(project_id), payload) + end + + def read(project_id) when is_binary(project_id) do + snapshot_path = path(project_id) + + case File.read(snapshot_path) do + {:ok, contents} -> {:ok, Jason.decode!(contents)} + {:error, :enoent} -> read_legacy_snapshot(project_id) + {:error, reason} -> {:error, reason} + end + end + + def neighbors(project_id, post_id, limit) when is_binary(project_id) and is_binary(post_id) do + with {:ok, snapshot} <- read(project_id), + %{} = entry <- get_in(snapshot, ["entries", post_id]) do + entry + |> Map.get("neighbors", []) + |> Enum.take(max(limit, 0)) + |> Enum.map(fn neighbor -> + %{ + post_id: neighbor["post_id"], + score: neighbor["score"] + } + end) + |> then(&{:ok, &1}) + else + _ -> {:error, :missing} + end + end + + def duplicate_pairs(project_id, threshold) when is_binary(project_id) do + with {:ok, snapshot} <- read(project_id) do + pairs = + snapshot + |> Map.get("entries", %{}) + |> Enum.flat_map(fn {post_id, entry} -> + entry + |> Map.get("neighbors", []) + |> Enum.filter(&(&1["score"] >= threshold)) + |> Enum.map(fn neighbor -> + {post_id_a, post_id_b} = sort_pair(post_id, neighbor["post_id"]) + + {{post_id_a, post_id_b}, + %{ + post_id_a: post_id_a, + post_id_b: post_id_b, + score: neighbor["score"] + }} + end) + end) + |> Map.new() + |> Map.values() + |> Enum.sort_by(& &1.score, :desc) + + {:ok, pairs} + else + _ -> {:error, :missing} + end + end + + defp neighbor_entries(keys, current_key, current_vector) do + keys + |> Enum.reject(&(&1.post_id == current_key.post_id)) + |> Enum.map(fn other_key -> + %{ + "post_id" => other_key.post_id, + "label" => other_key.label, + "score" => cosine_similarity(current_vector, decode_vector(other_key.vector)) + } + end) + |> Enum.sort_by(& &1["score"], :desc) + |> Enum.take(@neighbor_limit) + end + + defp write_snapshot(snapshot_path, payload) do + :ok = File.mkdir_p(Path.dirname(snapshot_path)) + temp_path = snapshot_path <> ".tmp" + :ok = File.write(temp_path, Jason.encode!(payload)) + :ok = File.rename(temp_path, snapshot_path) + legacy_path = legacy_path(snapshot_path) + + if File.exists?(legacy_path) do + File.rm(legacy_path) + end + + :ok + end + + defp read_legacy_snapshot(project_id) do + legacy_snapshot_path = project_id |> path() |> legacy_path() + + case File.read(legacy_snapshot_path) do + {:ok, contents} -> {:ok, Jason.decode!(contents)} + {:error, :enoent} -> {:error, :missing} + {:error, reason} -> {:error, reason} + end + end + + defp legacy_path(snapshot_path) do + Path.join(Path.dirname(snapshot_path), "embeddings.index.json") + end + + defp decode_vector(nil), do: [] + defp decode_vector(vector), do: Jason.decode!(vector) + + defp cosine_similarity([], _other), do: 0.0 + defp cosine_similarity(_vector, []), do: 0.0 + + defp cosine_similarity(left, right) do + Enum.zip(left, right) + |> Enum.reduce(0.0, fn {left_value, right_value}, acc -> acc + left_value * right_value end) + |> max(0.0) + end + + defp sort_pair(post_id_a, post_id_b) when post_id_a <= post_id_b, do: {post_id_a, post_id_b} + defp sort_pair(post_id_a, post_id_b), do: {post_id_b, post_id_a} +end \ No newline at end of file diff --git a/lib/bds/maintenance.ex b/lib/bds/maintenance.ex index 1d881cd..edc299e 100644 --- a/lib/bds/maintenance.ex +++ b/lib/bds/maintenance.ex @@ -6,6 +6,7 @@ defmodule BDS.Maintenance do alias BDS.Frontmatter alias BDS.Media.Media alias BDS.Media.Translation, as: MediaTranslation + alias BDS.Embeddings alias BDS.Posts.Post alias BDS.Posts.Translation, as: PostTranslation alias BDS.Projects @@ -20,6 +21,7 @@ defmodule BDS.Maintenance do :media -> BDS.Media.rebuild_media_from_files(project_id) :script -> BDS.Scripts.rebuild_scripts_from_files(project_id) :template -> BDS.Templates.rebuild_templates_from_files(project_id) + :embedding -> Embeddings.rebuild_project(project_id) :unsupported -> {:error, :unsupported_entity_type} end end @@ -33,7 +35,8 @@ defmodule BDS.Maintenance do media_diff_reports(project_id, project) ++ media_translation_diff_reports(project_id, project) ++ script_diff_reports(project_id, project) ++ - template_diff_reports(project_id, project) + template_diff_reports(project_id, project) ++ + Embeddings.diff_reports(project_id) orphan_reports = orphan_reports(project_id, project) @@ -44,10 +47,13 @@ defmodule BDS.Maintenance do defp normalize_entity_type(:media), do: :media defp normalize_entity_type(:script), do: :script defp normalize_entity_type(:template), do: :template + defp normalize_entity_type(:embedding), do: :embedding defp normalize_entity_type("post"), do: :post defp normalize_entity_type("media"), do: :media defp normalize_entity_type("script"), do: :script defp normalize_entity_type("template"), do: :template + defp normalize_entity_type("embedding"), do: :embedding + defp normalize_entity_type("embeddings"), do: :embedding defp normalize_entity_type(_entity_type), do: :unsupported defp post_diff_reports(project_id, project) do diff --git a/lib/bds/metadata.ex b/lib/bds/metadata.ex index 5348564..a10f356 100644 --- a/lib/bds/metadata.ex +++ b/lib/bds/metadata.ex @@ -1,6 +1,7 @@ defmodule BDS.Metadata do @moduledoc false + alias BDS.Embeddings alias BDS.Projects alias BDS.Projects.Project alias BDS.Repo @@ -50,6 +51,7 @@ defmodule BDS.Metadata do load_state(updated_project) end) |> unwrap_transaction() + |> maybe_backfill_embeddings(project_id, state, project_metadata) end def add_category(project_id, name) do @@ -311,6 +313,17 @@ defmodule BDS.Metadata do defp unwrap_transaction({:ok, result}), do: {:ok, result} defp unwrap_transaction({:error, reason}), do: {:error, reason} + defp maybe_backfill_embeddings({:ok, _metadata} = result, project_id, previous_state, project_metadata) do + if previous_state.semantic_similarity_enabled != true and + project_metadata.semantic_similarity_enabled == true do + {:ok, _indexed_post_ids} = Embeddings.index_unindexed(project_id) + end + + result + end + + defp maybe_backfill_embeddings(result, _project_id, _previous_state, _project_metadata), do: result + defp attr(attrs, key) do cond do Map.has_key?(attrs, key) -> Map.get(attrs, key) diff --git a/test/bds/embeddings_test.exs b/test/bds/embeddings_test.exs index 67dd80c..2c3a392 100644 --- a/test/bds/embeddings_test.exs +++ b/test/bds/embeddings_test.exs @@ -39,7 +39,7 @@ defmodule BDS.EmbeddingsTest do %{project: project} end - test "embeddings index published posts when semantic similarity is enabled and support similarity, duplicates, dismissals, and tag suggestions", + test "embeddings index published posts when semantic similarity is enabled and support similarity, dismissals, and tag suggestions", %{project: project} do assert {:ok, _metadata} = BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) @@ -89,20 +89,9 @@ defmodule BDS.EmbeddingsTest do assert {:ok, suggestions} = BDS.Embeddings.suggest_tags(alpha.id, "rocket orbit mission") assert "space" in suggestions - assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id) - assert Enum.any?(duplicates, fn pair -> - MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id]) - end) - assert {:ok, dismissal} = BDS.Embeddings.dismiss_duplicate_pair(alpha.id, beta.id) assert dismissal.project_id == project.id - assert {:ok, filtered_duplicates} = BDS.Embeddings.find_duplicates(project.id) - - refute Enum.any?(filtered_duplicates, fn pair -> - MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id]) - end) - assert {:ok, alpha} = BDS.Posts.update_post(alpha.id, %{content: "kitchen flour dough loaf"}) assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id) @@ -115,6 +104,97 @@ defmodule BDS.EmbeddingsTest do refute Map.has_key?(after_delete, gamma.id) end + test "duplicate detection keeps exact matches and excludes lower-similarity pairs below the spec threshold", + %{project: project} do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + + assert {:ok, exact_a} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Exact Match", + content: "space rocket launch orbit mission galaxy", + language: "en" + }) + + assert {:ok, exact_b} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Exact Match", + content: "space rocket launch orbit mission galaxy", + language: "en" + }) + + assert {:ok, fuzzy} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Fuzzy", + content: "space rocket launch mission orbit space station", + language: "en" + }) + + assert {:ok, exact_a} = BDS.Posts.publish_post(exact_a.id) + assert {:ok, exact_b} = BDS.Posts.publish_post(exact_b.id) + assert {:ok, fuzzy} = BDS.Posts.publish_post(fuzzy.id) + + assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id) + assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id) + + assert [%{post_id_a: post_id_a, post_id_b: post_id_b, score: score}] = duplicates + assert MapSet.new([post_id_a, post_id_b]) == MapSet.new([exact_a.id, exact_b.id]) + assert score >= 0.99 + assert hd(duplicates).similarity == score + assert hd(duplicates).exact_match == true + + assert MapSet.new([hd(duplicates).title_a, hd(duplicates).title_b]) == + MapSet.new(["Exact Match", "Exact Match"]) + + refute Enum.any?(duplicates, fn pair -> + MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([exact_a.id, fuzzy.id]) + end) + end + + test "batch duplicate dismissal stores canonical pairs and excludes them from future searches", + %{project: project} do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + + assert {:ok, exact_a} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Exact Match", + content: "space rocket launch orbit mission galaxy", + language: "en" + }) + + assert {:ok, exact_b} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Exact Match", + content: "space rocket launch orbit mission galaxy", + language: "en" + }) + + assert {:ok, exact_a} = BDS.Posts.publish_post(exact_a.id) + assert {:ok, exact_b} = BDS.Posts.publish_post(exact_b.id) + assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id) + + assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id) + assert length(duplicates) == 1 + + assert {:ok, dismissed_pairs} = + BDS.Embeddings.dismiss_duplicate_pairs([ + {exact_b.id, exact_a.id}, + {exact_a.id, exact_b.id} + ]) + + assert length(dismissed_pairs) == 1 + assert hd(dismissed_pairs).project_id == project.id + + assert {:ok, filtered_duplicates} = BDS.Embeddings.find_duplicates(project.id) + assert filtered_duplicates == [] + end + test "embedding queries are gated off when semantic similarity is disabled", %{project: project} do assert {:ok, post} = BDS.Posts.create_post(%{ @@ -130,6 +210,41 @@ defmodule BDS.EmbeddingsTest do assert {:ok, %{}} = BDS.Embeddings.compute_similarities(post.id, [post.id]) end + test "get_indexing_progress returns zero indexed and total when a project has no posts", %{ + project: project + } do + assert {:ok, %{indexed: 0, total: 0}} = BDS.Embeddings.get_indexing_progress(project.id) + end + + test "get_indexing_progress returns indexed embeddings and total posts for the project", %{ + project: project + } do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + + assert {:ok, indexed_post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Indexed", + content: "space rocket orbit mission galaxy", + language: "en" + }) + + assert {:ok, unindexed_post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Unindexed", + content: "flour yeast dough oven loaf kitchen", + language: "en" + }) + + assert {:ok, indexed_post} = BDS.Posts.publish_post(indexed_post.id) + + assert {:ok, %{indexed: 2, total: 2}} = BDS.Embeddings.get_indexing_progress(project.id) + + assert unindexed_post.id != indexed_post.id + end + test "embeddings use the configured in-app backend module", %{project: project} do assert {:ok, _metadata} = BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) @@ -150,4 +265,82 @@ defmodule BDS.EmbeddingsTest do assert {:ok, indexed} = BDS.Embeddings.index_unindexed(project.id) assert post.id in indexed end + + test "embedding indexing persists a project-local similarity snapshot", %{project: project} do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + + assert {:ok, alpha} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Alpha", + content: "space rocket orbit mission galaxy", + language: "en" + }) + + assert {:ok, beta} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Beta", + content: "rocket launch orbit mission station", + language: "en" + }) + + assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id) + assert {:ok, beta} = BDS.Posts.publish_post(beta.id) + + assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id) + + index_path = BDS.Embeddings.index_path(project.id) + assert File.exists?(index_path) + + snapshot = index_path |> File.read!() |> Jason.decode!() + assert snapshot["project_id"] == project.id + assert snapshot["model_id"] == "fake/multilingual-e5-small" + assert snapshot["dimensions"] == 384 + assert snapshot["entries"][alpha.id]["label"] != nil + assert snapshot["entries"][alpha.id]["content_hash"] != nil + + assert Enum.any?(snapshot["entries"][alpha.id]["neighbors"], fn neighbor -> + neighbor["post_id"] == beta.id + end) + end + + test "embedding index uses the old-app persisted file name", %{project: project} do + assert BDS.Embeddings.index_path(project.id) =~ "/embeddings.usearch" + end + + + test "reindex_all rebuilds stored embeddings for the whole project", %{project: project} do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + + assert {:ok, post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Reindex Target", + content: "space rocket orbit mission galaxy", + language: "en" + }) + + assert {:ok, post} = BDS.Posts.publish_post(post.id) + assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id) + + original_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) + + assert {:ok, post} = + BDS.Posts.update_post(post.id, %{content: "kitchen flour dough oven loaf"}) + + assert {:ok, post} = BDS.Posts.publish_post(post.id) + stale_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) + + assert stale_key.content_hash != original_key.content_hash + + assert {:ok, rebuilt_ids} = BDS.Embeddings.reindex_all(project.id) + assert post.id in rebuilt_ids + + refreshed_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) + assert refreshed_key.content_hash == stale_key.content_hash + assert File.exists?(BDS.Embeddings.index_path(project.id)) + end end diff --git a/test/bds/maintenance_test.exs b/test/bds/maintenance_test.exs index 8724fb2..c3c0d55 100644 --- a/test/bds/maintenance_test.exs +++ b/test/bds/maintenance_test.exs @@ -1,6 +1,8 @@ defmodule BDS.MaintenanceTest do use ExUnit.Case, async: false + import Ecto.Query + alias BDS.Repo setup do @@ -134,6 +136,39 @@ defmodule BDS.MaintenanceTest do BDS.Maintenance.rebuild_from_filesystem(project.id, "unknown") end + test "maintenance rebuilds and diffs embedding state explicitly", %{project: project} do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + + assert {:ok, post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Embedding Drift", + content: "space rocket orbit mission galaxy", + language: "en" + }) + + assert {:ok, post} = BDS.Posts.publish_post(post.id) + assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id) + + index_path = BDS.Embeddings.index_path(project.id) + assert File.exists?(index_path) + + Repo.delete_all(from key in BDS.Embeddings.Key, where: key.project_id == ^project.id) + File.rm!(index_path) + + assert {:ok, %{diff_reports: diff_reports}} = BDS.Maintenance.metadata_diff(project.id) + + assert Enum.any?(diff_reports, fn report -> + report.entity_type == "embedding" and report.entity_id == post.id + end) + + assert {:ok, rebuilt_post_ids} = BDS.Maintenance.rebuild_from_filesystem(project.id, "embedding") + assert post.id in rebuilt_post_ids + assert Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) != nil + assert File.exists?(index_path) + end + test "metadata_diff reports field differences and orphan files across managed entities", %{ project: project, temp_dir: temp_dir diff --git a/test/bds/metadata_test.exs b/test/bds/metadata_test.exs index c86d406..dda639c 100644 --- a/test/bds/metadata_test.exs +++ b/test/bds/metadata_test.exs @@ -116,4 +116,26 @@ defmodule BDS.MetadataTest do "ssh_mode" => "rsync" } end + + test "enabling semantic similarity backfills embeddings for existing published posts", %{ + project: project + } do + assert {:ok, post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Backfill Me", + content: "space rocket orbit mission galaxy", + language: "en" + }) + + assert {:ok, post} = BDS.Posts.publish_post(post.id) + assert BDS.Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) == nil + + assert {:ok, metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + + assert metadata.semantic_similarity_enabled == true + assert BDS.Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) != nil + assert File.exists?(BDS.Embeddings.index_path(project.id)) + end end