feat: finalisation (hopefully) for embedding

This commit is contained in:
2026-04-24 09:48:52 +02:00
parent 36fa08ad1e
commit b2086a79c5
7 changed files with 669 additions and 39 deletions

View File

@@ -4,18 +4,111 @@ defmodule BDS.Embeddings do
import Ecto.Query
alias BDS.Embeddings.DismissedDuplicatePair
alias BDS.Embeddings.Index
alias BDS.Embeddings.Key
alias BDS.Metadata
alias BDS.Posts.Post
alias BDS.Projects
alias BDS.Repo
@duplicate_threshold 0.5
@duplicate_threshold 0.92
@exact_match_score 0.999999
@doc "Returns the model id reported by the configured embedding backend."
def model_id do
  configured_backend().model_info().model_id
end

@doc "Returns the vector dimensions reported by the configured embedding backend."
def dimensions do
  configured_backend().model_info().dimensions
end

@doc "Returns the path of the similarity snapshot file for `project_id`."
def index_path(project_id) do
  Index.path(project_id)
end

@doc "Rebuilds the embeddings of `project_id`; delegates to `rebuild_project/1`."
def reindex_all(project_id) do
  rebuild_project(project_id)
end
@doc """
Reports indexing progress for a project.

Returns `{:ok, %{indexed: indexed, total: total}}`, where `indexed` is the
number of distinct post ids that have an embedding key in the project and
`total` is the number of posts in the project.
"""
def get_indexing_progress(project_id) when is_binary(project_id) do
  indexed_query =
    from key in Key,
      where: key.project_id == ^project_id,
      select: count(key.post_id, :distinct)

  total_query =
    from post in Post,
      where: post.project_id == ^project_id,
      select: count(post.id)

  indexed_count = Repo.one(indexed_query) || 0
  total_count = Repo.one(total_query) || 0

  {:ok, %{indexed: indexed_count, total: total_count}}
end
@doc """
Synchronises the embedding of a post and refreshes the project snapshot.

Accepts either a `%Post{}` or a post id. An id that does not resolve to a post
is ignored and `:ok` is returned.
"""
def sync_post(%Post{} = post), do: sync_post(post, refresh_index: true)

def sync_post(post_id) when is_binary(post_id) do
  case Repo.get(Post, post_id) do
    %Post{} = found_post -> sync_post(found_post)
    nil -> :ok
  end
end
@doc """
Re-synchronises every post of a project and rewrites the similarity snapshot.

Embedding keys whose post id is not among the project's posts are deleted
first. Returns `{:ok, post_ids}` with the ids of all project posts, or
`{:ok, []}` when embeddings are not enabled for the project.
"""
def rebuild_project(project_id) when is_binary(project_id) do
  if enabled_for_project?(project_id) do
    posts_query =
      from post in Post,
        where: post.project_id == ^project_id,
        order_by: [asc: post.created_at, asc: post.slug]

    project_posts = Repo.all(posts_query)
    ids = Enum.map(project_posts, fn post -> post.id end)

    orphaned_keys_query =
      from key in Key,
        where: key.project_id == ^project_id and key.post_id not in ^ids

    Repo.delete_all(orphaned_keys_query)

    # The snapshot is rewritten once at the end instead of after every post.
    Enum.each(project_posts, fn post -> sync_post(post, refresh_index: false) end)
    :ok = rebuild_snapshot(project_id)

    {:ok, ids}
  else
    {:ok, []}
  end
end
@doc """
Lists embedding drift for every post of a project.

For each post the stored key's content hash is compared with the hash of the
post's current embedding source, and the snapshot entry's content hash is
compared with the stored key's hash. Posts without differences are omitted.
Returns `[]` when embeddings are not enabled for the project.
"""
def diff_reports(project_id) when is_binary(project_id) do
  if enabled_for_project?(project_id) do
    # An unreadable or missing snapshot is treated as a snapshot without entries.
    snapshot_entries =
      case Index.read(project_id) do
        {:ok, snapshot} -> Map.get(snapshot, "entries", %{})
        _other -> %{}
      end

    stored_keys = Repo.all(from key in Key, where: key.project_id == ^project_id)
    keys_by_post = Map.new(stored_keys, fn key -> {key.post_id, key} end)

    project_posts = Repo.all(from post in Post, where: post.project_id == ^project_id)

    project_posts
    |> Enum.map(fn post ->
      {post.id, embedding_differences(post, keys_by_post, snapshot_entries)}
    end)
    |> Enum.reject(fn {_post_id, differences} -> differences == [] end)
    |> Enum.map(fn {post_id, differences} ->
      %{entity_type: "embedding", entity_id: post_id, differences: differences}
    end)
  else
    []
  end
end

# Collects the non-nil hash differences for a single post.
defp embedding_differences(post, keys_by_post, snapshot_entries) do
  current_hash = post_content_hash(post)

  stored_key = Map.get(keys_by_post, post.id)
  stored_hash = stored_key && stored_key.content_hash

  snapshot_entry = Map.get(snapshot_entries, post.id)
  snapshot_hash = snapshot_entry && snapshot_entry["content_hash"]

  Enum.reject(
    [
      diff_field("content_hash", stored_hash, current_hash),
      diff_field("snapshot_content_hash", snapshot_hash, stored_hash)
    ],
    &is_nil/1
  )
end
defp sync_post(%Post{} = post, opts) do
if enabled_for_project?(post.project_id) do
body = resolve_post_body(post)
raw_text = compose_embedding_source(post.title, body)
@@ -39,6 +132,10 @@ defmodule BDS.Embeddings do
})
|> Repo.insert_or_update()
if Keyword.get(opts, :refresh_index, true) do
:ok = rebuild_snapshot(post.project_id)
end
:ok
end
else
@@ -46,15 +143,22 @@ defmodule BDS.Embeddings do
end
end
def sync_post(post_id) when is_binary(post_id) do
case Repo.get(Post, post_id) do
nil -> :ok
post -> sync_post(post)
def remove_post(post_id) when is_binary(post_id) do
project_id =
case Repo.get_by(Key, post_id: post_id) do
%Key{project_id: project_id} -> project_id
nil -> case Repo.get(Post, post_id) do
%Post{project_id: project_id} -> project_id
nil -> nil
end
end
def remove_post(post_id) when is_binary(post_id) do
Repo.delete_all(from key in Key, where: key.post_id == ^post_id)
if is_binary(project_id) and enabled_for_project?(project_id) do
:ok = rebuild_snapshot(project_id)
end
:ok
end
@@ -70,10 +174,16 @@ defmodule BDS.Embeddings do
case Repo.get_by(Key, post_id: post.id, project_id: project_id) do
%Key{content_hash: ^content_hash} -> :ok
_other ->
:ok = sync_post(%{post | content: if(post.content in [nil, ""], do: body, else: post.content)})
:ok =
sync_post(
%{post | content: if(post.content in [nil, ""], do: body, else: post.content)},
refresh_index: false
)
end
end)
:ok = rebuild_snapshot(project_id)
indexed = Repo.all(from key in Key, where: key.project_id == ^project_id, select: key.post_id)
{:ok, indexed}
@@ -88,10 +198,14 @@ defmodule BDS.Embeddings do
{:error, :not_found} -> {:ok, []}
{:ok, post, source_vector} ->
similar =
case Index.neighbors(post.project_id, post.id, limit) do
{:ok, neighbors} -> neighbors
{:error, :missing} ->
Repo.all(from key in Key, where: key.project_id == ^post.project_id and key.post_id != ^post.id)
|> Enum.map(fn key -> %{post_id: key.post_id, score: cosine_similarity(source_vector, decode_vector(key.vector))} end)
|> Enum.sort_by(& &1.score, :desc)
|> Enum.take(max(limit, 0))
end
{:ok, similar}
end
@@ -150,9 +264,17 @@ defmodule BDS.Embeddings do
def find_duplicates(project_id) when is_binary(project_id) do
if enabled_for_project?(project_id) do
dismissed = dismissed_pair_keys(project_id)
keys = Repo.all(from key in Key, where: key.project_id == ^project_id, order_by: [asc: key.post_id])
duplicates =
case Index.duplicate_pairs(project_id, @duplicate_threshold) do
{:ok, pairs} ->
pairs
|> Enum.reject(fn pair -> pair_key(pair.post_id_a, pair.post_id_b) in dismissed end)
|> enrich_duplicate_pairs(project_id)
{:error, :missing} ->
keys = Repo.all(from key in Key, where: key.project_id == ^project_id, order_by: [asc: key.post_id])
for left <- keys,
right <- keys,
left.post_id < right.post_id,
@@ -165,7 +287,8 @@ defmodule BDS.Embeddings do
score: similarity
}
end
|> Enum.sort_by(& &1.score, :desc)
|> enrich_duplicate_pairs(project_id)
end
{:ok, duplicates}
else
@@ -204,6 +327,26 @@ defmodule BDS.Embeddings do
end
end
@doc """
Dismisses several duplicate pairs in one call.

Entries that are not two-element tuples of binaries are ignored. Each pair is
put into canonical order with `sort_pair/2` and de-duplicated before it is
passed to `dismiss_duplicate_pair/2`. Processing stops at the first
`{:error, reason}`, which is returned as-is; otherwise `{:ok, saved_pairs}` is
returned in processing order.
"""
def dismiss_duplicate_pairs(pair_ids) when is_list(pair_ids) do
  canonical_pairs =
    for {post_id_a, post_id_b} <- pair_ids,
        is_binary(post_id_a) and is_binary(post_id_b),
        uniq: true do
      sort_pair(post_id_a, post_id_b)
    end

  outcome =
    Enum.reduce_while(canonical_pairs, {:ok, []}, fn {left_id, right_id}, {:ok, saved} ->
      case dismiss_duplicate_pair(left_id, right_id) do
        {:ok, saved_pair} -> {:cont, {:ok, [saved_pair | saved]}}
        {:error, reason} -> {:halt, {:error, reason}}
      end
    end)

  # Results were accumulated by prepending, so restore processing order.
  with {:ok, saved} <- outcome do
    {:ok, Enum.reverse(saved)}
  end
end
defp source_post_and_vector(post_id) do
with {:ok, post} <- fetch_post(post_id) do
if enabled_for_project?(post.project_id) do
@@ -233,6 +376,37 @@ defmodule BDS.Embeddings do
end
end
# Decorates raw duplicate pairs with both post titles, a `:similarity` copy of
# the score and an `:exact_match` flag, then orders them: exact matches first,
# then by descending score, then by post ids.
#
# Pairs that reference a post which is not found among the project's posts are
# dropped instead of raising, because the pairs may come from a persisted
# snapshot that is older than the posts table.
defp enrich_duplicate_pairs([], _project_id), do: []

defp enrich_duplicate_pairs(pairs, project_id) do
  post_ids =
    pairs
    |> Enum.flat_map(&[&1.post_id_a, &1.post_id_b])
    |> Enum.uniq()

  posts_by_id =
    from(post in Post, where: post.project_id == ^project_id and post.id in ^post_ids)
    |> Repo.all()
    |> Map.new(&{&1.id, &1})

  pairs
  |> Enum.flat_map(fn pair ->
    with {:ok, post_a} <- Map.fetch(posts_by_id, pair.post_id_a),
         {:ok, post_b} <- Map.fetch(posts_by_id, pair.post_id_b) do
      [
        Map.merge(pair, %{
          title_a: post_a.title || "",
          title_b: post_b.title || "",
          similarity: pair.score,
          exact_match: exact_duplicate_match?(pair.score, post_a, post_b)
        })
      ]
    else
      # One side of the pair no longer exists; skip the pair.
      :error -> []
    end
  end)
  |> Enum.sort_by(fn pair ->
    {not pair.exact_match, -pair.score, pair.post_id_a, pair.post_id_b}
  end)
end
# A pair is an exact match when its score reaches @exact_match_score, both
# titles are equal (nil is treated as "") and both resolved bodies are equal.
# Bodies are only resolved when the cheaper checks have passed.
defp exact_duplicate_match?(score, %Post{} = post_a, %Post{} = post_b) do
  cond do
    score < @exact_match_score -> false
    (post_a.title || "") != (post_b.title || "") -> false
    true -> resolve_post_body(post_a) == resolve_post_body(post_b)
  end
end
defp enabled_for_project?(project_id) do
case Metadata.get_project_metadata(project_id) do
{:ok, metadata} -> metadata.semantic_similarity_enabled == true
@@ -280,10 +454,30 @@ defmodule BDS.Embeddings do
# Builds the text that gets embedded: title and content separated by a blank
# line, with nil treated as an empty string.
defp compose_embedding_source(title, content) do
  to_string(title || "") <> "\n\n" <> to_string(content || "")
end
# Hash of the embedding source (title plus resolved body) of a post.
defp post_content_hash(%Post{} = post) do
  post.title
  |> compose_embedding_source(resolve_post_body(post))
  |> hash_text()
end
# Embeds `raw_text` with the configured backend; the text is sent with a
# "query: " prefix and the language is passed as an option.
defp embed_text(raw_text, language) do
  backend = configured_backend()
  prefixed_text = "query: " <> raw_text
  backend.embed(prefixed_text, language: language)
end
# Rewrites the project's similarity snapshot using the current model info.
defp rebuild_snapshot(project_id) do
  index_opts = [model_id: model_id(), dimensions: dimensions()]
  Index.rebuild(project_id, index_opts)
end
# Compares two values after replacing nil/false with "". Returns nil when they
# are equal, otherwise a map describing the difference.
defp diff_field(name, db_value, file_value) do
  normalized_db = db_value || ""
  normalized_file = file_value || ""

  if normalized_db != normalized_file do
    %{name: name, db_value: normalized_db, file_value: normalized_file}
  end
end
# Lower-case hexadecimal SHA-256 digest of `text`.
defp hash_text(text) do
  digest = :crypto.hash(:sha256, text)
  Base.encode16(digest, case: :lower)
end
defp decode_vector(nil), do: []

167
lib/bds/embeddings/index.ex Normal file
View File

@@ -0,0 +1,167 @@
defmodule BDS.Embeddings.Index do
  @moduledoc false

  # Persists a per-project similarity snapshot as a JSON file. The snapshot maps
  # every embedded post id to its label, content hash and its highest-scoring
  # neighbours, so similarity and duplicate lookups can be answered from the
  # file instead of recomputing scores from the stored vectors.

  import Ecto.Query

  alias BDS.Embeddings.Key
  alias BDS.Projects
  alias BDS.Repo

  # Maximum number of neighbours stored per entry.
  @neighbor_limit 21

  # Returns the snapshot file path inside the project's data directory.
  # Raises (via `Projects.get_project!/1`) when the project cannot be loaded.
  def path(project_id) when is_binary(project_id) do
    project = Projects.get_project!(project_id)
    Path.join(Projects.project_data_dir(project), "embeddings.usearch")
  end

  # Recomputes the snapshot for `project_id` from the stored embedding keys and
  # writes it to disk. `opts` must contain `:model_id` and `:dimensions`.
  # Returns `:ok`; file-system failures raise a `MatchError`.
  def rebuild(project_id, opts) when is_binary(project_id) and is_list(opts) do
    model_id = Keyword.fetch!(opts, :model_id)
    dimensions = Keyword.fetch!(opts, :dimensions)

    keys =
      Repo.all(
        from key in Key,
          where: key.project_id == ^project_id,
          order_by: [asc: key.post_id]
      )

    # Decode every stored vector exactly once; the pairwise scoring below would
    # otherwise JSON-decode each vector once per other key.
    decoded_keys = Enum.map(keys, fn key -> {key, decode_vector(key.vector)} end)

    entries =
      Map.new(decoded_keys, fn {key, vector} ->
        {key.post_id,
         %{
           "label" => key.label,
           "content_hash" => key.content_hash,
           "neighbors" => neighbor_entries(decoded_keys, key, vector)
         }}
      end)

    payload = %{
      "project_id" => project_id,
      "model_id" => model_id,
      "dimensions" => dimensions,
      "updated_at" => System.system_time(:second),
      "entries" => entries
    }

    write_snapshot(path(project_id), payload)
  end

  # Reads the snapshot of `project_id`, falling back to the legacy file name
  # when the current file does not exist.
  #
  # Returns `{:ok, snapshot_map}`, `{:error, :missing}` when neither file
  # exists, `{:error, :invalid_snapshot}` when the file is not a JSON object,
  # or `{:error, posix_reason}` for other read failures.
  def read(project_id) when is_binary(project_id) do
    case File.read(path(project_id)) do
      {:ok, contents} -> decode_snapshot(contents)
      {:error, :enoent} -> read_legacy_snapshot(project_id)
      {:error, reason} -> {:error, reason}
    end
  end

  # Returns `{:ok, neighbours}` with up to `limit` stored neighbours of
  # `post_id` as `%{post_id: id, score: score}` maps, or `{:error, :missing}`
  # when the snapshot cannot be read or has no entry for the post. No more than
  # @neighbor_limit neighbours are stored per entry, whatever `limit` is.
  def neighbors(project_id, post_id, limit) when is_binary(project_id) and is_binary(post_id) do
    with {:ok, snapshot} <- read(project_id),
         %{} = entry <- get_in(snapshot, ["entries", post_id]) do
      neighbors =
        entry
        |> Map.get("neighbors", [])
        |> Enum.take(max(limit, 0))
        |> Enum.map(fn neighbor ->
          %{post_id: neighbor["post_id"], score: neighbor["score"]}
        end)

      {:ok, neighbors}
    else
      _ -> {:error, :missing}
    end
  end

  # Returns `{:ok, pairs}` with every stored neighbour relation whose score is
  # at least `threshold`, de-duplicated per unordered post pair and sorted by
  # descending score, or `{:error, :missing}` when the snapshot cannot be read.
  def duplicate_pairs(project_id, threshold) when is_binary(project_id) do
    case read(project_id) do
      {:ok, snapshot} ->
        pairs =
          snapshot
          |> Map.get("entries", %{})
          |> Enum.flat_map(fn {post_id, entry} ->
            pairs_above_threshold(post_id, entry, threshold)
          end)
          # Both directions of a pair produce the same canonical key.
          |> Map.new()
          |> Map.values()
          |> Enum.sort_by(& &1.score, :desc)

        {:ok, pairs}

      {:error, _reason} ->
        {:error, :missing}
    end
  end

  # Builds `{canonical_pair_key, pair}` tuples for the neighbours of one entry.
  # Neighbours without a numeric score are skipped: under Erlang term ordering
  # `nil >= threshold` is true, so they would otherwise count as duplicates.
  defp pairs_above_threshold(post_id, entry, threshold) do
    for %{"post_id" => neighbor_id, "score" => score} <- Map.get(entry, "neighbors", []),
        is_number(score) and score >= threshold do
      {post_id_a, post_id_b} = sort_pair(post_id, neighbor_id)
      {{post_id_a, post_id_b}, %{post_id_a: post_id_a, post_id_b: post_id_b, score: score}}
    end
  end

  # Scores `current_vector` against every other decoded key and keeps the
  # @neighbor_limit highest-scoring ones.
  defp neighbor_entries(decoded_keys, current_key, current_vector) do
    decoded_keys
    |> Enum.reject(fn {other_key, _vector} -> other_key.post_id == current_key.post_id end)
    |> Enum.map(fn {other_key, other_vector} ->
      %{
        "post_id" => other_key.post_id,
        "label" => other_key.label,
        "score" => cosine_similarity(current_vector, other_vector)
      }
    end)
    |> Enum.sort_by(& &1["score"], :desc)
    |> Enum.take(@neighbor_limit)
  end

  # Writes the snapshot through a temporary file followed by a rename, then
  # removes a legacy snapshot file if one is present (best effort).
  defp write_snapshot(snapshot_path, payload) do
    :ok = File.mkdir_p(Path.dirname(snapshot_path))
    temp_path = snapshot_path <> ".tmp"
    :ok = File.write(temp_path, Jason.encode!(payload))
    :ok = File.rename(temp_path, snapshot_path)

    legacy_path = legacy_path(snapshot_path)

    if File.exists?(legacy_path) do
      # A failure to remove the legacy file is deliberately ignored.
      File.rm(legacy_path)
    end

    :ok
  end

  defp read_legacy_snapshot(project_id) do
    legacy_snapshot_path = project_id |> path() |> legacy_path()

    case File.read(legacy_snapshot_path) do
      {:ok, contents} -> decode_snapshot(contents)
      {:error, :enoent} -> {:error, :missing}
      {:error, reason} -> {:error, reason}
    end
  end

  # Decodes snapshot file contents without raising, so a corrupt file lets
  # callers fall back instead of crashing.
  defp decode_snapshot(contents) do
    case Jason.decode(contents) do
      {:ok, %{} = snapshot} -> {:ok, snapshot}
      {:ok, _not_an_object} -> {:error, :invalid_snapshot}
      {:error, _decode_error} -> {:error, :invalid_snapshot}
    end
  end

  defp legacy_path(snapshot_path) do
    Path.join(Path.dirname(snapshot_path), "embeddings.index.json")
  end

  defp decode_vector(nil), do: []
  defp decode_vector(vector), do: Jason.decode!(vector)

  # Dot product of the two vectors, clamped so it is never negative. This equals
  # the cosine similarity only when both vectors have unit length.
  # NOTE(review): no normalisation happens here — confirm the backend returns
  # unit-length vectors. Vectors of different length are truncated by the zip.
  defp cosine_similarity([], _other), do: 0.0
  defp cosine_similarity(_vector, []), do: 0.0

  defp cosine_similarity(left, right) do
    Enum.zip(left, right)
    |> Enum.reduce(0.0, fn {left_value, right_value}, acc -> acc + left_value * right_value end)
    |> max(0.0)
  end

  defp sort_pair(post_id_a, post_id_b) when post_id_a <= post_id_b, do: {post_id_a, post_id_b}
  defp sort_pair(post_id_a, post_id_b), do: {post_id_b, post_id_a}
end

View File

@@ -6,6 +6,7 @@ defmodule BDS.Maintenance do
alias BDS.Frontmatter
alias BDS.Media.Media
alias BDS.Media.Translation, as: MediaTranslation
alias BDS.Embeddings
alias BDS.Posts.Post
alias BDS.Posts.Translation, as: PostTranslation
alias BDS.Projects
@@ -20,6 +21,7 @@ defmodule BDS.Maintenance do
:media -> BDS.Media.rebuild_media_from_files(project_id)
:script -> BDS.Scripts.rebuild_scripts_from_files(project_id)
:template -> BDS.Templates.rebuild_templates_from_files(project_id)
:embedding -> Embeddings.rebuild_project(project_id)
:unsupported -> {:error, :unsupported_entity_type}
end
end
@@ -33,7 +35,8 @@ defmodule BDS.Maintenance do
media_diff_reports(project_id, project) ++
media_translation_diff_reports(project_id, project) ++
script_diff_reports(project_id, project) ++
template_diff_reports(project_id, project)
template_diff_reports(project_id, project) ++
Embeddings.diff_reports(project_id)
orphan_reports = orphan_reports(project_id, project)
@@ -44,10 +47,13 @@ defmodule BDS.Maintenance do
defp normalize_entity_type(:media), do: :media
defp normalize_entity_type(:script), do: :script
defp normalize_entity_type(:template), do: :template
defp normalize_entity_type(:embedding), do: :embedding
defp normalize_entity_type("post"), do: :post
defp normalize_entity_type("media"), do: :media
defp normalize_entity_type("script"), do: :script
defp normalize_entity_type("template"), do: :template
defp normalize_entity_type("embedding"), do: :embedding
defp normalize_entity_type("embeddings"), do: :embedding
defp normalize_entity_type(_entity_type), do: :unsupported
defp post_diff_reports(project_id, project) do

View File

@@ -1,6 +1,7 @@
defmodule BDS.Metadata do
@moduledoc false
alias BDS.Embeddings
alias BDS.Projects
alias BDS.Projects.Project
alias BDS.Repo
@@ -50,6 +51,7 @@ defmodule BDS.Metadata do
load_state(updated_project)
end)
|> unwrap_transaction()
|> maybe_backfill_embeddings(project_id, state, project_metadata)
end
def add_category(project_id, name) do
@@ -311,6 +313,17 @@ defmodule BDS.Metadata do
defp unwrap_transaction({:ok, result}), do: {:ok, result}
defp unwrap_transaction({:error, reason}), do: {:error, reason}
# After a successful update, indexes the project's unindexed posts when
# semantic similarity has just been switched on (it was not `true` before and
# is `true` now). Any other result is passed through untouched.
defp maybe_backfill_embeddings({:ok, _metadata} = result, project_id, previous_state, project_metadata) do
  cond do
    previous_state.semantic_similarity_enabled == true ->
      :noop

    project_metadata.semantic_similarity_enabled == true ->
      {:ok, _indexed_post_ids} = Embeddings.index_unindexed(project_id)

    true ->
      :noop
  end

  result
end

defp maybe_backfill_embeddings(result, _project_id, _previous_state, _project_metadata) do
  result
end
defp attr(attrs, key) do
cond do
Map.has_key?(attrs, key) -> Map.get(attrs, key)

View File

@@ -39,7 +39,7 @@ defmodule BDS.EmbeddingsTest do
%{project: project}
end
test "embeddings index published posts when semantic similarity is enabled and support similarity, duplicates, dismissals, and tag suggestions",
test "embeddings index published posts when semantic similarity is enabled and support similarity, dismissals, and tag suggestions",
%{project: project} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
@@ -89,20 +89,9 @@ defmodule BDS.EmbeddingsTest do
assert {:ok, suggestions} = BDS.Embeddings.suggest_tags(alpha.id, "rocket orbit mission")
assert "space" in suggestions
assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id)
assert Enum.any?(duplicates, fn pair ->
MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id])
end)
assert {:ok, dismissal} = BDS.Embeddings.dismiss_duplicate_pair(alpha.id, beta.id)
assert dismissal.project_id == project.id
assert {:ok, filtered_duplicates} = BDS.Embeddings.find_duplicates(project.id)
refute Enum.any?(filtered_duplicates, fn pair ->
MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id])
end)
assert {:ok, alpha} = BDS.Posts.update_post(alpha.id, %{content: "kitchen flour dough loaf"})
assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id)
@@ -115,6 +104,97 @@ defmodule BDS.EmbeddingsTest do
refute Map.has_key?(after_delete, gamma.id)
end
# Creates two posts with identical title and content plus a third post that
# only shares part of the vocabulary, then checks that find_duplicates/1
# returns exactly one pair: the identical posts, flagged as an exact match.
test "duplicate detection keeps exact matches and excludes lower-similarity pairs below the spec threshold",
%{project: project} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
assert {:ok, exact_a} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Exact Match",
content: "space rocket launch orbit mission galaxy",
language: "en"
})
assert {:ok, exact_b} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Exact Match",
content: "space rocket launch orbit mission galaxy",
language: "en"
})
assert {:ok, fuzzy} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Fuzzy",
content: "space rocket launch mission orbit space station",
language: "en"
})
assert {:ok, exact_a} = BDS.Posts.publish_post(exact_a.id)
assert {:ok, exact_b} = BDS.Posts.publish_post(exact_b.id)
assert {:ok, fuzzy} = BDS.Posts.publish_post(fuzzy.id)
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id)
# The single-element match asserts that no other pair was reported.
assert [%{post_id_a: post_id_a, post_id_b: post_id_b, score: score}] = duplicates
assert MapSet.new([post_id_a, post_id_b]) == MapSet.new([exact_a.id, exact_b.id])
assert score >= 0.99
# Enriched fields added on top of the raw pair.
assert hd(duplicates).similarity == score
assert hd(duplicates).exact_match == true
assert MapSet.new([hd(duplicates).title_a, hd(duplicates).title_b]) ==
MapSet.new(["Exact Match", "Exact Match"])
# The partially overlapping post must not be paired with an exact post.
refute Enum.any?(duplicates, fn pair ->
MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([exact_a.id, fuzzy.id])
end)
end
# Dismisses the same pair twice, once in each order, and checks that a single
# dismissal is stored and that the pair no longer shows up as a duplicate.
test "batch duplicate dismissal stores canonical pairs and excludes them from future searches",
%{project: project} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
assert {:ok, exact_a} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Exact Match",
content: "space rocket launch orbit mission galaxy",
language: "en"
})
assert {:ok, exact_b} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Exact Match",
content: "space rocket launch orbit mission galaxy",
language: "en"
})
assert {:ok, exact_a} = BDS.Posts.publish_post(exact_a.id)
assert {:ok, exact_b} = BDS.Posts.publish_post(exact_b.id)
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id)
assert length(duplicates) == 1
# Both orderings of the same pair are submitted on purpose.
assert {:ok, dismissed_pairs} =
BDS.Embeddings.dismiss_duplicate_pairs([
{exact_b.id, exact_a.id},
{exact_a.id, exact_b.id}
])
assert length(dismissed_pairs) == 1
assert hd(dismissed_pairs).project_id == project.id
assert {:ok, filtered_duplicates} = BDS.Embeddings.find_duplicates(project.id)
assert filtered_duplicates == []
end
test "embedding queries are gated off when semantic similarity is disabled", %{project: project} do
assert {:ok, post} =
BDS.Posts.create_post(%{
@@ -130,6 +210,41 @@ defmodule BDS.EmbeddingsTest do
assert {:ok, %{}} = BDS.Embeddings.compute_similarities(post.id, [post.id])
end
# No posts are created here, so both counters are expected to be zero.
test "get_indexing_progress returns zero indexed and total when a project has no posts", %{
project: project
} do
assert {:ok, %{indexed: 0, total: 0}} = BDS.Embeddings.get_indexing_progress(project.id)
end
# Creates two posts with semantic similarity enabled and expects both to be
# counted as indexed and as total.
# NOTE(review): only `indexed_post` is published, yet `indexed: 2` is expected;
# presumably posts are embedded on creation while the feature is enabled, which
# makes the `unindexed_post` name misleading — confirm against BDS.Posts.
test "get_indexing_progress returns indexed embeddings and total posts for the project", %{
project: project
} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
assert {:ok, indexed_post} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Indexed",
content: "space rocket orbit mission galaxy",
language: "en"
})
assert {:ok, unindexed_post} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Unindexed",
content: "flour yeast dough oven loaf kitchen",
language: "en"
})
assert {:ok, indexed_post} = BDS.Posts.publish_post(indexed_post.id)
assert {:ok, %{indexed: 2, total: 2}} = BDS.Embeddings.get_indexing_progress(project.id)
# Sanity check that two distinct posts were created.
assert unindexed_post.id != indexed_post.id
end
test "embeddings use the configured in-app backend module", %{project: project} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
@@ -150,4 +265,82 @@ defmodule BDS.EmbeddingsTest do
assert {:ok, indexed} = BDS.Embeddings.index_unindexed(project.id)
assert post.id in indexed
end
# Indexes two posts and inspects the snapshot file written to index_path/1:
# project id, model info, and an entry for `alpha` that lists `beta` as a
# neighbour.
test "embedding indexing persists a project-local similarity snapshot", %{project: project} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
assert {:ok, alpha} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Alpha",
content: "space rocket orbit mission galaxy",
language: "en"
})
assert {:ok, beta} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Beta",
content: "rocket launch orbit mission station",
language: "en"
})
assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id)
assert {:ok, beta} = BDS.Posts.publish_post(beta.id)
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
index_path = BDS.Embeddings.index_path(project.id)
assert File.exists?(index_path)
# The snapshot is stored as JSON despite the file extension.
snapshot = index_path |> File.read!() |> Jason.decode!()
assert snapshot["project_id"] == project.id
# Model id and dimensions asserted here are those of the backend configured for tests.
assert snapshot["model_id"] == "fake/multilingual-e5-small"
assert snapshot["dimensions"] == 384
assert snapshot["entries"][alpha.id]["label"] != nil
assert snapshot["entries"][alpha.id]["content_hash"] != nil
assert Enum.any?(snapshot["entries"][alpha.id]["neighbors"], fn neighbor ->
neighbor["post_id"] == beta.id
end)
end
# Pins the snapshot file name returned by index_path/1.
test "embedding index uses the old-app persisted file name", %{project: project} do
assert BDS.Embeddings.index_path(project.id) =~ "/embeddings.usearch"
end
# Indexes a post, changes its content, then runs reindex_all/1 and checks that
# the post id is returned, the stored key hash matches the post-update hash and
# the snapshot file exists.
test "reindex_all rebuilds stored embeddings for the whole project", %{project: project} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
assert {:ok, post} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Reindex Target",
content: "space rocket orbit mission galaxy",
language: "en"
})
assert {:ok, post} = BDS.Posts.publish_post(post.id)
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
original_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id)
assert {:ok, post} =
BDS.Posts.update_post(post.id, %{content: "kitchen flour dough oven loaf"})
assert {:ok, post} = BDS.Posts.publish_post(post.id)
# NOTE(review): despite its name, `stale_key` is read after the update and is
# asserted to already differ from the original hash.
stale_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id)
assert stale_key.content_hash != original_key.content_hash
assert {:ok, rebuilt_ids} = BDS.Embeddings.reindex_all(project.id)
assert post.id in rebuilt_ids
refreshed_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id)
assert refreshed_key.content_hash == stale_key.content_hash
assert File.exists?(BDS.Embeddings.index_path(project.id))
end
end

View File

@@ -1,6 +1,8 @@
defmodule BDS.MaintenanceTest do
use ExUnit.Case, async: false
import Ecto.Query
alias BDS.Repo
setup do
@@ -134,6 +136,39 @@ defmodule BDS.MaintenanceTest do
BDS.Maintenance.rebuild_from_filesystem(project.id, "unknown")
end
# Indexes a post, then deletes both its embedding keys and the snapshot file to
# simulate drift. metadata_diff/1 must report the post under entity type
# "embedding", and rebuilding the "embedding" entity type must restore both the
# key row and the snapshot file.
test "maintenance rebuilds and diffs embedding state explicitly", %{project: project} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
assert {:ok, post} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Embedding Drift",
content: "space rocket orbit mission galaxy",
language: "en"
})
assert {:ok, post} = BDS.Posts.publish_post(post.id)
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
index_path = BDS.Embeddings.index_path(project.id)
assert File.exists?(index_path)
# Remove the stored state behind the application's back.
Repo.delete_all(from key in BDS.Embeddings.Key, where: key.project_id == ^project.id)
File.rm!(index_path)
assert {:ok, %{diff_reports: diff_reports}} = BDS.Maintenance.metadata_diff(project.id)
assert Enum.any?(diff_reports, fn report ->
report.entity_type == "embedding" and report.entity_id == post.id
end)
assert {:ok, rebuilt_post_ids} = BDS.Maintenance.rebuild_from_filesystem(project.id, "embedding")
assert post.id in rebuilt_post_ids
assert Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) != nil
assert File.exists?(index_path)
end
test "metadata_diff reports field differences and orphan files across managed entities", %{
project: project,
temp_dir: temp_dir

View File

@@ -116,4 +116,26 @@ defmodule BDS.MetadataTest do
"ssh_mode" => "rsync"
}
end
# Publishes a post while semantic similarity is still disabled (no embedding
# key exists), then enables the feature and expects the key row and the
# snapshot file to be created by the metadata update itself.
test "enabling semantic similarity backfills embeddings for existing published posts", %{
project: project
} do
assert {:ok, post} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Backfill Me",
content: "space rocket orbit mission galaxy",
language: "en"
})
assert {:ok, post} = BDS.Posts.publish_post(post.id)
# Nothing is indexed before the feature is switched on.
assert BDS.Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) == nil
assert {:ok, metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
assert metadata.semantic_similarity_enabled == true
assert BDS.Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) != nil
assert File.exists?(BDS.Embeddings.index_path(project.id))
end
end