feat: finalisation (hopefully) for embedding
This commit is contained in:
@@ -4,18 +4,111 @@ defmodule BDS.Embeddings do
|
||||
import Ecto.Query
|
||||
|
||||
alias BDS.Embeddings.DismissedDuplicatePair
|
||||
alias BDS.Embeddings.Index
|
||||
alias BDS.Embeddings.Key
|
||||
alias BDS.Metadata
|
||||
alias BDS.Posts.Post
|
||||
alias BDS.Projects
|
||||
alias BDS.Repo
|
||||
|
||||
@duplicate_threshold 0.5
|
||||
@duplicate_threshold 0.92
|
||||
@exact_match_score 0.999999
|
||||
|
||||
def model_id, do: configured_backend().model_info().model_id
|
||||
def dimensions, do: configured_backend().model_info().dimensions
|
||||
def index_path(project_id), do: Index.path(project_id)
|
||||
def reindex_all(project_id), do: rebuild_project(project_id)
|
||||
|
||||
def get_indexing_progress(project_id) when is_binary(project_id) do
|
||||
indexed =
|
||||
Repo.one(
|
||||
from key in Key,
|
||||
where: key.project_id == ^project_id,
|
||||
select: count(key.post_id, :distinct)
|
||||
) || 0
|
||||
|
||||
total =
|
||||
Repo.one(
|
||||
from post in Post,
|
||||
where: post.project_id == ^project_id,
|
||||
select: count(post.id)
|
||||
) || 0
|
||||
|
||||
{:ok, %{indexed: indexed, total: total}}
|
||||
end
|
||||
|
||||
def sync_post(%Post{} = post) do
|
||||
sync_post(post, refresh_index: true)
|
||||
end
|
||||
|
||||
def sync_post(post_id) when is_binary(post_id) do
|
||||
case Repo.get(Post, post_id) do
|
||||
nil -> :ok
|
||||
post -> sync_post(post)
|
||||
end
|
||||
end
|
||||
|
||||
def rebuild_project(project_id) when is_binary(project_id) do
|
||||
if enabled_for_project?(project_id) do
|
||||
posts =
|
||||
Repo.all(from post in Post, where: post.project_id == ^project_id, order_by: [asc: post.created_at, asc: post.slug])
|
||||
|
||||
post_ids = Enum.map(posts, & &1.id)
|
||||
|
||||
Repo.delete_all(
|
||||
from key in Key,
|
||||
where: key.project_id == ^project_id and key.post_id not in ^post_ids
|
||||
)
|
||||
|
||||
Enum.each(posts, &sync_post(&1, refresh_index: false))
|
||||
:ok = rebuild_snapshot(project_id)
|
||||
{:ok, post_ids}
|
||||
else
|
||||
{:ok, []}
|
||||
end
|
||||
end
|
||||
|
||||
def diff_reports(project_id) when is_binary(project_id) do
|
||||
if enabled_for_project?(project_id) do
|
||||
snapshot_entries =
|
||||
case Index.read(project_id) do
|
||||
{:ok, snapshot} -> Map.get(snapshot, "entries", %{})
|
||||
_other -> %{}
|
||||
end
|
||||
|
||||
keys_by_post =
|
||||
Repo.all(from key in Key, where: key.project_id == ^project_id)
|
||||
|> Map.new(&{&1.post_id, &1})
|
||||
|
||||
Repo.all(from post in Post, where: post.project_id == ^project_id)
|
||||
|> Enum.flat_map(fn post ->
|
||||
expected_hash = post_content_hash(post)
|
||||
key = Map.get(keys_by_post, post.id)
|
||||
snapshot_entry = Map.get(snapshot_entries, post.id)
|
||||
|
||||
differences =
|
||||
[
|
||||
diff_field("content_hash", key && key.content_hash, expected_hash),
|
||||
diff_field(
|
||||
"snapshot_content_hash",
|
||||
snapshot_entry && snapshot_entry["content_hash"],
|
||||
key && key.content_hash
|
||||
)
|
||||
]
|
||||
|> Enum.reject(&is_nil/1)
|
||||
|
||||
if differences == [] do
|
||||
[]
|
||||
else
|
||||
[%{entity_type: "embedding", entity_id: post.id, differences: differences}]
|
||||
end
|
||||
end)
|
||||
else
|
||||
[]
|
||||
end
|
||||
end
|
||||
|
||||
defp sync_post(%Post{} = post, opts) do
|
||||
if enabled_for_project?(post.project_id) do
|
||||
body = resolve_post_body(post)
|
||||
raw_text = compose_embedding_source(post.title, body)
|
||||
@@ -39,6 +132,10 @@ defmodule BDS.Embeddings do
|
||||
})
|
||||
|> Repo.insert_or_update()
|
||||
|
||||
if Keyword.get(opts, :refresh_index, true) do
|
||||
:ok = rebuild_snapshot(post.project_id)
|
||||
end
|
||||
|
||||
:ok
|
||||
end
|
||||
else
|
||||
@@ -46,15 +143,22 @@ defmodule BDS.Embeddings do
|
||||
end
|
||||
end
|
||||
|
||||
def sync_post(post_id) when is_binary(post_id) do
|
||||
case Repo.get(Post, post_id) do
|
||||
nil -> :ok
|
||||
post -> sync_post(post)
|
||||
def remove_post(post_id) when is_binary(post_id) do
|
||||
project_id =
|
||||
case Repo.get_by(Key, post_id: post_id) do
|
||||
%Key{project_id: project_id} -> project_id
|
||||
nil -> case Repo.get(Post, post_id) do
|
||||
%Post{project_id: project_id} -> project_id
|
||||
nil -> nil
|
||||
end
|
||||
end
|
||||
|
||||
def remove_post(post_id) when is_binary(post_id) do
|
||||
Repo.delete_all(from key in Key, where: key.post_id == ^post_id)
|
||||
|
||||
if is_binary(project_id) and enabled_for_project?(project_id) do
|
||||
:ok = rebuild_snapshot(project_id)
|
||||
end
|
||||
|
||||
:ok
|
||||
end
|
||||
|
||||
@@ -70,10 +174,16 @@ defmodule BDS.Embeddings do
|
||||
case Repo.get_by(Key, post_id: post.id, project_id: project_id) do
|
||||
%Key{content_hash: ^content_hash} -> :ok
|
||||
_other ->
|
||||
:ok = sync_post(%{post | content: if(post.content in [nil, ""], do: body, else: post.content)})
|
||||
:ok =
|
||||
sync_post(
|
||||
%{post | content: if(post.content in [nil, ""], do: body, else: post.content)},
|
||||
refresh_index: false
|
||||
)
|
||||
end
|
||||
end)
|
||||
|
||||
:ok = rebuild_snapshot(project_id)
|
||||
|
||||
indexed = Repo.all(from key in Key, where: key.project_id == ^project_id, select: key.post_id)
|
||||
|
||||
{:ok, indexed}
|
||||
@@ -88,10 +198,14 @@ defmodule BDS.Embeddings do
|
||||
{:error, :not_found} -> {:ok, []}
|
||||
{:ok, post, source_vector} ->
|
||||
similar =
|
||||
case Index.neighbors(post.project_id, post.id, limit) do
|
||||
{:ok, neighbors} -> neighbors
|
||||
{:error, :missing} ->
|
||||
Repo.all(from key in Key, where: key.project_id == ^post.project_id and key.post_id != ^post.id)
|
||||
|> Enum.map(fn key -> %{post_id: key.post_id, score: cosine_similarity(source_vector, decode_vector(key.vector))} end)
|
||||
|> Enum.sort_by(& &1.score, :desc)
|
||||
|> Enum.take(max(limit, 0))
|
||||
end
|
||||
|
||||
{:ok, similar}
|
||||
end
|
||||
@@ -150,9 +264,17 @@ defmodule BDS.Embeddings do
|
||||
def find_duplicates(project_id) when is_binary(project_id) do
|
||||
if enabled_for_project?(project_id) do
|
||||
dismissed = dismissed_pair_keys(project_id)
|
||||
keys = Repo.all(from key in Key, where: key.project_id == ^project_id, order_by: [asc: key.post_id])
|
||||
|
||||
duplicates =
|
||||
case Index.duplicate_pairs(project_id, @duplicate_threshold) do
|
||||
{:ok, pairs} ->
|
||||
pairs
|
||||
|> Enum.reject(fn pair -> pair_key(pair.post_id_a, pair.post_id_b) in dismissed end)
|
||||
|> enrich_duplicate_pairs(project_id)
|
||||
|
||||
{:error, :missing} ->
|
||||
keys = Repo.all(from key in Key, where: key.project_id == ^project_id, order_by: [asc: key.post_id])
|
||||
|
||||
for left <- keys,
|
||||
right <- keys,
|
||||
left.post_id < right.post_id,
|
||||
@@ -165,7 +287,8 @@ defmodule BDS.Embeddings do
|
||||
score: similarity
|
||||
}
|
||||
end
|
||||
|> Enum.sort_by(& &1.score, :desc)
|
||||
|> enrich_duplicate_pairs(project_id)
|
||||
end
|
||||
|
||||
{:ok, duplicates}
|
||||
else
|
||||
@@ -204,6 +327,26 @@ defmodule BDS.Embeddings do
|
||||
end
|
||||
end
|
||||
|
||||
def dismiss_duplicate_pairs(pair_ids) when is_list(pair_ids) do
|
||||
pair_ids
|
||||
|> Enum.filter(fn
|
||||
{post_id_a, post_id_b} when is_binary(post_id_a) and is_binary(post_id_b) -> true
|
||||
_other -> false
|
||||
end)
|
||||
|> Enum.map(fn {post_id_a, post_id_b} -> sort_pair(post_id_a, post_id_b) end)
|
||||
|> Enum.uniq()
|
||||
|> Enum.reduce_while({:ok, []}, fn {post_id_a, post_id_b}, {:ok, acc} ->
|
||||
case dismiss_duplicate_pair(post_id_a, post_id_b) do
|
||||
{:ok, saved_pair} -> {:cont, {:ok, [saved_pair | acc]}}
|
||||
{:error, reason} -> {:halt, {:error, reason}}
|
||||
end
|
||||
end)
|
||||
|> case do
|
||||
{:ok, saved_pairs} -> {:ok, Enum.reverse(saved_pairs)}
|
||||
{:error, reason} -> {:error, reason}
|
||||
end
|
||||
end
|
||||
|
||||
defp source_post_and_vector(post_id) do
|
||||
with {:ok, post} <- fetch_post(post_id) do
|
||||
if enabled_for_project?(post.project_id) do
|
||||
@@ -233,6 +376,37 @@ defmodule BDS.Embeddings do
|
||||
end
|
||||
end
|
||||
|
||||
defp enrich_duplicate_pairs(pairs, project_id) do
|
||||
posts_by_id =
|
||||
pairs
|
||||
|> Enum.flat_map(&[&1.post_id_a, &1.post_id_b])
|
||||
|> Enum.uniq()
|
||||
|> then(fn post_ids ->
|
||||
Repo.all(from post in Post, where: post.project_id == ^project_id and post.id in ^post_ids)
|
||||
|> Map.new(&{&1.id, &1})
|
||||
end)
|
||||
|
||||
pairs
|
||||
|> Enum.map(fn pair ->
|
||||
post_a = Map.fetch!(posts_by_id, pair.post_id_a)
|
||||
post_b = Map.fetch!(posts_by_id, pair.post_id_b)
|
||||
exact_match = exact_duplicate_match?(pair.score, post_a, post_b)
|
||||
|
||||
pair
|
||||
|> Map.put(:title_a, post_a.title || "")
|
||||
|> Map.put(:title_b, post_b.title || "")
|
||||
|> Map.put(:similarity, pair.score)
|
||||
|> Map.put(:exact_match, exact_match)
|
||||
end)
|
||||
|> Enum.sort_by(fn pair -> {not pair.exact_match, -pair.score, pair.post_id_a, pair.post_id_b} end)
|
||||
end
|
||||
|
||||
defp exact_duplicate_match?(score, %Post{} = post_a, %Post{} = post_b) do
|
||||
score >= @exact_match_score and
|
||||
(post_a.title || "") == (post_b.title || "") and
|
||||
resolve_post_body(post_a) == resolve_post_body(post_b)
|
||||
end
|
||||
|
||||
defp enabled_for_project?(project_id) do
|
||||
case Metadata.get_project_metadata(project_id) do
|
||||
{:ok, metadata} -> metadata.semantic_similarity_enabled == true
|
||||
@@ -280,10 +454,30 @@ defmodule BDS.Embeddings do
|
||||
|
||||
defp compose_embedding_source(title, content), do: "#{title || ""}\n\n#{content || ""}"
|
||||
|
||||
defp post_content_hash(%Post{} = post) do
|
||||
body = resolve_post_body(post)
|
||||
hash_text(compose_embedding_source(post.title, body))
|
||||
end
|
||||
|
||||
defp embed_text(raw_text, language) do
|
||||
configured_backend().embed("query: " <> raw_text, language: language)
|
||||
end
|
||||
|
||||
defp rebuild_snapshot(project_id) do
|
||||
Index.rebuild(project_id, model_id: model_id(), dimensions: dimensions())
|
||||
end
|
||||
|
||||
defp diff_field(name, db_value, file_value) do
|
||||
db_value = if(is_binary(db_value), do: db_value, else: db_value || "")
|
||||
file_value = if(is_binary(file_value), do: file_value, else: file_value || "")
|
||||
|
||||
if db_value == file_value do
|
||||
nil
|
||||
else
|
||||
%{name: name, db_value: db_value, file_value: file_value}
|
||||
end
|
||||
end
|
||||
|
||||
defp hash_text(text), do: :crypto.hash(:sha256, text) |> Base.encode16(case: :lower)
|
||||
|
||||
defp decode_vector(nil), do: []
|
||||
|
||||
167
lib/bds/embeddings/index.ex
Normal file
167
lib/bds/embeddings/index.ex
Normal file
@@ -0,0 +1,167 @@
|
||||
defmodule BDS.Embeddings.Index do
|
||||
@moduledoc false
|
||||
|
||||
import Ecto.Query
|
||||
|
||||
alias BDS.Embeddings.Key
|
||||
alias BDS.Projects
|
||||
alias BDS.Repo
|
||||
|
||||
@neighbor_limit 21
|
||||
|
||||
def path(project_id) when is_binary(project_id) do
|
||||
project = Projects.get_project!(project_id)
|
||||
Path.join(Projects.project_data_dir(project), "embeddings.usearch")
|
||||
end
|
||||
|
||||
def rebuild(project_id, opts) when is_binary(project_id) and is_list(opts) do
|
||||
model_id = Keyword.fetch!(opts, :model_id)
|
||||
dimensions = Keyword.fetch!(opts, :dimensions)
|
||||
|
||||
keys =
|
||||
Repo.all(
|
||||
from key in Key,
|
||||
where: key.project_id == ^project_id,
|
||||
order_by: [asc: key.post_id]
|
||||
)
|
||||
|
||||
entries =
|
||||
keys
|
||||
|> Enum.map(fn key ->
|
||||
vector = decode_vector(key.vector)
|
||||
|
||||
{key.post_id,
|
||||
%{
|
||||
"label" => key.label,
|
||||
"content_hash" => key.content_hash,
|
||||
"neighbors" => neighbor_entries(keys, key, vector)
|
||||
}}
|
||||
end)
|
||||
|> Map.new()
|
||||
|
||||
payload = %{
|
||||
"project_id" => project_id,
|
||||
"model_id" => model_id,
|
||||
"dimensions" => dimensions,
|
||||
"updated_at" => System.system_time(:second),
|
||||
"entries" => entries
|
||||
}
|
||||
|
||||
write_snapshot(path(project_id), payload)
|
||||
end
|
||||
|
||||
def read(project_id) when is_binary(project_id) do
|
||||
snapshot_path = path(project_id)
|
||||
|
||||
case File.read(snapshot_path) do
|
||||
{:ok, contents} -> {:ok, Jason.decode!(contents)}
|
||||
{:error, :enoent} -> read_legacy_snapshot(project_id)
|
||||
{:error, reason} -> {:error, reason}
|
||||
end
|
||||
end
|
||||
|
||||
def neighbors(project_id, post_id, limit) when is_binary(project_id) and is_binary(post_id) do
|
||||
with {:ok, snapshot} <- read(project_id),
|
||||
%{} = entry <- get_in(snapshot, ["entries", post_id]) do
|
||||
entry
|
||||
|> Map.get("neighbors", [])
|
||||
|> Enum.take(max(limit, 0))
|
||||
|> Enum.map(fn neighbor ->
|
||||
%{
|
||||
post_id: neighbor["post_id"],
|
||||
score: neighbor["score"]
|
||||
}
|
||||
end)
|
||||
|> then(&{:ok, &1})
|
||||
else
|
||||
_ -> {:error, :missing}
|
||||
end
|
||||
end
|
||||
|
||||
def duplicate_pairs(project_id, threshold) when is_binary(project_id) do
|
||||
with {:ok, snapshot} <- read(project_id) do
|
||||
pairs =
|
||||
snapshot
|
||||
|> Map.get("entries", %{})
|
||||
|> Enum.flat_map(fn {post_id, entry} ->
|
||||
entry
|
||||
|> Map.get("neighbors", [])
|
||||
|> Enum.filter(&(&1["score"] >= threshold))
|
||||
|> Enum.map(fn neighbor ->
|
||||
{post_id_a, post_id_b} = sort_pair(post_id, neighbor["post_id"])
|
||||
|
||||
{{post_id_a, post_id_b},
|
||||
%{
|
||||
post_id_a: post_id_a,
|
||||
post_id_b: post_id_b,
|
||||
score: neighbor["score"]
|
||||
}}
|
||||
end)
|
||||
end)
|
||||
|> Map.new()
|
||||
|> Map.values()
|
||||
|> Enum.sort_by(& &1.score, :desc)
|
||||
|
||||
{:ok, pairs}
|
||||
else
|
||||
_ -> {:error, :missing}
|
||||
end
|
||||
end
|
||||
|
||||
defp neighbor_entries(keys, current_key, current_vector) do
|
||||
keys
|
||||
|> Enum.reject(&(&1.post_id == current_key.post_id))
|
||||
|> Enum.map(fn other_key ->
|
||||
%{
|
||||
"post_id" => other_key.post_id,
|
||||
"label" => other_key.label,
|
||||
"score" => cosine_similarity(current_vector, decode_vector(other_key.vector))
|
||||
}
|
||||
end)
|
||||
|> Enum.sort_by(& &1["score"], :desc)
|
||||
|> Enum.take(@neighbor_limit)
|
||||
end
|
||||
|
||||
defp write_snapshot(snapshot_path, payload) do
|
||||
:ok = File.mkdir_p(Path.dirname(snapshot_path))
|
||||
temp_path = snapshot_path <> ".tmp"
|
||||
:ok = File.write(temp_path, Jason.encode!(payload))
|
||||
:ok = File.rename(temp_path, snapshot_path)
|
||||
legacy_path = legacy_path(snapshot_path)
|
||||
|
||||
if File.exists?(legacy_path) do
|
||||
File.rm(legacy_path)
|
||||
end
|
||||
|
||||
:ok
|
||||
end
|
||||
|
||||
defp read_legacy_snapshot(project_id) do
|
||||
legacy_snapshot_path = project_id |> path() |> legacy_path()
|
||||
|
||||
case File.read(legacy_snapshot_path) do
|
||||
{:ok, contents} -> {:ok, Jason.decode!(contents)}
|
||||
{:error, :enoent} -> {:error, :missing}
|
||||
{:error, reason} -> {:error, reason}
|
||||
end
|
||||
end
|
||||
|
||||
defp legacy_path(snapshot_path) do
|
||||
Path.join(Path.dirname(snapshot_path), "embeddings.index.json")
|
||||
end
|
||||
|
||||
defp decode_vector(nil), do: []
|
||||
defp decode_vector(vector), do: Jason.decode!(vector)
|
||||
|
||||
defp cosine_similarity([], _other), do: 0.0
|
||||
defp cosine_similarity(_vector, []), do: 0.0
|
||||
|
||||
defp cosine_similarity(left, right) do
|
||||
Enum.zip(left, right)
|
||||
|> Enum.reduce(0.0, fn {left_value, right_value}, acc -> acc + left_value * right_value end)
|
||||
|> max(0.0)
|
||||
end
|
||||
|
||||
defp sort_pair(post_id_a, post_id_b) when post_id_a <= post_id_b, do: {post_id_a, post_id_b}
|
||||
defp sort_pair(post_id_a, post_id_b), do: {post_id_b, post_id_a}
|
||||
end
|
||||
@@ -6,6 +6,7 @@ defmodule BDS.Maintenance do
|
||||
alias BDS.Frontmatter
|
||||
alias BDS.Media.Media
|
||||
alias BDS.Media.Translation, as: MediaTranslation
|
||||
alias BDS.Embeddings
|
||||
alias BDS.Posts.Post
|
||||
alias BDS.Posts.Translation, as: PostTranslation
|
||||
alias BDS.Projects
|
||||
@@ -20,6 +21,7 @@ defmodule BDS.Maintenance do
|
||||
:media -> BDS.Media.rebuild_media_from_files(project_id)
|
||||
:script -> BDS.Scripts.rebuild_scripts_from_files(project_id)
|
||||
:template -> BDS.Templates.rebuild_templates_from_files(project_id)
|
||||
:embedding -> Embeddings.rebuild_project(project_id)
|
||||
:unsupported -> {:error, :unsupported_entity_type}
|
||||
end
|
||||
end
|
||||
@@ -33,7 +35,8 @@ defmodule BDS.Maintenance do
|
||||
media_diff_reports(project_id, project) ++
|
||||
media_translation_diff_reports(project_id, project) ++
|
||||
script_diff_reports(project_id, project) ++
|
||||
template_diff_reports(project_id, project)
|
||||
template_diff_reports(project_id, project) ++
|
||||
Embeddings.diff_reports(project_id)
|
||||
|
||||
orphan_reports = orphan_reports(project_id, project)
|
||||
|
||||
@@ -44,10 +47,13 @@ defmodule BDS.Maintenance do
|
||||
defp normalize_entity_type(:media), do: :media
|
||||
defp normalize_entity_type(:script), do: :script
|
||||
defp normalize_entity_type(:template), do: :template
|
||||
defp normalize_entity_type(:embedding), do: :embedding
|
||||
defp normalize_entity_type("post"), do: :post
|
||||
defp normalize_entity_type("media"), do: :media
|
||||
defp normalize_entity_type("script"), do: :script
|
||||
defp normalize_entity_type("template"), do: :template
|
||||
defp normalize_entity_type("embedding"), do: :embedding
|
||||
defp normalize_entity_type("embeddings"), do: :embedding
|
||||
defp normalize_entity_type(_entity_type), do: :unsupported
|
||||
|
||||
defp post_diff_reports(project_id, project) do
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
defmodule BDS.Metadata do
|
||||
@moduledoc false
|
||||
|
||||
alias BDS.Embeddings
|
||||
alias BDS.Projects
|
||||
alias BDS.Projects.Project
|
||||
alias BDS.Repo
|
||||
@@ -50,6 +51,7 @@ defmodule BDS.Metadata do
|
||||
load_state(updated_project)
|
||||
end)
|
||||
|> unwrap_transaction()
|
||||
|> maybe_backfill_embeddings(project_id, state, project_metadata)
|
||||
end
|
||||
|
||||
def add_category(project_id, name) do
|
||||
@@ -311,6 +313,17 @@ defmodule BDS.Metadata do
|
||||
defp unwrap_transaction({:ok, result}), do: {:ok, result}
|
||||
defp unwrap_transaction({:error, reason}), do: {:error, reason}
|
||||
|
||||
defp maybe_backfill_embeddings({:ok, _metadata} = result, project_id, previous_state, project_metadata) do
|
||||
if previous_state.semantic_similarity_enabled != true and
|
||||
project_metadata.semantic_similarity_enabled == true do
|
||||
{:ok, _indexed_post_ids} = Embeddings.index_unindexed(project_id)
|
||||
end
|
||||
|
||||
result
|
||||
end
|
||||
|
||||
defp maybe_backfill_embeddings(result, _project_id, _previous_state, _project_metadata), do: result
|
||||
|
||||
defp attr(attrs, key) do
|
||||
cond do
|
||||
Map.has_key?(attrs, key) -> Map.get(attrs, key)
|
||||
|
||||
@@ -39,7 +39,7 @@ defmodule BDS.EmbeddingsTest do
|
||||
%{project: project}
|
||||
end
|
||||
|
||||
test "embeddings index published posts when semantic similarity is enabled and support similarity, duplicates, dismissals, and tag suggestions",
|
||||
test "embeddings index published posts when semantic similarity is enabled and support similarity, dismissals, and tag suggestions",
|
||||
%{project: project} do
|
||||
assert {:ok, _metadata} =
|
||||
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
|
||||
@@ -89,20 +89,9 @@ defmodule BDS.EmbeddingsTest do
|
||||
assert {:ok, suggestions} = BDS.Embeddings.suggest_tags(alpha.id, "rocket orbit mission")
|
||||
assert "space" in suggestions
|
||||
|
||||
assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id)
|
||||
assert Enum.any?(duplicates, fn pair ->
|
||||
MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id])
|
||||
end)
|
||||
|
||||
assert {:ok, dismissal} = BDS.Embeddings.dismiss_duplicate_pair(alpha.id, beta.id)
|
||||
assert dismissal.project_id == project.id
|
||||
|
||||
assert {:ok, filtered_duplicates} = BDS.Embeddings.find_duplicates(project.id)
|
||||
|
||||
refute Enum.any?(filtered_duplicates, fn pair ->
|
||||
MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id])
|
||||
end)
|
||||
|
||||
assert {:ok, alpha} = BDS.Posts.update_post(alpha.id, %{content: "kitchen flour dough loaf"})
|
||||
assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id)
|
||||
|
||||
@@ -115,6 +104,97 @@ defmodule BDS.EmbeddingsTest do
|
||||
refute Map.has_key?(after_delete, gamma.id)
|
||||
end
|
||||
|
||||
test "duplicate detection keeps exact matches and excludes lower-similarity pairs below the spec threshold",
|
||||
%{project: project} do
|
||||
assert {:ok, _metadata} =
|
||||
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
|
||||
|
||||
assert {:ok, exact_a} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Exact Match",
|
||||
content: "space rocket launch orbit mission galaxy",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, exact_b} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Exact Match",
|
||||
content: "space rocket launch orbit mission galaxy",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, fuzzy} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Fuzzy",
|
||||
content: "space rocket launch mission orbit space station",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, exact_a} = BDS.Posts.publish_post(exact_a.id)
|
||||
assert {:ok, exact_b} = BDS.Posts.publish_post(exact_b.id)
|
||||
assert {:ok, fuzzy} = BDS.Posts.publish_post(fuzzy.id)
|
||||
|
||||
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
|
||||
assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id)
|
||||
|
||||
assert [%{post_id_a: post_id_a, post_id_b: post_id_b, score: score}] = duplicates
|
||||
assert MapSet.new([post_id_a, post_id_b]) == MapSet.new([exact_a.id, exact_b.id])
|
||||
assert score >= 0.99
|
||||
assert hd(duplicates).similarity == score
|
||||
assert hd(duplicates).exact_match == true
|
||||
|
||||
assert MapSet.new([hd(duplicates).title_a, hd(duplicates).title_b]) ==
|
||||
MapSet.new(["Exact Match", "Exact Match"])
|
||||
|
||||
refute Enum.any?(duplicates, fn pair ->
|
||||
MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([exact_a.id, fuzzy.id])
|
||||
end)
|
||||
end
|
||||
|
||||
test "batch duplicate dismissal stores canonical pairs and excludes them from future searches",
|
||||
%{project: project} do
|
||||
assert {:ok, _metadata} =
|
||||
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
|
||||
|
||||
assert {:ok, exact_a} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Exact Match",
|
||||
content: "space rocket launch orbit mission galaxy",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, exact_b} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Exact Match",
|
||||
content: "space rocket launch orbit mission galaxy",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, exact_a} = BDS.Posts.publish_post(exact_a.id)
|
||||
assert {:ok, exact_b} = BDS.Posts.publish_post(exact_b.id)
|
||||
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
|
||||
|
||||
assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id)
|
||||
assert length(duplicates) == 1
|
||||
|
||||
assert {:ok, dismissed_pairs} =
|
||||
BDS.Embeddings.dismiss_duplicate_pairs([
|
||||
{exact_b.id, exact_a.id},
|
||||
{exact_a.id, exact_b.id}
|
||||
])
|
||||
|
||||
assert length(dismissed_pairs) == 1
|
||||
assert hd(dismissed_pairs).project_id == project.id
|
||||
|
||||
assert {:ok, filtered_duplicates} = BDS.Embeddings.find_duplicates(project.id)
|
||||
assert filtered_duplicates == []
|
||||
end
|
||||
|
||||
test "embedding queries are gated off when semantic similarity is disabled", %{project: project} do
|
||||
assert {:ok, post} =
|
||||
BDS.Posts.create_post(%{
|
||||
@@ -130,6 +210,41 @@ defmodule BDS.EmbeddingsTest do
|
||||
assert {:ok, %{}} = BDS.Embeddings.compute_similarities(post.id, [post.id])
|
||||
end
|
||||
|
||||
test "get_indexing_progress returns zero indexed and total when a project has no posts", %{
|
||||
project: project
|
||||
} do
|
||||
assert {:ok, %{indexed: 0, total: 0}} = BDS.Embeddings.get_indexing_progress(project.id)
|
||||
end
|
||||
|
||||
test "get_indexing_progress returns indexed embeddings and total posts for the project", %{
|
||||
project: project
|
||||
} do
|
||||
assert {:ok, _metadata} =
|
||||
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
|
||||
|
||||
assert {:ok, indexed_post} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Indexed",
|
||||
content: "space rocket orbit mission galaxy",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, unindexed_post} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Unindexed",
|
||||
content: "flour yeast dough oven loaf kitchen",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, indexed_post} = BDS.Posts.publish_post(indexed_post.id)
|
||||
|
||||
assert {:ok, %{indexed: 2, total: 2}} = BDS.Embeddings.get_indexing_progress(project.id)
|
||||
|
||||
assert unindexed_post.id != indexed_post.id
|
||||
end
|
||||
|
||||
test "embeddings use the configured in-app backend module", %{project: project} do
|
||||
assert {:ok, _metadata} =
|
||||
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
|
||||
@@ -150,4 +265,82 @@ defmodule BDS.EmbeddingsTest do
|
||||
assert {:ok, indexed} = BDS.Embeddings.index_unindexed(project.id)
|
||||
assert post.id in indexed
|
||||
end
|
||||
|
||||
test "embedding indexing persists a project-local similarity snapshot", %{project: project} do
|
||||
assert {:ok, _metadata} =
|
||||
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
|
||||
|
||||
assert {:ok, alpha} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Alpha",
|
||||
content: "space rocket orbit mission galaxy",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, beta} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Beta",
|
||||
content: "rocket launch orbit mission station",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id)
|
||||
assert {:ok, beta} = BDS.Posts.publish_post(beta.id)
|
||||
|
||||
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
|
||||
|
||||
index_path = BDS.Embeddings.index_path(project.id)
|
||||
assert File.exists?(index_path)
|
||||
|
||||
snapshot = index_path |> File.read!() |> Jason.decode!()
|
||||
assert snapshot["project_id"] == project.id
|
||||
assert snapshot["model_id"] == "fake/multilingual-e5-small"
|
||||
assert snapshot["dimensions"] == 384
|
||||
assert snapshot["entries"][alpha.id]["label"] != nil
|
||||
assert snapshot["entries"][alpha.id]["content_hash"] != nil
|
||||
|
||||
assert Enum.any?(snapshot["entries"][alpha.id]["neighbors"], fn neighbor ->
|
||||
neighbor["post_id"] == beta.id
|
||||
end)
|
||||
end
|
||||
|
||||
test "embedding index uses the old-app persisted file name", %{project: project} do
|
||||
assert BDS.Embeddings.index_path(project.id) =~ "/embeddings.usearch"
|
||||
end
|
||||
|
||||
|
||||
test "reindex_all rebuilds stored embeddings for the whole project", %{project: project} do
|
||||
assert {:ok, _metadata} =
|
||||
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
|
||||
|
||||
assert {:ok, post} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Reindex Target",
|
||||
content: "space rocket orbit mission galaxy",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, post} = BDS.Posts.publish_post(post.id)
|
||||
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
|
||||
|
||||
original_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id)
|
||||
|
||||
assert {:ok, post} =
|
||||
BDS.Posts.update_post(post.id, %{content: "kitchen flour dough oven loaf"})
|
||||
|
||||
assert {:ok, post} = BDS.Posts.publish_post(post.id)
|
||||
stale_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id)
|
||||
|
||||
assert stale_key.content_hash != original_key.content_hash
|
||||
|
||||
assert {:ok, rebuilt_ids} = BDS.Embeddings.reindex_all(project.id)
|
||||
assert post.id in rebuilt_ids
|
||||
|
||||
refreshed_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id)
|
||||
assert refreshed_key.content_hash == stale_key.content_hash
|
||||
assert File.exists?(BDS.Embeddings.index_path(project.id))
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
defmodule BDS.MaintenanceTest do
|
||||
use ExUnit.Case, async: false
|
||||
|
||||
import Ecto.Query
|
||||
|
||||
alias BDS.Repo
|
||||
|
||||
setup do
|
||||
@@ -134,6 +136,39 @@ defmodule BDS.MaintenanceTest do
|
||||
BDS.Maintenance.rebuild_from_filesystem(project.id, "unknown")
|
||||
end
|
||||
|
||||
test "maintenance rebuilds and diffs embedding state explicitly", %{project: project} do
|
||||
assert {:ok, _metadata} =
|
||||
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
|
||||
|
||||
assert {:ok, post} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Embedding Drift",
|
||||
content: "space rocket orbit mission galaxy",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, post} = BDS.Posts.publish_post(post.id)
|
||||
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
|
||||
|
||||
index_path = BDS.Embeddings.index_path(project.id)
|
||||
assert File.exists?(index_path)
|
||||
|
||||
Repo.delete_all(from key in BDS.Embeddings.Key, where: key.project_id == ^project.id)
|
||||
File.rm!(index_path)
|
||||
|
||||
assert {:ok, %{diff_reports: diff_reports}} = BDS.Maintenance.metadata_diff(project.id)
|
||||
|
||||
assert Enum.any?(diff_reports, fn report ->
|
||||
report.entity_type == "embedding" and report.entity_id == post.id
|
||||
end)
|
||||
|
||||
assert {:ok, rebuilt_post_ids} = BDS.Maintenance.rebuild_from_filesystem(project.id, "embedding")
|
||||
assert post.id in rebuilt_post_ids
|
||||
assert Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) != nil
|
||||
assert File.exists?(index_path)
|
||||
end
|
||||
|
||||
test "metadata_diff reports field differences and orphan files across managed entities", %{
|
||||
project: project,
|
||||
temp_dir: temp_dir
|
||||
|
||||
@@ -116,4 +116,26 @@ defmodule BDS.MetadataTest do
|
||||
"ssh_mode" => "rsync"
|
||||
}
|
||||
end
|
||||
|
||||
test "enabling semantic similarity backfills embeddings for existing published posts", %{
|
||||
project: project
|
||||
} do
|
||||
assert {:ok, post} =
|
||||
BDS.Posts.create_post(%{
|
||||
project_id: project.id,
|
||||
title: "Backfill Me",
|
||||
content: "space rocket orbit mission galaxy",
|
||||
language: "en"
|
||||
})
|
||||
|
||||
assert {:ok, post} = BDS.Posts.publish_post(post.id)
|
||||
assert BDS.Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) == nil
|
||||
|
||||
assert {:ok, metadata} =
|
||||
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
|
||||
|
||||
assert metadata.semantic_similarity_enabled == true
|
||||
assert BDS.Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) != nil
|
||||
assert File.exists?(BDS.Embeddings.index_path(project.id))
|
||||
end
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user