fix: force full re-embed on explicit rebuild and degrade gracefully when embedding model is unavailable

This commit is contained in:
2026-05-29 15:49:53 +02:00
parent 61ff2a77c0
commit 74ceaeb971
8 changed files with 268 additions and 83 deletions

View File

@@ -120,16 +120,7 @@ defmodule BDS.Desktop.ShellCommands do
"rebuild_embedding_index",
"Rebuild Embedding Index",
"Embeddings",
fn report ->
{:ok, rebuilt_post_ids} = Embeddings.rebuild_project(project.id, on_progress: report)
report.(1.0, "Embedding index rebuilt")
%{
project_id: project.id,
rebuilt_post_ids: rebuilt_post_ids,
rebuilt_count: length(rebuilt_post_ids)
}
end
fn report -> rebuild_embedding_index_work(project, report) end
)
end
@@ -524,20 +515,39 @@ defmodule BDS.Desktop.ShellCommands do
},
%{
name: "Rebuild Embedding Index",
work: fn report ->
{:ok, rebuilt_post_ids} = Embeddings.rebuild_project(project.id, on_progress: report)
report.(1.0, "Embedding index rebuilt")
%{
project_id: project.id,
rebuilt_post_ids: rebuilt_post_ids,
rebuilt_count: length(rebuilt_post_ids)
}
end
work: fn report -> rebuild_embedding_index_work(project, report) end
}
]
end
# Runs the embedding rebuild for `project`, forwarding progress through
# `report`. On success it emits the final 1.0 progress tick and returns a
# summary map; on failure it returns `{:error, message}` where `message` is the
# text produced by `embedding_error_message/1`.
defp rebuild_embedding_index_work(project, report) do
  project_id = project.id
  outcome = Embeddings.rebuild_project(project_id, on_progress: report)

  case outcome do
    {:error, reason} ->
      {:error, embedding_error_message(reason)}

    {:ok, post_ids} ->
      report.(1.0, "Embedding index rebuilt")

      %{
        project_id: project_id,
        rebuilt_post_ids: post_ids,
        rebuilt_count: length(post_ids)
      }
  end
end
# Formats an embedding failure `reason` as a user-facing sentence.
#
# A binary reason is used verbatim, an `{:embedding_backend_unavailable, _}`
# tuple maps to a fixed phrase, and anything else is rendered with `inspect/1`.
# The detail is followed by a fixed hint about the first-use model download.
defp embedding_error_message(reason) do
  detail =
    cond do
      is_binary(reason) -> reason
      match?({:embedding_backend_unavailable, _inner}, reason) -> "the embedding service did not start"
      true -> inspect(reason)
    end

  "Could not build the embedding index: " <>
    detail <>
    ". The model is downloaded on first use, " <>
    "so check your internet connection — or turn off semantic similarity in Settings."
end
defp run_rebuild_sequence(_group_id, _attrs, []), do: :ok
defp run_rebuild_sequence(group_id, attrs, [step | remaining_steps]) do

View File

@@ -2,6 +2,7 @@ defmodule BDS.Embeddings do
@moduledoc false
import Ecto.Query
require Logger
alias BDS.Persistence
alias BDS.Embeddings.DismissedDuplicatePair
@@ -75,11 +76,16 @@ defmodule BDS.Embeddings do
)
existing_keys = preload_keys_by_post_id(project_id, Enum.map(posts, & &1.id))
rows = build_key_rows(posts, existing_keys, max_label_value(), nil)
batch_upsert_keys(rows)
:ok = rebuild_snapshot(project_id)
{:ok, Enum.map(posts, & &1.id)}
case build_key_rows(posts, existing_keys, max_label_value(), nil, false) do
{:ok, rows} ->
batch_upsert_keys(rows)
:ok = rebuild_snapshot(project_id)
{:ok, Enum.map(posts, & &1.id)}
{:error, _reason} = error ->
error
end
else
{:ok, []}
end
@@ -106,13 +112,19 @@ defmodule BDS.Embeddings do
)
existing_keys = preload_keys_by_post_id(project_id)
rows = build_key_rows(posts, existing_keys, max_label_value(), on_progress)
batch_upsert_keys(rows)
# An explicit rebuild re-embeds every post from scratch (ReindexAll),
# ignoring the content_hash skip optimisation.
case build_key_rows(posts, existing_keys, max_label_value(), on_progress, true) do
{:ok, rows} ->
batch_upsert_keys(rows)
:ok = report_rebuild_phase(on_progress, 0.99, "Persisting embedding snapshot")
:ok = rebuild_snapshot(project_id)
{:ok, post_ids}
:ok = report_rebuild_phase(on_progress, 0.99, "Persisting embedding snapshot")
:ok = rebuild_snapshot(project_id)
{:ok, post_ids}
{:error, _reason} = error ->
error
end
else
{:ok, []}
end
@@ -172,24 +184,36 @@ defmodule BDS.Embeddings do
:ok
existing_key ->
label = existing_key_label(existing_key) || next_label()
{:ok, vector} = embed_text(raw_text, post.language)
case embed_text(raw_text, post.language) do
{:ok, vector} ->
label = existing_key_label(existing_key) || next_label()
(existing_key || %Key{})
|> Key.changeset(%{
label: label,
post_id: post.id,
project_id: post.project_id,
content_hash: content_hash,
vector: encode_vector(vector)
})
|> Repo.insert_or_update()
(existing_key || %Key{})
|> Key.changeset(%{
label: label,
post_id: post.id,
project_id: post.project_id,
content_hash: content_hash,
vector: encode_vector(vector)
})
|> Repo.insert_or_update()
if Keyword.get(opts, :refresh_index, true) do
:ok = rebuild_snapshot(post.project_id)
if Keyword.get(opts, :refresh_index, true) do
:ok = rebuild_snapshot(post.project_id)
end
:ok
{:error, reason} ->
# Embedding is best-effort on post save: if the model is unavailable
# (e.g. offline first-use download), leave the post unindexed rather
# than failing the save. An explicit reindex surfaces the error.
Logger.warning(
"Embedding unavailable for post #{post.id}: #{inspect(reason)}; left unindexed"
)
:ok
end
:ok
end
end
@@ -210,11 +234,12 @@ defmodule BDS.Embeddings do
Repo.one(from key in Key, select: max(key.label)) || 0
end
# Builds the upsert rows for a batch of posts. Posts whose content_hash is
# unchanged are skipped (ContentHashSkipsUnchanged); the rest are embedded in
# batches (see embed_pending/2) so model inference is not serialised one post
# at a time. Labels keep their existing value or take the next free integer.
defp build_key_rows(posts, existing_keys, base_label, on_progress) do
# Builds the upsert rows for a batch of posts. Unless `force?` is set, posts
# whose content_hash is unchanged are skipped (ContentHashSkipsUnchanged); the
# rest are embedded in batches (see embed_pending/2) so model inference is not
# serialised one post at a time. Labels keep their existing value or take the
# next free integer. Returns `{:error, reason}` if the model is unavailable.
defp build_key_rows(posts, existing_keys, base_label, on_progress, force?) do
prepared =
Enum.map(posts, fn post ->
raw_text = compose_embedding_source(post.title, resolve_post_body(post))
@@ -226,14 +251,20 @@ defmodule BDS.Embeddings do
existing: existing,
raw_text: raw_text,
content_hash: content_hash,
needs_embed?: is_nil(existing) or existing.content_hash != content_hash
needs_embed?: force? or is_nil(existing) or existing.content_hash != content_hash
}
end)
pending = Enum.filter(prepared, & &1.needs_embed?)
:ok = report_rebuild_started(on_progress, length(pending), "embedding entries")
vectors_by_post_id = embed_pending(pending, on_progress)
case embed_pending(pending, on_progress) do
{:ok, vectors_by_post_id} -> {:ok, collect_rows(prepared, vectors_by_post_id, base_label)}
{:error, _reason} = error -> error
end
end
defp collect_rows(prepared, vectors_by_post_id, base_label) do
{rows, _next_label} =
Enum.reduce(prepared, {[], base_label + 1}, fn entry, {acc, next_label} ->
if entry.needs_embed? do
@@ -258,7 +289,7 @@ defmodule BDS.Embeddings do
rows
end
defp embed_pending([], _on_progress), do: %{}
defp embed_pending([], _on_progress), do: {:ok, %{}}
defp embed_pending(pending, on_progress) do
total = length(pending)
@@ -268,25 +299,36 @@ defmodule BDS.Embeddings do
# Group by language so the lexical stub stems consistently; the neural
# backend is multilingual and ignores the language hint.
|> Enum.group_by(& &1.post.language)
|> Enum.reduce({%{}, 0}, fn {language, group}, acc ->
|> Enum.reduce_while({%{}, 0}, fn {language, group}, acc ->
group
|> Enum.chunk_every(batch)
|> Enum.reduce(acc, fn chunk, {vectors, done} ->
{:ok, chunk_vectors} = embed_many(Enum.map(chunk, & &1.raw_text), language)
|> Enum.reduce_while(acc, fn chunk, {vectors, done} ->
case embed_many(Enum.map(chunk, & &1.raw_text), language) do
{:ok, chunk_vectors} ->
vectors =
chunk
|> Enum.zip(chunk_vectors)
|> Enum.reduce(vectors, fn {entry, vector}, acc ->
Map.put(acc, entry.post.id, vector)
end)
vectors =
chunk
|> Enum.zip(chunk_vectors)
|> Enum.reduce(vectors, fn {entry, vector}, acc ->
Map.put(acc, entry.post.id, vector)
end)
done = done + length(chunk)
:ok = report_rebuild_progress(on_progress, done, total, "embedding entries")
{:cont, {vectors, done}}
done = done + length(chunk)
:ok = report_rebuild_progress(on_progress, done, total, "embedding entries")
{vectors, done}
{:error, reason} ->
{:halt, {:error, reason}}
end
end)
|> case do
{:error, reason} -> {:halt, {:error, reason}}
accumulator -> {:cont, accumulator}
end
end)
|> elem(0)
|> case do
{:error, reason} -> {:error, reason}
{vectors, _done} -> {:ok, vectors}
end
end
defp batch_upsert_keys([]), do: :ok
@@ -337,15 +379,20 @@ defmodule BDS.Embeddings do
)
existing_keys = preload_keys_by_post_id(project_id)
rows = build_key_rows(posts, existing_keys, max_label_value(), nil)
batch_upsert_keys(rows)
:ok = rebuild_snapshot(project_id)
case build_key_rows(posts, existing_keys, max_label_value(), nil, false) do
{:ok, rows} ->
batch_upsert_keys(rows)
:ok = rebuild_snapshot(project_id)
indexed =
Repo.all(from key in Key, where: key.project_id == ^project_id, select: key.post_id)
indexed =
Repo.all(from key in Key, where: key.project_id == ^project_id, select: key.post_id)
{:ok, indexed}
{:ok, indexed}
{:error, _reason} = error ->
error
end
else
{:ok, []}
end
@@ -677,13 +724,16 @@ defmodule BDS.Embeddings do
if function_exported?(backend, :embed_many, 2) do
backend.embed_many(texts, language: language)
else
vectors =
Enum.map(texts, fn text ->
{:ok, vector} = backend.embed(text, language: language)
vector
end)
{:ok, vectors}
Enum.reduce_while(texts, {:ok, []}, fn text, {:ok, acc} ->
case backend.embed(text, language: language) do
{:ok, vector} -> {:cont, {:ok, [vector | acc]}}
{:error, _reason} = error -> {:halt, error}
end
end)
|> case do
{:ok, vectors} -> {:ok, Enum.reverse(vectors)}
{:error, _reason} = error -> error
end
end
end

View File

@@ -101,11 +101,18 @@ defmodule BDS.Maintenance.Repair do
:file_to_db ->
post_ids = Enum.map(items, &metadata_diff_item_entity_id/1)
{:ok, repaired_post_ids} = Embeddings.repair_posts(project_id, post_ids)
repaired_post_ids = MapSet.new(repaired_post_ids)
# If the embedding model is unavailable, every item is reported as
# failed rather than crashing the repair task.
repaired =
case Embeddings.repair_posts(project_id, post_ids) do
{:ok, repaired_post_ids} -> repaired_post_ids
{:error, _reason} -> []
end
repaired_set = MapSet.new(repaired)
build_batch_repair_result(items, total, on_progress, fn item ->
MapSet.member?(repaired_post_ids, metadata_diff_item_entity_id(item))
MapSet.member?(repaired_set, metadata_diff_item_entity_id(item))
end)
:db_to_file ->

View File

@@ -1,6 +1,8 @@
defmodule BDS.Metadata do
@moduledoc false
require Logger
alias BDS.Embeddings
alias BDS.I18n
alias BDS.Persistence
@@ -653,7 +655,17 @@ defmodule BDS.Metadata do
) do
if previous_state.semantic_similarity_enabled != true and
project_metadata.semantic_similarity_enabled == true do
{:ok, _indexed_post_ids} = Embeddings.index_unindexed(project_id)
# Backfill is best-effort: if the embedding model is unavailable, keep the
# setting enabled and log it rather than failing the metadata update.
case Embeddings.index_unindexed(project_id) do
{:ok, _indexed_post_ids} ->
:ok
{:error, reason} ->
Logger.warning(
"Embedding backfill skipped for project #{project_id}: #{inspect(reason)}"
)
end
end
result