fix(perf): replace Enum.each + individual inserts with preloaded keys and batch upsert in embeddings (CSM-033)

This commit is contained in:
2026-05-27 19:03:21 +02:00
parent f2b340ba86
commit e29dfb490a
3 changed files with 322 additions and 25 deletions

View File

@@ -15,6 +15,7 @@ defmodule BDS.Embeddings do
@duplicate_threshold 0.92
@exact_match_score 0.999999
@key_batch_size 199
def model_id, do: configured_backend().model_info().model_id
def dimensions, do: configured_backend().model_info().dimensions
@@ -73,7 +74,24 @@ defmodule BDS.Embeddings do
order_by: [asc: post.created_at, asc: post.slug]
)
Enum.each(posts, &sync_post_if_enabled(&1, refresh_index: false))
existing_keys = preload_keys_by_post_id(project_id, Enum.map(posts, & &1.id))
base_label = max_label_value()
{rows, _next_label} =
Enum.reduce(posts, {[], base_label + 1}, fn post, {acc, next_label} ->
existing_key = Map.get(existing_keys, post.id)
case compute_key_data(post, existing_key, next_label) do
:skip ->
{acc, next_label}
{:upsert, row} ->
bump = if existing_key, do: 0, else: 1
{[row | acc], next_label + bump}
end
end)
batch_upsert_keys(rows)
:ok = rebuild_snapshot(project_id)
{:ok, Enum.map(posts, & &1.id)}
else
@@ -104,12 +122,27 @@ defmodule BDS.Embeddings do
where: key.project_id == ^project_id and key.post_id not in ^post_ids
)
posts
|> Enum.with_index(1)
|> Enum.each(fn {post, index} ->
sync_post_if_enabled(post, refresh_index: false)
:ok = report_rebuild_progress(on_progress, index, total_posts, "embedding entries")
end)
existing_keys = preload_keys_by_post_id(project_id)
base_label = max_label_value()
{rows, _next_label} =
posts
|> Enum.with_index(1)
|> Enum.reduce({[], base_label + 1}, fn {post, index}, {acc, next_label} ->
:ok = report_rebuild_progress(on_progress, index, total_posts, "embedding entries")
existing_key = Map.get(existing_keys, post.id)
case compute_key_data(post, existing_key, next_label) do
:skip ->
{acc, next_label}
{:upsert, row} ->
bump = if existing_key, do: 0, else: 1
{[row | acc], next_label + bump}
end
end)
batch_upsert_keys(rows)
:ok = report_rebuild_phase(on_progress, 0.99, "Persisting embedding snapshot")
:ok = rebuild_snapshot(project_id)
@@ -196,6 +229,53 @@ defmodule BDS.Embeddings do
end
end
defp preload_keys_by_post_id(project_id) do
Repo.all(from key in Key, where: key.project_id == ^project_id)
|> Map.new(&{&1.post_id, &1})
end
defp preload_keys_by_post_id(project_id, post_ids) do
Repo.all(
from key in Key,
where: key.project_id == ^project_id and key.post_id in ^post_ids
)
|> Map.new(&{&1.post_id, &1})
end
defp max_label_value do
Repo.one(from key in Key, select: max(key.label)) || 0
end
defp compute_key_data(%Post{} = post, existing_key, next_label) do
body = resolve_post_body(post)
raw_text = compose_embedding_source(post.title, body)
content_hash = hash_text(raw_text)
if existing_key && existing_key.content_hash == content_hash do
:skip
else
{:ok, vector} = embed_text(raw_text, post.language)
label = if existing_key, do: existing_key.label, else: next_label
{:upsert, [label, post.id, post.project_id, content_hash, Jason.encode!(vector)]}
end
end
defp batch_upsert_keys([]), do: :ok
defp batch_upsert_keys(rows) do
rows
|> Enum.chunk_every(@key_batch_size)
|> Enum.each(fn chunk ->
placeholders = Enum.map_join(chunk, ", ", fn _ -> "(?, ?, ?, ?, ?)" end)
params = List.flatten(chunk)
Repo.query!(
"INSERT INTO embedding_keys (label, post_id, project_id, content_hash, vector) VALUES #{placeholders} ON CONFLICT(label) DO UPDATE SET content_hash = excluded.content_hash, vector = excluded.vector",
params
)
end)
end
def remove_post(post_id) when is_binary(post_id) do
project_id =
case Repo.get_by(Key, post_id: post_id) do
@@ -227,23 +307,24 @@ defmodule BDS.Embeddings do
order_by: [asc: post.created_at, asc: post.slug]
)
Enum.each(posts, fn post ->
body = resolve_post_body(post)
content_hash = hash_text(compose_embedding_source(post.title, body))
existing_keys = preload_keys_by_post_id(project_id)
base_label = max_label_value()
case Repo.get_by(Key, post_id: post.id, project_id: project_id) do
%Key{content_hash: ^content_hash} ->
:ok
{rows, _next_label} =
Enum.reduce(posts, {[], base_label + 1}, fn post, {acc, next_label} ->
existing_key = Map.get(existing_keys, post.id)
_other ->
:ok =
sync_post_if_enabled(
%{post | content: if(post.content in [nil, ""], do: body, else: post.content)},
refresh_index: false
)
end
end)
case compute_key_data(post, existing_key, next_label) do
:skip ->
{acc, next_label}
{:upsert, row} ->
bump = if existing_key, do: 0, else: 1
{[row | acc], next_label + bump}
end
end)
batch_upsert_keys(rows)
:ok = rebuild_snapshot(project_id)
indexed =