perf: A1-14b replace O(n^2) embedding snapshot with hnswlib HNSW index and debounced persistence

This commit is contained in:
2026-05-29 15:36:13 +02:00
parent 744f7543d7
commit 61ff2a77c0
12 changed files with 474 additions and 287 deletions

View File

@@ -44,32 +44,32 @@ defmodule BDS.Desktop.AutomationTest do
assert :ok = Automation.click(session, "[data-testid='toggle-sidebar']")
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.sidebar_visible == false))
assert snapshot.sidebar_visible == false
assert :ok = Automation.press(session, "Meta+B")
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.sidebar_visible == true))
assert snapshot.sidebar_visible == true
assert :ok = Automation.press(session, "Meta+J")
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.panel_visible == true))
assert snapshot.panel_visible == true
assert :ok = Automation.press(session, "Meta+J")
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.panel_visible == false))
assert snapshot.panel_visible == false
assert :ok = Automation.press(session, "Meta+,")
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.editor_title == "Settings"))
assert snapshot.editor_title == "Settings"
assert :ok = Automation.click(session, "[data-testid='toggle-assistant']")
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.assistant_visible == true))
assert snapshot.assistant_visible == true
assert snapshot.panel_visible == false
@@ -92,7 +92,7 @@ defmodule BDS.Desktop.AutomationTest do
assert :ok = Automation.drag(session, "[data-resize='sidebar']", 90)
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.sidebar_width >= 360 and &1.sidebar_width <= 380))
assert snapshot.sidebar_width >= 360
assert snapshot.sidebar_width <= 380
@@ -100,7 +100,7 @@ defmodule BDS.Desktop.AutomationTest do
assert :ok = Automation.reload(session)
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.sidebar_visible == true and &1.sidebar_width >= resized_width - 2))
assert snapshot.sidebar_visible == true
assert snapshot.sidebar_width >= resized_width - 2
assert snapshot.sidebar_width <= resized_width + 2
@@ -140,12 +140,12 @@ defmodule BDS.Desktop.AutomationTest do
assert :ok = Automation.native_menu_action(session, "toggle_sidebar")
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.sidebar_visible == false))
assert snapshot.sidebar_visible == false
assert :ok = Automation.native_menu_action(session, "edit_preferences")
snapshot = Automation.snapshot(session)
snapshot = await(session, &(&1.editor_title == "Settings"))
assert snapshot.editor_title == "Settings"
end
@@ -175,6 +175,24 @@ defmodule BDS.Desktop.AutomationTest do
end
end
# Repeatedly takes an automation snapshot of `session` until `fun` returns a
# truthy value for it, then returns that snapshot. Keypresses, clicks and
# native menu actions update the UI asynchronously, so one snapshot taken
# right after the action may still show the old state when the CPU is busy.
#
# `timeout` is a budget in milliseconds (default 5_000) that is reduced by 50
# for every 50 ms pause between polls; time spent taking the snapshot itself
# is not counted. Once the budget reaches zero or below, one more snapshot is
# taken and returned without consulting `fun`, so callers still need their
# own assertions on the result.
defp await(session, fun, timeout \\ 5_000)

defp await(session, _fun, timeout) when timeout <= 0 do
  Automation.snapshot(session)
end

defp await(session, fun, timeout) do
  latest = Automation.snapshot(session)

  cond do
    fun.(latest) ->
      latest

    true ->
      # Not there yet: pause briefly, then poll again with a smaller budget.
      Process.sleep(50)
      await(session, fun, timeout - 50)
  end
end
defp wait_until(fun, timeout \\ 5_000)
defp wait_until(fun, timeout) when timeout <= 0, do: fun.()

View File

@@ -319,24 +319,28 @@ defmodule BDS.EmbeddingsTest do
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
# Persistence is debounced (5s); force it to disk to assert the files.
:ok = BDS.Embeddings.Index.flush(project.id)
index_path = BDS.Embeddings.index_path(project.id)
assert File.exists?(index_path)
assert File.exists?(index_path <> ".meta.json")
refute String.starts_with?(index_path, BDS.Projects.project_data_dir(project))
cache_root = Application.fetch_env!(:bds, :project_cache_root) |> Path.expand()
assert index_path == Path.join([cache_root, "projects", project.id, "embeddings.usearch"])
snapshot = index_path |> File.read!() |> Jason.decode!()
assert snapshot["project_id"] == project.id
assert snapshot["model_id"] == "fake/multilingual-e5-small"
assert snapshot["dimensions"] == 384
assert snapshot["entries"][alpha.id]["label"] != nil
assert snapshot["entries"][alpha.id]["content_hash"] != nil
# The sidecar carries the dimension and the label→post_id mapping.
meta = (index_path <> ".meta.json") |> File.read!() |> Jason.decode!()
assert meta["dim"] == 384
post_ids = Enum.map(meta["labels"], fn [_label, post_id] -> post_id end)
assert alpha.id in post_ids
assert beta.id in post_ids
assert Enum.any?(snapshot["entries"][alpha.id]["neighbors"], fn neighbor ->
neighbor["post_id"] == beta.id
end)
# The HNSW index answers nearest-neighbour queries.
assert {:ok, [neighbor]} = BDS.Embeddings.find_similar(alpha.id, 1)
assert neighbor.post_id == beta.id
end
test "embedding index uses the app-internal persisted file name", %{project: project} do
@@ -443,43 +447,76 @@ defmodule BDS.EmbeddingsTest do
refreshed_key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id)
assert refreshed_key.content_hash == stale_key.content_hash
:ok = BDS.Embeddings.Index.flush(project.id)
assert File.exists?(BDS.Embeddings.index_path(project.id))
end
test "sync_post refreshes snapshot drift when the embedding hash is already current", %{
test "similarity queries keep working when sync_post finds the embedding already current", %{
project: project
} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
assert {:ok, post} =
assert {:ok, alpha} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Snapshot Repair",
title: "Alpha",
content: "space rocket orbit mission galaxy",
language: "en"
})
assert {:ok, post} = BDS.Posts.publish_post(post.id)
assert {:ok, beta} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Beta",
content: "rocket launch orbit mission station",
language: "en"
})
assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id)
assert {:ok, _beta} = BDS.Posts.publish_post(beta.id)
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
key = BDS.Repo.get_by!(BDS.Embeddings.Key, project_id: project.id, post_id: post.id)
index_path = BDS.Embeddings.index_path(project.id)
# Re-syncing with an unchanged content hash is a no-op for the index...
assert :ok = BDS.Embeddings.sync_post(alpha.id)
snapshot = index_path |> File.read!() |> Jason.decode!()
# ...and nearest-neighbour queries still resolve through the HNSW index.
assert {:ok, [neighbor]} = BDS.Embeddings.find_similar(alpha.id, 1)
assert neighbor.post_id == beta.id
end
drifted_snapshot =
put_in(snapshot, ["entries", post.id, "content_hash"], "stale-snapshot-hash")
test "find_similar rebuilds the HNSW index on demand when none is loaded", %{project: project} do
assert {:ok, _metadata} =
BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})
File.write!(index_path, Jason.encode!(drifted_snapshot))
assert {:ok, alpha} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Alpha",
content: "space rocket orbit mission galaxy",
language: "en"
})
refute Enum.any?(BDS.Embeddings.diff_reports(project.id), &(&1.entity_id == post.id))
assert {:ok, beta} =
BDS.Posts.create_post(%{
project_id: project.id,
title: "Beta",
content: "rocket launch orbit mission station",
language: "en"
})
assert :ok = BDS.Embeddings.sync_post(post.id)
assert {:ok, _alpha} = BDS.Posts.publish_post(alpha.id)
assert {:ok, _beta} = BDS.Posts.publish_post(beta.id)
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
repaired_snapshot = index_path |> File.read!() |> Jason.decode!()
assert get_in(repaired_snapshot, ["entries", post.id, "content_hash"]) == key.content_hash
# Drop the in-memory index and remove the persisted files, then query: it
# must self-heal by rebuilding from the DB vectors.
:ok = BDS.Embeddings.Index.forget(project.id)
File.rm_rf!(BDS.Projects.project_cache_dir(project.id))
refute Enum.any?(BDS.Embeddings.diff_reports(project.id), &(&1.entity_id == post.id))
assert {:ok, similar} = BDS.Embeddings.find_similar(alpha.id, 1)
assert [%{post_id: post_id}] = similar
assert post_id == beta.id
end
end

View File

@@ -395,6 +395,8 @@ defmodule BDS.MaintenanceTest do
assert {:ok, _indexed} = BDS.Embeddings.index_unindexed(project.id)
index_path = BDS.Embeddings.index_path(project.id)
# Index persistence is debounced (5s); force it to assert the file.
:ok = BDS.Embeddings.Index.flush(project.id)
assert File.exists?(index_path)
Repo.delete_all(from key in BDS.Embeddings.Key, where: key.project_id == ^project.id)
@@ -416,6 +418,8 @@ defmodule BDS.MaintenanceTest do
assert post.id in rebuilt_post_ids
assert Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) != nil
:ok = BDS.Embeddings.Index.flush(project.id)
assert File.exists?(index_path)
end

View File

@@ -236,6 +236,9 @@ defmodule BDS.MetadataTest do
assert metadata.semantic_similarity_enabled == true
assert BDS.Repo.get_by(BDS.Embeddings.Key, project_id: project.id, post_id: post.id) != nil
# Index persistence is debounced (5s); force it to assert the file.
:ok = BDS.Embeddings.Index.flush(project.id)
assert File.exists?(BDS.Embeddings.index_path(project.id))
end