perf: A1-14b replace O(n^2) embedding snapshot with hnswlib HNSW index and debounced persistence
This commit is contained in:
@@ -37,7 +37,8 @@ defmodule BDS.Application do
|
||||
{Task.Supervisor, name: BDS.TCP.TaskSupervisor},
|
||||
BDS.Scripting.JobStore,
|
||||
{Task.Supervisor, name: BDS.Scripting.TaskSupervisor},
|
||||
BDS.Scripting.JobSupervisor
|
||||
BDS.Scripting.JobSupervisor,
|
||||
BDS.Embeddings.Index
|
||||
] ++ embedding_children() ++ desktop_children(current_env())
|
||||
|
||||
opts = [strategy: :one_for_one, name: BDS.Supervisor]
|
||||
|
||||
@@ -166,11 +166,9 @@ defmodule BDS.Embeddings do
|
||||
|
||||
case Repo.get_by(Key, post_id: post.id, project_id: post.project_id) do
|
||||
%Key{content_hash: ^content_hash} ->
|
||||
if Keyword.get(opts, :refresh_index, true) and
|
||||
snapshot_content_hash(post.project_id, post.id) != content_hash do
|
||||
:ok = rebuild_snapshot(post.project_id)
|
||||
end
|
||||
|
||||
# Embedding is already current. The HNSW index self-heals on query
|
||||
# (find_similar/find_duplicates rebuild when no index is loaded), so
|
||||
# there is nothing to refresh here.
|
||||
:ok
|
||||
|
||||
existing_key ->
|
||||
@@ -361,28 +359,28 @@ defmodule BDS.Embeddings do
|
||||
{:error, :not_found} ->
|
||||
{:ok, []}
|
||||
|
||||
{:ok, post, source_vector} ->
|
||||
similar =
|
||||
case Index.neighbors(post.project_id, post.id, limit) do
|
||||
{:ok, neighbors} ->
|
||||
neighbors
|
||||
{:ok, _post, nil} ->
|
||||
{:ok, []}
|
||||
|
||||
{:error, :missing} ->
|
||||
Repo.all(
|
||||
from key in Key,
|
||||
where: key.project_id == ^post.project_id and key.post_id != ^post.id
|
||||
)
|
||||
|> Enum.map(fn key ->
|
||||
%{
|
||||
post_id: key.post_id,
|
||||
score: cosine_similarity(source_vector, decode_vector(key.vector))
|
||||
}
|
||||
end)
|
||||
|> Enum.sort_by(& &1.score, :desc)
|
||||
|> Enum.take(max(limit, 0))
|
||||
end
|
||||
{:ok, post, %Key{} = key} ->
|
||||
{:ok, query_similar(post.project_id, key, limit)}
|
||||
end
|
||||
end
|
||||
|
||||
{:ok, similar}
|
||||
# Queries the HNSW index for a post's neighbours, rebuilding the index from
|
||||
# the DB vectors if it is not currently loaded (e.g. after a restart).
|
||||
defp query_similar(project_id, %Key{} = key, limit) do
|
||||
case Index.neighbors(project_id, key.label, key.vector, limit) do
|
||||
{:ok, neighbors} ->
|
||||
neighbors
|
||||
|
||||
{:error, :missing} ->
|
||||
:ok = rebuild_snapshot(project_id)
|
||||
|
||||
case Index.neighbors(project_id, key.label, key.vector, limit) do
|
||||
{:ok, neighbors} -> neighbors
|
||||
{:error, :missing} -> []
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -395,8 +393,12 @@ defmodule BDS.Embeddings do
|
||||
{:error, :not_found} ->
|
||||
{:ok, %{}}
|
||||
|
||||
{:ok, post, source_vector} ->
|
||||
{:ok, _post, nil} ->
|
||||
{:ok, %{}}
|
||||
|
||||
{:ok, post, %Key{} = source_key} ->
|
||||
target_ids = Enum.uniq(target_post_ids)
|
||||
source_vector = decode_vector(source_key.vector)
|
||||
|
||||
scores =
|
||||
Repo.all(
|
||||
@@ -452,46 +454,18 @@ defmodule BDS.Embeddings do
|
||||
if enabled_for_project?(project_id) do
|
||||
on_progress = progress_callback(opts)
|
||||
dismissed = dismissed_pair_keys(project_id)
|
||||
entries = load_index_entries(project_id)
|
||||
|
||||
pairs =
|
||||
case duplicate_pairs_with_rebuild(project_id, entries, on_progress) do
|
||||
{:ok, pairs} -> pairs
|
||||
{:error, :missing} -> []
|
||||
end
|
||||
|
||||
duplicates =
|
||||
case Index.duplicate_pairs(project_id, @duplicate_threshold, on_progress: on_progress) do
|
||||
{:ok, pairs} ->
|
||||
pairs
|
||||
|> Enum.reject(fn pair -> pair_key(pair.post_id_a, pair.post_id_b) in dismissed end)
|
||||
|> enrich_duplicate_pairs(project_id)
|
||||
|
||||
{:error, :missing} ->
|
||||
keys =
|
||||
Repo.all(
|
||||
from key in Key,
|
||||
where: key.project_id == ^project_id,
|
||||
order_by: [asc: key.post_id]
|
||||
)
|
||||
|
||||
total_keys = length(keys)
|
||||
|
||||
:ok = report_rebuild_started(on_progress, total_keys, "embedding entries")
|
||||
|
||||
keys
|
||||
|> Enum.with_index(1)
|
||||
|> Enum.flat_map(fn {left, index} ->
|
||||
:ok = report_rebuild_progress(on_progress, index, total_keys, "embedding entries")
|
||||
|
||||
for right <- keys,
|
||||
left.post_id < right.post_id,
|
||||
pair_key(left.post_id, right.post_id) not in dismissed,
|
||||
similarity =
|
||||
cosine_similarity(decode_vector(left.vector), decode_vector(right.vector)),
|
||||
similarity >= @duplicate_threshold do
|
||||
%{
|
||||
post_id_a: left.post_id,
|
||||
post_id_b: right.post_id,
|
||||
score: similarity
|
||||
}
|
||||
end
|
||||
end)
|
||||
|> enrich_duplicate_pairs(project_id)
|
||||
end
|
||||
pairs
|
||||
|> Enum.reject(fn pair -> pair_key(pair.post_id_a, pair.post_id_b) in dismissed end)
|
||||
|> enrich_duplicate_pairs(project_id)
|
||||
|
||||
:ok = report_rebuild_phase(on_progress, 0.99, "Resolving duplicate candidates")
|
||||
{:ok, duplicates}
|
||||
@@ -555,17 +529,33 @@ defmodule BDS.Embeddings do
|
||||
with {:ok, post} <- fetch_post(post_id) do
|
||||
if enabled_for_project?(post.project_id) do
|
||||
:ok = ensure_key(post)
|
||||
|
||||
case Repo.get_by(Key, post_id: post.id, project_id: post.project_id) do
|
||||
nil -> {:ok, post, []}
|
||||
key -> {:ok, post, decode_vector(key.vector)}
|
||||
end
|
||||
{:ok, post, Repo.get_by(Key, post_id: post.id, project_id: post.project_id)}
|
||||
else
|
||||
{:disabled, post.project_id}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
defp duplicate_pairs_with_rebuild(project_id, entries, on_progress) do
|
||||
case Index.duplicate_pairs(project_id, entries, @duplicate_threshold, on_progress: on_progress) do
|
||||
{:ok, pairs} ->
|
||||
{:ok, pairs}
|
||||
|
||||
{:error, :missing} ->
|
||||
:ok = rebuild_snapshot(project_id)
|
||||
Index.duplicate_pairs(project_id, entries, @duplicate_threshold, on_progress: on_progress)
|
||||
end
|
||||
end
|
||||
|
||||
defp load_index_entries(project_id) do
|
||||
Repo.all(
|
||||
from key in Key,
|
||||
where: key.project_id == ^project_id,
|
||||
order_by: [asc: key.post_id]
|
||||
)
|
||||
|> Enum.map(fn key -> %{label: key.label, post_id: key.post_id, vector: key.vector} end)
|
||||
end
|
||||
|
||||
defp ensure_key(%Post{} = post) do
|
||||
case Repo.get_by(Key, post_id: post.id, project_id: post.project_id) do
|
||||
nil -> sync_post(post)
|
||||
@@ -704,7 +694,7 @@ defmodule BDS.Embeddings do
|
||||
end
|
||||
|
||||
defp rebuild_snapshot(project_id) do
|
||||
Index.rebuild(project_id, model_id: model_id(), dimensions: dimensions())
|
||||
Index.put(project_id, dimensions(), load_index_entries(project_id))
|
||||
end
|
||||
|
||||
defp progress_callback(opts), do: ProgressReporter.callback(opts)
|
||||
@@ -729,13 +719,6 @@ defmodule BDS.Embeddings do
|
||||
defp report_rebuild_phase(callback, value, label),
|
||||
do: ProgressReporter.report_phase(callback, value, label)
|
||||
|
||||
defp snapshot_content_hash(project_id, post_id) do
|
||||
case Index.read(project_id) do
|
||||
{:ok, snapshot} -> get_in(snapshot, ["entries", post_id, "content_hash"])
|
||||
_other -> nil
|
||||
end
|
||||
end
|
||||
|
||||
defp current_embedding_status(nil, _expected_hash), do: "missing"
|
||||
|
||||
defp current_embedding_status(%Key{vector: vector}, _expected_hash) when vector in [nil, ""],
|
||||
|
||||
@@ -1,220 +1,342 @@
|
||||
defmodule BDS.Embeddings.Index do
|
||||
@moduledoc false
|
||||
@moduledoc """
|
||||
Per-project approximate-nearest-neighbour index over post embeddings.
|
||||
|
||||
import Ecto.Query
|
||||
Backed by an HNSW graph (hnswlib) per the A1-14b / `specs/embedding.allium`
|
||||
requirement — cosine space, connectivity M=16, efConstruction=128,
|
||||
efSearch=64. This replaces the previous O(n²) brute-force cosine snapshot:
|
||||
building is O(n·log n) and queries are O(log n).
|
||||
|
||||
The process is intentionally **database-free**: callers (running in their own
|
||||
process, e.g. under the test SQL sandbox) read embedding vectors from the DB
|
||||
and hand them in. This GenServer owns only the in-memory HNSW graphs, the
|
||||
`label → post_id` maps, and file persistence.
|
||||
|
||||
Persistence (DebouncedPersistence invariant): the index file
|
||||
(`embeddings.usearch`) plus a small sidecar holding the dimension and the
|
||||
label→post_id map are written behind a 5s debounce, and force-saved on
|
||||
project switch / shutdown. On a cold query the index is lazily reloaded from
|
||||
those files; if they are absent the caller rebuilds from the DB vectors.
|
||||
"""
|
||||
|
||||
use GenServer
|
||||
|
||||
alias BDS.Persistence
|
||||
alias BDS.Embeddings.Key
|
||||
alias BDS.Projects
|
||||
alias BDS.ProgressReporter
|
||||
alias BDS.Repo
|
||||
|
||||
@neighbor_limit 21
|
||||
@debounce_ms 5_000
|
||||
@space :cosine
|
||||
@m 16
|
||||
@ef_construction 128
|
||||
@ef_search 64
|
||||
|
||||
# ─── Public API ─────────────────────────────────────────────
|
||||
|
||||
def start_link(opts \\ []) do
|
||||
GenServer.start_link(__MODULE__, opts, name: __MODULE__)
|
||||
end
|
||||
|
||||
@doc "On-disk path of the HNSW index file for a project."
|
||||
def path(project_id) when is_binary(project_id) do
|
||||
Path.join(Projects.project_cache_dir(project_id), "embeddings.usearch")
|
||||
end
|
||||
|
||||
def rebuild(project_id, opts) when is_binary(project_id) and is_list(opts) do
|
||||
model_id = Keyword.fetch!(opts, :model_id)
|
||||
dimensions = Keyword.fetch!(opts, :dimensions)
|
||||
|
||||
keys =
|
||||
Repo.all(
|
||||
from key in Key,
|
||||
where: key.project_id == ^project_id,
|
||||
order_by: [asc: key.post_id]
|
||||
)
|
||||
|
||||
entries =
|
||||
keys
|
||||
|> Enum.map(fn key ->
|
||||
vector = decode_vector(key.vector)
|
||||
|
||||
{key.post_id,
|
||||
%{
|
||||
"label" => key.label,
|
||||
"content_hash" => key.content_hash,
|
||||
"neighbors" => neighbor_entries(keys, key, vector)
|
||||
}}
|
||||
end)
|
||||
|> Map.new()
|
||||
|
||||
payload = %{
|
||||
"project_id" => project_id,
|
||||
"model_id" => model_id,
|
||||
"dimensions" => dimensions,
|
||||
"updated_at" => Persistence.now_ms(),
|
||||
"entries" => entries
|
||||
}
|
||||
|
||||
write_snapshot(path(project_id), payload, project_id)
|
||||
@doc """
|
||||
(Re)builds the index for a project from the given entries and schedules a
|
||||
debounced save. `entries` is a list of `%{label:, post_id:, vector:}` where
|
||||
`vector` is the packed little-endian Float32 BLOB.
|
||||
"""
|
||||
def put(project_id, dimensions, entries)
|
||||
when is_binary(project_id) and is_integer(dimensions) and is_list(entries) do
|
||||
GenServer.call(__MODULE__, {:put, project_id, dimensions, entries}, :infinity)
|
||||
end
|
||||
|
||||
def read(project_id) when is_binary(project_id) do
|
||||
project_id
|
||||
|> candidate_paths()
|
||||
|> read_snapshot_paths()
|
||||
@doc """
|
||||
Returns up to `limit` nearest neighbours of `query_vector` (the post's packed
|
||||
BLOB), excluding `query_label`. `{:error, :missing}` if no index is available.
|
||||
"""
|
||||
def neighbors(project_id, query_label, query_vector, limit)
|
||||
when is_binary(project_id) and is_integer(query_label) and is_binary(query_vector) do
|
||||
GenServer.call(__MODULE__, {:neighbors, project_id, query_label, query_vector, limit}, :infinity)
|
||||
end
|
||||
|
||||
def neighbors(project_id, post_id, limit) when is_binary(project_id) and is_binary(post_id) do
|
||||
with {:ok, snapshot} <- read(project_id),
|
||||
%{} = entry <- get_in(snapshot, ["entries", post_id]) do
|
||||
entry
|
||||
|> Map.get("neighbors", [])
|
||||
|> Enum.take(max(limit, 0))
|
||||
|> Enum.map(fn neighbor ->
|
||||
%{
|
||||
post_id: neighbor["post_id"],
|
||||
score: neighbor["score"]
|
||||
}
|
||||
end)
|
||||
|> then(&{:ok, &1})
|
||||
else
|
||||
_ -> {:error, :missing}
|
||||
@doc """
|
||||
Finds near-duplicate pairs at/above `threshold` by querying the HNSW graph for
|
||||
each entry's neighbours. `{:error, :missing}` if no index is available.
|
||||
"""
|
||||
def duplicate_pairs(project_id, entries, threshold, opts \\ [])
|
||||
when is_binary(project_id) and is_list(entries) and is_number(threshold) do
|
||||
GenServer.call(
|
||||
__MODULE__,
|
||||
{:duplicate_pairs, project_id, entries, threshold, opts},
|
||||
:infinity
|
||||
)
|
||||
end
|
||||
|
||||
@doc "Forces a pending save for a project to disk now (e.g. on project switch)."
|
||||
def flush(project_id) when is_binary(project_id) do
|
||||
GenServer.call(__MODULE__, {:flush, project_id}, :infinity)
|
||||
end
|
||||
|
||||
@doc "Forces all pending saves to disk now (e.g. on shutdown)."
|
||||
def flush_all do
|
||||
GenServer.call(__MODULE__, :flush_all, :infinity)
|
||||
end
|
||||
|
||||
@doc "Drops the in-memory index for a project (e.g. on project deletion)."
|
||||
def forget(project_id) when is_binary(project_id) do
|
||||
GenServer.call(__MODULE__, {:forget, project_id}, :infinity)
|
||||
end
|
||||
|
||||
# ─── GenServer ──────────────────────────────────────────────
|
||||
|
||||
@impl true
|
||||
def init(_opts) do
|
||||
Process.flag(:trap_exit, true)
|
||||
{:ok, %{}}
|
||||
end
|
||||
|
||||
@impl true
|
||||
def handle_call({:put, project_id, dimensions, entries}, _from, state) do
|
||||
entry = build_entry(dimensions, entries)
|
||||
state = state |> Map.put(project_id, entry) |> schedule_save(project_id)
|
||||
{:reply, :ok, state}
|
||||
end
|
||||
|
||||
def handle_call({:neighbors, project_id, query_label, query_vector, limit}, _from, state) do
|
||||
case ensure_loaded(state, project_id) do
|
||||
{:ok, %{index: nil}, state} ->
|
||||
{:reply, {:error, :missing}, state}
|
||||
|
||||
{:ok, entry, state} ->
|
||||
{:reply, {:ok, query_neighbors(entry, query_label, query_vector, limit)}, state}
|
||||
|
||||
{:missing, state} ->
|
||||
{:reply, {:error, :missing}, state}
|
||||
end
|
||||
end
|
||||
|
||||
def duplicate_pairs(project_id, threshold, opts \\ []) when is_binary(project_id) do
|
||||
with {:ok, snapshot} <- read(project_id) do
|
||||
entries = Map.get(snapshot, "entries", %{})
|
||||
entry_count = map_size(entries)
|
||||
on_progress = progress_callback(opts)
|
||||
def handle_call({:duplicate_pairs, project_id, entries, threshold, opts}, _from, state) do
|
||||
case ensure_loaded(state, project_id) do
|
||||
{:ok, %{index: nil}, state} ->
|
||||
{:reply, {:error, :missing}, state}
|
||||
|
||||
:ok = report_scan_started(on_progress, entry_count, "embedding entries")
|
||||
{:ok, entry, state} ->
|
||||
{:reply, {:ok, scan_duplicates(entry, entries, threshold, opts)}, state}
|
||||
|
||||
pairs =
|
||||
entries
|
||||
|> Enum.with_index(1)
|
||||
|> Enum.flat_map(fn {{post_id, entry}, index} ->
|
||||
:ok = report_scan_progress(on_progress, index, entry_count, "embedding entries")
|
||||
|
||||
entry
|
||||
|> Map.get("neighbors", [])
|
||||
|> Enum.filter(&(&1["score"] >= threshold))
|
||||
|> Enum.map(fn neighbor ->
|
||||
{post_id_a, post_id_b} = sort_pair(post_id, neighbor["post_id"])
|
||||
|
||||
{{post_id_a, post_id_b},
|
||||
%{
|
||||
post_id_a: post_id_a,
|
||||
post_id_b: post_id_b,
|
||||
score: neighbor["score"]
|
||||
}}
|
||||
end)
|
||||
end)
|
||||
|> Map.new()
|
||||
|> Map.values()
|
||||
|> Enum.sort_by(& &1.score, :desc)
|
||||
|
||||
{:ok, pairs}
|
||||
else
|
||||
_ -> {:error, :missing}
|
||||
{:missing, state} ->
|
||||
{:reply, {:error, :missing}, state}
|
||||
end
|
||||
end
|
||||
|
||||
defp neighbor_entries(keys, current_key, current_vector) do
|
||||
keys
|
||||
|> Enum.reject(&(&1.post_id == current_key.post_id))
|
||||
|> Enum.map(fn other_key ->
|
||||
%{
|
||||
"post_id" => other_key.post_id,
|
||||
"label" => other_key.label,
|
||||
"score" => cosine_similarity(current_vector, decode_vector(other_key.vector))
|
||||
}
|
||||
end)
|
||||
|> Enum.sort_by(& &1["score"], :desc)
|
||||
|> Enum.take(@neighbor_limit)
|
||||
def handle_call({:flush, project_id}, _from, state) do
|
||||
{:reply, :ok, save_now(state, project_id)}
|
||||
end
|
||||
|
||||
defp write_snapshot(snapshot_path, payload, project_id) do
|
||||
:ok = Persistence.atomic_write(snapshot_path, Jason.encode!(payload))
|
||||
legacy_path = legacy_path(snapshot_path)
|
||||
def handle_call(:flush_all, _from, state) do
|
||||
state = Enum.reduce(Map.keys(state), state, &save_now(&2, &1))
|
||||
{:reply, :ok, state}
|
||||
end
|
||||
|
||||
if File.exists?(legacy_path) do
|
||||
File.rm(legacy_path)
|
||||
def handle_call({:forget, project_id}, _from, state) do
|
||||
case Map.get(state, project_id) do
|
||||
%{timer: timer} when is_reference(timer) -> Process.cancel_timer(timer)
|
||||
_other -> :ok
|
||||
end
|
||||
|
||||
cleanup_legacy_project_snapshots(project_id, snapshot_path)
|
||||
{:reply, :ok, Map.delete(state, project_id)}
|
||||
end
|
||||
|
||||
@impl true
|
||||
def handle_info({:save, project_id}, state) do
|
||||
{:noreply, save_now(state, project_id)}
|
||||
end
|
||||
|
||||
def handle_info(_message, state), do: {:noreply, state}
|
||||
|
||||
@impl true
|
||||
def terminate(_reason, state) do
|
||||
Enum.each(Map.keys(state), &save_now(state, &1))
|
||||
:ok
|
||||
end
|
||||
|
||||
defp candidate_paths(project_id) do
|
||||
current_snapshot_path = path(project_id)
|
||||
legacy_project_snapshot_path = legacy_project_snapshot_path(project_id)
|
||||
# ─── Build / query ──────────────────────────────────────────
|
||||
|
||||
[
|
||||
current_snapshot_path,
|
||||
legacy_path(current_snapshot_path),
|
||||
legacy_project_snapshot_path,
|
||||
legacy_project_snapshot_path && legacy_path(legacy_project_snapshot_path)
|
||||
]
|
||||
|> Enum.filter(&is_binary/1)
|
||||
|> Enum.uniq()
|
||||
defp build_entry(dimensions, []), do: %{index: nil, labels: %{}, dim: dimensions, timer: nil}
|
||||
|
||||
defp build_entry(dimensions, entries) do
|
||||
count = length(entries)
|
||||
{:ok, index} = HNSWLib.Index.new(@space, dimensions, count, m: @m, ef_construction: @ef_construction)
|
||||
:ok = HNSWLib.Index.set_ef(index, @ef_search)
|
||||
|
||||
tensor =
|
||||
entries
|
||||
|> Enum.map(& &1.vector)
|
||||
|> IO.iodata_to_binary()
|
||||
|> Nx.from_binary(:f32)
|
||||
|> Nx.reshape({count, dimensions})
|
||||
|
||||
:ok = HNSWLib.Index.add_items(index, tensor, ids: Enum.map(entries, & &1.label))
|
||||
|
||||
%{
|
||||
index: index,
|
||||
labels: Map.new(entries, &{&1.label, &1.post_id}),
|
||||
dim: dimensions,
|
||||
timer: nil
|
||||
}
|
||||
end
|
||||
|
||||
defp read_snapshot_paths([]), do: {:error, :missing}
|
||||
defp query_neighbors(%{index: index, labels: labels}, query_label, query_vector, limit) do
|
||||
case query(index, query_vector, limit + 1) do
|
||||
[] ->
|
||||
[]
|
||||
|
||||
defp read_snapshot_paths([snapshot_path | rest]) do
|
||||
case File.read(snapshot_path) do
|
||||
{:ok, contents} -> {:ok, Jason.decode!(contents)}
|
||||
{:error, :enoent} -> read_snapshot_paths(rest)
|
||||
{:error, reason} -> {:error, reason}
|
||||
results ->
|
||||
results
|
||||
|> Enum.reject(fn {label, _score} -> label == query_label end)
|
||||
|> Enum.map(fn {label, score} -> %{post_id: Map.get(labels, label), score: score} end)
|
||||
|> Enum.reject(&is_nil(&1.post_id))
|
||||
|> Enum.take(max(limit, 0))
|
||||
end
|
||||
end
|
||||
|
||||
defp cleanup_legacy_project_snapshots(project_id, snapshot_path) do
|
||||
current_paths = [snapshot_path, legacy_path(snapshot_path)]
|
||||
defp scan_duplicates(%{index: index, labels: labels}, entries, threshold, opts) do
|
||||
on_progress = ProgressReporter.callback(opts)
|
||||
total = length(entries)
|
||||
:ok = report_scan_started(on_progress, total, "embedding entries")
|
||||
|
||||
project_id
|
||||
|> legacy_project_snapshot_path()
|
||||
|> then(fn legacy_snapshot_path ->
|
||||
[legacy_snapshot_path, legacy_snapshot_path && legacy_path(legacy_snapshot_path)]
|
||||
end)
|
||||
|> Enum.filter(&is_binary/1)
|
||||
|> Enum.reject(&(&1 in current_paths))
|
||||
|> Enum.each(fn legacy_snapshot_path ->
|
||||
if File.exists?(legacy_snapshot_path) do
|
||||
File.rm(legacy_snapshot_path)
|
||||
end
|
||||
entries
|
||||
|> Enum.with_index(1)
|
||||
|> Enum.flat_map(fn {entry, position} ->
|
||||
:ok = report_scan_progress(on_progress, position, total, "embedding entries")
|
||||
|
||||
index
|
||||
|> query(entry.vector, @neighbor_limit)
|
||||
|> Enum.reject(fn {label, _score} -> label == entry.label end)
|
||||
|> Enum.map(fn {label, score} -> {Map.get(labels, label), score} end)
|
||||
|> Enum.filter(fn {post_id, score} -> not is_nil(post_id) and score >= threshold end)
|
||||
|> Enum.map(fn {other_post_id, score} ->
|
||||
{post_id_a, post_id_b} = sort_pair(entry.post_id, other_post_id)
|
||||
{{post_id_a, post_id_b}, %{post_id_a: post_id_a, post_id_b: post_id_b, score: score}}
|
||||
end)
|
||||
end)
|
||||
|> Map.new()
|
||||
|> Map.values()
|
||||
|> Enum.sort_by(& &1.score, :desc)
|
||||
end
|
||||
|
||||
defp legacy_project_snapshot_path(project_id) do
|
||||
case Projects.get_project(project_id) do
|
||||
nil -> nil
|
||||
project -> Path.join(Projects.project_data_dir(project), "embeddings.usearch")
|
||||
# Runs a knn query and returns [{label, similarity}] sorted by descending
|
||||
# similarity. Cosine distance is converted to similarity as max(0, 1 - d).
|
||||
defp query(index, query_vector, k) do
|
||||
case HNSWLib.Index.get_current_count(index) do
|
||||
{:ok, count} when count > 0 ->
|
||||
clamped = min(k, count)
|
||||
|
||||
case HNSWLib.Index.knn_query(index, query_vector, k: clamped) do
|
||||
{:ok, labels, distances} ->
|
||||
Enum.zip(
|
||||
Nx.to_flat_list(labels),
|
||||
Enum.map(Nx.to_flat_list(distances), fn distance -> max(0.0, 1.0 - distance) end)
|
||||
)
|
||||
|
||||
{:error, _reason} ->
|
||||
[]
|
||||
end
|
||||
|
||||
_other ->
|
||||
[]
|
||||
end
|
||||
end
|
||||
|
||||
defp legacy_path(snapshot_path) do
|
||||
Path.join(Path.dirname(snapshot_path), "embeddings.index.json")
|
||||
# ─── Persistence ────────────────────────────────────────────
|
||||
|
||||
defp schedule_save(state, project_id) do
|
||||
entry = Map.fetch!(state, project_id)
|
||||
if is_reference(entry.timer), do: Process.cancel_timer(entry.timer)
|
||||
timer = Process.send_after(self(), {:save, project_id}, @debounce_ms)
|
||||
Map.put(state, project_id, %{entry | timer: timer})
|
||||
end
|
||||
|
||||
# Vectors are stored as a packed little-endian Float32 BLOB; see
|
||||
# BDS.Embeddings and the VectorCacheInDb invariant in embedding.allium.
|
||||
defp decode_vector(nil), do: []
|
||||
defp decode_vector(<<>>), do: []
|
||||
defp save_now(state, project_id) do
|
||||
case Map.get(state, project_id) do
|
||||
nil ->
|
||||
state
|
||||
|
||||
defp decode_vector(binary) when is_binary(binary) do
|
||||
for <<value::float-32-little <- binary>>, do: value
|
||||
entry ->
|
||||
if is_reference(entry.timer), do: Process.cancel_timer(entry.timer)
|
||||
persist(project_id, entry)
|
||||
Map.put(state, project_id, %{entry | timer: nil})
|
||||
end
|
||||
end
|
||||
|
||||
defp cosine_similarity([], _other), do: 0.0
|
||||
defp cosine_similarity(_vector, []), do: 0.0
|
||||
defp persist(_project_id, %{index: nil}), do: :ok
|
||||
|
||||
defp cosine_similarity(left, right) do
|
||||
Enum.zip(left, right)
|
||||
|> Enum.reduce(0.0, fn {left_value, right_value}, acc -> acc + left_value * right_value end)
|
||||
|> max(0.0)
|
||||
defp persist(project_id, %{index: index, labels: labels, dim: dim}) do
|
||||
index_path = path(project_id)
|
||||
File.mkdir_p!(Path.dirname(index_path))
|
||||
HNSWLib.Index.save_index(index, index_path)
|
||||
write_meta(index_path, dim, labels)
|
||||
:ok
|
||||
rescue
|
||||
_exception -> :ok
|
||||
end
|
||||
|
||||
defp write_meta(index_path, dim, labels) do
|
||||
payload = %{
|
||||
"dim" => dim,
|
||||
"labels" => Enum.map(labels, fn {label, post_id} -> [label, post_id] end)
|
||||
}
|
||||
|
||||
File.write(meta_path(index_path), Jason.encode!(payload))
|
||||
end
|
||||
|
||||
defp ensure_loaded(state, project_id) do
|
||||
case Map.get(state, project_id) do
|
||||
nil ->
|
||||
case load_from_disk(project_id) do
|
||||
{:ok, entry} -> {:ok, entry, Map.put(state, project_id, entry)}
|
||||
:error -> {:missing, state}
|
||||
end
|
||||
|
||||
entry ->
|
||||
{:ok, entry, state}
|
||||
end
|
||||
end
|
||||
|
||||
defp load_from_disk(project_id) do
|
||||
index_path = path(project_id)
|
||||
|
||||
with {:ok, %{dim: dim, labels: labels}} <- read_meta(index_path),
|
||||
true <- File.exists?(index_path),
|
||||
{:ok, index} <- HNSWLib.Index.load_index(@space, dim, index_path) do
|
||||
:ok = HNSWLib.Index.set_ef(index, @ef_search)
|
||||
{:ok, %{index: index, labels: labels, dim: dim, timer: nil}}
|
||||
else
|
||||
_other -> :error
|
||||
end
|
||||
rescue
|
||||
_exception -> :error
|
||||
end
|
||||
|
||||
defp read_meta(index_path) do
|
||||
with {:ok, contents} <- File.read(meta_path(index_path)),
|
||||
{:ok, %{"dim" => dim, "labels" => labels}} <- Jason.decode(contents) do
|
||||
{:ok,
|
||||
%{
|
||||
dim: dim,
|
||||
labels: Map.new(labels, fn [label, post_id] -> {label, post_id} end)
|
||||
}}
|
||||
else
|
||||
_other -> :error
|
||||
end
|
||||
end
|
||||
|
||||
defp meta_path(index_path), do: index_path <> ".meta.json"
|
||||
|
||||
defp sort_pair(post_id_a, post_id_b) when post_id_a <= post_id_b, do: {post_id_a, post_id_b}
|
||||
defp sort_pair(post_id_a, post_id_b), do: {post_id_b, post_id_a}
|
||||
|
||||
defp progress_callback(opts), do: ProgressReporter.callback(opts)
|
||||
|
||||
defp report_scan_started(callback, total, label) do
|
||||
ProgressReporter.report_count_started(callback, total, label,
|
||||
verb: "Scanning",
|
||||
|
||||
@@ -148,6 +148,9 @@ defmodule BDS.Projects do
|
||||
project ->
|
||||
now = Persistence.now_ms()
|
||||
|
||||
previous_active_id =
|
||||
Repo.one(from p in Project, where: p.is_active == true, select: p.id)
|
||||
|
||||
Repo.transaction(fn ->
|
||||
Repo.update_all(
|
||||
from(p in Project, where: p.is_active == true),
|
||||
@@ -159,8 +162,16 @@ defmodule BDS.Projects do
|
||||
|> Repo.update!()
|
||||
end)
|
||||
|> case do
|
||||
{:ok, active_project} -> {:ok, active_project}
|
||||
{:error, reason} -> {:error, reason}
|
||||
{:ok, active_project} ->
|
||||
# Force-save the outgoing project's embedding index (DebouncedPersistence).
|
||||
if is_binary(previous_active_id) and previous_active_id != active_project.id do
|
||||
BDS.Embeddings.Index.flush(previous_active_id)
|
||||
end
|
||||
|
||||
{:ok, active_project}
|
||||
|
||||
{:error, reason} ->
|
||||
{:error, reason}
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -194,6 +205,8 @@ defmodule BDS.Projects do
|
||||
end)
|
||||
|> case do
|
||||
{:ok, deleted_project} ->
|
||||
BDS.Embeddings.Index.forget(deleted_project.id)
|
||||
|
||||
Enum.each(cleanup_dirs, fn dir ->
|
||||
_ = File.rm_rf(dir)
|
||||
end)
|
||||
|
||||
Reference in New Issue
Block a user