-- allium: 1
-- bDS Semantic Similarity / Embeddings
-- Scope: extension (Bucket D — Embeddings + Duplicate Detection)
-- Distilled from: src/main/engine/EmbeddingEngine.ts
--
-- Specifies the local embedding model used for semantic similarity.
-- Everything runs on-device and is independent of AI endpoints.
-- The feature is gated by the semanticSimilarityEnabled project setting.

use "./post.allium" as post
use "./tag.allium" as tag

-- Exposes the embedding model's identifier and dimensionality.
surface EmbeddingModelSurface {
    context model: EmbeddingModel

    exposes:
        model.model_id
        model.dimensions
}

-- Post lifecycle events offered to the embedding runtime.
surface EmbeddingRuntimeSurface {
    facing _: EmbeddingRuntime

    provides:
        PostCreated(post)
        PostUpdated(post)
        PostDeleted(post)
}

-- Requests offered to the embedding operator: indexing, similarity
-- queries, tag suggestions and duplicate handling.
surface EmbeddingControlSurface {
    facing _: EmbeddingOperator

    provides:
        ReindexAllRequested(project)
        IndexUnindexedRequested(project)
        FindSimilarRequested(post, limit)
        ComputeSimilaritiesRequested(source_post, target_post_ids)
        SuggestTagsRequested(post, input_text)
        FindDuplicatesRequested(project)
        DismissDuplicatePairRequested(post_a, post_b)
}

-- ─── Model ──────────────────────────────────────────────────

value EmbeddingModel {
    -- Model: multilingual-e5-small, 384-dimensional sentence embeddings.
    -- Source: model files are obtained from an external model source and
    --   cached locally; downloaded on first use, cached in the app data
    --   directory.
    -- Loading: lazy — the pipeline is created on the first embedding
    --   request, not at startup.
    -- Preprocessing: all input is prefixed with "query: " (e5 convention).
    -- Pooling: mean pooling + L2 normalization.
    model_id: String        -- "Xenova/multilingual-e5-small"
    dimensions: Integer     -- 384
}

value EmbeddingVector {
    dimensions: Integer     -- 384 (multilingual-e5-small)
    values: List
}

-- ─── Entities ───────────────────────────────────────────────

entity EmbeddingKey {
    label: Integer          -- HNSW label for USearch
    post: post/Post
    content_hash: String    -- SHA-256 of "{title}\n\n{content}"
    vector: EmbeddingVector
}

entity DismissedDuplicatePair {
    post_a: post/Post
    post_b: post/Post
    -- The two IDs are stored in canonical (sorted) order for dedup.
}

-- ─── USearch HNSW Index ─────────────────────────────────────

config {
    model_id: String = "Xenova/multilingual-e5-small"
    embedding_dimensions: Integer = 384
    hnsw_metric: String = "cosine"
    hnsw_connectivity: Integer = 16         -- M parameter
    hnsw_expansion_add: Integer = 128       -- efConstruction
    hnsw_expansion_search: Integer = 64     -- efSearch
    debounce_persist: Duration = 5.seconds
    -- Index file location: {userData}/projects/{projectId}/embeddings.usearch
    -- The key mapping is persisted alongside the embedding records.
}

-- ─── Gating ─────────────────────────────────────────────────

invariant SemanticSimilarityGate {
    -- Every embedding operation is gated by semanticSimilarityEnabled in
    -- project metadata.
    --   Disabled:    no posts are indexed; queries return empty results.
    --   Toggled on:  triggers IndexUnindexed to backfill all posts.
    --   Toggled off: the index remains on disk but is not queried.
}

-- ─── Event-driven indexing ──────────────────────────────────

-- Post lifecycle events trigger embedding updates automatically.
-- See engine_side_effects.allium for the trigger points.
-- Embeds a post when it is created or updated, unless its stored
-- content hash already matches the current title + content.
rule EmbedPost {
    when: PostCreated(post) or PostUpdated(post)
    requires: semantic_similarity_enabled

    let hash = sha256(format("{title}\n\n{content}", title: post.title, content: post.content))
    let existing = EmbeddingKey{post: post}

    if not exists existing or existing.content_hash != hash:
        -- Compute embedding vector via local model
        -- Upsert into USearch index + embedding_keys DB table
        -- Debounced index save (5s)
        ensures: EmbeddingKeyUpdated(post)
}

-- Drops a post's embedding when the post is deleted.
rule RemovePostEmbedding {
    when: PostDeleted(post)
    requires: semantic_similarity_enabled

    ensures: EmbeddingKeyRemoved(post)
}

-- ─── Batch indexing ─────────────────────────────────────────

rule ReindexAll {
    when: ReindexAllRequested(project)
    requires: semantic_similarity_enabled

    -- Re-embeds all posts, rebuilds HNSW index from scratch
    for p in project.posts:
        ensures: EmbeddingKeyUpdated(p)

    ensures: HnswIndexRebuilt(project)
}

rule IndexUnindexed {
    when: IndexUnindexedRequested(project)
    requires: semantic_similarity_enabled

    -- Triggered at app startup (if enabled) and when setting toggled on
    -- Only embeds posts without existing embeddings or with changed content_hash
    -- Runs as background task with progress reporting
    for p in project.posts:
        -- The hash is derived exactly as in EmbedPost (SHA-256 of
        -- "{title}\n\n{content}") so that it is comparable with the stored
        -- EmbeddingKey.content_hash; comparing against a differently
        -- derived value would make unchanged posts look changed.
        let hash = sha256(format("{title}\n\n{content}", title: p.title, content: p.content))
        let existing = EmbeddingKey{post: p}

        if not exists existing or existing.content_hash != hash:
            ensures: EmbeddingKeyUpdated(p)
}

-- ─── Query operations ───────────────────────────────────────

rule FindSimilar {
    when: FindSimilarRequested(post, limit)
    requires: semantic_similarity_enabled

    -- HNSW approximate nearest neighbor search via USearch
    -- Searches index for (limit + 1) neighbors, excludes self
    -- Converts USearch cosine distance to similarity: max(0, 1 - distance)
    -- Returns ranked list sorted by descending similarity
    ensures: SimilarPostsResult(post, ranked_matches)
}

rule ComputeSimilarities {
    when: ComputeSimilaritiesRequested(source_post, target_post_ids)
    requires: semantic_similarity_enabled

    -- Exact pairwise cosine similarity between source vector and each target vector
    -- Uses in-memory vector cache, NOT USearch search
    -- Returns map of post_id -> similarity score
    -- Used by InsertPostLinkModal to rank FTS search results
    ensures: SimilarityScoresResult(source_post, scores)
}

rule SuggestTags {
    when: SuggestTagsRequested(post, input_text)
    requires: semantic_similarity_enabled

    -- 1. Find 10 most similar posts via HNSW search
    -- 2. Collect all tags from those posts
    -- 3. Weight tags by similarity score of the post they came from
    -- 4. Return top 5 tags by weighted score
    -- Used by tag input component for autocomplete suggestions
    ensures: TagSuggestionResult(post, suggested_tags)
}

rule FindDuplicates {
    when: FindDuplicatesRequested(project)
    requires: semantic_similarity_enabled

    -- Finds near-duplicate post pairs above similarity threshold
    -- For each indexed post: search 21 nearest neighbors
    -- Pairs above 0.92 threshold kept, dismissed pairs excluded
    -- At 100% embedding similarity: loads post bodies for exact match check
    -- Results sorted: exact matches first, then descending similarity
    let all_pairs = compute_all_similarities(project)
    let above_threshold = filter_above_threshold(all_pairs)
    let pairs = exclude_dismissed(above_threshold, DismissedDuplicatePairs)

    ensures: DuplicateReport(pairs)
}

rule DismissDuplicatePair {
    when: DismissDuplicatePairRequested(post_a, post_b)

    -- Stores with canonical ID ordering for consistent dedup
    -- NOTE(review): the ensures clause passes post_a/post_b through as
    -- given; the canonical ordering is stated only in comments — confirm
    -- it is enforced where the pair is persisted.
    ensures: DismissedDuplicatePair.created(post_a: post_a, post_b: post_b)
}

-- ─── Invariants ─────────────────────────────────────────────

invariant ContentHashSkipsUnchanged {
    -- If a post's content_hash matches the stored embedding's content_hash,
    -- the post is not re-embedded. This makes bulk re-indexing efficient.
}

invariant DebouncedPersistence {
    -- USearch index persistence is debounced at 5 seconds
    -- Prevents excessive disk I/O during bulk operations
    -- Index also force-saved on project switch and app shutdown
}

invariant VectorCacheInDb {
    -- Vector cache persisted as BLOB in embedding_keys table
    -- Float32Array, 384 dimensions per vector (1536 bytes)
    -- Enables instant reload without re-embedding
}

invariant ModelCaching {
    -- Model files (~100 MB) downloaded from Hugging Face Hub on first use
    -- Cached in app data directory, persists across sessions
    -- Model pipeline stays loaded across project switches (one model, many indexes)
}

invariant ProjectIsolation {
    -- Each project has its own USearch index file and embedding_keys rows
    -- On project switch: save current index, load new project's index
    -- Model pipeline shared across projects (not reloaded)
}