235 lines
9.3 KiB
Plaintext
235 lines
9.3 KiB
Plaintext
-- allium: 1
|
|
-- bDS Semantic Similarity / Embeddings
|
|
-- Scope: extension (Bucket D — Embeddings + Duplicate Detection)
|
|
-- Distilled from: src/main/engine/EmbeddingEngine.ts
|
|
|
|
-- Local embedding model for semantic similarity. Runs entirely on-device,
|
|
-- independent of AI endpoints. Gated by semanticSimilarityEnabled project setting.
|
|
|
|
use "./post.allium" as post
|
|
use "./tag.allium" as tag
|
|
|
|
-- Read-only view of the embedding model's identity.
-- Binds an EmbeddingModel as `model` and exposes its identifier and vector
-- dimensionality; this surface offers no triggers.
surface EmbeddingModelSurface {
    context model: EmbeddingModel

    exposes:
        model.model_id
        model.dimensions
}
|
|
|
|
-- Triggers offered to the EmbeddingRuntime party: the three post lifecycle
-- events. PostCreated / PostUpdated drive EmbedPost; PostDeleted drives
-- RemovePostEmbedding.
surface EmbeddingRuntimeSurface {
    facing _: EmbeddingRuntime

    provides:
        PostCreated(post)
        PostUpdated(post)
        PostDeleted(post)
}
|
|
|
|
-- Triggers offered to the EmbeddingOperator party. Each one is handled by the
-- rule of the matching name (e.g. ReindexAllRequested -> ReindexAll).
surface EmbeddingControlSurface {
    facing _: EmbeddingOperator

    provides:
        -- Batch indexing
        ReindexAllRequested(project)
        IndexUnindexedRequested(project)

        -- Query operations
        FindSimilarRequested(post, limit)
        ComputeSimilaritiesRequested(source_post, target_post_ids)
        SuggestTagsRequested(post, input_text)
        FindDuplicatesRequested(project)

        -- Duplicate-pair dismissal
        DismissDuplicatePairRequested(post_a, post_b)
}
|
|
|
|
-- ─── Model ──────────────────────────────────────────────────
|
|
|
|
-- Describes the local sentence-embedding model.
value EmbeddingModel {
    -- Model:         multilingual-e5-small, 384-dimensional sentence embeddings.
    -- Acquisition:   model files are obtained from an external model source,
    --                downloaded on first use and cached locally in the app
    --                data directory.
    -- Loading:       lazy; the pipeline is created on the first embedding
    --                request, not at startup.
    -- Preprocessing: all input text is prefixed with "query: " (e5 convention).
    -- Pooling:       mean pooling followed by L2 normalization.

    model_id: String       -- "Xenova/multilingual-e5-small"
    dimensions: Integer    -- 384
}
|
|
|
|
-- A single embedding: its declared dimensionality plus the component values.
value EmbeddingVector {
    dimensions: Integer      -- 384 for multilingual-e5-small
    values: List<Decimal>
}
|
|
|
|
-- ─── Entities ───────────────────────────────────────────────
|
|
|
|
-- Links a post to its stored embedding and to its label in the HNSW index.
entity EmbeddingKey {
    label: Integer             -- HNSW label used by USearch
    post: post/Post
    content_hash: String       -- SHA-256 of "{title}\n\n{content}"
    vector: EmbeddingVector
}
|
|
|
|
-- Records a pair of posts that has been dismissed as a duplicate candidate.
-- The two post IDs are stored in canonical (sorted) order for dedup.
entity DismissedDuplicatePair {
    post_a: post/Post
    post_b: post/Post
}
|
|
|
|
-- ─── USearch HNSW Index ─────────────────────────────────────
|
|
|
|
-- Tunables for the embedding model, the USearch HNSW index and its persistence.
config {
    -- Model
    model_id: String = "Xenova/multilingual-e5-small"
    embedding_dimensions: Integer = 384

    -- HNSW index parameters
    hnsw_metric: String = "cosine"
    hnsw_connectivity: Integer = 16          -- M parameter
    hnsw_expansion_add: Integer = 128        -- efConstruction
    hnsw_expansion_search: Integer = 64      -- efSearch

    -- Persistence
    debounce_persist: Duration = 5.seconds
    -- Index file location: {userData}/projects/{projectId}/embeddings.usearch
    -- The key mapping is persisted alongside the embedding records.
}
|
|
|
|
-- ─── Gating ─────────────────────────────────────────────────
|
|
|
|
invariant SemanticSimilarityGate {
    -- Every embedding operation is gated by the semanticSimilarityEnabled
    -- flag in project metadata.
    --   * Disabled:    no posts are indexed and queries return empty results.
    --   * Toggled on:  triggers IndexUnindexed to backfill all posts.
    --   * Toggled off: the index remains on disk but is not queried.
}
|
|
|
|
-- ─── Event-driven indexing ──────────────────────────────────
|
|
|
|
-- Post lifecycle events trigger embedding updates automatically.
|
|
-- See engine_side_effects.allium for the trigger points.
|
|
|
|
-- Keeps a post's embedding in sync with its title and content.
-- Fires on PostCreated or PostUpdated while semantic similarity is enabled.
-- The embedding is (re)computed only when the post has no EmbeddingKey yet or
-- the stored content_hash differs from the freshly computed hash; an unchanged
-- post produces no update (see ContentHashSkipsUnchanged).
rule EmbedPost {
    when: PostCreated(post) or PostUpdated(post)
    requires: semantic_similarity_enabled

    -- Same input string that EmbeddingKey.content_hash documents.
    let hash = sha256(format("{title}\n\n{content}", title: post.title, content: post.content))
    let existing = EmbeddingKey{post: post}

    -- The ensures clause is scoped to this condition so unchanged posts are skipped.
    if not exists existing or existing.content_hash != hash:
        -- Compute embedding vector via local model
        -- Upsert into USearch index + embedding_keys DB table
        -- Debounced index save (5s, see config.debounce_persist)
        ensures: EmbeddingKeyUpdated(post)
}
|
|
|
|
-- Drops the embedding of a deleted post.
-- Fires on PostDeleted and only while semantic similarity is enabled.
rule RemovePostEmbedding {
    when: PostDeleted(post)
    requires: semantic_similarity_enabled

    ensures: EmbeddingKeyRemoved(post)
}
|
|
|
|
-- ─── Batch indexing ─────────────────────────────────────────
|
|
|
|
-- Full rebuild: re-embeds every post of the project and rebuilds the HNSW
-- index from scratch. Only runs while semantic similarity is enabled.
rule ReindexAll {
    when: ReindexAllRequested(project)
    requires: semantic_similarity_enabled

    -- One embedding update per post in the project...
    for p in project.posts:
        ensures: EmbeddingKeyUpdated(p)

    -- ...and a single index rebuild for the project as a whole (outside the loop).
    ensures: HnswIndexRebuilt(project)
}
|
|
|
|
-- Incremental backfill of embeddings for a project.
-- Triggered at app startup (if enabled) and when the setting is toggled on.
-- Only embeds posts that have no existing embedding or whose content hash has
-- changed. Runs as a background task with progress reporting.
rule IndexUnindexed {
    when: IndexUnindexedRequested(project)
    requires: semantic_similarity_enabled

    for p in project.posts:
        -- Use the same hash definition as EmbedPost / EmbeddingKey.content_hash
        -- (SHA-256 of "{title}\n\n{content}") so the staleness check agrees
        -- with what was stored.
        -- NOTE(review): this previously compared against p.checksum; confirm in
        -- post.allium whether Post.checksum is defined over the same input.
        let hash = sha256(format("{title}\n\n{content}", title: p.title, content: p.content))
        let existing = EmbeddingKey{post: p}

        if not exists existing or existing.content_hash != hash:
            ensures: EmbeddingKeyUpdated(p)
}
|
|
|
|
-- ─── Query operations ───────────────────────────────────────
|
|
|
|
-- Nearest-neighbour lookup for a single post.
rule FindSimilar {
    when: FindSimilarRequested(post, limit)
    requires: semantic_similarity_enabled

    -- Method:  HNSW approximate nearest neighbor search via USearch.
    -- Breadth: the index is searched for (limit + 1) neighbors and the post
    --          itself is excluded from the result.
    -- Scoring: USearch cosine distance is converted to similarity as
    --          max(0, 1 - distance).
    -- Order:   the returned list is ranked by descending similarity.
    ensures: SimilarPostsResult(post, ranked_matches)
}
|
|
|
|
-- Scores a fixed set of target posts against one source post.
rule ComputeSimilarities {
    when: ComputeSimilaritiesRequested(source_post, target_post_ids)
    requires: semantic_similarity_enabled

    -- Method: exact pairwise cosine similarity between the source vector and
    --         each target vector.
    -- Source: the in-memory vector cache, NOT a USearch search.
    -- Result: a map of post_id -> similarity score.
    -- Caller: InsertPostLinkModal, to rank FTS search results.
    ensures: SimilarityScoresResult(source_post, scores)
}
|
|
|
|
-- Proposes tags for a post based on the tags of semantically similar posts.
rule SuggestTags {
    when: SuggestTagsRequested(post, input_text)
    requires: semantic_similarity_enabled

    -- Procedure:
    --   1. Find the 10 most similar posts via HNSW search.
    --   2. Collect all tags from those posts.
    --   3. Weight each tag by the similarity score of the post it came from.
    --   4. Return the top 5 tags by weighted score.
    -- Consumer: the tag input component, for autocomplete suggestions.
    ensures: TagSuggestionResult(post, suggested_tags)
}
|
|
|
|
-- Reports near-duplicate post pairs for a project.
rule FindDuplicates {
    when: FindDuplicatesRequested(project)
    requires: semantic_similarity_enabled

    -- Finds post pairs whose similarity is above the threshold:
    --   * for each indexed post, the 21 nearest neighbors are searched;
    --   * pairs above the 0.92 threshold are kept, dismissed pairs are excluded;
    --   * at 100% embedding similarity the post bodies are loaded for an
    --     exact match check;
    --   * results are sorted with exact matches first, then by descending
    --     similarity.
    let scored_pairs = compute_all_similarities(project)
    let candidates = filter_above_threshold(scored_pairs)
    let reportable = exclude_dismissed(candidates, DismissedDuplicatePairs)

    ensures: DuplicateReport(reportable)
}
|
|
|
|
-- Records that a duplicate candidate pair has been dismissed.
-- The pair is stored with canonical ID ordering (smaller ID in post_a) so that
-- dismissing (a, b) and (b, a) yields the same record and dedup stays
-- consistent, as DismissedDuplicatePair requires.
rule DismissDuplicatePair {
    when: DismissDuplicatePairRequested(post_a, post_b)

    -- NOTE(review): assumes post/Post exposes a comparable `id` field — confirm
    -- against post.allium.
    if post_a.id <= post_b.id:
        ensures: DismissedDuplicatePair.created(post_a: post_a, post_b: post_b)
    else:
        -- Arguments arrived in reverse order; swap them before storing.
        ensures: DismissedDuplicatePair.created(post_a: post_b, post_b: post_a)
}
|
|
|
|
-- ─── Invariants ─────────────────────────────────────────────
|
|
|
|
invariant ContentHashSkipsUnchanged {
    -- A post is not re-embedded when its content_hash equals the
    -- content_hash stored with its embedding.
    -- This is what makes bulk re-indexing efficient.
}
|
|
|
|
invariant DebouncedPersistence {
    -- Persisting the USearch index is debounced at 5 seconds, which prevents
    -- excessive disk I/O during bulk operations.
    -- In addition, the index is force-saved on project switch and on app
    -- shutdown.
}
|
|
|
|
invariant VectorCacheInDb {
    -- The vector cache is persisted as a BLOB in the embedding_keys table.
    -- Each vector is a Float32Array of 384 dimensions (1536 bytes).
    -- This enables instant reload without re-embedding.
}
|
|
|
|
invariant RealNeuralModel {
    -- Embeddings MUST come from the actual ONNX neural model
    -- (multilingual-e5-small). Lexical approximations such as TF-IDF,
    -- bag-of-words or hash projections are not acceptable.
    -- Rationale: cross-language semantic similarity is a primary requirement —
    -- posts in different languages about the same topic must produce similar
    -- vectors — and this is only achievable with the trained multilingual
    -- transformer model.
}
|
|
|
|
invariant ModelCaching {
    -- The model files (~100 MB) are downloaded from Hugging Face Hub on
    -- first use.
    -- They are cached in the app data directory and persist across sessions.
    -- The model pipeline stays loaded across project switches
    -- (one model, many indexes).
}
|
|
|
|
invariant ProjectIsolation {
    -- Every project owns its own USearch index file and its own
    -- embedding_keys rows.
    -- Switching projects saves the current index and then loads the new
    -- project's index.
    -- The model pipeline is shared across projects and is not reloaded.
}
|