initial commit

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
2026-04-23 10:42:27 +02:00
commit cd998f24a9
57 changed files with 9751 additions and 0 deletions

226
specs/embedding.allium Normal file
View File

@@ -0,0 +1,226 @@
-- allium: 1
-- bDS Semantic Similarity / Embeddings
-- Scope: extension (Bucket D — Embeddings + Duplicate Detection)
-- Distilled from: src/main/engine/EmbeddingEngine.ts
-- Local embedding model for semantic similarity. Runs entirely on-device,
-- independent of AI endpoints. Gated by semanticSimilarityEnabled project setting.
use "./post.allium" as post
use "./tag.allium" as tag
surface EmbeddingModelSurface {
-- Exposes the two identifying fields of an EmbeddingModel value:
-- its model id and the dimensionality of the vectors it produces.
context model: EmbeddingModel
exposes:
model.model_id
model.dimensions
}
surface EmbeddingRuntimeSurface {
-- Post lifecycle events that drive automatic (event-driven) indexing.
-- PostCreated and PostUpdated are the triggers of rule EmbedPost;
-- PostDeleted is the trigger of rule RemovePostEmbedding.
facing _: EmbeddingRuntime
provides:
PostCreated(post)
PostUpdated(post)
PostDeleted(post)
}
surface EmbeddingControlSurface {
-- Explicit requests for batch indexing, similarity queries and duplicate handling.
-- Each request is the trigger of the rule with the same stem in this file:
--   ReindexAllRequested           -> ReindexAll
--   IndexUnindexedRequested       -> IndexUnindexed
--   FindSimilarRequested          -> FindSimilar
--   ComputeSimilaritiesRequested  -> ComputeSimilarities
--   SuggestTagsRequested          -> SuggestTags
--   FindDuplicatesRequested       -> FindDuplicates
--   DismissDuplicatePairRequested -> DismissDuplicatePair
facing _: EmbeddingOperator
provides:
ReindexAllRequested(project)
IndexUnindexedRequested(project)
FindSimilarRequested(post, limit)
ComputeSimilaritiesRequested(source_post, target_post_ids)
SuggestTagsRequested(post, input_text)
FindDuplicatesRequested(project)
DismissDuplicatePairRequested(post_a, post_b)
}
-- ─── Model ──────────────────────────────────────────────────
value EmbeddingModel {
-- Describes the local sentence-embedding model.
-- multilingual-e5-small: 384-dimensional sentence embeddings
-- Model files come from an external model source; they are downloaded on
-- first use and cached locally in the app data directory
-- Lazy-loaded: pipeline created on first embedding request, not at startup
-- Text preprocessing: prefix all input with "query: " (e5 convention)
-- Pooling: mean pooling + L2 normalization
-- The same id and dimension values are repeated as defaults in the config block
-- (model_id, embedding_dimensions).
model_id: String -- "Xenova/multilingual-e5-small"
dimensions: Integer -- 384
}
value EmbeddingVector {
-- A single embedding as produced by the model.
-- NOTE(review): `values` presumably always holds exactly `dimensions` entries;
-- nothing in this spec enforces that — confirm.
dimensions: Integer -- 384 (multilingual-e5-small)
values: List<Decimal>
}
-- ─── Entities ───────────────────────────────────────────────
entity EmbeddingKey {
-- Ties one post to its embedding vector and to its label in the HNSW index.
-- content_hash is what EmbedPost and IndexUnindexed compare against to decide
-- whether a post needs to be (re-)embedded.
label: Integer -- HNSW label for USearch
post: post/Post
content_hash: String -- SHA-256 of "{title}\n\n{content}"
vector: EmbeddingVector
}
entity DismissedDuplicatePair {
-- A pair of posts that has been marked as "not a duplicate".
-- Created by rule DismissDuplicatePair; FindDuplicates excludes dismissed pairs
-- from its report.
post_a: post/Post
post_b: post/Post
-- IDs stored in canonical order (sorted) for dedup, i.e. (a, b) and (b, a)
-- are meant to map to the same record
}
-- ─── USearch HNSW Index ─────────────────────────────────────
config {
-- Defaults for the embedding model and the per-project USearch HNSW index.
model_id: String = "Xenova/multilingual-e5-small"
embedding_dimensions: Integer = 384
hnsw_metric: String = "cosine"
hnsw_connectivity: Integer = 16 -- M parameter
hnsw_expansion_add: Integer = 128 -- efConstruction
hnsw_expansion_search: Integer = 64 -- efSearch
-- Debounce interval for saving the index; see invariant DebouncedPersistence
debounce_persist: Duration = 5.seconds
-- Index file: {userData}/projects/{projectId}/embeddings.usearch
-- Key mapping is persisted alongside the embedding records
}
-- ─── Gating ─────────────────────────────────────────────────
invariant SemanticSimilarityGate {
-- All embedding operations are gated by semanticSimilarityEnabled in project metadata.
-- In this file the gate appears as `requires: semantic_similarity_enabled` on every
-- rule except DismissDuplicatePair, which carries no such guard.
-- When disabled: no posts are indexed, queries return empty results.
-- When toggled on: triggers IndexUnindexed to backfill all posts.
-- When toggled off: index remains on disk but is not queried.
}
-- ─── Event-driven indexing ──────────────────────────────────
-- Post lifecycle events trigger embedding updates automatically.
-- See engine_side_effects.allium for the trigger points.
rule EmbedPost {
-- Keeps a post's embedding current when the post is created or updated.
-- The hash of "{title}\n\n{content}" is compared with the stored
-- EmbeddingKey.content_hash; the embedding work below applies only when no key
-- exists yet or the hash differs.
-- NOTE(review): invariant ContentHashSkipsUnchanged implies the `ensures` clause
-- belongs inside the `if` branch — confirm the nesting in the original file.
when: PostCreated(post) or PostUpdated(post)
requires: semantic_similarity_enabled
let hash = sha256(format("{title}\n\n{content}", title: post.title, content: post.content))
let existing = EmbeddingKey{post: post}
if not exists existing or existing.content_hash != hash:
-- Compute embedding vector via local model
-- Upsert into USearch index + embedding_keys DB table
-- Debounced index save (5s, see config.debounce_persist)
ensures: EmbeddingKeyUpdated(post)
}
rule RemovePostEmbedding {
-- Drops the embedding key of a post once the post has been deleted.
-- NOTE(review): because of the `requires` guard, a post deleted while the feature
-- is disabled keeps its embedding; no rule in this file prunes such leftovers when
-- the feature is re-enabled — confirm whether that is intended.
when: PostDeleted(post)
requires: semantic_similarity_enabled
ensures: EmbeddingKeyRemoved(post)
}
-- ─── Batch indexing ─────────────────────────────────────────
rule ReindexAll {
-- Full rebuild for one project: every post gets EmbeddingKeyUpdated and the
-- HNSW index is rebuilt. Unlike EmbedPost and IndexUnindexed there is no
-- content_hash comparison here, so unchanged posts are re-embedded too.
when: ReindexAllRequested(project)
requires: semantic_similarity_enabled
-- Re-embeds all posts, rebuilds HNSW index from scratch
for p in project.posts:
ensures: EmbeddingKeyUpdated(p)
ensures: HnswIndexRebuilt(project)
}
rule IndexUnindexed {
    -- Backfills embeddings for the posts of `project` that have no EmbeddingKey yet,
    -- or whose stored content_hash no longer matches their current title/content.
    -- Triggered at app startup (if enabled) and when the setting is toggled on.
    -- Runs as a background task with progress reporting.
    when: IndexUnindexedRequested(project)
    requires: semantic_similarity_enabled
    for p in project.posts:
        -- Derive the hash exactly as EmbedPost does, so it is comparable with
        -- EmbeddingKey.content_hash (SHA-256 of "{title}\n\n{content}").
        -- Comparing against an unrelated post checksum would flag every post as changed.
        let hash = sha256(format("{title}\n\n{content}", title: p.title, content: p.content))
        let existing = EmbeddingKey{post: p}
        if not exists existing or existing.content_hash != hash:
            ensures: EmbeddingKeyUpdated(p)
}
-- ─── Query operations ───────────────────────────────────────
rule FindSimilar {
-- Answers "which posts are most similar to `post`?" with at most `limit` matches.
when: FindSimilarRequested(post, limit)
requires: semantic_similarity_enabled
-- HNSW approximate nearest neighbor search via USearch
-- Searches index for (limit + 1) neighbors, excludes self
-- (the extra neighbor compensates for the query post matching itself)
-- Converts USearch cosine distance to similarity: max(0, 1 - distance)
-- Returns ranked list sorted by descending similarity
ensures: SimilarPostsResult(post, ranked_matches)
}
rule ComputeSimilarities {
-- Scores a given set of target posts against one source post.
-- In contrast to FindSimilar this is an exact computation over known targets,
-- not an approximate index search.
when: ComputeSimilaritiesRequested(source_post, target_post_ids)
requires: semantic_similarity_enabled
-- Exact pairwise cosine similarity between source vector and each target vector
-- Uses in-memory vector cache, NOT USearch search
-- Returns map of post_id -> similarity score
-- Used by InsertPostLinkModal to rank FTS search results
ensures: SimilarityScoresResult(source_post, scores)
}
rule SuggestTags {
-- Proposes tags for a post by borrowing the tags of semantically similar posts.
-- NOTE(review): the steps below do not say whether the similarity query is built
-- from `post` or from `input_text` — confirm against the engine.
when: SuggestTagsRequested(post, input_text)
requires: semantic_similarity_enabled
-- 1. Find 10 most similar posts via HNSW search
-- 2. Collect all tags from those posts
-- 3. Weight tags by similarity score of the post they came from
-- 4. Return top 5 tags by weighted score
-- Used by tag input component for autocomplete suggestions
ensures: TagSuggestionResult(post, suggested_tags)
}
rule FindDuplicates {
-- Produces a report of near-duplicate post pairs for a project:
-- compute similarities, keep pairs above the threshold, drop dismissed pairs.
when: FindDuplicatesRequested(project)
requires: semantic_similarity_enabled
-- Finds near-duplicate post pairs above similarity threshold
-- For each indexed post: search 21 nearest neighbors
-- (presumably 20 candidates plus the post itself — TODO confirm)
-- Pairs above 0.92 threshold kept, dismissed pairs excluded
-- At 100% embedding similarity: loads post bodies for exact match check
-- Results sorted: exact matches first, then descending similarity
-- NOTE(review): the 0.92 threshold, the exact-match check and the sort order are
-- described only in these comments; they are not arguments of the helpers below
-- and the threshold is not part of the config block.
let all_pairs = compute_all_similarities(project)
let above_threshold = filter_above_threshold(all_pairs)
let pairs = exclude_dismissed(above_threshold, DismissedDuplicatePairs)
ensures: DuplicateReport(pairs)
}
rule DismissDuplicatePair {
-- Records that two posts are not duplicates so FindDuplicates stops reporting them.
-- This rule has no `requires: semantic_similarity_enabled` guard.
when: DismissDuplicatePairRequested(post_a, post_b)
-- Stores with canonical ID ordering for consistent dedup
-- NOTE(review): the ensures clause passes post_a / post_b in caller order; the
-- canonical (sorted) ordering is presumably applied when the record is stored —
-- confirm, otherwise (a, b) and (b, a) would create two records.
ensures: DismissedDuplicatePair.created(post_a: post_a, post_b: post_b)
}
-- ─── Invariants ─────────────────────────────────────────────
invariant ContentHashSkipsUnchanged {
-- If a post's content_hash matches the stored embedding's content_hash,
-- the post is not re-embedded. This makes incremental indexing (EmbedPost,
-- IndexUnindexed) efficient.
-- ReindexAll is the exception: it re-embeds every post without a hash check.
}
invariant DebouncedPersistence {
-- USearch index persistence is debounced at 5 seconds (config.debounce_persist)
-- Prevents excessive disk I/O during bulk operations
-- Index also force-saved on project switch and app shutdown
}
invariant VectorCacheInDb {
-- Vector cache persisted as BLOB in embedding_keys table
-- Float32Array, 384 dimensions per vector (384 x 4 bytes = 1536 bytes)
-- Enables instant reload without re-embedding
}
invariant ModelCaching {
-- Model files (~100 MB) are downloaded from Hugging Face Hub on first use
-- Cached in the app data directory; the cache persists across sessions
-- Model pipeline stays loaded across project switches (one model, many indexes)
}
invariant ProjectIsolation {
-- Each project has its own USearch index file and embedding_keys rows
-- (index file path pattern: see the config block)
-- On project switch: save current index, load new project's index
-- Model pipeline shared across projects (not reloaded)
}