From 0b5efbb5e190adf15321da2457dd13153aa9980a Mon Sep 17 00:00:00 2001 From: hugo Date: Wed, 11 Feb 2026 09:39:30 +0100 Subject: [PATCH] feat: proper multi-language stemming --- VISION.md | 2 +- package-lock.json | 7 ++ package.json | 1 + src/main/database/connection.ts | 28 ++++- src/main/engine/PostEngine.ts | 180 +++++++++++++++++++----------- src/main/engine/index.ts | 8 ++ src/main/engine/stemmer.ts | 183 +++++++++++++++++++++++++++++++ src/renderer/types/electron.d.ts | 1 - 8 files changed, 337 insertions(+), 73 deletions(-) create mode 100644 src/main/engine/stemmer.ts diff --git a/VISION.md b/VISION.md index 57a6e27..5ff0afa 100644 --- a/VISION.md +++ b/VISION.md @@ -25,7 +25,7 @@ metadata can always be reconstructed from posts. Do the same with images, keepin user local path, in that case storing the image file sand for each image file a properties sidecar file that uses the same header structure as for posts. -The application must be offline-first, everything must work in airplane mode (except publishing of course). +The application must be offline-first, everything must work in airplane mode (except exporting of course). It must be fully self-contained during editing and previewing and managing content. Every internal structure must have reflections in the filesystem, so available tags, available categories, all those things must be automatically reflected to the filesystem in a per-project way. 
Use a meta/ folder under the project folder
diff --git a/package-lock.json b/package-lock.json
index 831fa1b..b9e504a 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -31,6 +31,7 @@
         "react-dom": "^18.2.0",
         "react-hot-toast": "^2.6.0",
         "sharp": "^0.34.5",
+        "snowball-stemmers": "^0.6.0",
         "tiptap-markdown": "^0.9.0",
         "uuid": "^9.0.1",
         "zustand": "^4.4.7"
@@ -10994,6 +10995,12 @@
         "npm": ">= 3.0.0"
       }
     },
+    "node_modules/snowball-stemmers": {
+      "version": "0.6.0",
+      "resolved": "https://registry.npmjs.org/snowball-stemmers/-/snowball-stemmers-0.6.0.tgz",
+      "integrity": "sha512-47VTJvsZzEme3VxhbUFc9wR0CX8fYKNMgL7AiBdJcHUrwbmN7htFroeX+8ESIAaoNgwEgwI2wnv1HIcqQqAdcA==",
+      "license": "ISC"
+    },
     "node_modules/source-map": {
       "version": "0.6.1",
       "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
diff --git a/package.json b/package.json
index bf9ce53..8b0ccaa 100644
--- a/package.json
+++ b/package.json
@@ -74,6 +74,7 @@
     "react-dom": "^18.2.0",
     "react-hot-toast": "^2.6.0",
     "sharp": "^0.34.5",
+    "snowball-stemmers": "^0.6.0",
     "tiptap-markdown": "^0.9.0",
     "uuid": "^9.0.1",
     "zustand": "^4.4.7"
diff --git a/src/main/database/connection.ts b/src/main/database/connection.ts
index 3b28697..6c3927e 100644
--- a/src/main/database/connection.ts
+++ b/src/main/database/connection.ts
@@ -318,18 +318,38 @@ export class DatabaseConnection {
     }
 
     // Create FTS5 virtual table for full-text search
+    // Only stores: id (unindexed, for lookups) and content (stemmed text for matching)
+    // Post data for display comes from the posts table or filesystem files
     await this.localClient.execute(`
       CREATE VIRTUAL TABLE IF NOT EXISTS posts_fts USING fts5(
         id UNINDEXED,
-        title,
         content,
-        excerpt,
-        tags,
-        categories,
         content_rowid=rowid
       );
     `);
 
+    // Migration: the old schema (id, title, content, excerpt, tags, categories) is detected by probing for `title`
+    let hasLegacyFtsSchema = false;
+    try {
+      await this.localClient.execute("SELECT title FROM posts_fts LIMIT 0");
+      hasLegacyFtsSchema = true;
+    } catch {
+      // Probe failed - treated as "no legacy title column", nothing to migrate
+    }
+    if (hasLegacyFtsSchema) {
+      // Recreate outside the probe's try/catch so a failed DROP/CREATE is not swallowed
+      console.log('Migrating posts_fts table to simplified schema...');
+      await this.localClient.execute('DROP TABLE IF EXISTS posts_fts');
+      await this.localClient.execute(`
+        CREATE VIRTUAL TABLE posts_fts USING fts5(
+          id UNINDEXED,
+          content,
+          content_rowid=rowid
+        );
+      `);
+      console.log('FTS table migrated - rebuild index required');
+    }
+
     // Create default project if none exists
     const existingProjects = await this.localClient.execute('SELECT COUNT(*) as count FROM projects');
     if (existingProjects.rows[0] && (existingProjects.rows[0].count as number) === 0) {
diff --git a/src/main/engine/PostEngine.ts b/src/main/engine/PostEngine.ts
index 8c98c12..86c0141 100644
--- a/src/main/engine/PostEngine.ts
+++ b/src/main/engine/PostEngine.ts
@@ -9,6 +9,7 @@ import { app } from 'electron';
 import { getDatabase } from '../database';
 import { posts, Post, NewPost, postLinks } from '../database/schema';
 import { taskManager, Task } from './TaskManager';
+import { stemText, stemQuery, SupportedLanguage } from './stemmer';
 
 export interface PostData {
   id: string;
@@ -46,8 +47,6 @@ export interface SearchResult {
   title: string;
   slug: string;
   excerpt?: string;
-  matchSnippet?: string;
-  rank?: number;
 }
 
 export interface PostFilter {
@@ -73,11 +72,74 @@ export interface PaginationOptions {
 
 export class PostEngine extends EventEmitter {
   private currentProjectId: string = 'default';
+  private searchLanguage: SupportedLanguage = 'english';
 
   constructor() {
     super();
   }
 
+  /**
+   * Set the language used for full-text search stemming.
+   * Affects both indexing and query processing.
+   */
+  setSearchLanguage(language: SupportedLanguage): void {
+    this.searchLanguage = language;
+  }
+
+  /**
+   * Get the current search language.
+   */
+  getSearchLanguage(): SupportedLanguage {
+    return this.searchLanguage;
+  }
+
+  /**
+   * Update the FTS index for a post.
+   * Replaces any existing entry (delete, then insert) using the current search language.
+   * Stores only the stemmed content (combining title, excerpt, content, tags, categories).
+   * Only the post ID is returned from searches - actual post data comes from DB/files.
+   */
+  private async updateFTSIndex(post: {
+    id: string;
+    title: string;
+    content: string;
+    excerpt?: string;
+    tags: string[];
+    categories: string[];
+  }): Promise<void> {
+    const client = getDatabase().getLocalClient();
+    if (!client) return;
+
+    // Delete existing entry
+    await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [post.id] });
+
+    // Combine all searchable fields and stem them
+    const allText = [
+      post.title,
+      post.excerpt || '',
+      post.content,
+      post.tags.join(' '),
+      post.categories.join(' '),
+    ].join(' ');
+
+    const stemmedContent = stemText(allText, this.searchLanguage);
+
+    // Insert with only id and stemmed content
+    await client.execute({
+      sql: 'INSERT INTO posts_fts (id, content) VALUES (?, ?)',
+      args: [post.id, stemmedContent],
+    });
+  }
+
+  /**
+   * Delete a post from the FTS index.
+   */
+  private async deleteFTSIndex(id: string): Promise<void> {
+    const client = getDatabase().getLocalClient();
+    if (!client) return;
+    await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
+  }
+
   private getPostsBaseDir(): string {
     const userDataPath = app.getPath('userData');
     return path.join(userDataPath, 'projects', this.currentProjectId, 'posts');
@@ -289,12 +351,7 @@ export class PostEngine extends EventEmitter {
     await db.insert(posts).values(dbPost);
 
     // Update FTS index
-    if (client) {
-      await client.execute({
-        sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
-        args: [post.id, post.title, post.content, post.excerpt || '', post.tags.join(' '), post.categories.join(' ')],
-      });
-    }
+    await this.updateFTSIndex(post);
 
     this.emit('postCreated', post);
     return post;
@@ -369,13 +426,7 @@ export class PostEngine extends EventEmitter {
       .where(eq(posts.id, id));
 
     // Update FTS index
-    if (client) {
-      await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
-      await client.execute({
-        sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
-        args: [updated.id, updated.title, updated.content, updated.excerpt || '', updated.tags.join(' '), updated.categories.join(' ')],
-      });
-    }
+    await this.updateFTSIndex(updated);
 
     // Update post links if content changed
     if (data.content) {
@@ -412,9 +463,7 @@ export class PostEngine extends EventEmitter {
     await db.delete(posts).where(eq(posts.id, id));
 
     // Delete from FTS index
-    if (client) {
-      await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
-    }
+    await this.deleteFTSIndex(id);
 
     this.emit('postDeleted', id);
     return true;
@@ -618,28 +667,34 @@ export class PostEngine extends EventEmitter {
     if (!client) return [];
 
     try {
+      // Stem the query for multilingual matching
+      const stemmedQuery = stemQuery(query, this.searchLanguage);
+      if (!stemmedQuery) return []; // nothing searchable; MATCH '' is an FTS5 syntax error
+      // Search the stemmed content, only return post IDs
       const result = await client.execute({
-        sql: `SELECT id, title, excerpt, snippet(posts_fts, 2, '', '', '...', 32) as snippet, rank
-              FROM posts_fts
-              WHERE posts_fts MATCH ?
-              ORDER BY rank
-              LIMIT 50`,
-        args: [query],
+        sql: `SELECT id FROM posts_fts WHERE posts_fts MATCH ? ORDER BY rank LIMIT 50`,
+        args: [stemmedQuery],
       });
 
+      // Filter to current project and fetch actual post data
       const projectPosts = await this.getAllPostsUnpaginated();
-      const projectPostIds = new Set(projectPosts.map(p => p.id));
+      const projectPostMap = new Map(projectPosts.map(p => [p.id, p]));
 
-      return result.rows
-        .filter(row => projectPostIds.has(row.id as string))
-        .map(row => ({
-          id: row.id as string,
-          title: row.title as string,
-          slug: '', // Will be filled in by caller if needed
-          excerpt: row.excerpt as string | undefined,
-          matchSnippet: row.snippet as string | undefined,
-          rank: row.rank as number | undefined,
-        }));
+      const searchResults: SearchResult[] = [];
+      for (const row of result.rows) {
+        const postId = row.id as string;
+        const post = projectPostMap.get(postId);
+        if (post) {
+          searchResults.push({
+            id: post.id,
+            title: post.title,
+            slug: post.slug,
+            excerpt: post.excerpt,
+          });
+        }
+      }
+
+      return searchResults;
     } catch (error) {
       console.error('Search failed:', error);
       return [];
@@ -816,13 +871,7 @@ export class PostEngine extends EventEmitter {
       .where(eq(posts.id, id));
 
     // Update FTS index
-    if (client) {
-      await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
-      await client.execute({
-        sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
-        args: [published.id, published.title, published.content, published.excerpt || '', published.tags.join(' '), published.categories.join(' ')],
-      });
-    }
+    await this.updateFTSIndex(published);
 
     // Update post links based on published content
     await this.updatePostLinks(id, published.content);
@@ -886,13 +935,7 @@ export class PostEngine extends EventEmitter {
     };
 
     // Update FTS index
-    if (client) {
-      await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
-      await client.execute({
-        sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
-        args: [reverted.id, reverted.title, reverted.content, reverted.excerpt || '', reverted.tags.join(' '), reverted.categories.join(' ')],
-      });
-    }
+    await this.updateFTSIndex(reverted);
 
     this.emit('postUpdated', reverted);
     return reverted;
@@ -949,18 +992,29 @@ export class PostEngine extends EventEmitter {
       .where(eq(posts.id, id));
 
     // Update FTS index
-    if (client) {
-      await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
-      await client.execute({
-        sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
-        args: [updated.id, updated.title, updated.content, updated.excerpt || '', updated.tags.join(' '), updated.categories.join(' ')],
-      });
-    }
+    await this.updateFTSIndex(updated);
 
     this.emit('postUpdated', updated);
     return updated;
   }
 
+  /**
+   * Re-index every post of the current project with the current search language.
+   * Call this after changing the search language or after migration.
+   */
+  async rebuildFTSIndex(): Promise<void> {
+    const client = getDatabase().getLocalClient();
+    if (!client) return;
+
+    const allPosts = await this.getAllPostsUnpaginated();
+
+    for (const post of allPosts) {
+      await this.updateFTSIndex(post);
+    }
+
+    console.log(`Rebuilt FTS index for ${allPosts.length} posts`);
+  }
+
   async rebuildDatabaseFromFiles(): Promise {
     const postsBaseDir = this.getPostsBaseDir();
     const task: Task = {
@@ -980,10 +1034,8 @@ export class PostEngine extends EventEmitter {
       if (existingPosts.length > 0) {
         const postIds = existingPosts.map(p => p.id);
         // Delete FTS entries first
-        if (client) {
-          for (const post of existingPosts) {
-            await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [post.id] });
-          }
+        for (const post of existingPosts) {
+          await this.deleteFTSIndex(post.id);
         }
         // Delete post links where source or target is in the posts being deleted
         await db.delete(postLinks).where(inArray(postLinks.sourcePostId, postIds));
@@ -1069,13 +1121,7 @@ export class PostEngine extends EventEmitter {
           insertedSlugs.set(slugKey, filePath);
 
           // Update FTS index (use file content for search)
-          if (client) {
-            await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [postData.id] });
-            await client.execute({
-              sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
-              args: [postData.id, postData.title, postData.content, postData.excerpt || '', postData.tags.join(' '), postData.categories.join(' ')],
-            });
-          }
+          await this.updateFTSIndex(postData);
         } catch (error: any) {
           // Handle constraint violations and other errors gracefully
           if (error?.code === 'SQLITE_CONSTRAINT_UNIQUE') {
diff --git a/src/main/engine/index.ts b/src/main/engine/index.ts
index ac6841c..c4faab0 100644
--- a/src/main/engine/index.ts
+++ b/src/main/engine/index.ts
@@ -4,6 +4,14 @@ export { MediaEngine, getMediaEngine, type MediaData } from './MediaEngine';
 export { SyncEngine, getSyncEngine, type SyncConfig,
type SyncResult, type SyncDirection, type SyncStatus } from './SyncEngine';
 export { ProjectEngine, getProjectEngine, type ProjectData } from './ProjectEngine';
 export { MetaEngine, getMetaEngine } from './MetaEngine';
+export {
+  stemText,
+  stemWord,
+  stemQuery,
+  prepareForFTS,
+  getSupportedLanguages,
+  type SupportedLanguage,
+} from './stemmer';
 export {
   DropboxSyncEngine,
   getDropboxSyncEngine,
diff --git a/src/main/engine/stemmer.ts b/src/main/engine/stemmer.ts
new file mode 100644
index 0000000..2d444eb
--- /dev/null
+++ b/src/main/engine/stemmer.ts
@@ -0,0 +1,183 @@
+/**
+ * Multilingual text stemming utility using Snowball stemmers.
+ * Used to normalize text before indexing in FTS5 and before searching.
+ *
+ * Supports 24 languages including: English, German, French, Spanish, Italian,
+ * Portuguese, Dutch, Russian, Arabic, and more.
+ */
+
+// eslint-disable-next-line @typescript-eslint/no-var-requires
+const snowballFactory = require('snowball-stemmers');
+
+export type SupportedLanguage =
+  | 'arabic'
+  | 'armenian'
+  | 'basque'
+  | 'catalan'
+  | 'czech'
+  | 'danish'
+  | 'dutch'
+  | 'english'
+  | 'finnish'
+  | 'french'
+  | 'german'
+  | 'hungarian'
+  | 'italian'
+  | 'irish'
+  | 'norwegian'
+  | 'porter'
+  | 'portuguese'
+  | 'romanian'
+  | 'russian'
+  | 'spanish'
+  | 'slovene'
+  | 'swedish'
+  | 'tamil'
+  | 'turkish';
+
+interface Stemmer {
+  stem(word: string): string;
+}
+
+// Cache stemmers to avoid recreating them
+const stemmerCache = new Map<SupportedLanguage, Stemmer>();
+
+/**
+ * Get a stemmer instance for a given language.
+ * Stemmers are cached for reuse.
+ */
+function getStemmer(language: SupportedLanguage): Stemmer {
+  let stemmer = stemmerCache.get(language);
+  if (!stemmer) {
+    stemmer = snowballFactory.newStemmer(language) as Stemmer;
+    stemmerCache.set(language, stemmer);
+  }
+  return stemmer;
+}
+
+/**
+ * Get all supported language codes.
+ */
+export function getSupportedLanguages(): SupportedLanguage[] {
+  return snowballFactory.algorithms() as SupportedLanguage[];
+}
+
+/**
+ * Tokenize text into words.
+ * Uses Unicode-aware word splitting to handle non-ASCII languages.
+ */
+function tokenize(text: string): string[] {
+  // Match Unicode word characters (letters, marks, numbers)
+  // This handles languages like German (häuser), Russian (привет), Arabic, etc.
+  const wordPattern = /[\p{L}\p{M}\p{N}]+/gu;
+  const matches = text.match(wordPattern);
+  return matches || [];
+}
+
+/**
+ * Stem a single word using the specified language stemmer.
+ */
+export function stemWord(word: string, language: SupportedLanguage = 'english'): string {
+  const stemmer = getStemmer(language);
+  return stemmer.stem(word.toLowerCase());
+}
+
+/**
+ * Stem all words in a text and return the stemmed text.
+ * Words are joined with spaces.
+ *
+ * @param text - The text to stem
+ * @param language - The language to use for stemming (default: 'english')
+ * @returns Text with all words replaced by their stems
+ *
+ * @example
+ * stemText('Running runners run', 'english') // 'run runner run'
+ * stemText('Häuser Haus', 'german') // 'haus haus'
+ */
+export function stemText(text: string, language: SupportedLanguage = 'english'): string {
+  if (!text) return '';
+
+  const words = tokenize(text);
+  const stemmer = getStemmer(language);
+
+  const stemmedWords = words.map(word => stemmer.stem(word.toLowerCase()));
+  return stemmedWords.join(' ');
+}
+
+/**
+ * Prepare a search query by stemming all words.
+ * This ensures searches match stemmed content in the FTS index.
+ *
+ * FTS5 query syntax is preserved:
+ * - Quoted phrases are stemmed but kept quoted
+ * - Upper-case AND, OR, NOT are kept as operators. FTS5 keywords are
+ *   case-sensitive, so lower-case "and"/"or"/"not" are stemmed as ordinary
+ *   words (e.g. French "or" is a real search term)
+ * - Prefix searches (word*) have the word part stemmed
+ * - Terms that tokenize to several words (well-known) become a quoted phrase
+ *
+ * The result is not validated: a dangling operator ('cats AND') is passed through.
+ *
+ * @param query - The search query from the user
+ * @param language - The language for stemming
+ * @returns Query with words stemmed for FTS5, or '' when nothing is searchable
+ *
+ * @example
+ * stemQuery('running dogs', 'english') // 'run dog'
+ * stemQuery('"running fast"', 'english') // '"run fast"'
+ * stemQuery('run* OR dogs', 'english') // 'run* OR dog'
+ * stemQuery('well-known', 'english') // '"well known"'
+ */
+export function stemQuery(query: string, language: SupportedLanguage = 'english'): string {
+  if (!query) return '';
+
+  const stemmer = getStemmer(language);
+  const stemAll = (text: string): string[] =>
+    tokenize(text).map(word => stemmer.stem(word.toLowerCase()));
+
+  // Each match is either a "quoted phrase" or a run of non-space characters
+  const result = query.replace(
+    /"([^"]+)"|(\S+)/g,
+    (_match: string, quoted: string | undefined, unquoted: string | undefined) => {
+      if (quoted !== undefined) {
+        // Stem words in quoted phrase; a phrase without any word is dropped
+        const stems = stemAll(quoted);
+        return stems.length > 0 ? `"${stems.join(' ')}"` : '';
+      }
+
+      // FTS5 operators: exact upper-case match only
+      const term = unquoted ?? '';
+      if (term === 'AND' || term === 'OR' || term === 'NOT') {
+        return term;
+      }
+
+      // Handle prefix searches (word*)
+      const isPrefix = term.endsWith('*');
+      const stems = stemAll(isPrefix ? term.slice(0, -1) : term);
+      if (stems.length === 0) return '';
+      if (stems.length === 1) return isPrefix ? `${stems[0]}*` : stems[0];
+
+      // Several tokens: the index stores them adjacently, so match them as a
+      // phrase instead of silently dropping everything after the first token
+      const phrase = `"${stems.join(' ')}"`;
+      return isPrefix ? `${phrase} *` : phrase;
+    }
+  );
+
+  // Clean up multiple spaces
+  return result.replace(/\s+/g, ' ').trim();
+}
+
+/**
+ * Prepare content for FTS indexing.
+ * Stems all text and produces a string suitable for FTS5 insertion.
+ * Thin wrapper around stemText: only the stemmed text is returned, the
+ * original wording is not kept alongside it.
+ *
+ * @param text - The original content
+ * @param language - The language for stemming
+ * @returns Stemmed text for FTS5 indexing
+ */
+export function prepareForFTS(text: string, language: SupportedLanguage = 'english'): string {
+  return stemText(text, language);
+}
diff --git a/src/renderer/types/electron.d.ts b/src/renderer/types/electron.d.ts
index fd3ef91..ff81688 100644
--- a/src/renderer/types/electron.d.ts
+++ b/src/renderer/types/electron.d.ts
@@ -41,7 +41,6 @@ export interface SearchResult {
   title: string;
   slug: string;
   excerpt?: string;
-  score: number;
 }
 
 export interface MediaData {