feat: proper multi-language stemming

This commit is contained in:
2026-02-11 09:39:30 +01:00
parent e66c64350f
commit 0b5efbb5e1
8 changed files with 337 additions and 73 deletions

View File

@@ -25,7 +25,7 @@ metadata can always be reconstructed from posts. Do the same with images, keepin
user local path, in that case storing the image files and for each image file a properties sidecar file that user local path, in that case storing the image files and for each image file a properties sidecar file that
uses the same header structure as for posts. uses the same header structure as for posts.
The application must be offline-first, everything must work in airplane mode (except publishing of course). The application must be offline-first, everything must work in airplane mode (except exporting of course).
It must be fully self-contained during editing and previewing and managing content. Every internal structure It must be fully self-contained during editing and previewing and managing content. Every internal structure
must have reflections in the filesystem, so available tags, available categories, all those things must be must have reflections in the filesystem, so available tags, available categories, all those things must be
automatically reflected to the filesystem in a per-project way. Use a meta/ folder under the project folder automatically reflected to the filesystem in a per-project way. Use a meta/ folder under the project folder

7
package-lock.json generated
View File

@@ -31,6 +31,7 @@
"react-dom": "^18.2.0", "react-dom": "^18.2.0",
"react-hot-toast": "^2.6.0", "react-hot-toast": "^2.6.0",
"sharp": "^0.34.5", "sharp": "^0.34.5",
"snowball-stemmers": "^0.6.0",
"tiptap-markdown": "^0.9.0", "tiptap-markdown": "^0.9.0",
"uuid": "^9.0.1", "uuid": "^9.0.1",
"zustand": "^4.4.7" "zustand": "^4.4.7"
@@ -10994,6 +10995,12 @@
"npm": ">= 3.0.0" "npm": ">= 3.0.0"
} }
}, },
"node_modules/snowball-stemmers": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/snowball-stemmers/-/snowball-stemmers-0.6.0.tgz",
"integrity": "sha512-47VTJvsZzEme3VxhbUFc9wR0CX8fYKNMgL7AiBdJcHUrwbmN7htFroeX+8ESIAaoNgwEgwI2wnv1HIcqQqAdcA==",
"license": "ISC"
},
"node_modules/source-map": { "node_modules/source-map": {
"version": "0.6.1", "version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",

View File

@@ -74,6 +74,7 @@
"react-dom": "^18.2.0", "react-dom": "^18.2.0",
"react-hot-toast": "^2.6.0", "react-hot-toast": "^2.6.0",
"sharp": "^0.34.5", "sharp": "^0.34.5",
"snowball-stemmers": "^0.6.0",
"tiptap-markdown": "^0.9.0", "tiptap-markdown": "^0.9.0",
"uuid": "^9.0.1", "uuid": "^9.0.1",
"zustand": "^4.4.7" "zustand": "^4.4.7"

View File

@@ -318,18 +318,38 @@ export class DatabaseConnection {
} }
// Create FTS5 virtual table for full-text search // Create FTS5 virtual table for full-text search
// Only stores: id (unindexed, for lookups) and content (stemmed text for matching)
// Post data for display comes from the posts table or filesystem files
await this.localClient.execute(` await this.localClient.execute(`
CREATE VIRTUAL TABLE IF NOT EXISTS posts_fts USING fts5( CREATE VIRTUAL TABLE IF NOT EXISTS posts_fts USING fts5(
id UNINDEXED, id UNINDEXED,
title,
content, content,
excerpt,
tags,
categories,
content_rowid=rowid content_rowid=rowid
); );
`); `);
// Migration: Check if old FTS schema (with multiple columns) exists and recreate
// Old schema had: id, title, content, excerpt, tags, categories, content_stemmed
// New schema has: id, content (stemmed only)
try {
// Try to query old columns - if they exist, we need to migrate
await this.localClient.execute("SELECT title FROM posts_fts LIMIT 0");
// Old schema exists - recreate with new simple schema
console.log('Migrating posts_fts table to simplified schema...');
await this.localClient.execute('DROP TABLE IF EXISTS posts_fts');
await this.localClient.execute(`
CREATE VIRTUAL TABLE posts_fts USING fts5(
id UNINDEXED,
content,
content_rowid=rowid
);
`);
console.log('FTS table migrated - rebuild index required');
} catch {
// Old columns don't exist - we have the new schema or no data, all good
}
// Create default project if none exists // Create default project if none exists
const existingProjects = await this.localClient.execute('SELECT COUNT(*) as count FROM projects'); const existingProjects = await this.localClient.execute('SELECT COUNT(*) as count FROM projects');
if (existingProjects.rows[0] && (existingProjects.rows[0].count as number) === 0) { if (existingProjects.rows[0] && (existingProjects.rows[0].count as number) === 0) {

View File

@@ -9,6 +9,7 @@ import { app } from 'electron';
import { getDatabase } from '../database'; import { getDatabase } from '../database';
import { posts, Post, NewPost, postLinks } from '../database/schema'; import { posts, Post, NewPost, postLinks } from '../database/schema';
import { taskManager, Task } from './TaskManager'; import { taskManager, Task } from './TaskManager';
import { stemText, stemQuery, SupportedLanguage } from './stemmer';
export interface PostData { export interface PostData {
id: string; id: string;
@@ -46,8 +47,6 @@ export interface SearchResult {
title: string; title: string;
slug: string; slug: string;
excerpt?: string; excerpt?: string;
matchSnippet?: string;
rank?: number;
} }
export interface PostFilter { export interface PostFilter {
@@ -73,11 +72,74 @@ export interface PaginationOptions {
export class PostEngine extends EventEmitter { export class PostEngine extends EventEmitter {
private currentProjectId: string = 'default'; private currentProjectId: string = 'default';
private searchLanguage: SupportedLanguage = 'english';
constructor() { constructor() {
super(); super();
} }
/**
 * Set the language used for full-text search stemming.
 *
 * The value is read by updateFTSIndex (when stemming post text for the index)
 * and by the search query path (when stemming the user's query), so both sides
 * use the same stemmer.
 *
 * This only stores the new value; entries already in the index are not
 * re-stemmed here. Call rebuildFTSIndex() afterwards to re-index existing posts.
 *
 * @param language - Snowball stemmer name to use from now on
 */
setSearchLanguage(language: SupportedLanguage): void {
this.searchLanguage = language;
}
/**
 * Get the current search language.
 *
 * @returns The stemmer language currently used for indexing and querying
 *          ('english' unless setSearchLanguage has been called).
 */
getSearchLanguage(): SupportedLanguage {
return this.searchLanguage;
}
/**
 * Replace the full-text-search row for a single post.
 *
 * The row holds only the post id and one stemmed text blob built from the
 * title, excerpt, content, tags and categories (in that order). Any previous
 * row for the same id is removed first, so this works for both new and
 * existing posts. Does nothing when no local database client is available.
 *
 * @param post - The fields of the post that should be searchable
 */
private async updateFTSIndex(post: {
  id: string;
  title: string;
  content: string;
  excerpt?: string;
  tags: string[];
  categories: string[];
}): Promise<void> {
  const client = getDatabase().getLocalClient();
  if (!client) {
    return;
  }

  const { id, title, excerpt, content, tags, categories } = post;

  // Drop the stale row (if any) before writing the fresh one.
  await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });

  // Everything searchable goes into one blob, stemmed with the active language.
  const searchableParts = [title, excerpt || '', content, tags.join(' '), categories.join(' ')];
  const stemmedContent = stemText(searchableParts.join(' '), this.searchLanguage);

  await client.execute({
    sql: 'INSERT INTO posts_fts (id, content) VALUES (?, ?)',
    args: [id, stemmedContent],
  });
}
/**
 * Remove a post's row from the full-text-search index.
 * Does nothing when no local database client is available.
 *
 * @param id - Id of the post whose index row should be removed
 */
private async deleteFTSIndex(id: string): Promise<void> {
  const client = getDatabase().getLocalClient();
  if (client) {
    await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
  }
}
private getPostsBaseDir(): string { private getPostsBaseDir(): string {
const userDataPath = app.getPath('userData'); const userDataPath = app.getPath('userData');
return path.join(userDataPath, 'projects', this.currentProjectId, 'posts'); return path.join(userDataPath, 'projects', this.currentProjectId, 'posts');
@@ -289,12 +351,7 @@ export class PostEngine extends EventEmitter {
await db.insert(posts).values(dbPost); await db.insert(posts).values(dbPost);
// Update FTS index // Update FTS index
if (client) { await this.updateFTSIndex(post);
await client.execute({
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
args: [post.id, post.title, post.content, post.excerpt || '', post.tags.join(' '), post.categories.join(' ')],
});
}
this.emit('postCreated', post); this.emit('postCreated', post);
return post; return post;
@@ -369,13 +426,7 @@ export class PostEngine extends EventEmitter {
.where(eq(posts.id, id)); .where(eq(posts.id, id));
// Update FTS index // Update FTS index
if (client) { await this.updateFTSIndex(updated);
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
await client.execute({
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
args: [updated.id, updated.title, updated.content, updated.excerpt || '', updated.tags.join(' '), updated.categories.join(' ')],
});
}
// Update post links if content changed // Update post links if content changed
if (data.content) { if (data.content) {
@@ -412,9 +463,7 @@ export class PostEngine extends EventEmitter {
await db.delete(posts).where(eq(posts.id, id)); await db.delete(posts).where(eq(posts.id, id));
// Delete from FTS index // Delete from FTS index
if (client) { await this.deleteFTSIndex(id);
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
}
this.emit('postDeleted', id); this.emit('postDeleted', id);
return true; return true;
@@ -618,28 +667,34 @@ export class PostEngine extends EventEmitter {
if (!client) return []; if (!client) return [];
try { try {
// Stem the query for multilingual matching
const stemmedQuery = stemQuery(query, this.searchLanguage);
// Search the stemmed content, only return post IDs
const result = await client.execute({ const result = await client.execute({
sql: `SELECT id, title, excerpt, snippet(posts_fts, 2, '<mark>', '</mark>', '...', 32) as snippet, rank sql: `SELECT id FROM posts_fts WHERE posts_fts MATCH ? ORDER BY rank LIMIT 50`,
FROM posts_fts args: [stemmedQuery],
WHERE posts_fts MATCH ?
ORDER BY rank
LIMIT 50`,
args: [query],
}); });
// Filter to current project and fetch actual post data
const projectPosts = await this.getAllPostsUnpaginated(); const projectPosts = await this.getAllPostsUnpaginated();
const projectPostIds = new Set(projectPosts.map(p => p.id)); const projectPostMap = new Map(projectPosts.map(p => [p.id, p]));
return result.rows const searchResults: SearchResult[] = [];
.filter(row => projectPostIds.has(row.id as string)) for (const row of result.rows) {
.map(row => ({ const postId = row.id as string;
id: row.id as string, const post = projectPostMap.get(postId);
title: row.title as string, if (post) {
slug: '', // Will be filled in by caller if needed searchResults.push({
excerpt: row.excerpt as string | undefined, id: post.id,
matchSnippet: row.snippet as string | undefined, title: post.title,
rank: row.rank as number | undefined, slug: post.slug,
})); excerpt: post.excerpt,
});
}
}
return searchResults;
} catch (error) { } catch (error) {
console.error('Search failed:', error); console.error('Search failed:', error);
return []; return [];
@@ -816,13 +871,7 @@ export class PostEngine extends EventEmitter {
.where(eq(posts.id, id)); .where(eq(posts.id, id));
// Update FTS index // Update FTS index
if (client) { await this.updateFTSIndex(published);
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
await client.execute({
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
args: [published.id, published.title, published.content, published.excerpt || '', published.tags.join(' '), published.categories.join(' ')],
});
}
// Update post links based on published content // Update post links based on published content
await this.updatePostLinks(id, published.content); await this.updatePostLinks(id, published.content);
@@ -886,13 +935,7 @@ export class PostEngine extends EventEmitter {
}; };
// Update FTS index // Update FTS index
if (client) { await this.updateFTSIndex(reverted);
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
await client.execute({
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
args: [reverted.id, reverted.title, reverted.content, reverted.excerpt || '', reverted.tags.join(' '), reverted.categories.join(' ')],
});
}
this.emit('postUpdated', reverted); this.emit('postUpdated', reverted);
return reverted; return reverted;
@@ -949,18 +992,29 @@ export class PostEngine extends EventEmitter {
.where(eq(posts.id, id)); .where(eq(posts.id, id));
// Update FTS index // Update FTS index
if (client) { await this.updateFTSIndex(updated);
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
await client.execute({
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
args: [updated.id, updated.title, updated.content, updated.excerpt || '', updated.tags.join(' '), updated.categories.join(' ')],
});
}
this.emit('postUpdated', updated); this.emit('postUpdated', updated);
return updated; return updated;
} }
/**
 * Re-index every post returned by getAllPostsUnpaginated() using the current
 * search language.
 *
 * Intended to be called after the search language changes or after the
 * posts_fts table has been migrated. Posts are processed one at a time, each
 * through updateFTSIndex (delete old row, insert freshly stemmed row).
 * Does nothing when no local database client is available.
 */
async rebuildFTSIndex(): Promise<void> {
  if (!getDatabase().getLocalClient()) {
    return;
  }

  const postsToIndex = await this.getAllPostsUnpaginated();

  // Sequential on purpose: one delete+insert pair per post, in order.
  for (const entry of postsToIndex) {
    await this.updateFTSIndex(entry);
  }

  console.log(`Rebuilt FTS index for ${postsToIndex.length} posts`);
}
async rebuildDatabaseFromFiles(): Promise<void> { async rebuildDatabaseFromFiles(): Promise<void> {
const postsBaseDir = this.getPostsBaseDir(); const postsBaseDir = this.getPostsBaseDir();
const task: Task<void> = { const task: Task<void> = {
@@ -980,10 +1034,8 @@ export class PostEngine extends EventEmitter {
if (existingPosts.length > 0) { if (existingPosts.length > 0) {
const postIds = existingPosts.map(p => p.id); const postIds = existingPosts.map(p => p.id);
// Delete FTS entries first // Delete FTS entries first
if (client) { for (const post of existingPosts) {
for (const post of existingPosts) { await this.deleteFTSIndex(post.id);
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [post.id] });
}
} }
// Delete post links where source or target is in the posts being deleted // Delete post links where source or target is in the posts being deleted
await db.delete(postLinks).where(inArray(postLinks.sourcePostId, postIds)); await db.delete(postLinks).where(inArray(postLinks.sourcePostId, postIds));
@@ -1069,13 +1121,7 @@ export class PostEngine extends EventEmitter {
insertedSlugs.set(slugKey, filePath); insertedSlugs.set(slugKey, filePath);
// Update FTS index (use file content for search) // Update FTS index (use file content for search)
if (client) { await this.updateFTSIndex(postData);
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [postData.id] });
await client.execute({
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
args: [postData.id, postData.title, postData.content, postData.excerpt || '', postData.tags.join(' '), postData.categories.join(' ')],
});
}
} catch (error: any) { } catch (error: any) {
// Handle constraint violations and other errors gracefully // Handle constraint violations and other errors gracefully
if (error?.code === 'SQLITE_CONSTRAINT_UNIQUE') { if (error?.code === 'SQLITE_CONSTRAINT_UNIQUE') {

View File

@@ -4,6 +4,14 @@ export { MediaEngine, getMediaEngine, type MediaData } from './MediaEngine';
export { SyncEngine, getSyncEngine, type SyncConfig, type SyncResult, type SyncDirection, type SyncStatus } from './SyncEngine'; export { SyncEngine, getSyncEngine, type SyncConfig, type SyncResult, type SyncDirection, type SyncStatus } from './SyncEngine';
export { ProjectEngine, getProjectEngine, type ProjectData } from './ProjectEngine'; export { ProjectEngine, getProjectEngine, type ProjectData } from './ProjectEngine';
export { MetaEngine, getMetaEngine } from './MetaEngine'; export { MetaEngine, getMetaEngine } from './MetaEngine';
export {
stemText,
stemWord,
stemQuery,
prepareForFTS,
getSupportedLanguages,
type SupportedLanguage,
} from './stemmer';
export { export {
DropboxSyncEngine, DropboxSyncEngine,
getDropboxSyncEngine, getDropboxSyncEngine,

183
src/main/engine/stemmer.ts Normal file
View File

@@ -0,0 +1,183 @@
/**
* Multilingual text stemming utility using Snowball stemmers.
* Used to normalize text before indexing in FTS5 and before searching.
*
* Supports 24 languages including: English, German, French, Spanish, Italian,
* Portuguese, Dutch, Russian, Arabic, and more.
*/
// eslint-disable-next-line @typescript-eslint/no-var-requires
const snowballFactory = require('snowball-stemmers');
/**
 * Stemmer names accepted by this module.
 *
 * Each value is passed verbatim to `snowballFactory.newStemmer(...)`, so the
 * spelling must match what the snowball-stemmers package expects.
 * NOTE(review): 'porter' is an algorithm name rather than a language —
 * presumably the original Porter English stemmer; confirm against the package docs.
 */
export type SupportedLanguage =
| 'arabic'
| 'armenian'
| 'basque'
| 'catalan'
| 'czech'
| 'danish'
| 'dutch'
| 'english'
| 'finnish'
| 'french'
| 'german'
| 'hungarian'
| 'italian'
| 'irish'
| 'norwegian'
| 'porter'
| 'portuguese'
| 'romanian'
| 'russian'
| 'spanish'
| 'slovene'
| 'swedish'
| 'tamil'
| 'turkish';
/**
 * Minimal shape of a stemmer object as used in this module.
 * The untyped result of `snowballFactory.newStemmer(...)` is cast to this.
 */
interface Stemmer {
/** Reduce a single word to its stem. Callers in this module lowercase the word first. */
stem(word: string): string;
}
// Cache stemmers to avoid recreating them.
// Keyed by language name; filled lazily by getStemmer and never cleared.
const stemmerCache = new Map<SupportedLanguage, Stemmer>();
/**
 * Look up the stemmer for a language, creating and caching it on first use.
 *
 * @param language - Stemmer name to look up
 * @returns The cached stemmer instance for that language
 */
function getStemmer(language: SupportedLanguage): Stemmer {
  const cached = stemmerCache.get(language);
  if (cached) {
    return cached;
  }

  // First request for this language: build it once and remember it.
  const created = snowballFactory.newStemmer(language) as Stemmer;
  stemmerCache.set(language, created);
  return created;
}
/**
 * List the stemmer names the underlying snowball-stemmers package reports.
 *
 * The list comes straight from `snowballFactory.algorithms()`; the cast to
 * SupportedLanguage[] is not checked at runtime.
 *
 * @returns Stemmer names as reported by the package
 */
export function getSupportedLanguages(): SupportedLanguage[] {
  const reported = snowballFactory.algorithms() as SupportedLanguage[];
  return reported;
}
/**
 * Split text into word tokens.
 *
 * A token is a maximal run of Unicode letters, combining marks and numbers;
 * everything else (whitespace, punctuation, symbols, underscores) separates
 * tokens and is discarded. Case is left untouched.
 *
 * @param text - Text to split
 * @returns The tokens in order of appearance; an empty array when there are none
 */
function tokenize(text: string): string[] {
  // The pattern never matches an empty string, so the exec loop always advances.
  const wordRun = /[\p{L}\p{M}\p{N}]+/gu;
  const tokens: string[] = [];
  let hit: RegExpExecArray | null = wordRun.exec(text);
  while (hit !== null) {
    tokens.push(hit[0]);
    hit = wordRun.exec(text);
  }
  return tokens;
}
/**
 * Reduce one word to its stem.
 *
 * The word is lowercased before stemming. No tokenization is performed, so
 * the input is handed to the stemmer as a single unit.
 *
 * @param word - The word to stem
 * @param language - Stemmer to use (default: 'english')
 * @returns The stem produced by the selected stemmer
 */
export function stemWord(word: string, language: SupportedLanguage = 'english'): string {
  return getStemmer(language).stem(word.toLowerCase());
}
/**
 * Tokenize a text, lowercase and stem every token, and join the stems with
 * single spaces. Punctuation and other non-word characters are dropped by the
 * tokenizer.
 *
 * @param text - The text to stem
 * @param language - Stemmer to use (default: 'english')
 * @returns Space-separated stems; '' for empty input or text without word characters
 *
 * @example
 * stemText('Running runners run', 'english') // 'run runner run'
 * stemText('Häuser Haus', 'german') // 'haus haus'
 */
export function stemText(text: string, language: SupportedLanguage = 'english'): string {
  if (!text) {
    return '';
  }

  const tokens = tokenize(text);
  const stemmer = getStemmer(language);

  const stems: string[] = [];
  for (const token of tokens) {
    stems.push(stemmer.stem(token.toLowerCase()));
  }
  return stems.join(' ');
}
/**
 * Prepare a user search query for an FTS5 MATCH against stemmed content.
 *
 * The query is split into double-quoted phrases and whitespace-separated
 * terms, and each piece is rewritten as follows:
 * - Quoted phrase: every word inside is stemmed and the phrase stays quoted.
 *   A phrase with no word characters is dropped (an empty `""` is never emitted).
 * - `AND`, `OR`, `NOT` written in UPPERCASE are passed through as operators.
 *   FTS5 only recognises the uppercase spellings, so lowercase "and", "or",
 *   "not" are ordinary words and are stemmed like any other term; promoting
 *   them would turn natural-language queries such as "to be or not to be"
 *   into invalid MATCH expressions.
 * - Prefix term (`word*`): the word part is stemmed and the `*` is re-attached.
 *   A `*` with no word in front of it is dropped.
 * - Any other term: tokenized and stemmed. A term that splits into several
 *   words (e.g. `foo-bar`) contributes all of its stems, not just the first.
 *
 * @param query - The search query from the user
 * @param language - Stemmer to use (default: 'english')
 * @returns The rewritten query with single spaces between parts; '' when
 *          nothing searchable remains
 *
 * @example
 * stemQuery('running dogs', 'english') // 'run dog'
 * stemQuery('"running fast"', 'english') // '"run fast"'
 */
export function stemQuery(query: string, language: SupportedLanguage = 'english'): string {
  if (!query) return '';

  const stemmer = getStemmer(language);
  const stemAll = (fragment: string): string[] =>
    tokenize(fragment).map(word => stemmer.stem(word.toLowerCase()));

  const rewritten = query.replace(
    /"([^"]+)"|(\S+)/g,
    (_match: string, quoted: string | undefined, unquoted: string | undefined): string => {
      if (quoted !== undefined) {
        const phraseStems = stemAll(quoted);
        // Never emit an empty phrase such as "".
        return phraseStems.length > 0 ? `"${phraseStems.join(' ')}"` : '';
      }

      const term = unquoted ?? '';

      // Only the exact uppercase spellings are FTS5 operators.
      if (term === 'AND' || term === 'OR' || term === 'NOT') {
        return term;
      }

      const isPrefix = term.endsWith('*');
      const stems = stemAll(isPrefix ? term.slice(0, -1) : term);
      if (stems.length === 0) {
        // Pure punctuation or a bare '*': nothing searchable, and passing it
        // through would only produce a MATCH syntax error.
        return '';
      }

      // Keep every stem of a multi-word term; the prefix marker, if any,
      // applies to the last word, where the user typed it.
      const joined = stems.join(' ');
      return isPrefix ? `${joined}*` : joined;
    }
  );

  // Dropped pieces leave gaps behind; collapse them.
  return rewritten.replace(/\s+/g, ' ').trim();
}
/**
 * Prepare content for FTS indexing.
 *
 * Currently a thin alias for stemText: the result contains only the stemmed
 * words, joined by spaces. The original wording is not preserved in the
 * output, so it cannot be used to render highlighted snippets of the source text.
 *
 * @param text - The original content
 * @param language - Stemmer to use (default: 'english')
 * @returns Stemmed text for FTS5 indexing
 */
export function prepareForFTS(text: string, language: SupportedLanguage = 'english'): string {
  const indexable = stemText(text, language);
  return indexable;
}

View File

@@ -41,7 +41,6 @@ export interface SearchResult {
title: string; title: string;
slug: string; slug: string;
excerpt?: string; excerpt?: string;
score: number;
} }
export interface MediaData { export interface MediaData {