feat: proper multi-language stemming
This commit is contained in:
@@ -25,7 +25,7 @@ metadata can always be reconstructed from posts. Do the same with images, keepin
|
|||||||
user local path, in that case storing the image files and for each image file a properties sidecar file that
|
user local path, in that case storing the image files and for each image file a properties sidecar file that
|
||||||
uses the same header structure as for posts.
|
uses the same header structure as for posts.
|
||||||
|
|
||||||
The application must be offline-first, everything must work in airplane mode (except publishing of course).
|
The application must be offline-first, everything must work in airplane mode (except exporting of course).
|
||||||
It must be fully self-contained during editing and previewing and managing content. Every internal structure
|
It must be fully self-contained during editing and previewing and managing content. Every internal structure
|
||||||
must have reflections in the filesystem, so available tags, available categories, all those things must be
|
must have reflections in the filesystem, so available tags, available categories, all those things must be
|
||||||
automatically reflected to the filesystem in a per-project way. Use a meta/ folder under the project folder
|
automatically reflected to the filesystem in a per-project way. Use a meta/ folder under the project folder
|
||||||
|
|||||||
7
package-lock.json
generated
7
package-lock.json
generated
@@ -31,6 +31,7 @@
|
|||||||
"react-dom": "^18.2.0",
|
"react-dom": "^18.2.0",
|
||||||
"react-hot-toast": "^2.6.0",
|
"react-hot-toast": "^2.6.0",
|
||||||
"sharp": "^0.34.5",
|
"sharp": "^0.34.5",
|
||||||
|
"snowball-stemmers": "^0.6.0",
|
||||||
"tiptap-markdown": "^0.9.0",
|
"tiptap-markdown": "^0.9.0",
|
||||||
"uuid": "^9.0.1",
|
"uuid": "^9.0.1",
|
||||||
"zustand": "^4.4.7"
|
"zustand": "^4.4.7"
|
||||||
@@ -10994,6 +10995,12 @@
|
|||||||
"npm": ">= 3.0.0"
|
"npm": ">= 3.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/snowball-stemmers": {
|
||||||
|
"version": "0.6.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/snowball-stemmers/-/snowball-stemmers-0.6.0.tgz",
|
||||||
|
"integrity": "sha512-47VTJvsZzEme3VxhbUFc9wR0CX8fYKNMgL7AiBdJcHUrwbmN7htFroeX+8ESIAaoNgwEgwI2wnv1HIcqQqAdcA==",
|
||||||
|
"license": "ISC"
|
||||||
|
},
|
||||||
"node_modules/source-map": {
|
"node_modules/source-map": {
|
||||||
"version": "0.6.1",
|
"version": "0.6.1",
|
||||||
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
||||||
|
|||||||
@@ -74,6 +74,7 @@
|
|||||||
"react-dom": "^18.2.0",
|
"react-dom": "^18.2.0",
|
||||||
"react-hot-toast": "^2.6.0",
|
"react-hot-toast": "^2.6.0",
|
||||||
"sharp": "^0.34.5",
|
"sharp": "^0.34.5",
|
||||||
|
"snowball-stemmers": "^0.6.0",
|
||||||
"tiptap-markdown": "^0.9.0",
|
"tiptap-markdown": "^0.9.0",
|
||||||
"uuid": "^9.0.1",
|
"uuid": "^9.0.1",
|
||||||
"zustand": "^4.4.7"
|
"zustand": "^4.4.7"
|
||||||
|
|||||||
@@ -318,18 +318,38 @@ export class DatabaseConnection {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Create FTS5 virtual table for full-text search
|
// Create FTS5 virtual table for full-text search
|
||||||
|
// Only stores: id (unindexed, for lookups) and content (stemmed text for matching)
|
||||||
|
// Post data for display comes from the posts table or filesystem files
|
||||||
await this.localClient.execute(`
|
await this.localClient.execute(`
|
||||||
CREATE VIRTUAL TABLE IF NOT EXISTS posts_fts USING fts5(
|
CREATE VIRTUAL TABLE IF NOT EXISTS posts_fts USING fts5(
|
||||||
id UNINDEXED,
|
id UNINDEXED,
|
||||||
title,
|
|
||||||
content,
|
content,
|
||||||
excerpt,
|
|
||||||
tags,
|
|
||||||
categories,
|
|
||||||
content_rowid=rowid
|
content_rowid=rowid
|
||||||
);
|
);
|
||||||
`);
|
`);
|
||||||
|
|
||||||
|
// Migration: Check if old FTS schema (with multiple columns) exists and recreate
|
||||||
|
// Old schema had: id, title, content, excerpt, tags, categories, content_stemmed
|
||||||
|
// New schema has: id, content (stemmed only)
|
||||||
|
try {
|
||||||
|
// Try to query old columns - if they exist, we need to migrate
|
||||||
|
await this.localClient.execute("SELECT title FROM posts_fts LIMIT 0");
|
||||||
|
|
||||||
|
// Old schema exists - recreate with new simple schema
|
||||||
|
console.log('Migrating posts_fts table to simplified schema...');
|
||||||
|
await this.localClient.execute('DROP TABLE IF EXISTS posts_fts');
|
||||||
|
await this.localClient.execute(`
|
||||||
|
CREATE VIRTUAL TABLE posts_fts USING fts5(
|
||||||
|
id UNINDEXED,
|
||||||
|
content,
|
||||||
|
content_rowid=rowid
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
console.log('FTS table migrated - rebuild index required');
|
||||||
|
} catch {
|
||||||
|
// Old columns don't exist - we have the new schema or no data, all good
|
||||||
|
}
|
||||||
|
|
||||||
// Create default project if none exists
|
// Create default project if none exists
|
||||||
const existingProjects = await this.localClient.execute('SELECT COUNT(*) as count FROM projects');
|
const existingProjects = await this.localClient.execute('SELECT COUNT(*) as count FROM projects');
|
||||||
if (existingProjects.rows[0] && (existingProjects.rows[0].count as number) === 0) {
|
if (existingProjects.rows[0] && (existingProjects.rows[0].count as number) === 0) {
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import { app } from 'electron';
|
|||||||
import { getDatabase } from '../database';
|
import { getDatabase } from '../database';
|
||||||
import { posts, Post, NewPost, postLinks } from '../database/schema';
|
import { posts, Post, NewPost, postLinks } from '../database/schema';
|
||||||
import { taskManager, Task } from './TaskManager';
|
import { taskManager, Task } from './TaskManager';
|
||||||
|
import { stemText, stemQuery, SupportedLanguage } from './stemmer';
|
||||||
|
|
||||||
export interface PostData {
|
export interface PostData {
|
||||||
id: string;
|
id: string;
|
||||||
@@ -46,8 +47,6 @@ export interface SearchResult {
|
|||||||
title: string;
|
title: string;
|
||||||
slug: string;
|
slug: string;
|
||||||
excerpt?: string;
|
excerpt?: string;
|
||||||
matchSnippet?: string;
|
|
||||||
rank?: number;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface PostFilter {
|
export interface PostFilter {
|
||||||
@@ -73,11 +72,74 @@ export interface PaginationOptions {
|
|||||||
|
|
||||||
export class PostEngine extends EventEmitter {
|
export class PostEngine extends EventEmitter {
|
||||||
private currentProjectId: string = 'default';
|
private currentProjectId: string = 'default';
|
||||||
|
private searchLanguage: SupportedLanguage = 'english';
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super();
|
super();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Selects the language used to stem text for full-text search.
 *
 * The value is read by later index writes and by search queries on this
 * engine. Rows already in the index keep the stemming they were written
 * with, so call `rebuildFTSIndex()` after switching languages.
 *
 * @param language - Stemmer language to use from now on
 */
setSearchLanguage(language: SupportedLanguage): void {
  this.searchLanguage = language;
}
|
||||||
|
|
||||||
|
/**
 * Reports which language is currently used to stem indexed text and queries.
 *
 * @returns The active stemmer language
 */
getSearchLanguage(): SupportedLanguage {
  return this.searchLanguage;
}
|
||||||
|
|
||||||
|
/**
 * Replaces the full-text-search row for one post.
 *
 * Title, excerpt, content, tags and categories are concatenated, stemmed with
 * the engine's current search language, and stored as a single `content`
 * value next to the post id. Any row already stored for the same id is
 * removed first. Does nothing when no local database client is available.
 *
 * @param post - Searchable fields of the post to index
 */
private async updateFTSIndex(post: {
  id: string;
  title: string;
  content: string;
  excerpt?: string;
  tags: string[];
  categories: string[];
}): Promise<void> {
  const localDb = getDatabase().getLocalClient();
  if (!localDb) {
    return;
  }

  const { id, title, excerpt, content, tags, categories } = post;

  // Remove the stale row so the insert below never produces a duplicate
  await localDb.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });

  // Everything searchable goes into one string; field boundaries are not kept
  const searchableParts = [title, excerpt || '', content, tags.join(' '), categories.join(' ')];
  const stemmed = stemText(searchableParts.join(' '), this.searchLanguage);

  await localDb.execute({
    sql: 'INSERT INTO posts_fts (id, content) VALUES (?, ?)',
    args: [id, stemmed],
  });
}
|
||||||
|
|
||||||
|
/**
 * Removes a post's row from the full-text-search index.
 * Does nothing when no local database client is available.
 *
 * @param id - Id of the post whose index row should be removed
 */
private async deleteFTSIndex(id: string): Promise<void> {
  const localDb = getDatabase().getLocalClient();
  if (localDb) {
    await localDb.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
  }
}
|
||||||
|
|
||||||
private getPostsBaseDir(): string {
|
private getPostsBaseDir(): string {
|
||||||
const userDataPath = app.getPath('userData');
|
const userDataPath = app.getPath('userData');
|
||||||
return path.join(userDataPath, 'projects', this.currentProjectId, 'posts');
|
return path.join(userDataPath, 'projects', this.currentProjectId, 'posts');
|
||||||
@@ -289,12 +351,7 @@ export class PostEngine extends EventEmitter {
|
|||||||
await db.insert(posts).values(dbPost);
|
await db.insert(posts).values(dbPost);
|
||||||
|
|
||||||
// Update FTS index
|
// Update FTS index
|
||||||
if (client) {
|
await this.updateFTSIndex(post);
|
||||||
await client.execute({
|
|
||||||
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
|
|
||||||
args: [post.id, post.title, post.content, post.excerpt || '', post.tags.join(' '), post.categories.join(' ')],
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
this.emit('postCreated', post);
|
this.emit('postCreated', post);
|
||||||
return post;
|
return post;
|
||||||
@@ -369,13 +426,7 @@ export class PostEngine extends EventEmitter {
|
|||||||
.where(eq(posts.id, id));
|
.where(eq(posts.id, id));
|
||||||
|
|
||||||
// Update FTS index
|
// Update FTS index
|
||||||
if (client) {
|
await this.updateFTSIndex(updated);
|
||||||
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
|
|
||||||
await client.execute({
|
|
||||||
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
|
|
||||||
args: [updated.id, updated.title, updated.content, updated.excerpt || '', updated.tags.join(' '), updated.categories.join(' ')],
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update post links if content changed
|
// Update post links if content changed
|
||||||
if (data.content) {
|
if (data.content) {
|
||||||
@@ -412,9 +463,7 @@ export class PostEngine extends EventEmitter {
|
|||||||
await db.delete(posts).where(eq(posts.id, id));
|
await db.delete(posts).where(eq(posts.id, id));
|
||||||
|
|
||||||
// Delete from FTS index
|
// Delete from FTS index
|
||||||
if (client) {
|
await this.deleteFTSIndex(id);
|
||||||
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
|
|
||||||
}
|
|
||||||
|
|
||||||
this.emit('postDeleted', id);
|
this.emit('postDeleted', id);
|
||||||
return true;
|
return true;
|
||||||
@@ -618,28 +667,34 @@ export class PostEngine extends EventEmitter {
|
|||||||
if (!client) return [];
|
if (!client) return [];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
// Stem the query for multilingual matching
|
||||||
|
const stemmedQuery = stemQuery(query, this.searchLanguage);
|
||||||
|
|
||||||
|
// Search the stemmed content, only return post IDs
|
||||||
const result = await client.execute({
|
const result = await client.execute({
|
||||||
sql: `SELECT id, title, excerpt, snippet(posts_fts, 2, '<mark>', '</mark>', '...', 32) as snippet, rank
|
sql: `SELECT id FROM posts_fts WHERE posts_fts MATCH ? ORDER BY rank LIMIT 50`,
|
||||||
FROM posts_fts
|
args: [stemmedQuery],
|
||||||
WHERE posts_fts MATCH ?
|
|
||||||
ORDER BY rank
|
|
||||||
LIMIT 50`,
|
|
||||||
args: [query],
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Filter to current project and fetch actual post data
|
||||||
const projectPosts = await this.getAllPostsUnpaginated();
|
const projectPosts = await this.getAllPostsUnpaginated();
|
||||||
const projectPostIds = new Set(projectPosts.map(p => p.id));
|
const projectPostMap = new Map(projectPosts.map(p => [p.id, p]));
|
||||||
|
|
||||||
return result.rows
|
const searchResults: SearchResult[] = [];
|
||||||
.filter(row => projectPostIds.has(row.id as string))
|
for (const row of result.rows) {
|
||||||
.map(row => ({
|
const postId = row.id as string;
|
||||||
id: row.id as string,
|
const post = projectPostMap.get(postId);
|
||||||
title: row.title as string,
|
if (post) {
|
||||||
slug: '', // Will be filled in by caller if needed
|
searchResults.push({
|
||||||
excerpt: row.excerpt as string | undefined,
|
id: post.id,
|
||||||
matchSnippet: row.snippet as string | undefined,
|
title: post.title,
|
||||||
rank: row.rank as number | undefined,
|
slug: post.slug,
|
||||||
}));
|
excerpt: post.excerpt,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return searchResults;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Search failed:', error);
|
console.error('Search failed:', error);
|
||||||
return [];
|
return [];
|
||||||
@@ -816,13 +871,7 @@ export class PostEngine extends EventEmitter {
|
|||||||
.where(eq(posts.id, id));
|
.where(eq(posts.id, id));
|
||||||
|
|
||||||
// Update FTS index
|
// Update FTS index
|
||||||
if (client) {
|
await this.updateFTSIndex(published);
|
||||||
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
|
|
||||||
await client.execute({
|
|
||||||
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
|
|
||||||
args: [published.id, published.title, published.content, published.excerpt || '', published.tags.join(' '), published.categories.join(' ')],
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update post links based on published content
|
// Update post links based on published content
|
||||||
await this.updatePostLinks(id, published.content);
|
await this.updatePostLinks(id, published.content);
|
||||||
@@ -886,13 +935,7 @@ export class PostEngine extends EventEmitter {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Update FTS index
|
// Update FTS index
|
||||||
if (client) {
|
await this.updateFTSIndex(reverted);
|
||||||
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
|
|
||||||
await client.execute({
|
|
||||||
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
|
|
||||||
args: [reverted.id, reverted.title, reverted.content, reverted.excerpt || '', reverted.tags.join(' '), reverted.categories.join(' ')],
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
this.emit('postUpdated', reverted);
|
this.emit('postUpdated', reverted);
|
||||||
return reverted;
|
return reverted;
|
||||||
@@ -949,18 +992,29 @@ export class PostEngine extends EventEmitter {
|
|||||||
.where(eq(posts.id, id));
|
.where(eq(posts.id, id));
|
||||||
|
|
||||||
// Update FTS index
|
// Update FTS index
|
||||||
if (client) {
|
await this.updateFTSIndex(updated);
|
||||||
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [id] });
|
|
||||||
await client.execute({
|
|
||||||
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
|
|
||||||
args: [updated.id, updated.title, updated.content, updated.excerpt || '', updated.tags.join(' '), updated.categories.join(' ')],
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
this.emit('postUpdated', updated);
|
this.emit('postUpdated', updated);
|
||||||
return updated;
|
return updated;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Re-indexes every post returned by `getAllPostsUnpaginated()` using the
 * current search language.
 *
 * Intended to be run after the search language changes or after the FTS
 * table has been recreated. Does nothing when no local database client is
 * available.
 */
async rebuildFTSIndex(): Promise<void> {
  if (!getDatabase().getLocalClient()) {
    return;
  }

  const postsToIndex = await this.getAllPostsUnpaginated();

  // Sequential on purpose: each call does its own delete followed by insert
  for (const entry of postsToIndex) {
    await this.updateFTSIndex(entry);
  }

  console.log(`Rebuilt FTS index for ${postsToIndex.length} posts`);
}
|
||||||
|
|
||||||
async rebuildDatabaseFromFiles(): Promise<void> {
|
async rebuildDatabaseFromFiles(): Promise<void> {
|
||||||
const postsBaseDir = this.getPostsBaseDir();
|
const postsBaseDir = this.getPostsBaseDir();
|
||||||
const task: Task<void> = {
|
const task: Task<void> = {
|
||||||
@@ -980,10 +1034,8 @@ export class PostEngine extends EventEmitter {
|
|||||||
if (existingPosts.length > 0) {
|
if (existingPosts.length > 0) {
|
||||||
const postIds = existingPosts.map(p => p.id);
|
const postIds = existingPosts.map(p => p.id);
|
||||||
// Delete FTS entries first
|
// Delete FTS entries first
|
||||||
if (client) {
|
for (const post of existingPosts) {
|
||||||
for (const post of existingPosts) {
|
await this.deleteFTSIndex(post.id);
|
||||||
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [post.id] });
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Delete post links where source or target is in the posts being deleted
|
// Delete post links where source or target is in the posts being deleted
|
||||||
await db.delete(postLinks).where(inArray(postLinks.sourcePostId, postIds));
|
await db.delete(postLinks).where(inArray(postLinks.sourcePostId, postIds));
|
||||||
@@ -1069,13 +1121,7 @@ export class PostEngine extends EventEmitter {
|
|||||||
insertedSlugs.set(slugKey, filePath);
|
insertedSlugs.set(slugKey, filePath);
|
||||||
|
|
||||||
// Update FTS index (use file content for search)
|
// Update FTS index (use file content for search)
|
||||||
if (client) {
|
await this.updateFTSIndex(postData);
|
||||||
await client.execute({ sql: 'DELETE FROM posts_fts WHERE id = ?', args: [postData.id] });
|
|
||||||
await client.execute({
|
|
||||||
sql: 'INSERT INTO posts_fts (id, title, content, excerpt, tags, categories) VALUES (?, ?, ?, ?, ?, ?)',
|
|
||||||
args: [postData.id, postData.title, postData.content, postData.excerpt || '', postData.tags.join(' '), postData.categories.join(' ')],
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
// Handle constraint violations and other errors gracefully
|
// Handle constraint violations and other errors gracefully
|
||||||
if (error?.code === 'SQLITE_CONSTRAINT_UNIQUE') {
|
if (error?.code === 'SQLITE_CONSTRAINT_UNIQUE') {
|
||||||
|
|||||||
@@ -4,6 +4,14 @@ export { MediaEngine, getMediaEngine, type MediaData } from './MediaEngine';
|
|||||||
export { SyncEngine, getSyncEngine, type SyncConfig, type SyncResult, type SyncDirection, type SyncStatus } from './SyncEngine';
|
export { SyncEngine, getSyncEngine, type SyncConfig, type SyncResult, type SyncDirection, type SyncStatus } from './SyncEngine';
|
||||||
export { ProjectEngine, getProjectEngine, type ProjectData } from './ProjectEngine';
|
export { ProjectEngine, getProjectEngine, type ProjectData } from './ProjectEngine';
|
||||||
export { MetaEngine, getMetaEngine } from './MetaEngine';
|
export { MetaEngine, getMetaEngine } from './MetaEngine';
|
||||||
|
export {
|
||||||
|
stemText,
|
||||||
|
stemWord,
|
||||||
|
stemQuery,
|
||||||
|
prepareForFTS,
|
||||||
|
getSupportedLanguages,
|
||||||
|
type SupportedLanguage,
|
||||||
|
} from './stemmer';
|
||||||
export {
|
export {
|
||||||
DropboxSyncEngine,
|
DropboxSyncEngine,
|
||||||
getDropboxSyncEngine,
|
getDropboxSyncEngine,
|
||||||
|
|||||||
183
src/main/engine/stemmer.ts
Normal file
183
src/main/engine/stemmer.ts
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
/**
|
||||||
|
* Multilingual text stemming utility using Snowball stemmers.
|
||||||
|
* Used to normalize text before indexing in FTS5 and before searching.
|
||||||
|
*
|
||||||
|
* Supports 24 languages including: English, German, French, Spanish, Italian,
|
||||||
|
* Portuguese, Dutch, Russian, Arabic, and more.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
||||||
|
const snowballFactory = require('snowball-stemmers');
|
||||||
|
|
||||||
|
/**
 * Names of the stemming algorithms this module accepts.
 *
 * NOTE(review): presumably mirrors what `snowballFactory.algorithms()`
 * reports at runtime — confirm when upgrading the dependency.
 */
export type SupportedLanguage =
  | 'arabic'
  | 'armenian'
  | 'basque'
  | 'catalan'
  | 'czech'
  | 'danish'
  | 'dutch'
  | 'english'
  | 'finnish'
  | 'french'
  | 'german'
  | 'hungarian'
  | 'italian'
  | 'irish'
  | 'norwegian'
  | 'porter'
  | 'portuguese'
  | 'romanian'
  | 'russian'
  | 'spanish'
  | 'slovene'
  | 'swedish'
  | 'tamil'
  | 'turkish';
|
||||||
|
|
||||||
|
/** Minimal shape this module relies on from a stemmer instance. */
interface Stemmer {
  /** Reduces a single word to its stem. */
  stem(word: string): string;
}
|
||||||
|
|
||||||
|
// Stemmer instances created so far, keyed by language, so each is built once
const stemmerCache = new Map<SupportedLanguage, Stemmer>();

/**
 * Returns the stemmer for `language`, creating and caching it on first use.
 *
 * @param language - Language whose stemmer is wanted
 * @returns The cached stemmer instance for that language
 */
function getStemmer(language: SupportedLanguage): Stemmer {
  const cached = stemmerCache.get(language);
  if (cached) {
    return cached;
  }

  const created = snowballFactory.newStemmer(language) as Stemmer;
  stemmerCache.set(language, created);
  return created;
}
|
||||||
|
|
||||||
|
/**
 * Lists the algorithm names reported by the stemmer library.
 *
 * @returns Whatever `snowballFactory.algorithms()` returns, typed as languages
 */
export function getSupportedLanguages(): SupportedLanguage[] {
  const algorithmNames = snowballFactory.algorithms() as SupportedLanguage[];
  return algorithmNames;
}
|
||||||
|
|
||||||
|
/**
 * Splits text into words.
 *
 * A word is a maximal run of Unicode letters, combining marks and numbers;
 * every other character (whitespace, punctuation, symbols, underscore) acts
 * as a separator and is discarded. Original casing is preserved.
 *
 * @param text - Text to split
 * @returns The words in order of appearance; empty when there are none
 */
function tokenize(text: string): string[] {
  const tokens: string[] = [];
  // Letters, marks and numbers in any script (e.g. häuser, привет)
  const wordRun = /[\p{L}\p{M}\p{N}]+/gu;

  for (let hit = wordRun.exec(text); hit !== null; hit = wordRun.exec(text)) {
    tokens.push(hit[0]);
  }

  return tokens;
}
|
||||||
|
|
||||||
|
/**
 * Lower-cases one word and reduces it to its stem.
 *
 * @param word - The word to stem
 * @param language - The language to use for stemming (default: 'english')
 * @returns The stem of the lower-cased word
 */
export function stemWord(word: string, language: SupportedLanguage = 'english'): string {
  return getStemmer(language).stem(word.toLowerCase());
}
|
||||||
|
|
||||||
|
/**
 * Converts free text into a space-separated string of lower-cased word stems.
 *
 * The text is split into words, each word is lower-cased and stemmed, and the
 * stems are joined with single spaces. Punctuation and original spacing are
 * not preserved.
 *
 * @param text - The text to stem
 * @param language - The language to use for stemming (default: 'english')
 * @returns The stems joined by spaces; '' for empty input
 *
 * @example
 * stemText('Running runners run', 'english') // 'run runner run'
 * stemText('Häuser Haus', 'german') // 'haus haus'
 */
export function stemText(text: string, language: SupportedLanguage = 'english'): string {
  if (!text) {
    return '';
  }

  const tokens = tokenize(text);
  const stemmer = getStemmer(language);

  const stems: string[] = [];
  for (const token of tokens) {
    stems.push(stemmer.stem(token.toLowerCase()));
  }

  return stems.join(' ');
}
|
||||||
|
|
||||||
|
/**
 * Rewrites a user search query so its terms match the stemmed text stored in
 * the FTS index.
 *
 * The query is processed one double-quoted phrase or one whitespace-separated
 * token at a time:
 * - `"quoted phrases"` have every word stemmed and stay quoted.
 * - `AND`, `OR` and `NOT` (recognised in any letter case) are emitted in
 *   upper case.
 * - `word*` keeps its trailing `*` after the word part is stemmed.
 * - Anything else is stemmed as a plain term.
 *
 * A token containing punctuation splits into several words when tokenized
 * (`well-known` gives `well` and `known`). All of those words are kept and
 * emitted together as one quoted phrase, which mirrors how `stemText` lays
 * the same text out in the index. Phrases and tokens that contain no word
 * characters at all are dropped from the result.
 *
 * @param query - The search query from the user
 * @param language - The language for stemming (default: 'english')
 * @returns Query with words stemmed for FTS5; '' when nothing searchable is left
 *
 * @example
 * stemQuery('running dogs', 'english')    // 'run dog'
 * stemQuery('"running fast"', 'english')  // '"run fast"'
 * stemQuery('well-known', 'english')      // '"well known"'
 */
export function stemQuery(query: string, language: SupportedLanguage = 'english'): string {
  if (!query) return '';

  const stemmer = getStemmer(language);

  // Stem every word found in one fragment of the query
  const stemWords = (fragment: string): string[] =>
    tokenize(fragment).map(word => stemmer.stem(word.toLowerCase()));

  // One stem is a bare term; several stems must stay adjacent, so quote them
  const asTerm = (stems: string[]): string => {
    const joined = stems.join(' ');
    return stems.length > 1 ? `"${joined}"` : joined;
  };

  const result = query.replace(
    /"([^"]+)"|(\S+)/g,
    (_match, quoted, unquoted) => {
      if (quoted) {
        // Quoted phrase: keep the quotes unless nothing searchable is inside
        const stems = stemWords(quoted);
        return stems.length > 0 ? `"${stems.join(' ')}"` : '';
      }

      // Boolean operators pass through untouched (normalised to upper case)
      const upper = unquoted.toUpperCase();
      if (upper === 'AND' || upper === 'OR' || upper === 'NOT') {
        return upper;
      }

      // Prefix search: stem the word part, then re-attach the '*'
      if (unquoted.endsWith('*')) {
        const stems = stemWords(unquoted.slice(0, -1));
        return stems.length > 0 ? `${asTerm(stems)}*` : '';
      }

      // Regular token: every word it contains is kept
      return asTerm(stemWords(unquoted));
    }
  );

  // Dropped fragments leave gaps behind; collapse them
  return result.replace(/\s+/g, ' ').trim();
}
|
||||||
|
|
||||||
|
/**
 * Produces the text to store in the FTS index for a piece of content.
 *
 * This is a thin wrapper around `stemText`: only the stemmed words are
 * returned, and the original wording is not kept alongside them.
 *
 * @param text - The original content
 * @param language - The language for stemming (default: 'english')
 * @returns Stemmed text for FTS5 indexing
 */
export function prepareForFTS(text: string, language: SupportedLanguage = 'english'): string {
  const indexable = stemText(text, language);
  return indexable;
}
|
||||||
1
src/renderer/types/electron.d.ts
vendored
1
src/renderer/types/electron.d.ts
vendored
@@ -41,7 +41,6 @@ export interface SearchResult {
|
|||||||
title: string;
|
title: string;
|
||||||
slug: string;
|
slug: string;
|
||||||
excerpt?: string;
|
excerpt?: string;
|
||||||
score: number;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface MediaData {
|
export interface MediaData {
|
||||||
|
|||||||
Reference in New Issue
Block a user