feat: proper multi-language stemming

2026-02-11 09:39:30 +01:00
parent e66c64350f
commit 0b5efbb5e1
8 changed files with 337 additions and 73 deletions
--- a/src/main/engine/stemmer.ts
+++ b/src/main/engine/stemmer.ts
@@ -0,0 +1,183 @@
+/**
+ * Multilingual text stemming utility using Snowball stemmers.
+ * Used to normalize text before indexing in FTS5 and before searching.
+ * 
+ * Supports the 24 Snowball algorithms listed in `SupportedLanguage`, including:
+ * English, German, French, Spanish, Italian, Portuguese, Dutch, Russian, and Arabic.
+ */
+
+// eslint-disable-next-line @typescript-eslint/no-var-requires
+const snowballFactory = require('snowball-stemmers');
+
+/**
+ * Algorithm names this module hands to `snowballFactory.newStemmer()`.
+ *
+ * Most entries are language names; `porter` is an algorithm name rather than a
+ * language (presumably the classic Porter English stemmer — confirm against the
+ * snowball-stemmers documentation).
+ */
+export type SupportedLanguage =
+  | 'arabic'
+  | 'armenian'
+  | 'basque'
+  | 'catalan'
+  | 'czech'
+  | 'danish'
+  | 'dutch'
+  | 'english'
+  | 'finnish'
+  | 'french'
+  | 'german'
+  | 'hungarian'
+  | 'italian'
+  | 'irish'
+  | 'norwegian'
+  | 'porter'
+  | 'portuguese'
+  | 'romanian'
+  | 'russian'
+  | 'spanish'
+  | 'slovene'
+  | 'swedish'
+  | 'tamil'
+  | 'turkish';
+
+/**
+ * Minimal shape of the objects returned by `snowballFactory.newStemmer()`, as
+ * far as this module uses them: only `stem` is ever called.
+ */
+interface Stemmer {
+  /** Reduce a single word to its stem. Callers in this module always pass a lower-cased word. */
+  stem(word: string): string;
+}
+
+// Cache stemmers to avoid recreating them.
+// Keyed by algorithm name; filled lazily by getStemmer() the first time a language is requested.
+const stemmerCache = new Map<SupportedLanguage, Stemmer>();
+
+/**
+ * Look up the stemmer for `language`, building it on first use.
+ *
+ * A stemmer that is already in `stemmerCache` is returned as-is; otherwise a
+ * new one is requested from the Snowball factory, remembered in the cache and
+ * returned.
+ *
+ * @param language - Snowball algorithm name
+ * @returns The (possibly freshly created) stemmer for that algorithm
+ */
+function getStemmer(language: SupportedLanguage): Stemmer {
+  const cached = stemmerCache.get(language);
+  if (cached) {
+    return cached;
+  }
+
+  const created = snowballFactory.newStemmer(language) as Stemmer;
+  stemmerCache.set(language, created);
+  return created;
+}
+
+/**
+ * List the algorithm names reported by the Snowball factory.
+ *
+ * The library's answer is returned unchanged; it is asserted to be
+ * `SupportedLanguage[]` without any runtime validation.
+ */
+export function getSupportedLanguages(): SupportedLanguage[] {
+  const algorithmNames = snowballFactory.algorithms() as SupportedLanguage[];
+  return algorithmNames;
+}
+
+/**
+ * Split text into word tokens.
+ *
+ * A token is a maximal run of Unicode letters, combining marks and numbers, so
+ * non-ASCII words (German "häuser", Russian "привет", Arabic, ...) stay intact.
+ * Everything else — whitespace, punctuation, symbols, underscores — acts as a
+ * separator and is discarded. Case is left untouched.
+ *
+ * @returns The tokens in order of appearance; an empty array when there are none
+ */
+function tokenize(text: string): string[] {
+  const wordPattern = /[\p{L}\p{M}\p{N}]+/gu;
+  const tokens: string[] = [];
+
+  // exec() with the g flag walks the input one match at a time.
+  let hit: RegExpExecArray | null;
+  while ((hit = wordPattern.exec(text)) !== null) {
+    tokens.push(hit[0]);
+  }
+  return tokens;
+}
+
+/**
+ * Reduce one word to its stem.
+ *
+ * The word is lower-cased before it is handed to the stemmer; no tokenizing is
+ * done, so `word` is expected to be a single token already.
+ *
+ * @param word - The word to stem
+ * @param language - Snowball algorithm to use (default: 'english')
+ * @returns The stem of the lower-cased word
+ */
+export function stemWord(word: string, language: SupportedLanguage = 'english'): string {
+  return getStemmer(language).stem(word.toLowerCase());
+}
+
+/**
+ * Replace every word of a text by its stem.
+ *
+ * The text is tokenized (punctuation and whitespace are dropped), each token
+ * is lower-cased and stemmed, and the stems are joined with single spaces.
+ *
+ * @param text - The text to stem; an empty string yields ''
+ * @param language - Snowball algorithm to use (default: 'english')
+ * @returns The space-separated stems, in the order the words appeared
+ *
+ * @example
+ * stemText('Running runners run', 'english') // 'run runner run'
+ * stemText('Häuser Haus', 'german') // 'haus haus'
+ */
+export function stemText(text: string, language: SupportedLanguage = 'english'): string {
+  if (!text) {
+    return '';
+  }
+
+  const tokens = tokenize(text);
+  const stemmer = getStemmer(language);
+
+  const stems: string[] = [];
+  for (const token of tokens) {
+    stems.push(stemmer.stem(token.toLowerCase()));
+  }
+  return stems.join(' ');
+}
+
+/**
+ * Prepare a user search query for an FTS5 index whose content was produced by
+ * `stemText`, by stemming the query's words the same way.
+ *
+ * The query is read as whitespace-separated tokens and double-quoted phrases:
+ * - Quoted phrases: every word is stemmed and the result stays quoted.
+ *   A phrase without any word characters is dropped instead of becoming `""`.
+ * - `AND`, `OR`, `NOT` (matched case-insensitively, so a lower-case "and" is
+ *   also treated as an operator) are emitted upper-cased and never stemmed.
+ * - Unquoted tokens: stemmed. A token that splits into several words
+ *   (`well-known`, `user@example.com`) becomes a quoted phrase of all its
+ *   stems, mirroring how `stemText` indexes it, instead of keeping only the
+ *   first word.
+ * - Prefix searches: a trailing `*` is kept on the stemmed term; for a
+ *   multi-word token or phrase it applies to the last word (`"foo ba"*`).
+ *   A lone `*` is attached to the term right before it when there is one and
+ *   is dropped otherwise, so it can never start a query or follow an operator.
+ * - Tokens made only of punctuation are dropped.
+ *
+ * Other FTS5 syntax (parentheses, NEAR, column filters, `^`) is not preserved:
+ * the punctuation is removed by tokenizing and the remaining words are stemmed.
+ *
+ * @param query - The search query from the user
+ * @param language - The language for stemming (default: 'english')
+ * @returns Query with words stemmed for FTS5, parts separated by single spaces;
+ *          '' when the query is empty or contains no words
+ *
+ * @example
+ * stemQuery('running dogs', 'english') // 'run dog'
+ * stemQuery('"running fast"', 'english') // '"run fast"'
+ * stemQuery('well-known', 'english') // '"well known"'
+ */
+export function stemQuery(query: string, language: SupportedLanguage = 'english'): string {
+  if (!query) return '';
+
+  const stemmer = getStemmer(language);
+
+  // Stem every word of a fragment exactly like stemText() does for indexed content.
+  const stemAll = (fragment: string): string[] =>
+    tokenize(fragment).map(word => stemmer.stem(word.toLowerCase()));
+
+  const parts: string[] = [];
+  // True while the last emitted part is a term that may still receive a detached '*'.
+  let lastCanTakeStar = false;
+
+  // Either a complete "quoted phrase" or a run of non-whitespace characters.
+  const tokenPattern = /"([^"]+)"|(\S+)/g;
+  let hit: RegExpExecArray | null;
+  while ((hit = tokenPattern.exec(query)) !== null) {
+    const quoted: string | undefined = hit[1];
+    const unquoted: string | undefined = hit[2];
+
+    if (quoted !== undefined) {
+      const phraseStems = stemAll(quoted);
+      if (phraseStems.length > 0) {
+        parts.push(`"${phraseStems.join(' ')}"`);
+        lastCanTakeStar = true;
+      } else {
+        // Nothing searchable inside the quotes: emit nothing rather than "".
+        lastCanTakeStar = false;
+      }
+      continue;
+    }
+    if (unquoted === undefined) continue;
+
+    // FTS5 operators pass through upper-cased and unstemmed.
+    const upper = unquoted.toUpperCase();
+    if (upper === 'AND' || upper === 'OR' || upper === 'NOT') {
+      parts.push(upper);
+      lastCanTakeStar = false;
+      continue;
+    }
+
+    // A lone '*' (e.g. right after a closing quote, or separated by a space)
+    // marks the preceding term as a prefix; without such a term it is invalid
+    // FTS5 syntax and is dropped.
+    if (unquoted === '*') {
+      const previous = lastCanTakeStar ? parts.pop() : undefined;
+      if (previous !== undefined) parts.push(`${previous}*`);
+      lastCanTakeStar = false;
+      continue;
+    }
+
+    // tokenize() ignores '*', so the star does not need to be sliced off first.
+    const stems = stemAll(unquoted);
+    if (stems.length === 0) {
+      // Punctuation-only token: nothing to search for.
+      lastCanTakeStar = false;
+      continue;
+    }
+
+    const isPrefix = unquoted.endsWith('*');
+    const joined = stems.join(' ');
+    // Several stems from one token must stay adjacent, hence a phrase.
+    const term = stems.length > 1 ? `"${joined}"` : joined;
+    parts.push(isPrefix ? `${term}*` : term);
+    lastCanTakeStar = !isPrefix;
+  }
+
+  return parts.join(' ');
+}
+
+/**
+ * Prepare content for FTS indexing.
+ *
+ * Currently a thin wrapper around `stemText`: the result is the stemmed text
+ * only. The original text is NOT embedded in the result — the earlier idea of
+ * appending it behind a marker (so that snippet() could show the original
+ * words) is not implemented here.
+ *
+ * @param text - The original content
+ * @param language - The language for stemming (default: 'english')
+ * @returns Stemmed text for FTS5 indexing
+ */
+export function prepareForFTS(text: string, language: SupportedLanguage = 'english'): string {
+  const indexable = stemText(text, language);
+  return indexable;
+}