Add stemmer tests and improve MetaEngine/TagEngine branch coverage

Co-authored-by: rfc1437 <774975+rfc1437@users.noreply.github.com>
2026-02-15 10:46:04 +00:00
parent c33f7a903b
commit 73b5fa68fa
3 changed files with 598 additions and 0 deletions
--- a/tests/engine/stemmer.test.ts
+++ b/tests/engine/stemmer.test.ts
@@ -0,0 +1,354 @@
+/**
+ * Stemmer Unit Tests
+ *
+ * Tests the REAL stemmer functions without any mocks.
+ * The stemmer provides multilingual text stemming for FTS indexing.
+ * 
+ * Tests all branches including:
+ * - Various languages
+ * - ISO language code conversion
+ * - Empty/whitespace-only inputs
+ * - FTS5 query operators (AND, OR, NOT)
+ * - Quoted phrases
+ * - Prefix searches
+ * - Edge cases
+ */
+
+import { describe, it, expect, beforeEach } from 'vitest';
+import {
+  stemText,
+  stemQuery,
+  stemWord,
+  getSupportedLanguages,
+  isoToStemmerLanguage,
+  prepareForFTS,
+  SupportedLanguage,
+} from '../../src/main/engine/stemmer';
+
+describe('stemmer', () => {
+  describe('getSupportedLanguages', () => {
+    it('should return an array of supported languages', () => {
+      const supported = getSupportedLanguages();
+      // Shape first, then content: a non-empty array that lists English.
+      expect(Array.isArray(supported)).toBe(true);
+      expect(supported.length).toBeGreaterThan(0);
+      expect(supported).toContain('english');
+    });
+
+    it('should include common languages', () => {
+      const supported = getSupportedLanguages();
+      const common = ['german', 'french', 'spanish'] as const;
+      for (const language of common) {
+        expect(supported).toContain(language);
+      }
+    });
+  });
+
+  describe('isoToStemmerLanguage', () => {
+    // Checks each [input code, expected stemmer language] pair in order.
+    const expectMappings = (pairs: ReadonlyArray<readonly [string, string]>): void => {
+      for (const [code, language] of pairs) {
+        expect(isoToStemmerLanguage(code)).toBe(language);
+      }
+    };
+
+    it('should convert ISO 639-1 codes to stemmer language names', () => {
+      expectMappings([['en', 'english'], ['de', 'german'], ['fr', 'french'], ['es', 'spanish']]);
+    });
+
+    it('should handle locale codes with region (e.g., en-US)', () => {
+      expectMappings([['en-US', 'english'], ['de-DE', 'german'], ['fr-FR', 'french'], ['es-MX', 'spanish']]);
+    });
+
+    it('should be case insensitive', () => {
+      expectMappings([['EN', 'english'], ['De', 'german'], ['FR', 'french']]);
+    });
+
+    it('should return english for unknown language codes', () => {
+      expectMappings([['xx', 'english'], ['unknown', 'english'], ['', 'english']]);
+    });
+
+    it('should handle all mapped ISO codes', () => {
+      // One row per ISO code; the three Norwegian codes share a single stemmer.
+      expectMappings([
+        ['ar', 'arabic'],
+        ['hy', 'armenian'],
+        ['eu', 'basque'],
+        ['ca', 'catalan'],
+        ['cs', 'czech'],
+        ['da', 'danish'],
+        ['nl', 'dutch'],
+        ['fi', 'finnish'],
+        ['hu', 'hungarian'],
+        ['it', 'italian'],
+        ['ga', 'irish'],
+        ['no', 'norwegian'],
+        ['nb', 'norwegian'],
+        ['nn', 'norwegian'],
+        ['pt', 'portuguese'],
+        ['ro', 'romanian'],
+        ['ru', 'russian'],
+        ['sl', 'slovene'],
+        ['sv', 'swedish'],
+        ['ta', 'tamil'],
+        ['tr', 'turkish'],
+      ]);
+    });
+  });
+
+  describe('stemWord', () => {
+    it('should stem English words correctly', () => {
+      // Inflected forms are stemmed one by one and compared position by position.
+      const inflected = ['running', 'dogs', 'played', 'playing'];
+      const stems = inflected.map((word) => stemWord(word, 'english'));
+      expect(stems).toEqual(['run', 'dog', 'play', 'play']);
+    });
+
+    it('should stem German words correctly', () => {
+      const stems = ['häuser', 'Häuser'].map((word) => stemWord(word, 'german'));
+      expect(stems).toEqual(['haus', 'haus']);
+    });
+
+    it('should stem French words correctly', () => {
+      const stems = ['chanter', 'chanteuse'].map((word) => stemWord(word, 'french'));
+      expect(stems).toEqual(['chant', 'chanteux']);
+    });
+
+    it('should default to English when no language specified', () => {
+      const stems = ['running', 'dogs'].map((word) => stemWord(word));
+      expect(stems).toEqual(['run', 'dog']);
+    });
+
+    it('should handle uppercase words by converting to lowercase', () => {
+      const stems = ['RUNNING', 'DOGS'].map((word) => stemWord(word, 'english'));
+      expect(stems).toEqual(['run', 'dog']);
+    });
+  });
+
+  describe('stemText', () => {
+    // Stems are asserted as whole words (\b...\b): a plain substring check would
+    // also pass on unstemmed output, because 'running' contains 'run'.
+    it('should stem all words in a sentence', () => {
+      const result = stemText('Running dogs are playing', 'english');
+      expect(result).toMatch(/\brun\b/);
+      expect(result).toMatch(/\bdog\b/);
+    });
+
+    it('should return empty string for empty input', () => {
+      expect(stemText('', 'english')).toBe('');
+      expect(stemText('   ', 'english')).toBe('');
+    });
+
+    it('should return empty string for null/undefined-like empty text', () => {
+      // Only the empty string is exercised here; null/undefined are not passed.
+      expect(stemText('', 'english')).toBe('');
+    });
+
+    it('should handle multiple spaces between words', () => {
+      const result = stemText('Running   dogs   are   playing', 'english');
+      expect(result.split(' ')).not.toContain('');
+    });
+
+    it('should handle German text correctly', () => {
+      const result = stemText('Häuser Haus', 'german');
+      expect(result).toMatch(/\bhaus\b/);
+      expect(result).not.toContain('häuser'); // the plural must be reduced, not merely lowercased
+    });
+
+    it('should handle text with numbers', () => {
+      const result = stemText('Running 123 dogs', 'english');
+      expect(result).toMatch(/\brun\b/);
+      expect(result).toMatch(/\b123\b/);
+      expect(result).toMatch(/\bdog\b/);
+    });
+
+    it('should handle punctuation by extracting words', () => {
+      const result = stemText('Hello, world! How are you?', 'english');
+      expect(result).toMatch(/\bhello\b/);
+      expect(result).toMatch(/\bworld\b/);
+    });
+
+    it('should use default English language when not specified', () => {
+      const result = stemText('Running dogs');
+      expect(result).toMatch(/\brun\b/);
+      expect(result).toMatch(/\bdog\b/);
+    });
+
+    it('should handle Unicode characters for non-ASCII languages', () => {
+      const russianResult = stemText('привет мир', 'russian'); // Russian text
+      expect(russianResult.length).toBeGreaterThan(0);
+      const arabicResult = stemText('مرحبا', 'arabic'); // Arabic text
+      expect(arabicResult.length).toBeGreaterThan(0);
+    });
+  });
+
+  describe('stemQuery', () => {
+    // Stems are asserted as whole words (\b...\b): a plain substring check would
+    // also pass on an unstemmed query, because 'running' contains 'run'.
+    it('should stem simple queries', () => {
+      const result = stemQuery('running dogs', 'english');
+      expect(result).toMatch(/\brun\b/);
+      expect(result).toMatch(/\bdog\b/);
+    });
+
+    it('should return empty string for empty query', () => {
+      expect(stemQuery('', 'english')).toBe('');
+      expect(stemQuery('   ', 'english')).toBe('');
+    });
+
+    it('should preserve AND operator in uppercase', () => {
+      const result = stemQuery('running AND dogs', 'english');
+      expect(result).toContain('AND');
+      expect(result).toMatch(/\brun\b/);
+      expect(result).toMatch(/\bdog\b/);
+    });
+
+    it('should preserve OR operator in uppercase', () => {
+      const result = stemQuery('cats OR dogs', 'english');
+      expect(result).toContain('OR');
+      expect(result).toMatch(/\bcat\b/);
+      expect(result).toMatch(/\bdog\b/);
+    });
+
+    it('should preserve NOT operator in uppercase', () => {
+      const result = stemQuery('NOT dogs', 'english');
+      expect(result).toContain('NOT');
+      expect(result).toMatch(/\bdog\b/);
+    });
+
+    it('should handle lowercase operators by stemming them', () => {
+      // Lowercase 'and' should be stemmed as a regular word; its stem might be 'and' itself.
+      const andResult = stemQuery('and', 'english');
+      expect(andResult.length).toBeGreaterThan(0);
+    });
+
+    it('should stem words inside quoted phrases', () => {
+      const result = stemQuery('"running fast"', 'english');
+      expect(result).toContain('"');
+      expect(result).toMatch(/\brun\b/);
+      expect(result).toMatch(/\bfast\b/);
+    });
+
+    it('should keep quotes around stemmed phrase', () => {
+      const result = stemQuery('"running dogs"', 'english');
+      expect(result.startsWith('"')).toBe(true);
+      expect(result.endsWith('"')).toBe(true);
+    });
+
+    it('should handle prefix searches with asterisk', () => {
+      const result = stemQuery('runn*', 'english');
+      expect(result).toContain('*');
+      // The word part before * should be stemmed (substring check: the exact prefix stem is not pinned)
+      expect(result).toContain('run');
+    });
+
+    it('should handle prefix search when word results in empty after tokenization', () => {
+      // Edge case: a bare asterisk has no word part, so the original match is expected back.
+      const result = stemQuery('*', 'english');
+      expect(result).toBe('*');
+    });
+
+    it('should handle complex queries with multiple operators', () => {
+      const result = stemQuery('"running fast" AND dogs NOT cats', 'english');
+      expect(result).toContain('AND');
+      expect(result).toContain('NOT');
+      expect(result).toMatch(/\bdog\b/);
+      expect(result).toMatch(/\bcat\b/);
+    });
+
+    it('should clean up multiple spaces', () => {
+      const result = stemQuery('running    dogs', 'english');
+      expect(result).not.toContain('  ');
+    });
+
+    it('should use default English language when not specified', () => {
+      const result = stemQuery('running dogs');
+      expect(result).toMatch(/\brun\b/);
+      expect(result).toMatch(/\bdog\b/);
+    });
+
+    it('should handle unquoted words that tokenize to empty', () => {
+      // Special characters only
+      const result = stemQuery('!!!', 'english');
+      // Should result in empty string or just spaces
+      expect(result.trim()).toBe('');
+    });
+
+    it('should handle mixed quoted and unquoted terms', () => {
+      const result = stemQuery('dogs "running fast" cats', 'english');
+      expect(result).toMatch(/\bdog\b/);
+      expect(result).toMatch(/\bcat\b/);
+      expect(result).toContain('"');
+    });
+  });
+
+  describe('prepareForFTS', () => {
+    it('should prepare text for FTS indexing by stemming', () => {
+      const result = prepareForFTS('Running dogs are playing', 'english');
+      expect(result).toMatch(/\brun\b/); // whole word: unstemmed 'running' must not pass
+      expect(result).toMatch(/\bdog\b/); // whole word: unstemmed 'dogs' must not pass
+    });
+
+    it('should use default English when no language specified', () => {
+      const result = prepareForFTS('Running dogs');
+      expect(result).toMatch(/\brun\b/);
+      expect(result).toMatch(/\bdog\b/);
+    });
+
+    it('should be identical to stemText', () => {
+      const text = 'Running dogs are playing';
+      expect(prepareForFTS(text, 'english')).toBe(stemText(text, 'english'));
+    });
+  });
+
+  describe('stemmer caching', () => {
+    it('should reuse cached stemmers for same language', () => {
+      // Both lookups use English, one directly after the other.
+      const stems = {
+        running: stemWord('running', 'english'),
+        playing: stemWord('playing', 'english'),
+      };
+      // Results should be consistent across the repeated calls.
+      expect(stems).toEqual({ running: 'run', playing: 'play' });
+    });
+
+    it('should support different languages in sequence', () => {
+      // Stemmers are requested in the order English, German, French.
+      const stems = [
+        stemWord('running', 'english'),
+        stemWord('häuser', 'german'),
+        stemWord('chanter', 'french'),
+      ];
+      expect(stems).toEqual(['run', 'haus', 'chant']);
+    });
+  });
+
+  describe('edge cases', () => {
+    it('should handle very long text', () => {
+      const longText = 'running '.repeat(1000);
+      const result = stemText(longText, 'english');
+      expect(result.length).toBeGreaterThan(0);
+      expect(result.split(' ').every(word => word === 'run' || word === '')).toBe(true); // '' tolerates empty split tokens
+    });
+
+    it('should handle special Unicode characters', () => {
+      const result = stemText('café résumé naïve', 'english');
+      expect(result.length).toBeGreaterThan(0);
+    });
+
+    it('should handle emoji by extracting adjacent words', () => {
+      const result = stemText('running 🏃 dogs', 'english');
+      expect(result).toMatch(/\brun\b/); // whole word: unstemmed 'running' must not pass
+      expect(result).toMatch(/\bdog\b/); // whole word: unstemmed 'dogs' must not pass
+    });
+
+    it('should handle mixed case consistently', () => {
+      const lower = stemText('running', 'english');
+      const upper = stemText('RUNNING', 'english');
+      const mixed = stemText('RuNnInG', 'english');
+      // All three casings must produce the same stemmed output.
+      expect(lower).toBe(upper);
+      expect(lower).toBe(mixed);
+    });
+  });
+});