fix: slugify now transliterates non-ASCII characters instead of dropping them

2026-03-01 07:12:38 +01:00
parent 583c37473a
commit 289535021a
7 changed files with 149 additions and 10 deletions
--- a/package-lock.json
+++ b/package-lock.json
@@ -50,6 +50,7 @@
        "simple-git": "^3.31.1",
        "smol-toml": "^1.6.0",
        "snowball-stemmers": "^0.6.0",
+        "transliteration": "^2.6.1",
        "turndown": "^7.2.2",
        "uuid": "^13.0.0",
        "vanilla-calendar-pro": "^3.1.0",
@@ -15228,6 +15229,19 @@
        "node": ">=20"
      }
    },
+    "node_modules/transliteration": {
+      "version": "2.6.1",
+      "resolved": "https://registry.npmjs.org/transliteration/-/transliteration-2.6.1.tgz",
+      "integrity": "sha512-hJ9BhrQAOnNTbpOr1MxsNjZISkn7ppvF5TKUeFmTE1mG4ZPD/XVxF0L0LUoIUCWmQyxH0gJpVtfYLAWf298U9w==",
+      "license": "MIT",
+      "bin": {
+        "slugify": "dist/bin/slugify",
+        "transliterate": "dist/bin/transliterate"
+      },
+      "engines": {
+        "node": ">=20.0.0"
+      }
+    },
    "node_modules/tree-dump": {
      "version": "1.1.0",
      "resolved": "https://registry.npmjs.org/tree-dump/-/tree-dump-1.1.0.tgz",
--- a/package.json
+++ b/package.json
@@ -111,6 +111,7 @@
    "simple-git": "^3.31.1",
    "smol-toml": "^1.6.0",
    "snowball-stemmers": "^0.6.0",
+    "transliteration": "^2.6.1",
    "turndown": "^7.2.2",
    "uuid": "^13.0.0",
    "vanilla-calendar-pro": "^3.1.0",
--- a/src/main/engine/PostEngine.ts
+++ b/src/main/engine/PostEngine.ts
@@ -13,6 +13,7 @@ import { stemText, stemQuery, SupportedLanguage } from './stemmer';
 import { readPostFile as readPostFileShared, type PostFileData } from './postFileUtils';
 import { CliNotifier, NoopNotifier } from './CliNotifier';
 import type { MediaEngine } from './MediaEngine';
+import { slugify } from './slugify';

 export interface PostData {
  id: string;
@@ -216,10 +217,7 @@ export class PostEngine extends EventEmitter {
  }

  private generateSlug(title: string): string {
-    return title
-      .toLowerCase()
-      .replace(/[^a-z0-9]+/g, '-')
-      .replace(/^-|-$/g, '');
+    return slugify(title);
  }

  /**
--- a/src/main/engine/ProjectEngine.ts
+++ b/src/main/engine/ProjectEngine.ts
@@ -6,6 +6,7 @@ import { eq } from 'drizzle-orm';
 import { app } from 'electron';
 import { getDatabase } from '../database';
 import { projects, posts, media, Project, NewProject } from '../database/schema';
+import { slugify } from './slugify';

 export interface ProjectData {
  id: string;
@@ -43,10 +44,7 @@ export class ProjectEngine extends EventEmitter {
  }

  private generateSlug(name: string): string {
-    return name
-      .toLowerCase()
-      .replace(/[^a-z0-9]+/g, '-')
-      .replace(/^-|-$/g, '');
+    return slugify(name);
  }

  /**
--- a/src/main/engine/slugify.ts
+++ b/src/main/engine/slugify.ts
@@ -0,0 +1,18 @@
+import { transliterate } from 'transliteration';
+
+/**
+ * Generate a URL-safe slug from a string.
+ *
+ * - Passes the input through `transliterate` (from the `transliteration`
+ *   package) so accented characters become ASCII, e.g. "Café" -> "cafe"
+ * - Lowercases the transliterated text
+ * - Replaces each run of characters outside a-z/0-9 with one hyphen (U+002D)
+ * - Strips leading/trailing hyphens
+ * - Returns '' when no ASCII letters or digits remain
+ */
+export function slugify(input: string): string {
+  return transliterate(input)
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, '-')
+    .replace(/^-+|-+$/g, '');
+}
--- a/tests/engine/PostEngine.test.ts
+++ b/tests/engine/PostEngine.test.ts
@@ -227,9 +227,9 @@ describe('PostEngine', () => {
      expect(post.slug).toBe('multiple-spaces-here');
    });

-    it('should handle unicode characters by removing them', async () => {
+    it('should handle unicode characters by transliterating them', async () => {
      const post = await postEngine.createPost({ title: 'Café Test' });
-      expect(post.slug).toBe('caf-test');
+      expect(post.slug).toBe('cafe-test');
    });
  });

--- a/tests/engine/slugify.test.ts
+++ b/tests/engine/slugify.test.ts
@@ -0,0 +1,110 @@
+import { describe, expect, it } from 'vitest';
+import { slugify } from '../../src/main/engine/slugify';
+
+describe('slugify', () => {
+  describe('basic transformations', () => {
+    it('lowercases the input', () => {
+      expect(slugify('Hello World')).toBe('hello-world');
+    });
+
+    it('replaces spaces with hyphens', () => {
+      expect(slugify('hello world')).toBe('hello-world');
+    });
+
+    it('collapses multiple spaces into a single hyphen', () => {
+      expect(slugify('Multiple   Spaces   Here')).toBe('multiple-spaces-here');
+    });
+
+    it('removes leading and trailing hyphens', () => {
+      expect(slugify('---Test---')).toBe('test');
+    });
+
+    it('handles numbers in input', () => {
+      expect(slugify('10 Tips for Testing')).toBe('10-tips-for-testing');
+    });
+
+    it('returns empty string for empty input', () => {
+      expect(slugify('')).toBe('');
+    });
+
+    it('returns empty string for whitespace-only input', () => {
+      expect(slugify('   ')).toBe('');
+    });
+  });
+
+  describe('umlaut and special character transliteration', () => {
+    it('transliterates German umlauts', () => {
+      expect(slugify('Über die Brücke')).toBe('uber-die-brucke');
+      expect(slugify('Ärger')).toBe('arger');
+      expect(slugify('schön')).toBe('schon');
+    });
+
+    it('transliterates ß to ss', () => {
+      expect(slugify('Straße')).toBe('strasse');
+      expect(slugify('Großmutter')).toBe('grossmutter');
+    });
+
+    it('transliterates French accented characters', () => {
+      expect(slugify('Café Test')).toBe('cafe-test');
+      expect(slugify('crème brûlée')).toBe('creme-brulee');
+      expect(slugify('naïve')).toBe('naive');
+    });
+
+    it('transliterates Nordic characters', () => {
+      expect(slugify('Ångström')).toBe('angstrom');
+      expect(slugify('Ærø')).toBe('aero');
+      expect(slugify('Ødegaard')).toBe('odegaard');
+    });
+
+    it('transliterates Spanish characters', () => {
+      expect(slugify('España')).toBe('espana');
+      expect(slugify('niño')).toBe('nino');
+    });
+
+    it('transliterates Polish characters', () => {
+      expect(slugify('Łódź')).toBe('lodz');
+    });
+
+    it('transliterates Czech characters', () => {
+      expect(slugify('Dvořák')).toBe('dvorak');
+      expect(slugify('Háček')).toBe('hacek');
+    });
+  });
+
+  describe('special characters removal', () => {
+    it('removes punctuation', () => {
+      expect(slugify('Hello, World! How are you?')).toBe('hello-world-how-are-you');
+    });
+
+    it('removes brackets and parentheses', () => {
+      expect(slugify('Hello (World) [Test]')).toBe('hello-world-test');
+    });
+
+    it('removes symbols', () => {
+      expect(slugify('Hello @World #Test $100')).toBe('hello-world-test-100');
+    });
+
+    it('drops emoji without leaving extra hyphens', () => {
+      expect(slugify('Hello 🌍 World')).toBe('hello-world'); // emoji must leave no text or doubled hyphen
+    });
+  });
+
+  describe('word separation', () => {
+    it('separates words with normal hyphens', () => {
+      const result = slugify('Hello World');
+      expect(result).toBe('hello-world');
+      // Verify it's a normal hyphen (U+002D), not en-dash or em-dash
+      expect(result.charCodeAt(5)).toBe(0x2d);
+    });
+
+    it('converts en-dashes and em-dashes to hyphens', () => {
+      expect(slugify('hello–world')).toBe('hello-world'); // en-dash
+      expect(slugify('hello—world')).toBe('hello-world'); // em-dash
+    });
+
+    it('collapses consecutive special chars into single hyphen', () => {
+      expect(slugify('hello!!!world')).toBe('hello-world');
+      expect(slugify('hello...world')).toBe('hello-world');
+    });
+  });
+});