feat: importer starting point

2026-02-13 13:07:44 +01:00
parent deb0f3ae3b
commit d88fb1d9fa
19 changed files with 2666 additions and 10 deletions
--- a/tests/engine/ImportAnalysisEngine.test.ts
+++ b/tests/engine/ImportAnalysisEngine.test.ts
@@ -0,0 +1,537 @@
+/**
+ * ImportAnalysisEngine Unit Tests
+ *
+ * Tests the REAL ImportAnalysisEngine class with mocked dependencies.
+ * Following TDD: mock database and filesystem, test real analysis logic.
+ */
+
+import { describe, it, expect, beforeEach, vi } from 'vitest';
+import { ImportAnalysisEngine } from '../../src/main/engine/ImportAnalysisEngine';
+import type { ImportAnalysisReport, AnalyzedPost, AnalyzedMedia } from '../../src/main/engine/ImportAnalysisEngine';
+import type { WxrData, WxrPost, WxrMedia, WxrSiteInfo } from '../../src/main/engine/WxrParser';
+import crypto from 'crypto';
+
+// Row stores that beforeEach empties. NOTE(review): nothing visible in this file ever reads them — likely removable.
+const mockPostRows: any[] = [];
+const mockMediaRows: any[] = [];
+const mockTagRows: any[] = [];
+
+// Default query chain: from()/where() hand back the chain itself, all()
+// resolves to an empty array and get() resolves to undefined.
+function createSelectChain() {
+  const chain = {
+    from: vi.fn().mockReturnThis(),
+    where: vi.fn().mockReturnThis(),
+    all: vi.fn().mockImplementation(async () => []),
+    get: vi.fn().mockImplementation(async () => undefined),
+  };
+  return chain;
+}
+
+// Stand-in for the local database handle.
+// By default every select() call yields a fresh chain from createSelectChain();
+// setupDbReturns() later swaps that implementation for canned, order-based
+// results.
+const mockLocalDb = {
+  select: vi.fn(() => createSelectChain()),
+};
+
+// Replace the database module so that getDatabase().getLocal() returns
+// mockLocalDb; a new getLocal mock is created on every getDatabase() call.
+vi.mock('../../src/main/database', () => {
+  const getDatabase = vi.fn(() => ({ getLocal: vi.fn(() => mockLocalDb) }));
+  return { getDatabase };
+});
+
+// In-memory fs/promises replacement backed by mockFileBuffers (forward-slash keys).
+const mockFileBuffers = new Map<string, Buffer>();
+vi.mock('fs/promises', () => {
+  // Map keys use forward slashes, so backslashes in the requested path are converted.
+  const toKey = (path: string): string => path.replace(/\\/g, '/');
+  // Builds the error thrown for paths without a registered buffer.
+  const notFound = (message: string): Error => {
+    const error = new Error(message);
+    (error as any).code = 'ENOENT';
+    return error;
+  };
+
+  return {
+    readFile: vi.fn(async (path: string) => {
+      const buffer = mockFileBuffers.get(toKey(path));
+      if (buffer) return buffer;
+      throw notFound(`ENOENT: no such file or directory, open '${path}'`);
+    }),
+    stat: vi.fn(async (path: string) => {
+      const buffer = mockFileBuffers.get(toKey(path));
+      if (buffer) return { size: buffer.length };
+      throw notFound(`ENOENT: no such file or directory, stat '${path}'`);
+    }),
+    // access() resolves with no value when the path is registered.
+    access: vi.fn(async (path: string) => {
+      if (mockFileBuffers.has(toKey(path))) return;
+      throw notFound(`ENOENT`);
+    }),
+  };
+});
+
+// Builds a WxrPost fixture; any field supplied in `overrides` replaces its default.
+function createWxrPost(overrides: Partial<WxrPost> = {}): WxrPost {
+  const defaults: WxrPost = {
+    wpId: 1,
+    title: 'Test Post',
+    slug: 'test-post',
+    content: '<p>Test content</p>',
+    excerpt: '',
+    pubDate: new Date('2024-01-15'),
+    creator: 'admin',
+    status: 'publish',
+    postType: 'post',
+    categories: [],
+    tags: [],
+  };
+  return { ...defaults, ...overrides };
+}
+
+// Builds a WxrMedia fixture; any field supplied in `overrides` replaces its default.
+function createWxrMedia(overrides: Partial<WxrMedia> = {}): WxrMedia {
+  const defaults: WxrMedia = {
+    wpId: 100,
+    title: 'test-image',
+    url: 'https://example.com/wp-content/uploads/2024/01/test.jpg',
+    filename: 'test.jpg',
+    relativePath: '2024/01/test.jpg',
+    pubDate: null,
+    parentId: 0,
+    mimeType: 'image/jpeg',
+    description: '',
+  };
+  return { ...defaults, ...overrides };
+}
+
+// Builds an empty WxrData export (site info only); `overrides` replaces whole top-level fields.
+function createWxrData(overrides: Partial<WxrData> = {}): WxrData {
+  const defaults: WxrData = {
+    site: {
+      title: 'Test Blog',
+      link: 'https://example.com',
+      description: 'A test blog',
+      language: 'en',
+    },
+    posts: [],
+    pages: [],
+    media: [],
+    categories: [],
+    tags: [],
+  };
+  return { ...defaults, ...overrides };
+}
+
+// MD5 hex digest (same algo as PostEngine). Also accepts a Buffer so raw file bytes can be hashed without a text round-trip.
+function md5(content: string | Buffer): string {
+  return crypto.createHash('md5').update(content).digest('hex');
+}
+
+describe('ImportAnalysisEngine', () => {
+  let engine: ImportAnalysisEngine;
+
+  // Every test starts clean: mock call history cleared, row stores and fake
+  // files emptied, and a new engine bound to the 'test-project' context.
+  beforeEach(() => {
+    vi.clearAllMocks();
+    for (const rows of [mockPostRows, mockMediaRows, mockTagRows]) rows.length = 0;
+    mockFileBuffers.clear();
+    engine = new ImportAnalysisEngine();
+    engine.setProjectContext('test-project');
+  });
+
+  describe('analyzeWxr - posts', () => {
+    it('should classify a post as new when slug and hash do not exist in DB', async () => {
+      // Mocked DB returns no existing posts, media or tags
+      setupDbReturns([], [], []);
+
+      const wxrData = createWxrData({
+        posts: [createWxrPost({ slug: 'new-post', content: '<p>New content</p>' })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/path/to/export.xml');
+
+      expect(report.posts.total).toBe(1);
+      expect(report.posts.new).toBe(1);
+      expect(report.posts.items[0].status).toBe('new');
+    });
+
+    it('should classify a post as update when slug AND hash match', async () => {
+      // The stored checksum is the MD5 of the markdown the engine is expected to derive
+      // from the imported HTML. NOTE(review): this assumes <p>Existing content</p> converts
+      // to exactly "Existing content" (no trailing newline) — confirm against the engine.
+      const markdownContent = 'Existing content';
+      const hash = md5(markdownContent);
+
+      setupDbReturns([
+        { id: 'existing-1', slug: 'existing-post', title: 'Existing Post', checksum: hash },
+      ], [], []);
+
+      const wxrData = createWxrData({
+        posts: [createWxrPost({ slug: 'existing-post', content: '<p>Existing content</p>' })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/path/to/export.xml');
+
+      expect(report.posts.total).toBe(1);
+      expect(report.posts.updates).toBe(1);
+      expect(report.posts.items[0].status).toBe('update');
+      expect(report.posts.items[0].existingPost?.id).toBe('existing-1');
+    });
+
+    it('should classify a post as conflict when slug matches but hash differs', async () => {
+      setupDbReturns([
+        { id: 'existing-1', slug: 'my-post', title: 'My Post', checksum: 'different-hash' }, // not a hex digest, so it can never match
+      ], [], []);
+
+      const wxrData = createWxrData({
+        posts: [createWxrPost({ slug: 'my-post', content: '<p>Changed content</p>' })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/path/to/export.xml');
+
+      expect(report.posts.total).toBe(1);
+      expect(report.posts.conflicts).toBe(1);
+      expect(report.posts.items[0].status).toBe('conflict');
+      expect(report.posts.items[0].existingPost?.id).toBe('existing-1');
+    });
+
+    it('should classify a post as content-duplicate when hash matches but slug differs', async () => {
+      const markdownContent = 'Same content here'; // NOTE(review): assumed markdown form of the HTML below — confirm
+      const hash = md5(markdownContent);
+
+      setupDbReturns([
+        { id: 'other-post', slug: 'different-slug', title: 'Different Title', checksum: hash },
+      ], [], []);
+
+      const wxrData = createWxrData({
+        posts: [createWxrPost({ slug: 'my-original-slug', content: '<p>Same content here</p>' })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/path/to/export.xml');
+
+      expect(report.posts.total).toBe(1);
+      expect(report.posts.contentDuplicates).toBe(1);
+      expect(report.posts.items[0].status).toBe('content-duplicate');
+      expect(report.posts.items[0].existingPost?.id).toBe('other-post');
+    });
+
+    it('should analyze multiple posts with mixed statuses', async () => {
+      const existingContent = 'Unchanged content';
+      const existingHash = md5(existingContent);
+
+      setupDbReturns([
+        { id: 'post-1', slug: 'unchanged', title: 'Unchanged', checksum: existingHash }, // expected: update
+        { id: 'post-2', slug: 'modified', title: 'Modified', checksum: 'old-hash' }, // expected: conflict
+      ], [], []);
+
+      const wxrData = createWxrData({
+        posts: [
+          createWxrPost({ slug: 'unchanged', content: '<p>Unchanged content</p>' }),
+          createWxrPost({ slug: 'modified', content: '<p>New modified content</p>' }),
+          createWxrPost({ slug: 'brand-new', content: '<p>Brand new post</p>' }), // no DB row with this slug: expected new
+        ],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml');
+
+      expect(report.posts.total).toBe(3);
+      expect(report.posts.updates).toBe(1);
+      expect(report.posts.conflicts).toBe(1);
+      expect(report.posts.new).toBe(1);
+    });
+
+    it('should include markdown preview in analyzed posts', async () => {
+      setupDbReturns([], [], []);
+
+      const wxrData = createWxrData({
+        posts: [createWxrPost({ content: '<p>This is a preview of the <strong>content</strong>.</p>' })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml');
+
+      const item = report.posts.items[0];
+      expect(item.markdownPreview).toBeTruthy();
+      expect(item.markdownPreview.length).toBeGreaterThan(0);
+      expect(item.markdownPreview.length).toBeLessThanOrEqual(200); // preview must not exceed 200 characters
+    });
+
+    it('should compute content hash from markdown conversion of HTML', async () => {
+      setupDbReturns([], [], []);
+
+      const wxrData = createWxrData({
+        posts: [createWxrPost({ content: '<p>Hello world</p>' })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml');
+
+      const item = report.posts.items[0];
+      expect(item.contentHash).toBeTruthy();
+      // Only the shape is pinned here: 32 lowercase hex characters (an MD5-style digest)
+      expect(item.contentHash).toMatch(/^[a-f0-9]{32}$/);
+    });
+  });
+
+  describe('analyzeWxr - pages', () => {
+    it('should analyze pages separately from posts', async () => {
+      setupDbReturns([], [], []); // nothing pre-existing in the mocked DB
+
+      const wxrData = createWxrData({
+        posts: [createWxrPost({ slug: 'post-1' })],
+        pages: [createWxrPost({ slug: 'about', postType: 'page' })], // pages reuse the WxrPost fixture with postType 'page'
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml');
+
+      expect(report.posts.total).toBe(1); // the page is not counted as a post
+      expect(report.pages.total).toBe(1);
+      expect(report.pages.items[0].wxrPost.slug).toBe('about');
+    });
+  });
+
+  describe('analyzeWxr - media', () => {
+    it('should classify media as new when filename not in DB and file exists in uploads', async () => {
+      setupDbReturns([], [], []);
+      const fileBuffer = Buffer.from('fake image data');
+      mockFileBuffers.set('/uploads/2024/01/photo.jpg', fileBuffer); // key = uploads folder + '/' + relativePath used below
+
+      const wxrData = createWxrData({
+        media: [createWxrMedia({
+          filename: 'photo.jpg',
+          relativePath: '2024/01/photo.jpg',
+        })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml', '/uploads');
+
+      expect(report.media.total).toBe(1);
+      expect(report.media.new).toBe(1);
+      expect(report.media.items[0].status).toBe('new');
+      expect(report.media.items[0].fileHash).toBeTruthy();
+    });
+
+    it('should classify media as update when filename matches AND hash matches', async () => {
+      const fileBuffer = Buffer.from('same file data');
+      const fileHash = md5(fileBuffer.toString('binary')); // NOTE(review): equals the digest of the raw bytes only for ASCII-only data
+      mockFileBuffers.set('/uploads/2024/01/logo.png', fileBuffer);
+
+      setupDbReturns([], [
+        { id: 'media-1', originalName: 'logo.png', checksum: fileHash },
+      ], []);
+
+      const wxrData = createWxrData({
+        media: [createWxrMedia({
+          filename: 'logo.png',
+          relativePath: '2024/01/logo.png',
+        })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml', '/uploads');
+
+      expect(report.media.total).toBe(1);
+      expect(report.media.updates).toBe(1);
+      expect(report.media.items[0].status).toBe('update');
+      expect(report.media.items[0].existingMedia?.id).toBe('media-1');
+    });
+
+    it('should classify media as conflict when filename matches but hash differs', async () => {
+      const fileBuffer = Buffer.from('new file data');
+      mockFileBuffers.set('/uploads/2024/01/logo.png', fileBuffer);
+
+      setupDbReturns([], [
+        { id: 'media-1', originalName: 'logo.png', checksum: 'old-hash-value' }, // not a hex digest, so it can never match
+      ], []);
+
+      const wxrData = createWxrData({
+        media: [createWxrMedia({
+          filename: 'logo.png',
+          relativePath: '2024/01/logo.png',
+        })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml', '/uploads');
+
+      expect(report.media.total).toBe(1);
+      expect(report.media.conflicts).toBe(1);
+      expect(report.media.items[0].status).toBe('conflict');
+    });
+
+    it('should classify media as content-duplicate when hash matches but filename differs', async () => {
+      const fileBuffer = Buffer.from('duplicate content');
+      const fileHash = md5(fileBuffer.toString('binary')); // NOTE(review): equals the digest of the raw bytes only for ASCII-only data
+      mockFileBuffers.set('/uploads/2024/01/new-name.jpg', fileBuffer);
+
+      setupDbReturns([], [
+        { id: 'media-1', originalName: 'old-name.jpg', checksum: fileHash },
+      ], []);
+
+      const wxrData = createWxrData({
+        media: [createWxrMedia({
+          filename: 'new-name.jpg',
+          relativePath: '2024/01/new-name.jpg',
+        })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml', '/uploads');
+
+      expect(report.media.total).toBe(1);
+      expect(report.media.contentDuplicates).toBe(1);
+      expect(report.media.items[0].status).toBe('content-duplicate');
+    });
+
+    it('should mark media as missing when file not found in uploads folder', async () => {
+      setupDbReturns([], [], []);
+      // Nothing registered in mockFileBuffers, so the fs/promises mock rejects with ENOENT
+
+      const wxrData = createWxrData({
+        media: [createWxrMedia({
+          filename: 'missing.jpg',
+          relativePath: '2024/01/missing.jpg',
+        })],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml', '/uploads');
+
+      expect(report.media.total).toBe(1);
+      expect(report.media.missing).toBe(1);
+      expect(report.media.items[0].status).toBe('missing');
+      expect(report.media.items[0].fileHash).toBeNull();
+    });
+
+    it('should handle media analysis without uploads folder (all missing)', async () => {
+      setupDbReturns([], [], []);
+
+      const wxrData = createWxrData({
+        media: [createWxrMedia({ filename: 'test.jpg' })],
+      });
+
+      // Third argument (the uploads folder) is omitted
+      const report = await engine.analyzeWxr(wxrData, '/test.xml');
+
+      expect(report.media.total).toBe(1);
+      expect(report.media.missing).toBe(1);
+      expect(report.media.items[0].status).toBe('missing');
+    });
+  });
+
+  describe('analyzeWxr - categories and tags', () => {
+    it('should check existing categories against project tags', async () => {
+      setupDbReturns([], [], [ // third argument: tag rows already present in the project
+        { name: 'Technology' },
+      ]);
+
+      const wxrData = createWxrData({
+        categories: [
+          { name: 'Technology', slug: 'technology', parent: '' },
+          { name: 'Science', slug: 'science', parent: '' },
+        ],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml');
+
+      expect(report.categories).toHaveLength(2);
+      expect(report.categories[0].existsInProject).toBe(true); // 'Technology' is among the existing tag rows
+      expect(report.categories[1].existsInProject).toBe(false); // 'Science' is not
+    });
+
+    it('should check existing tags against project tags', async () => {
+      setupDbReturns([], [], [ // third argument: tag rows already present in the project
+        { name: 'javascript' },
+      ]);
+
+      const wxrData = createWxrData({
+        tags: [
+          { name: 'javascript', slug: 'javascript' },
+          { name: 'python', slug: 'python' },
+        ],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml');
+
+      expect(report.tags).toHaveLength(2);
+      expect(report.tags[0].existsInProject).toBe(true); // 'javascript' is among the existing tag rows
+      expect(report.tags[1].existsInProject).toBe(false); // 'python' is not
+    });
+  });
+
+  describe('analyzeWxr - report metadata', () => {
+    it('should include source file and site info in report', async () => {
+      setupDbReturns([], [], []);
+
+      const wxrData = createWxrData({
+        site: {
+          title: 'My Blog',
+          link: 'https://myblog.com',
+          description: 'A great blog', // description and language are supplied but not asserted below
+          language: 'de-DE',
+        },
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/exports/myblog.xml');
+
+      expect(report.sourceFile).toBe('/exports/myblog.xml'); // the path passed as second argument is echoed back
+      expect(report.site.title).toBe('My Blog');
+      expect(report.site.link).toBe('https://myblog.com');
+      expect(report.analyzedAt).toBeInstanceOf(Date);
+    });
+
+    it('should correctly count all post statuses', async () => {
+      const contentA = 'Content A'; // NOTE(review): assumed markdown form of '<p>Content A</p>' — confirm
+      const hashA = md5(contentA);
+
+      setupDbReturns([
+        { id: 'p1', slug: 'update-me', title: 'Update Me', checksum: hashA }, // same slug + same hash: expected update
+        { id: 'p2', slug: 'conflict-me', title: 'Conflict Me', checksum: 'old-hash' }, // same slug, other hash: expected conflict
+      ], [], []);
+
+      const wxrData = createWxrData({
+        posts: [
+          createWxrPost({ slug: 'update-me', content: '<p>Content A</p>' }),
+          createWxrPost({ slug: 'conflict-me', content: '<p>Different content</p>' }),
+          createWxrPost({ slug: 'new-one', content: '<p>Brand new</p>' }),
+          createWxrPost({ slug: 'another-new', content: '<p>Also new</p>' }),
+        ],
+      });
+
+      const report = await engine.analyzeWxr(wxrData, '/test.xml');
+
+      expect(report.posts.total).toBe(4);
+      expect(report.posts.updates).toBe(1);
+      expect(report.posts.conflicts).toBe(1);
+      expect(report.posts.new).toBe(2);
+      expect(report.posts.contentDuplicates).toBe(0); // no DB row is set up with a matching hash under a different slug
+    });
+  });
+});
+
+/**
+ * Points mockLocalDb.select at canned result sets chosen by call order.
+ *
+ * The n-th select() issued after this call resolves, through
+ * select().from().where().all(), to: 1st -> existingPosts, 2nd -> existingMedia,
+ * 3rd -> existingTags, any later call -> an empty array.
+ * NOTE(review): this couples the tests to the order in which queries are issued.
+ */
+let dbQueryCount = 0;
+function setupDbReturns(
+  existingPosts: Array<{ id: string; slug: string; title: string; checksum: string }>,
+  existingMedia: Array<{ id: string; originalName: string; checksum: string }>,
+  existingTags: Array<{ name: string }>,
+) {
+  const resultSets: unknown[][] = [existingPosts, existingMedia, existingTags];
+  dbQueryCount = 0;
+  mockLocalDb.select.mockImplementation(() => {
+    // Claim this call's position now; the rows are looked up when all() runs.
+    const position = dbQueryCount++;
+    const all = vi.fn().mockImplementation(() =>
+      Promise.resolve(resultSets[position] ?? []),
+    );
+    const where = vi.fn().mockReturnValue({ all });
+    const from = vi.fn().mockReturnValue({ where });
+    return { from };
+  });
+}
--- a/tests/engine/WxrParser.test.ts
+++ b/tests/engine/WxrParser.test.ts
@@ -0,0 +1,478 @@
+/**
+ * WxrParser Unit Tests
+ *
+ * Tests the REAL WxrParser class with mocked filesystem.
+ * Following TDD best practices: mock external dependencies, test real implementation.
+ */
+
+import { describe, it, expect, beforeEach, vi } from 'vitest';
+import { WxrParser } from '../../src/main/engine/WxrParser';
+import type { WxrData } from '../../src/main/engine/WxrParser';
+
+// Replace fs/promises with a stub whose readFile is a bare vi.fn() (no implementation).
+vi.mock('fs/promises', () => {
+  return { readFile: vi.fn() };
+});
+
+// Minimal WXR document: channel metadata only (title, link, description, language) — no items, categories or tags
+const MINIMAL_WXR = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+  xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:wp="http://wordpress.org/export/1.2/">
+  <channel>
+    <title>My Test Blog</title>
+    <link>https://example.com</link>
+    <description>A test blog</description>
+    <language>en-US</language>
+  </channel>
+</rss>`;
+
+// WXR with two channel-level categories (the second is a child of the first) and two channel-level tags; no items
+const WXR_WITH_TAXONOMIES = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+  xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:wp="http://wordpress.org/export/1.2/">
+  <channel>
+    <title>My Blog</title>
+    <link>https://example.com</link>
+    <description>Test</description>
+    <language>en</language>
+    <wp:category>
+      <wp:term_id>1</wp:term_id>
+      <wp:category_nicename>technology</wp:category_nicename>
+      <wp:category_parent></wp:category_parent>
+      <wp:cat_name><![CDATA[Technology]]></wp:cat_name>
+    </wp:category>
+    <wp:category>
+      <wp:term_id>2</wp:term_id>
+      <wp:category_nicename>web-dev</wp:category_nicename>
+      <wp:category_parent>technology</wp:category_parent>
+      <wp:cat_name><![CDATA[Web Development]]></wp:cat_name>
+    </wp:category>
+    <wp:tag>
+      <wp:term_id>10</wp:term_id>
+      <wp:tag_slug>javascript</wp:tag_slug>
+      <wp:tag_name><![CDATA[JavaScript]]></wp:tag_name>
+    </wp:tag>
+    <wp:tag>
+      <wp:term_id>11</wp:term_id>
+      <wp:tag_slug>typescript</wp:tag_slug>
+      <wp:tag_name><![CDATA[TypeScript]]></wp:tag_name>
+    </wp:tag>
+  </channel>
+</rss>`;
+
+// WXR with a single published post (wp:post_id 42) carrying one category and two post_tag entries
+const WXR_WITH_POST = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+  xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:wp="http://wordpress.org/export/1.2/">
+  <channel>
+    <title>My Blog</title>
+    <link>https://example.com</link>
+    <description>Test</description>
+    <language>en</language>
+    <item>
+      <title>Hello World</title>
+      <link>https://example.com/hello-world/</link>
+      <pubDate>Mon, 15 Jan 2024 10:30:00 +0000</pubDate>
+      <dc:creator><![CDATA[admin]]></dc:creator>
+      <category domain="category" nicename="uncategorized"><![CDATA[Uncategorized]]></category>
+      <category domain="post_tag" nicename="intro"><![CDATA[Intro]]></category>
+      <category domain="post_tag" nicename="welcome"><![CDATA[Welcome]]></category>
+      <content:encoded><![CDATA[<p>Welcome to my blog. This is my <strong>first</strong> post.</p>]]></content:encoded>
+      <excerpt:encoded><![CDATA[Welcome to my blog.]]></excerpt:encoded>
+      <wp:post_id>42</wp:post_id>
+      <wp:post_date>2024-01-15 10:30:00</wp:post_date>
+      <wp:post_name>hello-world</wp:post_name>
+      <wp:status>publish</wp:status>
+      <wp:post_type>post</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+    </item>
+  </channel>
+</rss>`;
+
+// WXR with a single published page (wp:post_id 10); the item has no pubDate or category elements
+const WXR_WITH_PAGE = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+  xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:wp="http://wordpress.org/export/1.2/">
+  <channel>
+    <title>My Blog</title>
+    <link>https://example.com</link>
+    <description>Test</description>
+    <language>en</language>
+    <item>
+      <title>About Me</title>
+      <content:encoded><![CDATA[<h2>About</h2><p>This is the about page.</p>]]></content:encoded>
+      <excerpt:encoded><![CDATA[]]></excerpt:encoded>
+      <wp:post_id>10</wp:post_id>
+      <wp:post_name>about</wp:post_name>
+      <wp:status>publish</wp:status>
+      <wp:post_type>page</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+      <dc:creator><![CDATA[admin]]></dc:creator>
+    </item>
+  </channel>
+</rss>`;
+
+// WXR with one attachment item (wp:post_id 100, parent 42) that has both wp:attachment_url and a _wp_attached_file postmeta entry
+const WXR_WITH_MEDIA = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+  xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:wp="http://wordpress.org/export/1.2/">
+  <channel>
+    <title>My Blog</title>
+    <link>https://example.com</link>
+    <description>Test</description>
+    <language>en</language>
+    <item>
+      <title>sunset-photo</title>
+      <content:encoded><![CDATA[A beautiful sunset]]></content:encoded>
+      <excerpt:encoded><![CDATA[]]></excerpt:encoded>
+      <wp:post_id>100</wp:post_id>
+      <wp:post_name>sunset-photo</wp:post_name>
+      <wp:status>inherit</wp:status>
+      <wp:post_type>attachment</wp:post_type>
+      <wp:post_parent>42</wp:post_parent>
+      <wp:attachment_url>https://example.com/wp-content/uploads/2024/01/sunset.jpg</wp:attachment_url>
+      <wp:postmeta>
+        <wp:meta_key>_wp_attached_file</wp:meta_key>
+        <wp:meta_value>2024/01/sunset.jpg</wp:meta_value>
+      </wp:postmeta>
+      <dc:creator><![CDATA[admin]]></dc:creator>
+    </item>
+  </channel>
+</rss>`;
+
+// WXR with mixed content: one category, one tag, two posts (one published, one draft), one page and one attachment (no _wp_attached_file postmeta)
+const WXR_MIXED = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+  xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:wp="http://wordpress.org/export/1.2/">
+  <channel>
+    <title>Full Blog</title>
+    <link>https://fullblog.com</link>
+    <description>A full blog export</description>
+    <language>de-DE</language>
+    <wp:category>
+      <wp:category_nicename>news</wp:category_nicename>
+      <wp:category_parent></wp:category_parent>
+      <wp:cat_name><![CDATA[News]]></wp:cat_name>
+    </wp:category>
+    <wp:tag>
+      <wp:tag_slug>featured</wp:tag_slug>
+      <wp:tag_name><![CDATA[Featured]]></wp:tag_name>
+    </wp:tag>
+    <item>
+      <title>First Post</title>
+      <pubDate>Tue, 02 Jan 2024 08:00:00 +0000</pubDate>
+      <dc:creator><![CDATA[editor]]></dc:creator>
+      <category domain="category" nicename="news"><![CDATA[News]]></category>
+      <category domain="post_tag" nicename="featured"><![CDATA[Featured]]></category>
+      <content:encoded><![CDATA[<p>First post content.</p>]]></content:encoded>
+      <excerpt:encoded><![CDATA[First post]]></excerpt:encoded>
+      <wp:post_id>1</wp:post_id>
+      <wp:post_name>first-post</wp:post_name>
+      <wp:status>publish</wp:status>
+      <wp:post_type>post</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+    </item>
+    <item>
+      <title>Second Post</title>
+      <pubDate>Wed, 03 Jan 2024 09:00:00 +0000</pubDate>
+      <dc:creator><![CDATA[admin]]></dc:creator>
+      <content:encoded><![CDATA[<p>Second post content.</p>]]></content:encoded>
+      <excerpt:encoded><![CDATA[]]></excerpt:encoded>
+      <wp:post_id>2</wp:post_id>
+      <wp:post_name>second-post</wp:post_name>
+      <wp:status>draft</wp:status>
+      <wp:post_type>post</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+    </item>
+    <item>
+      <title>Contact</title>
+      <dc:creator><![CDATA[admin]]></dc:creator>
+      <content:encoded><![CDATA[<p>Contact us here.</p>]]></content:encoded>
+      <excerpt:encoded><![CDATA[]]></excerpt:encoded>
+      <wp:post_id>3</wp:post_id>
+      <wp:post_name>contact</wp:post_name>
+      <wp:status>publish</wp:status>
+      <wp:post_type>page</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+    </item>
+    <item>
+      <title>logo</title>
+      <dc:creator><![CDATA[admin]]></dc:creator>
+      <content:encoded><![CDATA[]]></content:encoded>
+      <excerpt:encoded><![CDATA[]]></excerpt:encoded>
+      <wp:post_id>4</wp:post_id>
+      <wp:post_name>logo</wp:post_name>
+      <wp:status>inherit</wp:status>
+      <wp:post_type>attachment</wp:post_type>
+      <wp:post_parent>3</wp:post_parent>
+      <wp:attachment_url>https://fullblog.com/wp-content/uploads/2024/02/logo.png</wp:attachment_url>
+    </item>
+  </channel>
+</rss>`;
+
+// WXR with three posts, one per status: publish, draft and trash
+const WXR_WITH_STATUSES = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+  xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:wp="http://wordpress.org/export/1.2/">
+  <channel>
+    <title>Blog</title>
+    <link>https://example.com</link>
+    <description></description>
+    <language>en</language>
+    <item>
+      <title>Published Post</title>
+      <content:encoded><![CDATA[<p>Published</p>]]></content:encoded>
+      <excerpt:encoded><![CDATA[]]></excerpt:encoded>
+      <wp:post_id>1</wp:post_id>
+      <wp:post_name>published-post</wp:post_name>
+      <wp:status>publish</wp:status>
+      <wp:post_type>post</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+      <dc:creator><![CDATA[admin]]></dc:creator>
+    </item>
+    <item>
+      <title>Draft Post</title>
+      <content:encoded><![CDATA[<p>Draft</p>]]></content:encoded>
+      <excerpt:encoded><![CDATA[]]></excerpt:encoded>
+      <wp:post_id>2</wp:post_id>
+      <wp:post_name>draft-post</wp:post_name>
+      <wp:status>draft</wp:status>
+      <wp:post_type>post</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+      <dc:creator><![CDATA[admin]]></dc:creator>
+    </item>
+    <item>
+      <title>Trashed Post</title>
+      <content:encoded><![CDATA[<p>Trash</p>]]></content:encoded>
+      <excerpt:encoded><![CDATA[]]></excerpt:encoded>
+      <wp:post_id>3</wp:post_id>
+      <wp:post_name>__trashed</wp:post_name>
+      <wp:status>trash</wp:status>
+      <wp:post_type>post</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+      <dc:creator><![CDATA[admin]]></dc:creator>
+    </item>
+  </channel>
+</rss>`;
+
+describe('WxrParser', () => {
+  let parser: WxrParser;
+
+  beforeEach(() => {
+    parser = new WxrParser();
+  });
+
+  describe('parseXml', () => {
+    it('should parse minimal WXR and extract site info', () => {
+      const result = parser.parseXml(MINIMAL_WXR);
+
+      expect(result.site.title).toBe('My Test Blog');
+      expect(result.site.link).toBe('https://example.com');
+      expect(result.site.description).toBe('A test blog');
+      expect(result.site.language).toBe('en-US');
+    });
+
+    it('should return empty arrays when no items exist', () => {
+      const result = parser.parseXml(MINIMAL_WXR);
+
+      expect(result.posts).toEqual([]);
+      expect(result.pages).toEqual([]);
+      expect(result.media).toEqual([]);
+      expect(result.categories).toEqual([]);
+      expect(result.tags).toEqual([]);
+    });
+
+    it('should extract channel-level categories with parent relationships', () => {
+      const result = parser.parseXml(WXR_WITH_TAXONOMIES);
+
+      expect(result.categories).toHaveLength(2);
+      expect(result.categories[0]).toEqual({
+        name: 'Technology',
+        slug: 'technology',
+        parent: '',
+      });
+      expect(result.categories[1]).toEqual({
+        name: 'Web Development',
+        slug: 'web-dev',
+        parent: 'technology',
+      });
+    });
+
+    it('should extract channel-level tags', () => {
+      const result = parser.parseXml(WXR_WITH_TAXONOMIES);
+
+      expect(result.tags).toHaveLength(2);
+      expect(result.tags[0]).toEqual({
+        name: 'JavaScript',
+        slug: 'javascript',
+      });
+      expect(result.tags[1]).toEqual({
+        name: 'TypeScript',
+        slug: 'typescript',
+      });
+    });
+
+    // The single post in WXR_WITH_POST must expose each parsed field with the expected value.
+    it('should parse a published post with all fields', () => {
+      const { posts } = parser.parseXml(WXR_WITH_POST);
+      expect(posts).toHaveLength(1);
+
+      const [helloWorld] = posts;
+      expect(helloWorld.wpId).toBe(42);
+      expect(helloWorld.title).toBe('Hello World');
+      expect(helloWorld.slug).toBe('hello-world');
+      expect(helloWorld.content).toBe('<p>Welcome to my blog. This is my <strong>first</strong> post.</p>');
+      expect(helloWorld.excerpt).toBe('Welcome to my blog.');
+      expect(helloWorld.creator).toBe('admin');
+      expect(helloWorld.status).toBe('publish');
+      expect(helloWorld.postType).toBe('post');
+      expect(helloWorld.categories).toEqual(['Uncategorized']);
+      expect(helloWorld.tags).toEqual(['Intro', 'Welcome']);
+      expect(helloWorld.pubDate).toBeInstanceOf(Date);
+    });
+
+    // An item of type 'page' must land in `pages` and leave `posts` empty.
+    it('should parse a page and put it in pages array', () => {
+      const { posts, pages } = parser.parseXml(WXR_WITH_PAGE);
+
+      expect(posts).toHaveLength(0);
+      expect(pages).toHaveLength(1);
+
+      const [aboutPage] = pages;
+      expect(aboutPage.wpId).toBe(10);
+      expect(aboutPage.title).toBe('About Me');
+      expect(aboutPage.slug).toBe('about');
+      expect(aboutPage.content).toContain('<h2>About</h2>');
+      expect(aboutPage.postType).toBe('page');
+    });
+
+    // The attachment in WXR_WITH_MEDIA must land in `media` (not `posts`) with its
+    // URL-derived fields, parent id and description populated.
+    it('should parse a media attachment with URL and filename', () => {
+      const { posts, media } = parser.parseXml(WXR_WITH_MEDIA);
+
+      expect(posts).toHaveLength(0);
+      expect(media).toHaveLength(1);
+
+      const [sunset] = media;
+      expect(sunset.wpId).toBe(100);
+      expect(sunset.title).toBe('sunset-photo');
+      expect(sunset.url).toBe('https://example.com/wp-content/uploads/2024/01/sunset.jpg');
+      expect(sunset.filename).toBe('sunset.jpg');
+      expect(sunset.relativePath).toBe('2024/01/sunset.jpg');
+      expect(sunset.parentId).toBe(42);
+      expect(sunset.description).toBe('A beautiful sunset');
+    });
+
+    // WXR_MIXED combines several item types; each must be routed to its own collection.
+    it('should separate posts, pages, and media from mixed content', () => {
+      const { posts, pages, media, categories, tags } = parser.parseXml(WXR_MIXED);
+
+      expect(posts).toHaveLength(2);
+      expect(pages).toHaveLength(1);
+      expect(media).toHaveLength(1);
+      expect(categories).toHaveLength(1);
+      expect(tags).toHaveLength(1);
+
+      const [firstPost, secondPost] = posts;
+      expect(firstPost.title).toBe('First Post');
+      expect(secondPost.title).toBe('Second Post');
+      expect(pages[0].title).toBe('Contact');
+      expect(media[0].title).toBe('logo');
+    });
+
+    // Per-post taxonomy terms: the first post carries one category and one tag,
+    // the second post carries none.
+    it('should extract post categories and tags from item-level category elements', () => {
+      const { posts } = parser.parseXml(WXR_MIXED);
+
+      const withTerms = posts[0];
+      expect(withTerms.categories).toEqual(['News']);
+      expect(withTerms.tags).toEqual(['Featured']);
+
+      // Second post has no categories or tags
+      const withoutTerms = posts[1];
+      expect(withoutTerms.categories).toEqual([]);
+      expect(withoutTerms.tags).toEqual([]);
+    });
+
+    // Each post in WXR_WITH_STATUSES must keep its own status value, in document order.
+    it('should handle different post statuses', () => {
+      const { posts } = parser.parseXml(WXR_WITH_STATUSES);
+      expect(posts).toHaveLength(3);
+
+      const [published, draft, trashed] = posts;
+      expect(published.status).toBe('publish');
+      expect(draft.status).toBe('draft');
+      expect(trashed.status).toBe('trash');
+    });
+
+    // The relative path of the WXR_WITH_MEDIA attachment is checked in isolation.
+    it('should extract relative path from media URL based on wp-content/uploads', () => {
+      const { media } = parser.parseXml(WXR_WITH_MEDIA);
+      const [attachment] = media;
+
+      // The path after wp-content/uploads/
+      expect(attachment.relativePath).toBe('2024/01/sunset.jpg');
+    });
+
+    // Same path/filename extraction, exercised against the media item of WXR_MIXED.
+    it('should extract relative path from mixed content media', () => {
+      const { media } = parser.parseXml(WXR_MIXED);
+      const [logo] = media;
+
+      expect(logo.relativePath).toBe('2024/02/logo.png');
+      expect(logo.filename).toBe('logo.png');
+    });
+
+    // Smoke check that parsing WXR_WITH_MEDIA still produces a media entry.
+    // NOTE(review): only the presence of the item is asserted; none of its fields are checked.
+    it('should handle empty content gracefully', () => {
+      const { media } = parser.parseXml(WXR_WITH_MEDIA);
+      // Media items in WXR often have empty excerpt
+      const [attachment] = media;
+      expect(attachment).toBeDefined();
+    });
+
+    // The media item of each fixture must report the MIME type expected for it.
+    it('should infer mime type from file extension', () => {
+      const jpegAttachment = parser.parseXml(WXR_WITH_MEDIA).media[0];
+      expect(jpegAttachment.mimeType).toBe('image/jpeg');
+
+      const pngAttachment = parser.parseXml(WXR_MIXED).media[0];
+      expect(pngAttachment.mimeType).toBe('image/png');
+    });
+
+    // A missing publication date must surface as null on the parsed page.
+    it('should handle missing pubDate gracefully', () => {
+      const { pages } = parser.parseXml(WXR_WITH_PAGE);
+      const [page] = pages;
+      // Page has no pubDate element
+      expect(page.pubDate).toBeNull();
+    });
+  });
+
+  // parseFile: covers the file-reading entry point using the mocked fs/promises readFile.
+  describe('parseFile', () => {
+    // Happy path: the file is read as UTF-8 and its contents are parsed.
+    it('should read a file and parse its contents', async () => {
+      const { readFile } = await import('fs/promises');
+      vi.mocked(readFile).mockResolvedValueOnce(WXR_WITH_POST);
+
+      const { posts } = await parser.parseFile('/path/to/export.xml');
+
+      expect(readFile).toHaveBeenCalledWith('/path/to/export.xml', 'utf-8');
+      expect(posts).toHaveLength(1);
+      expect(posts[0].title).toBe('Hello World');
+    });
+
+    // Failure path: a rejected read must propagate out of parseFile.
+    it('should throw an error if the file cannot be read', async () => {
+      const { readFile } = await import('fs/promises');
+      const readFailure = new Error('ENOENT');
+      vi.mocked(readFile).mockRejectedValueOnce(readFailure);
+
+      await expect(parser.parseFile('/nonexistent.xml')).rejects.toThrow('ENOENT');
+    });
+  });
+});