diff --git a/REFACTOR_DUPLICATION.md b/REFACTOR_DUPLICATION.md index a983815..839deac 100644 --- a/REFACTOR_DUPLICATION.md +++ b/REFACTOR_DUPLICATION.md @@ -261,7 +261,7 @@ Move color contrast logic into a shared renderer utility. 1. Phase 1 (finish API/store type convergence) 2. Phase 2 (finish TagEngine workflow dedup) 3. Phase 3 (finish PostMedia single/batch dedup) -4. Phase 7 (WxrParser repeated parse blocks) +4. ~~Phase 7 (WxrParser repeated parse blocks)~~ ✅ Completed 5. Phase 8 (MetaEngine ↔ TagEngine overlap) 6. Phase 9 (renderer tag event subscription helper) 7. Phase 10 (local UI repeated blocks in component files) @@ -274,6 +274,8 @@ Rationale: complete in-flight high-impact phases first, then address newly detec ## Phase 7 — Consolidate WXR Item Parse Blocks +Status: ✅ Completed + ### Problem `WxrParser` contains repeated `pubDate` parsing + return-shape scaffolding in nearby item parse paths. @@ -289,6 +291,10 @@ Extract shared `parsePubDate` and/or shared item base builder helper to avoid dr - No behavior change in parsed output. - Duplicated `pubDate`/return scaffolding materially reduced. +### Progress Check +- Completed: extracted shared `pubDate` parser helper and shared base item builder for post/media parse paths. +- Completed: added branch-focused tests for valid/invalid/missing `pubDate` and post/page parse-branch parity. + ### Coverage & Test Quality (fresh run: `npm run test:coverage`) - `src/main/engine/WxrParser.ts`: 93.55% statements, 100.00% functions, 67.14% branches. - `tests/engine/WxrParser.test.ts`: 19 tests covering parse variants, status handling, metadata extraction, and file-read paths. diff --git a/src/main/engine/WxrParser.ts b/src/main/engine/WxrParser.ts index b885f2b..d735e37 100644 --- a/src/main/engine/WxrParser.ts +++ b/src/main/engine/WxrParser.ts @@ -87,6 +87,40 @@ const EXT_TO_MIME: Record = { export class WxrParser { + private parsePubDate(item: Element): Date | null { + const pubDateStr = this.getDirectChildText(item, 'pubDate'); + if (!pubDateStr) { + return null; + } + + const parsed = new Date(pubDateStr); + return isNaN(parsed.getTime()) ? null : parsed; + } + + private parseItemBase(item: Element): { + wpId: number; + title: string; + slug: string; + content: string; + excerpt: string; + pubDate: Date | null; + creator: string; + status: string; + postType: string; + } { + return { + wpId: parseInt(this.getElementText(item, 'post_id', NS.wp) || '0', 10), + title: this.getDirectChildText(item, 'title'), + slug: this.getElementText(item, 'post_name', NS.wp), + content: this.getElementText(item, 'encoded', NS.content), + excerpt: this.getElementText(item, 'encoded', NS.excerpt), + pubDate: this.parsePubDate(item), + creator: this.getElementText(item, 'creator', NS.dc), + status: this.getElementText(item, 'status', NS.wp), + postType: this.getElementText(item, 'post_type', NS.wp), + }; + } + async parseFile(filePath: string): Promise { const content = await fs.readFile(filePath, 'utf-8'); return this.parseXml(content); @@ -172,6 +206,7 @@ export class WxrParser { } private parsePostItem(item: Element): WxrPost { + const base = this.parseItemBase(item); const categories: string[] = []; const tags: string[] = []; @@ -190,15 +225,6 @@ export class WxrParser { } } - const pubDateStr = this.getDirectChildText(item, 'pubDate'); - let pubDate: Date | null = null; - if (pubDateStr) { - const parsed = new Date(pubDateStr); - if (!isNaN(parsed.getTime())) { - pubDate = parsed; - } - } - // Parse WordPress local post date (wp:post_date) const postDateStr = this.getElementText(item, 'post_date', NS.wp); let postDate: Date | null = null; @@ -220,46 +246,38 @@ export class WxrParser { } return { - wpId: parseInt(this.getElementText(item, 'post_id', NS.wp) || '0', 10), - title: this.getDirectChildText(item, 'title'), - slug: this.getElementText(item, 'post_name', NS.wp), - content: this.getElementText(item, 'encoded', NS.content), - excerpt: this.getElementText(item, 'encoded', NS.excerpt), - pubDate, + wpId: base.wpId, + title: base.title, + slug: base.slug, + content: base.content, + excerpt: base.excerpt, + pubDate: base.pubDate, postDate, postModified, - creator: this.getElementText(item, 'creator', NS.dc), - status: this.getElementText(item, 'status', NS.wp), - postType: this.getElementText(item, 'post_type', NS.wp), + creator: base.creator, + status: base.status, + postType: base.postType, categories, tags, }; } private parseMediaItem(item: Element): WxrMedia { + const base = this.parseItemBase(item); const url = this.getElementText(item, 'attachment_url', NS.wp); const filename = this.extractFilename(url); const relativePath = this.extractRelativePath(url); - const pubDateStr = this.getDirectChildText(item, 'pubDate'); - let pubDate: Date | null = null; - if (pubDateStr) { - const parsed = new Date(pubDateStr); - if (!isNaN(parsed.getTime())) { - pubDate = parsed; - } - } - return { - wpId: parseInt(this.getElementText(item, 'post_id', NS.wp) || '0', 10), - title: this.getDirectChildText(item, 'title'), + wpId: base.wpId, + title: base.title, url, filename, relativePath, - pubDate, + pubDate: base.pubDate, parentId: parseInt(this.getElementText(item, 'post_parent', NS.wp) || '0', 10), mimeType: this.inferMimeType(filename), - description: this.getElementText(item, 'encoded', NS.content), + description: base.content, }; } diff --git a/tests/engine/WxrParser.test.ts b/tests/engine/WxrParser.test.ts index 087ceb5..a0b1238 100644 --- a/tests/engine/WxrParser.test.ts +++ b/tests/engine/WxrParser.test.ts @@ -158,6 +158,72 @@ const WXR_WITH_MEDIA = ` `; +const WXR_WITH_MEDIA_PUBDATE = ` + + + My Blog + https://example.com + Test + en + + header-image + Fri, 05 Jan 2024 12:34:56 +0000 + + + 101 + header-image + inherit + attachment + 0 + https://example.com/wp-content/uploads/2024/01/header.jpg + + + +`; + +const WXR_WITH_INVALID_PUBDATE = ` + + + Dates Blog + https://example.com + Test + en + + Bad Date Post + not-a-date + bad date

]]>
+ + 201 + bad-date-post + publish + post + 0 + +
+ + Bad Date Media + also-not-a-date + + + 202 + bad-date-media + inherit + attachment + 0 + https://example.com/wp-content/uploads/2024/01/bad-date.jpg + + +
+
`; + // WXR with mixed content: posts, pages, and media const WXR_MIXED = ` { // Page has no pubDate element expect(result.pages[0].pubDate).toBeNull(); }); + + it('should parse valid RFC822 pubDate for media items', () => { + const result = parser.parseXml(WXR_WITH_MEDIA_PUBDATE); + + expect(result.media).toHaveLength(1); + expect(result.media[0].pubDate).toBeInstanceOf(Date); + expect(result.media[0].pubDate?.toISOString()).toBe('2024-01-05T12:34:56.000Z'); + }); + + it('should fallback to null for invalid pubDate nodes in post and media items', () => { + const result = parser.parseXml(WXR_WITH_INVALID_PUBDATE); + + expect(result.posts).toHaveLength(1); + expect(result.media).toHaveLength(1); + expect(result.posts[0].pubDate).toBeNull(); + expect(result.media[0].pubDate).toBeNull(); + }); + + it('should keep base fields parity between post and page parse branches', () => { + const result = parser.parseXml(WXR_MIXED); + const post = result.posts[0]; + const page = result.pages[0]; + + expect(post.postType).toBe('post'); + expect(page.postType).toBe('page'); + expect(post.wpId).toBeGreaterThan(0); + expect(page.wpId).toBeGreaterThan(0); + expect(post.title).toBeTruthy(); + expect(page.title).toBeTruthy(); + expect(post.slug).toBeTruthy(); + expect(page.slug).toBeTruthy(); + expect(typeof post.content).toBe('string'); + expect(typeof page.content).toBe('string'); + expect(typeof post.excerpt).toBe('string'); + expect(typeof page.excerpt).toBe('string'); + }); }); describe('parseFile', () => {