diff --git a/REFACTOR_DUPLICATION.md b/REFACTOR_DUPLICATION.md
index a983815..839deac 100644
--- a/REFACTOR_DUPLICATION.md
+++ b/REFACTOR_DUPLICATION.md
@@ -261,7 +261,7 @@ Move color contrast logic into a shared renderer utility.
1. Phase 1 (finish API/store type convergence)
2. Phase 2 (finish TagEngine workflow dedup)
3. Phase 3 (finish PostMedia single/batch dedup)
-4. Phase 7 (WxrParser repeated parse blocks)
+4. ~~Phase 7 (WxrParser repeated parse blocks)~~ ✅ Completed
5. Phase 8 (MetaEngine ↔ TagEngine overlap)
6. Phase 9 (renderer tag event subscription helper)
7. Phase 10 (local UI repeated blocks in component files)
@@ -274,6 +274,8 @@ Rationale: complete in-flight high-impact phases first, then address newly detec
## Phase 7 — Consolidate WXR Item Parse Blocks
+Status: ✅ Completed
+
### Problem
`WxrParser` contains repeated `pubDate` parsing + return-shape scaffolding in nearby item parse paths.
@@ -289,6 +291,10 @@ Extract shared `parsePubDate` and/or shared item base builder helper to avoid dr
- No behavior change in parsed output.
- Duplicated `pubDate`/return scaffolding materially reduced.
+### Progress Check
+- Completed: extracted a shared `pubDate` parser helper (`parsePubDate`) and a shared base item builder (`parseItemBase`) used by the post and media parse paths.
+- Completed: added branch-focused tests for valid/invalid/missing `pubDate` and post/page parse-branch parity.
+
### Coverage & Test Quality (fresh run: `npm run test:coverage`)
- `src/main/engine/WxrParser.ts`: 93.55% statements, 100.00% functions, 67.14% branches.
- `tests/engine/WxrParser.test.ts`: 19 tests covering parse variants, status handling, metadata extraction, and file-read paths.
diff --git a/src/main/engine/WxrParser.ts b/src/main/engine/WxrParser.ts
index b885f2b..d735e37 100644
--- a/src/main/engine/WxrParser.ts
+++ b/src/main/engine/WxrParser.ts
@@ -87,6 +87,40 @@ const EXT_TO_MIME: Record = {
export class WxrParser {
+ private parsePubDate(item: Element): Date | null { // Reads the item's 'pubDate' text via getDirectChildText and converts it to a Date; null when the text is empty or unparseable.
+ const pubDateStr = this.getDirectChildText(item, 'pubDate');
+ if (!pubDateStr) { // helper returned a falsy value (e.g. empty string): nothing to parse
+ return null;
+ }
+
+ const parsed = new Date(pubDateStr); // the Date constructor does not throw on bad input; it yields an Invalid Date
+ return isNaN(parsed.getTime()) ? null : parsed; // an Invalid Date has a NaN timestamp, which is mapped to null
+ }
+
+ private parseItemBase(item: Element): { // Builds the field set read by both parsePostItem and parseMediaItem from a single item element.
+ wpId: number;
+ title: string;
+ slug: string;
+ content: string;
+ excerpt: string;
+ pubDate: Date | null;
+ creator: string;
+ status: string;
+ postType: string;
+ } {
+ return {
+ wpId: parseInt(this.getElementText(item, 'post_id', NS.wp) || '0', 10), // falsy text falls back to '0'; non-numeric text would still parse to NaN
+ title: this.getDirectChildText(item, 'title'),
+ slug: this.getElementText(item, 'post_name', NS.wp),
+ content: this.getElementText(item, 'encoded', NS.content), // 'encoded' looked up in the content namespace
+ excerpt: this.getElementText(item, 'encoded', NS.excerpt), // 'encoded' looked up in the excerpt namespace
+ pubDate: this.parsePubDate(item), // null when the pubDate text is empty or not a valid date
+ creator: this.getElementText(item, 'creator', NS.dc),
+ status: this.getElementText(item, 'status', NS.wp),
+ postType: this.getElementText(item, 'post_type', NS.wp),
+ };
+ }
+
async parseFile(filePath: string): Promise {
const content = await fs.readFile(filePath, 'utf-8');
return this.parseXml(content);
@@ -172,6 +206,7 @@ export class WxrParser {
}
private parsePostItem(item: Element): WxrPost {
+ const base = this.parseItemBase(item);
const categories: string[] = [];
const tags: string[] = [];
@@ -190,15 +225,6 @@ export class WxrParser {
}
}
- const pubDateStr = this.getDirectChildText(item, 'pubDate');
- let pubDate: Date | null = null;
- if (pubDateStr) {
- const parsed = new Date(pubDateStr);
- if (!isNaN(parsed.getTime())) {
- pubDate = parsed;
- }
- }
-
// Parse WordPress local post date (wp:post_date)
const postDateStr = this.getElementText(item, 'post_date', NS.wp);
let postDate: Date | null = null;
@@ -220,46 +246,38 @@ export class WxrParser {
}
return {
- wpId: parseInt(this.getElementText(item, 'post_id', NS.wp) || '0', 10),
- title: this.getDirectChildText(item, 'title'),
- slug: this.getElementText(item, 'post_name', NS.wp),
- content: this.getElementText(item, 'encoded', NS.content),
- excerpt: this.getElementText(item, 'encoded', NS.excerpt),
- pubDate,
+ wpId: base.wpId,
+ title: base.title,
+ slug: base.slug,
+ content: base.content,
+ excerpt: base.excerpt,
+ pubDate: base.pubDate,
postDate,
postModified,
- creator: this.getElementText(item, 'creator', NS.dc),
- status: this.getElementText(item, 'status', NS.wp),
- postType: this.getElementText(item, 'post_type', NS.wp),
+ creator: base.creator,
+ status: base.status,
+ postType: base.postType,
categories,
tags,
};
}
private parseMediaItem(item: Element): WxrMedia {
+ const base = this.parseItemBase(item); // shared builder also resolves slug/excerpt/creator/status/postType, which this path does not use
const url = this.getElementText(item, 'attachment_url', NS.wp);
const filename = this.extractFilename(url);
const relativePath = this.extractRelativePath(url);
- const pubDateStr = this.getDirectChildText(item, 'pubDate');
- let pubDate: Date | null = null;
- if (pubDateStr) {
- const parsed = new Date(pubDateStr);
- if (!isNaN(parsed.getTime())) {
- pubDate = parsed;
- }
- }
-
return {
- wpId: parseInt(this.getElementText(item, 'post_id', NS.wp) || '0', 10),
- title: this.getDirectChildText(item, 'title'),
+ wpId: base.wpId,
+ title: base.title,
url,
filename,
relativePath,
- pubDate,
+ pubDate: base.pubDate, // Date, or null when the pubDate text is empty or invalid
parentId: parseInt(this.getElementText(item, 'post_parent', NS.wp) || '0', 10),
mimeType: this.inferMimeType(filename),
- description: this.getElementText(item, 'encoded', NS.content),
+ description: base.content, // same 'encoded' / NS.content lookup the removed line performed inline
};
}
diff --git a/tests/engine/WxrParser.test.ts b/tests/engine/WxrParser.test.ts
index 087ceb5..a0b1238 100644
--- a/tests/engine/WxrParser.test.ts
+++ b/tests/engine/WxrParser.test.ts
@@ -158,6 +158,72 @@ const WXR_WITH_MEDIA = `
`;
+const WXR_WITH_MEDIA_PUBDATE = `
+
+
+ My Blog
+ https://example.com
+ Test
+ en
+ -
+ header-image
+ Fri, 05 Jan 2024 12:34:56 +0000
+
+
+ 101
+ header-image
+ inherit
+ attachment
+ 0
+ https://example.com/wp-content/uploads/2024/01/header.jpg
+
+
+
+`;
+
+const WXR_WITH_INVALID_PUBDATE = `
+
+
+ Dates Blog
+ https://example.com
+ Test
+ en
+ -
+ Bad Date Post
+ not-a-date
+ bad date
]]>
+
+ 201
+ bad-date-post
+ publish
+ post
+ 0
+
+
+ -
+ Bad Date Media
+ also-not-a-date
+
+
+ 202
+ bad-date-media
+ inherit
+ attachment
+ 0
+ https://example.com/wp-content/uploads/2024/01/bad-date.jpg
+
+
+
+`;
+
// WXR with mixed content: posts, pages, and media
const WXR_MIXED = `
{
// Page has no pubDate element
expect(result.pages[0].pubDate).toBeNull();
});
+
+ it('should parse valid RFC822 pubDate for media items', () => { // exercises the valid-date branch of pubDate parsing on the media path
+ const result = parser.parseXml(WXR_WITH_MEDIA_PUBDATE);
+
+ expect(result.media).toHaveLength(1);
+ expect(result.media[0].pubDate).toBeInstanceOf(Date);
+ expect(result.media[0].pubDate?.toISOString()).toBe('2024-01-05T12:34:56.000Z'); // fixture date carries a +0000 offset, so the UTC ISO string shows the same clock time
+ });
+
+ it('should fallback to null for invalid pubDate nodes in post and media items', () => { // exercises the unparseable-date branch for both item kinds
+ const result = parser.parseXml(WXR_WITH_INVALID_PUBDATE);
+
+ expect(result.posts).toHaveLength(1); // the item must still be parsed; only its pubDate is dropped
+ expect(result.media).toHaveLength(1);
+ expect(result.posts[0].pubDate).toBeNull();
+ expect(result.media[0].pubDate).toBeNull();
+ });
+
+ it('should keep base fields parity between post and page parse branches', () => { // NOTE(review): assertions below check presence and types only, not exact fixture values
+ const result = parser.parseXml(WXR_MIXED);
+ const post = result.posts[0];
+ const page = result.pages[0];
+
+ expect(post.postType).toBe('post');
+ expect(page.postType).toBe('page');
+ expect(post.wpId).toBeGreaterThan(0); // would fail if the id were missing (0) or non-numeric (NaN)
+ expect(page.wpId).toBeGreaterThan(0);
+ expect(post.title).toBeTruthy();
+ expect(page.title).toBeTruthy();
+ expect(post.slug).toBeTruthy();
+ expect(page.slug).toBeTruthy();
+ expect(typeof post.content).toBe('string'); // type check only: an empty string would also pass
+ expect(typeof page.content).toBe('string');
+ expect(typeof post.excerpt).toBe('string');
+ expect(typeof page.excerpt).toBe('string');
+ });
});
describe('parseFile', () => {