From 416a7ad5d3b8049bbb34645a90a0ab366d6aa9ad Mon Sep 17 00:00:00 2001 From: hugo Date: Sun, 15 Feb 2026 09:25:07 +0100 Subject: [PATCH] fix: handling linked images better --- src/main/engine/ImportAnalysisEngine.ts | 70 +++++++++++++++++- src/main/engine/ImportExecutionEngine.ts | 74 ++++++++++++++++++- tests/assets/import-test-cases.wxr | 56 +++++++++++++- .../engine/ImportExecutionEngine.e2e.test.ts | 62 +++++++++++++++- 4 files changed, 257 insertions(+), 5 deletions(-) diff --git a/src/main/engine/ImportAnalysisEngine.ts b/src/main/engine/ImportAnalysisEngine.ts index fa11342..5a13fb6 100644 --- a/src/main/engine/ImportAnalysisEngine.ts +++ b/src/main/engine/ImportAnalysisEngine.ts @@ -171,6 +171,53 @@ export class ImportAnalysisEngine { bulletListMarker: '-', }); + // Custom rule for linked images: -> ![alt](src) + // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images + this.turndown.addRule('linkedImage', { + filter: (node) => { + // Match tags that contain only an (possibly with whitespace) + if (node.nodeName !== 'A') return false; + const children = Array.from(node.childNodes).filter( + child => !(child.nodeType === 3 && !child.textContent?.trim()) + ); + return children.length === 1 && children[0].nodeName === 'IMG'; + }, + replacement: (_content, node) => { + const anchor = node as HTMLAnchorElement; + const img = anchor.querySelector('img'); + if (!img) return ''; + + const href = anchor.getAttribute('href') || ''; + const imgSrc = img.getAttribute('src') || ''; + const imgAlt = img.getAttribute('alt') || ''; + const imgTitle = img.getAttribute('title') || ''; + + // Check if the link href points to an image (common WordPress pattern for "click for larger") + const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i; + const hrefIsImage = imageExtensions.test(href); + + // Determine which URL to use: + // - If href is an image URL (WordPress "click for full-size" pattern), use the href + // - Otherwise, use the original image src + const imageUrl = hrefIsImage ? href : imgSrc; + + // Derive alt text: use image alt if not empty, otherwise extract filename from the URL + let altText = imgAlt.trim(); + if (!altText) { + // Extract filename from the image URL + const urlPath = imageUrl.split('?')[0]; // Remove query string + const filename = urlPath.split('/').pop() || ''; + altText = filename; + } + + // Build the markdown image link + if (imgTitle) { + return `![${altText}](${imageUrl} "${imgTitle}")`; + } + return `![${altText}](${imageUrl})`; + }, + }); + // Load macro definitions from shared config this.loadMacroConfigsFromShared(); } @@ -482,7 +529,28 @@ export class ImportAnalysisEngine { private convertToMarkdown(html: string): string { if (!html || !html.trim()) return ''; - return this.turndown.turndown(html); + // Preprocess: Convert newlines within text to
tags to preserve line breaks + const preprocessed = this.preserveLineBreaks(html); + return this.turndown.turndown(preprocessed); + } + + /** + * Preserve line breaks in HTML content by converting \n to
tags + * Only converts newlines that appear within meaningful text content, + * not newlines that are just whitespace between tags + */ + private preserveLineBreaks(html: string): string { + // Convert newlines that appear within text content (between > and <) + // But only if the text content has actual content before or after the newline + return html.replace(/>([^<]+) { + // Skip if the text content is only whitespace (just formatting between tags) + if (!textContent.trim()) { + return '>' + textContent + '<'; + } + // Replace all newlines with
(the text has actual content) + const preserved = textContent.replace(/\n/g, '
'); + return '>' + preserved + '<'; + }); } private calculateChecksum(content: string): string { diff --git a/src/main/engine/ImportExecutionEngine.ts b/src/main/engine/ImportExecutionEngine.ts index 2695de6..5e68a4d 100644 --- a/src/main/engine/ImportExecutionEngine.ts +++ b/src/main/engine/ImportExecutionEngine.ts @@ -80,6 +80,53 @@ export class ImportExecutionEngine extends EventEmitter { codeBlockStyle: 'fenced', bulletListMarker: '-', }); + + // Custom rule for linked images:
-> ![alt](src) + // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images + this.turndown.addRule('linkedImage', { + filter: (node) => { + // Match tags that contain only an (possibly with whitespace) + if (node.nodeName !== 'A') return false; + const children = Array.from(node.childNodes).filter( + child => !(child.nodeType === 3 && !child.textContent?.trim()) + ); + return children.length === 1 && children[0].nodeName === 'IMG'; + }, + replacement: (_content, node) => { + const anchor = node as HTMLAnchorElement; + const img = anchor.querySelector('img'); + if (!img) return ''; + + const href = anchor.getAttribute('href') || ''; + const imgSrc = img.getAttribute('src') || ''; + const imgAlt = img.getAttribute('alt') || ''; + const imgTitle = img.getAttribute('title') || ''; + + // Check if the link href points to an image (common WordPress pattern for "click for larger") + const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i; + const hrefIsImage = imageExtensions.test(href); + + // Determine which URL to use: + // - If href is an image URL (WordPress "click for full-size" pattern), use the href + // - Otherwise, use the original image src + const imageUrl = hrefIsImage ? href : imgSrc; + + // Derive alt text: use image alt if not empty, otherwise extract filename from the URL + let altText = imgAlt.trim(); + if (!altText) { + // Extract filename from the image URL + const urlPath = imageUrl.split('?')[0]; // Remove query string + const filename = urlPath.split('/').pop() || ''; + altText = filename; + } + + // Build the markdown image link + if (imgTitle) { + return `![${altText}](${imageUrl} "${imgTitle}")`; + } + return `![${altText}](${imageUrl})`; + }, + }); } setProjectContext(projectId: string, dataDir?: string): void { @@ -639,7 +686,13 @@ export class ImportExecutionEngine extends EventEmitter { */ private convertToMarkdown(html: string): string { if (!html || !html.trim()) return ''; - let markdown = this.turndown.turndown(html); + + // Preprocess: Convert newlines within text to
tags to preserve line breaks + // This handles the common case where WordPress exports have line breaks in the XML + // that should be preserved in markdown + const preprocessed = this.preserveLineBreaks(html); + + let markdown = this.turndown.turndown(preprocessed); // Unescape double-bracket macros that TurndownService escaped // \[\[ becomes [[ and \]\] becomes ]] markdown = markdown.replace(/\\\[\\\[/g, '[[').replace(/\\\]\\\]/g, ']]'); @@ -650,6 +703,25 @@ export class ImportExecutionEngine extends EventEmitter { return markdown; } + /** + * Preserve line breaks in HTML content by converting \n to
tags + * Only converts newlines that appear within meaningful text content, + * not newlines that are just whitespace between tags + */ + private preserveLineBreaks(html: string): string { + // Convert newlines that appear within text content (between > and <) + // But only if the text content has actual content before or after the newline + return html.replace(/>([^<]+) { + // Skip if the text content is only whitespace (just formatting between tags) + if (!textContent.trim()) { + return '>' + textContent + '<'; + } + // Replace all newlines with
(the text has actual content) + const preserved = textContent.replace(/\n/g, '
'); + return '>' + preserved + '<'; + }); + } + /** * Transform WordPress shortcodes [shortcode] to [[shortcode]] */ diff --git a/tests/assets/import-test-cases.wxr b/tests/assets/import-test-cases.wxr index 8f32359..dbcb5d3 100644 --- a/tests/assets/import-test-cases.wxr +++ b/tests/assets/import-test-cases.wxr @@ -16,7 +16,8 @@ - Post ID 104: Links and images - Post ID 105: Code blocks (inline and fenced) - Post ID 106: Blockquotes - - Post ID 107: Tables + - Post ID 107: Linked images with empty/missing alt + - Post ID 108: Line breaks preservation 2. WORDPRESS SHORTCODE/MACRO CONVERSION - Post ID 201: [gallery] shortcode → [[gallery]] macro @@ -297,6 +298,59 @@ with multiple lines]]> 0 + + + HTML Formatting Test: Linked Images + https://testblog.example.com/html-formatting-linked-images/ + Sun, 07 Jan 2024 10:00:00 +0000 + + + Here is an image inside a link with empty alt (common WordPress pattern):

+
+

Another linked image with no alt attribute at all:

+ +

Linked image where link and image src are the same:

+ +

For comparison, an image with proper alt inside a link should preserve the alt:

+Company Logo]]> + + 107 + 2024-01-07 10:00:00 + 2024-01-07 10:00:00 + 2024-01-07 10:00:00 + 2024-01-07 10:00:00 + html-formatting-linked-images + publish + post + 0 + + + + + HTML Formatting Test: Line Breaks + https://testblog.example.com/html-formatting-line-breaks/ + Mon, 08 Jan 2024 10:00:00 +0000 + + + This paragraph has line breaks +inside the text that should +be preserved in markdown.

+

Here is another paragraph +with different content +on multiple lines.

+

Single line paragraph for comparison.

]]>
+ + 108 + 2024-01-08 10:00:00 + 2024-01-08 10:00:00 + 2024-01-08 10:00:00 + 2024-01-08 10:00:00 + html-formatting-line-breaks + publish + post + 0 +
+ diff --git a/tests/engine/ImportExecutionEngine.e2e.test.ts b/tests/engine/ImportExecutionEngine.e2e.test.ts index 9de38f6..b554126 100644 --- a/tests/engine/ImportExecutionEngine.e2e.test.ts +++ b/tests/engine/ImportExecutionEngine.e2e.test.ts @@ -353,8 +353,9 @@ describe('ImportExecutionEngine E2E Tests', () => { expect(content).toContain('![Test image](https://example.com/image.jpg)'); expect(content).toContain('![Photo](https://example.com/photo.png'); - // Verify linked image - expect(content).toContain('[![Banner](https://example.com/banner.jpg)](https://example.com)'); + // Verify linked image - should become a plain image (link is unwrapped) + // The link href is not an image URL, so the image src is used + expect(content).toContain('![Banner](https://example.com/banner.jpg)'); }); it('should convert code blocks (inline and fenced)', async () => { @@ -404,6 +405,63 @@ describe('ImportExecutionEngine E2E Tests', () => { expect(content).toContain('> Outer quote'); expect(content).toContain('> > Inner quote'); }); + + it('should convert linked images with empty alt to plain images with derived alt', async () => { + // Post 107: Linked Images with empty/missing alt + const post = wxrData.posts.find(p => p.wpId === 107); + expect(post).toBeDefined(); + + const report = createSinglePostReport(post!); + await engine.executeImport(report, {}); + + const writtenFile = writtenFiles.find(f => f.path.includes('html-formatting-linked-images')); + expect(writtenFile).toBeDefined(); + + const content = writtenFile!.content; + + // Linked image with empty alt should become a plain image with filename-derived alt + // The link target is the full-size image, so use that for the image src + expect(content).toContain('![full-size.png](http://example.com/wp-content/uploads/2020/03/full-size.png)'); + + // Linked image with no alt attribute (link and image different) + expect(content).toContain('![photo.jpg](http://example.com/gallery/photo.jpg)'); + + // Linked image where link and src are the same + expect(content).toContain('![photo.jpg](http://example.com/photo.jpg)'); + + // Image with proper alt inside link should preserve the alt text + expect(content).toContain('![Company Logo](http://example.com/logo.png)'); + + // Should NOT have empty image alt text (the broken pattern we're fixing) + expect(content).not.toMatch(/!\[\]\([^)]+\)/); + }); + + it('should preserve line breaks in paragraph text', async () => { + // Post 108: Line Breaks Preservation + const post = wxrData.posts.find(p => p.wpId === 108); + expect(post).toBeDefined(); + + const report = createSinglePostReport(post!); + await engine.executeImport(report, {}); + + const writtenFile = writtenFiles.find(f => f.path.includes('html-formatting-line-breaks')); + expect(writtenFile).toBeDefined(); + + const content = writtenFile!.content; + + // Line breaks within paragraphs should be preserved as markdown line breaks + // (either as two trailing spaces + newline, or as actual newlines) + // The key is that "inside the text that should" appears on a separate line from + // "This paragraph has line breaks" + expect(content).toMatch(/has line breaks\s*\n.*inside the text/); + expect(content).toMatch(/inside the text that should\s*\n.*be preserved/); + + // Second paragraph should also preserve line breaks + expect(content).toMatch(/another paragraph\s*\n.*with different content/); + + // Single line paragraph should remain intact + expect(content).toContain('Single line paragraph for comparison.'); + }); }); // ==========================================================================