From 416a7ad5d3b8049bbb34645a90a0ab366d6aa9ad Mon Sep 17 00:00:00 2001
From: hugo <hugoms@me.com>
Date: Sun, 15 Feb 2026 09:25:07 +0100
Subject: [PATCH] fix: handle linked images better and preserve line breaks

---
 src/main/engine/ImportAnalysisEngine.ts       | 70 +++++++++++++++++-
 src/main/engine/ImportExecutionEngine.ts      | 74 ++++++++++++++++++-
 tests/assets/import-test-cases.wxr            | 56 +++++++++++++-
 .../engine/ImportExecutionEngine.e2e.test.ts  | 62 +++++++++++++++-
 4 files changed, 257 insertions(+), 5 deletions(-)
diff --git a/src/main/engine/ImportAnalysisEngine.ts b/src/main/engine/ImportAnalysisEngine.ts
index fa11342..5a13fb6 100644
--- a/src/main/engine/ImportAnalysisEngine.ts
+++ b/src/main/engine/ImportAnalysisEngine.ts
@@ -171,6 +171,53 @@ export class ImportAnalysisEngine {
       bulletListMarker: '-',
     });
     
+    // Custom rule for linked images: <a><img></a> -> ![alt](url); the wrapping link itself is dropped
+    // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
+    this.turndown.addRule('linkedImage', {
+      filter: (node) => {
+        // Match <a> elements whose only child, ignoring whitespace-only text nodes (nodeType 3), is an <img>
+        if (node.nodeName !== 'A') return false;
+        const children = Array.from(node.childNodes).filter(
+          child => !(child.nodeType === 3 && !child.textContent?.trim())
+        );
+        return children.length === 1 && children[0].nodeName === 'IMG';
+      },
+      replacement: (_content, node) => {
+        const anchor = node as HTMLAnchorElement;
+        const img = anchor.querySelector('img');
+        if (!img) return '';
+
+        const href = anchor.getAttribute('href') || '';
+        const imgSrc = img.getAttribute('src') || '';
+        const imgAlt = img.getAttribute('alt') || '';
+        const imgTitle = img.getAttribute('title') || '';
+
+        // Check if the link href ends in an image extension, optionally followed by a ?query (a #fragment is not recognised)
+        const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;
+        const hrefIsImage = imageExtensions.test(href);
+
+        // Determine which URL to use:
+        // - If href is an image URL (WordPress "click for full-size" pattern), use the href
+        // - Otherwise, use the original image src (the non-image link target is discarded)
+        const imageUrl = hrefIsImage ? href : imgSrc;
+
+        // Derive alt text: use the trimmed image alt if not empty, otherwise the filename from the chosen URL
+        let altText = imgAlt.trim();
+        if (!altText) {
+          // Last path segment of the chosen URL (empty string when the path ends in '/')
+          const urlPath = imageUrl.split('?')[0]; // Remove query string
+          const filename = urlPath.split('/').pop() || '';
+          altText = filename;
+        }
+
+        // Build the markdown image, appending the title only when present (the title is not escaped)
+        if (imgTitle) {
+          return `![${altText}](${imageUrl} "${imgTitle}")`;
+        }
+        return `![${altText}](${imageUrl})`;
+      },
+    });
+    
     // Load macro definitions from shared config
     this.loadMacroConfigsFromShared();
   }
@@ -482,7 +529,28 @@ export class ImportAnalysisEngine {
 
   private convertToMarkdown(html: string): string {
     if (!html || !html.trim()) return '';
-    return this.turndown.turndown(html);
+    // Preprocess: Convert newlines within text to <br> tags to preserve line breaks
+    const preprocessed = this.preserveLineBreaks(html);
+    return this.turndown.turndown(preprocessed);
+  }
+
+  /**
+   * Preserve line breaks in HTML content by converting newlines to <br> tags.
+   * Only newlines inside non-blank text runs (between a '>' and the next '<')
+   * are converted; whitespace-only runs between tags are left untouched.
+   * <pre>...</pre> blocks are copied verbatim: Turndown's fenced-code rule reads
+   * the <code> child's textContent, so an injected <br> would join code lines.
+   */
+  private preserveLineBreaks(html: string): string {
+    // First alternative swallows a whole <pre> block (minus its final '>') so
+    // the text-run alternative never sees inside it; the lookaheads leave each
+    // delimiter in place to start the next match.
+    const pattern = /<pre(?=[\s>])[\s\S]*?<\/pre\s*(?=>)|>([^<]+)(?=<)/gi;
+    return html.replace(pattern, (match: string, textContent?: string) => {
+      // <pre> blocks and whitespace-only runs pass through unchanged
+      if (textContent === undefined || !textContent.trim()) return match;
+      return '>' + textContent.replace(/\r?\n/g, '<br>');
+    });
   }
 
   private calculateChecksum(content: string): string {
diff --git a/src/main/engine/ImportExecutionEngine.ts b/src/main/engine/ImportExecutionEngine.ts
index 2695de6..5e68a4d 100644
--- a/src/main/engine/ImportExecutionEngine.ts
+++ b/src/main/engine/ImportExecutionEngine.ts
@@ -80,6 +80,53 @@ export class ImportExecutionEngine extends EventEmitter {
       codeBlockStyle: 'fenced',
       bulletListMarker: '-',
     });
+
+    // Custom rule for linked images: <a><img></a> -> ![alt](url); the wrapping link itself is dropped
+    // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
+    this.turndown.addRule('linkedImage', {
+      filter: (node) => {
+        // Match <a> elements whose only child, ignoring whitespace-only text nodes (nodeType 3), is an <img>
+        if (node.nodeName !== 'A') return false;
+        const children = Array.from(node.childNodes).filter(
+          child => !(child.nodeType === 3 && !child.textContent?.trim())
+        );
+        return children.length === 1 && children[0].nodeName === 'IMG';
+      },
+      replacement: (_content, node) => {
+        const anchor = node as HTMLAnchorElement;
+        const img = anchor.querySelector('img');
+        if (!img) return '';
+
+        const href = anchor.getAttribute('href') || '';
+        const imgSrc = img.getAttribute('src') || '';
+        const imgAlt = img.getAttribute('alt') || '';
+        const imgTitle = img.getAttribute('title') || '';
+
+        // Check if the link href ends in an image extension, optionally followed by a ?query (a #fragment is not recognised)
+        const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;
+        const hrefIsImage = imageExtensions.test(href);
+
+        // Determine which URL to use:
+        // - If href is an image URL (WordPress "click for full-size" pattern), use the href
+        // - Otherwise, use the original image src (the non-image link target is discarded)
+        const imageUrl = hrefIsImage ? href : imgSrc;
+
+        // Derive alt text: use the trimmed image alt if not empty, otherwise the filename from the chosen URL
+        let altText = imgAlt.trim();
+        if (!altText) {
+          // Last path segment of the chosen URL (empty string when the path ends in '/')
+          const urlPath = imageUrl.split('?')[0]; // Remove query string
+          const filename = urlPath.split('/').pop() || '';
+          altText = filename;
+        }
+
+        // Build the markdown image, appending the title only when present (the title is not escaped)
+        if (imgTitle) {
+          return `![${altText}](${imageUrl} "${imgTitle}")`;
+        }
+        return `![${altText}](${imageUrl})`;
+      },
+    });
   }
 
   setProjectContext(projectId: string, dataDir?: string): void {
@@ -639,7 +686,13 @@ export class ImportExecutionEngine extends EventEmitter {
    */
   private convertToMarkdown(html: string): string {
     if (!html || !html.trim()) return '';
-    let markdown = this.turndown.turndown(html);
+
+    // Preprocess: Convert newlines within text to <br> tags to preserve line breaks
+    // This handles the common case where WordPress exports have line breaks in the XML
+    // that should be preserved in markdown
+    const preprocessed = this.preserveLineBreaks(html);
+
+    let markdown = this.turndown.turndown(preprocessed);
     // Unescape double-bracket macros that TurndownService escaped
     // \[\[ becomes [[ and \]\] becomes ]]
     markdown = markdown.replace(/\\\[\\\[/g, '[[').replace(/\\\]\\\]/g, ']]');
@@ -650,6 +703,25 @@ export class ImportExecutionEngine extends EventEmitter {
     return markdown;
   }
 
+  /**
+   * Preserve line breaks in HTML content by converting newlines to <br> tags.
+   * Only newlines inside non-blank text runs (between a '>' and the next '<')
+   * are converted; whitespace-only runs between tags are left untouched.
+   * <pre>...</pre> blocks are copied verbatim: Turndown's fenced-code rule reads
+   * the <code> child's textContent, so an injected <br> would join code lines.
+   */
+  private preserveLineBreaks(html: string): string {
+    // First alternative swallows a whole <pre> block (minus its final '>') so
+    // the text-run alternative never sees inside it; the lookaheads leave each
+    // delimiter in place to start the next match.
+    const pattern = /<pre(?=[\s>])[\s\S]*?<\/pre\s*(?=>)|>([^<]+)(?=<)/gi;
+    return html.replace(pattern, (match: string, textContent?: string) => {
+      // <pre> blocks and whitespace-only runs pass through unchanged
+      if (textContent === undefined || !textContent.trim()) return match;
+      return '>' + textContent.replace(/\r?\n/g, '<br>');
+    });
+  }
+
   /**
    * Transform WordPress shortcodes [shortcode] to [[shortcode]]
    */
diff --git a/tests/assets/import-test-cases.wxr b/tests/assets/import-test-cases.wxr
index 8f32359..dbcb5d3 100644
--- a/tests/assets/import-test-cases.wxr
+++ b/tests/assets/import-test-cases.wxr
@@ -16,7 +16,8 @@
      - Post ID 104: Links and images
      - Post ID 105: Code blocks (inline and fenced)
      - Post ID 106: Blockquotes
-     - Post ID 107: Tables
+     - Post ID 107: Linked images with empty/missing alt
+     - Post ID 108: Line breaks preservation
      
   2. WORDPRESS SHORTCODE/MACRO CONVERSION
      - Post ID 201: [gallery] shortcode → [[gallery]] macro
@@ -297,6 +298,59 @@ with multiple lines</pre>]]></content:encoded>
       <wp:post_parent>0</wp:post_parent>
     </item>
     
+    <!-- Post 107: Linked Images with empty/missing alt -->
+    <item>
+      <title>HTML Formatting Test: Linked Images</title>
+      <link>https://testblog.example.com/html-formatting-linked-images/</link>
+      <pubDate>Sun, 07 Jan 2024 10:00:00 +0000</pubDate>
+      <dc:creator><![CDATA[testauthor]]></dc:creator>
+      <category domain="category" nicename="technology"><![CDATA[Technology]]></category>
+      <content:encoded><![CDATA[<p>Here is an image inside a link with empty alt (common WordPress pattern):</p>
+<a href="http://example.com/wp-content/uploads/2020/03/full-size.png"><img class="size-medium wp-image-7801 aligncenter" src="http://example.com/wp-content/uploads/2020/03/thumbnail.png" alt="" width="300" height="223" /></a>
+<p>Another linked image with no alt attribute at all:</p>
+<a href="http://example.com/gallery/photo.jpg"><img src="http://example.com/gallery/photo-thumb.jpg" /></a>
+<p>Linked image where link and image src are the same:</p>
+<a href="http://example.com/photo.jpg"><img src="http://example.com/photo.jpg" alt="" /></a>
+<p>For comparison, an image with proper alt inside a link should preserve the alt:</p>
+<a href="http://example.com/about"><img src="http://example.com/logo.png" alt="Company Logo" /></a>]]></content:encoded>
+      <excerpt:encoded><![CDATA[Testing linked images conversion]]></excerpt:encoded>
+      <wp:post_id>107</wp:post_id>
+      <wp:post_date>2024-01-07 10:00:00</wp:post_date>
+      <wp:post_date_gmt>2024-01-07 10:00:00</wp:post_date_gmt>
+      <wp:post_modified>2024-01-07 10:00:00</wp:post_modified>
+      <wp:post_modified_gmt>2024-01-07 10:00:00</wp:post_modified_gmt>
+      <wp:post_name>html-formatting-linked-images</wp:post_name>
+      <wp:status>publish</wp:status>
+      <wp:post_type>post</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+    </item>
+    
+    <!-- Post 108: Line Breaks Preservation -->
+    <item>
+      <title>HTML Formatting Test: Line Breaks</title>
+      <link>https://testblog.example.com/html-formatting-line-breaks/</link>
+      <pubDate>Mon, 08 Jan 2024 10:00:00 +0000</pubDate>
+      <dc:creator><![CDATA[testauthor]]></dc:creator>
+      <category domain="category" nicename="technology"><![CDATA[Technology]]></category>
+      <content:encoded><![CDATA[<p>This paragraph has line breaks
+inside the text that should
+be preserved in markdown.</p>
+<p>Here is another paragraph
+with different content
+on multiple lines.</p>
+<p>Single line paragraph for comparison.</p>]]></content:encoded>
+      <excerpt:encoded><![CDATA[Testing line break preservation]]></excerpt:encoded>
+      <wp:post_id>108</wp:post_id>
+      <wp:post_date>2024-01-08 10:00:00</wp:post_date>
+      <wp:post_date_gmt>2024-01-08 10:00:00</wp:post_date_gmt>
+      <wp:post_modified>2024-01-08 10:00:00</wp:post_modified>
+      <wp:post_modified_gmt>2024-01-08 10:00:00</wp:post_modified_gmt>
+      <wp:post_name>html-formatting-line-breaks</wp:post_name>
+      <wp:status>publish</wp:status>
+      <wp:post_type>post</wp:post_type>
+      <wp:post_parent>0</wp:post_parent>
+    </item>
+    
     <!-- ======================================== -->
     <!-- SECTION 2: SHORTCODE/MACRO CONVERSION   -->
     <!-- ======================================== -->
diff --git a/tests/engine/ImportExecutionEngine.e2e.test.ts b/tests/engine/ImportExecutionEngine.e2e.test.ts
index 9de38f6..b554126 100644
--- a/tests/engine/ImportExecutionEngine.e2e.test.ts
+++ b/tests/engine/ImportExecutionEngine.e2e.test.ts
@@ -353,8 +353,9 @@ describe('ImportExecutionEngine E2E Tests', () => {
       expect(content).toContain('![Test image](https://example.com/image.jpg)');
       expect(content).toContain('![Photo](https://example.com/photo.png');
 
-      // Verify linked image
-      expect(content).toContain('[![Banner](https://example.com/banner.jpg)](https://example.com)');
+      // Verify linked image - should become a plain image (link is unwrapped)
+      // The link href is not an image URL, so the image src is used
+      expect(content).toContain('![Banner](https://example.com/banner.jpg)');
     });
 
     it('should convert code blocks (inline and fenced)', async () => {
@@ -404,6 +405,63 @@ describe('ImportExecutionEngine E2E Tests', () => {
       expect(content).toContain('> Outer quote');
       expect(content).toContain('> > Inner quote');
     });
+
+    it('should convert linked images with empty alt to plain images with derived alt', async () => {
+      // Post 107 fixture: four <a><img></a> cases (empty alt, missing alt, href equal to src, real alt)
+      const post = wxrData.posts.find(p => p.wpId === 107);
+      expect(post).toBeDefined();
+
+      const report = createSinglePostReport(post!);
+      await engine.executeImport(report, {});
+
+      const writtenFile = writtenFiles.find(f => f.path.includes('html-formatting-linked-images'));
+      expect(writtenFile).toBeDefined();
+
+      const content = writtenFile!.content;
+
+      // Linked image with empty alt should become a plain image with filename-derived alt
+      // The link target is the full-size image, so it replaces the thumbnail src
+      expect(content).toContain('![full-size.png](http://example.com/wp-content/uploads/2020/03/full-size.png)');
+
+      // No alt attribute and href differs from src: the href image wins and the alt is its filename
+      expect(content).toContain('![photo.jpg](http://example.com/gallery/photo.jpg)');
+
+      // Linked image where link and src are the same
+      expect(content).toContain('![photo.jpg](http://example.com/photo.jpg)');
+
+      // href is a page, not an image: the img src and its real alt are kept and the link is dropped
+      expect(content).toContain('![Company Logo](http://example.com/logo.png)');
+
+      // Should NOT have empty image alt text (the broken pattern we're fixing)
+      expect(content).not.toMatch(/!\[\]\([^)]+\)/);
+    });
+
+    it('should preserve line breaks in paragraph text', async () => {
+      // Post 108 fixture: two multi-line paragraphs and one single-line paragraph
+      const post = wxrData.posts.find(p => p.wpId === 108);
+      expect(post).toBeDefined();
+
+      const report = createSinglePostReport(post!);
+      await engine.executeImport(report, {});
+
+      const writtenFile = writtenFiles.find(f => f.path.includes('html-formatting-line-breaks'));
+      expect(writtenFile).toBeDefined();
+
+      const content = writtenFile!.content;
+
+      // Line breaks within paragraphs should be preserved as markdown line breaks
+      // (either as two trailing spaces + newline, or as actual newlines); the
+      // \s*\n in each pattern accepts both forms. The key is that "inside the text
+      // that should" appears on a separate line from "This paragraph has line breaks"
+      expect(content).toMatch(/has line breaks\s*\n.*inside the text/);
+      expect(content).toMatch(/inside the text that should\s*\n.*be preserved/);
+
+      // Second paragraph should also preserve line breaks
+      expect(content).toMatch(/another paragraph\s*\n.*with different content/);
+
+      // Single line paragraph should remain intact
+      expect(content).toContain('Single line paragraph for comparison.');
+    });
   });
 
   // ==========================================================================