fix: handling linked images better

2026-02-15 09:25:07 +01:00
parent b5795867a8
commit 416a7ad5d3
4 changed files with 257 additions and 5 deletions
--- a/src/main/engine/ImportExecutionEngine.ts
+++ b/src/main/engine/ImportExecutionEngine.ts
@@ -80,6 +80,53 @@ export class ImportExecutionEngine extends EventEmitter {
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    });
+
+    // Custom rule for linked images: <a><img></a> -> ![alt](url "title")
+    // WordPress commonly wraps a thumbnail <img> in a link to the full-size file. This rule
+    // collapses that pair into a single Markdown image instead of a link wrapping an image.
+    this.turndown.addRule('linkedImage', {
+      // Match <a> elements whose only child, ignoring whitespace-only text nodes, is an <img>.
+      filter: (node) => {
+        if (node.nodeName !== 'A') return false;
+        // nodeType 3 is a text node; whitespace-only ones are just formatting around the <img>.
+        const children = Array.from(node.childNodes).filter(
+          child => !(child.nodeType === 3 && !child.textContent?.trim())
+        );
+        return children.length === 1 && children[0].nodeName === 'IMG';
+      },
+      // Build the Markdown image; returns '' when there is no <img> or no usable URL.
+      replacement: (_content, node) => {
+        const anchor = node as HTMLAnchorElement;
+        const img = anchor.querySelector('img');
+        if (!img) return '';
+
+        const href = anchor.getAttribute('href') || '';
+        const imgSrc = img.getAttribute('src') || '';
+        const imgAlt = img.getAttribute('alt') || '';
+        const imgTitle = img.getAttribute('title') || '';
+
+        // Check if the link href points to an image (common WordPress pattern for "click for larger").
+        // A trailing query string or #fragment is allowed after the extension.
+        const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)([?#].*)?$/i;
+        const hrefIsImage = imageExtensions.test(href);
+
+        // Determine which URL to use:
+        // - If href is an image URL (WordPress "click for full-size" pattern), use the href
+        // - Otherwise, use the original image src
+        const imageUrl = hrefIsImage ? href : imgSrc;
+
+        // Without any URL the result would be a dead `![alt]()`, so emit nothing instead.
+        if (!imageUrl) return '';
+
+        // Derive alt text: use image alt if not empty, otherwise extract filename from the URL
+        let altText = imgAlt.trim();
+        if (!altText) {
+          // Drop the query string / fragment, then keep the last path segment.
+          const [urlPath = ''] = imageUrl.split(/[?#]/, 1);
+          altText = urlPath.split('/').pop() || '';
+        }
+
+        // Collapse line breaks and escape square brackets so the alt text cannot
+        // terminate the `![...]` part of the image syntax early.
+        altText = altText.replace(/\s*\n\s*/g, ' ').replace(/[\[\]]/g, '\\$&');
+
+        // Build the markdown image link
+        if (imgTitle) {
+          // Collapse line breaks and escape double quotes so the title stays one quoted string.
+          const title = imgTitle.replace(/\s*\n\s*/g, ' ').replace(/"/g, '\\"');
+          return `![${altText}](${imageUrl} "${title}")`;
+        }
+        return `![${altText}](${imageUrl})`;
+      },
+    });
  }

  setProjectContext(projectId: string, dataDir?: string): void {
@@ -639,7 +686,13 @@ export class ImportExecutionEngine extends EventEmitter {
   */
  private convertToMarkdown(html: string): string {
    if (!html || !html.trim()) return '';
-    let markdown = this.turndown.turndown(html);
+
+    // Preprocess: Convert newlines within text to <br> tags to preserve line breaks
+    // This handles the common case where WordPress exports have line breaks in the XML
+    // that should be preserved in markdown
+    const preprocessed = this.preserveLineBreaks(html);
+
+    let markdown = this.turndown.turndown(preprocessed);
    // Unescape double-bracket macros that TurndownService escaped
    // \[\[ becomes [[ and \]\] becomes ]]
    markdown = markdown.replace(/\\\[\\\[/g, '[[').replace(/\\\]\\\]/g, ']]');
@@ -650,6 +703,25 @@ export class ImportExecutionEngine extends EventEmitter {
    return markdown;
  }

+  /**
+   * Preserve line breaks in HTML content by converting newlines to <br> tags.
+   *
+   * Only text runs enclosed between two tags (directly after a `>` and before the next `<`)
+   * that contain non-whitespace characters are rewritten. Whitespace-only runs are treated
+   * as formatting between tags and left alone, and text that is not enclosed this way
+   * (for example before the first tag) is not touched.
+   *
+   * Complete <pre>, <script>, <style> and <textarea> elements are copied through unchanged:
+   * their newlines are already significant, and replacing them with <br> would strip the
+   * line breaks from preformatted/code content.
+   *
+   * @param html - HTML fragment to preprocess
+   * @returns the HTML with in-text newlines (LF or CRLF) replaced by <br>
+   */
+  private preserveLineBreaks(html: string): string {
+    // First alternative: a whole verbatim element (tag name captured for the back-reference).
+    // Second alternative: a text run between `>` and `<`; look-arounds keep the delimiters
+    // out of the match so a following `<pre ...>` is still seen by the first alternative.
+    return html.replace(
+      /<(pre|script|style|textarea)\b[^>]*>[\s\S]*?<\/\1\s*>|(?<=>)([^<]+)(?=<)/gi,
+      (match: string, verbatimTag: string | undefined, textContent: string | undefined) => {
+        // Leave verbatim elements and whitespace-only runs exactly as they were.
+        if (verbatimTag !== undefined || textContent === undefined || !textContent.trim()) {
+          return match;
+        }
+        // Replace all newlines with <br> (the text has actual content); `\r?` keeps
+        // CRLF input from leaving a stray carriage return in front of the <br>.
+        return textContent.replace(/\r?\n/g, '<br>');
+      }
+    );
+  }
+
  /**
   * Transform WordPress shortcodes [shortcode] to [[shortcode]]
   */