chore: updated reference files for details

2026-02-15 18:14:52 +01:00
parent 2a44ea454b
commit e158b2bcd8
4 changed files with 102 additions and 22 deletions
--- a/src/main/engine/ImportAnalysisEngine.ts
+++ b/src/main/engine/ImportAnalysisEngine.ts
@@ -562,22 +562,64 @@ export class ImportAnalysisEngine {
  }

  /**
-   * Preserve line breaks in HTML content by converting \n to <br> tags
-   * Only converts newlines that appear within meaningful text content,
-   * not newlines that are just whitespace between tags
+   * Preserve line breaks and paragraph structure in content.
+   * 
+   * WordPress exports often have:
+   * - Plain text mixed with HTML
+   * - Double newlines representing paragraph breaks
+   * - Single newlines that should become <br>
+   * 
+   * This function converts:
+   * - Runs of two or more newlines to paragraph breaks (</p><p>)
+   * - Single newlines within text to <br>
+   * - Wraps plain-text-leading content in <p> only if the result contains </p>
   */
  private preserveLineBreaks(html: string): string {
-    // Convert newlines that appear within text content (between > and <)
-    // But only if the text content has actual content before or after the newline
-    return html.replace(/>([^<]+)</g, (_match, textContent: string) => {
-      // Skip if the text content is only whitespace (just formatting between tags)
+    if (!html || !html.trim()) return html;
+
+    // Check if content starts with a tag or plain text
+    const startsWithTag = /^\s*</.test(html);
+    
+    // If it starts with plain text, we need to handle the whole content differently
+    if (!startsWithTag) {
+      // First, convert double newlines to paragraph markers
+      let processed = html.replace(/\n\n+/g, '</p>\n<p>');
+      
+      // Convert remaining single newlines within text to <br>
+      // (but not newlines that are just between tags)
+      processed = processed.replace(/>([^<]+)</g, (_match, textContent: string) => {
        if (!textContent.trim()) {
          return '>' + textContent + '<';
        }
-      // Replace all newlines with <br> (the text has actual content)
        const preserved = textContent.replace(/\n/g, '<br>');
        return '>' + preserved + '<';
      });
+      
+      // Also handle newlines at the start (before any tags)
+      processed = processed.replace(/^([^<]+)/g, (match, textContent: string) => {
+        if (!textContent.trim()) return match;
+        return textContent.replace(/\n/g, '<br>');
+      });
+      
+      // Wrap in <p> if we added paragraph markers
+      if (processed.includes('</p>')) {
+        processed = '<p>' + processed + '</p>';
+      }
+      
+      return processed;
+    }
+
+    // For content that starts with HTML, handle newlines within text content
+    return html.replace(/>([^<]+)</g, (_match, textContent: string) => {
+      if (!textContent.trim()) {
+        return '>' + textContent + '<';
+      }
+      // First convert double newlines to paragraph breaks
+      let preserved = textContent.replace(/\n\n+/g, '</p><p>');
+      // Then convert remaining single newlines to <br>
+      preserved = preserved.replace(/\n/g, '<br>');
+      return '>' + preserved + '<';
+    });
  }

  private calculateChecksum(content: string): string {
--- a/src/main/engine/ImportExecutionEngine.ts
+++ b/src/main/engine/ImportExecutionEngine.ts
@@ -754,22 +754,64 @@ export class ImportExecutionEngine extends EventEmitter {
  }

  /**
-   * Preserve line breaks in HTML content by converting \n to <br> tags
-   * Only converts newlines that appear within meaningful text content,
-   * not newlines that are just whitespace between tags
+   * Preserve line breaks and paragraph structure in content.
+   * 
+   * WordPress exports often have:
+   * - Plain text mixed with HTML
+   * - Double newlines representing paragraph breaks
+   * - Single newlines that should become <br>
+   * 
+   * This function converts:
+   * - Runs of two or more newlines to paragraph breaks (</p><p>)
+   * - Single newlines within text to <br>
+   * - Wraps plain-text-leading content in <p> only if the result contains </p>
   */
  private preserveLineBreaks(html: string): string {
-    // Convert newlines that appear within text content (between > and <)
-    // But only if the text content has actual content before or after the newline
-    return html.replace(/>([^<]+)</g, (_match, textContent: string) => {
-      // Skip if the text content is only whitespace (just formatting between tags)
+    if (!html || !html.trim()) return html;
+
+    // Check if content starts with a tag or plain text
+    const startsWithTag = /^\s*</.test(html);
+    
+    // If it starts with plain text, we need to handle the whole content differently
+    if (!startsWithTag) {
+      // First, convert double newlines to paragraph markers
+      let processed = html.replace(/\n\n+/g, '</p>\n<p>');
+      
+      // Convert remaining single newlines within text to <br>
+      // (but not newlines that are just between tags)
+      processed = processed.replace(/>([^<]+)</g, (_match, textContent: string) => {
        if (!textContent.trim()) {
          return '>' + textContent + '<';
        }
-      // Replace all newlines with <br> (the text has actual content)
        const preserved = textContent.replace(/\n/g, '<br>');
        return '>' + preserved + '<';
      });
+      
+      // Also handle newlines at the start (before any tags)
+      processed = processed.replace(/^([^<]+)/g, (match, textContent: string) => {
+        if (!textContent.trim()) return match;
+        return textContent.replace(/\n/g, '<br>');
+      });
+      
+      // Wrap in <p> if we added paragraph markers
+      if (processed.includes('</p>')) {
+        processed = '<p>' + processed + '</p>';
+      }
+      
+      return processed;
+    }
+
+    // For content that starts with HTML, handle newlines within text content
+    return html.replace(/>([^<]+)</g, (_match, textContent: string) => {
+      if (!textContent.trim()) {
+        return '>' + textContent + '<';
+      }
+      // First convert double newlines to paragraph breaks
+      let preserved = textContent.replace(/\n\n+/g, '</p><p>');
+      // Then convert remaining single newlines to <br>
+      preserved = preserved.replace(/\n/g, '<br>');
+      return '>' + preserved + '<';
+    });
  }

  /**
--- a/tests/assets/wxr-ref/what-a-superb-owl.md
+++ b/tests/assets/wxr-ref/what-a-superb-owl.md
@@ -13,6 +13,4 @@ categories:
 author: hugo
 publishedAt: '2011-02-06T22:02:46.000Z'
 ---
-![superb owl](http://28.media.tumblr.com/tumblr_lg7mqyuVsE1qzlnwmo1_500.png)
-
-[Warscheinlich von hier](http://jephjacques.tumblr.com/post/3148377589/superb-owl-joeks) (ich habs nur indirekt über Twitter mitbekommen).
+![superb owl](http://28.media.tumblr.com/tumblr_lg7mqyuVsE1qzlnwmo1_500.png) [Warscheinlich von hier](http://jephjacques.tumblr.com/post/3148377589/superb-owl-joeks) (ich habs nur indirekt über Twitter mitbekommen).
--- a/tests/assets/wxr-ref/wir-haben-geheiratet.md
+++ b/tests/assets/wxr-ref/wir-haben-geheiratet.md
@@ -14,6 +14,4 @@ categories:
 author: hugo
 publishedAt: '2011-09-04T14:50:06.000Z'
 ---
-Anstelle die Bilder schon hier hinzupacken gibt es nur einen Verweis auf ein Album von Bildern die meine Schwiegermutter gemacht hat - die offiziellen Fotos warten noch etwas (und ich selber hab ja keine gemacht).
-
-[[Embedded content: https://picasaweb.google.com/s/c/bin/slideshow.swf]]
+Anstelle die Bilder schon hier hinzupacken gibt es nur einen Verweis auf ein Album von Bildern die meine Schwiegermutter gemacht hat - die offiziellen Fotos warten noch etwas (und ich selber hab ja keine gemacht). FLASH PLAYER NOT SUPPORTED