chore: updated reference files for details

2026-02-15 18:14:52 +01:00
parent 2a44ea454b
commit e158b2bcd8
4 changed files with 102 additions and 22 deletions
--- a/src/main/engine/ImportAnalysisEngine.ts
+++ b/src/main/engine/ImportAnalysisEngine.ts
@@ -562,22 +562,64 @@ export class ImportAnalysisEngine {
  }
  /**
   * Preserve line breaks and paragraph structure in content.
   *
   * WordPress exports often have:
   * - Plain text mixed with HTML
   * - Double newlines representing paragraph breaks
   * - Single newlines that should become <br>
   *
   * This function converts:
   * - Double newlines (\n\n) to paragraph breaks (</p><p>)
   * - Single newlines within text to <br>
   * - Wraps content in <p> tags if it starts with plain text and at least
   *   one paragraph break was inserted
   *
   * Text is converted wherever it appears: before the first tag, between two
   * tags, and after the last tag. Whitespace-only runs between tags are
   * treated as source formatting and left untouched.
   *
   * @param html - Raw post content; plain text, HTML, or a mix of both.
   * @returns The content with newlines made explicit. Empty or
   *          whitespace-only input is returned unchanged.
   */
  private preserveLineBreaks(html: string): string {
    if (!html || !html.trim()) return html;

    const newlinesToBr = (text: string): string => text.replace(/\n/g, '<br>');

    // Rewrites every text run that follows a tag: runs enclosed by two tags
    // and the final run after the last tag (which has no closing '<' and was
    // previously skipped). Whitespace-only runs are kept verbatim.
    const rewriteTextAfterTags = (
      input: string,
      convert: (text: string) => string,
    ): string =>
      input
        .replace(/>([^<]+)</g, (match: string, text: string) =>
          text.trim() ? '>' + convert(text) + '<' : match,
        )
        .replace(/>([^<]+)$/, (match: string, text: string) =>
          text.trim() ? '>' + convert(text) : match,
        );

    // Check if content starts with a tag or plain text
    const startsWithTag = /^\s*</.test(html);

    if (!startsWithTag) {
      // Record up front whether paragraph markers will be inserted. Probing
      // the output for '</p>' would also wrap content that merely contains
      // its own <p> elements, producing nested paragraphs.
      const hasParagraphBreaks = /\n\n/.test(html);

      // First, convert double newlines to paragraph markers
      let processed = html.replace(/\n\n+/g, '</p>\n<p>');

      // Convert remaining single newlines within text to <br>
      // (but not newlines that are just formatting between tags)
      processed = rewriteTextAfterTags(processed, newlinesToBr);

      // Also handle newlines at the start (before any tags)
      processed = processed.replace(/^([^<]+)/, (match: string, text: string) =>
        text.trim() ? newlinesToBr(text) : match,
      );

      // Wrap in <p> only if we added paragraph markers
      if (hasParagraphBreaks) {
        processed = '<p>' + processed + '</p>';
      }
      return processed;
    }

    // For content that starts with HTML, only text runs are touched: double
    // newlines become paragraph breaks first, then the remaining single
    // newlines become <br>.
    return rewriteTextAfterTags(html, (text: string) =>
      newlinesToBr(text.replace(/\n\n+/g, '</p><p>')),
    );
  }
  private calculateChecksum(content: string): string {
--- a/src/main/engine/ImportExecutionEngine.ts
+++ b/src/main/engine/ImportExecutionEngine.ts
@@ -754,22 +754,64 @@ export class ImportExecutionEngine extends EventEmitter {
  }
  /**
   * Preserve line breaks and paragraph structure in content.
   *
   * WordPress exports often have:
   * - Plain text mixed with HTML
   * - Double newlines representing paragraph breaks
   * - Single newlines that should become <br>
   *
   * This function converts:
   * - Double newlines (\n\n) to paragraph breaks (</p><p>)
   * - Single newlines within text to <br>
   * - Wraps content in <p> tags if it starts with plain text and at least
   *   one paragraph break was inserted
   *
   * Text is converted wherever it appears: before the first tag, between two
   * tags, and after the last tag. Whitespace-only runs between tags are
   * treated as source formatting and left untouched.
   *
   * @param html - Raw post content; plain text, HTML, or a mix of both.
   * @returns The content with newlines made explicit. Empty or
   *          whitespace-only input is returned unchanged.
   */
  private preserveLineBreaks(html: string): string {
    if (!html || !html.trim()) return html;

    const newlinesToBr = (text: string): string => text.replace(/\n/g, '<br>');

    // Rewrites every text run that follows a tag: runs enclosed by two tags
    // and the final run after the last tag (which has no closing '<' and was
    // previously skipped). Whitespace-only runs are kept verbatim.
    const rewriteTextAfterTags = (
      input: string,
      convert: (text: string) => string,
    ): string =>
      input
        .replace(/>([^<]+)</g, (match: string, text: string) =>
          text.trim() ? '>' + convert(text) + '<' : match,
        )
        .replace(/>([^<]+)$/, (match: string, text: string) =>
          text.trim() ? '>' + convert(text) : match,
        );

    // Check if content starts with a tag or plain text
    const startsWithTag = /^\s*</.test(html);

    if (!startsWithTag) {
      // Record up front whether paragraph markers will be inserted. Probing
      // the output for '</p>' would also wrap content that merely contains
      // its own <p> elements, producing nested paragraphs.
      const hasParagraphBreaks = /\n\n/.test(html);

      // First, convert double newlines to paragraph markers
      let processed = html.replace(/\n\n+/g, '</p>\n<p>');

      // Convert remaining single newlines within text to <br>
      // (but not newlines that are just formatting between tags)
      processed = rewriteTextAfterTags(processed, newlinesToBr);

      // Also handle newlines at the start (before any tags)
      processed = processed.replace(/^([^<]+)/, (match: string, text: string) =>
        text.trim() ? newlinesToBr(text) : match,
      );

      // Wrap in <p> only if we added paragraph markers
      if (hasParagraphBreaks) {
        processed = '<p>' + processed + '</p>';
      }
      return processed;
    }

    // For content that starts with HTML, only text runs are touched: double
    // newlines become paragraph breaks first, then the remaining single
    // newlines become <br>.
    return rewriteTextAfterTags(html, (text: string) =>
      newlinesToBr(text.replace(/\n\n+/g, '</p><p>')),
    );
  }
  /**
--- a/tests/assets/wxr-ref/what-a-superb-owl.md
+++ b/tests/assets/wxr-ref/what-a-superb-owl.md
@@ -13,6 +13,4 @@ categories:
 author: hugo
 publishedAt: '2011-02-06T22:02:46.000Z'
 ---
-![superb owl](http://28.media.tumblr.com/tumblr_lg7mqyuVsE1qzlnwmo1_500.png)
+![superb owl](http://28.media.tumblr.com/tumblr_lg7mqyuVsE1qzlnwmo1_500.png) [Warscheinlich von hier](http://jephjacques.tumblr.com/post/3148377589/superb-owl-joeks) (ich habs nur indirekt über Twitter mitbekommen).
 [Warscheinlich von hier](http://jephjacques.tumblr.com/post/3148377589/superb-owl-joeks) (ich habs nur indirekt über Twitter mitbekommen).
--- a/tests/assets/wxr-ref/wir-haben-geheiratet.md
+++ b/tests/assets/wxr-ref/wir-haben-geheiratet.md
@@ -14,6 +14,4 @@ categories:
 author: hugo
 publishedAt: '2011-09-04T14:50:06.000Z'
 ---
-Anstelle die Bilder schon hier hinzupacken gibt es nur einen Verweis auf ein Album von Bildern die meine Schwiegermutter gemacht hat - die offiziellen Fotos warten noch etwas (und ich selber hab ja keine gemacht).
+Anstelle die Bilder schon hier hinzupacken gibt es nur einen Verweis auf ein Album von Bildern die meine Schwiegermutter gemacht hat - die offiziellen Fotos warten noch etwas (und ich selber hab ja keine gemacht). FLASH PLAYER NOT SUPPORTED
 [[Embedded content: https://picasaweb.google.com/s/c/bin/slideshow.swf]]