From e158b2bcd80615d0ee5d7803c880d177db368a0f Mon Sep 17 00:00:00 2001 From: hugo Date: Sun, 15 Feb 2026 18:14:52 +0100 Subject: [PATCH] chore: updated reference files for details --- src/main/engine/ImportAnalysisEngine.ts | 58 +++++++++++++++++--- src/main/engine/ImportExecutionEngine.ts | 58 +++++++++++++++++--- tests/assets/wxr-ref/what-a-superb-owl.md | 4 +- tests/assets/wxr-ref/wir-haben-geheiratet.md | 4 +- 4 files changed, 102 insertions(+), 22 deletions(-) diff --git a/src/main/engine/ImportAnalysisEngine.ts b/src/main/engine/ImportAnalysisEngine.ts index cbdd443..e50078d 100644 --- a/src/main/engine/ImportAnalysisEngine.ts +++ b/src/main/engine/ImportAnalysisEngine.ts @@ -562,20 +562,62 @@ export class ImportAnalysisEngine { } /** - * Preserve line breaks in HTML content by converting \n to
tags - * Only converts newlines that appear within meaningful text content, - * not newlines that are just whitespace between tags + * Preserve line breaks and paragraph structure in content. + * + * WordPress exports often have: + * - Plain text mixed with HTML + * - Double newlines representing paragraph breaks + * - Single newlines that should become
+ * + * This function converts: + * - Double newlines (\n\n) to paragraph breaks (

) + * - Single newlines within text to
+ * - Wraps content in

tags if it starts with plain text */ private preserveLineBreaks(html: string): string { - // Convert newlines that appear within text content (between > and <) - // But only if the text content has actual content before or after the newline + if (!html || !html.trim()) return html; + + // Check if content starts with a tag or plain text + const startsWithTag = /^\s*\n

'); + + // Convert remaining single newlines within text to
+ // (but not newlines that are just between tags) + processed = processed.replace(/>([^<]+) { + if (!textContent.trim()) { + return '>' + textContent + '<'; + } + const preserved = textContent.replace(/\n/g, '
'); + return '>' + preserved + '<'; + }); + + // Also handle newlines at the start (before any tags) + processed = processed.replace(/^([^<]+)/g, (match, textContent: string) => { + if (!textContent.trim()) return match; + return textContent.replace(/\n/g, '
'); + }); + + // Wrap in

if we added paragraph markers + if (processed.includes('

')) { + processed = '

' + processed + '

'; + } + + return processed; + } + + // For content that starts with HTML, handle newlines within text content return html.replace(/>([^<]+) { - // Skip if the text content is only whitespace (just formatting between tags) if (!textContent.trim()) { return '>' + textContent + '<'; } - // Replace all newlines with
(the text has actual content) - const preserved = textContent.replace(/\n/g, '
'); + // First convert double newlines to paragraph breaks + let preserved = textContent.replace(/\n\n+/g, '

'); + // Then convert remaining single newlines to
+ preserved = preserved.replace(/\n/g, '
'); return '>' + preserved + '<'; }); } diff --git a/src/main/engine/ImportExecutionEngine.ts b/src/main/engine/ImportExecutionEngine.ts index 32cfdac..645e017 100644 --- a/src/main/engine/ImportExecutionEngine.ts +++ b/src/main/engine/ImportExecutionEngine.ts @@ -754,20 +754,62 @@ export class ImportExecutionEngine extends EventEmitter { } /** - * Preserve line breaks in HTML content by converting \n to
tags - * Only converts newlines that appear within meaningful text content, - * not newlines that are just whitespace between tags + * Preserve line breaks and paragraph structure in content. + * + * WordPress exports often have: + * - Plain text mixed with HTML + * - Double newlines representing paragraph breaks + * - Single newlines that should become
+ * + * This function converts: + * - Double newlines (\n\n) to paragraph breaks (

) + * - Single newlines within text to
+ * - Wraps content in

tags if it starts with plain text */ private preserveLineBreaks(html: string): string { - // Convert newlines that appear within text content (between > and <) - // But only if the text content has actual content before or after the newline + if (!html || !html.trim()) return html; + + // Check if content starts with a tag or plain text + const startsWithTag = /^\s*\n

'); + + // Convert remaining single newlines within text to
+ // (but not newlines that are just between tags) + processed = processed.replace(/>([^<]+) { + if (!textContent.trim()) { + return '>' + textContent + '<'; + } + const preserved = textContent.replace(/\n/g, '
'); + return '>' + preserved + '<'; + }); + + // Also handle newlines at the start (before any tags) + processed = processed.replace(/^([^<]+)/g, (match, textContent: string) => { + if (!textContent.trim()) return match; + return textContent.replace(/\n/g, '
'); + }); + + // Wrap in

if we added paragraph markers + if (processed.includes('

')) { + processed = '

' + processed + '

'; + } + + return processed; + } + + // For content that starts with HTML, handle newlines within text content return html.replace(/>([^<]+) { - // Skip if the text content is only whitespace (just formatting between tags) if (!textContent.trim()) { return '>' + textContent + '<'; } - // Replace all newlines with
(the text has actual content) - const preserved = textContent.replace(/\n/g, '
'); + // First convert double newlines to paragraph breaks + let preserved = textContent.replace(/\n\n+/g, '

'); + // Then convert remaining single newlines to
+ preserved = preserved.replace(/\n/g, '
'); return '>' + preserved + '<'; }); } diff --git a/tests/assets/wxr-ref/what-a-superb-owl.md b/tests/assets/wxr-ref/what-a-superb-owl.md index 4e7766f..7bcc6da 100644 --- a/tests/assets/wxr-ref/what-a-superb-owl.md +++ b/tests/assets/wxr-ref/what-a-superb-owl.md @@ -13,6 +13,4 @@ categories: author: hugo publishedAt: '2011-02-06T22:02:46.000Z' --- -![superb owl](http://28.media.tumblr.com/tumblr_lg7mqyuVsE1qzlnwmo1_500.png) - -[Warscheinlich von hier](http://jephjacques.tumblr.com/post/3148377589/superb-owl-joeks) (ich habs nur indirekt über Twitter mitbekommen). +![superb owl](http://28.media.tumblr.com/tumblr_lg7mqyuVsE1qzlnwmo1_500.png) [Warscheinlich von hier](http://jephjacques.tumblr.com/post/3148377589/superb-owl-joeks) (ich habs nur indirekt über Twitter mitbekommen). diff --git a/tests/assets/wxr-ref/wir-haben-geheiratet.md b/tests/assets/wxr-ref/wir-haben-geheiratet.md index 5ae1e98..e9f8045 100644 --- a/tests/assets/wxr-ref/wir-haben-geheiratet.md +++ b/tests/assets/wxr-ref/wir-haben-geheiratet.md @@ -14,6 +14,4 @@ categories: author: hugo publishedAt: '2011-09-04T14:50:06.000Z' --- -Anstelle die Bilder schon hier hinzupacken gibt es nur einen Verweis auf ein Album von Bildern die meine Schwiegermutter gemacht hat - die offiziellen Fotos warten noch etwas (und ich selber hab ja keine gemacht). - -[[Embedded content: https://picasaweb.google.com/s/c/bin/slideshow.swf]] +Anstelle die Bilder schon hier hinzupacken gibt es nur einen Verweis auf ein Album von Bildern die meine Schwiegermutter gemacht hat - die offiziellen Fotos warten noch etwas (und ich selber hab ja keine gemacht). FLASH PLAYER NOT SUPPORTED