diff --git a/src/main/engine/ImportAnalysisEngine.ts b/src/main/engine/ImportAnalysisEngine.ts
index fa11342..5a13fb6 100644
--- a/src/main/engine/ImportAnalysisEngine.ts
+++ b/src/main/engine/ImportAnalysisEngine.ts
@@ -171,6 +171,53 @@ export class ImportAnalysisEngine {
bulletListMarker: '-',
});
+ // Custom rule for linked images: <a><img></a> -> ![alt](url)
+ // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
+ this.turndown.addRule('linkedImage', {
+ filter: (node) => {
+ // Match <a> tags that contain only an <img> (possibly with whitespace)
+ if (node.nodeName !== 'A') return false;
+ const children = Array.from(node.childNodes).filter(
+ child => !(child.nodeType === 3 && !child.textContent?.trim())
+ );
+ return children.length === 1 && children[0].nodeName === 'IMG';
+ },
+ replacement: (_content, node) => {
+ const anchor = node as HTMLAnchorElement;
+ const img = anchor.querySelector('img');
+ if (!img) return '';
+
+ const href = anchor.getAttribute('href') || '';
+ const imgSrc = img.getAttribute('src') || '';
+ const imgAlt = img.getAttribute('alt') || '';
+ const imgTitle = img.getAttribute('title') || '';
+
+ // Check if the link href points to an image (common WordPress pattern for "click for larger")
+ const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;
+ const hrefIsImage = imageExtensions.test(href);
+
+ // Determine which URL to use:
+ // - If href is an image URL (WordPress "click for full-size" pattern), use the href
+ // - Otherwise, use the original image src
+ const imageUrl = hrefIsImage ? href : imgSrc;
+
+ // Derive alt text: use image alt if not empty, otherwise extract filename from the URL
+ let altText = imgAlt.trim();
+ if (!altText) {
+ // Extract filename from the image URL
+ const urlPath = imageUrl.split('?')[0]; // Remove query string
+ const filename = urlPath.split('/').pop() || '';
+ altText = filename;
+ }
+
+ // Build the markdown image link
+ if (imgTitle) {
+ return `![${altText}](${imageUrl} "${imgTitle}")`;
+ }
+ return `![${altText}](${imageUrl})`;
+ },
+ });
+
// Load macro definitions from shared config
this.loadMacroConfigsFromShared();
}
@@ -482,7 +529,28 @@ export class ImportAnalysisEngine {
private convertToMarkdown(html: string): string {
if (!html || !html.trim()) return '';
- return this.turndown.turndown(html);
+ // Preprocess: Convert newlines within text to <br> tags to preserve line breaks
+ const preprocessed = this.preserveLineBreaks(html);
+ return this.turndown.turndown(preprocessed);
+ }
+
+ /**
+ * Preserve line breaks in HTML content by converting \n to <br> tags
+ * Only converts newlines that appear within meaningful text content,
+ * not newlines that are just whitespace between tags
+ */
+ private preserveLineBreaks(html: string): string {
+ // Convert newlines that appear within text content (between > and <)
+ // But only if the text content has actual content before or after the newline
+ return html.replace(/>([^<]+)</g, (_match, textContent) => {
+ // Skip if the text content is only whitespace (just formatting between tags)
+ if (!textContent.trim()) {
+ return '>' + textContent + '<';
+ }
+ // Replace all newlines with <br> (the text has actual content)
+ const preserved = textContent.replace(/\n/g, '<br>');
+ return '>' + preserved + '<';
+ });
}
private calculateChecksum(content: string): string {
diff --git a/src/main/engine/ImportExecutionEngine.ts b/src/main/engine/ImportExecutionEngine.ts
index 2695de6..5e68a4d 100644
--- a/src/main/engine/ImportExecutionEngine.ts
+++ b/src/main/engine/ImportExecutionEngine.ts
@@ -80,6 +80,53 @@ export class ImportExecutionEngine extends EventEmitter {
codeBlockStyle: 'fenced',
bulletListMarker: '-',
});
+
+ // Custom rule for linked images: <a><img></a> -> ![alt](url)
+ // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
+ this.turndown.addRule('linkedImage', {
+ filter: (node) => {
+ // Match <a> tags that contain only an <img> (possibly with whitespace)
+ if (node.nodeName !== 'A') return false;
+ const children = Array.from(node.childNodes).filter(
+ child => !(child.nodeType === 3 && !child.textContent?.trim())
+ );
+ return children.length === 1 && children[0].nodeName === 'IMG';
+ },
+ replacement: (_content, node) => {
+ const anchor = node as HTMLAnchorElement;
+ const img = anchor.querySelector('img');
+ if (!img) return '';
+
+ const href = anchor.getAttribute('href') || '';
+ const imgSrc = img.getAttribute('src') || '';
+ const imgAlt = img.getAttribute('alt') || '';
+ const imgTitle = img.getAttribute('title') || '';
+
+ // Check if the link href points to an image (common WordPress pattern for "click for larger")
+ const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;
+ const hrefIsImage = imageExtensions.test(href);
+
+ // Determine which URL to use:
+ // - If href is an image URL (WordPress "click for full-size" pattern), use the href
+ // - Otherwise, use the original image src
+ const imageUrl = hrefIsImage ? href : imgSrc;
+
+ // Derive alt text: use image alt if not empty, otherwise extract filename from the URL
+ let altText = imgAlt.trim();
+ if (!altText) {
+ // Extract filename from the image URL
+ const urlPath = imageUrl.split('?')[0]; // Remove query string
+ const filename = urlPath.split('/').pop() || '';
+ altText = filename;
+ }
+
+ // Build the markdown image link
+ if (imgTitle) {
+ return `![${altText}](${imageUrl} "${imgTitle}")`;
+ }
+ return `![${altText}](${imageUrl})`;
+ },
+ });
}
setProjectContext(projectId: string, dataDir?: string): void {
@@ -639,7 +686,13 @@ export class ImportExecutionEngine extends EventEmitter {
*/
private convertToMarkdown(html: string): string {
if (!html || !html.trim()) return '';
- let markdown = this.turndown.turndown(html);
+
+ // Preprocess: Convert newlines within text to <br> tags to preserve line breaks
+ // This handles the common case where WordPress exports have line breaks in the XML
+ // that should be preserved in markdown
+ const preprocessed = this.preserveLineBreaks(html);
+
+ let markdown = this.turndown.turndown(preprocessed);
// Unescape double-bracket macros that TurndownService escaped
// \[\[ becomes [[ and \]\] becomes ]]
markdown = markdown.replace(/\\\[\\\[/g, '[[').replace(/\\\]\\\]/g, ']]');
@@ -650,6 +703,25 @@ export class ImportExecutionEngine extends EventEmitter {
return markdown;
}
+ /**
+ * Preserve line breaks in HTML content by converting \n to <br> tags
+ * Only converts newlines that appear within meaningful text content,
+ * not newlines that are just whitespace between tags
+ */
+ private preserveLineBreaks(html: string): string {
+ // Convert newlines that appear within text content (between > and <)
+ // But only if the text content has actual content before or after the newline
+ return html.replace(/>([^<]+)</g, (_match, textContent) => {
+ // Skip if the text content is only whitespace (just formatting between tags)
+ if (!textContent.trim()) {
+ return '>' + textContent + '<';
+ }
+ // Replace all newlines with <br> (the text has actual content)
+ const preserved = textContent.replace(/\n/g, '<br>');
+ return '>' + preserved + '<';
+ });
+ }
+
/**
* Transform WordPress shortcodes [shortcode] to [[shortcode]]
*/
diff --git a/tests/assets/import-test-cases.wxr b/tests/assets/import-test-cases.wxr
index 8f32359..dbcb5d3 100644
--- a/tests/assets/import-test-cases.wxr
+++ b/tests/assets/import-test-cases.wxr
@@ -16,7 +16,8 @@
- Post ID 104: Links and images
- Post ID 105: Code blocks (inline and fenced)
- Post ID 106: Blockquotes
- - Post ID 107: Tables
+ - Post ID 107: Linked images with empty/missing alt
+ - Post ID 108: Line breaks preservation
2. WORDPRESS SHORTCODE/MACRO CONVERSION
- Post ID 201: [gallery] shortcode → [[gallery]] macro
@@ -297,6 +298,59 @@ with multiple lines]]>
Another linked image with no alt attribute at all:
+Linked image where link and image src are the same:
+
+For comparison, an image with proper alt inside a link should preserve the alt:
+
]]>
+ Here is another paragraph +with different content +on multiple lines.
+Single line paragraph for comparison.
]]>