fix: handling linked images better

This commit is contained in:
2026-02-15 09:25:07 +01:00
parent b5795867a8
commit 416a7ad5d3
4 changed files with 257 additions and 5 deletions

View File

@@ -80,6 +80,53 @@ export class ImportExecutionEngine extends EventEmitter {
codeBlockStyle: 'fenced',
bulletListMarker: '-',
});
// Custom rule for linked images: <a><img></a> -> ![alt](url "title")
// This handles the common WordPress pattern of wrapping thumbnails in links to full-size images.
// NOTE(review): when the href does NOT point at an image, only the image is emitted and the
// link target is discarded — confirm that dropping the link is intended.
this.turndown.addRule('linkedImage', {
  // Matches an <a> whose only child, ignoring whitespace-only text nodes, is an <img>.
  filter: (node) => {
    if (node.nodeName !== 'A') return false;
    const meaningfulChildren = Array.from(node.childNodes).filter(
      child => !(child.nodeType === 3 && !child.textContent?.trim())
    );
    return meaningfulChildren.length === 1 && meaningfulChildren[0]?.nodeName === 'IMG';
  },
  // Builds the Markdown image from the anchor's href and the image's src/alt/title.
  replacement: (_content, node) => {
    const anchor = node as HTMLAnchorElement;
    const img = anchor.querySelector('img');
    if (!img) return '';

    const href = anchor.getAttribute('href') ?? '';
    const imgSrc = img.getAttribute('src') ?? '';
    const imgAlt = img.getAttribute('alt') ?? '';
    const imgTitle = img.getAttribute('title') ?? '';

    // An href counts as an image when its path ends in a known image extension;
    // a trailing ?query and/or #fragment is ignored for this check.
    const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)([?#].*)?$/i;
    const hrefIsImage = imageExtensions.test(href);

    // Determine which URL to use:
    // - If href is an image URL (WordPress "click for full-size" pattern), use the href
    // - Otherwise, use the original image src
    const imageUrl = hrefIsImage ? href : imgSrc;

    // Derive alt text: use the image alt if not empty, otherwise the last path
    // segment of the chosen URL with query string and fragment removed.
    let altText = imgAlt.trim();
    if (!altText) {
      const [urlPath = ''] = imageUrl.split(/[?#]/, 1);
      altText = urlPath.split('/').pop() ?? '';
    }

    // Backslash-escape characters that would otherwise terminate the Markdown
    // construct early: brackets inside ![...] and double quotes inside "...".
    const safeAlt = altText.replace(/[\\[\]]/g, '\\$&');
    const safeTitle = imgTitle.replace(/[\\"]/g, '\\$&');

    // Build the markdown image
    if (safeTitle) {
      return `![${safeAlt}](${imageUrl} "${safeTitle}")`;
    }
    return `![${safeAlt}](${imageUrl})`;
  },
});
}
setProjectContext(projectId: string, dataDir?: string): void {
@@ -639,7 +686,13 @@ export class ImportExecutionEngine extends EventEmitter {
*/
private convertToMarkdown(html: string): string {
if (!html || !html.trim()) return '';
let markdown = this.turndown.turndown(html);
// Preprocess: Convert newlines within text to <br> tags to preserve line breaks
// This handles the common case where WordPress exports have line breaks in the XML
// that should be preserved in markdown
const preprocessed = this.preserveLineBreaks(html);
let markdown = this.turndown.turndown(preprocessed);
// Unescape double-bracket macros that TurndownService escaped
// \[\[ becomes [[ and \]\] becomes ]]
markdown = markdown.replace(/\\\[\\\[/g, '[[').replace(/\\\]\\\]/g, ']]');
@@ -650,6 +703,25 @@ export class ImportExecutionEngine extends EventEmitter {
return markdown;
}
/**
 * Preserve line breaks in HTML content by converting newlines in text to `<br>` tags.
 *
 * A "text run" is any stretch of characters that starts at the beginning of the
 * input or directly after a `>` and extends to the next `<` or the end of the
 * input. This covers content before the first tag, after the last tag, and
 * content that has no tags at all.
 *
 * - Text runs that are whitespace-only (formatting between tags) are left untouched.
 * - In every other text run each `\n` or `\r\n` becomes `<br>`.
 * - `<pre>`, `<script>`, `<style>` and `<textarea>` elements are copied through
 *   verbatim, because their newlines are significant and must not be replaced by markup.
 *
 * @param html Raw HTML fragment to preprocess.
 * @returns The HTML with in-text newlines replaced by `<br>`.
 */
private preserveLineBreaks(html: string): string {
  // Alternative 1 swallows a whole raw-text element so its content is never
  // rewritten; alternative 2 is a text run. The lookbehind (rather than a consumed
  // `>`) lets a text run start right after an element matched by alternative 1.
  const pattern = /<(pre|script|style|textarea)\b[^>]*>[\s\S]*?<\/\1\s*>|(?<=^|>)[^<]+/gi;

  return html.replace(pattern, (match: string, rawTextTag: string | undefined) => {
    // Raw-text element: keep exactly as written.
    if (rawTextTag !== undefined) return match;
    // Whitespace-only run: just formatting between tags.
    if (!match.trim()) return match;
    // Real text: turn every line ending (LF or CRLF) into a <br>.
    return match.replace(/\r?\n/g, '<br>');
  });
}
/**
* Transform WordPress shortcodes [shortcode] to [[shortcode]]
*/