fix: handling linked images better
This commit is contained in:
@@ -171,6 +171,53 @@ export class ImportAnalysisEngine {
|
||||
bulletListMarker: '-',
|
||||
});
|
||||
|
||||
// Custom rule for linked images: <a><img></a> -> ![alt](url)
// This handles the common WordPress pattern of wrapping thumbnails in links to full-size images.
// The wrapping link is dropped; only a plain Markdown image is emitted.
this.turndown.addRule('linkedImage', {
  filter: (node) => {
    // Match <a> tags whose only meaningful child is an <img>.
    if (node.nodeName !== 'A') return false;
    const children = Array.from(node.childNodes).filter(
      // nodeType 3 is a text node; whitespace-only text around the image is ignored
      child => !(child.nodeType === 3 && !child.textContent?.trim())
    );
    return children.length === 1 && children[0].nodeName === 'IMG';
  },
  replacement: (_content, node) => {
    const anchor = node as HTMLAnchorElement;
    const img = anchor.querySelector('img');
    if (!img) return '';

    const href = anchor.getAttribute('href') ?? '';
    const imgSrc = img.getAttribute('src') ?? '';
    const imgAlt = img.getAttribute('alt') ?? '';
    const imgTitle = img.getAttribute('title') ?? '';

    // Check if the link href points to an image (common WordPress pattern for "click for larger")
    const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;
    const hrefIsImage = imageExtensions.test(href);

    // Determine which URL to use:
    // - If href is an image URL (WordPress "click for full-size" pattern), use the href
    // - Otherwise, use the original image src
    const imageUrl = hrefIsImage ? href : imgSrc;

    // Nothing to point at: emit nothing rather than a broken "![]()" image.
    if (!imageUrl) return '';

    // Derive alt text: use image alt if not empty, otherwise extract filename from the URL
    let altText = imgAlt.trim();
    if (!altText) {
      const urlPath = imageUrl.split('?')[0]; // Remove query string
      altText = urlPath.split('/').pop() ?? '';
    }

    // Build the markdown image, carrying the title over when the <img> has one
    if (imgTitle) {
      return `![${altText}](${imageUrl} "${imgTitle}")`;
    }
    return `![${altText}](${imageUrl})`;
  },
});
|
||||
|
||||
// Load macro definitions from shared config
|
||||
this.loadMacroConfigsFromShared();
|
||||
}
|
||||
@@ -482,7 +529,28 @@ export class ImportAnalysisEngine {
|
||||
|
||||
/**
 * Convert an HTML fragment to Markdown.
 *
 * @param html - HTML source to convert.
 * @returns The Markdown produced by Turndown, or '' when the input is empty
 *          or whitespace-only.
 */
private convertToMarkdown(html: string): string {
  if (!html || !html.trim()) return '';

  // Preprocess: convert newlines within text to <br> tags to preserve line breaks.
  // This must run before Turndown; an earlier direct `turndown(html)` return
  // made this step unreachable.
  const preprocessed = this.preserveLineBreaks(html);
  return this.turndown.turndown(preprocessed);
}
|
||||
|
||||
/**
 * Preserve line breaks in HTML content by converting \n to <br> tags.
 *
 * Only newlines inside a text run that sits between two tags (after a `>` and
 * before a `<`) and contains non-whitespace characters are converted.
 * Whitespace-only runs (formatting between tags) are left untouched.
 *
 * Complete `<pre>…</pre>` blocks are passed through unchanged: their newlines
 * are already significant, and injecting <br> elements into `<pre><code>`
 * would drop the line breaks from fenced code blocks, which are built from
 * the element's text content.
 *
 * @param html - Raw HTML fragment.
 * @returns The HTML with in-text newlines replaced by `<br>`.
 */
private preserveLineBreaks(html: string): string {
  // Alternative 1: a whole <pre> block (no capture group -> returned verbatim).
  // Alternative 2: a text run delimited by '>' and '<'. Lookarounds keep the
  // delimiters out of the match so a text run directly before or after a
  // <pre> block is still seen.
  const preBlockOrTextRun = /<pre(?:\s[^>]*)?>[\s\S]*?<\/pre\s*>|(?<=>)([^<]+)(?=<)/gi;

  return html.replace(preBlockOrTextRun, (match: string, textContent: string | undefined) => {
    // <pre> block: keep byte-for-byte
    if (textContent === undefined) {
      return match;
    }
    // Skip if the text content is only whitespace (just formatting between tags)
    if (!textContent.trim()) {
      return textContent;
    }
    // Replace all newlines with <br> (the text has actual content)
    return textContent.replace(/\n/g, '<br>');
  });
}
|
||||
|
||||
private calculateChecksum(content: string): string {
|
||||
|
||||
@@ -80,6 +80,53 @@ export class ImportExecutionEngine extends EventEmitter {
|
||||
codeBlockStyle: 'fenced',
|
||||
bulletListMarker: '-',
|
||||
});
|
||||
|
||||
// Custom rule for linked images: <a><img></a> -> ![alt](url)
// This handles the common WordPress pattern of wrapping thumbnails in links to full-size images.
// The wrapping link is dropped; only a plain Markdown image is emitted.
this.turndown.addRule('linkedImage', {
  filter: (node) => {
    // Match <a> tags whose only meaningful child is an <img>.
    if (node.nodeName !== 'A') return false;
    const children = Array.from(node.childNodes).filter(
      // nodeType 3 is a text node; whitespace-only text around the image is ignored
      child => !(child.nodeType === 3 && !child.textContent?.trim())
    );
    return children.length === 1 && children[0].nodeName === 'IMG';
  },
  replacement: (_content, node) => {
    const anchor = node as HTMLAnchorElement;
    const img = anchor.querySelector('img');
    if (!img) return '';

    const href = anchor.getAttribute('href') ?? '';
    const imgSrc = img.getAttribute('src') ?? '';
    const imgAlt = img.getAttribute('alt') ?? '';
    const imgTitle = img.getAttribute('title') ?? '';

    // Check if the link href points to an image (common WordPress pattern for "click for larger")
    const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;
    const hrefIsImage = imageExtensions.test(href);

    // Determine which URL to use:
    // - If href is an image URL (WordPress "click for full-size" pattern), use the href
    // - Otherwise, use the original image src
    const imageUrl = hrefIsImage ? href : imgSrc;

    // Nothing to point at: emit nothing rather than a broken "![]()" image.
    if (!imageUrl) return '';

    // Derive alt text: use image alt if not empty, otherwise extract filename from the URL
    let altText = imgAlt.trim();
    if (!altText) {
      const urlPath = imageUrl.split('?')[0]; // Remove query string
      altText = urlPath.split('/').pop() ?? '';
    }

    // Build the markdown image, carrying the title over when the <img> has one
    if (imgTitle) {
      return `![${altText}](${imageUrl} "${imgTitle}")`;
    }
    return `![${altText}](${imageUrl})`;
  },
});
|
||||
}
|
||||
|
||||
setProjectContext(projectId: string, dataDir?: string): void {
|
||||
@@ -639,7 +686,13 @@ export class ImportExecutionEngine extends EventEmitter {
|
||||
*/
|
||||
private convertToMarkdown(html: string): string {
|
||||
if (!html || !html.trim()) return '';
|
||||
let markdown = this.turndown.turndown(html);
|
||||
|
||||
// Preprocess: Convert newlines within text to <br> tags to preserve line breaks
|
||||
// This handles the common case where WordPress exports have line breaks in the XML
|
||||
// that should be preserved in markdown
|
||||
const preprocessed = this.preserveLineBreaks(html);
|
||||
|
||||
let markdown = this.turndown.turndown(preprocessed);
|
||||
// Unescape double-bracket macros that TurndownService escaped
|
||||
// \[\[ becomes [[ and \]\] becomes ]]
|
||||
markdown = markdown.replace(/\\\[\\\[/g, '[[').replace(/\\\]\\\]/g, ']]');
|
||||
@@ -650,6 +703,25 @@ export class ImportExecutionEngine extends EventEmitter {
|
||||
return markdown;
|
||||
}
|
||||
|
||||
/**
 * Preserve line breaks in HTML content by converting \n to <br> tags.
 *
 * Only newlines inside a text run that sits between two tags (after a `>` and
 * before a `<`) and contains non-whitespace characters are converted.
 * Whitespace-only runs (formatting between tags) are left untouched.
 *
 * Complete `<pre>…</pre>` blocks are passed through unchanged: their newlines
 * are already significant, and injecting <br> elements into `<pre><code>`
 * would drop the line breaks from fenced code blocks, which are built from
 * the element's text content.
 *
 * @param html - Raw HTML fragment.
 * @returns The HTML with in-text newlines replaced by `<br>`.
 */
private preserveLineBreaks(html: string): string {
  // Alternative 1: a whole <pre> block (no capture group -> returned verbatim).
  // Alternative 2: a text run delimited by '>' and '<'. Lookarounds keep the
  // delimiters out of the match so a text run directly before or after a
  // <pre> block is still seen.
  const preBlockOrTextRun = /<pre(?:\s[^>]*)?>[\s\S]*?<\/pre\s*>|(?<=>)([^<]+)(?=<)/gi;

  return html.replace(preBlockOrTextRun, (match: string, textContent: string | undefined) => {
    // <pre> block: keep byte-for-byte
    if (textContent === undefined) {
      return match;
    }
    // Skip if the text content is only whitespace (just formatting between tags)
    if (!textContent.trim()) {
      return textContent;
    }
    // Replace all newlines with <br> (the text has actual content)
    return textContent.replace(/\n/g, '<br>');
  });
}
|
||||
|
||||
/**
|
||||
* Transform WordPress shortcodes [shortcode] to [[shortcode]]
|
||||
*/
|
||||
|
||||
@@ -16,7 +16,8 @@
|
||||
- Post ID 104: Links and images
|
||||
- Post ID 105: Code blocks (inline and fenced)
|
||||
- Post ID 106: Blockquotes
|
||||
- Post ID 107: Tables
|
||||
- Post ID 107: Linked images with empty/missing alt
|
||||
- Post ID 108: Line breaks preservation
|
||||
|
||||
2. WORDPRESS SHORTCODE/MACRO CONVERSION
|
||||
- Post ID 201: [gallery] shortcode → [[gallery]] macro
|
||||
@@ -297,6 +298,59 @@ with multiple lines</pre>]]></content:encoded>
|
||||
<wp:post_parent>0</wp:post_parent>
|
||||
</item>
|
||||
|
||||
<!-- Post 107: Linked Images with empty/missing alt -->
|
||||
<item>
|
||||
<title>HTML Formatting Test: Linked Images</title>
|
||||
<link>https://testblog.example.com/html-formatting-linked-images/</link>
|
||||
<pubDate>Sun, 07 Jan 2024 10:00:00 +0000</pubDate>
|
||||
<dc:creator><![CDATA[testauthor]]></dc:creator>
|
||||
<category domain="category" nicename="technology"><![CDATA[Technology]]></category>
|
||||
<content:encoded><![CDATA[<p>Here is an image inside a link with empty alt (common WordPress pattern):</p>
|
||||
<a href="http://example.com/wp-content/uploads/2020/03/full-size.png"><img class="size-medium wp-image-7801 aligncenter" src="http://example.com/wp-content/uploads/2020/03/thumbnail.png" alt="" width="300" height="223" /></a>
|
||||
<p>Another linked image with no alt attribute at all:</p>
|
||||
<a href="http://example.com/gallery/photo.jpg"><img src="http://example.com/gallery/photo-thumb.jpg" /></a>
|
||||
<p>Linked image where link and image src are the same:</p>
|
||||
<a href="http://example.com/photo.jpg"><img src="http://example.com/photo.jpg" alt="" /></a>
|
||||
<p>For comparison, an image with proper alt inside a link should preserve the alt:</p>
|
||||
<a href="http://example.com/about"><img src="http://example.com/logo.png" alt="Company Logo" /></a>]]></content:encoded>
|
||||
<excerpt:encoded><![CDATA[Testing linked images conversion]]></excerpt:encoded>
|
||||
<wp:post_id>107</wp:post_id>
|
||||
<wp:post_date>2024-01-07 10:00:00</wp:post_date>
|
||||
<wp:post_date_gmt>2024-01-07 10:00:00</wp:post_date_gmt>
|
||||
<wp:post_modified>2024-01-07 10:00:00</wp:post_modified>
|
||||
<wp:post_modified_gmt>2024-01-07 10:00:00</wp:post_modified_gmt>
|
||||
<wp:post_name>html-formatting-linked-images</wp:post_name>
|
||||
<wp:status>publish</wp:status>
|
||||
<wp:post_type>post</wp:post_type>
|
||||
<wp:post_parent>0</wp:post_parent>
|
||||
</item>
|
||||
|
||||
<!-- Post 108: Line Breaks Preservation -->
|
||||
<item>
|
||||
<title>HTML Formatting Test: Line Breaks</title>
|
||||
<link>https://testblog.example.com/html-formatting-line-breaks/</link>
|
||||
<pubDate>Mon, 08 Jan 2024 10:00:00 +0000</pubDate>
|
||||
<dc:creator><![CDATA[testauthor]]></dc:creator>
|
||||
<category domain="category" nicename="technology"><![CDATA[Technology]]></category>
|
||||
<content:encoded><![CDATA[<p>This paragraph has line breaks
|
||||
inside the text that should
|
||||
be preserved in markdown.</p>
|
||||
<p>Here is another paragraph
|
||||
with different content
|
||||
on multiple lines.</p>
|
||||
<p>Single line paragraph for comparison.</p>]]></content:encoded>
|
||||
<excerpt:encoded><![CDATA[Testing line break preservation]]></excerpt:encoded>
|
||||
<wp:post_id>108</wp:post_id>
|
||||
<wp:post_date>2024-01-08 10:00:00</wp:post_date>
|
||||
<wp:post_date_gmt>2024-01-08 10:00:00</wp:post_date_gmt>
|
||||
<wp:post_modified>2024-01-08 10:00:00</wp:post_modified>
|
||||
<wp:post_modified_gmt>2024-01-08 10:00:00</wp:post_modified_gmt>
|
||||
<wp:post_name>html-formatting-line-breaks</wp:post_name>
|
||||
<wp:status>publish</wp:status>
|
||||
<wp:post_type>post</wp:post_type>
|
||||
<wp:post_parent>0</wp:post_parent>
|
||||
</item>
|
||||
|
||||
<!-- ======================================== -->
|
||||
<!-- SECTION 2: SHORTCODE/MACRO CONVERSION -->
|
||||
<!-- ======================================== -->
|
||||
|
||||
@@ -353,8 +353,9 @@ describe('ImportExecutionEngine E2E Tests', () => {
|
||||
expect(content).toContain('');
|
||||
expect(content).toContain(';
|
||||
|
||||
// Verify linked image
|
||||
expect(content).toContain('[](https://example.com)');
|
||||
// Verify linked image - should become a plain image (link is unwrapped)
|
||||
// The link href is not an image URL, so the image src is used
|
||||
expect(content).toContain('');
|
||||
});
|
||||
|
||||
it('should convert code blocks (inline and fenced)', async () => {
|
||||
@@ -404,6 +405,63 @@ describe('ImportExecutionEngine E2E Tests', () => {
|
||||
expect(content).toContain('> Outer quote');
|
||||
expect(content).toContain('> > Inner quote');
|
||||
});
|
||||
|
||||
it('should convert linked images with empty alt to plain images with derived alt', async () => {
  // Post 107: Linked Images with empty/missing alt
  const post = wxrData.posts.find(p => p.wpId === 107);
  expect(post).toBeDefined();

  const report = createSinglePostReport(post!);
  await engine.executeImport(report, {});

  const writtenFile = writtenFiles.find(f => f.path.includes('html-formatting-linked-images'));
  expect(writtenFile).toBeDefined();

  const content = writtenFile!.content;

  // Linked image with empty alt should become a plain image with filename-derived alt.
  // The link target is the full-size image, so that URL is used as the image source.
  expect(content).toContain(
    '![full-size.png](http://example.com/wp-content/uploads/2020/03/full-size.png)'
  );

  // Linked image with no alt attribute (link and image different): link target wins
  expect(content).toContain('![photo.jpg](http://example.com/gallery/photo.jpg)');

  // Linked image where link and src are the same
  expect(content).toContain('![photo.jpg](http://example.com/photo.jpg)');

  // Image with proper alt inside a non-image link keeps its alt text and its own src
  expect(content).toContain('![Company Logo](http://example.com/logo.png)');

  // Should NOT have empty image alt text (the broken pattern we're fixing)
  expect(content).not.toMatch(/!\[\]\([^)]+\)/);
});
|
||||
|
||||
it('should preserve line breaks in paragraph text', async () => {
  // Fixture post 108 holds paragraphs whose text is spread over several source lines.
  const lineBreakPost = wxrData.posts.find(p => p.wpId === 108);
  expect(lineBreakPost).toBeDefined();

  await engine.executeImport(createSinglePostReport(lineBreakPost!), {});

  const output = writtenFiles.find(f => f.path.includes('html-formatting-line-breaks'));
  expect(output).toBeDefined();

  const markdown = output!.content;

  // Each pattern checks that two consecutive source lines still end up on
  // separate lines in the Markdown (trailing spaces before the newline are
  // tolerated, so both hard-break styles pass).
  const expectedBreaks = [
    // first paragraph
    /has line breaks\s*\n.*inside the text/,
    /inside the text that should\s*\n.*be preserved/,
    // second paragraph
    /another paragraph\s*\n.*with different content/,
  ];
  for (const pattern of expectedBreaks) {
    expect(markdown).toMatch(pattern);
  }

  // A paragraph without internal newlines must come through unchanged.
  expect(markdown).toContain('Single line paragraph for comparison.');
});
|
||||
});
|
||||
|
||||
// ==========================================================================
|
||||
|
||||
Reference in New Issue
Block a user