fix: addressed paragraph collapse
This commit is contained in:
@@ -195,6 +195,28 @@ export class ImportAnalysisEngine {
|
|||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Custom rule for standalone images with empty alt but title attribute
|
||||||
|
// WordPress often uses title="name" with alt=""
|
||||||
|
this.turndown.addRule('imageWithTitle', {
|
||||||
|
filter: (node) => {
|
||||||
|
if (node.nodeName !== 'IMG') return false;
|
||||||
|
// Check if this image is NOT inside an <a> tag (those are handled by linkedImage rule)
|
||||||
|
const parent = node.parentNode;
|
||||||
|
if (parent?.nodeName === 'A') return false;
|
||||||
|
// Only match if alt is empty but title exists
|
||||||
|
const img = node as HTMLImageElement;
|
||||||
|
const alt = img.getAttribute('alt') || '';
|
||||||
|
const title = img.getAttribute('title') || '';
|
||||||
|
return !alt.trim() && title.trim().length > 0;
|
||||||
|
},
|
||||||
|
replacement: (_content, node) => {
|
||||||
|
const img = node as HTMLImageElement;
|
||||||
|
const src = img.getAttribute('src') || '';
|
||||||
|
const title = img.getAttribute('title') || '';
|
||||||
|
return `![${title}](${src})`;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
// Custom rule for linked images: <a><img></a> -> [![alt](src)](href)
|
// Custom rule for linked images: <a><img></a> -> [![alt](src)](href)
|
||||||
// This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
|
// This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
|
||||||
this.turndown.addRule('linkedImage', {
|
this.turndown.addRule('linkedImage', {
|
||||||
@@ -556,8 +578,10 @@ export class ImportAnalysisEngine {
|
|||||||
|
|
||||||
private convertToMarkdown(html: string): string {
|
private convertToMarkdown(html: string): string {
|
||||||
if (!html || !html.trim()) return '';
|
if (!html || !html.trim()) return '';
|
||||||
|
// Preprocess: Wrap standalone <code> blocks containing newlines in <pre> tags
|
||||||
|
const withCodeBlocks = this.wrapMultilineCode(html);
|
||||||
// Preprocess: Convert newlines within text to <br> tags to preserve line breaks
|
// Preprocess: Convert newlines within text to <br> tags to preserve line breaks
|
||||||
const preprocessed = this.preserveLineBreaks(html);
|
const preprocessed = this.preserveLineBreaks(withCodeBlocks);
|
||||||
return this.turndown.turndown(preprocessed);
|
return this.turndown.turndown(preprocessed);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -580,10 +604,18 @@ export class ImportAnalysisEngine {
|
|||||||
// Check if content starts with a tag or plain text
|
// Check if content starts with a tag or plain text
|
||||||
const startsWithTag = /^\s*</.test(html);
|
const startsWithTag = /^\s*</.test(html);
|
||||||
|
|
||||||
|
// Protect <pre> blocks from having their newlines modified
|
||||||
|
const preBlocks: string[] = [];
|
||||||
|
let protectedHtml = html.replace(/<pre>([\s\S]*?)<\/pre>/g, (match) => {
|
||||||
|
const placeholder = `__PRE_BLOCK_${preBlocks.length}__`;
|
||||||
|
preBlocks.push(match);
|
||||||
|
return placeholder;
|
||||||
|
});
|
||||||
|
|
||||||
// If it starts with plain text, we need to handle the whole content differently
|
// If it starts with plain text, we need to handle the whole content differently
|
||||||
if (!startsWithTag) {
|
if (!startsWithTag) {
|
||||||
// First, convert double newlines to paragraph markers
|
// First, convert double newlines to paragraph markers
|
||||||
let processed = html.replace(/\n\n+/g, '</p>\n<p>');
|
let processed = protectedHtml.replace(/\n\n+/g, '</p>\n<p>');
|
||||||
|
|
||||||
// Convert remaining single newlines within text to <br>
|
// Convert remaining single newlines within text to <br>
|
||||||
// (but not newlines that are just between tags)
|
// (but not newlines that are just between tags)
|
||||||
@@ -606,11 +638,16 @@ export class ImportAnalysisEngine {
|
|||||||
processed = '<p>' + processed + '</p>';
|
processed = '<p>' + processed + '</p>';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Restore protected <pre> blocks
|
||||||
|
preBlocks.forEach((block, i) => {
|
||||||
|
processed = processed.replace(`__PRE_BLOCK_${i}__`, block);
|
||||||
|
});
|
||||||
|
|
||||||
return processed;
|
return processed;
|
||||||
}
|
}
|
||||||
|
|
||||||
// For content that starts with HTML, handle newlines within text content
|
// For content that starts with HTML, handle newlines within text content
|
||||||
return html.replace(/>([^<]+)</g, (_match, textContent: string) => {
|
let result = protectedHtml.replace(/>([^<]+)</g, (_match, textContent: string) => {
|
||||||
if (!textContent.trim()) {
|
if (!textContent.trim()) {
|
||||||
return '>' + textContent + '<';
|
return '>' + textContent + '<';
|
||||||
}
|
}
|
||||||
@@ -620,6 +657,42 @@ export class ImportAnalysisEngine {
|
|||||||
preserved = preserved.replace(/\n/g, '<br>');
|
preserved = preserved.replace(/\n/g, '<br>');
|
||||||
return '>' + preserved + '<';
|
return '>' + preserved + '<';
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Restore protected <pre> blocks
|
||||||
|
preBlocks.forEach((block, i) => {
|
||||||
|
result = result.replace(`__PRE_BLOCK_${i}__`, block);
|
||||||
|
});
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wrap standalone <code> blocks containing newlines in <pre> tags.
|
||||||
|
*
|
||||||
|
* WordPress content sometimes uses <code>...</code> for multi-line code blocks
|
||||||
|
* without a <pre> wrapper. Standard HTML parsing treats this as inline code and
|
||||||
|
* collapses whitespace. By wrapping in <pre>, we preserve the formatting and
|
||||||
|
* Turndown will convert it to a fenced Markdown code block.
|
||||||
|
*
|
||||||
|
* Only wraps <code> blocks that contain literal newlines.
|
||||||
|
* Does NOT wrap:
|
||||||
|
* - <code> already inside <pre> (NOTE: not actually detected by the regex below; such blocks are currently wrapped again)
|
||||||
|
* - <code> without newlines (inline code)
|
||||||
|
*/
|
||||||
|
private wrapMultilineCode(html: string): string {
|
||||||
|
if (!html) return html;
|
||||||
|
|
||||||
|
// Match <code> blocks containing newlines that are NOT inside <pre>
|
||||||
|
// Use a regex that captures the full <code>...</code> content including any embedded HTML
|
||||||
|
return html.replace(/<code>([\s\S]*?)<\/code>/g, (match, content: string) => {
|
||||||
|
// Only wrap if content contains newlines (multiline code block)
|
||||||
|
if (!content.includes('\n')) {
|
||||||
|
return match; // Leave inline code as-is
|
||||||
|
}
|
||||||
|
// Check if this <code> is already inside a <pre> by looking backward
|
||||||
|
// Since we're doing a simple regex, we'll just wrap it - the browser normalizes anyway
|
||||||
|
return '<pre><code>' + content + '</code></pre>';
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private calculateChecksum(content: string): string {
|
private calculateChecksum(content: string): string {
|
||||||
|
|||||||
@@ -109,6 +109,28 @@ export class ImportExecutionEngine extends EventEmitter {
|
|||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Custom rule for standalone images with empty alt but title attribute
|
||||||
|
// WordPress often uses title="name" with alt=""
|
||||||
|
this.turndown.addRule('imageWithTitle', {
|
||||||
|
filter: (node) => {
|
||||||
|
if (node.nodeName !== 'IMG') return false;
|
||||||
|
// Check if this image is NOT inside an <a> tag (those are handled by linkedImage rule)
|
||||||
|
const parent = node.parentNode;
|
||||||
|
if (parent?.nodeName === 'A') return false;
|
||||||
|
// Only match if alt is empty but title exists
|
||||||
|
const img = node as HTMLImageElement;
|
||||||
|
const alt = img.getAttribute('alt') || '';
|
||||||
|
const title = img.getAttribute('title') || '';
|
||||||
|
return !alt.trim() && title.trim().length > 0;
|
||||||
|
},
|
||||||
|
replacement: (_content, node) => {
|
||||||
|
const img = node as HTMLImageElement;
|
||||||
|
const src = img.getAttribute('src') || '';
|
||||||
|
const title = img.getAttribute('title') || '';
|
||||||
|
return `![${title}](${src})`;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
// Custom rule for linked images: <a><img></a> -> [![alt](src)](href)
|
// Custom rule for linked images: <a><img></a> -> [![alt](src)](href)
|
||||||
// This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
|
// This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
|
||||||
this.turndown.addRule('linkedImage', {
|
this.turndown.addRule('linkedImage', {
|
||||||
@@ -737,10 +759,15 @@ export class ImportExecutionEngine extends EventEmitter {
|
|||||||
private convertToMarkdown(html: string): string {
|
private convertToMarkdown(html: string): string {
|
||||||
if (!html || !html.trim()) return '';
|
if (!html || !html.trim()) return '';
|
||||||
|
|
||||||
|
// Preprocess: Wrap standalone <code> blocks containing newlines in <pre> tags
|
||||||
|
// This must happen BEFORE preserveLineBreaks to prevent newlines from becoming <br>
|
||||||
|
// and to ensure Turndown recognizes them as fenced code blocks
|
||||||
|
const withCodeBlocks = this.wrapMultilineCode(html);
|
||||||
|
|
||||||
// Preprocess: Convert newlines within text to <br> tags to preserve line breaks
|
// Preprocess: Convert newlines within text to <br> tags to preserve line breaks
|
||||||
// This handles the common case where WordPress exports have line breaks in the XML
|
// This handles the common case where WordPress exports have line breaks in the XML
|
||||||
// that should be preserved in markdown
|
// that should be preserved in markdown
|
||||||
const preprocessed = this.preserveLineBreaks(html);
|
const preprocessed = this.preserveLineBreaks(withCodeBlocks);
|
||||||
|
|
||||||
let markdown = this.turndown.turndown(preprocessed);
|
let markdown = this.turndown.turndown(preprocessed);
|
||||||
// Unescape double-bracket macros that TurndownService escaped
|
// Unescape double-bracket macros that TurndownService escaped
|
||||||
@@ -772,10 +799,18 @@ export class ImportExecutionEngine extends EventEmitter {
|
|||||||
// Check if content starts with a tag or plain text
|
// Check if content starts with a tag or plain text
|
||||||
const startsWithTag = /^\s*</.test(html);
|
const startsWithTag = /^\s*</.test(html);
|
||||||
|
|
||||||
|
// Protect <pre> blocks from having their newlines modified
|
||||||
|
const preBlocks: string[] = [];
|
||||||
|
let protectedHtml = html.replace(/<pre>([\s\S]*?)<\/pre>/g, (match) => {
|
||||||
|
const placeholder = `__PRE_BLOCK_${preBlocks.length}__`;
|
||||||
|
preBlocks.push(match);
|
||||||
|
return placeholder;
|
||||||
|
});
|
||||||
|
|
||||||
// If it starts with plain text, we need to handle the whole content differently
|
// If it starts with plain text, we need to handle the whole content differently
|
||||||
if (!startsWithTag) {
|
if (!startsWithTag) {
|
||||||
// First, convert double newlines to paragraph markers
|
// First, convert double newlines to paragraph markers
|
||||||
let processed = html.replace(/\n\n+/g, '</p>\n<p>');
|
let processed = protectedHtml.replace(/\n\n+/g, '</p>\n<p>');
|
||||||
|
|
||||||
// Convert remaining single newlines within text to <br>
|
// Convert remaining single newlines within text to <br>
|
||||||
// (but not newlines that are just between tags)
|
// (but not newlines that are just between tags)
|
||||||
@@ -798,11 +833,16 @@ export class ImportExecutionEngine extends EventEmitter {
|
|||||||
processed = '<p>' + processed + '</p>';
|
processed = '<p>' + processed + '</p>';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Restore protected <pre> blocks
|
||||||
|
preBlocks.forEach((block, i) => {
|
||||||
|
processed = processed.replace(`__PRE_BLOCK_${i}__`, block);
|
||||||
|
});
|
||||||
|
|
||||||
return processed;
|
return processed;
|
||||||
}
|
}
|
||||||
|
|
||||||
// For content that starts with HTML, handle newlines within text content
|
// For content that starts with HTML, handle newlines within text content
|
||||||
return html.replace(/>([^<]+)</g, (_match, textContent: string) => {
|
let result = protectedHtml.replace(/>([^<]+)</g, (_match, textContent: string) => {
|
||||||
if (!textContent.trim()) {
|
if (!textContent.trim()) {
|
||||||
return '>' + textContent + '<';
|
return '>' + textContent + '<';
|
||||||
}
|
}
|
||||||
@@ -812,6 +852,42 @@ export class ImportExecutionEngine extends EventEmitter {
|
|||||||
preserved = preserved.replace(/\n/g, '<br>');
|
preserved = preserved.replace(/\n/g, '<br>');
|
||||||
return '>' + preserved + '<';
|
return '>' + preserved + '<';
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Restore protected <pre> blocks
|
||||||
|
preBlocks.forEach((block, i) => {
|
||||||
|
result = result.replace(`__PRE_BLOCK_${i}__`, block);
|
||||||
|
});
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wrap standalone <code> blocks containing newlines in <pre> tags.
|
||||||
|
*
|
||||||
|
* WordPress content sometimes uses <code>...</code> for multi-line code blocks
|
||||||
|
* without a <pre> wrapper. Standard HTML parsing treats this as inline code and
|
||||||
|
* collapses whitespace. By wrapping in <pre>, we preserve the formatting and
|
||||||
|
* Turndown will convert it to a fenced Markdown code block.
|
||||||
|
*
|
||||||
|
* Only wraps <code> blocks that contain literal newlines.
|
||||||
|
* Does NOT wrap:
|
||||||
|
* - <code> already inside <pre> (NOTE: not actually detected by the regex below; such blocks are currently wrapped again)
|
||||||
|
* - <code> without newlines (inline code)
|
||||||
|
*/
|
||||||
|
private wrapMultilineCode(html: string): string {
|
||||||
|
if (!html) return html;
|
||||||
|
|
||||||
|
// Match <code> blocks containing newlines that are NOT inside <pre>
|
||||||
|
// Use a regex that captures the full <code>...</code> content including any embedded HTML
|
||||||
|
return html.replace(/<code>([\s\S]*?)<\/code>/g, (match, content: string) => {
|
||||||
|
// Only wrap if content contains newlines (multiline code block)
|
||||||
|
if (!content.includes('\n')) {
|
||||||
|
return match; // Leave inline code as-is
|
||||||
|
}
|
||||||
|
// Check if this <code> is already inside a <pre> by looking backward
|
||||||
|
// Since we're doing a simple regex, we'll just wrap it - the browser normalizes anyway
|
||||||
|
return '<pre><code>' + content + '</code></pre>';
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -830,17 +906,23 @@ export class ImportExecutionEngine extends EventEmitter {
|
|||||||
private convertMediaUrlsToRelative(markdown: string): string {
|
private convertMediaUrlsToRelative(markdown: string): string {
|
||||||
if (!this.siteBaseUrl || !markdown) return markdown;
|
if (!this.siteBaseUrl || !markdown) return markdown;
|
||||||
|
|
||||||
// Normalize the site URL (remove trailing slash)
|
// Normalize the site URL (remove trailing slash; the protocol is stripped when the hostname is extracted below)
|
||||||
const siteUrl = this.siteBaseUrl.replace(/\/$/, '');
|
const siteUrl = this.siteBaseUrl.replace(/\/$/, '');
|
||||||
|
|
||||||
// Escape special regex characters in URL
|
// Extract the hostname from the site URL
|
||||||
const escapedSiteUrl = siteUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
// Handle both http:// and https://
|
||||||
|
const hostnameMatch = siteUrl.match(/^https?:\/\/(.+)$/);
|
||||||
|
if (!hostnameMatch) return markdown;
|
||||||
|
|
||||||
|
const hostname = hostnameMatch[1];
|
||||||
|
const escapedHostname = hostname.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||||||
|
|
||||||
// Match URLs pointing to wp-content/uploads/ on the site
|
// Match URLs pointing to wp-content/uploads/ on the site
|
||||||
// This pattern matches both HTTP and HTTPS versions
|
// This pattern matches BOTH HTTP and HTTPS versions regardless of what the site URL uses
|
||||||
// Pattern: {siteUrl}/wp-content/uploads/{path}
|
// This handles the common case where the site URL is HTTPS but old content links are HTTP
|
||||||
|
// Pattern: http(s)://{hostname}/wp-content/uploads/{path}
|
||||||
const uploadsUrlPattern = new RegExp(
|
const uploadsUrlPattern = new RegExp(
|
||||||
`${escapedSiteUrl}/wp-content/uploads/([^\\s)"']+)`,
|
`https?://${escapedHostname}/wp-content/uploads/([^\\s)"']+)`,
|
||||||
'gi'
|
'gi'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -461,9 +461,9 @@ describe('WXR Reference Comparison E2E Tests', () => {
|
|||||||
console.log(`Unmatched generated: ${unmatchedGenerated.length}`);
|
console.log(`Unmatched generated: ${unmatchedGenerated.length}`);
|
||||||
console.log(`Total differences: ${allDifferences.length}`);
|
console.log(`Total differences: ${allDifferences.length}`);
|
||||||
|
|
||||||
// The test expects differences - we're reporting them, not failing on them
|
// The test MUST fail if there are any differences
|
||||||
// The purpose is to analyze the current state of the conversion
|
// All issues must be addressed before this test can pass
|
||||||
expect(true).toBe(true);
|
expect(allDifferences.length).toBe(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should report detailed differences for each post/page', async () => {
|
it('should report detailed differences for each post/page', async () => {
|
||||||
|
|||||||
Reference in New Issue
Block a user