/** * ImportExecutionEngine - Executes WXR import based on analysis results * * Handles the 4-phase import process: * 1. Create new tags/categories * 2. Import posts (handling conflicts correctly) * 3. Import media (with post linkage) * 4. Import pages (as posts with "page" category) */ import { EventEmitter } from 'events'; import { v4 as uuidv4 } from 'uuid'; import * as fs from 'fs/promises'; import * as path from 'path'; import * as crypto from 'crypto'; import matter from 'gray-matter'; import { app } from 'electron'; import TurndownService from 'turndown'; import { getDatabase } from '../database'; import { posts, media, NewPost, NewMedia } from '../database/schema'; import { eq } from 'drizzle-orm'; import type { TagEngine } from './TagEngine'; import type { PostEngine, PostData } from './PostEngine'; import type { MediaEngine, MediaData } from './MediaEngine'; import type { PostMediaEngine } from './PostMediaEngine'; import type { ImportAnalysisReport, AnalyzedPost, AnalyzedMedia, AnalyzedCategory, AnalyzedTag, ImportConflictResolution, } from './ImportAnalysisEngine'; import type { WxrPost, WxrMedia } from './WxrParser'; export interface ImportExecutionOptions { /** Path to the WordPress uploads folder for media files */ uploadsFolder?: string; /** Default author to use when WXR post/media has no author */ defaultAuthor?: string; /** Progress callback */ onProgress?: (phase: string, current: number, total: number, detail?: string) => void; } export interface ImportExecutionResult { success: boolean; tags: { created: number; skipped: number; }; posts: { imported: number; skipped: number; errors: number; }; media: { imported: number; skipped: number; errors: number; }; pages: { imported: number; skipped: number; errors: number; }; /** Mapping from WordPress post ID to our post GUID */ wpIdToPostId: Map; errors: string[]; } // Regex to match WordPress shortcodes: [macroname ...] but NOT [[macroname ...]] const WP_SHORTCODE_REGEX = /(? 
{ content = content .replace(/^\n+/, '') // Remove leading newlines .replace(/\n+$/, '\n') // Replace trailing newlines with single newline .replace(/\n/gm, '\n '); // Indent subsequent lines with 2 spaces const parent = node.parentNode as HTMLElement; const isOrdered = parent?.nodeName === 'OL'; let prefix = options.bulletListMarker + ' '; if (isOrdered) { const start = parent.getAttribute('start'); const index = Array.prototype.indexOf.call(parent.children, node); const startNum = start ? parseInt(start, 10) : 1; prefix = (startNum + index) + '. '; } return prefix + content + (node.nextSibling && !/\n$/.test(content) ? '\n' : ''); }, }); // Custom rule for standalone images with empty alt but title attribute // WordPress often uses title="name" with alt="" this.turndown.addRule('imageWithTitle', { filter: (node) => { if (node.nodeName !== 'IMG') return false; // Check if this image is NOT inside an tag (those are handled by linkedImage rule) const parent = node.parentNode; if (parent?.nodeName === 'A') return false; // Only match if alt is empty but title exists const img = node as HTMLImageElement; const alt = img.getAttribute('alt') || ''; const title = img.getAttribute('title') || ''; return !alt.trim() && title.trim().length > 0; }, replacement: (_content, node) => { const img = node as HTMLImageElement; const src = img.getAttribute('src') || ''; const title = img.getAttribute('title') || ''; return `![${title}](${src})`; }, }); // Custom rule for linked images: -> ![alt](src) // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images this.turndown.addRule('linkedImage', { filter: (node) => { // Match tags that contain only an (possibly with whitespace) if (node.nodeName !== 'A') return false; const children = Array.from(node.childNodes).filter( child => !(child.nodeType === 3 && !child.textContent?.trim()) ); return children.length === 1 && children[0].nodeName === 'IMG'; }, replacement: (_content, node) => { const 
anchor = node as HTMLAnchorElement; const img = anchor.querySelector('img'); if (!img) return ''; const href = anchor.getAttribute('href') || ''; const imgSrc = img.getAttribute('src') || ''; const imgAlt = img.getAttribute('alt') || ''; const imgTitle = img.getAttribute('title') || ''; // Check if the link href points to an image (common WordPress pattern for "click for larger") const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i; const hrefIsImage = imageExtensions.test(href); // Determine which URL to use: // - If href is an image URL (WordPress "click for full-size" pattern), use the href // - Otherwise, use the original image src const imageUrl = hrefIsImage ? href : imgSrc; // Derive alt text: prefer alt, then title, then cleaned filename let altText = imgAlt.trim(); if (!altText) { altText = imgTitle.trim(); } if (!altText) { // Extract filename from the image URL as last resort const urlPath = imageUrl.split('?')[0]; // Remove query string const filename = urlPath.split('/').pop() || ''; // Clean the filename: remove extension and replace underscores with spaces altText = filename.replace(/\.[^.]+$/, '').replace(/_/g, ' '); } // Build the markdown image link (without title attribute) return `![${altText}](${imageUrl})`; }, }); // Custom rule for Flash embeds - replace with placeholder text this.turndown.addRule('flashEmbed', { filter: (node) => { if (node.nodeName !== 'EMBED') return false; const embed = node as HTMLEmbedElement; const type = embed.getAttribute('type') || ''; const src = embed.getAttribute('src') || ''; // Match Flash content by type or file extension return type.toLowerCase().includes('flash') || type.toLowerCase().includes('shockwave') || src.toLowerCase().endsWith('.swf'); }, replacement: () => 'FLASH PLAYER NOT SUPPORTED', }); } setProjectContext(projectId: string, dataDir?: string): void { this.currentProjectId = projectId; this.dataDir = dataDir || null; } getProjectContext(): string { return this.currentProjectId; 
}

  /**
   * Resolve the root directory that holds the active project's data.
   * An explicitly configured data directory takes precedence; otherwise the
   * project lives under Electron's `userData` path at `projects/<projectId>`.
   */
  private getBaseDir(): string {
    return this.dataDir
      ? this.dataDir
      : path.join(app.getPath('userData'), 'projects', this.currentProjectId);
  }

  /** Root folder for post files: `<base>/posts`. */
  private getPostsBaseDir(): string {
    const root = this.getBaseDir();
    return path.join(root, 'posts');
  }

  /** Root folder for media files: `<base>/media`. */
  private getMediaBaseDir(): string {
    const root = this.getBaseDir();
    return path.join(root, 'media');
  }

  /**
   * Get the date-based directory for posts (posts/YYYY/MM/).
   * Year and zero-padded month are taken from the date's local-time fields.
   */
  private getPostsDirForDate(date: Date): string {
    const root = this.getPostsBaseDir();
    const yyyy = String(date.getFullYear());
    const mm = String(date.getMonth() + 1).padStart(2, '0');
    return path.join(root, yyyy, mm);
  }

  /**
   * Get the date-based directory for media (media/YYYY/MM/).
   * Year and zero-padded month are taken from the date's local-time fields.
   */
  private getMediaDirForDate(date: Date): string {
    const root = this.getMediaBaseDir();
    const yyyy = String(date.getFullYear());
    const mm = String(date.getMonth() + 1).padStart(2, '0');
    return path.join(root, yyyy, mm);
  }

  /**
   * Execute the full import process.
   *
   * Runs the four phases strictly in order (tags, posts, media, pages) and
   * reports a zero-progress event before each one. Any exception that escapes
   * a phase is caught here: `success` becomes false and the message is appended
   * to `errors`, so this method always resolves with a result object.
   *
   * @param report  Analysis report describing what the WXR file contains.
   * @param options Import options; `onProgress` is optional.
   * @returns Counters per phase, the WordPress-ID to post-ID map, and errors.
   */
  async executeImport(
    report: ImportAnalysisReport,
    options: ImportExecutionOptions
  ): Promise {
    const summary: ImportExecutionResult = {
      success: true,
      tags: { created: 0, skipped: 0 },
      posts: { imported: 0, skipped: 0, errors: 0 },
      media: { imported: 0, skipped: 0, errors: 0 },
      pages: { imported: 0, skipped: 0, errors: 0 },
      wpIdToPostId: new Map(),
      errors: [],
    };

    // Fall back to a no-op so the phases can report unconditionally.
    const notify = options.onProgress || (() => {});

    // Remember the site URL; media URL conversion reads it later.
    this.siteBaseUrl = report.site.link || null;

    try {
      const tagMapping = this.buildTaxonomyMapping(report.tags);
      const categoryMapping = this.buildTaxonomyMapping(report.categories);

      // Phase 1: tags and categories
      const taxonomyTotal = report.tags.length + report.categories.length;
      notify('tags', 0, taxonomyTotal, 'Creating tags...');
      await this.executePhase1Tags(report, tagMapping, categoryMapping, summary, notify);

      // Phase 2: posts
      notify('posts', 0, report.posts.items.length, 'Importing posts...');
      await this.executePhase2Posts(report, tagMapping, categoryMapping, summary, options, notify);

      // Phase 3: media
      notify('media', 0, report.media.items.length, 'Importing media...');
      await this.executePhase3Media(report, summary, options, notify);

      // Phase 4: pages
      notify('pages', 0, report.pages.items.length, 'Importing pages...');
      await this.executePhase4Pages(report, tagMapping, categoryMapping, summary, options, notify);

      notify('complete', 1, 1, 'Import complete');
    } catch (error) {
      summary.success = false;
      const message = error instanceof Error ? error.message : String(error);
      summary.errors.push(message);
    }

    return summary;
  }

  /**
   * Build a lookup from the lowercased original taxonomy name to the name that
   * should actually be used, plus whether that name still has to be created.
   *
   * - `mappedTo` set: resolve to the lowercased `mappedTo`, nothing to create.
   * - `existsInProject`: resolve to the lowercased name, nothing to create.
   * - otherwise: resolve to the lowercased name and flag it for creation.
   *
   * Later items with the same lowercased name overwrite earlier ones.
   */
  private buildTaxonomyMapping(
    items: Array<{ name: string; existsInProject: boolean; mappedTo?: string }>
  ): Map {
    const mapping = new Map();

    for (const { name, mappedTo, existsInProject } of items) {
      const lookupKey = name.toLowerCase();
      const resolved = mappedTo ? mappedTo.toLowerCase() : lookupKey;
      const needsCreation = !mappedTo && !existsInProject;
      mapping.set(lookupKey, { resolved, needsCreation });
    }

    return mapping;
  }

  /**
   * Phase 1: Create new tags and categories
   */
  private async executePhase1Tags(
    report: ImportAnalysisReport,
    tagMapping: Map,
    categoryMapping: Map,
    result: ImportExecutionResult,
    progress: (phase: string, current: number, total: number, detail?: string) => void
  ): Promise {
    const tagEngine = this.tagEngine;
    tagEngine.setProjectContext(this.currentProjectId);

    let current = 0;
    const total = report.tags.length + report.categories.length;

    // Create new tags
    for (const tag of report.tags) {
      current++;
      const mapping = tagMapping.get(tag.name.toLowerCase());
      if
(mapping?.needsCreation) {
        try {
          await tagEngine.createTag({ name: mapping.resolved });
          result.tags.created++;
          progress('tags', current, total, `Created tag: ${mapping.resolved}`);
        } catch (error) {
          // Tag might already exist (race condition or duplicate in list)
          result.tags.skipped++;
        }
      } else {
        result.tags.skipped++;
      }
    }

    // Create new categories (as tags)
    for (const category of report.categories) {
      current++;
      const mapping = categoryMapping.get(category.name.toLowerCase());
      if (mapping?.needsCreation) {
        try {
          await tagEngine.createTag({ name: mapping.resolved });
          result.tags.created++;
          progress('tags', current, total, `Created category tag: ${mapping.resolved}`);
        } catch (error) {
          result.tags.skipped++;
        }
      } else {
        result.tags.skipped++;
      }
    }
  }

  /**
   * Phase 2: Import posts.
   *
   * Only items whose `postType` is exactly 'post' are processed; every other
   * item in `report.posts.items` is counted as skipped up front. A failure on
   * one post is recorded in `result` and does not stop the loop.
   */
  private async executePhase2Posts(
    report: ImportAnalysisReport,
    tagMapping: Map,
    categoryMapping: Map,
    result: ImportExecutionResult,
    options: ImportExecutionOptions,
    progress: (phase: string, current: number, total: number, detail?: string) => void
  ): Promise {
    const candidates = report.posts.items.filter(({ wxrPost }) => wxrPost.postType === 'post');

    // Everything filtered out above (nav_menu_item, revision, ...) is a skip.
    result.posts.skipped += report.posts.items.length - candidates.length;

    let position = 0;
    for (const entry of candidates) {
      position += 1;
      progress('posts', position, candidates.length, `Processing: ${entry.wxrPost.title}`);

      try {
        const wasImported = await this.importPost(entry, tagMapping, categoryMapping, result, options);
        if (wasImported) {
          result.posts.imported++;
        } else {
          result.posts.skipped++;
        }
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        result.posts.errors++;
        result.errors.push(`Failed to import post "${entry.wxrPost.title}": ${reason}`);
      }
    }
  }

  /**
   * Import a single post according to its analysis status.
   *
   * - 'content-duplicate' and 'update' are skipped.
   * - 'conflict' follows the chosen resolution (defaulting to 'ignore').
   * - anything else is treated as new and created as a published post.
   *
   * @returns true when a post was created or updated, false when skipped.
   */
  private async importPost(
    analyzed: AnalyzedPost,
    tagMapping: Map,
    categoryMapping: Map,
    result: ImportExecutionResult,
    options: ImportExecutionOptions
  ): Promise {
    switch (analyzed.status) {
      case 'content-duplicate':
      case 'update':
        // Same content is already present; nothing to do.
        return false;

      case 'conflict': {
        const resolution = analyzed.conflictResolution || 'ignore';
        if (resolution === 'ignore') {
          return false;
        }
        return await this.importPostWithConflict(analyzed, resolution, tagMapping, categoryMapping, result, options);
      }

      default:
        return await this.createImportedPost(analyzed, tagMapping, categoryMapping, result, options, 'published');
    }
  }

  /**
   * Import a post that has a conflict.
   *
   * - 'overwrite': update the existing post in place; if the analysis carries
   *   no existing post ID, fall back to creating a new draft instead.
   * - 'import': create a published post under a freshly generated unique slug.
   * - any other resolution: skip.
   */
  private async importPostWithConflict(
    analyzed: AnalyzedPost,
    resolution: ImportConflictResolution,
    tagMapping: Map,
    categoryMapping: Map,
    result: ImportExecutionResult,
    options: ImportExecutionOptions
  ): Promise {
    if (resolution === 'overwrite') {
      const existingId = analyzed.existingPost?.id;
      if (!existingId) {
        return await this.createImportedPost(analyzed, tagMapping, categoryMapping, result, options, 'draft');
      }
      return await this.updateExistingPost(analyzed, existingId, tagMapping, categoryMapping, result, options);
    }

    if (resolution === 'import') {
      const freshSlug = await this.postEngine.generateUniqueSlug(analyzed.wxrPost.title);
      return await this.createImportedPost(analyzed, tagMapping, categoryMapping, result, options, 'published', freshSlug);
    }

    return false;
  }

  /**
   * Update an existing post with imported content (for overwrite conflict resolution)
   *
Sets the post to draft status so user can review before publishing.
   *
   * The WXR HTML goes through Vimeo-iframe conversion, shortcode conversion,
   * Markdown conversion and media-URL rewriting (in that order) before the
   * row is updated, the FTS index refreshed, and the WordPress ID mapped to
   * the existing post ID.
   *
   * @returns always true once the update has completed.
   */
  private async updateExistingPost(
    analyzed: AnalyzedPost,
    existingPostId: string,
    tagMapping: Map,
    categoryMapping: Map,
    result: ImportExecutionResult,
    options: ImportExecutionOptions
  ): Promise {
    const source = analyzed.wxrPost;
    const localDb = getDatabase().getLocal();
    const searchIndexer = this.postEngine;

    // Vimeo iframes and shortcodes are rewritten BEFORE the Markdown
    // conversion; the media URL rewrite runs on the resulting Markdown.
    const markdownBody = this.convertMediaUrlsToRelative(
      this.convertToMarkdown(
        this.transformShortcodes(this.convertVimeoIframes(source.content))
      )
    );

    const tagNames = this.resolveTaxonomy(source.tags, tagMapping);
    const categoryNames = this.resolveTaxonomy(source.categories, categoryMapping);
    const checksum = this.calculateChecksum(markdownBody);

    // Overwrite the stored row; the post becomes a draft pending review.
    await localDb
      .update(posts)
      .set({
        title: source.title,
        excerpt: source.excerpt || null,
        content: markdownBody, // Store in DB since it's now a draft
        status: 'draft',
        author: source.creator || options.defaultAuthor || null,
        updatedAt: new Date(),
        publishedAt: null, // a draft has no publication date
        checksum,
        tags: JSON.stringify(tagNames),
        categories: JSON.stringify(categoryNames),
      })
      .where(eq(posts.id, existingPostId));

    // Keep full-text search in step with the new content.
    await searchIndexer.updateFTSIndex({
      id: existingPostId,
      projectId: this.currentProjectId,
      title: source.title,
      content: markdownBody,
      excerpt: source.excerpt || undefined,
      tags: tagNames,
      categories: categoryNames,
    });

    // Map the WordPress ID onto the post that already existed.
    result.wpIdToPostId.set(source.wpId, existingPostId);
    return true;
  }

  /**
   * Create an imported post
   */
  private async createImportedPost(
    analyzed: AnalyzedPost,
    tagMapping: Map,
    categoryMapping: Map,
    result: ImportExecutionResult,
    options: ImportExecutionOptions,
    status: 'draft' | 'published',
    overrideSlug?: string
  ): Promise {
    const wxrPost = analyzed.wxrPost;
    const db = getDatabase().getLocal();

    // Convert Vimeo iframes to [[vimeo]] macros BEFORE markdown conversion
    const contentWithVimeo = this.convertVimeoIframes(wxrPost.content);

    // Transform WordPress shortcodes [shortcode] to [[shortcode]] BEFORE markdown conversion
    // (TurndownService escapes brackets, so we must transform first)
    const contentWithShortcodes = this.transformShortcodes(contentWithVimeo);

    // Convert HTML content to Markdown
    let transformedContent = this.convertToMarkdown(contentWithShortcodes);

    // Convert absolute media URLs from the site to relative paths
    transformedContent = this.convertMediaUrlsToRelative(transformedContent);

    // Resolve tags
    const resolvedTags = this.resolveTaxonomy(wxrPost.tags, tagMapping);

    // Resolve categories
    const resolvedCategories = this.resolveTaxonomy(wxrPost.categories, categoryMapping);

    // Determine dates (dates may be strings after JSON serialization through IPC)
    const createdAt = this.toDate(wxrPost.postDate) || this.toDate(wxrPost.pubDate) || new Date();
    const updatedAt = this.toDate(wxrPost.postModified) || createdAt;
    const publishedAt = status === 'published' ?
(this.toDate(wxrPost.pubDate) || createdAt) : undefined;

    // Generate post ID
    const postId = uuidv4();

    // Build post data
    const postData: PostData = {
      id: postId,
      projectId: this.currentProjectId,
      title: wxrPost.title,
      slug: overrideSlug || wxrPost.slug,
      excerpt: wxrPost.excerpt || undefined,
      content: transformedContent,
      status,
      author: wxrPost.creator || options.defaultAuthor || undefined,
      createdAt,
      updatedAt,
      publishedAt,
      tags: resolvedTags,
      categories: resolvedCategories,
    };

    // Write to filesystem first (for published posts)
    let filePath = '';
    if (status === 'published') {
      filePath = await this.writePostFile(postData);
    }

    // Calculate checksum
    const checksum = this.calculateChecksum(transformedContent);

    // Insert into database
    const dbPost: NewPost = {
      id: postData.id,
      projectId: postData.projectId,
      title: postData.title,
      slug: postData.slug,
      excerpt: postData.excerpt,
      content: status === 'draft' ? postData.content : null, // Draft content in DB, published in file
      status: postData.status,
      author: postData.author,
      createdAt: postData.createdAt,
      updatedAt: postData.updatedAt,
      publishedAt: postData.publishedAt,
      filePath,
      checksum,
      tags: JSON.stringify(postData.tags),
      categories: JSON.stringify(postData.categories),
    };

    await db.insert(posts).values(dbPost);

    // Update FTS index
    const postEngine = this.postEngine;
    await postEngine.updateFTSIndex(postData);

    // Track wpId to postId mapping
    result.wpIdToPostId.set(wxrPost.wpId, postId);

    return true;
  }

  /**
   * Write a post file to the filesystem.
   *
   * The file is `<posts dir for createdAt>/<slug>.md` and consists of the post
   * content preceded by front matter produced by `matter.stringify`. Optional
   * fields (excerpt, author, publishedAt) are only emitted when set.
   *
   * @param post Post to serialize; `createdAt` selects the dated directory.
   * @returns The path of the file that was written.
   * @throws Error when the slug would place the file outside the dated posts
   *         directory (for example a slug containing `../`). The slug comes
   *         from the imported file, so it is checked before anything is
   *         created on disk.
   */
  private async writePostFile(post: PostData): Promise {
    const metadata: Record = {
      id: post.id,
      projectId: post.projectId,
      title: post.title,
      slug: post.slug,
      status: post.status,
      createdAt: post.createdAt.toISOString(),
      updatedAt: post.updatedAt.toISOString(),
      tags: post.tags,
      categories: post.categories,
    };

    if (post.excerpt) metadata.excerpt = post.excerpt;
    if (post.author) metadata.author = post.author;
    if (post.publishedAt) metadata.publishedAt = post.publishedAt.toISOString();

    const postsDir = this.getPostsDirForDate(post.createdAt);
    const filePath = path.join(postsDir, `${post.slug}.md`);

    // path.join normalizes `..` segments, so compare the resolved target with
    // the resolved directory: a relative path that climbs upward (or is
    // absolute, e.g. another drive) means the slug escaped the directory.
    const fromPostsDir = path.relative(path.resolve(postsDir), path.resolve(filePath));
    if (
      fromPostsDir === '..' ||
      fromPostsDir.startsWith(`..${path.sep}`) ||
      path.isAbsolute(fromPostsDir)
    ) {
      throw new Error(
        `Refusing to write post "${post.title}": slug "${post.slug}" resolves outside the posts directory`
      );
    }

    await fs.mkdir(postsDir, { recursive: true });

    const fileContent = matter.stringify(post.content, metadata);
    await fs.writeFile(filePath, fileContent, 'utf-8');

    return filePath;
  }

  /**
   * Phase 3: Import media files
   */
  private async executePhase3Media(
    report: ImportAnalysisReport,
    result: ImportExecutionResult,
    options: ImportExecutionOptions,
    progress: (phase: string, current: number, total: number, detail?: string) => void
  ): Promise {
    const total = report.media.items.length;

    for (let i = 0; i < report.media.items.length; i++) {
      const analyzed = report.media.items[i];
      progress('media', i + 1, total, `Processing: ${analyzed.wxrMedia.filename}`);

      try {
        const imported = await this.importMediaFile(analyzed, result, options);
        if (imported) {
          result.media.imported++;
        } else {
          result.media.skipped++;
        }
      } catch (error) {
        result.media.errors++;
        result.errors.push(`Failed to import media "${analyzed.wxrMedia.filename}": ${error instanceof Error ?
error.message : String(error)}`);
      }
    }
  }

  /**
   * Import a single media file.
   *
   * Skips (returns false) when the analysis marks the item 'missing',
   * 'content-duplicate' or 'update', when a conflict is resolved as 'ignore',
   * when no uploads folder is configured, or when the source file cannot be
   * accessed. A conflict resolved as 'overwrite' with a known existing media
   * ID is delegated to `updateExistingMedia`; every other conflict resolution
   * falls through and creates a new media entry.
   *
   * @returns true when a media entry was created or updated, false when skipped.
   * @throws Error when `relativePath` points outside the uploads folder.
   */
  private async importMediaFile(
    analyzed: AnalyzedMedia,
    result: ImportExecutionResult,
    options: ImportExecutionOptions
  ): Promise {
    const wxrMedia = analyzed.wxrMedia;

    // Nothing to import for missing files or content that already exists.
    if (analyzed.status === 'missing' || analyzed.status === 'content-duplicate') {
      return false;
    }

    if (analyzed.status === 'conflict') {
      const resolution = analyzed.conflictResolution || 'ignore';
      if (resolution === 'ignore') {
        return false;
      }
      if (resolution === 'overwrite' && analyzed.existingMedia?.id) {
        return await this.updateExistingMedia(analyzed, analyzed.existingMedia.id, result, options);
      }
      // 'import' (or 'overwrite' without an existing ID): create a new entry below.
    }

    // Skip updates (same content already exists)
    if (analyzed.status === 'update') {
      return false;
    }

    const sourcePath = await this.locateUploadSource(options.uploadsFolder, wxrMedia.relativePath);
    if (sourcePath === null) {
      return false;
    }

    const linkedPostIds = this.collectParentPostIds(wxrMedia, result);

    // Creation date from WXR (may be a string after JSON serialization).
    const createdAt = this.toDate(wxrMedia.pubDate) || new Date();

    const importedMedia = await this.mediaEngine.importMedia(sourcePath, {
      title: wxrMedia.title || undefined,
      alt: wxrMedia.description || undefined,
      mimeType: wxrMedia.mimeType,
      author: options.defaultAuthor,
      tags: [],
      linkedPostIds,
      createdAt,
      updatedAt: createdAt,
    });

    await this.linkMediaToPosts(importedMedia.id, linkedPostIds);

    return true;
  }

  /**
   * Update an existing media entry with imported file (for overwrite conflict resolution).
   * Replaces the file via the media engine, updates title/alt/author, and links
   * the entry to its parent post when that post was imported earlier.
   *
   * @returns true when updated; false when no uploads folder is configured or
   *          the source file cannot be accessed.
   * @throws Error when `relativePath` points outside the uploads folder.
   */
  private async updateExistingMedia(
    analyzed: AnalyzedMedia,
    existingMediaId: string,
    result: ImportExecutionResult,
    options: ImportExecutionOptions
  ): Promise {
    const wxrMedia = analyzed.wxrMedia;

    const sourcePath = await this.locateUploadSource(options.uploadsFolder, wxrMedia.relativePath);
    if (sourcePath === null) {
      return false;
    }

    const mediaEngine = this.mediaEngine;

    // Swap the stored file first, then refresh the descriptive metadata.
    await mediaEngine.replaceMediaFile(existingMediaId, sourcePath);
    await mediaEngine.updateMedia(existingMediaId, {
      title: wxrMedia.title || undefined,
      alt: wxrMedia.description || undefined,
      author: options.defaultAuthor,
    });

    await this.linkMediaToPosts(existingMediaId, this.collectParentPostIds(wxrMedia, result));

    return true;
  }

  /**
   * Build the on-disk source path for a media item and confirm it is usable.
   *
   * @returns the joined path, or null when no uploads folder is configured or
   *          the file cannot be accessed.
   * @throws Error when the joined path leaves the uploads folder. The relative
   *         path comes from the imported file, so `../` segments are rejected
   *         instead of being followed.
   */
  private async locateUploadSource(
    uploadsFolder: string | undefined,
    relativePath: string
  ): Promise<string | null> {
    if (!uploadsFolder) {
      return null;
    }

    const sourcePath = path.join(uploadsFolder, relativePath);

    // A relative path from the folder to the target that climbs upward (or is
    // absolute, e.g. a different drive) means the target is outside the folder.
    const fromRoot = path.relative(path.resolve(uploadsFolder), path.resolve(sourcePath));
    if (fromRoot === '..' || fromRoot.startsWith(`..${path.sep}`) || path.isAbsolute(fromRoot)) {
      throw new Error(`Media path "${relativePath}" resolves outside the uploads folder`);
    }

    try {
      await fs.access(sourcePath);
    } catch {
      return null;
    }

    return sourcePath;
  }

  /**
   * Translate a media item's WordPress parent ID into our post IDs.
   * Returns an empty list when there is no positive parent ID or the parent
   * has no entry in `result.wpIdToPostId`.
   */
  private collectParentPostIds(wxrMedia: WxrMedia, result: ImportExecutionResult): string[] {
    if (!wxrMedia.parentId || !(wxrMedia.parentId > 0)) {
      return [];
    }
    const parentPostId = result.wpIdToPostId.get(wxrMedia.parentId);
    return parentPostId ? [parentPostId] : [];
  }

  /** Link one media entry to each given post in the postMedia table; no-op for an empty list. */
  private async linkMediaToPosts(mediaId: string, postIds: string[]): Promise<void> {
    if (postIds.length === 0) {
      return;
    }
    const postMediaEngine = this.postMediaEngine;
    postMediaEngine.setProjectContext(this.currentProjectId);
    for (const postId of postIds) {
      await postMediaEngine.linkMediaToPost(postId, mediaId);
    }
  }

  /**
   * Phase 4: Import pages as posts with "page" category
   */
  private async executePhase4Pages(
    report: ImportAnalysisReport,
    tagMapping: Map,
    categoryMapping: Map,
    result: ImportExecutionResult,
    options: ImportExecutionOptions,
    progress: (phase: string, current: number, total: number, detail?:
string) => void
  ): Promise {
    const pages = report.pages.items;
    const total = pages.length;

    // Make sure the synthetic "page" category resolves during taxonomy lookup.
    // It is registered as not needing creation.
    if (!categoryMapping.has('page')) {
      categoryMapping.set('page', { resolved: 'page', needsCreation: false });
    }

    let position = 0;
    for (const analyzed of pages) {
      position += 1;
      const wxrPage = analyzed.wxrPost;

      // Work on a copy whose categories additionally contain "page"; the
      // analyzed item from the report is left untouched.
      const pageAsPost: AnalyzedPost = {
        ...analyzed,
        wxrPost: {
          ...wxrPage,
          categories: [...wxrPage.categories, 'page'],
        },
      };

      progress('pages', position, total, `Processing: ${wxrPage.title}`);

      try {
        const wasImported = await this.importPost(pageAsPost, tagMapping, categoryMapping, result, options);
        if (wasImported) {
          result.pages.imported++;
        } else {
          result.pages.skipped++;
        }
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        result.pages.errors++;
        result.errors.push(`Failed to import page "${wxrPage.title}": ${reason}`);
      }
    }
  }

  /**
   * Convert HTML to Markdown using Turndown
   */
  private convertToMarkdown(html: string): string {
    if (!html || !html.trim()) return '';

    // Preprocess: Wrap standalone blocks containing newlines in
 tags
    // This must happen BEFORE preserveLineBreaks to prevent newlines from becoming 
// and to ensure Turndown recognizes them as fenced code blocks const withCodeBlocks = this.wrapMultilineCode(html); // Preprocess: Convert newlines within text to
tags to preserve line breaks // This handles the common case where WordPress exports have line breaks in the XML // that should be preserved in markdown const preprocessed = this.preserveLineBreaks(withCodeBlocks); let markdown = this.turndown.turndown(preprocessed); // Unescape double-bracket macros that TurndownService escaped // \[\[ becomes [[ and \]\] becomes ]] markdown = markdown.replace(/\\\[\\\[/g, '[[').replace(/\\\]\\\]/g, ']]'); // Remove backslash escapes inside [[macro]] blocks (e.g. photo\_archive → photo_archive) markdown = markdown.replace(/\[\[([^\]]*?)\]\]/g, (_match, inner: string) => { return '[[' + inner.replace(/\\(.)/g, '$1') + ']]'; }); // Normalize non-breaking spaces to regular spaces markdown = markdown.replace(/\u00A0/g, ' '); // Clean up trailing whitespace from each line, but preserve "> " for blockquote continuation markdown = markdown.split('\n').map(line => { const trimmed = line.trimEnd(); // Preserve space after ">" for blockquote continuation lines if (trimmed === '>' && line.startsWith('> ')) { return '> '; } return trimmed; }).join('\n'); // Normalize multiple blank lines (3+ consecutive newlines → 2 newlines) markdown = markdown.replace(/\n{3,}/g, '\n\n'); return markdown; } /** * Preserve line breaks and paragraph structure in content. * * WordPress exports often have: * - Plain text mixed with HTML * - Double newlines representing paragraph breaks * - Single newlines that should become
* * This function converts: * - Double newlines (\n\n) to paragraph breaks (

) * - Single newlines within text to
* - Wraps content in

tags if it starts with plain text */ private preserveLineBreaks(html: string): string { if (!html || !html.trim()) return html; // Check if content starts with a tag or plain text const startsWithTag = /^\s* blocks from having their newlines modified const preBlocks: string[] = []; let protectedHtml = html.replace(/

([\s\S]*?)<\/pre>/g, (match) => {
      const placeholder = `__PRE_BLOCK_${preBlocks.length}__`;
      preBlocks.push(match);
      return placeholder;
    });
    
    // If it starts with plain text, we need to handle the whole content differently
    if (!startsWithTag) {
      // First, convert double newlines to paragraph markers
      let processed = protectedHtml.replace(/\n\n+/g, '

\n

'); // Convert remaining single newlines within text to
// (but not newlines that are just between tags) processed = processed.replace(/>([^<]+) { if (!textContent.trim()) { return '>' + textContent + '<'; } const preserved = textContent.replace(/\n/g, '
'); return '>' + preserved + '<'; }); // Also handle newlines at the start (before any tags) processed = processed.replace(/^([^<]+)/g, (match, textContent: string) => { if (!textContent.trim()) return match; return textContent.replace(/\n/g, '
'); }); // Wrap in

if we added paragraph markers if (processed.includes('

')) { processed = '

' + processed + '

'; } // Restore protected
 blocks
      // Use a replacer function: with a plain string replacement, `$&`, `$'`,
      // "$`" and `$$` inside the saved block would be interpreted as
      // substitution patterns and corrupt code samples that contain `$`.
      preBlocks.forEach((block, i) => {
        processed = processed.replace(`__PRE_BLOCK_${i}__`, () => block);
      });
      
      return processed;
    }

    // For content that starts with HTML, handle newlines within text content
    let result = protectedHtml.replace(/>([^<]+) {
      if (!textContent.trim()) {
        return '>' + textContent + '<';
      }
      // First convert double newlines to paragraph breaks
      let preserved = textContent.replace(/\n\n+/g, '

'); // Then convert remaining single newlines to
preserved = preserved.replace(/\n/g, '
'); return '>' + preserved + '<'; }); // Also handle text at the END of content (after the last tag) // This catches text after closing tags like --> or /> that goes to the end result = result.replace(/>([^<]+)$/g, (match, textContent: string) => { if (!textContent.trim()) { return match; } // First convert double newlines to paragraph breaks let preserved = textContent.replace(/\n\n+/g, '

'); // Then convert remaining single newlines to
preserved = preserved.replace(/\n/g, '
'); return '>' + preserved; }); // Restore protected

 blocks
    // Use a replacer function: with a plain string replacement, `$&`, `$'`,
    // "$`" and `$$` inside the saved block would be interpreted as
    // substitution patterns and corrupt code samples that contain `$`.
    preBlocks.forEach((block, i) => {
      result = result.replace(`__PRE_BLOCK_${i}__`, () => block);
    });
    
    return result;
  }

  /**
   * Wrap standalone  blocks containing newlines in 
 tags.
   * 
   * WordPress content sometimes uses ... for multi-line code blocks
   * without a 
 wrapper. Standard HTML parsing treats this as inline code and
   * collapses whitespace. By wrapping in 
, we preserve the formatting and
   * Turndown will convert it to a fenced Markdown code block.
   * 
   * Only wraps  blocks that contain literal newlines.
   * Does NOT wrap:
   *   -  already inside 
   *   -  without newlines (inline code)
   */
  private wrapMultilineCode(html: string): string {
    if (!html) return html;

    // Match  blocks containing newlines that are NOT inside 
    // Use a regex that captures the full ... content including any embedded HTML
    return html.replace(/([\s\S]*?)<\/code>/g, (match, content: string) => {
      // Only wrap if content contains newlines (multiline code block)
      if (!content.includes('\n')) {
        return match; // Leave inline code as-is
      }
      // Check if this  is already inside a 
 by looking backward
      // Since we're doing a simple regex, we'll just wrap it - the browser normalizes anyway
      return '
' + content + '
';
    });
  }

  /**
   * Rewrite absolute upload URLs of the imported WordPress site as relative
   * media paths.
   *
   * Example:
   *   https://site.com/wp-content/uploads/2022/11/image.jpg
   * becomes:
   *   media/2022/11/image.jpg
   *
   * Only URLs whose host (and any path prefix) matches the stored site URL and
   * whose path continues with `/wp-content/uploads/` are rewritten. URLs from
   * other sites, and URLs under wp-content/themes/ or wp-content/plugins/, are
   * left alone. Matching ignores case and accepts both http and https,
   * whichever scheme the site URL itself uses.
   */
  private convertMediaUrlsToRelative(markdown: string): string {
    if (!markdown || !this.siteBaseUrl) return markdown;

    // Drop a single trailing slash, then split the scheme from the rest.
    const trimmedSite = this.siteBaseUrl.replace(/\/$/, '');
    const schemeSplit = /^https?:\/\/(.+)$/.exec(trimmedSite);
    if (schemeSplit === null) return markdown;

    // Everything after the scheme is matched literally, so escape regex
    // metacharacters before embedding it in the pattern.
    const escapedHost = schemeSplit[1].replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    // http(s)://{host}/wp-content/uploads/{path}; the path runs up to the
    // next whitespace, closing parenthesis or quote.
    const uploadsUrl = new RegExp(
      `https?://${escapedHost}/wp-content/uploads/([^\\s)"']+)`,
      'gi'
    );

    return markdown.replace(uploadsUrl, (_whole, uploadPath: string) => `media/${uploadPath}`);
  }

  /**
   * Convert Vimeo iframes to [[vimeo id=...]] macros.
   * Matches