import crypto from 'crypto'; import * as fs from 'fs/promises'; import * as path from 'path'; import TurndownService from 'turndown'; import { getDatabase } from '../database'; import { posts, media, tags } from '../database/schema'; import { eq } from 'drizzle-orm'; import type { WxrData, WxrPost, WxrMedia, WxrSiteInfo, WxrCategory, WxrTag } from './WxrParser'; import { getMacroConfigMap, type MacroConfig } from '../config/macroConfig'; export type PostAnalysisStatus = 'new' | 'update' | 'conflict' | 'content-duplicate'; export type MediaAnalysisStatus = 'new' | 'update' | 'conflict' | 'content-duplicate' | 'missing'; /** How to resolve a slug conflict during import */ export type ImportConflictResolution = 'ignore' | 'overwrite' | 'import'; export interface AnalyzedPost { wxrPost: WxrPost; status: PostAnalysisStatus; contentHash: string; markdownPreview: string; /** How to resolve conflict (only relevant when status is 'conflict'). Default is 'ignore'. */ conflictResolution?: ImportConflictResolution; existingPost?: { id: string; title: string; slug: string; checksum: string | null; /** Date the existing post was created/published */ pubDate: string | null; /** Excerpt from existing post */ excerpt: string | null; /** Author of the existing post */ author: string | null; /** Tags of the existing post */ tags: string[]; /** Categories of the existing post */ categories: string[]; }; } export interface AnalyzedMedia { wxrMedia: WxrMedia; status: MediaAnalysisStatus; fileHash: string | null; /** How to resolve conflict (only relevant when status is 'conflict'). Default is 'ignore'. 
*/ conflictResolution?: ImportConflictResolution; existingMedia?: { id: string; originalName: string; checksum: string | null; }; } export interface AnalyzedCategory { name: string; slug: string; existsInProject: boolean; mappedTo?: string; // When set, indicates this item should be mapped to the given name on import } export interface AnalyzedTag { name: string; slug: string; existsInProject: boolean; mappedTo?: string; // When set, indicates this item should be mapped to the given name on import } /** Validation status for a macro usage */ export type MacroValidationStatus = 'valid' | 'invalid' | 'unknown'; /** A single unique usage pattern of a macro */ export interface MacroUsage { /** The parameters used in this particular usage */ params: Record; /** How many times this exact parameter combination was used */ count: number; /** Whether this usage is valid according to our macro definition */ validationStatus: MacroValidationStatus; /** Error message if validation failed */ validationError?: string; /** Serialized params for deduplication */ paramsKey: string; } /** A discovered macro from the import content */ export interface DiscoveredMacro { /** The macro name (lowercase) */ name: string; /** Whether this macro maps to an internal definition */ mapped: boolean; /** Total number of times this macro appears across all content */ totalCount: number; /** Unique usages with different parameters */ usages: MacroUsage[]; /** Slugs of posts/pages where this macro is used */ postSlugs: string[]; } /** Summary of macro analysis */ export interface MacroAnalysisSummary { /** Total unique macros discovered */ total: number; /** Number of macros that map to internal definitions */ mappedCount: number; /** Number of macros that don't map to internal definitions */ unmappedCount: number; /** All discovered macros with their usages */ discovered: DiscoveredMacro[]; } /** Minimal interface for macro definition validation */ export interface MacroDefinitionLike { name: 
string; validate?: (params: Record) => string | undefined; } export interface ImportAnalysisReport { sourceFile: string; site: WxrSiteInfo; analyzedAt: Date; posts: { total: number; new: number; updates: number; conflicts: number; contentDuplicates: number; items: AnalyzedPost[]; }; pages: { total: number; new: number; updates: number; conflicts: number; contentDuplicates: number; items: AnalyzedPost[]; }; media: { total: number; new: number; updates: number; conflicts: number; contentDuplicates: number; missing: number; items: AnalyzedMedia[]; }; categories: AnalyzedCategory[]; tags: AnalyzedTag[]; macros: MacroAnalysisSummary; } export class ImportAnalysisEngine { private currentProjectId: string = ''; private turndown: TurndownService; private macroDefinitions: Map = new Map(); // Progress callback for reporting analysis steps onProgress?: (step: string, detail?: string) => void; // Regex to match WordPress shortcodes: [macroname param="val" param2='val2'] // This matches single brackets (NOT double brackets like our internal format) // Uses negative lookbehind (? { content = content .replace(/^\n+/, '') // Remove leading newlines .replace(/\n+$/, '\n') // Replace trailing newlines with single newline .replace(/\n/gm, '\n '); // Indent subsequent lines with 2 spaces const parent = node.parentNode as HTMLElement; const isOrdered = parent?.nodeName === 'OL'; let prefix = options.bulletListMarker + ' '; if (isOrdered) { const start = parent.getAttribute('start'); const index = Array.prototype.indexOf.call(parent.children, node); const startNum = start ? parseInt(start, 10) : 1; prefix = (startNum + index) + '. '; } return prefix + content + (node.nextSibling && !/\n$/.test(content) ? 
'\n' : ''); }, }); // Custom rule for standalone images with empty alt but title attribute // WordPress often uses title="name" with alt="" this.turndown.addRule('imageWithTitle', { filter: (node) => { if (node.nodeName !== 'IMG') return false; // Check if this image is NOT inside an tag (those are handled by linkedImage rule) const parent = node.parentNode; if (parent?.nodeName === 'A') return false; // Only match if alt is empty but title exists const img = node as HTMLImageElement; const alt = img.getAttribute('alt') || ''; const title = img.getAttribute('title') || ''; return !alt.trim() && title.trim().length > 0; }, replacement: (_content, node) => { const img = node as HTMLImageElement; const src = img.getAttribute('src') || ''; const title = img.getAttribute('title') || ''; return `![${title}](${src})`; }, }); // Custom rule for linked images: -> ![alt](src) // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images this.turndown.addRule('linkedImage', { filter: (node) => { // Match tags that contain only an (possibly with whitespace) if (node.nodeName !== 'A') return false; const children = Array.from(node.childNodes).filter( child => !(child.nodeType === 3 && !child.textContent?.trim()) ); return children.length === 1 && children[0].nodeName === 'IMG'; }, replacement: (_content, node) => { const anchor = node as HTMLAnchorElement; const img = anchor.querySelector('img'); if (!img) return ''; const href = anchor.getAttribute('href') || ''; const imgSrc = img.getAttribute('src') || ''; const imgAlt = img.getAttribute('alt') || ''; const imgTitle = img.getAttribute('title') || ''; // Check if the link href points to an image (common WordPress pattern for "click for larger") const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i; const hrefIsImage = imageExtensions.test(href); // Determine which URL to use: // - If href is an image URL (WordPress "click for full-size" pattern), use the href // - 
Otherwise, use the original image src const imageUrl = hrefIsImage ? href : imgSrc; // Derive alt text: prefer alt, then title, then cleaned filename let altText = imgAlt.trim(); if (!altText) { altText = imgTitle.trim(); } if (!altText) { // Extract filename from the image URL as last resort const urlPath = imageUrl.split('?')[0]; // Remove query string const filename = urlPath.split('/').pop() || ''; // Clean the filename: remove extension and replace underscores with spaces altText = filename.replace(/\.[^.]+$/, '').replace(/_/g, ' '); } // Build the markdown image link (without title attribute) return `![${altText}](${imageUrl})`; }, }); // Custom rule for Flash embeds - replace with placeholder text this.turndown.addRule('flashEmbed', { filter: (node) => { if (node.nodeName !== 'EMBED') return false; const embed = node as HTMLEmbedElement; const type = embed.getAttribute('type') || ''; const src = embed.getAttribute('src') || ''; // Match Flash content by type or file extension return type.toLowerCase().includes('flash') || type.toLowerCase().includes('shockwave') || src.toLowerCase().endsWith('.swf'); }, replacement: () => 'FLASH PLAYER NOT SUPPORTED', }); // Load macro definitions from shared config this.loadMacroConfigsFromShared(); } /** * Load macro definitions from the shared macro config. * Called automatically in constructor. */ private loadMacroConfigsFromShared(): void { try { const configs = getMacroConfigMap(); // Convert MacroConfig to MacroDefinitionLike for (const [name, config] of configs) { this.macroDefinitions.set(name, { name: config.name, validate: config.validate, }); } } catch (error) { // Config not available - macros will be marked as unmapped console.warn('Could not load macro configs:', error); } } setProjectContext(projectId: string): void { this.currentProjectId = projectId; } /** * Set macro definitions for mapping and validation. * This overrides the auto-loaded shared config. Useful for testing. 
* @param definitions Map of macro name (lowercase) to definition */ setMacroDefinitions(definitions: Map): void { this.macroDefinitions = definitions; } async analyzeWxr(wxrData: WxrData, sourceFile: string, uploadsFolder?: string): Promise { const db = getDatabase().getLocal(); this.onProgress?.('Loading existing posts...'); // Fetch existing posts for this project const existingPosts = await db .select({ id: posts.id, slug: posts.slug, title: posts.title, checksum: posts.checksum, excerpt: posts.excerpt, author: posts.author, publishedAt: posts.publishedAt, createdAt: posts.createdAt, status: posts.status, tags: posts.tags, categories: posts.categories, }) .from(posts) .where(eq(posts.projectId, this.currentProjectId)) .all(); this.onProgress?.('Loading existing media...', `${existingPosts.length} posts in project`); // Fetch existing media for this project const existingMedia = await db .select({ id: media.id, originalName: media.originalName, checksum: media.checksum, }) .from(media) .where(eq(media.projectId, this.currentProjectId)) .all(); this.onProgress?.('Loading existing tags...', `${existingMedia.length} media in project`); // Fetch existing tags for this project const existingTags = await db .select({ name: tags.name, }) .from(tags) .where(eq(tags.projectId, this.currentProjectId)) .all(); // Build lookup maps for posts const slugToPost = new Map(); const checksumToPost = new Map(); for (const post of existingPosts) { slugToPost.set(post.slug, post); if (post.checksum) { checksumToPost.set(post.checksum, post); } } // Build lookup maps for media const nameToMedia = new Map(); const checksumToMedia = new Map(); for (const m of existingMedia) { nameToMedia.set(m.originalName.toLowerCase(), m); if (m.checksum) { checksumToMedia.set(m.checksum, m); } } // Build tag set const existingTagNames = new Set(existingTags.map(t => t.name.toLowerCase())); this.onProgress?.('Analyzing posts...', `${wxrData.posts.length} posts to analyze`); // Analyze posts const 
analyzedPosts = this.analyzePostItems(wxrData.posts, slugToPost, checksumToPost); this.onProgress?.('Analyzing pages...', `${wxrData.pages.length} pages to analyze`); const analyzedPages = this.analyzePostItems(wxrData.pages, slugToPost, checksumToPost); this.onProgress?.('Analyzing media files...', `${wxrData.media.length} media files to analyze`); // Analyze media const analyzedMedia = await this.analyzeMediaItems(wxrData.media, nameToMedia, checksumToMedia, uploadsFolder); this.onProgress?.('Processing categories and tags...'); // Analyze categories const analyzedCategories: AnalyzedCategory[] = wxrData.categories.map(cat => ({ name: cat.name, slug: cat.slug, existsInProject: existingTagNames.has(cat.name.toLowerCase()), })); // Analyze tags const analyzedTags: AnalyzedTag[] = wxrData.tags.map(tag => ({ name: tag.name, slug: tag.slug, existsInProject: existingTagNames.has(tag.name.toLowerCase()), })); this.onProgress?.('Discovering macros...'); // Analyze macros from posts and pages content const macroAnalysis = this.analyzeMacros([...wxrData.posts, ...wxrData.pages]); return { sourceFile, site: wxrData.site, analyzedAt: new Date(), posts: this.summarizePostAnalysis(analyzedPosts), pages: this.summarizePostAnalysis(analyzedPages), media: this.summarizeMediaAnalysis(analyzedMedia), categories: analyzedCategories, tags: analyzedTags, macros: macroAnalysis, }; } private analyzePostItems( wxrPosts: WxrPost[], slugToPost: Map, checksumToPost: Map, ): AnalyzedPost[] { return wxrPosts.map(wxrPost => { const markdown = this.convertToMarkdown(wxrPost.content); const contentHash = this.calculateChecksum(markdown); const markdownPreview = markdown.substring(0, 200); const existingBySlug = slugToPost.get(wxrPost.slug); const existingByHash = checksumToPost.get(contentHash); let status: PostAnalysisStatus; let existingPost: AnalyzedPost['existingPost']; if (existingBySlug) { if (existingBySlug.checksum === contentHash) { status = 'update'; } else { status = 'conflict'; } 
const existingDate = existingBySlug.publishedAt || existingBySlug.createdAt; const existingTags = existingBySlug.tags ? JSON.parse(existingBySlug.tags) : []; const existingCategories = existingBySlug.categories ? JSON.parse(existingBySlug.categories) : []; existingPost = { id: existingBySlug.id, title: existingBySlug.title, slug: existingBySlug.slug, checksum: existingBySlug.checksum, pubDate: existingDate ? existingDate.toISOString() : null, excerpt: existingBySlug.excerpt, author: existingBySlug.author, tags: existingTags, categories: existingCategories, }; } else if (existingByHash) { status = 'content-duplicate'; const existingDate = existingByHash.publishedAt || existingByHash.createdAt; const existingTagsByHash = existingByHash.tags ? JSON.parse(existingByHash.tags) : []; const existingCategoriesByHash = existingByHash.categories ? JSON.parse(existingByHash.categories) : []; existingPost = { id: existingByHash.id, title: existingByHash.title, slug: existingByHash.slug, checksum: existingByHash.checksum, pubDate: existingDate ? existingDate.toISOString() : null, excerpt: existingByHash.excerpt, author: existingByHash.author, tags: existingTagsByHash, categories: existingCategoriesByHash, }; } else { status = 'new'; } // For conflicts, default resolution is 'ignore' const conflictResolution = status === 'conflict' ? 
'ignore' as const : undefined; return { wxrPost, status, contentHash, markdownPreview, existingPost, conflictResolution }; }); } private async analyzeMediaItems( wxrMediaItems: WxrMedia[], nameToMedia: Map, checksumToMedia: Map, uploadsFolder?: string, ): Promise { const results: AnalyzedMedia[] = []; for (const wxrMedia of wxrMediaItems) { let fileHash: string | null = null; let fileFound = false; // Try to read the actual file from the uploads folder if (uploadsFolder) { try { const filePath = path.join(uploadsFolder, wxrMedia.relativePath); const buffer = await fs.readFile(filePath); fileHash = this.calculateChecksum(buffer.toString('binary')); fileFound = true; } catch { // File not found in uploads folder } } if (!fileFound) { results.push({ wxrMedia, status: 'missing', fileHash: null, }); continue; } const existingByName = nameToMedia.get(wxrMedia.filename.toLowerCase()); const existingByHash = fileHash ? checksumToMedia.get(fileHash) : undefined; let status: MediaAnalysisStatus; let existingMedia: AnalyzedMedia['existingMedia']; if (existingByName) { if (fileHash && existingByName.checksum === fileHash) { status = 'update'; } else { status = 'conflict'; } existingMedia = { id: existingByName.id, originalName: existingByName.originalName, checksum: existingByName.checksum, }; } else if (existingByHash) { status = 'content-duplicate'; existingMedia = { id: existingByHash.id, originalName: existingByHash.originalName, checksum: existingByHash.checksum, }; } else { status = 'new'; } results.push({ wxrMedia, status, fileHash, existingMedia }); } return results; } private summarizePostAnalysis(items: AnalyzedPost[]): ImportAnalysisReport['posts'] { return { total: items.length, new: items.filter(i => i.status === 'new').length, updates: items.filter(i => i.status === 'update').length, conflicts: items.filter(i => i.status === 'conflict').length, contentDuplicates: items.filter(i => i.status === 'content-duplicate').length, items, }; } private 
summarizeMediaAnalysis(items: AnalyzedMedia[]): ImportAnalysisReport['media'] { return { total: items.length, new: items.filter(i => i.status === 'new').length, updates: items.filter(i => i.status === 'update').length, conflicts: items.filter(i => i.status === 'conflict').length, contentDuplicates: items.filter(i => i.status === 'content-duplicate').length, missing: items.filter(i => i.status === 'missing').length, items, }; } private convertToMarkdown(html: string): string { if (!html || !html.trim()) return ''; // Preprocess: Wrap standalone blocks containing newlines in
 tags
    const withCodeBlocks = this.wrapMultilineCode(html);
    // Preprocess: Convert newlines within text to 
tags to preserve line breaks const preprocessed = this.preserveLineBreaks(withCodeBlocks); let markdown = this.turndown.turndown(preprocessed); // Normalize non-breaking spaces to regular spaces markdown = markdown.replace(/\u00A0/g, ' '); // Clean up trailing whitespace from each line, but preserve "> " for blockquote continuation markdown = markdown.split('\n').map(line => { const trimmed = line.trimEnd(); // Preserve space after ">" for blockquote continuation lines if (trimmed === '>' && line.startsWith('> ')) { return '> '; } return trimmed; }).join('\n'); // Normalize multiple blank lines (3+ consecutive newlines → 2 newlines) markdown = markdown.replace(/\n{3,}/g, '\n\n'); return markdown; } /** * Preserve line breaks and paragraph structure in content. * * WordPress exports often have: * - Plain text mixed with HTML * - Double newlines representing paragraph breaks * - Single newlines that should become
* * This function converts: * - Double newlines (\n\n) to paragraph breaks (

) * - Single newlines within text to
* - Wraps content in

tags if it starts with plain text */ private preserveLineBreaks(html: string): string { if (!html || !html.trim()) return html; // Check if content starts with a tag or plain text const startsWithTag = /^\s* blocks from having their newlines modified const preBlocks: string[] = []; let protectedHtml = html.replace(/

([\s\S]*?)<\/pre>/g, (match) => {
      const placeholder = `__PRE_BLOCK_${preBlocks.length}__`;
      preBlocks.push(match);
      return placeholder;
    });
    
    // If it starts with plain text, we need to handle the whole content differently
    if (!startsWithTag) {
      // First, convert double newlines to paragraph markers
      let processed = protectedHtml.replace(/\n\n+/g, '

\n

'); // Convert remaining single newlines within text to
// (but not newlines that are just between tags) processed = processed.replace(/>([^<]+) { if (!textContent.trim()) { return '>' + textContent + '<'; } const preserved = textContent.replace(/\n/g, '
'); return '>' + preserved + '<'; }); // Also handle newlines at the start (before any tags) processed = processed.replace(/^([^<]+)/g, (match, textContent: string) => { if (!textContent.trim()) return match; return textContent.replace(/\n/g, '
'); }); // Wrap in

if we added paragraph markers if (processed.includes('

')) { processed = '

' + processed + '

'; } // Restore protected
 blocks
      preBlocks.forEach((block, i) => {
        processed = processed.replace(`__PRE_BLOCK_${i}__`, block);
      });
      
      return processed;
    }

    // For content that starts with HTML, handle newlines within text content
    let result = protectedHtml.replace(/>([^<]+) {
      if (!textContent.trim()) {
        return '>' + textContent + '<';
      }
      // First convert double newlines to paragraph breaks
      let preserved = textContent.replace(/\n\n+/g, '

'); // Then convert remaining single newlines to
preserved = preserved.replace(/\n/g, '
'); return '>' + preserved + '<'; }); // Also handle text at the END of content (after the last tag) // This catches text after closing tags like --> or /> that goes to the end result = result.replace(/>([^<]+)$/g, (match, textContent: string) => { if (!textContent.trim()) { return match; } // First convert double newlines to paragraph breaks let preserved = textContent.replace(/\n\n+/g, '

'); // Then convert remaining single newlines to
preserved = preserved.replace(/\n/g, '
'); return '>' + preserved; }); // Restore protected

 blocks
    preBlocks.forEach((block, i) => {
      result = result.replace(`__PRE_BLOCK_${i}__`, block);
    });
    
    return result;
  }

  /**
   * Wrap standalone <code> blocks containing newlines in <pre> tags.
   *
   * WordPress content sometimes uses <code>...</code> for multi-line code blocks
   * without a <pre> wrapper. Standard HTML parsing treats this as inline code and
   * collapses whitespace. Wrapping in <pre> preserves the formatting so the
   * HTML-to-Markdown conversion can emit it as a code block.
   *
   * Only wraps <code> blocks that contain literal newlines.
   * Does NOT wrap:
   *   - <code> already inside <pre> (would otherwise produce nested <pre> elements)
   *   - <code> without newlines (inline code)
   *
   * @param html Raw HTML content; empty/undefined-like input is returned unchanged
   * @returns The HTML with multi-line <code> blocks wrapped as <pre><code>...</code></pre>
   */
  private wrapMultilineCode(html: string): string {
    if (!html) return html;

    return html.replace(
      /<code>([\s\S]*?)<\/code>/g,
      (match: string, content: string, offset: number) => {
        // Inline code (no newline) is left exactly as written
        if (!content.includes('\n')) {
          return match;
        }
        // Already wrapped by the author: keep it as-is instead of nesting another <pre>
        if (this.isInsidePre(html, offset)) {
          return match;
        }
        return '<pre><code>' + content + '</code></pre>';
      },
    );
  }

  /**
   * Report whether the position `offset` in `html` lies inside an open <pre> element,
   * i.e. the nearest preceding <pre ...> opening tag comes after the nearest preceding
   * </pre> closing tag. Matching is case-sensitive, like the <code> pattern above.
   */
  private isInsidePre(html: string, offset: number): boolean {
    const lastClose = html.lastIndexOf('</pre', offset);
    let open = html.lastIndexOf('<pre', offset);
    while (open > lastClose) {
      // Require a tag boundary after "<pre" so tags such as <preview> are not mistaken for <pre>
      const next = html.charAt(open + 4);
      if (next === '>' || /\s/.test(next)) {
        return true;
      }
      if (open === 0) break;
      open = html.lastIndexOf('<pre', open - 1);
    }
    return false;
  }

  /**
   * Compute the MD5 hex digest of the given string.
   * Used as a content fingerprint for comparison, not for security purposes.
   */
  private calculateChecksum(content: string): string {
    return crypto.createHash('md5').update(content).digest('hex');
  }

  /**
   * Analyze macros (WordPress shortcodes) from post/page content.
   * Discovers all shortcodes, aggregates their usages per unique parameter
   * combination, and validates each usage against the registered definitions.
   *
   * @param posts Posts and/or pages whose `content` is scanned; items without content are skipped
   * @returns Summary with every discovered macro (sorted by name) and mapped/unmapped counts
   */
  private analyzeMacros(posts: WxrPost[]): MacroAnalysisSummary {
    // Map of macro name (lowercase) -> aggregated data for that macro
    const macroMap = new Map<string, {
      name: string;
      totalCount: number;
      usages: Map<string, { params: Record<string, string>; count: number }>;
      postSlugs: Set<string>;
    }>();

    // Pass 1: collect every shortcode occurrence across all posts/pages
    for (const post of posts) {
      if (!post.content) continue;

      for (const shortcode of this.parseShortcodes(post.content)) {
        const name = shortcode.name.toLowerCase();

        let macroData = macroMap.get(name);
        if (!macroData) {
          macroData = {
            name,
            totalCount: 0,
            usages: new Map(),
            postSlugs: new Set(),
          };
          macroMap.set(name, macroData);
        }

        macroData.totalCount++;
        macroData.postSlugs.add(post.slug);

        // Identical parameter combinations share one usage entry
        const paramsKey = this.serializeParams(shortcode.params);
        const existingUsage = macroData.usages.get(paramsKey);
        if (existingUsage) {
          existingUsage.count++;
        } else {
          macroData.usages.set(paramsKey, { params: shortcode.params, count: 1 });
        }
      }
    }

    // Pass 2: convert to the report format, validating each unique usage
    const discovered: DiscoveredMacro[] = [];

    for (const macroData of macroMap.values()) {
      const definition = this.macroDefinitions.get(macroData.name);
      const mapped = definition !== undefined;

      const usages: MacroUsage[] = [];
      for (const [paramsKey, usage] of macroData.usages) {
        // Unmapped macros cannot be validated and stay 'unknown'
        let validationStatus: MacroValidationStatus = 'unknown';
        let validationError: string | undefined;

        if (definition) {
          if (definition.validate) {
            const error = definition.validate(usage.params);
            if (error) {
              validationStatus = 'invalid';
              validationError = error;
            } else {
              validationStatus = 'valid';
            }
          } else {
            // Macro is mapped but has no validator - consider it valid
            validationStatus = 'valid';
          }
        }

        usages.push({
          params: usage.params,
          count: usage.count,
          validationStatus,
          validationError,
          paramsKey,
        });
      }

      discovered.push({
        name: macroData.name,
        mapped,
        totalCount: macroData.totalCount,
        usages,
        postSlugs: Array.from(macroData.postSlugs),
      });
    }

    // Sort discovered macros by name
    discovered.sort((a, b) => a.name.localeCompare(b.name));

    const mappedCount = discovered.filter(m => m.mapped).length;
    return {
      total: discovered.length,
      mappedCount,
      unmappedCount: discovered.length - mappedCount,
      discovered,
    };
  }

  /**
   * Parse WordPress shortcodes from content.
   * Returns array of { name, params } for each shortcode found, in match order.
   */
  private parseShortcodes(content: string): Array<{ name: string; params: Record<string, string> }> {
    const shortcodes: Array<{ name: string; params: Record<string, string> }> = [];

    // The pattern is a shared static used with exec() in a loop, so rewind it
    // before scanning to avoid resuming from a previous call's position
    ImportAnalysisEngine.SHORTCODE_REGEX.lastIndex = 0;

    let match: RegExpExecArray | null;
    while ((match = ImportAnalysisEngine.SHORTCODE_REGEX.exec(content)) !== null) {
      const name = match[1];
      const paramString = match[2] || '';
      const params = this.parseShortcodeParams(paramString);
      shortcodes.push({ name, params });
    }

    return shortcodes;
  }

  /**
   * Parse parameters from a shortcode parameter string.
   * Supports: key="value", key='value', and key=value (unquoted).
   * When a key appears more than once, the last occurrence wins.
   */
  private parseShortcodeParams(paramString: string): Record<string, string> {
    const params: Record<string, string> = {};

    // Shared static pattern: rewind before scanning (see parseShortcodes)
    ImportAnalysisEngine.PARAM_REGEX.lastIndex = 0;

    let match: RegExpExecArray | null;
    while ((match = ImportAnalysisEngine.PARAM_REGEX.exec(paramString)) !== null) {
      const key = match[1];
      // Value is in group 2 (double-quoted), 3 (single-quoted), or 4 (unquoted)
      const value = match[2] ?? match[3] ?? match[4] ?? '';
      params[key] = value;
    }

    return params;
  }

  /**
   * Serialize params to a string for deduplication.
   * Entries are sorted by key so that parameter order in the shortcode does not matter.
   */
  private serializeParams(params: Record<string, string>): string {
    const sorted = Object.entries(params).sort(([a], [b]) => a.localeCompare(b));
    return JSON.stringify(sorted);
  }
}