import crypto from 'crypto';
import * as fs from 'fs/promises';
import * as path from 'path';
import TurndownService from 'turndown';
import { getDatabase } from '../database';
import { posts, media, tags } from '../database/schema';
import { eq } from 'drizzle-orm';
import type { WxrData, WxrPost, WxrMedia, WxrSiteInfo, WxrCategory, WxrTag } from './WxrParser';
import { getMacroConfigMap, type MacroConfig } from '../config/macroConfig';
/**
 * Values carried by `AnalyzedPost.status`.
 * NOTE(review): the code that assigns these values is not part of this section;
 * confirm the exact meaning of each value there.
 */
export type PostAnalysisStatus = 'new' | 'update' | 'conflict' | 'content-duplicate';
/**
 * Values carried by `AnalyzedMedia.status`: the same values as
 * `PostAnalysisStatus` plus `'missing'`.
 */
export type MediaAnalysisStatus = 'new' | 'update' | 'conflict' | 'content-duplicate' | 'missing';
/** How to resolve a slug conflict during import */
export type ImportConflictResolution = 'ignore' | 'overwrite' | 'import';
/**
 * Analysis record for one WXR post: the parsed post plus its status, a content
 * hash, a Markdown preview and, optionally, details of an existing post.
 */
export interface AnalyzedPost {
/** The post as typed by the WXR parser module. */
wxrPost: WxrPost;
/** Analysis outcome for this post. */
status: PostAnalysisStatus;
/** Hash of the post content. NOTE(review): presumably comparable with `existingPost.checksum` — confirm. */
contentHash: string;
/** Markdown text shown as a preview. NOTE(review): presumably derived from the post's HTML content — confirm. */
markdownPreview: string;
/** How to resolve conflict (only relevant when status is 'conflict'). Default is 'ignore'. */
conflictResolution?: ImportConflictResolution;
/** Details of the existing post, when one is attached to this record. */
existingPost?: {
/** Identifier of the existing post */
id: string;
/** Title of the existing post */
title: string;
/** Slug of the existing post */
slug: string;
/** Stored checksum of the existing post; may be null */
checksum: string | null;
/** Date the existing post was created/published */
pubDate: string | null;
/** Excerpt from existing post */
excerpt: string | null;
/** Author of the existing post */
author: string | null;
/** Tags of the existing post */
tags: string[];
/** Categories of the existing post */
categories: string[];
};
}
/**
 * Analysis record for one WXR media item: the parsed item plus its status, an
 * optional file hash and, optionally, details of an existing media entry.
 */
export interface AnalyzedMedia {
/** The media item as typed by the WXR parser module. */
wxrMedia: WxrMedia;
/** Analysis outcome for this media item. */
status: MediaAnalysisStatus;
/** Hash of the media file, or null. NOTE(review): presumably null when the file could not be read — confirm. */
fileHash: string | null;
/** How to resolve conflict (only relevant when status is 'conflict'). Default is 'ignore'. */
conflictResolution?: ImportConflictResolution;
/** Details of the existing media entry, when one is attached to this record. */
existingMedia?: {
/** Identifier of the existing media entry */
id: string;
/** Original file name of the existing media entry */
originalName: string;
/** Stored checksum of the existing media entry; may be null */
checksum: string | null;
};
}
/**
 * Analysis record for one category: its name and slug, whether it already
 * exists in the project, and an optional name to map it to on import.
 */
export interface AnalyzedCategory {
/** Category name */
name: string;
/** Category slug */
slug: string;
/** True when the category already exists in the project */
existsInProject: boolean;
mappedTo?: string; // When set, indicates this item should be mapped to the given name on import
}
/**
 * Analysis record for one tag: its name and slug, whether it already exists in
 * the project, and an optional name to map it to on import.
 */
export interface AnalyzedTag {
/** Tag name */
name: string;
/** Tag slug */
slug: string;
/** True when the tag already exists in the project */
existsInProject: boolean;
mappedTo?: string; // When set, indicates this item should be mapped to the given name on import
}
/**
 * Validation status for a macro usage.
 *
 * As assigned in `analyzeMacros`: a usage stays `'unknown'` when no definition
 * exists for its macro; with a definition it is `'invalid'` when the
 * definition's `validate` returns an error, and `'valid'` when `validate`
 * returns no error or the definition has no `validate` at all.
 */
export type MacroValidationStatus = 'valid' | 'invalid' | 'unknown';
/** A single unique usage pattern of a macro */
export interface MacroUsage {
/** The parameters used in this particular usage */
params: Record -> 
// This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
this.turndown.addRule('linkedImage', {
  // Match <a> elements whose only meaningful child is a single <img>;
  // whitespace-only text nodes (nodeType 3) are ignored.
  filter: (node) => {
    if (node.nodeName !== 'A') return false;
    const children = Array.from(node.childNodes).filter(
      child => !(child.nodeType === 3 && !child.textContent?.trim())
    );
    return children.length === 1 && children[0].nodeName === 'IMG';
  },
  // Collapse the link + thumbnail pair into one Markdown image.
  replacement: (_content, node) => {
    const anchor = node as HTMLAnchorElement;
    const img = anchor.querySelector('img');
    if (!img) return '';
    const href = anchor.getAttribute('href') || '';
    const imgSrc = img.getAttribute('src') || '';
    const imgAlt = img.getAttribute('alt') || '';
    const imgTitle = img.getAttribute('title') || '';
    // Check if the link href points to an image (common WordPress pattern for "click for larger")
    const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;
    const hrefIsImage = imageExtensions.test(href);
    // Determine which URL to use:
    // - If href is an image URL (WordPress "click for full-size" pattern), use the href
    // - Otherwise, use the original image src
    const imageUrl = hrefIsImage ? href : imgSrc;
    // Derive alt text: prefer alt, then title, then cleaned filename
    let altText = imgAlt.trim();
    if (!altText) {
      altText = imgTitle.trim();
    }
    if (!altText) {
      // Extract filename from the image URL as last resort
      const urlPath = imageUrl.split('?')[0]; // Remove query string
      const filename = urlPath.split('/').pop() || '';
      // Clean the filename: remove extension and replace underscores with spaces
      altText = filename.replace(/\.[^.]+$/, '').replace(/_/g, ' ');
    }
    // Build the markdown image (without title attribute). Returning an empty
    // string here would silently drop every linked image from the output.
    return `![${altText}](${imageUrl})`;
  },
});
// Flash embeds have no Markdown equivalent, so they become a fixed placeholder
this.turndown.addRule('flashEmbed', {
  filter: (node) => {
    if (node.nodeName !== 'EMBED') {
      return false;
    }
    const element = node as HTMLEmbedElement;
    const mimeType = (element.getAttribute('type') || '').toLowerCase();
    const source = (element.getAttribute('src') || '').toLowerCase();
    // A Flash/Shockwave type attribute or a .swf source identifies Flash content
    if (mimeType.includes('flash') || mimeType.includes('shockwave')) {
      return true;
    }
    return source.endsWith('.swf');
  },
  replacement: () => 'FLASH PLAYER NOT SUPPORTED',
});
// Load macro definitions from shared config
this.loadMacroConfigsFromShared();
}
/**
 * Fill `macroDefinitions` from the shared macro config map.
 * Invoked by the constructor. When the lookup throws, a warning is logged and
 * no definitions are added, so `analyzeMacros` reports those macros as unmapped.
 */
private loadMacroConfigsFromShared(): void {
  try {
    // Keep only what the analysis reads: the macro name and its validator
    for (const [macroName, { name, validate }] of getMacroConfigMap()) {
      this.macroDefinitions.set(macroName, { name, validate });
    }
  } catch (error) {
    console.warn('Could not load macro configs:', error);
  }
}
/**
 * Store the given project id on this instance as `currentProjectId`.
 * NOTE(review): presumably scopes later lookups to that project — confirm against the callers.
 * @param projectId Identifier of the project to use.
 */
setProjectContext(projectId: string): void {
this.currentProjectId = projectId;
}
/**
* Set macro definitions for mapping and validation.
* This overrides the auto-loaded shared config. Useful for testing.
* @param definitions Map of macro name (lowercase) to definition
*/
setMacroDefinitions(definitions: Map
blocks containing newlines in tags
const withCodeBlocks = this.wrapMultilineCode(html);
// Preprocess: Convert newlines within text to <br> tags to preserve line breaks
const preprocessed = this.preserveLineBreaks(withCodeBlocks);
let markdown = this.turndown.turndown(preprocessed);
// Normalize non-breaking spaces to regular spaces
markdown = markdown.replace(/\u00A0/g, ' ');
// Clean up trailing whitespace from each line, but preserve "> " for blockquote continuation
markdown = markdown.split('\n').map(line => {
const trimmed = line.trimEnd();
// Preserve space after ">" for blockquote continuation lines
if (trimmed === '>' && line.startsWith('> ')) {
return '> ';
}
return trimmed;
}).join('\n');
// Normalize multiple blank lines (3+ consecutive newlines → 2 newlines)
markdown = markdown.replace(/\n{3,}/g, '\n\n');
return markdown;
}
/**
* Preserve line breaks and paragraph structure in content.
*
* WordPress exports often have:
* - Plain text mixed with HTML
* - Double newlines representing paragraph breaks
* - Single newlines that should become <br>
*
* This function converts:
* - Double newlines (\n\n) to paragraph breaks (</p><p>)
* - Single newlines within text to <br>
* - Wraps content in
tags if it starts with plain text */ private preserveLineBreaks(html: string): string { if (!html || !html.trim()) return html; // Check if content starts with a tag or plain text const startsWithTag = /^\s* blocks from having their newlines modified const preBlocks: string[] = []; let protectedHtml = html.replace(/
([\s\S]*?)<\/pre>/g, (match) => {
const placeholder = `__PRE_BLOCK_${preBlocks.length}__`;
preBlocks.push(match);
return placeholder;
});
// If it starts with plain text, we need to handle the whole content differently
if (!startsWithTag) {
// First, convert double newlines to paragraph markers
let processed = protectedHtml.replace(/\n\n+/g, '\n');
// Convert remaining single newlines within text to
// (but not newlines that are just between tags)
processed = processed.replace(/>([^<]+) {
if (!textContent.trim()) {
return '>' + textContent + '<';
}
const preserved = textContent.replace(/\n/g, '
');
return '>' + preserved + '<';
});
// Also handle newlines at the start (before any tags)
processed = processed.replace(/^([^<]+)/g, (match, textContent: string) => {
if (!textContent.trim()) return match;
return textContent.replace(/\n/g, '
');
});
// Wrap in <p> tags if we added paragraph markers
if (processed.includes('
')) {
processed = '' + processed + '
';
}
// Restore protected blocks
preBlocks.forEach((block, i) => {
processed = processed.replace(`__PRE_BLOCK_${i}__`, block);
});
return processed;
}
// For content that starts with HTML, handle newlines within text content
let result = protectedHtml.replace(/>([^<]+) {
if (!textContent.trim()) {
return '>' + textContent + '<';
}
// First convert double newlines to paragraph breaks
let preserved = textContent.replace(/\n\n+/g, '');
// Then convert remaining single newlines to
preserved = preserved.replace(/\n/g, '
');
return '>' + preserved + '<';
});
// Also handle text at the END of content (after the last tag)
// This catches text after closing tags like --> or /> that goes to the end
result = result.replace(/>([^<]+)$/g, (match, textContent: string) => {
if (!textContent.trim()) {
return match;
}
// First convert double newlines to paragraph breaks
let preserved = textContent.replace(/\n\n+/g, '
');
// Then convert remaining single newlines to
preserved = preserved.replace(/\n/g, '
');
return '>' + preserved;
});
// Restore protected <pre> blocks
preBlocks.forEach((block, i) => {
result = result.replace(`__PRE_BLOCK_${i}__`, block);
});
return result;
}
/**
 * Wrap `<code>` blocks that contain newlines in `<pre>` tags.
 *
 * WordPress content sometimes uses a bare `<code>...</code>` element for
 * multi-line code without a `<pre>` wrapper. Standard HTML parsing treats that
 * as inline code and collapses its whitespace. Wrapping it in `<pre>` preserves
 * the formatting, and Turndown then converts it to a fenced Markdown code block.
 *
 * - `<code>` without a literal newline is left untouched (inline code).
 * - No check is made for an existing `<pre>` ancestor: a multi-line `<code>`
 *   that is already inside `<pre>` is wrapped again.
 * - Only the exact opening tag `<code>` (no attributes) is matched.
 *
 * @param html HTML fragment to preprocess; empty input is returned as-is.
 * @returns The fragment with multi-line `<code>` elements wrapped in `<pre>`.
 */
private wrapMultilineCode(html: string): string {
  if (!html) return html;
  // Lazy match so each <code>...</code> pair is handled separately; the
  // captured content may itself contain embedded HTML.
  return html.replace(/<code>([\s\S]*?)<\/code>/g, (match, content: string) => {
    // Only wrap if content contains newlines (multiline code block)
    if (!content.includes('\n')) {
      return match; // Leave inline code as-is
    }
    return '<pre><code>' + content + '</code></pre>';
  });
}
/**
 * Compute the hex-encoded MD5 digest of `content`.
 * @param content Text to hash.
 * @returns The digest as a hexadecimal string.
 */
private calculateChecksum(content: string): string {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}
/**
 * Analyze macros (WordPress shortcodes) found in post/page content.
 *
 * Every shortcode returned by `parseShortcodes` is grouped under its
 * lower-cased name. Within one macro, usages whose parameters serialize to the
 * same key (see `serializeParams`) are counted together. Each distinct usage is
 * then validated against the matching entry in `macroDefinitions`, if any.
 *
 * @param posts Posts/pages to scan; entries without content are skipped.
 * @returns The discovered macros sorted by name, with mapped/unmapped totals.
 */
private analyzeMacros(posts: WxrPost[]): MacroAnalysisSummary {
  // Per-macro accumulator, keyed by lower-cased macro name
  const macroMap = new Map<string, {
    name: string;
    totalCount: number;
    // Distinct parameter combinations, keyed by their serialized form
    usages: Map<string, { params: Record<string, string>; count: number }>;
    postSlugs: Set<string>;
  }>();

  // Pass 1: collect and aggregate every shortcode occurrence
  for (const post of posts) {
    if (!post.content) continue;
    for (const shortcode of this.parseShortcodes(post.content)) {
      const name = shortcode.name.toLowerCase();
      let macroData = macroMap.get(name);
      if (!macroData) {
        macroData = {
          name,
          totalCount: 0,
          usages: new Map(),
          postSlugs: new Set(),
        };
        macroMap.set(name, macroData);
      }
      macroData.totalCount++;
      macroData.postSlugs.add(post.slug);
      // Identical parameter sets share one usage entry
      const paramsKey = this.serializeParams(shortcode.params);
      const existingUsage = macroData.usages.get(paramsKey);
      if (existingUsage) {
        existingUsage.count++;
      } else {
        macroData.usages.set(paramsKey, { params: shortcode.params, count: 1 });
      }
    }
  }

  // Pass 2: convert to the final format and validate each distinct usage
  const discovered: DiscoveredMacro[] = [];
  for (const macroData of macroMap.values()) {
    const definition = this.macroDefinitions.get(macroData.name);
    const mapped = definition !== undefined;
    const usages: MacroUsage[] = [];
    for (const [paramsKey, usage] of macroData.usages) {
      // Without a definition the usage cannot be judged and stays 'unknown'
      let validationStatus: MacroValidationStatus = 'unknown';
      let validationError: string | undefined;
      if (definition) {
        // A definition without a validator accepts every usage
        const error = definition.validate ? definition.validate(usage.params) : undefined;
        if (error) {
          validationStatus = 'invalid';
          validationError = error;
        } else {
          validationStatus = 'valid';
        }
      }
      usages.push({
        params: usage.params,
        count: usage.count,
        validationStatus,
        validationError,
        paramsKey,
      });
    }
    discovered.push({
      name: macroData.name,
      mapped,
      totalCount: macroData.totalCount,
      usages,
      postSlugs: Array.from(macroData.postSlugs),
    });
  }

  // Sort discovered macros by name
  discovered.sort((a, b) => a.name.localeCompare(b.name));
  const mappedCount = discovered.filter(m => m.mapped).length;
  return {
    total: discovered.length,
    mappedCount,
    unmappedCount: discovered.length - mappedCount,
    discovered,
  };
}
/**
 * Parse WordPress shortcodes from content.
 *
 * Runs the class-level `SHORTCODE_REGEX` over `content`. Capture group 1 is
 * taken as the shortcode name and group 2 (or an empty string when absent) is
 * handed to `parseShortcodeParams`.
 *
 * @param content Raw post content to scan.
 * @returns One `{ name, params }` entry per match, in order of appearance.
 */
private parseShortcodes(content: string): Array<{ name: string; params: Record<string, string> }> {
  const shortcodes: Array<{ name: string; params: Record<string, string> }> = [];
  // The regex is a shared static, so rewind it before scanning from the start
  ImportAnalysisEngine.SHORTCODE_REGEX.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = ImportAnalysisEngine.SHORTCODE_REGEX.exec(content)) !== null) {
    const name = match[1];
    const paramString = match[2] || '';
    const params = this.parseShortcodeParams(paramString);
    shortcodes.push({ name, params });
  }
  return shortcodes;
}
/**
 * Parse parameters from a shortcode parameter string.
 * Supports: key="value", key='value', and key=value (unquoted).
 *
 * Runs the class-level `PARAM_REGEX` over `paramString`. When a key occurs more
 * than once, the last occurrence wins.
 *
 * @param paramString The text between the shortcode name and the closing bracket.
 * @returns Map of parameter name to value; a parameter with no captured value maps to ''.
 */
private parseShortcodeParams(paramString: string): Record<string, string> {
  const params: Record<string, string> = {};
  // The regex is a shared static, so rewind it before scanning from the start
  ImportAnalysisEngine.PARAM_REGEX.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = ImportAnalysisEngine.PARAM_REGEX.exec(paramString)) !== null) {
    const key = match[1];
    // Value is in group 2 (double-quoted), 3 (single-quoted), or 4 (unquoted)
    const value = match[2] ?? match[3] ?? match[4] ?? '';
    params[key] = value;
  }
  return params;
}
/**
 * Serialize params to a stable string for deduplication.
 *
 * The entries are sorted by key before serializing, so two parameter sets with
 * the same keys and values produce the same string regardless of the order in
 * which the keys were written.
 *
 * @param params Parameter name to value map.
 * @returns JSON text of the key-sorted `[key, value]` pairs.
 */
private serializeParams(params: Record<string, string>): string {
  const sorted = Object.entries(params).sort(([a], [b]) => a.localeCompare(b));
  return JSON.stringify(sorted);
}
}