// Source listing: bDS/src/main/engine/ImportAnalysisEngine.ts
// (TypeScript; reported by the file viewer as 899 lines, 32 KiB)

import crypto from 'crypto';
import * as fs from 'fs/promises';
import * as path from 'path';
import TurndownService from 'turndown';
import { getDatabase } from '../database';
import { posts, media, tags } from '../database/schema';
import { eq } from 'drizzle-orm';
import type { WxrData, WxrPost, WxrMedia, WxrSiteInfo, WxrCategory, WxrTag } from './WxrParser';
import { getMacroConfigMap, type MacroConfig } from '../config/macroConfig';
/**
 * Classification of an imported post/page relative to the project's existing posts:
 * - 'new': no existing post shares its slug or its content hash
 * - 'update': an existing post has the same slug and the same checksum
 * - 'conflict': an existing post has the same slug but a different checksum
 * - 'content-duplicate': no slug match, but an existing post has the same checksum
 */
export type PostAnalysisStatus = 'new' | 'update' | 'conflict' | 'content-duplicate';
/**
 * Classification of an imported media item relative to the project's existing media.
 * Same meaning as PostAnalysisStatus, except that matching is done on the
 * lower-cased file name instead of the slug, plus:
 * - 'missing': the file could not be read from the uploads folder (or no folder was given)
 */
export type MediaAnalysisStatus = 'new' | 'update' | 'conflict' | 'content-duplicate' | 'missing';
/**
 * How to resolve a slug conflict during import.
 * The analysis in this file only ever assigns 'ignore' (the default for conflicts);
 * the other values are presumably chosen later by the caller/UI — confirm against the import step.
 */
export type ImportConflictResolution = 'ignore' | 'overwrite' | 'import';
/** Result of analyzing one imported post or page against the project's existing posts. */
export interface AnalyzedPost {
/** The source item exactly as parsed from the WXR file */
wxrPost: WxrPost;
/** How this item relates to existing posts (see PostAnalysisStatus) */
status: PostAnalysisStatus;
/** MD5 hex digest of the item's content after conversion to Markdown */
contentHash: string;
/** First 200 characters of the converted Markdown */
markdownPreview: string;
/** How to resolve conflict (only relevant when status is 'conflict'). Default is 'ignore'. */
conflictResolution?: ImportConflictResolution;
/**
 * The existing post that was matched: the slug match for 'update'/'conflict',
 * the checksum match for 'content-duplicate'. Undefined for 'new'.
 */
existingPost?: {
id: string;
title: string;
slug: string;
checksum: string | null;
/** Date the existing post was created/published (ISO string; publishedAt, falling back to createdAt) */
pubDate: string | null;
/** Excerpt from existing post */
excerpt: string | null;
/** Author of the existing post */
author: string | null;
/** Tags of the existing post */
tags: string[];
/** Categories of the existing post */
categories: string[];
};
}
/** Result of analyzing one imported media item against the project's existing media. */
export interface AnalyzedMedia {
/** The source item exactly as parsed from the WXR file */
wxrMedia: WxrMedia;
/** How this item relates to existing media (see MediaAnalysisStatus) */
status: MediaAnalysisStatus;
/** MD5 hex digest computed from the file read from the uploads folder; null when status is 'missing' */
fileHash: string | null;
/**
 * How to resolve conflict (only relevant when status is 'conflict'). Default is 'ignore'.
 * Note: the media analysis in this file never sets this field; it is left undefined.
 */
conflictResolution?: ImportConflictResolution;
/**
 * The existing media row that was matched: the file-name match for 'update'/'conflict',
 * the checksum match for 'content-duplicate'. Undefined for 'new' and 'missing'.
 */
existingMedia?: {
id: string;
originalName: string;
checksum: string | null;
};
}
/** A category found in the WXR file, annotated with whether the project already knows it. */
export interface AnalyzedCategory {
name: string;
slug: string;
/** True when the category name matches (case-insensitively) a name in the project's existing tags */
existsInProject: boolean;
mappedTo?: string; // When set, indicates this item should be mapped to the given name on import
}
/** A tag found in the WXR file, annotated with whether the project already knows it. */
export interface AnalyzedTag {
name: string;
slug: string;
/** True when the tag name matches (case-insensitively) a name in the project's existing tags */
existsInProject: boolean;
mappedTo?: string; // When set, indicates this item should be mapped to the given name on import
}
/**
 * Validation status for a macro usage:
 * - 'valid': the macro has a definition and its validator returned no error (or it has no validator)
 * - 'invalid': the macro has a definition and its validator returned an error message
 * - 'unknown': the macro has no definition, so the usage could not be validated
 */
export type MacroValidationStatus = 'valid' | 'invalid' | 'unknown';
/** A single unique usage pattern of a macro (one distinct parameter combination) */
export interface MacroUsage {
/** The parameters used in this particular usage */
params: Record<string, string>;
/** How many times this exact parameter combination was used */
count: number;
/** Whether this usage is valid according to our macro definition */
validationStatus: MacroValidationStatus;
/** Error message if validation failed (only set when validationStatus is 'invalid') */
validationError?: string;
/** Serialized params for deduplication: JSON of the [key, value] pairs sorted by key */
paramsKey: string;
}
/** A discovered macro (WordPress shortcode) from the import content */
export interface DiscoveredMacro {
/** The macro name (lowercase) */
name: string;
/** Whether this macro maps to an internal definition (looked up by the lowercase name) */
mapped: boolean;
/** Total number of times this macro appears across all content */
totalCount: number;
/** Unique usages with different parameters */
usages: MacroUsage[];
/** Slugs of posts/pages where this macro is used (each slug listed once) */
postSlugs: string[];
}
/** Summary of macro analysis across all imported posts and pages */
export interface MacroAnalysisSummary {
/** Total unique macros discovered */
total: number;
/** Number of macros that map to internal definitions */
mappedCount: number;
/** Number of macros that don't map to internal definitions */
unmappedCount: number;
/** All discovered macros with their usages, sorted by name */
discovered: DiscoveredMacro[];
}
/** Minimal interface for macro definition validation */
export interface MacroDefinitionLike {
name: string;
/**
 * Optional validator for one usage's parameters. A truthy return value is treated
 * as the error message; a falsy return value means the usage is valid.
 */
validate?: (params: Record<string, string>) => string | undefined;
}
/** Complete result of analyzing one WXR file against the current project. */
export interface ImportAnalysisReport {
/** The source file identifier passed to the analysis (returned unchanged) */
sourceFile: string;
/** Site information taken from the parsed WXR data */
site: WxrSiteInfo;
/** Time at which the report was assembled */
analyzedAt: Date;
/** Posts: per-status counts plus the individual analysis results */
posts: {
total: number;
new: number;
updates: number;
conflicts: number;
contentDuplicates: number;
items: AnalyzedPost[];
};
/** Pages: analyzed with the same rules and the same existing-post lookups as posts */
pages: {
total: number;
new: number;
updates: number;
conflicts: number;
contentDuplicates: number;
items: AnalyzedPost[];
};
/** Media: per-status counts (including files missing from the uploads folder) plus the results */
media: {
total: number;
new: number;
updates: number;
conflicts: number;
contentDuplicates: number;
missing: number;
items: AnalyzedMedia[];
};
/** Categories from the WXR file, flagged by whether their name exists in the project's tags */
categories: AnalyzedCategory[];
/** Tags from the WXR file, flagged by whether their name exists in the project's tags */
tags: AnalyzedTag[];
/** Shortcodes discovered in post and page content */
macros: MacroAnalysisSummary;
}
/** Row shape selected from the `posts` table by `analyzeWxr` for comparison with imported items. */
type ImportExistingPostRow = {
  id: string;
  slug: string;
  title: string;
  checksum: string | null;
  excerpt: string | null;
  author: string | null;
  publishedAt: Date | null;
  createdAt: Date;
  status: string;
  tags: string | null;
  categories: string | null;
};

/** Row shape selected from the `media` table by `analyzeWxr`. */
type ImportExistingMediaRow = {
  id: string;
  originalName: string;
  checksum: string | null;
};

/**
 * Compares parsed WXR (WordPress export) data with the posts, media and tags
 * stored for the current project and produces an {@link ImportAnalysisReport}.
 *
 * The code in this class only reads: it selects rows from the local database and
 * reads files from the uploads folder in order to hash them.
 */
export class ImportAnalysisEngine {
  private currentProjectId = '';
  private turndown: TurndownService;
  private macroDefinitions: Map<string, MacroDefinitionLike> = new Map();

  // Progress callback for reporting analysis steps
  onProgress?: (step: string, detail?: string) => void;

  // Regex to match WordPress shortcodes: [macroname param="val" param2='val2']
  // This matches single brackets (NOT double brackets like our internal format)
  // Uses negative lookbehind (?<!\[) and negative lookahead (?!\]) to exclude [[...]]
  private static readonly SHORTCODE_REGEX = /(?<!\[)\[(\w+)([^\]]*?)(?:\s*\/)?\](?!\])/g;

  // Regex to extract individual parameters from shortcode
  // Supports: key="value", key='value', and key=value (unquoted)
  private static readonly PARAM_REGEX = /(\w+)=(?:"([^"]*)"|'([^']*)'|([^\s\]"']+))/g;

  /**
   * Creates the HTML-to-Markdown converter, registers the WordPress-specific
   * conversion rules and loads the shared macro definitions.
   */
  constructor() {
    this.turndown = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
      emDelimiter: '*',
    });

    // Custom rule for list items: use single space after marker instead of multiple spaces
    this.turndown.addRule('listItem', {
      filter: 'li',
      replacement: (content, node, options) => {
        const body = content
          .replace(/^\n+/, '') // Remove leading newlines
          .replace(/\n+$/, '\n') // Replace trailing newlines with single newline
          // Indent subsequent lines with 2 spaces (the width of the "- " marker) so
          // that nested lists and continuation lines stay inside their list item.
          .replace(/\n/gm, '\n  ');

        const parent = node.parentNode as HTMLElement;
        let prefix = options.bulletListMarker + ' ';
        if (parent?.nodeName === 'OL') {
          const start = parent.getAttribute('start');
          const index = Array.prototype.indexOf.call(parent.children, node);
          const parsedStart = start ? parseInt(start, 10) : 1;
          // A non-numeric start attribute would otherwise render as "NaN. "
          const startNum = Number.isNaN(parsedStart) ? 1 : parsedStart;
          prefix = (startNum + index) + '. ';
        }

        return prefix + body + (node.nextSibling && !/\n$/.test(body) ? '\n' : '');
      },
    });

    // Custom rule for standalone images with empty alt but title attribute
    // WordPress often uses title="name" with alt=""
    this.turndown.addRule('imageWithTitle', {
      filter: (node) => {
        if (node.nodeName !== 'IMG') return false;
        // Images inside an <a> tag are left to the linkedImage rule
        if (node.parentNode?.nodeName === 'A') return false;
        // Only match if alt is empty but title exists
        const img = node as HTMLImageElement;
        const alt = img.getAttribute('alt') || '';
        const title = img.getAttribute('title') || '';
        return !alt.trim() && title.trim().length > 0;
      },
      replacement: (_content, node) => {
        const img = node as HTMLImageElement;
        const src = img.getAttribute('src') || '';
        const title = img.getAttribute('title') || '';
        return `![${title}](${src})`;
      },
    });

    // Matches URLs that point at an image file (optionally followed by a query string)
    const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;

    // Custom rule for linked images: <a><img></a> -> ![alt](src)
    // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
    this.turndown.addRule('linkedImage', {
      filter: (node) => {
        // Match <a> tags that contain only an <img> (possibly with whitespace)
        if (node.nodeName !== 'A') return false;
        const children = Array.from(node.childNodes).filter(
          child => !(child.nodeType === 3 && !child.textContent?.trim())
        );
        return children.length === 1 && children[0].nodeName === 'IMG';
      },
      replacement: (_content, node) => {
        const anchor = node as HTMLAnchorElement;
        const img = anchor.querySelector('img');
        if (!img) return '';

        const href = anchor.getAttribute('href') || '';
        const imgSrc = img.getAttribute('src') || '';
        const imgAlt = img.getAttribute('alt') || '';
        const imgTitle = img.getAttribute('title') || '';

        // If href is an image URL (WordPress "click for full-size" pattern), use the href;
        // otherwise, use the original image src
        const imageUrl = imageExtensions.test(href) ? href : imgSrc;

        // Derive alt text: prefer alt, then title, then cleaned filename
        let altText = imgAlt.trim();
        if (!altText) {
          altText = imgTitle.trim();
        }
        if (!altText) {
          // Extract filename from the image URL as last resort
          const urlPath = imageUrl.split('?')[0]; // Remove query string
          const filename = urlPath.split('/').pop() || '';
          // Clean the filename: remove extension and replace underscores with spaces
          altText = filename.replace(/\.[^.]+$/, '').replace(/_/g, ' ');
        }

        // Build the markdown image link (without title attribute)
        return `![${altText}](${imageUrl})`;
      },
    });

    // Custom rule for Flash embeds - replace with placeholder text
    this.turndown.addRule('flashEmbed', {
      filter: (node) => {
        if (node.nodeName !== 'EMBED') return false;
        const embed = node as HTMLEmbedElement;
        const type = (embed.getAttribute('type') || '').toLowerCase();
        const src = (embed.getAttribute('src') || '').toLowerCase();
        // Match Flash content by type or file extension
        return type.includes('flash') || type.includes('shockwave') || src.endsWith('.swf');
      },
      replacement: () => 'FLASH PLAYER NOT SUPPORTED',
    });

    // Load macro definitions from shared config
    this.loadMacroConfigsFromShared();
  }

  /**
   * Load macro definitions from the shared macro config.
   * Called automatically in constructor. If the config cannot be loaded a warning
   * is logged and every discovered macro is later reported as unmapped.
   */
  private loadMacroConfigsFromShared(): void {
    try {
      const configs = getMacroConfigMap();
      // Convert MacroConfig to MacroDefinitionLike
      for (const [name, config] of configs) {
        this.macroDefinitions.set(name, {
          name: config.name,
          validate: config.validate,
        });
      }
    } catch (error) {
      // Config not available - macros will be marked as unmapped
      console.warn('Could not load macro configs:', error);
    }
  }

  /**
   * Select the project whose posts, media and tags subsequent analyses compare against.
   * @param projectId Identifier used to filter the database queries in `analyzeWxr`
   */
  setProjectContext(projectId: string): void {
    this.currentProjectId = projectId;
  }

  /**
   * Set macro definitions for mapping and validation.
   * This overrides the auto-loaded shared config. Useful for testing.
   * @param definitions Map of macro name (lowercase) to definition
   */
  setMacroDefinitions(definitions: Map<string, MacroDefinitionLike>): void {
    this.macroDefinitions = definitions;
  }

  /**
   * Analyze parsed WXR data against the current project.
   *
   * @param wxrData Parsed WXR content (posts, pages, media, categories, tags, site info)
   * @param sourceFile Identifier of the source file; copied into the report unchanged
   * @param uploadsFolder Folder containing the exported media files. When omitted,
   *   every media item is reported as 'missing'.
   * @returns The full analysis report; `onProgress` is invoked before each step
   */
  async analyzeWxr(wxrData: WxrData, sourceFile: string, uploadsFolder?: string): Promise<ImportAnalysisReport> {
    const db = getDatabase().getLocal();

    this.onProgress?.('Loading existing posts...');
    // Fetch existing posts for this project
    const existingPosts = await db
      .select({
        id: posts.id,
        slug: posts.slug,
        title: posts.title,
        checksum: posts.checksum,
        excerpt: posts.excerpt,
        author: posts.author,
        publishedAt: posts.publishedAt,
        createdAt: posts.createdAt,
        status: posts.status,
        tags: posts.tags,
        categories: posts.categories,
      })
      .from(posts)
      .where(eq(posts.projectId, this.currentProjectId))
      .all();

    this.onProgress?.('Loading existing media...', `${existingPosts.length} posts in project`);
    // Fetch existing media for this project
    const existingMedia = await db
      .select({
        id: media.id,
        originalName: media.originalName,
        checksum: media.checksum,
      })
      .from(media)
      .where(eq(media.projectId, this.currentProjectId))
      .all();

    this.onProgress?.('Loading existing tags...', `${existingMedia.length} media in project`);
    // Fetch existing tags for this project
    const existingTags = await db
      .select({
        name: tags.name,
      })
      .from(tags)
      .where(eq(tags.projectId, this.currentProjectId))
      .all();

    // Build lookup maps for posts (by slug and by checksum)
    const slugToPost = new Map<string, typeof existingPosts[0]>();
    const checksumToPost = new Map<string, typeof existingPosts[0]>();
    for (const post of existingPosts) {
      slugToPost.set(post.slug, post);
      if (post.checksum) {
        checksumToPost.set(post.checksum, post);
      }
    }

    // Build lookup maps for media (by lower-cased original name and by checksum)
    const nameToMedia = new Map<string, typeof existingMedia[0]>();
    const checksumToMedia = new Map<string, typeof existingMedia[0]>();
    for (const m of existingMedia) {
      nameToMedia.set(m.originalName.toLowerCase(), m);
      if (m.checksum) {
        checksumToMedia.set(m.checksum, m);
      }
    }

    // Build tag set (lower-cased for case-insensitive lookups)
    const existingTagNames = new Set(existingTags.map(t => t.name.toLowerCase()));

    this.onProgress?.('Analyzing posts...', `${wxrData.posts.length} posts to analyze`);
    const analyzedPosts = this.analyzePostItems(wxrData.posts, slugToPost, checksumToPost);

    this.onProgress?.('Analyzing pages...', `${wxrData.pages.length} pages to analyze`);
    const analyzedPages = this.analyzePostItems(wxrData.pages, slugToPost, checksumToPost);

    this.onProgress?.('Analyzing media files...', `${wxrData.media.length} media files to analyze`);
    const analyzedMedia = await this.analyzeMediaItems(wxrData.media, nameToMedia, checksumToMedia, uploadsFolder);

    this.onProgress?.('Processing categories and tags...');
    // Categories are checked against the same set of existing tag names as tags are
    const analyzedCategories: AnalyzedCategory[] = wxrData.categories.map(cat => ({
      name: cat.name,
      slug: cat.slug,
      existsInProject: existingTagNames.has(cat.name.toLowerCase()),
    }));
    const analyzedTags: AnalyzedTag[] = wxrData.tags.map(tag => ({
      name: tag.name,
      slug: tag.slug,
      existsInProject: existingTagNames.has(tag.name.toLowerCase()),
    }));

    this.onProgress?.('Discovering macros...');
    // Analyze macros from posts and pages content
    const macroAnalysis = this.analyzeMacros([...wxrData.posts, ...wxrData.pages]);

    return {
      sourceFile,
      site: wxrData.site,
      analyzedAt: new Date(),
      posts: this.summarizePostAnalysis(analyzedPosts),
      pages: this.summarizePostAnalysis(analyzedPages),
      media: this.summarizeMediaAnalysis(analyzedMedia),
      categories: analyzedCategories,
      tags: analyzedTags,
      macros: macroAnalysis,
    };
  }

  /**
   * Classify each imported post/page against the existing posts.
   *
   * A slug match takes precedence over a checksum match: same slug + same checksum
   * is an 'update', same slug + different checksum is a 'conflict', and a checksum
   * match without a slug match is a 'content-duplicate'. Everything else is 'new'.
   *
   * @param wxrPosts Imported posts or pages
   * @param slugToPost Existing posts keyed by slug
   * @param checksumToPost Existing posts keyed by checksum
   */
  private analyzePostItems(
    wxrPosts: WxrPost[],
    slugToPost: Map<string, ImportExistingPostRow>,
    checksumToPost: Map<string, ImportExistingPostRow>,
  ): AnalyzedPost[] {
    return wxrPosts.map(wxrPost => {
      const markdown = this.convertToMarkdown(wxrPost.content);
      const contentHash = this.calculateChecksum(markdown);
      const markdownPreview = markdown.substring(0, 200);

      const existingBySlug = slugToPost.get(wxrPost.slug);
      const existingByHash = checksumToPost.get(contentHash);

      let status: PostAnalysisStatus;
      let existingPost: AnalyzedPost['existingPost'];
      if (existingBySlug) {
        status = existingBySlug.checksum === contentHash ? 'update' : 'conflict';
        existingPost = this.describeExistingPost(existingBySlug);
      } else if (existingByHash) {
        status = 'content-duplicate';
        existingPost = this.describeExistingPost(existingByHash);
      } else {
        status = 'new';
      }

      // For conflicts, default resolution is 'ignore'
      const conflictResolution = status === 'conflict' ? 'ignore' as const : undefined;
      return { wxrPost, status, contentHash, markdownPreview, existingPost, conflictResolution };
    });
  }

  /** Build the `existingPost` summary reported for a matched database row. */
  private describeExistingPost(row: ImportExistingPostRow): NonNullable<AnalyzedPost['existingPost']> {
    const existingDate = row.publishedAt || row.createdAt;
    return {
      id: row.id,
      title: row.title,
      slug: row.slug,
      checksum: row.checksum,
      pubDate: existingDate ? existingDate.toISOString() : null,
      excerpt: row.excerpt,
      author: row.author,
      tags: this.parseStringList(row.tags),
      categories: this.parseStringList(row.categories),
    };
  }

  /**
   * Parse a JSON-encoded list of strings stored in a text column.
   * Returns an empty list for null/empty input, malformed JSON or a non-array value,
   * so that one bad row cannot abort the whole analysis.
   */
  private parseStringList(raw: string | null): string[] {
    if (!raw) return [];
    try {
      const parsed: unknown = JSON.parse(raw);
      if (!Array.isArray(parsed)) return [];
      return parsed.filter((value): value is string => typeof value === 'string');
    } catch {
      return [];
    }
  }

  /**
   * Classify each imported media item against the existing media.
   *
   * Files are read one at a time from `uploadsFolder`; an item whose file cannot be
   * read (or any item when no folder is given) is reported as 'missing'. Otherwise
   * the same precedence as for posts applies, keyed on the lower-cased file name.
   *
   * @param wxrMediaItems Imported media entries
   * @param nameToMedia Existing media keyed by lower-cased original name
   * @param checksumToMedia Existing media keyed by checksum
   * @param uploadsFolder Folder that `wxrMedia.relativePath` is resolved against
   */
  private async analyzeMediaItems(
    wxrMediaItems: WxrMedia[],
    nameToMedia: Map<string, ImportExistingMediaRow>,
    checksumToMedia: Map<string, ImportExistingMediaRow>,
    uploadsFolder?: string,
  ): Promise<AnalyzedMedia[]> {
    const results: AnalyzedMedia[] = [];

    for (const wxrMedia of wxrMediaItems) {
      const fileHash = await this.hashUploadedFile(uploadsFolder, wxrMedia.relativePath);
      if (fileHash === null) {
        results.push({ wxrMedia, status: 'missing', fileHash: null });
        continue;
      }

      const existingByName = nameToMedia.get(wxrMedia.filename.toLowerCase());
      const existingByHash = checksumToMedia.get(fileHash);

      let status: MediaAnalysisStatus;
      let existingMedia: AnalyzedMedia['existingMedia'];
      if (existingByName) {
        status = existingByName.checksum === fileHash ? 'update' : 'conflict';
        existingMedia = {
          id: existingByName.id,
          originalName: existingByName.originalName,
          checksum: existingByName.checksum,
        };
      } else if (existingByHash) {
        status = 'content-duplicate';
        existingMedia = {
          id: existingByHash.id,
          originalName: existingByHash.originalName,
          checksum: existingByHash.checksum,
        };
      } else {
        status = 'new';
      }

      results.push({ wxrMedia, status, fileHash, existingMedia });
    }

    return results;
  }

  /**
   * Read a media file from the uploads folder and return its checksum.
   * Returns null when no folder was given or the file could not be read.
   */
  private async hashUploadedFile(uploadsFolder: string | undefined, relativePath: string): Promise<string | null> {
    if (!uploadsFolder) return null;
    try {
      const buffer = await fs.readFile(path.join(uploadsFolder, relativePath));
      // NOTE(review): the bytes are turned into a 'binary' (latin1) string and then
      // hashed as a string, which is UTF-8 encoded again by the hash. The result is
      // therefore NOT the MD5 of the raw file bytes. Kept as-is because the stored
      // media checksums may have been produced the same way — confirm against the
      // code that writes `media.checksum` before changing this.
      return this.calculateChecksum(buffer.toString('binary'));
    } catch {
      // File not found in uploads folder (or otherwise unreadable)
      return null;
    }
  }

  /** Count post/page analysis results per status. */
  private summarizePostAnalysis(items: AnalyzedPost[]): ImportAnalysisReport['posts'] {
    const countOf = (status: PostAnalysisStatus): number =>
      items.filter(item => item.status === status).length;
    return {
      total: items.length,
      new: countOf('new'),
      updates: countOf('update'),
      conflicts: countOf('conflict'),
      contentDuplicates: countOf('content-duplicate'),
      items,
    };
  }

  /** Count media analysis results per status. */
  private summarizeMediaAnalysis(items: AnalyzedMedia[]): ImportAnalysisReport['media'] {
    const countOf = (status: MediaAnalysisStatus): number =>
      items.filter(item => item.status === status).length;
    return {
      total: items.length,
      new: countOf('new'),
      updates: countOf('update'),
      conflicts: countOf('conflict'),
      contentDuplicates: countOf('content-duplicate'),
      missing: countOf('missing'),
      items,
    };
  }

  /**
   * Convert WordPress post HTML to normalized Markdown.
   * Empty or whitespace-only input yields an empty string.
   */
  private convertToMarkdown(html: string): string {
    if (!html || !html.trim()) return '';

    // Preprocess: Wrap standalone <code> blocks containing newlines in <pre> tags
    const withCodeBlocks = this.wrapMultilineCode(html);
    // Preprocess: Convert newlines within text to <br> tags to preserve line breaks
    const preprocessed = this.preserveLineBreaks(withCodeBlocks);

    let markdown = this.turndown.turndown(preprocessed);

    // Normalize non-breaking spaces to regular spaces
    markdown = markdown.replace(/\u00A0/g, ' ');

    // Clean up trailing whitespace from each line, but preserve "> " for blockquote continuation
    markdown = markdown
      .split('\n')
      .map(line => {
        const trimmed = line.trimEnd();
        // Preserve space after ">" for blockquote continuation lines
        if (trimmed === '>' && line.startsWith('> ')) {
          return '> ';
        }
        return trimmed;
      })
      .join('\n');

    // Normalize multiple blank lines (3+ consecutive newlines → 2 newlines)
    return markdown.replace(/\n{3,}/g, '\n\n');
  }

  /**
   * Preserve line breaks and paragraph structure in content.
   *
   * WordPress exports often have:
   * - Plain text mixed with HTML
   * - Double newlines representing paragraph breaks
   * - Single newlines that should become <br>
   *
   * This function converts:
   * - Double newlines (\n\n) to paragraph breaks (</p><p>)
   * - Single newlines within text to <br>
   * - Wraps content in <p> tags if it starts with plain text and contains paragraph breaks
   *
   * Attribute-less <pre>...</pre> blocks are swapped for placeholders while the
   * newlines are rewritten and restored verbatim afterwards.
   */
  private preserveLineBreaks(html: string): string {
    if (!html || !html.trim()) return html;

    // Check if content starts with a tag or plain text
    const startsWithTag = /^\s*</.test(html);

    // Protect <pre> blocks from having their newlines modified
    const preBlocks: string[] = [];
    const protectedHtml = html.replace(/<pre>([\s\S]*?)<\/pre>/g, (match) => {
      const placeholder = `__PRE_BLOCK_${preBlocks.length}__`;
      preBlocks.push(match);
      return placeholder;
    });

    // Put the protected blocks back. A function replacer is required here: with a
    // string replacement, "$$", "$&", "$`" and "$'" inside the code would be
    // interpreted as replacement patterns and corrupt the block.
    const restorePreBlocks = (text: string): string =>
      preBlocks.reduce(
        (restored, block, i) => restored.replace(`__PRE_BLOCK_${i}__`, () => block),
        text,
      );

    // If it starts with plain text, we need to handle the whole content differently
    if (!startsWithTag) {
      // First, convert double newlines to paragraph markers
      let processed = protectedHtml.replace(/\n\n+/g, '</p>\n<p>');

      // Convert remaining single newlines within text to <br>
      // (but not newlines that are just between tags)
      processed = processed.replace(/>([^<]+)</g, (_match, textContent: string) => {
        if (!textContent.trim()) {
          return '>' + textContent + '<';
        }
        return '>' + textContent.replace(/\n/g, '<br>') + '<';
      });

      // Also handle newlines at the start (before any tags)
      processed = processed.replace(/^([^<]+)/g, (match, textContent: string) => {
        if (!textContent.trim()) return match;
        return textContent.replace(/\n/g, '<br>');
      });

      // Wrap in <p> if we added paragraph markers
      if (processed.includes('</p>')) {
        processed = '<p>' + processed + '</p>';
      }

      return restorePreBlocks(processed);
    }

    // Double newlines become paragraph breaks first, then the remaining single newlines become <br>
    const breakText = (text: string): string =>
      text.replace(/\n\n+/g, '</p><p>').replace(/\n/g, '<br>');

    // For content that starts with HTML, handle newlines within text content
    let result = protectedHtml.replace(/>([^<]+)</g, (_match, textContent: string) => {
      if (!textContent.trim()) {
        return '>' + textContent + '<';
      }
      return '>' + breakText(textContent) + '<';
    });

    // Also handle text at the END of content (after the last tag)
    // This catches text after closing tags like --> or /> that goes to the end
    result = result.replace(/>([^<]+)$/g, (match, textContent: string) => {
      if (!textContent.trim()) {
        return match;
      }
      return '>' + breakText(textContent);
    });

    return restorePreBlocks(result);
  }

  /**
   * Wrap attribute-less <code> blocks containing newlines in <pre> tags.
   *
   * WordPress content sometimes uses <code>...</code> for multi-line code blocks
   * without a <pre> wrapper. Standard HTML parsing treats this as inline code and
   * collapses whitespace. By wrapping in <pre>, we preserve the formatting and
   * Turndown will convert it to a fenced Markdown code block.
   *
   * Only <code> blocks that contain literal newlines are wrapped; inline code is
   * left untouched. Note that no check is made for an enclosing <pre>: a multi-line
   * <code> that is already inside <pre> gets a second, nested <pre> wrapper and the
   * HTML parser is relied upon to cope with that.
   */
  private wrapMultilineCode(html: string): string {
    if (!html) return html;

    return html.replace(/<code>([\s\S]*?)<\/code>/g, (match, content: string) => {
      // Leave inline code (no newline) as-is
      if (!content.includes('\n')) {
        return match;
      }
      return '<pre><code>' + content + '</code></pre>';
    });
  }

  /** MD5 hex digest of a string (the string is hashed using its UTF-8 encoding). */
  private calculateChecksum(content: string): string {
    return crypto.createHash('md5').update(content).digest('hex');
  }

  /**
   * Analyze macros (WordPress shortcodes) from post/page content.
   * Discovers all shortcodes, aggregates their usages, and validates against definitions.
   *
   * @param posts Posts and pages whose raw content is scanned; items without content are skipped
   * @returns Discovered macros sorted by name, with mapped/unmapped counts
   */
  private analyzeMacros(posts: WxrPost[]): MacroAnalysisSummary {
    // Map of macro name -> discovered macro data
    const macroMap = new Map<string, {
      name: string;
      totalCount: number;
      usages: Map<string, { params: Record<string, string>; count: number }>;
      postSlugs: Set<string>;
    }>();

    // Process each post/page
    for (const post of posts) {
      if (!post.content) continue;

      for (const shortcode of this.parseShortcodes(post.content)) {
        const name = shortcode.name.toLowerCase();
        let macroData = macroMap.get(name);
        if (!macroData) {
          macroData = {
            name,
            totalCount: 0,
            usages: new Map(),
            postSlugs: new Set(),
          };
          macroMap.set(name, macroData);
        }
        macroData.totalCount++;
        macroData.postSlugs.add(post.slug);

        // Create a key for this parameter combination
        const paramsKey = this.serializeParams(shortcode.params);
        const existingUsage = macroData.usages.get(paramsKey);
        if (existingUsage) {
          existingUsage.count++;
        } else {
          macroData.usages.set(paramsKey, { params: shortcode.params, count: 1 });
        }
      }
    }

    // Convert to final format with validation
    const discovered: DiscoveredMacro[] = [];
    for (const macroData of macroMap.values()) {
      const definition = this.macroDefinitions.get(macroData.name);
      const mapped = definition !== undefined;

      const usages: MacroUsage[] = [];
      for (const [paramsKey, usage] of macroData.usages) {
        let validationStatus: MacroValidationStatus = 'unknown';
        let validationError: string | undefined;

        if (definition) {
          // A mapped macro without a validator is considered valid
          const error = definition.validate ? definition.validate(usage.params) : undefined;
          if (error) {
            validationStatus = 'invalid';
            validationError = error;
          } else {
            validationStatus = 'valid';
          }
        }

        usages.push({
          params: usage.params,
          count: usage.count,
          validationStatus,
          validationError,
          paramsKey,
        });
      }

      discovered.push({
        name: macroData.name,
        mapped,
        totalCount: macroData.totalCount,
        usages,
        postSlugs: Array.from(macroData.postSlugs),
      });
    }

    // Sort discovered macros by name
    discovered.sort((a, b) => a.name.localeCompare(b.name));

    return {
      total: discovered.length,
      mappedCount: discovered.filter(m => m.mapped).length,
      unmappedCount: discovered.filter(m => !m.mapped).length,
      discovered,
    };
  }

  /**
   * Parse WordPress shortcodes from content.
   * Returns array of { name, params } for each shortcode found, in order of appearance.
   */
  private parseShortcodes(content: string): Array<{ name: string; params: Record<string, string> }> {
    const shortcodes: Array<{ name: string; params: Record<string, string> }> = [];

    // The regex is a shared static with the global flag, so its scan position must be reset
    ImportAnalysisEngine.SHORTCODE_REGEX.lastIndex = 0;

    let match;
    while ((match = ImportAnalysisEngine.SHORTCODE_REGEX.exec(content)) !== null) {
      const name = match[1];
      const paramString = match[2] || '';
      shortcodes.push({ name, params: this.parseShortcodeParams(paramString) });
    }

    return shortcodes;
  }

  /**
   * Parse parameters from a shortcode parameter string.
   * Supports: key="value", key='value', and key=value (unquoted).
   * When a key appears more than once, the last value wins.
   */
  private parseShortcodeParams(paramString: string): Record<string, string> {
    const params: Record<string, string> = {};

    // The regex is a shared static with the global flag, so its scan position must be reset
    ImportAnalysisEngine.PARAM_REGEX.lastIndex = 0;

    let match;
    while ((match = ImportAnalysisEngine.PARAM_REGEX.exec(paramString)) !== null) {
      const key = match[1];
      // Value is in group 2 (double-quoted), 3 (single-quoted), or 4 (unquoted)
      params[key] = match[2] ?? match[3] ?? match[4] ?? '';
    }

    return params;
  }

  /**
   * Serialize params to a stable string for deduplication:
   * the JSON encoding of the [key, value] pairs sorted by key.
   */
  private serializeParams(params: Record<string, string>): string {
    const sorted = Object.entries(params).sort(([a], [b]) => a.localeCompare(b));
    return JSON.stringify(sorted);
  }
}