Files
bDS/src/main/engine/ImportExecutionEngine.ts

1182 lines
40 KiB
TypeScript

/**
* ImportExecutionEngine - Executes WXR import based on analysis results
*
* Handles the 4-phase import process:
* 1. Create new tags/categories
* 2. Import posts (handling conflicts correctly)
* 3. Import media (with post linkage)
* 4. Import pages (as posts with "page" category)
*/
import { EventEmitter } from 'events';
import { v4 as uuidv4 } from 'uuid';
import * as fs from 'fs/promises';
import * as path from 'path';
import * as crypto from 'crypto';
import matter from 'gray-matter';
import { app } from 'electron';
import TurndownService from 'turndown';
import { getDatabase } from '../database';
import { posts, media, NewPost, NewMedia } from '../database/schema';
import { eq } from 'drizzle-orm';
import type { TagEngine } from './TagEngine';
import type { PostEngine, PostData } from './PostEngine';
import type { MediaEngine, MediaData } from './MediaEngine';
import type { PostMediaEngine } from './PostMediaEngine';
import type {
ImportAnalysisReport,
AnalyzedPost,
AnalyzedMedia,
AnalyzedCategory,
AnalyzedTag,
ImportConflictResolution,
} from './ImportAnalysisEngine';
import type { WxrPost, WxrMedia } from './WxrParser';
/**
 * Caller-supplied options that control a single import run.
 */
export interface ImportExecutionOptions {
/**
 * Path to the WordPress uploads folder for media files.
 * Each media item's `relativePath` is joined onto this folder to locate the
 * source file; when it is not set, media items are skipped rather than imported.
 */
uploadsFolder?: string;
/**
 * Fallback author.
 * For posts it is used only when the WXR post has no `creator`; for media the
 * visible import/update code passes it as the author unconditionally.
 */
defaultAuthor?: string;
/**
 * Progress callback.
 * Invoked with a phase label ('tags', 'posts', 'media', 'pages', 'complete'),
 * the 1-based position within the phase (0 when the phase starts), the phase
 * total, and an optional human-readable detail message.
 */
onProgress?: (phase: string, current: number, total: number, detail?: string) => void;
}
/**
 * Summary returned by `executeImport`.
 */
export interface ImportExecutionResult {
/**
 * False only when an exception escaped one of the phases and aborted the run.
 * Per-item failures do not flip this flag; they are reflected in the `errors`
 * counters and the `errors` message list instead.
 */
success: boolean;
/** Tag/category creation counters (categories are created as tags). */
tags: {
/** Tags/categories successfully created through the tag engine. */
created: number;
/** Entries that needed no creation, or whose creation attempt threw. */
skipped: number;
};
/** Counters for items of post type 'post'. */
posts: {
imported: number;
/** Includes items with another post type as well as skipped duplicates/conflicts. */
skipped: number;
errors: number;
};
/** Counters for media attachments. */
media: {
imported: number;
skipped: number;
errors: number;
};
/** Counters for pages (imported as posts carrying the "page" category). */
pages: {
imported: number;
skipped: number;
errors: number;
};
/** Mapping from WordPress post ID to our post GUID */
wpIdToPostId: Map<number, string>;
/** Human-readable messages for every per-item failure and any fatal error. */
errors: string[];
}
// Regex to match WordPress shortcodes: [macroname ...] but NOT [[macroname ...]]
//   (?<!\[)      - the opening bracket must not be preceded by another '['
//   (\w+)        - group 1: shortcode name (word characters only, so closing
//                  tags such as [/name] do not match)
//   ([^\]]*?)    - group 2: raw attribute text up to the closing bracket
//   (?:\s*\/)?   - optional self-closing slash, e.g. [gallery /]
//   \](?!\])     - the closing bracket must not be followed by another ']'
// The regex carries the global flag, so it keeps `lastIndex` state when used
// with `exec`/`test`; `String.prototype.replace` resets that state itself.
const WP_SHORTCODE_REGEX = /(?<!\[)\[(\w+)([^\]]*?)(?:\s*\/)?\](?!\])/g;
/**
 * Collaborating engines injected into `ImportExecutionEngine` via its constructor.
 */
export interface ImportExecutionDeps {
/** Creates tags (and categories, which are stored as tags). */
tagEngine: TagEngine;
/** Provides unique-slug generation and full-text-index updates for posts. */
postEngine: PostEngine;
/** Imports, replaces and updates media files and their metadata. */
mediaEngine: MediaEngine;
/** Links imported media to the posts they belong to. */
postMediaEngine: PostMediaEngine;
}
export class ImportExecutionEngine extends EventEmitter {
// Project the engine currently operates on; changed through setProjectContext().
private currentProjectId: string = 'default';
// Explicit data directory; when null the base directory is derived from the
// Electron userData path and the current project id.
private dataDir: string | null = null;
// HTML -> Markdown converter, configured with custom rules in the constructor.
private turndown: TurndownService;
private siteBaseUrl: string | null = null; // Base URL for media URL conversion
// Injected collaborators (see ImportExecutionDeps).
private readonly tagEngine: TagEngine;
private readonly postEngine: PostEngine;
private readonly mediaEngine: MediaEngine;
private readonly postMediaEngine: PostMediaEngine;
/**
 * Wire up the injected engines and configure the HTML -> Markdown converter.
 *
 * Four custom Turndown rules are registered:
 * - `listItem`: single space after the list marker, continuation lines indented
 *   by two spaces so they stay inside the item.
 * - `imageWithTitle`: standalone `<img>` with empty `alt` but a `title`.
 * - `linkedImage`: `<a>` wrapping only an `<img>` becomes a plain image.
 * - `flashEmbed`: Flash `<embed>` elements become a placeholder text.
 *
 * @param deps - Collaborating engines used during the import.
 */
constructor(deps: ImportExecutionDeps) {
  super();
  this.tagEngine = deps.tagEngine;
  this.postEngine = deps.postEngine;
  this.mediaEngine = deps.mediaEngine;
  this.postMediaEngine = deps.postMediaEngine;
  this.turndown = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
    emDelimiter: '*',
  });

  // Custom rule for list items: use single space after marker instead of multiple spaces
  this.turndown.addRule('listItem', {
    filter: 'li',
    replacement: (content, node, options) => {
      content = content
        .replace(/^\n+/, '') // Remove leading newlines
        .replace(/\n+$/, '\n') // Replace trailing newlines with single newline
        // Indent subsequent lines with 2 spaces (the width of the "- " marker);
        // a single space is not enough for Markdown to keep them in the item.
        .replace(/\n/gm, '\n  ');
      const parent = node.parentNode as HTMLElement;
      const isOrdered = parent?.nodeName === 'OL';
      let prefix = options.bulletListMarker + ' ';
      if (isOrdered) {
        // Honour <ol start="N"> and number the item by its position in the list.
        const start = parent.getAttribute('start');
        const index = Array.prototype.indexOf.call(parent.children, node);
        const startNum = start ? parseInt(start, 10) : 1;
        prefix = (startNum + index) + '. ';
      }
      return prefix + content + (node.nextSibling && !/\n$/.test(content) ? '\n' : '');
    },
  });

  // Custom rule for standalone images with empty alt but title attribute
  // WordPress often uses title="name" with alt=""
  this.turndown.addRule('imageWithTitle', {
    filter: (node) => {
      if (node.nodeName !== 'IMG') return false;
      // Check if this image is NOT inside an <a> tag (those are handled by linkedImage rule)
      const parent = node.parentNode;
      if (parent?.nodeName === 'A') return false;
      // Only match if alt is empty but title exists
      const img = node as HTMLImageElement;
      const alt = img.getAttribute('alt') || '';
      const title = img.getAttribute('title') || '';
      return !alt.trim() && title.trim().length > 0;
    },
    replacement: (_content, node) => {
      const img = node as HTMLImageElement;
      const src = img.getAttribute('src') || '';
      const title = img.getAttribute('title') || '';
      return `![${title}](${src})`;
    },
  });

  // Matches URLs whose path ends in an image extension (optionally followed by a
  // query string). Built once here rather than on every converted node; it has
  // no global flag, so sharing it carries no lastIndex state.
  const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;

  // Custom rule for linked images: <a><img></a> -> ![alt](src)
  // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
  this.turndown.addRule('linkedImage', {
    filter: (node) => {
      // Match <a> tags that contain only an <img> (possibly with whitespace)
      if (node.nodeName !== 'A') return false;
      const children = Array.from(node.childNodes).filter(
        child => !(child.nodeType === 3 && !child.textContent?.trim())
      );
      return children.length === 1 && children[0].nodeName === 'IMG';
    },
    replacement: (_content, node) => {
      const anchor = node as HTMLAnchorElement;
      const img = anchor.querySelector('img');
      if (!img) return '';
      const href = anchor.getAttribute('href') || '';
      const imgSrc = img.getAttribute('src') || '';
      const imgAlt = img.getAttribute('alt') || '';
      const imgTitle = img.getAttribute('title') || '';
      // If the link target is itself an image (WordPress "click for full-size"
      // pattern) use it; otherwise keep the embedded image source.
      const hrefIsImage = imageExtensions.test(href);
      const imageUrl = hrefIsImage ? href : imgSrc;
      // Derive alt text: prefer alt, then title, then cleaned filename
      let altText = imgAlt.trim();
      if (!altText) {
        altText = imgTitle.trim();
      }
      if (!altText) {
        // Extract filename from the image URL as last resort
        const urlPath = imageUrl.split('?')[0]; // Remove query string
        const filename = urlPath.split('/').pop() || '';
        // Clean the filename: remove extension and replace underscores with spaces
        altText = filename.replace(/\.[^.]+$/, '').replace(/_/g, ' ');
      }
      // Build the markdown image link (without title attribute)
      return `![${altText}](${imageUrl})`;
    },
  });

  // Custom rule for Flash embeds - replace with placeholder text
  this.turndown.addRule('flashEmbed', {
    filter: (node) => {
      if (node.nodeName !== 'EMBED') return false;
      const embed = node as HTMLEmbedElement;
      const type = embed.getAttribute('type') || '';
      const src = embed.getAttribute('src') || '';
      // Match Flash content by type or file extension
      return type.toLowerCase().includes('flash') ||
        type.toLowerCase().includes('shockwave') ||
        src.toLowerCase().endsWith('.swf');
    },
    replacement: () => 'FLASH PLAYER NOT SUPPORTED',
  });
}
/**
 * Select the project that subsequent imports operate on.
 *
 * @param projectId - Identifier of the target project.
 * @param dataDir - Optional explicit data directory; when omitted or empty the
 *   stored directory is reset to null.
 */
setProjectContext(projectId: string, dataDir?: string): void {
  this.dataDir = dataDir ? dataDir : null;
  this.currentProjectId = projectId;
}
/**
 * @returns The identifier of the project currently selected for imports.
 */
getProjectContext(): string {
  const { currentProjectId } = this;
  return currentProjectId;
}
/**
 * Resolve the project's root data directory.
 *
 * @returns The explicitly configured data directory when one is set, otherwise
 *   `<userData>/projects/<projectId>`.
 */
private getBaseDir(): string {
  return this.dataDir
    ? this.dataDir
    : path.join(app.getPath('userData'), 'projects', this.currentProjectId);
}
/**
 * @returns The `posts` directory underneath the project's base directory.
 */
private getPostsBaseDir(): string {
  const projectRoot = this.getBaseDir();
  return path.join(projectRoot, 'posts');
}
/**
 * @returns The `media` directory underneath the project's base directory.
 */
private getMediaBaseDir(): string {
  const projectRoot = this.getBaseDir();
  return path.join(projectRoot, 'media');
}
/**
 * Build the date-bucketed posts directory (posts/YYYY/MM/).
 *
 * Year and month are read with the local-time accessors of `date`; the month is
 * 1-based and zero-padded to two digits.
 *
 * @param date - Date that selects the bucket.
 * @returns Path of the form `<posts base>/YYYY/MM`.
 */
private getPostsDirForDate(date: Date): string {
  const yearSegment = String(date.getFullYear());
  const monthSegment = String(date.getMonth() + 1).padStart(2, '0');
  return path.join(this.getPostsBaseDir(), yearSegment, monthSegment);
}
/**
 * Build the date-bucketed media directory (media/YYYY/MM/).
 *
 * Year and month are read with the local-time accessors of `date`; the month is
 * 1-based and zero-padded to two digits.
 *
 * @param date - Date that selects the bucket.
 * @returns Path of the form `<media base>/YYYY/MM`.
 */
private getMediaDirForDate(date: Date): string {
  const segments = [
    `${date.getFullYear()}`,
    `${date.getMonth() + 1}`.padStart(2, '0'),
  ];
  return path.join(this.getMediaBaseDir(), ...segments);
}
/**
 * Run the complete import: tags, posts, media, then pages.
 *
 * Per-item failures are collected inside the individual phases. Only an
 * exception that escapes a phase marks the run as unsuccessful and stops the
 * remaining phases.
 *
 * NOTE(review): pages are imported after media, so `wpIdToPostId` contains no
 * page entries while media is processed; media whose parent is a page will
 * therefore not be linked to it. Confirm whether that is intended.
 *
 * @param report - Analysis report describing what to import.
 * @param options - Uploads folder, default author and progress callback.
 * @returns Counters, collected error messages and the WordPress-ID to post-ID map.
 */
async executeImport(
  report: ImportAnalysisReport,
  options: ImportExecutionOptions
): Promise<ImportExecutionResult> {
  const outcome: ImportExecutionResult = {
    success: true,
    tags: { created: 0, skipped: 0 },
    posts: { imported: 0, skipped: 0, errors: 0 },
    media: { imported: 0, skipped: 0, errors: 0 },
    pages: { imported: 0, skipped: 0, errors: 0 },
    wpIdToPostId: new Map(),
    errors: [],
  };
  const notify = options.onProgress || (() => {});

  // Remember the site URL so media links can later be made relative.
  this.siteBaseUrl = report.site.link || null;

  try {
    const tagLookup = this.buildTaxonomyMapping(report.tags);
    const categoryLookup = this.buildTaxonomyMapping(report.categories);

    // Phase 1: tags and categories
    notify('tags', 0, report.tags.length + report.categories.length, 'Creating tags...');
    await this.executePhase1Tags(report, tagLookup, categoryLookup, outcome, notify);

    // Phase 2: posts
    notify('posts', 0, report.posts.items.length, 'Importing posts...');
    await this.executePhase2Posts(report, tagLookup, categoryLookup, outcome, options, notify);

    // Phase 3: media
    notify('media', 0, report.media.items.length, 'Importing media...');
    await this.executePhase3Media(report, outcome, options, notify);

    // Phase 4: pages
    notify('pages', 0, report.pages.items.length, 'Importing pages...');
    await this.executePhase4Pages(report, tagLookup, categoryLookup, outcome, options, notify);

    notify('complete', 1, 1, 'Import complete');
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    outcome.success = false;
    outcome.errors.push(message);
  }
  return outcome;
}
/**
 * Map every taxonomy name (lower-cased) to the name it resolves to.
 *
 * - `mappedTo` set: resolves to the lower-cased `mappedTo`, nothing to create.
 * - otherwise, already in the project: resolves to itself, nothing to create.
 * - otherwise: resolves to itself and is flagged for creation.
 *
 * When several items share the same lower-cased name, the last one wins.
 *
 * @param items - Analysed tags or categories.
 * @returns Lookup keyed by lower-cased original name.
 */
private buildTaxonomyMapping(
  items: Array<{ name: string; existsInProject: boolean; mappedTo?: string }>
): Map<string, { resolved: string; needsCreation: boolean }> {
  const lookup = new Map<string, { resolved: string; needsCreation: boolean }>();
  items.forEach(({ name, existsInProject, mappedTo }) => {
    const key = name.toLowerCase();
    const entry = mappedTo
      ? { resolved: mappedTo.toLowerCase(), needsCreation: false }
      : { resolved: key, needsCreation: !existsInProject };
    lookup.set(key, entry);
  });
  return lookup;
}
/**
 * Phase 1: create the tags and categories that are flagged for creation.
 *
 * Categories are created as tags. An entry counts as skipped when it needs no
 * creation or when the creation attempt throws (for example because the tag
 * already exists). Progress is reported only after a successful creation.
 *
 * @param report - Analysis report supplying the tag and category lists.
 * @param tagMapping - Lookup produced for the report's tags.
 * @param categoryMapping - Lookup produced for the report's categories.
 * @param result - Result object whose `tags` counters are updated.
 * @param progress - Progress callback.
 */
private async executePhase1Tags(
  report: ImportAnalysisReport,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  progress: (phase: string, current: number, total: number, detail?: string) => void
): Promise<void> {
  const engine = this.tagEngine;
  engine.setProjectContext(this.currentProjectId);

  const total = report.tags.length + report.categories.length;
  let processed = 0;

  // Shared handling for one taxonomy entry; `label` prefixes the progress detail.
  const ensureCreated = async (
    rawName: string,
    lookup: Map<string, { resolved: string; needsCreation: boolean }>,
    label: string
  ): Promise<void> => {
    processed += 1;
    const entry = lookup.get(rawName.toLowerCase());
    if (!entry?.needsCreation) {
      result.tags.skipped++;
      return;
    }
    try {
      await engine.createTag({ name: entry.resolved });
      result.tags.created++;
      progress('tags', processed, total, `${label}${entry.resolved}`);
    } catch {
      // Tag might already exist (race condition or duplicate in list)
      result.tags.skipped++;
    }
  };

  for (const tag of report.tags) {
    await ensureCreated(tag.name, tagMapping, 'Created tag: ');
  }
  for (const category of report.categories) {
    await ensureCreated(category.name, categoryMapping, 'Created category tag: ');
  }
}
/**
 * Phase 2: import every analysed item whose post type is 'post'.
 *
 * Items of any other post type (nav_menu_item, revision, ...) are counted as
 * skipped up front. A failure on one post is recorded and does not stop the
 * remaining posts.
 *
 * @param report - Analysis report supplying the post items.
 * @param tagMapping - Lookup used to resolve post tags.
 * @param categoryMapping - Lookup used to resolve post categories.
 * @param result - Result object whose `posts` counters and `errors` are updated.
 * @param options - Import options forwarded to the per-post import.
 * @param progress - Progress callback.
 */
private async executePhase2Posts(
  report: ImportAnalysisReport,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions,
  progress: (phase: string, current: number, total: number, detail?: string) => void
): Promise<void> {
  const everything = report.posts.items;
  const importable = everything.filter(item => item.wxrPost.postType === 'post');
  const total = importable.length;

  // Everything filtered out above counts as skipped.
  result.posts.skipped += everything.length - total;

  let position = 0;
  for (const analyzed of importable) {
    position += 1;
    const title = analyzed.wxrPost.title;
    progress('posts', position, total, `Processing: ${title}`);
    try {
      const wasImported = await this.importPost(analyzed, tagMapping, categoryMapping, result, options);
      if (wasImported) {
        result.posts.imported++;
      } else {
        result.posts.skipped++;
      }
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      result.posts.errors++;
      result.errors.push(`Failed to import post "${title}": ${reason}`);
    }
  }
}
/**
 * Import one analysed post according to its analysis status.
 *
 * - 'content-duplicate' and 'update' are skipped.
 * - 'conflict' follows the chosen resolution; a missing resolution means 'ignore'.
 * - any other status is treated as a new post and created as published.
 *
 * @returns True when a post was created or updated, false when it was skipped.
 */
private async importPost(
  analyzed: AnalyzedPost,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  switch (analyzed.status) {
    case 'content-duplicate':
    case 'update':
      // Same content is already present; nothing to do.
      return false;
    case 'conflict': {
      const resolution = analyzed.conflictResolution || 'ignore';
      if (resolution === 'ignore') {
        return false;
      }
      return await this.importPostWithConflict(analyzed, resolution, tagMapping, categoryMapping, result, options);
    }
    default:
      return await this.createImportedPost(analyzed, tagMapping, categoryMapping, result, options, 'published');
  }
}
/**
 * Import a conflicting post using the selected resolution.
 *
 * - 'import': create an additional published post under a freshly generated slug.
 * - 'overwrite': update the existing post (which becomes a draft); if the
 *   analysis carries no existing post ID, create a new draft instead.
 * - anything else: skip.
 *
 * @returns True when a post was created or updated, false otherwise.
 */
private async importPostWithConflict(
  analyzed: AnalyzedPost,
  resolution: ImportConflictResolution,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  if (resolution === 'import') {
    const freshSlug = await this.postEngine.generateUniqueSlug(analyzed.wxrPost.title);
    return await this.createImportedPost(analyzed, tagMapping, categoryMapping, result, options, 'published', freshSlug);
  }

  if (resolution !== 'overwrite') {
    return false;
  }

  const existingId = analyzed.existingPost?.id;
  if (existingId) {
    return await this.updateExistingPost(analyzed, existingId, tagMapping, categoryMapping, result, options);
  }
  // No existing post to overwrite: fall back to a new draft.
  return await this.createImportedPost(analyzed, tagMapping, categoryMapping, result, options, 'draft');
}
/**
 * Overwrite an existing post with the imported content.
 *
 * The post is switched to draft status with its `publishedAt` cleared, so the
 * user can review the result before publishing. The converted Markdown is
 * stored in the database row, the full-text index is refreshed, and the
 * WordPress ID is mapped to the existing post ID.
 *
 * @param analyzed - Analysed post carrying the WXR data.
 * @param existingPostId - ID of the post row to overwrite.
 * @returns Always true once the update has completed.
 */
private async updateExistingPost(
  analyzed: AnalyzedPost,
  existingPostId: string,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  const { wxrPost } = analyzed;
  const db = getDatabase().getLocal();

  // Conversion pipeline. Order matters: Vimeo iframes and shortcodes are
  // rewritten on the HTML first, then the HTML becomes Markdown, and finally
  // absolute media URLs of the imported site are made relative.
  const withVimeoMacros = this.convertVimeoIframes(wxrPost.content);
  const withMacros = this.transformShortcodes(withVimeoMacros);
  const markdown = this.convertMediaUrlsToRelative(this.convertToMarkdown(withMacros));

  const tags = this.resolveTaxonomy(wxrPost.tags, tagMapping);
  const categories = this.resolveTaxonomy(wxrPost.categories, categoryMapping);
  const checksum = this.calculateChecksum(markdown);

  // Drafts keep their content in the database row.
  await db
    .update(posts)
    .set({
      title: wxrPost.title,
      excerpt: wxrPost.excerpt || null,
      content: markdown,
      status: 'draft',
      author: wxrPost.creator || options.defaultAuthor || null,
      updatedAt: new Date(),
      publishedAt: null,
      checksum,
      tags: JSON.stringify(tags),
      categories: JSON.stringify(categories),
    })
    .where(eq(posts.id, existingPostId));

  await this.postEngine.updateFTSIndex({
    id: existingPostId,
    projectId: this.currentProjectId,
    title: wxrPost.title,
    content: markdown,
    excerpt: wxrPost.excerpt || undefined,
    tags,
    categories,
  });

  // The WordPress ID now points at the pre-existing post.
  result.wpIdToPostId.set(wxrPost.wpId, existingPostId);
  return true;
}
/**
 * Create a new post from an analysed WXR post.
 *
 * Published posts are written to a Markdown file and stored in the database
 * without content; drafts keep their content in the database and get no file.
 * The full-text index is updated and the WordPress ID is mapped to the new post.
 *
 * @param analyzed - Analysed post carrying the WXR data.
 * @param status - Status the new post is created with.
 * @param overrideSlug - Slug to use instead of the WXR slug, when provided.
 * @returns Always true once the post has been created.
 */
private async createImportedPost(
  analyzed: AnalyzedPost,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions,
  status: 'draft' | 'published',
  overrideSlug?: string
): Promise<boolean> {
  const { wxrPost } = analyzed;
  const db = getDatabase().getLocal();
  const isPublished = status === 'published';

  // Conversion pipeline. Vimeo iframes and shortcodes must be rewritten before
  // the Markdown conversion because Turndown escapes square brackets.
  const withVimeoMacros = this.convertVimeoIframes(wxrPost.content);
  const withMacros = this.transformShortcodes(withVimeoMacros);
  const markdown = this.convertMediaUrlsToRelative(this.convertToMarkdown(withMacros));

  const tags = this.resolveTaxonomy(wxrPost.tags, tagMapping);
  const categories = this.resolveTaxonomy(wxrPost.categories, categoryMapping);

  // Dates may arrive as strings after JSON serialization through IPC.
  const createdAt = this.toDate(wxrPost.postDate) || this.toDate(wxrPost.pubDate) || new Date();
  const updatedAt = this.toDate(wxrPost.postModified) || createdAt;
  const publishedAt = isPublished ? (this.toDate(wxrPost.pubDate) || createdAt) : undefined;

  const postId = uuidv4();
  const postData: PostData = {
    id: postId,
    projectId: this.currentProjectId,
    title: wxrPost.title,
    slug: overrideSlug || wxrPost.slug,
    excerpt: wxrPost.excerpt || undefined,
    content: markdown,
    status,
    author: wxrPost.creator || options.defaultAuthor || undefined,
    createdAt,
    updatedAt,
    publishedAt,
    tags,
    categories,
  };

  // Published posts live on disk; the file is written before the row is inserted.
  const filePath = isPublished ? await this.writePostFile(postData) : '';
  const checksum = this.calculateChecksum(markdown);

  const row: NewPost = {
    id: postData.id,
    projectId: postData.projectId,
    title: postData.title,
    slug: postData.slug,
    excerpt: postData.excerpt,
    content: isPublished ? null : postData.content, // Draft content in DB, published in file
    status: postData.status,
    author: postData.author,
    createdAt: postData.createdAt,
    updatedAt: postData.updatedAt,
    publishedAt: postData.publishedAt,
    filePath,
    checksum,
    tags: JSON.stringify(postData.tags),
    categories: JSON.stringify(postData.categories),
  };
  await db.insert(posts).values(row);

  await this.postEngine.updateFTSIndex(postData);

  result.wpIdToPostId.set(wxrPost.wpId, postId);
  return true;
}
/**
 * Write a post to `<posts>/YYYY/MM/<slug>.md` as Markdown with front matter.
 *
 * The directory is chosen from the post's creation date and created when
 * missing. Optional fields (excerpt, author, publishedAt) are written to the
 * front matter only when present.
 *
 * The slug originates from the imported WXR file, so it is validated before it
 * is used as a file name: a slug containing path separators or parent
 * references would otherwise let the write escape the month directory.
 *
 * @param post - Post to serialize.
 * @returns Path of the written file.
 * @throws Error when the slug does not form a plain file name.
 */
private async writePostFile(post: PostData): Promise<string> {
  const fileName = `${post.slug}.md`;
  // path.basename strips any directory part; if that changes the name, the slug
  // contained a separator (e.g. "../x" or "a/b") and must not reach the disk.
  if (path.basename(fileName) !== fileName) {
    throw new Error(`Invalid post slug "${post.slug}": it must not contain path separators`);
  }

  const metadata: Record<string, unknown> = {
    id: post.id,
    projectId: post.projectId,
    title: post.title,
    slug: post.slug,
    status: post.status,
    createdAt: post.createdAt.toISOString(),
    updatedAt: post.updatedAt.toISOString(),
    tags: post.tags,
    categories: post.categories,
  };
  if (post.excerpt) metadata.excerpt = post.excerpt;
  if (post.author) metadata.author = post.author;
  if (post.publishedAt) metadata.publishedAt = post.publishedAt.toISOString();

  const postsDir = this.getPostsDirForDate(post.createdAt);
  await fs.mkdir(postsDir, { recursive: true });

  const fileContent = matter.stringify(post.content, metadata);
  const filePath = path.join(postsDir, fileName);
  await fs.writeFile(filePath, fileContent, 'utf-8');
  return filePath;
}
/**
 * Phase 3: import every analysed media item.
 *
 * A failure on one item is recorded in the result and does not stop the
 * remaining items.
 *
 * @param report - Analysis report supplying the media items.
 * @param result - Result object whose `media` counters and `errors` are updated.
 * @param options - Import options (uploads folder, default author).
 * @param progress - Progress callback.
 */
private async executePhase3Media(
  report: ImportAnalysisReport,
  result: ImportExecutionResult,
  options: ImportExecutionOptions,
  progress: (phase: string, current: number, total: number, detail?: string) => void
): Promise<void> {
  const items = report.media.items;
  const total = items.length;

  let position = 0;
  for (const analyzed of items) {
    position += 1;
    const filename = analyzed.wxrMedia.filename;
    progress('media', position, total, `Processing: ${filename}`);
    try {
      const wasImported = await this.importMediaFile(analyzed, result, options);
      if (wasImported) {
        result.media.imported++;
      } else {
        result.media.skipped++;
      }
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      result.media.errors++;
      result.errors.push(`Failed to import media "${filename}": ${reason}`);
    }
  }
}
/**
 * Import one analysed media item.
 *
 * Skipped (returns false) when the item is missing, a content duplicate or an
 * update, when a conflict is resolved as 'ignore', when no uploads folder is
 * configured, or when the source file is not accessible. A conflict resolved as
 * 'overwrite' updates the existing entry if its ID is known; otherwise, and for
 * 'import', a new entry is created.
 *
 * When the media's WordPress parent maps to an imported post, the new media is
 * linked to that post.
 *
 * @returns True when media was imported or updated, false when it was skipped.
 */
private async importMediaFile(
  analyzed: AnalyzedMedia,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  const { wxrMedia, status } = analyzed;

  // Nothing to import for these statuses.
  if (status === 'missing' || status === 'content-duplicate' || status === 'update') {
    return false;
  }

  if (status === 'conflict') {
    const resolution = analyzed.conflictResolution || 'ignore';
    if (resolution === 'ignore') {
      return false;
    }
    if (resolution === 'overwrite' && analyzed.existingMedia?.id) {
      return await this.updateExistingMedia(analyzed, analyzed.existingMedia.id, result, options);
    }
    // 'import' (or 'overwrite' without a known entry) continues as a new import.
  }

  if (!options.uploadsFolder) {
    return false;
  }
  const sourcePath = path.join(options.uploadsFolder, wxrMedia.relativePath);
  try {
    await fs.access(sourcePath);
  } catch {
    return false;
  }

  // Translate the WordPress parent ID into one of our post IDs, if it was imported.
  const parentPostId =
    wxrMedia.parentId && wxrMedia.parentId > 0
      ? result.wpIdToPostId.get(wxrMedia.parentId)
      : undefined;
  const linkedPostIds: string[] = parentPostId ? [parentPostId] : [];

  // The date may be a string after JSON serialization.
  const createdAt = this.toDate(wxrMedia.pubDate) || new Date();

  const importedMedia = await this.mediaEngine.importMedia(sourcePath, {
    title: wxrMedia.title || undefined,
    alt: wxrMedia.description || undefined,
    mimeType: wxrMedia.mimeType,
    author: options.defaultAuthor,
    tags: [],
    linkedPostIds,
    createdAt,
    updatedAt: createdAt,
  });

  // Record the relationship in the postMedia table as well.
  if (linkedPostIds.length > 0) {
    const linker = this.postMediaEngine;
    linker.setProjectContext(this.currentProjectId);
    for (const postId of linkedPostIds) {
      await linker.linkMediaToPost(postId, importedMedia.id);
    }
  }
  return true;
}
/**
 * Overwrite an existing media entry with the imported file.
 *
 * The file is replaced through the media engine, the title/alt/author metadata
 * are updated, and the media is linked to its parent post when the WordPress
 * parent ID maps to an imported post.
 *
 * @param analyzed - Analysed media carrying the WXR data.
 * @param existingMediaId - ID of the media entry to overwrite.
 * @returns False when no uploads folder is configured or the source file is not
 *   accessible; true once the entry has been updated.
 */
private async updateExistingMedia(
  analyzed: AnalyzedMedia,
  existingMediaId: string,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  const { wxrMedia } = analyzed;

  if (!options.uploadsFolder) {
    return false;
  }
  const sourcePath = path.join(options.uploadsFolder, wxrMedia.relativePath);
  try {
    await fs.access(sourcePath);
  } catch {
    return false;
  }

  const engine = this.mediaEngine;
  // Swap the file first, then refresh the descriptive metadata.
  await engine.replaceMediaFile(existingMediaId, sourcePath);
  await engine.updateMedia(existingMediaId, {
    title: wxrMedia.title || undefined,
    alt: wxrMedia.description || undefined,
    author: options.defaultAuthor,
  });

  // Translate the WordPress parent ID into one of our post IDs, if it was imported.
  const parentPostId =
    wxrMedia.parentId && wxrMedia.parentId > 0
      ? result.wpIdToPostId.get(wxrMedia.parentId)
      : undefined;
  const linkedPostIds: string[] = parentPostId ? [parentPostId] : [];

  if (linkedPostIds.length > 0) {
    const linker = this.postMediaEngine;
    linker.setProjectContext(this.currentProjectId);
    for (const postId of linkedPostIds) {
      await linker.linkMediaToPost(postId, existingMediaId);
    }
  }
  return true;
}
/**
 * Phase 4: import pages as posts that carry an additional "page" category.
 *
 * A 'page' entry is added to `categoryMapping` (mutating the passed map) when
 * absent, marked as not needing creation. Each page is copied with 'page'
 * appended to its categories and then goes through the regular post import.
 * A failure on one page is recorded and does not stop the remaining pages.
 *
 * @param report - Analysis report supplying the page items.
 * @param result - Result object whose `pages` counters and `errors` are updated.
 * @param progress - Progress callback.
 */
private async executePhase4Pages(
  report: ImportAnalysisReport,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions,
  progress: (phase: string, current: number, total: number, detail?: string) => void
): Promise<void> {
  const pages = report.pages.items;
  const total = pages.length;

  // Make sure the synthetic "page" category resolves during the post import.
  if (!categoryMapping.has('page')) {
    categoryMapping.set('page', { resolved: 'page', needsCreation: false });
  }

  let position = 0;
  for (const analyzed of pages) {
    position += 1;
    const wxrPage = analyzed.wxrPost;

    // Work on copies so the analysis report itself is left untouched.
    const pageWithCategory: WxrPost = {
      ...wxrPage,
      categories: wxrPage.categories.concat('page'),
    };
    const pageAsPost: AnalyzedPost = { ...analyzed, wxrPost: pageWithCategory };

    progress('pages', position, total, `Processing: ${wxrPage.title}`);
    try {
      const wasImported = await this.importPost(pageAsPost, tagMapping, categoryMapping, result, options);
      if (wasImported) {
        result.pages.imported++;
      } else {
        result.pages.skipped++;
      }
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      result.pages.errors++;
      result.errors.push(`Failed to import page "${wxrPage.title}": ${reason}`);
    }
  }
}
/**
 * Convert WordPress HTML to Markdown using Turndown, then tidy the result.
 *
 * Steps, in order:
 * 1. Wrap multi-line `<code>` in `<pre>` (before line-break preservation, so
 *    the code's newlines are not turned into `<br>`).
 * 2. Turn meaningful newlines into `<br>` / paragraph breaks.
 * 3. Run Turndown.
 * 4. Restore `[[macro]]` brackets that Turndown escaped and drop backslash
 *    escapes inside macros.
 * 5. Replace non-breaking spaces, strip trailing whitespace per line (keeping
 *    "> " on empty blockquote continuation lines) and collapse 3+ newlines to 2.
 *
 * @param html - HTML fragment to convert.
 * @returns Markdown text; an empty string for empty or whitespace-only input.
 */
private convertToMarkdown(html: string): string {
  if (!html || !html.trim()) return '';

  const prepared = this.preserveLineBreaks(this.wrapMultilineCode(html));
  const raw = this.turndown.turndown(prepared);

  // \[\[ becomes [[ and \]\] becomes ]]
  const unescapedBrackets = raw.replace(/\\\[\\\[/g, '[[').replace(/\\\]\\\]/g, ']]');

  // Remove backslash escapes inside [[macro]] blocks (e.g. photo\_archive → photo_archive)
  const cleanMacros = unescapedBrackets.replace(
    /\[\[([^\]]*?)\]\]/g,
    (_match, inner: string) => '[[' + inner.replace(/\\(.)/g, '$1') + ']]'
  );

  // Normalize non-breaking spaces to regular spaces
  const plainSpaces = cleanMacros.replace(/\u00A0/g, ' ');

  const tidiedLines = plainSpaces.split('\n').map(line => {
    const trimmed = line.trimEnd();
    // An empty blockquote continuation keeps its single trailing space.
    return trimmed === '>' && line.startsWith('> ') ? '> ' : trimmed;
  });

  // Normalize multiple blank lines (3+ consecutive newlines → 2 newlines)
  return tidiedLines.join('\n').replace(/\n{3,}/g, '\n\n');
}
/**
 * Preserve line breaks and paragraph structure in content.
 *
 * WordPress exports often mix plain text with HTML, use double newlines for
 * paragraph breaks and single newlines that should become `<br>`.
 *
 * Behaviour:
 * - `<pre>` blocks (with or without attributes, any letter case) are set aside
 *   and restored verbatim, so their newlines are never touched.
 * - Content starting with plain text: double newlines become `</p>\n<p>`,
 *   single newlines inside text become `<br>`, and the whole content is wrapped
 *   in `<p>...</p>` if it contains a `</p>` afterwards.
 * - Content starting with a tag: inside text runs between tags (and in trailing
 *   text after the last tag) double newlines become `</p><p>` and single
 *   newlines become `<br>`.
 * - Whitespace-only text between tags is left unchanged.
 *
 * @param html - HTML fragment, possibly mixed with plain text.
 * @returns The fragment with line breaks made explicit; empty or
 *   whitespace-only input is returned unchanged.
 */
private preserveLineBreaks(html: string): string {
  if (!html || !html.trim()) return html;

  // Check if content starts with a tag or plain text
  const startsWithTag = /^\s*</.test(html);

  // Protect <pre> blocks from having their newlines modified. The opening tag
  // may carry attributes (e.g. <pre class="wp-block-code">).
  const preBlocks: string[] = [];
  const protectedHtml = html.replace(/<pre\b[^>]*>[\s\S]*?<\/pre>/gi, (block) => {
    preBlocks.push(block);
    return `__PRE_BLOCK_${preBlocks.length - 1}__`;
  });

  // Put the protected blocks back. A replacer function is used on purpose: with
  // a plain string, sequences such as "$$", "$&" or "$'" inside the code would
  // be interpreted as replacement patterns and corrupt the restored block.
  const restorePreBlocks = (text: string): string =>
    preBlocks.reduce(
      (restored, block, i) => restored.replace(`__PRE_BLOCK_${i}__`, () => block),
      text
    );

  if (!startsWithTag) {
    // First, convert double newlines to paragraph markers
    let processed = protectedHtml.replace(/\n\n+/g, '</p>\n<p>');

    // Convert remaining single newlines within text to <br>
    // (but not newlines that are just between tags)
    processed = processed.replace(/>([^<]+)</g, (_match, textContent: string) =>
      textContent.trim()
        ? '>' + textContent.replace(/\n/g, '<br>') + '<'
        : '>' + textContent + '<'
    );

    // Also handle newlines at the start (before any tags)
    processed = processed.replace(/^([^<]+)/g, (match, textContent: string) =>
      textContent.trim() ? textContent.replace(/\n/g, '<br>') : match
    );

    // Wrap in <p> if we added paragraph markers
    if (processed.includes('</p>')) {
      processed = '<p>' + processed + '</p>';
    }
    return restorePreBlocks(processed);
  }

  // Double newlines become paragraph breaks first, remaining ones become <br>.
  const breakText = (textContent: string): string =>
    textContent.replace(/\n\n+/g, '</p><p>').replace(/\n/g, '<br>');

  // For content that starts with HTML, handle newlines within text content
  let result = protectedHtml.replace(/>([^<]+)</g, (_match, textContent: string) =>
    textContent.trim() ? '>' + breakText(textContent) + '<' : '>' + textContent + '<'
  );

  // Also handle text at the END of content (after the last tag)
  result = result.replace(/>([^<]+)$/g, (match, textContent: string) =>
    textContent.trim() ? '>' + breakText(textContent) : match
  );

  return restorePreBlocks(result);
}
/**
* Wrap standalone <code> blocks containing newlines in <pre> tags.
*
* WordPress content sometimes uses <code>...</code> for multi-line code blocks
* without a <pre> wrapper. Standard HTML parsing treats this as inline code and
* collapses whitespace. By wrapping in <pre>, we preserve the formatting and
* Turndown will convert it to a fenced Markdown code block.
*
* Only wraps <code> blocks that contain literal newlines.
* Does NOT wrap:
* - <code> already inside <pre>
* - <code> without newlines (inline code)
*/
private wrapMultilineCode(html: string): string {
  if (!html) return html;
  // One pass over two alternatives:
  //   1. a whole <pre ...>...</pre> region, which is returned untouched, so any
  //      <code> nested inside it is consumed here and never double-wrapped;
  //   2. a bare <code>...</code> element, whose inner content is captured.
  // Matching is case-sensitive and only attribute-less <code> tags are
  // considered.
  const preOrCodePattern = /<pre(?:\s[^>]*)?>[\s\S]*?<\/pre>|<code>([\s\S]*?)<\/code>/g;
  return html.replace(
    preOrCodePattern,
    (match: string, codeContent: string | undefined): string => {
      // codeContent is undefined when the <pre> alternative matched; a <code>
      // without a literal newline is inline code. Both are left as-is.
      if (codeContent === undefined || !codeContent.includes('\n')) {
        return match;
      }
      // Multi-line <code> outside any <pre>: wrap it so whitespace survives
      // HTML parsing.
      return '<pre><code>' + codeContent + '</code></pre>';
    }
  );
}
/**
* Convert absolute media URLs from the WordPress site to relative paths.
*
* Converts URLs like:
* https://site.com/wp-content/uploads/2022/11/image.jpg
* To:
* media/2022/11/image.jpg
*
* Only converts URLs from the site being imported (based on site.link).
* Does NOT convert:
* - URLs from external sites
* - URLs from wp-content/themes/ or wp-content/plugins/ (not imported media)
*/
private convertMediaUrlsToRelative(markdown: string): string {
  // Without a known site URL or any content there is nothing to rewrite.
  if (!this.siteBaseUrl || !markdown) return markdown;

  // Drop a single trailing slash, then split off the scheme. Everything after
  // "http(s)://" (host plus any sub-path) identifies the site.
  const trimmedBase = this.siteBaseUrl.replace(/\/$/, '');
  const schemeSplit = /^https?:\/\/(.+)$/.exec(trimmedBase);
  if (schemeSplit === null) return markdown;

  // Escape regex metacharacters so the host is matched literally.
  const escapedHostname = schemeSplit[1].replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Accept both http and https regardless of the scheme in the site URL, so
  // older http links are rewritten too. The capture is the path below
  // wp-content/uploads/, ending at whitespace, ')', '"' or "'".
  const uploadsUrlPattern = new RegExp(
    `https?://${escapedHostname}/wp-content/uploads/([^\\s)"']+)`,
    'gi'
  );

  const rewritten = markdown.replace(uploadsUrlPattern, 'media/$1');
  return rewritten;
}
/**
* Convert Vimeo iframes to [[vimeo id=...]] macros.
* Matches <iframe src="...player.vimeo.com/video/ID..."> and converts to [[vimeo id=ID]]
*/
private convertVimeoIframes(content: string): string {
  // The pattern targets an <iframe> whose src is a player.vimeo.com/video/<digits>
  // URL (http or https, either quote style) and which is closed immediately by
  // </iframe>. The numeric video id is capture group 1.
  return content.replace(
    /<iframe[^>]*src=["']https?:\/\/player\.vimeo\.com\/video\/(\d+)[^"']*["'][^>]*><\/iframe>/gi,
    '[[vimeo id=$1]]'
  );
}
/**
* Transform WordPress shortcodes [shortcode] to [[shortcode]]
*/
private transformShortcodes(content: string): string {
  // Re-emit capture groups 1 and 2 of WP_SHORTCODE_REGEX (defined at module
  // level) inside double square brackets.
  const transformed = content.replace(WP_SHORTCODE_REGEX, '[[$1$2]]');
  return transformed;
}
/**
* Resolve taxonomy items using the mapping
*/
private resolveTaxonomy(
  items: string[],
  mapping: Map<string, { resolved: string; needsCreation: boolean }>
): string[] {
  // Lookups are done on the lower-cased name. When the mapping has no entry,
  // the lower-cased name itself is the result.
  const resolveOne = (name: string): string => {
    const lookupKey = name.toLowerCase();
    const entry = mapping.get(lookupKey);
    if (!entry) {
      return lookupKey;
    }
    return entry.resolved;
  };
  return items.map(resolveOne);
}
/**
* Safely convert a value to a Date object.
* Handles Date objects, ISO strings (from JSON serialization), and null/undefined.
*/
private toDate(value: Date | string | null | undefined): Date | null {
  // null, undefined and the empty string all mean "no date".
  if (!value) return null;

  let candidate: Date;
  if (value instanceof Date) {
    // Existing Date objects are passed through as the same instance.
    candidate = value;
  } else if (typeof value === 'string') {
    candidate = new Date(value);
  } else {
    return null;
  }

  // An invalid Date reports NaN as its timestamp.
  return Number.isNaN(candidate.getTime()) ? null : candidate;
}
/**
* Calculate MD5 checksum of content
*/
private calculateChecksum(content: string): string {
  // Feed the string into an MD5 hasher and return the digest as hex.
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}
}