1182 lines
40 KiB
TypeScript
1182 lines
40 KiB
TypeScript
/**
|
|
* ImportExecutionEngine - Executes WXR import based on analysis results
|
|
*
|
|
* Handles the 4-phase import process:
|
|
* 1. Create new tags/categories
|
|
* 2. Import posts (handling conflicts correctly)
|
|
* 3. Import media (with post linkage)
|
|
* 4. Import pages (as posts with "page" category)
|
|
*/
|
|
|
|
import { EventEmitter } from 'events';
|
|
import { v4 as uuidv4 } from 'uuid';
|
|
import * as fs from 'fs/promises';
|
|
import * as path from 'path';
|
|
import * as crypto from 'crypto';
|
|
import matter from 'gray-matter';
|
|
import { app } from 'electron';
|
|
import TurndownService from 'turndown';
|
|
import { getDatabase } from '../database';
|
|
import { posts, media, NewPost, NewMedia } from '../database/schema';
|
|
import { eq } from 'drizzle-orm';
|
|
import type { TagEngine } from './TagEngine';
|
|
import type { PostEngine, PostData } from './PostEngine';
|
|
import type { MediaEngine, MediaData } from './MediaEngine';
|
|
import type { PostMediaEngine } from './PostMediaEngine';
|
|
import type {
|
|
ImportAnalysisReport,
|
|
AnalyzedPost,
|
|
AnalyzedMedia,
|
|
AnalyzedCategory,
|
|
AnalyzedTag,
|
|
ImportConflictResolution,
|
|
} from './ImportAnalysisEngine';
|
|
import type { WxrPost, WxrMedia } from './WxrParser';
|
|
|
|
/**
 * Caller-supplied settings for a single import run.
 */
export interface ImportExecutionOptions {
  /** Location on disk of the WordPress uploads folder that holds the media files. */
  uploadsFolder?: string;

  /** Author name applied when a WXR post or media item carries no author of its own. */
  defaultAuthor?: string;

  /** Receives the phase name, the current/total counters and an optional detail message. */
  onProgress?: (
    phase: string,
    current: number,
    total: number,
    detail?: string
  ) => void;
}
|
|
|
|
/**
 * Summary returned by an import run: per-phase counters, the WordPress-ID lookup table
 * built while importing, and the collected error messages.
 */
export interface ImportExecutionResult {
  /** Set to `false` when the run as a whole was aborted by an exception. */
  success: boolean;

  /** Tag and category counters (categories are created as tags). */
  tags: { created: number; skipped: number };

  /** Counters for regular posts. */
  posts: { imported: number; skipped: number; errors: number };

  /** Counters for media files. */
  media: { imported: number; skipped: number; errors: number };

  /** Counters for pages. */
  pages: { imported: number; skipped: number; errors: number };

  /** Mapping from WordPress post ID to our post GUID */
  wpIdToPostId: Map<number, string>;

  /** Human-readable error messages gathered during the run. */
  errors: string[];
}
|
|
|
|
// Regex to match WordPress shortcodes: [macroname ...] but NOT [[macroname ...]].
// - (?<!\[) / (?!\]) reject a bracket that is doubled on either side.
// - Group 1 captures the shortcode name (\w+); group 2 lazily captures everything up to
//   the closing bracket (the attribute text).
// - An optional trailing "/" (self-closing form) is consumed but not captured.
// NOTE(review): the pattern carries the `g` flag, so `lastIndex` is stateful when it is
// used with `.test()` / `.exec()` — confirm callers only use it with `replace`/`matchAll`.
|
|
const WP_SHORTCODE_REGEX = /(?<!\[)\[(\w+)([^\]]*?)(?:\s*\/)?\](?!\])/g;
|
|
|
|
/**
 * Collaborating engines handed to {@link ImportExecutionEngine} at construction time.
 */
export interface ImportExecutionDeps {
  /** Creates the tags (and category tags) during phase 1. */
  tagEngine: TagEngine;

  /** Supplies unique slugs and maintains the full-text search index for posts. */
  postEngine: PostEngine;

  /** Imports, replaces and updates media entries. */
  mediaEngine: MediaEngine;

  /** Records links between media entries and posts. */
  postMediaEngine: PostMediaEngine;
}
|
|
|
|
export class ImportExecutionEngine extends EventEmitter {
|
|
/** Project the import currently targets; changed through setProjectContext(). */
private currentProjectId = 'default';

/** Explicit base directory; when null the path is derived from Electron's userData folder. */
private dataDir: string | null = null;

/** HTML → Markdown converter, configured in the constructor. */
private turndown: TurndownService;

/** Base URL of the imported site, used for media URL conversion. */
private siteBaseUrl: string | null = null;

/** Injected collaborators. */
private readonly tagEngine: TagEngine;
private readonly postEngine: PostEngine;
private readonly mediaEngine: MediaEngine;
private readonly postMediaEngine: PostMediaEngine;
|
|
|
|
/**
 * Store the injected engines and configure the Turndown HTML → Markdown converter.
 *
 * On top of the base options (ATX headings, fenced code blocks, `-` bullets, `*` emphasis)
 * four custom rules are registered:
 * - `listItem`: single space after the list marker, continuation lines indented by two spaces.
 * - `imageWithTitle`: standalone images with an empty `alt` but a `title` use the title as alt text.
 * - `linkedImage`: `<a><img></a>` collapses into a single Markdown image.
 * - `flashEmbed`: Flash `<embed>` elements are replaced by a placeholder text.
 *
 * @param deps Engines used by the individual import phases.
 */
constructor(deps: ImportExecutionDeps) {
  super();
  this.tagEngine = deps.tagEngine;
  this.postEngine = deps.postEngine;
  this.mediaEngine = deps.mediaEngine;
  this.postMediaEngine = deps.postMediaEngine;
  this.turndown = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
    emDelimiter: '*',
  });

  // Custom rule for list items: use single space after marker instead of multiple spaces
  this.turndown.addRule('listItem', {
    filter: 'li',
    replacement: (content, node, options) => {
      content = content
        .replace(/^\n+/, '') // Remove leading newlines
        .replace(/\n+$/, '\n') // Replace trailing newlines with single newline
        // Indent subsequent lines with 2 spaces so they stay inside the list item
        // (matches the width of the "- " marker).
        .replace(/\n/gm, '\n  ');

      const parent = node.parentNode as HTMLElement;
      const isOrdered = parent?.nodeName === 'OL';
      let prefix = options.bulletListMarker + ' ';

      if (isOrdered) {
        // Number the item from the list's `start` attribute (default 1) plus its position.
        const start = parent.getAttribute('start');
        const index = Array.prototype.indexOf.call(parent.children, node);
        const startNum = start ? parseInt(start, 10) : 1;
        prefix = (startNum + index) + '. ';
      }

      // Terminate the item with a newline when another node follows and none is present yet.
      return prefix + content + (node.nextSibling && !/\n$/.test(content) ? '\n' : '');
    },
  });

  // Custom rule for standalone images with empty alt but title attribute
  // WordPress often uses title="name" with alt=""
  this.turndown.addRule('imageWithTitle', {
    filter: (node) => {
      if (node.nodeName !== 'IMG') return false;
      // Check if this image is NOT inside an <a> tag (those are handled by linkedImage rule)
      const parent = node.parentNode;
      if (parent?.nodeName === 'A') return false;
      // Only match if alt is empty but title exists
      const img = node as HTMLImageElement;
      const alt = img.getAttribute('alt') || '';
      const title = img.getAttribute('title') || '';
      return !alt.trim() && title.trim().length > 0;
    },
    replacement: (_content, node) => {
      const img = node as HTMLImageElement;
      const src = img.getAttribute('src') || '';
      const title = img.getAttribute('title') || '';
      // Emit the image with the title standing in for the missing alt text.
      return `![${title}](${src})`;
    },
  });

  // Custom rule for linked images: <a><img></a> -> ![alt](url)
  // This handles the common WordPress pattern of wrapping thumbnails in links to full-size images
  this.turndown.addRule('linkedImage', {
    filter: (node) => {
      // Match <a> tags that contain only an <img> (possibly with whitespace)
      if (node.nodeName !== 'A') return false;
      const children = Array.from(node.childNodes).filter(
        child => !(child.nodeType === 3 && !child.textContent?.trim())
      );
      return children.length === 1 && children[0].nodeName === 'IMG';
    },
    replacement: (_content, node) => {
      const anchor = node as HTMLAnchorElement;
      const img = anchor.querySelector('img');
      if (!img) return '';

      const href = anchor.getAttribute('href') || '';
      const imgSrc = img.getAttribute('src') || '';
      const imgAlt = img.getAttribute('alt') || '';
      const imgTitle = img.getAttribute('title') || '';

      // Check if the link href points to an image (common WordPress pattern for "click for larger")
      const imageExtensions = /\.(jpe?g|png|gif|webp|bmp|svg|tiff?)(\?.*)?$/i;
      const hrefIsImage = imageExtensions.test(href);

      // Determine which URL to use:
      // - If href is an image URL (WordPress "click for full-size" pattern), use the href
      // - Otherwise, use the original image src
      const imageUrl = hrefIsImage ? href : imgSrc;

      // Derive alt text: prefer alt, then title, then cleaned filename
      let altText = imgAlt.trim();
      if (!altText) {
        altText = imgTitle.trim();
      }
      if (!altText) {
        // Extract filename from the image URL as last resort
        const urlPath = imageUrl.split('?')[0]; // Remove query string
        const filename = urlPath.split('/').pop() || '';
        // Clean the filename: remove extension and replace underscores with spaces
        altText = filename.replace(/\.[^.]+$/, '').replace(/_/g, ' ');
      }

      // Build the markdown image link (without title attribute)
      return `![${altText}](${imageUrl})`;
    },
  });

  // Custom rule for Flash embeds - replace with placeholder text
  this.turndown.addRule('flashEmbed', {
    filter: (node) => {
      if (node.nodeName !== 'EMBED') return false;
      const embed = node as HTMLEmbedElement;
      const type = embed.getAttribute('type') || '';
      const src = embed.getAttribute('src') || '';
      // Match Flash content by type or file extension
      return type.toLowerCase().includes('flash') ||
        type.toLowerCase().includes('shockwave') ||
        src.toLowerCase().endsWith('.swf');
    },
    replacement: () => 'FLASH PLAYER NOT SUPPORTED',
  });
}
|
|
|
|
/**
 * Select the project that subsequent imports write into.
 *
 * @param projectId Identifier of the target project.
 * @param dataDir Optional explicit base directory; a missing or empty value resets it to null.
 */
setProjectContext(projectId: string, dataDir?: string): void {
  this.dataDir = dataDir ? dataDir : null;
  this.currentProjectId = projectId;
}
|
|
|
|
/**
 * @returns The identifier of the project currently targeted by the engine.
 */
getProjectContext(): string {
  const { currentProjectId } = this;
  return currentProjectId;
}
|
|
|
|
/**
 * Resolve the root directory of the current project.
 *
 * @returns The explicit data directory when one is set, otherwise
 *          `<userData>/projects/<projectId>`.
 */
private getBaseDir(): string {
  const explicitDir = this.dataDir;
  if (explicitDir) {
    return explicitDir;
  }
  return path.join(app.getPath('userData'), 'projects', this.currentProjectId);
}
|
|
|
|
/**
 * @returns The `posts` directory below the project root.
 */
private getPostsBaseDir(): string {
  const projectRoot = this.getBaseDir();
  return path.join(projectRoot, 'posts');
}
|
|
|
|
/**
 * @returns The `media` directory below the project root.
 */
private getMediaBaseDir(): string {
  const projectRoot = this.getBaseDir();
  return path.join(projectRoot, 'media');
}
|
|
|
|
/**
 * Build the dated posts directory (posts/YYYY/MM/) for the given date.
 *
 * @param date Date whose local year and month select the directory.
 * @returns Path of the form `<posts>/<year>/<two-digit month>`.
 */
private getPostsDirForDate(date: Date): string {
  const postsRoot = this.getPostsBaseDir();
  const yyyy = String(date.getFullYear());
  const mm = String(date.getMonth() + 1).padStart(2, '0');
  return path.join(postsRoot, yyyy, mm);
}
|
|
|
|
/**
 * Build the dated media directory (media/YYYY/MM/) for the given date.
 *
 * @param date Date whose local year and month select the directory.
 * @returns Path of the form `<media>/<year>/<two-digit month>`.
 */
private getMediaDirForDate(date: Date): string {
  const mediaRoot = this.getMediaBaseDir();
  const yyyy = String(date.getFullYear());
  const mm = String(date.getMonth() + 1).padStart(2, '0');
  return path.join(mediaRoot, yyyy, mm);
}
|
|
|
|
/**
 * Run the complete import: tags/categories, posts, media, then pages.
 *
 * Per-item failures are recorded by the individual phases. An exception that escapes a
 * phase aborts the remaining phases, sets `success` to false and is appended to `errors`.
 *
 * NOTE(review): media is imported before pages, so `wpIdToPostId` contains no page entries
 * while media parents are resolved — confirm whether attachments of pages should be linked.
 *
 * @param report Analysis report describing what the WXR file contains.
 * @param options Uploads folder, default author and optional progress callback.
 * @returns Counters, the WordPress-ID → post-ID map and the collected error messages.
 */
async executeImport(
  report: ImportAnalysisReport,
  options: ImportExecutionOptions
): Promise<ImportExecutionResult> {
  const outcome: ImportExecutionResult = {
    success: true,
    tags: { created: 0, skipped: 0 },
    posts: { imported: 0, skipped: 0, errors: 0 },
    media: { imported: 0, skipped: 0, errors: 0 },
    pages: { imported: 0, skipped: 0, errors: 0 },
    wpIdToPostId: new Map(),
    errors: [],
  };

  // Fall back to a no-op so the phases can report unconditionally.
  const notify = options.onProgress ? options.onProgress : () => {};

  // Remember the site URL; it is needed when media URLs are converted.
  this.siteBaseUrl = report.site.link ? report.site.link : null;

  try {
    const tagMapping = this.buildTaxonomyMapping(report.tags);
    const categoryMapping = this.buildTaxonomyMapping(report.categories);

    // Phase 1: tags and categories
    notify('tags', 0, report.tags.length + report.categories.length, 'Creating tags...');
    await this.executePhase1Tags(report, tagMapping, categoryMapping, outcome, notify);

    // Phase 2: posts
    notify('posts', 0, report.posts.items.length, 'Importing posts...');
    await this.executePhase2Posts(report, tagMapping, categoryMapping, outcome, options, notify);

    // Phase 3: media
    notify('media', 0, report.media.items.length, 'Importing media...');
    await this.executePhase3Media(report, outcome, options, notify);

    // Phase 4: pages
    notify('pages', 0, report.pages.items.length, 'Importing pages...');
    await this.executePhase4Pages(report, tagMapping, categoryMapping, outcome, options, notify);

    notify('complete', 1, 1, 'Import complete');
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    outcome.success = false;
    outcome.errors.push(message);
  }

  return outcome;
}
|
|
|
|
/**
 * Translate analyzed taxonomy items into a lookup keyed by the lower-cased original name.
 *
 * - An item with `mappedTo` resolves to that (lower-cased) name and is never created.
 * - An item that already exists in the project resolves to itself and is never created.
 * - Any other item resolves to itself and is flagged for creation.
 *
 * @param items Analyzed tags or categories.
 * @returns Map from lower-cased name to the resolved name and its creation flag.
 */
private buildTaxonomyMapping(
  items: Array<{ name: string; existsInProject: boolean; mappedTo?: string }>
): Map<string, { resolved: string; needsCreation: boolean }> {
  const lookup = new Map<string, { resolved: string; needsCreation: boolean }>();

  for (const { name, mappedTo, existsInProject } of items) {
    const key = name.toLowerCase();
    const entry = mappedTo
      ? { resolved: mappedTo.toLowerCase(), needsCreation: false }
      : { resolved: key, needsCreation: !existsInProject };
    lookup.set(key, entry);
  }

  return lookup;
}
|
|
|
|
/**
 * Phase 1: create every tag and category that is flagged for creation.
 *
 * Categories are created as tags. Items that need no creation, and items whose creation
 * throws (e.g. the tag already exists), are counted as skipped. Progress is reported only
 * after a successful creation.
 *
 * @param report Analysis report providing the tag and category lists.
 * @param tagMapping Resolution table for tags.
 * @param categoryMapping Resolution table for categories.
 * @param result Accumulator whose `tags` counters are updated.
 * @param progress Progress callback.
 */
private async executePhase1Tags(
  report: ImportAnalysisReport,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  progress: (phase: string, current: number, total: number, detail?: string) => void
): Promise<void> {
  const tagEngine = this.tagEngine;
  tagEngine.setProjectContext(this.currentProjectId);

  const total = report.tags.length + report.categories.length;
  let processed = 0;

  // Shared handling for one taxonomy item; `describe` builds the progress message.
  const createIfNeeded = async (
    name: string,
    lookup: Map<string, { resolved: string; needsCreation: boolean }>,
    describe: (resolved: string) => string
  ): Promise<void> => {
    processed++;
    const entry = lookup.get(name.toLowerCase());

    if (!entry?.needsCreation) {
      result.tags.skipped++;
      return;
    }

    try {
      await tagEngine.createTag({ name: entry.resolved });
      result.tags.created++;
      progress('tags', processed, total, describe(entry.resolved));
    } catch {
      // Tag might already exist (race condition or duplicate in list)
      result.tags.skipped++;
    }
  };

  for (const tag of report.tags) {
    await createIfNeeded(tag.name, tagMapping, resolved => `Created tag: ${resolved}`);
  }

  for (const category of report.categories) {
    await createIfNeeded(category.name, categoryMapping, resolved => `Created category tag: ${resolved}`);
  }
}
|
|
|
|
/**
 * Phase 2: import every item of post type `post`.
 *
 * Items of any other post type are counted as skipped up front. A failure while importing
 * one post is recorded and does not stop the remaining posts.
 *
 * @param report Analysis report providing the post items.
 * @param tagMapping Resolution table for tags.
 * @param categoryMapping Resolution table for categories.
 * @param result Accumulator whose `posts` counters and `errors` list are updated.
 * @param options Import options forwarded to the per-post import.
 * @param progress Progress callback, invoked once per processed post.
 */
private async executePhase2Posts(
  report: ImportAnalysisReport,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions,
  progress: (phase: string, current: number, total: number, detail?: string) => void
): Promise<void> {
  // Only real posts are imported; nav_menu_item, revision, etc. are left out.
  const postsToImport = report.posts.items.filter(item => item.wxrPost.postType === 'post');
  const total = postsToImport.length;

  // Everything filtered out above counts as skipped.
  result.posts.skipped += report.posts.items.length - total;

  let position = 0;
  for (const analyzed of postsToImport) {
    position++;
    progress('posts', position, total, `Processing: ${analyzed.wxrPost.title}`);

    try {
      const wasImported = await this.importPost(analyzed, tagMapping, categoryMapping, result, options);
      if (wasImported) {
        result.posts.imported++;
      } else {
        result.posts.skipped++;
      }
    } catch (error) {
      result.posts.errors++;
      result.errors.push(`Failed to import post "${analyzed.wxrPost.title}": ${error instanceof Error ? error.message : String(error)}`);
    }
  }
}
|
|
|
|
/**
 * Import one analyzed post according to its analysis status.
 *
 * - `content-duplicate` and `update` are skipped.
 * - `conflict` follows the chosen resolution (`ignore` when none is set).
 * - Any other status is treated as a new post and created as published.
 *
 * @returns `true` when a post was created or updated, `false` when it was skipped.
 */
private async importPost(
  analyzed: AnalyzedPost,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  switch (analyzed.status) {
    case 'content-duplicate':
    case 'update':
      // Same content already exists; nothing to do.
      return false;

    case 'conflict': {
      const resolution = analyzed.conflictResolution || 'ignore';
      if (resolution === 'ignore') {
        return false;
      }
      // Overwrite and import are handled by the conflict path.
      return await this.importPostWithConflict(analyzed, resolution, tagMapping, categoryMapping, result, options);
    }

    default:
      // New post
      return await this.createImportedPost(analyzed, tagMapping, categoryMapping, result, options, 'published');
  }
}
|
|
|
|
/**
 * Import a post whose slug conflicts with an existing post.
 *
 * - `overwrite`: update the existing post; when no existing post ID is known, create a
 *   new draft instead.
 * - `import`: create a published post under a freshly generated unique slug.
 * - Anything else: skip.
 *
 * @returns `true` when a post was created or updated, `false` otherwise.
 */
private async importPostWithConflict(
  analyzed: AnalyzedPost,
  resolution: ImportConflictResolution,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  switch (resolution) {
    case 'overwrite': {
      const existingId = analyzed.existingPost?.id;
      if (existingId) {
        // Replace the existing post's content; it becomes a draft for review.
        return await this.updateExistingPost(analyzed, existingId, tagMapping, categoryMapping, result, options);
      }
      // Fallback: nothing to overwrite, so create a new draft.
      return await this.createImportedPost(analyzed, tagMapping, categoryMapping, result, options, 'draft');
    }

    case 'import': {
      // Keep both posts by giving the imported one a new unique slug.
      const uniqueSlug = await this.postEngine.generateUniqueSlug(analyzed.wxrPost.title);
      return await this.createImportedPost(analyzed, tagMapping, categoryMapping, result, options, 'published', uniqueSlug);
    }

    default:
      return false;
  }
}
|
|
|
|
/**
 * Overwrite an existing post with imported content (conflict resolution `overwrite`).
 *
 * The post is switched to draft status with its content stored in the database and
 * `publishedAt` cleared, so the user can review it before publishing again. The full-text
 * index is refreshed and the WordPress ID is mapped to the existing post ID.
 *
 * @param analyzed Analyzed post carrying the WXR data.
 * @param existingPostId ID of the post to overwrite.
 * @returns Always `true`; failures surface as exceptions.
 */
private async updateExistingPost(
  analyzed: AnalyzedPost,
  existingPostId: string,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  const { wxrPost } = analyzed;
  const db = getDatabase().getLocal();

  // Vimeo iframes and shortcodes must be rewritten BEFORE the Markdown conversion.
  const preparedHtml = this.transformShortcodes(this.convertVimeoIframes(wxrPost.content));

  // HTML → Markdown, then absolute site media URLs → relative paths.
  const markdown = this.convertMediaUrlsToRelative(this.convertToMarkdown(preparedHtml));

  const tags = this.resolveTaxonomy(wxrPost.tags, tagMapping);
  const categories = this.resolveTaxonomy(wxrPost.categories, categoryMapping);
  const checksum = this.calculateChecksum(markdown);

  await db.update(posts)
    .set({
      title: wxrPost.title,
      excerpt: wxrPost.excerpt || null,
      content: markdown, // drafts keep their content in the database
      status: 'draft',
      author: wxrPost.creator || options.defaultAuthor || null,
      updatedAt: new Date(),
      publishedAt: null, // no longer published
      checksum,
      tags: JSON.stringify(tags),
      categories: JSON.stringify(categories),
    })
    .where(eq(posts.id, existingPostId));

  // Keep the full-text index in step with the new content.
  await this.postEngine.updateFTSIndex({
    id: existingPostId,
    projectId: this.currentProjectId,
    title: wxrPost.title,
    content: markdown,
    excerpt: wxrPost.excerpt || undefined,
    tags,
    categories,
  });

  // The WordPress ID now points at the existing post.
  result.wpIdToPostId.set(wxrPost.wpId, existingPostId);

  return true;
}
|
|
|
|
/**
 * Create a new post from an analyzed WXR post.
 *
 * The content is converted to Markdown, taxonomy names are resolved, and the post is
 * inserted into the database. Published posts are written to a Markdown file first and
 * store no content in the database; drafts store their content in the database and get
 * no file. The full-text index is updated and the WordPress ID is mapped to the new ID.
 *
 * @param status Status to create the post with.
 * @param overrideSlug Slug to use instead of the one from the WXR post.
 * @returns Always `true`; failures surface as exceptions.
 */
private async createImportedPost(
  analyzed: AnalyzedPost,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions,
  status: 'draft' | 'published',
  overrideSlug?: string
): Promise<boolean> {
  const { wxrPost } = analyzed;
  const db = getDatabase().getLocal();

  // Vimeo iframes and shortcodes must be rewritten BEFORE the Markdown conversion
  // (TurndownService escapes brackets, so shortcodes have to be transformed first).
  const preparedHtml = this.transformShortcodes(this.convertVimeoIframes(wxrPost.content));

  // HTML → Markdown, then absolute site media URLs → relative paths.
  const markdown = this.convertMediaUrlsToRelative(this.convertToMarkdown(preparedHtml));

  const tags = this.resolveTaxonomy(wxrPost.tags, tagMapping);
  const categories = this.resolveTaxonomy(wxrPost.categories, categoryMapping);

  // Dates may arrive as strings after JSON serialization through IPC.
  const createdAt = this.toDate(wxrPost.postDate) || this.toDate(wxrPost.pubDate) || new Date();
  const updatedAt = this.toDate(wxrPost.postModified) || createdAt;
  const publishedAt = status === 'published' ? (this.toDate(wxrPost.pubDate) || createdAt) : undefined;

  const postId = uuidv4();

  const postData: PostData = {
    id: postId,
    projectId: this.currentProjectId,
    title: wxrPost.title,
    slug: overrideSlug || wxrPost.slug,
    excerpt: wxrPost.excerpt || undefined,
    content: markdown,
    status,
    author: wxrPost.creator || options.defaultAuthor || undefined,
    createdAt,
    updatedAt,
    publishedAt,
    tags,
    categories,
  };

  // Published posts live on disk; the file is written before the database row.
  const filePath = status === 'published' ? await this.writePostFile(postData) : '';

  const checksum = this.calculateChecksum(markdown);

  const row: NewPost = {
    id: postData.id,
    projectId: postData.projectId,
    title: postData.title,
    slug: postData.slug,
    excerpt: postData.excerpt,
    content: status === 'draft' ? postData.content : null, // Draft content in DB, published in file
    status: postData.status,
    author: postData.author,
    createdAt: postData.createdAt,
    updatedAt: postData.updatedAt,
    publishedAt: postData.publishedAt,
    filePath,
    checksum,
    tags: JSON.stringify(postData.tags),
    categories: JSON.stringify(postData.categories),
  };

  await db.insert(posts).values(row);

  // Make the new post searchable.
  await this.postEngine.updateFTSIndex(postData);

  // Remember which post this WordPress ID became.
  result.wpIdToPostId.set(wxrPost.wpId, postId);

  return true;
}
|
|
|
|
/**
 * Write a post to `<posts>/YYYY/MM/<slug>.md` as Markdown with front matter.
 *
 * The directory is chosen from the post's creation date and created when missing.
 * Optional fields (excerpt, author, publishedAt) are only added to the front matter
 * when they are set.
 *
 * The slug originates from the imported file, so the target path is checked to lie
 * directly inside the dated posts directory before anything is written.
 *
 * @param post Post data to serialize.
 * @returns The path of the written file.
 * @throws Error when the slug would place the file outside the dated posts directory
 *         (for example because it contains path separators or `..` segments).
 */
private async writePostFile(post: PostData): Promise<string> {
  const metadata: Record<string, unknown> = {
    id: post.id,
    projectId: post.projectId,
    title: post.title,
    slug: post.slug,
    status: post.status,
    createdAt: post.createdAt.toISOString(),
    updatedAt: post.updatedAt.toISOString(),
    tags: post.tags,
    categories: post.categories,
  };

  if (post.excerpt) metadata.excerpt = post.excerpt;
  if (post.author) metadata.author = post.author;
  if (post.publishedAt) metadata.publishedAt = post.publishedAt.toISOString();

  const postsDir = this.getPostsDirForDate(post.createdAt);
  const filePath = path.join(postsDir, `${post.slug}.md`);

  // Refuse slugs that escape (or nest below) the dated directory.
  if (path.dirname(path.resolve(filePath)) !== path.resolve(postsDir)) {
    throw new Error(`Invalid post slug "${post.slug}": it must not contain path separators`);
  }

  await fs.mkdir(postsDir, { recursive: true });

  const fileContent = matter.stringify(post.content, metadata);
  await fs.writeFile(filePath, fileContent, 'utf-8');
  return filePath;
}
|
|
|
|
/**
 * Phase 3: import every media item of the report.
 *
 * A failure while importing one file is recorded and does not stop the remaining files.
 *
 * @param report Analysis report providing the media items.
 * @param result Accumulator whose `media` counters and `errors` list are updated.
 * @param options Import options forwarded to the per-file import.
 * @param progress Progress callback, invoked once per processed file.
 */
private async executePhase3Media(
  report: ImportAnalysisReport,
  result: ImportExecutionResult,
  options: ImportExecutionOptions,
  progress: (phase: string, current: number, total: number, detail?: string) => void
): Promise<void> {
  const items = report.media.items;
  const total = items.length;

  let position = 0;
  for (const analyzed of items) {
    position++;
    progress('media', position, total, `Processing: ${analyzed.wxrMedia.filename}`);

    try {
      const wasImported = await this.importMediaFile(analyzed, result, options);
      if (wasImported) {
        result.media.imported++;
      } else {
        result.media.skipped++;
      }
    } catch (error) {
      result.media.errors++;
      result.errors.push(`Failed to import media "${analyzed.wxrMedia.filename}": ${error instanceof Error ? error.message : String(error)}`);
    }
  }
}
|
|
|
|
/**
 * Import one analyzed media file.
 *
 * Skipped (returns `false`) when the status is `missing`, `content-duplicate` or `update`,
 * when a conflict is resolved as `ignore`, when no uploads folder is configured, or when
 * the source file cannot be accessed. A conflict resolved as `overwrite` with a known
 * existing media ID updates that entry; every other case creates a new media entry and
 * links it to its parent post when that post was imported in this run.
 *
 * @returns `true` when a media entry was created or updated.
 */
private async importMediaFile(
  analyzed: AnalyzedMedia,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  const { wxrMedia, status } = analyzed;

  // Nothing to import for missing files or content that already exists.
  if (status === 'missing' || status === 'content-duplicate' || status === 'update') {
    return false;
  }

  if (status === 'conflict') {
    const resolution = analyzed.conflictResolution || 'ignore';
    if (resolution === 'ignore') {
      return false;
    }

    // 'overwrite' replaces the existing entry; 'import' continues below and creates a new one.
    if (resolution === 'overwrite' && analyzed.existingMedia?.id) {
      return await this.updateExistingMedia(analyzed, analyzed.existingMedia.id, result, options);
    }
  }

  // Without an uploads folder there is no file to copy.
  if (!options.uploadsFolder) {
    return false;
  }

  const sourcePath = path.join(options.uploadsFolder, wxrMedia.relativePath);

  // The source file has to be reachable.
  try {
    await fs.access(sourcePath);
  } catch {
    return false;
  }

  // Translate the WordPress parent ID into the ID of the imported post, if there is one.
  const linkedPostIds: string[] = [];
  const parentWpId = wxrMedia.parentId;
  if (parentWpId && parentWpId > 0) {
    const parentPostId = result.wpIdToPostId.get(parentWpId);
    if (parentPostId) {
      linkedPostIds.push(parentPostId);
    }
  }

  // The WXR date may be a string after JSON serialization.
  const createdAt = this.toDate(wxrMedia.pubDate) || new Date();

  const importedMedia = await this.mediaEngine.importMedia(sourcePath, {
    title: wxrMedia.title || undefined,
    alt: wxrMedia.description || undefined,
    mimeType: wxrMedia.mimeType,
    author: options.defaultAuthor,
    tags: [],
    linkedPostIds,
    createdAt,
    updatedAt: createdAt,
  });

  // Record the links in the postMedia table as well.
  if (linkedPostIds.length > 0) {
    const postMediaEngine = this.postMediaEngine;
    postMediaEngine.setProjectContext(this.currentProjectId);
    for (const postId of linkedPostIds) {
      await postMediaEngine.linkMediaToPost(postId, importedMedia.id);
    }
  }

  return true;
}
|
|
|
|
/**
 * Overwrite an existing media entry with the imported file (conflict resolution `overwrite`).
 *
 * The file is replaced through the media engine, the metadata (title, alt, author) is
 * updated, and the entry is linked to its parent post when that post was imported in
 * this run.
 *
 * @param existingMediaId ID of the media entry to overwrite.
 * @returns `false` when no uploads folder is configured or the source file cannot be
 *          accessed, otherwise `true`.
 */
private async updateExistingMedia(
  analyzed: AnalyzedMedia,
  existingMediaId: string,
  result: ImportExecutionResult,
  options: ImportExecutionOptions
): Promise<boolean> {
  const { wxrMedia } = analyzed;

  // Without an uploads folder there is no file to copy.
  if (!options.uploadsFolder) {
    return false;
  }

  const sourcePath = path.join(options.uploadsFolder, wxrMedia.relativePath);

  // The source file has to be reachable.
  try {
    await fs.access(sourcePath);
  } catch {
    return false;
  }

  // Swap the file first, then refresh the descriptive metadata.
  await this.mediaEngine.replaceMediaFile(existingMediaId, sourcePath);
  await this.mediaEngine.updateMedia(existingMediaId, {
    title: wxrMedia.title || undefined,
    alt: wxrMedia.description || undefined,
    author: options.defaultAuthor,
  });

  // Translate the WordPress parent ID into the ID of the imported post, if there is one.
  const linkedPostIds: string[] = [];
  const parentWpId = wxrMedia.parentId;
  if (parentWpId && parentWpId > 0) {
    const parentPostId = result.wpIdToPostId.get(parentWpId);
    if (parentPostId) {
      linkedPostIds.push(parentPostId);
    }
  }

  // Record the links in the postMedia table.
  if (linkedPostIds.length > 0) {
    const postMediaEngine = this.postMediaEngine;
    postMediaEngine.setProjectContext(this.currentProjectId);
    for (const postId of linkedPostIds) {
      await postMediaEngine.linkMediaToPost(postId, existingMediaId);
    }
  }

  return true;
}
|
|
|
|
/**
 * Phase 4: import pages as posts that carry an additional "page" category.
 *
 * Each page is copied with "page" appended to its categories and then imported through
 * the regular post path. A failure while importing one page is recorded and does not
 * stop the remaining pages.
 *
 * NOTE(review): "page" is added to the category mapping with `needsCreation: false` and
 * no tag is created here — confirm that a "page" tag exists or is not required.
 *
 * @param report Analysis report providing the page items.
 * @param tagMapping Resolution table for tags.
 * @param categoryMapping Resolution table for categories; gains a "page" entry if missing.
 * @param result Accumulator whose `pages` counters and `errors` list are updated.
 * @param options Import options forwarded to the per-post import.
 * @param progress Progress callback, invoked once per processed page.
 */
private async executePhase4Pages(
  report: ImportAnalysisReport,
  tagMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  categoryMapping: Map<string, { resolved: string; needsCreation: boolean }>,
  result: ImportExecutionResult,
  options: ImportExecutionOptions,
  progress: (phase: string, current: number, total: number, detail?: string) => void
): Promise<void> {
  const pages = report.pages.items;
  const total = pages.length;

  // The extra category has to be resolvable.
  if (!categoryMapping.has('page')) {
    categoryMapping.set('page', { resolved: 'page', needsCreation: false });
  }

  let position = 0;
  for (const analyzed of pages) {
    position++;
    const wxrPage = analyzed.wxrPost;

    // Work on a copy so the report itself stays untouched.
    const pageAsPost: WxrPost = {
      ...wxrPage,
      categories: [...wxrPage.categories, 'page'],
    };
    const pageAnalyzed: AnalyzedPost = { ...analyzed, wxrPost: pageAsPost };

    progress('pages', position, total, `Processing: ${wxrPage.title}`);

    try {
      const wasImported = await this.importPost(pageAnalyzed, tagMapping, categoryMapping, result, options);
      if (wasImported) {
        result.pages.imported++;
      } else {
        result.pages.skipped++;
      }
    } catch (error) {
      result.pages.errors++;
      result.errors.push(`Failed to import page "${wxrPage.title}": ${error instanceof Error ? error.message : String(error)}`);
    }
  }
}
|
|
|
|
/**
 * Convert HTML to Markdown using Turndown, with pre- and post-processing.
 *
 * Before conversion, multi-line `<code>` is wrapped and line breaks are preserved (in
 * that order). Afterwards the Markdown is cleaned up: escaped `[[`/`]]` are restored,
 * backslash escapes inside `[[...]]` are removed, non-breaking spaces become regular
 * spaces, trailing whitespace is trimmed (keeping "> " on blockquote continuation lines),
 * and runs of three or more newlines collapse to two.
 *
 * @param html HTML content; empty or whitespace-only input yields an empty string.
 * @returns The resulting Markdown.
 */
private convertToMarkdown(html: string): string {
  if (!html || !html.trim()) return '';

  // Code wrapping has to run BEFORE line-break preservation, otherwise newlines inside
  // code would turn into <br> and Turndown would not emit fenced code blocks.
  const prepared = this.preserveLineBreaks(this.wrapMultilineCode(html));

  const cleaned = this.turndown
    .turndown(prepared)
    // Turndown escapes brackets: \[\[ becomes [[ and \]\] becomes ]]
    .replace(/\\\[\\\[/g, '[[')
    .replace(/\\\]\\\]/g, ']]')
    // Drop backslash escapes inside [[macro]] blocks (e.g. photo\_archive → photo_archive)
    .replace(/\[\[([^\]]*?)\]\]/g, (_match, inner: string) => '[[' + inner.replace(/\\(.)/g, '$1') + ']]')
    // Non-breaking spaces → regular spaces
    .replace(/\u00A0/g, ' ');

  // Trim trailing whitespace per line, but keep "> " so blockquotes continue.
  const lines = cleaned.split('\n').map(line => {
    const trimmed = line.trimEnd();
    return trimmed === '>' && line.startsWith('> ') ? '> ' : trimmed;
  });

  // 3+ consecutive newlines → 2 newlines
  return lines.join('\n').replace(/\n{3,}/g, '\n\n');
}
|
|
|
|
/**
 * Preserve line breaks and paragraph structure in content.
 *
 * WordPress exports often contain plain text mixed with HTML, where a blank line
 * stands for a paragraph break and a single newline stands for a line break.
 *
 * Behavior:
 * - `<pre ...>...</pre>` blocks (with or without attributes, any letter case) are
 *   swapped for placeholders first and restored verbatim at the end, so their
 *   newlines are never rewritten.
 * - Content that starts with plain text: runs of 2+ newlines become `</p>\n<p>`,
 *   single newlines in text between tags and in the leading text become `<br>`,
 *   and the whole result is wrapped in `<p>...</p>` if it contains `</p>`.
 * - Content that starts with a tag: inside each text run between tags, and in the
 *   text after the last tag, runs of 2+ newlines become `</p><p>` and remaining
 *   single newlines become `<br>`.
 * - Whitespace-only text runs are left untouched.
 *
 * @param html HTML (or mixed text/HTML) fragment.
 * @returns The fragment with newlines converted to markup; empty or
 *          whitespace-only input is returned unchanged.
 */
private preserveLineBreaks(html: string): string {
  if (!html || !html.trim()) return html;

  // Decided on the raw input, before <pre> blocks are swapped for placeholders.
  const startsWithTag = /^\s*</.test(html);

  // Protect <pre> blocks. Attributes are accepted because WordPress commonly emits
  // e.g. <pre class="wp-block-code">; the \s after "pre" keeps other tag names
  // that merely start with "pre" from matching.
  const preBlocks: string[] = [];
  const protectedHtml = html.replace(/<pre(?:\s[^>]*)?>[\s\S]*?<\/pre>/gi, (block: string) => {
    const placeholder = `__PRE_BLOCK_${preBlocks.length}__`;
    preBlocks.push(block);
    return placeholder;
  });

  // Put the protected blocks back. A replacer FUNCTION is required here: with a
  // replacement STRING, sequences such as "$$", "$&", "$`" and "$'" inside the
  // code (frequent in shell snippets) would be expanded and corrupt the block.
  const restorePreBlocks = (text: string): string =>
    preBlocks.reduce(
      (restored, block, index) => restored.replace(`__PRE_BLOCK_${index}__`, () => block),
      text
    );

  const singleNewlinesToBr = (text: string): string => text.replace(/\n/g, '<br>');
  const newlinesToMarkup = (text: string): string =>
    singleNewlinesToBr(text.replace(/\n\n+/g, '</p><p>'));

  if (!startsWithTag) {
    // Paragraph markers first, so only single newlines are left for <br>.
    let processed = protectedHtml.replace(/\n\n+/g, '</p>\n<p>');

    // Text between tags (whitespace-only runs between tags are kept as-is).
    processed = processed.replace(/>([^<]+)</g, (match: string, text: string) =>
      text.trim() ? '>' + singleNewlinesToBr(text) + '<' : match
    );

    // Leading text before the first tag.
    processed = processed.replace(/^([^<]+)/g, (match: string, text: string) =>
      text.trim() ? singleNewlinesToBr(text) : match
    );

    // Close the paragraph structure if any paragraph marker is present.
    if (processed.includes('</p>')) {
      processed = '<p>' + processed + '</p>';
    }

    return restorePreBlocks(processed);
  }

  // Content that starts with HTML: text between tags...
  let result = protectedHtml.replace(/>([^<]+)</g, (match: string, text: string) =>
    text.trim() ? '>' + newlinesToMarkup(text) + '<' : match
  );

  // ...and text after the last tag, running to the end of the content.
  result = result.replace(/>([^<]+)$/g, (match: string, text: string) =>
    text.trim() ? '>' + newlinesToMarkup(text) : match
  );

  return restorePreBlocks(result);
}
|
|
|
|
/**
 * Wrap multi-line `<code>` elements in `<pre>` tags.
 *
 * WordPress content sometimes uses `<code>...</code>` for multi-line code without
 * a `<pre>` wrapper. The `<pre>` wrapper is added so the block is handled as
 * preformatted code by the later conversion steps.
 *
 * What is matched:
 * - only attribute-less, lowercase `<code>...</code>` elements;
 * - only those whose content contains a literal newline; inline code is left as-is.
 *
 * NOTE: no look-behind is performed, so a multi-line `<code>` that already sits
 * inside a `<pre>` is wrapped as well (producing a nested `<pre>`).
 *
 * @param html HTML fragment; falsy input is returned unchanged.
 * @returns The fragment with multi-line `<code>` elements wrapped in `<pre>`.
 */
private wrapMultilineCode(html: string): string {
  if (!html) {
    return html;
  }

  // Captures the element body, including any embedded HTML.
  const codeElement = /<code>([\s\S]*?)<\/code>/g;

  return html.replace(codeElement, (element: string, body: string) => {
    const isMultiline = body.indexOf('\n') !== -1;
    return isMultiline ? '<pre><code>' + body + '</code></pre>' : element;
  });
}
|
|
|
|
/**
 * Rewrite absolute upload URLs of the imported site as relative media paths.
 *
 * Example:
 *   https://site.com/wp-content/uploads/2022/11/image.jpg
 * becomes:
 *   media/2022/11/image.jpg
 *
 * Only URLs whose host part equals that of `this.siteBaseUrl` and whose path starts
 * with `/wp-content/uploads/` are rewritten. Both `http://` and `https://` forms
 * are matched, case-insensitively, whatever protocol the site URL itself uses.
 * URLs on other hosts, and URLs under other `wp-content` folders (themes,
 * plugins), do not match the pattern and are left untouched.
 *
 * @param markdown Markdown text to scan.
 * @returns The text with matching URLs rewritten; returned unchanged when
 *          `markdown` or the site URL is empty, or when the site URL is not http(s).
 */
private convertMediaUrlsToRelative(markdown: string): string {
  if (!this.siteBaseUrl || !markdown) {
    return markdown;
  }

  // Drop one trailing slash, then split off the protocol.
  const trimmedSiteUrl = this.siteBaseUrl.replace(/\/$/, '');
  const protocolSplit = /^https?:\/\/(.+)$/.exec(trimmedSiteUrl);
  if (protocolSplit === null) {
    return markdown;
  }

  // Everything after the protocol, escaped for literal use inside a RegExp.
  const hostPattern = protocolSplit[1].replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // http(s)://{host}/wp-content/uploads/{path}; the path ends at whitespace, ')', '"' or "'".
  const uploadsUrl = new RegExp(
    `https?://${hostPattern}/wp-content/uploads/([^\\s)"']+)`,
    'gi'
  );

  return markdown.replace(uploadsUrl, (_url: string, uploadPath: string) => 'media/' + uploadPath);
}
|
|
|
|
/**
 * Replace Vimeo player iframes with `[[vimeo id=...]]` macros.
 *
 * An `<iframe ...></iframe>` whose `src` is `http(s)://player.vimeo.com/video/<digits>`
 * (optionally followed by more URL characters) is replaced by `[[vimeo id=<digits>]]`.
 * Matching is case-insensitive and applies to every occurrence.
 *
 * @param content Text that may contain Vimeo iframe embeds.
 * @returns The text with each matching iframe replaced by its macro.
 */
private convertVimeoIframes(content: string): string {
  return content.replace(
    /<iframe[^>]*src=["']https?:\/\/player\.vimeo\.com\/video\/(\d+)[^"']*["'][^>]*><\/iframe>/gi,
    (_iframe: string, videoId: string) => `[[vimeo id=${videoId}]]`
  );
}
|
|
|
|
/**
 * Rewrite WordPress shortcodes `[shortcode]` as double-bracket macros `[[shortcode]]`.
 *
 * Every match of `WP_SHORTCODE_REGEX` is replaced by its first two capture groups
 * wrapped in `[[` and `]]`.
 * NOTE(review): the regex is defined elsewhere in this file; presumably group 1 is
 * the shortcode name and group 2 its attributes — confirm against its definition.
 *
 * @param content Text that may contain WordPress shortcodes.
 * @returns The text with matching shortcodes rewritten.
 */
private transformShortcodes(content: string): string {
  const macroTemplate = '[[$1$2]]';
  return content.replace(WP_SHORTCODE_REGEX, macroTemplate);
}
|
|
|
|
/**
 * Map taxonomy names (tags or categories) through a resolution table.
 *
 * Each item is lower-cased and looked up in `mapping`. When an entry exists its
 * `resolved` value is used; otherwise the lower-cased item itself is returned.
 *
 * @param items Taxonomy names to resolve.
 * @param mapping Lookup table keyed by lower-cased name.
 * @returns One resolved name per input item, in the same order.
 */
private resolveTaxonomy(
  items: string[],
  mapping: Map<string, { resolved: string; needsCreation: boolean }>
): string[] {
  const resolvedNames: string[] = [];
  for (const name of items) {
    const lookupKey = name.toLowerCase();
    const entry = mapping.get(lookupKey);
    resolvedNames.push(entry ? entry.resolved : lookupKey);
  }
  return resolvedNames;
}
|
|
|
|
/**
 * Safely coerce a value to a valid `Date`.
 *
 * Accepts `Date` instances and date strings (e.g. ISO strings produced by JSON
 * serialization). Anything else — null, undefined, an empty string, or any other
 * type at runtime — yields null, as does a date whose time value is NaN.
 *
 * @param value The value to convert.
 * @returns A valid `Date` (the same instance when a `Date` was passed), or null.
 */
private toDate(value: Date | string | null | undefined): Date | null {
  let candidate: Date;

  if (value instanceof Date) {
    candidate = value;
  } else if (typeof value === 'string' && value !== '') {
    candidate = new Date(value);
  } else {
    return null;
  }

  // An unparseable string or an invalid Date has a NaN time value.
  return Number.isNaN(candidate.getTime()) ? null : candidate;
}
|
|
|
|
/**
 * Compute the MD5 checksum of a string.
 *
 * @param content Text to hash.
 * @returns The MD5 digest as a hexadecimal string.
 */
private calculateChecksum(content: string): string {
  const hasher = crypto.createHash('md5');
  hasher.update(content);
  return hasher.digest('hex');
}
|
|
}
|
|
|