feat: importer starting point

This commit is contained in:
2026-02-13 13:07:44 +01:00
parent deb0f3ae3b
commit d88fb1d9fa
19 changed files with 2666 additions and 10 deletions

View File

@@ -0,0 +1,331 @@
import crypto from 'crypto';
import * as fs from 'fs/promises';
import * as path from 'path';
import TurndownService from 'turndown';
import { getDatabase } from '../database';
import { posts, media, tags } from '../database/schema';
import { eq } from 'drizzle-orm';
import type { WxrData, WxrPost, WxrMedia, WxrSiteInfo, WxrCategory, WxrTag } from './WxrParser';
/**
 * How an imported post or page relates to the posts already stored for the project:
 * - `new`: no existing post shares its slug or its content checksum.
 * - `update`: an existing post has the same slug and the same checksum.
 * - `conflict`: an existing post has the same slug but a different checksum.
 * - `content-duplicate`: no slug match, but an existing post has the same checksum.
 */
export type PostAnalysisStatus = 'new' | 'update' | 'conflict' | 'content-duplicate';
/**
 * Same classification for media, where the match is by lower-cased file name and
 * by file checksum. `missing` is used when the file could not be read from the
 * uploads folder, including when no uploads folder was supplied.
 */
export type MediaAnalysisStatus = 'new' | 'update' | 'conflict' | 'content-duplicate' | 'missing';
/** Analysis result for a single imported post or page. */
export interface AnalyzedPost {
/** The parsed WXR item this result describes. */
wxrPost: WxrPost;
/** How the item relates to the posts already in the project. */
status: PostAnalysisStatus;
/** MD5 hex digest of the item's content after conversion to Markdown. */
contentHash: string;
/** First 200 characters of the converted Markdown. */
markdownPreview: string;
/** The existing post matched by slug or by checksum; left undefined when status is 'new'. */
existingPost?: {
id: string;
title: string;
slug: string;
checksum: string | null;
};
}
/** Analysis result for a single imported media attachment. */
export interface AnalyzedMedia {
/** The parsed WXR attachment this result describes. */
wxrMedia: WxrMedia;
/** How the attachment relates to the media already in the project. */
status: MediaAnalysisStatus;
/** MD5-based checksum of the file read from the uploads folder; null when status is 'missing'. */
fileHash: string | null;
/** The existing media matched by file name or by checksum; left undefined for 'new' and 'missing'. */
existingMedia?: {
id: string;
originalName: string;
checksum: string | null;
};
}
/** A category declared in the WXR file, annotated with whether the project already knows it. */
export interface AnalyzedCategory {
name: string;
slug: string;
/** True when a project tag with the same name (compared case-insensitively) exists. */
existsInProject: boolean;
}
/** A tag declared in the WXR file, annotated with whether the project already knows it. */
export interface AnalyzedTag {
name: string;
slug: string;
/** True when a project tag with the same name (compared case-insensitively) exists. */
existsInProject: boolean;
}
/**
 * Complete result of analyzing one WXR file against a project.
 * Each section carries per-status counts plus the individual analyzed items.
 */
export interface ImportAnalysisReport {
/** The source file identifier that was passed to the analysis. */
sourceFile: string;
/** Site metadata taken from the parsed WXR data. */
site: WxrSiteInfo;
/** Time at which the report was assembled. */
analyzedAt: Date;
/** Summary for the items the parser returned as posts. */
posts: {
total: number;
new: number;
updates: number;
conflicts: number;
contentDuplicates: number;
items: AnalyzedPost[];
};
/** Summary for the items the parser returned as pages. */
pages: {
total: number;
new: number;
updates: number;
conflicts: number;
contentDuplicates: number;
items: AnalyzedPost[];
};
/** Summary for media attachments; `missing` counts files that could not be read. */
media: {
total: number;
new: number;
updates: number;
conflicts: number;
contentDuplicates: number;
missing: number;
items: AnalyzedMedia[];
};
/** Every category declared in the WXR file. */
categories: AnalyzedCategory[];
/** Every tag declared in the WXR file. */
tags: AnalyzedTag[];
}
/**
 * Compares the contents of a parsed WordPress export (WXR) with the posts,
 * media and tags already stored for a project and classifies every imported
 * item. The engine only reads (database selects and uploads-folder files);
 * it never writes.
 */
export class ImportAnalysisEngine {
  /** Project whose rows are compared against; stays '' until setProjectContext() is called. */
  private currentProjectId: string = '';
  /** HTML-to-Markdown converter used before hashing post content. */
  private turndown: TurndownService;

  constructor() {
    this.turndown = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
      bulletListMarker: '-',
    });
  }

  /** Selects the project whose existing posts, media and tags are used for comparison. */
  setProjectContext(projectId: string): void {
    this.currentProjectId = projectId;
  }

  /**
   * Analyzes parsed WXR data against the current project.
   *
   * @param wxrData Parsed export data.
   * @param sourceFile Identifier of the export file; copied into the report unchanged.
   * @param uploadsFolder Optional folder holding the exported media files. Without it
   *   every media item is reported as 'missing'.
   * @returns The assembled report with per-status counts and per-item results.
   */
  async analyzeWxr(wxrData: WxrData, sourceFile: string, uploadsFolder?: string): Promise<ImportAnalysisReport> {
    const db = getDatabase().getLocal();
    const projectId = this.currentProjectId;

    // Existing rows of the project, limited to the columns the comparison needs.
    const existingPosts = await db
      .select({
        id: posts.id,
        slug: posts.slug,
        title: posts.title,
        checksum: posts.checksum,
      })
      .from(posts)
      .where(eq(posts.projectId, projectId))
      .all();
    const existingMedia = await db
      .select({
        id: media.id,
        originalName: media.originalName,
        checksum: media.checksum,
      })
      .from(media)
      .where(eq(media.projectId, projectId))
      .all();
    const existingTags = await db
      .select({
        name: tags.name,
      })
      .from(tags)
      .where(eq(tags.projectId, projectId))
      .all();

    // Lookup tables. When several rows share a key, the last row wins.
    const slugToPost = new Map<string, typeof existingPosts[0]>();
    const checksumToPost = new Map<string, typeof existingPosts[0]>();
    for (const row of existingPosts) {
      slugToPost.set(row.slug, row);
      if (row.checksum) {
        checksumToPost.set(row.checksum, row);
      }
    }

    const nameToMedia = new Map<string, typeof existingMedia[0]>();
    const checksumToMedia = new Map<string, typeof existingMedia[0]>();
    for (const row of existingMedia) {
      // File names are compared case-insensitively.
      nameToMedia.set(row.originalName.toLowerCase(), row);
      if (row.checksum) {
        checksumToMedia.set(row.checksum, row);
      }
    }

    const existingTagNames = new Set(existingTags.map(t => t.name.toLowerCase()));
    const isKnownTagName = (name: string): boolean => existingTagNames.has(name.toLowerCase());

    const analyzedPosts = this.analyzePostItems(wxrData.posts, slugToPost, checksumToPost);
    const analyzedPages = this.analyzePostItems(wxrData.pages, slugToPost, checksumToPost);
    const analyzedMedia = await this.analyzeMediaItems(wxrData.media, nameToMedia, checksumToMedia, uploadsFolder);

    // WXR categories are checked against the project's tag names as well.
    const analyzedCategories: AnalyzedCategory[] = wxrData.categories.map(cat => ({
      name: cat.name,
      slug: cat.slug,
      existsInProject: isKnownTagName(cat.name),
    }));
    const analyzedTags: AnalyzedTag[] = wxrData.tags.map(tag => ({
      name: tag.name,
      slug: tag.slug,
      existsInProject: isKnownTagName(tag.name),
    }));

    return {
      sourceFile,
      site: wxrData.site,
      analyzedAt: new Date(),
      posts: this.summarizePostAnalysis(analyzedPosts),
      pages: this.summarizePostAnalysis(analyzedPages),
      media: this.summarizeMediaAnalysis(analyzedMedia),
      categories: analyzedCategories,
      tags: analyzedTags,
    };
  }

  /**
   * Classifies posts/pages. A slug match takes precedence over a checksum match:
   * same slug + same checksum is 'update', same slug + different checksum is
   * 'conflict', checksum-only match is 'content-duplicate', otherwise 'new'.
   */
  private analyzePostItems(
    wxrPosts: WxrPost[],
    slugToPost: Map<string, { id: string; slug: string; title: string; checksum: string | null }>,
    checksumToPost: Map<string, { id: string; slug: string; title: string; checksum: string | null }>,
  ): AnalyzedPost[] {
    return wxrPosts.map((wxrPost): AnalyzedPost => {
      const markdown = this.convertToMarkdown(wxrPost.content);
      const contentHash = this.calculateChecksum(markdown);
      const markdownPreview = markdown.substring(0, 200);

      const slugMatch = slugToPost.get(wxrPost.slug);
      const match = slugMatch ?? checksumToPost.get(contentHash);

      let status: PostAnalysisStatus;
      if (slugMatch) {
        status = slugMatch.checksum === contentHash ? 'update' : 'conflict';
      } else if (match) {
        status = 'content-duplicate';
      } else {
        status = 'new';
      }

      const existingPost: AnalyzedPost['existingPost'] = match
        ? { id: match.id, title: match.title, slug: match.slug, checksum: match.checksum }
        : undefined;

      return { wxrPost, status, contentHash, markdownPreview, existingPost };
    });
  }

  /**
   * Classifies media attachments. Items whose file cannot be read from the
   * uploads folder (or when no folder is given) are 'missing'; otherwise the
   * same precedence as for posts applies, with the lower-cased file name in
   * place of the slug.
   */
  private async analyzeMediaItems(
    wxrMediaItems: WxrMedia[],
    nameToMedia: Map<string, { id: string; originalName: string; checksum: string | null }>,
    checksumToMedia: Map<string, { id: string; originalName: string; checksum: string | null }>,
    uploadsFolder?: string,
  ): Promise<AnalyzedMedia[]> {
    const results: AnalyzedMedia[] = [];
    const uploadsRoot = uploadsFolder ? path.resolve(uploadsFolder) : undefined;

    for (const wxrMedia of wxrMediaItems) {
      const fileHash = uploadsRoot
        ? await this.hashUploadedFile(uploadsRoot, wxrMedia.relativePath)
        : null;

      if (fileHash === null) {
        results.push({ wxrMedia, status: 'missing', fileHash: null });
        continue;
      }

      const nameMatch = nameToMedia.get(wxrMedia.filename.toLowerCase());
      const match = nameMatch ?? checksumToMedia.get(fileHash);

      let status: MediaAnalysisStatus;
      if (nameMatch) {
        status = nameMatch.checksum === fileHash ? 'update' : 'conflict';
      } else if (match) {
        status = 'content-duplicate';
      } else {
        status = 'new';
      }

      const existingMedia: AnalyzedMedia['existingMedia'] = match
        ? { id: match.id, originalName: match.originalName, checksum: match.checksum }
        : undefined;

      results.push({ wxrMedia, status, fileHash, existingMedia });
    }

    return results;
  }

  /**
   * Reads a media file below `uploadsRoot` and returns its checksum, or null
   * when the file cannot be read.
   *
   * `relativePath` originates from the (untrusted) export file, so a path that
   * resolves outside `uploadsRoot` (e.g. via `../`) is treated as not found
   * instead of being read.
   */
  private async hashUploadedFile(uploadsRoot: string, relativePath: string): Promise<string | null> {
    const candidate = path.join(uploadsRoot, relativePath);
    const offset = path.relative(uploadsRoot, candidate);
    const escapesRoot =
      offset === '..' || offset.startsWith(`..${path.sep}`) || path.isAbsolute(offset);
    if (offset === '' || escapesRoot) {
      return null;
    }

    try {
      // Hash the raw bytes. Converting to a 'binary' string first would make the
      // hash re-encode every byte >= 0x80 as UTF-8 and yield a digest that is not
      // the MD5 of the file.
      // NOTE(review): this assumes stored media checksums are MD5 over raw file
      // bytes - confirm against the code that writes media.checksum.
      return this.calculateChecksum(await fs.readFile(candidate));
    } catch {
      // Best effort: any read failure is reported as a missing file.
      return null;
    }
  }

  /** Counts how many items carry each status value. */
  private countStatuses<S extends string>(items: ReadonlyArray<{ status: S }>): Map<S, number> {
    const tally = new Map<S, number>();
    for (const { status } of items) {
      tally.set(status, (tally.get(status) ?? 0) + 1);
    }
    return tally;
  }

  /** Builds the per-status summary for posts or pages. */
  private summarizePostAnalysis(items: AnalyzedPost[]): ImportAnalysisReport['posts'] {
    const tally = this.countStatuses(items);
    return {
      total: items.length,
      new: tally.get('new') ?? 0,
      updates: tally.get('update') ?? 0,
      conflicts: tally.get('conflict') ?? 0,
      contentDuplicates: tally.get('content-duplicate') ?? 0,
      items,
    };
  }

  /** Builds the per-status summary for media, including the 'missing' count. */
  private summarizeMediaAnalysis(items: AnalyzedMedia[]): ImportAnalysisReport['media'] {
    const tally = this.countStatuses(items);
    return {
      total: items.length,
      new: tally.get('new') ?? 0,
      updates: tally.get('update') ?? 0,
      conflicts: tally.get('conflict') ?? 0,
      contentDuplicates: tally.get('content-duplicate') ?? 0,
      missing: tally.get('missing') ?? 0,
      items,
    };
  }

  /** Converts HTML to Markdown; empty or whitespace-only input yields ''. */
  private convertToMarkdown(html: string): string {
    if (!html || !html.trim()) return '';
    return this.turndown.turndown(html);
  }

  /**
   * MD5 hex digest of `content`. Strings are hashed as UTF-8 text, buffers
   * byte-for-byte. MD5 serves only as a change-detection fingerprint here.
   */
  private calculateChecksum(content: string | Buffer): string {
    return crypto.createHash('md5').update(content).digest('hex');
  }
}

View File

@@ -0,0 +1,307 @@
import { DOMParser } from '@xmldom/xmldom';
import * as fs from 'fs/promises';
/** Site-level metadata read from the direct children of the WXR `<channel>` element. */
export interface WxrSiteInfo {
title: string;
link: string;
description: string;
language: string;
}
/** A post, page or other non-attachment item from a WXR export. */
export interface WxrPost {
/** Value of `wp:post_id`; 0 when the element is absent or empty. */
wpId: number;
title: string;
/** Value of `wp:post_name`. */
slug: string;
/** Text of `content:encoded`. */
content: string;
/** Text of `excerpt:encoded`. */
excerpt: string;
/** Parsed `pubDate`; null when the element is missing or not a valid date. */
pubDate: Date | null;
/** Value of `dc:creator`. */
creator: string;
/** Value of `wp:status`. */
status: string;
/** Value of `wp:post_type`. */
postType: string;
/** Texts of the item's `<category domain="category">` elements. */
categories: string[];
/** Texts of the item's `<category domain="post_tag">` elements. */
tags: string[];
}
/** An attachment item (`wp:post_type` of 'attachment') from a WXR export. */
export interface WxrMedia {
/** Value of `wp:post_id`; 0 when the element is absent or empty. */
wpId: number;
title: string;
/** Value of `wp:attachment_url`. */
url: string;
/** Last path segment of `url`. */
filename: string;
/** Part of `url` after 'wp-content/uploads/'; falls back to the file name when that marker is absent. */
relativePath: string;
/** Parsed `pubDate`; null when the element is missing or not a valid date. */
pubDate: Date | null;
/** Value of `wp:post_parent`; 0 when the element is absent or empty. */
parentId: number;
/** MIME type inferred from the file extension; 'application/octet-stream' for unknown extensions. */
mimeType: string;
/** Text of `content:encoded`. */
description: string;
}
/** A channel-level `wp:category` declaration. */
export interface WxrCategory {
/** Value of `wp:cat_name`. */
name: string;
/** Value of `wp:category_nicename`. */
slug: string;
/** Value of `wp:category_parent`; '' when absent. */
parent: string;
}
/** A channel-level `wp:tag` declaration. */
export interface WxrTag {
/** Value of `wp:tag_name`. */
name: string;
/** Value of `wp:tag_slug`. */
slug: string;
}
/** Everything extracted from one WXR export file. */
export interface WxrData {
site: WxrSiteInfo;
/** Items whose post type is neither 'attachment' nor 'page' (includes custom post types). */
posts: WxrPost[];
/** Items whose post type is 'page'. */
pages: WxrPost[];
/** Items whose post type is 'attachment'. */
media: WxrMedia[];
categories: WxrCategory[];
tags: WxrTag[];
}
// XML namespace URIs used to look up namespaced WXR elements
// (wp:*, content:encoded, excerpt:encoded, dc:creator).
// Only the export/1.2 WordPress namespaces are listed here.
const NS = {
wp: 'http://wordpress.org/export/1.2/',
content: 'http://purl.org/rss/1.0/modules/content/',
excerpt: 'http://wordpress.org/export/1.2/excerpt/',
dc: 'http://purl.org/dc/elements/1.1/',
};
// Common MIME types keyed by lower-case file extension (without the dot).
// Extensions not listed here are reported as 'application/octet-stream' by inferMimeType.
const EXT_TO_MIME: Record<string, string> = {
jpg: 'image/jpeg',
jpeg: 'image/jpeg',
png: 'image/png',
gif: 'image/gif',
webp: 'image/webp',
svg: 'image/svg+xml',
bmp: 'image/bmp',
ico: 'image/x-icon',
mp4: 'video/mp4',
webm: 'video/webm',
mp3: 'audio/mpeg',
wav: 'audio/wav',
ogg: 'audio/ogg',
pdf: 'application/pdf',
doc: 'application/msword',
docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
zip: 'application/zip',
};
/**
 * Parses WordPress eXtended RSS (WXR) export files into plain data objects:
 * site info, channel-level categories and tags, and the items split into
 * posts, pages and media attachments.
 */
export class WxrParser {
  /**
   * Reads `filePath` as UTF-8 text and parses it.
   * Rejects when the file cannot be read or contains no `<channel>` element.
   */
  async parseFile(filePath: string): Promise<WxrData> {
    const xml = await fs.readFile(filePath, 'utf-8');
    return this.parseXml(xml);
  }

  /**
   * Parses WXR markup.
   *
   * @throws Error when the document has no `<channel>` element.
   */
  parseXml(xmlContent: string): WxrData {
    const doc = new DOMParser().parseFromString(xmlContent, 'text/xml');
    const channel = doc.getElementsByTagName('channel')[0];
    if (!channel) {
      throw new Error('Invalid WXR file: no <channel> element found');
    }

    const site = this.parseSiteInfo(channel);
    const categories = this.parseChannelCategories(channel);
    const tags = this.parseChannelTags(channel);

    const posts: WxrPost[] = [];
    const pages: WxrPost[] = [];
    const media: WxrMedia[] = [];

    const items = channel.getElementsByTagName('item');
    for (let i = 0; i < items.length; i++) {
      const item = items[i];
      switch (this.getElementText(item, 'post_type', NS.wp)) {
        case 'attachment':
          media.push(this.parseMediaItem(item));
          break;
        case 'page':
          pages.push(this.parsePostItem(item));
          break;
        default:
          // 'post' and any other custom post types
          posts.push(this.parsePostItem(item));
      }
    }

    return { site, posts, pages, media, categories, tags };
  }

  /** Reads title, link, description and language from the channel's direct children. */
  private parseSiteInfo(channel: Element): WxrSiteInfo {
    return {
      title: this.getDirectChildText(channel, 'title'),
      link: this.getDirectChildText(channel, 'link'),
      description: this.getDirectChildText(channel, 'description'),
      language: this.getDirectChildText(channel, 'language'),
    };
  }

  /** Collects the `wp:category` declarations that are direct children of the channel. */
  private parseChannelCategories(channel: Element): WxrCategory[] {
    const found: WxrCategory[] = [];
    const elements = channel.getElementsByTagNameNS(NS.wp, 'category');
    for (let i = 0; i < elements.length; i++) {
      const el = elements[i];
      // Skip anything nested deeper than the channel itself.
      if (el.parentNode !== channel) continue;
      found.push({
        name: this.getElementText(el, 'cat_name', NS.wp),
        slug: this.getElementText(el, 'category_nicename', NS.wp),
        parent: this.getElementText(el, 'category_parent', NS.wp),
      });
    }
    return found;
  }

  /** Collects the `wp:tag` declarations that are direct children of the channel. */
  private parseChannelTags(channel: Element): WxrTag[] {
    const found: WxrTag[] = [];
    const elements = channel.getElementsByTagNameNS(NS.wp, 'tag');
    for (let i = 0; i < elements.length; i++) {
      const el = elements[i];
      if (el.parentNode !== channel) continue;
      found.push({
        name: this.getElementText(el, 'tag_name', NS.wp),
        slug: this.getElementText(el, 'tag_slug', NS.wp),
      });
    }
    return found;
  }

  /** Converts a non-attachment `<item>` into a WxrPost. */
  private parsePostItem(item: Element): WxrPost {
    const categories: string[] = [];
    const tags: string[] = [];

    // Item-level <category> elements (no namespace) carry both categories and
    // tags; the `domain` attribute tells them apart.
    const catElements = item.getElementsByTagName('category');
    for (let i = 0; i < catElements.length; i++) {
      const el = catElements[i];
      if (el.parentNode !== item) continue;
      const text = this.getTextContent(el);
      if (!text) continue;
      const domain = el.getAttribute('domain');
      if (domain === 'category') {
        categories.push(text);
      } else if (domain === 'post_tag') {
        tags.push(text);
      }
    }

    return {
      wpId: this.parseWpInt(this.getElementText(item, 'post_id', NS.wp)),
      title: this.getDirectChildText(item, 'title'),
      slug: this.getElementText(item, 'post_name', NS.wp),
      content: this.getElementText(item, 'encoded', NS.content),
      excerpt: this.getElementText(item, 'encoded', NS.excerpt),
      pubDate: this.parsePubDate(item),
      creator: this.getElementText(item, 'creator', NS.dc),
      status: this.getElementText(item, 'status', NS.wp),
      postType: this.getElementText(item, 'post_type', NS.wp),
      categories,
      tags,
    };
  }

  /** Converts an attachment `<item>` into a WxrMedia. */
  private parseMediaItem(item: Element): WxrMedia {
    const url = this.getElementText(item, 'attachment_url', NS.wp);
    const filename = this.extractFilename(url);

    return {
      wpId: this.parseWpInt(this.getElementText(item, 'post_id', NS.wp)),
      title: this.getDirectChildText(item, 'title'),
      url,
      filename,
      relativePath: this.extractRelativePath(url),
      pubDate: this.parsePubDate(item),
      parentId: this.parseWpInt(this.getElementText(item, 'post_parent', NS.wp)),
      mimeType: this.inferMimeType(filename),
      description: this.getElementText(item, 'encoded', NS.content),
    };
  }

  /** Parses the item's `<pubDate>`; null when it is missing or not a valid date. */
  private parsePubDate(item: Element): Date | null {
    const raw = this.getDirectChildText(item, 'pubDate');
    if (!raw) return null;
    const parsed = new Date(raw);
    return Number.isNaN(parsed.getTime()) ? null : parsed;
  }

  /** Parses a base-10 id; empty or non-numeric text yields 0 instead of NaN. */
  private parseWpInt(raw: string): number {
    const value = parseInt(raw || '0', 10);
    return Number.isNaN(value) ? 0 : value;
  }

  /**
   * Path portion of `url` without query string or fragment. For absolute URLs
   * this is the (percent-encoded) pathname; strings that are not valid
   * absolute URLs are used as-is up to the first `?` or `#`.
   */
  private urlPathname(url: string): string {
    try {
      return new URL(url).pathname;
    } catch {
      return url.split(/[?#]/, 1)[0];
    }
  }

  /**
   * Percent-decodes one path segment so it matches the name on disk.
   * The segment is returned unchanged when it is malformed or when decoding
   * would introduce a path separator (which would alter the path structure).
   */
  private decodePathSegment(segment: string): string {
    try {
      const decoded = decodeURIComponent(segment);
      return /[\\/]/.test(decoded) ? segment : decoded;
    } catch {
      return segment;
    }
  }

  /** Decoded last path segment of `url`; '' for an empty URL or one ending in '/'. */
  private extractFilename(url: string): string {
    if (!url) return '';
    const lastSegment = this.urlPathname(url).split('/').pop() || '';
    return this.decodePathSegment(lastSegment);
  }

  /**
   * Decoded path below 'wp-content/uploads/', without query string or
   * fragment. Falls back to the file name when the marker is not in the path.
   */
  private extractRelativePath(url: string): string {
    if (!url) return '';
    const marker = 'wp-content/uploads/';
    const pathname = this.urlPathname(url);
    const idx = pathname.indexOf(marker);
    if (idx === -1) {
      // Fallback: return filename only
      return this.extractFilename(url);
    }
    return pathname
      .substring(idx + marker.length)
      .split('/')
      .map(segment => this.decodePathSegment(segment))
      .join('/');
  }

  /** Looks up the MIME type for the file's extension (case-insensitive). */
  private inferMimeType(filename: string): string {
    const ext = filename.split('.').pop()?.toLowerCase() || '';
    return EXT_TO_MIME[ext] || 'application/octet-stream';
  }

  /**
   * Text of the first descendant with the given namespace and local name that
   * is a direct child or a grandchild of `parent`; '' when there is none.
   */
  private getElementText(parent: Element, localName: string, nsUri: string): string {
    const elements = parent.getElementsByTagNameNS(nsUri, localName);
    for (let i = 0; i < elements.length; i++) {
      const el = elements[i];
      const holder = el.parentNode;
      if (holder === parent || holder?.parentNode === parent) {
        return this.getTextContent(el);
      }
    }
    return '';
  }

  /** Text of the first direct child element whose local name is `tagName`; '' when absent. */
  private getDirectChildText(parent: Element, tagName: string): string {
    const children = parent.childNodes;
    for (let i = 0; i < children.length; i++) {
      const child = children[i];
      // nodeType 1 = element node
      if (child.nodeType === 1 && (child as Element).localName === tagName) {
        return this.getTextContent(child as Element);
      }
    }
    return '';
  }

  /** Concatenates the element's own text and CDATA children (nested elements are ignored). */
  private getTextContent(el: Element): string {
    let text = '';
    const children = el.childNodes;
    for (let i = 0; i < children.length; i++) {
      const child = children[i];
      // nodeType 3 = text node, 4 = CDATA section
      if (child.nodeType === 3 || child.nodeType === 4) {
        text += child.nodeValue || '';
      }
    }
    return text;
  }
}

View File

@@ -50,5 +50,22 @@ export {
type SendMessageResult,
type ModelInfo,
} from './OpenCodeManager';
export {
WxrParser,
type WxrData,
type WxrPost,
type WxrMedia,
type WxrSiteInfo,
type WxrCategory,
type WxrTag,
} from './WxrParser';
export {
ImportAnalysisEngine,
type ImportAnalysisReport,
type AnalyzedPost,
type AnalyzedMedia,
type AnalyzedCategory,
type AnalyzedTag,
type PostAnalysisStatus,
type MediaAnalysisStatus,
} from './ImportAnalysisEngine';

View File

@@ -745,6 +745,68 @@ export function registerIpcHandlers(): void {
return engine.rebuildFromSidecars();
});
// ============ Import Analysis Handlers ============
// Lets the user pick a WXR export through a native open dialog, parses it and
// returns the analysis report. Resolves to null when the dialog is dismissed.
safeHandle('import:selectAndAnalyze', async (_, uploadsFolder?: string) => {
  const selection = await dialog.showOpenDialog({
    title: 'Select WordPress Export File (WXR)',
    filters: [
      { name: 'WordPress Export', extensions: ['xml'] },
      { name: 'All Files', extensions: ['*'] },
    ],
    properties: ['openFile'],
  });
  const nothingChosen = selection.canceled || selection.filePaths.length === 0;
  if (nothingChosen) {
    return null;
  }
  const [wxrPath] = selection.filePaths;

  // Engine modules are loaded on demand, only when an import is actually requested.
  const { WxrParser } = await import('../engine/WxrParser');
  const { ImportAnalysisEngine } = await import('../engine/ImportAnalysisEngine');

  const wxrData = await new WxrParser().parseFile(wxrPath);

  const analyzer = new ImportAnalysisEngine();
  const currentProject = await getProjectEngine().getActiveProject();
  if (currentProject) {
    // Without an active project the analyzer keeps its default project context.
    analyzer.setProjectContext(currentProject.id);
  }

  return analyzer.analyzeWxr(wxrData, wxrPath, uploadsFolder || undefined);
});
// Parses the WXR file at the given path and returns the analysis report for
// the active project (no file dialog involved).
safeHandle('import:analyzeFile', async (_, filePath: string, uploadsFolder?: string) => {
  // Engine modules are loaded on demand, only when an import is actually requested.
  const { WxrParser } = await import('../engine/WxrParser');
  const { ImportAnalysisEngine } = await import('../engine/ImportAnalysisEngine');

  const wxrData = await new WxrParser().parseFile(filePath);

  const analyzer = new ImportAnalysisEngine();
  const currentProject = await getProjectEngine().getActiveProject();
  if (currentProject) {
    // Without an active project the analyzer keeps its default project context.
    analyzer.setProjectContext(currentProject.id);
  }

  return analyzer.analyzeWxr(wxrData, filePath, uploadsFolder || undefined);
});
// Asks the user for the WordPress uploads directory.
// Resolves to the chosen path, or null when the dialog is dismissed.
safeHandle('import:selectUploadsFolder', async () => {
  const selection = await dialog.showOpenDialog({
    title: 'Select WordPress Uploads Folder',
    properties: ['openDirectory'],
  });
  const nothingChosen = selection.canceled || selection.filePaths.length === 0;
  return nothingChosen ? null : selection.filePaths[0];
});
// ============ Event Forwarding ============
// Forward engine events to renderer

View File

@@ -150,6 +150,13 @@ contextBridge.exposeInMainWorld('electronAPI', {
syncFromPosts: () => ipcRenderer.invoke('tags:syncFromPosts'),
},
// Import Analysis: thin wrappers that forward to the 'import:*' IPC channels
import: {
// Invokes 'import:selectAndAnalyze', passing the optional uploads folder along
selectAndAnalyze: (uploadsFolder?: string) => ipcRenderer.invoke('import:selectAndAnalyze', uploadsFolder),
// Invokes 'import:analyzeFile' for an already known WXR file path
analyzeFile: (filePath: string, uploadsFolder?: string) => ipcRenderer.invoke('import:analyzeFile', filePath, uploadsFolder),
// Invokes 'import:selectUploadsFolder'
selectUploadsFolder: () => ipcRenderer.invoke('import:selectUploadsFolder'),
},
// AI Chat (OpenCode Zen API integration)
chat: {
// API Key Management
@@ -312,6 +319,11 @@ export interface ElectronAPI {
getPostsWithTag: (tagId: string) => Promise<unknown[]>;
syncFromPosts: () => Promise<void>;
};
/** WordPress (WXR) import analysis, backed by the 'import:*' IPC channels. */
import: {
/** Result of the 'import:selectAndAnalyze' channel; typed as unknown here, so callers must narrow it. */
selectAndAnalyze: (uploadsFolder?: string) => Promise<unknown>;
/** Result of the 'import:analyzeFile' channel for the given file; typed as unknown here. */
analyzeFile: (filePath: string, uploadsFolder?: string) => Promise<unknown>;
/** Result of the 'import:selectUploadsFolder' channel: a folder path, or null. */
selectUploadsFolder: () => Promise<string | null>;
};
chat: {
// API Key Management
checkReady: () => Promise<{ ready: boolean; error?: string; backend?: string }>;