feat: importer starting point

2026-02-13 13:07:44 +01:00
parent deb0f3ae3b
commit d88fb1d9fa
19 changed files with 2666 additions and 10 deletions
--- a/src/main/engine/WxrParser.ts
+++ b/src/main/engine/WxrParser.ts
@@ -0,0 +1,307 @@
+import { DOMParser } from '@xmldom/xmldom';
+import * as fs from 'fs/promises';
+
+export interface WxrSiteInfo { // Site-level metadata taken from the direct children of <channel>.
+  title: string; // text of <title>; '' when the element is absent
+  link: string; // text of <link>; '' when the element is absent
+  description: string; // text of <description>; '' when the element is absent
+  language: string; // text of <language>; '' when the element is absent
+}
+
+export interface WxrPost { // One <item> that is not an attachment (a post, a page, or any other post type).
+  wpId: number; // integer parsed from wp:post_id; 0 when the element is missing or empty
+  title: string; // text of the item's <title>
+  slug: string; // text of wp:post_name
+  content: string; // text of content:encoded
+  excerpt: string; // text of excerpt:encoded
+  pubDate: Date | null; // parsed from the item's <pubDate>; null when missing or not a valid date
+  creator: string; // text of dc:creator
+  status: string; // text of wp:status
+  postType: string; // text of wp:post_type
+  categories: string[]; // text of each direct <category domain="category"> child of the item
+  tags: string[]; // text of each direct <category domain="post_tag"> child of the item
+}
+
+export interface WxrMedia { // One <item> whose wp:post_type is 'attachment'.
+  wpId: number; // integer parsed from wp:post_id; 0 when the element is missing or empty
+  title: string; // text of the item's <title>
+  url: string; // text of wp:attachment_url
+  filename: string; // last path segment of url; '' when url is empty
+  relativePath: string; // part of url after 'wp-content/uploads/'; falls back to filename when that marker is absent
+  pubDate: Date | null; // parsed from the item's <pubDate>; null when missing or not a valid date
+  parentId: number; // integer parsed from wp:post_parent; 0 when the element is missing or empty
+  mimeType: string; // inferred from the filename extension; 'application/octet-stream' when not recognised
+  description: string; // text of content:encoded
+}
+
+export interface WxrCategory { // A wp:category definition that is a direct child of <channel>.
+  name: string; // text of wp:cat_name
+  slug: string; // text of wp:category_nicename
+  parent: string; // text of wp:category_parent, stored as-is (presumably the parent's slug, '' for top level - TODO confirm against a real export)
+}
+
+export interface WxrTag { // A wp:tag definition that is a direct child of <channel>.
+  name: string; // text of wp:tag_name
+  slug: string; // text of wp:tag_slug
+}
+
+export interface WxrData { // Everything WxrParser extracts from one WXR document.
+  site: WxrSiteInfo; // channel-level site metadata
+  posts: WxrPost[]; // items whose wp:post_type is neither 'attachment' nor 'page' (includes custom post types)
+  pages: WxrPost[]; // items whose wp:post_type is 'page'
+  media: WxrMedia[]; // items whose wp:post_type is 'attachment'
+  categories: WxrCategory[]; // channel-level category definitions
+  tags: WxrTag[]; // channel-level tag definitions
+}
+
+// XML namespace URIs used to look up prefixed elements in a WXR document (the WordPress ones are pinned to export format 1.2)
+const NS = {
+  wp: 'http://wordpress.org/export/1.2/', // wp:* elements such as wp:post_id, wp:post_type, wp:status
+  content: 'http://purl.org/rss/1.0/modules/content/', // RSS content module: content:encoded
+  excerpt: 'http://wordpress.org/export/1.2/excerpt/', // excerpt:encoded
+  dc: 'http://purl.org/dc/elements/1.1/', // Dublin Core: dc:creator
+};
+
+// Lookup from lowercase file extension (without the dot) to MIME type; WxrParser falls back to 'application/octet-stream' for anything not listed
+const EXT_TO_MIME: Record<string, string> = {
+  jpg: 'image/jpeg',
+  jpeg: 'image/jpeg',
+  png: 'image/png',
+  gif: 'image/gif',
+  webp: 'image/webp',
+  svg: 'image/svg+xml',
+  bmp: 'image/bmp',
+  ico: 'image/x-icon',
+  mp4: 'video/mp4',
+  webm: 'video/webm',
+  mp3: 'audio/mpeg',
+  wav: 'audio/wav',
+  ogg: 'audio/ogg',
+  pdf: 'application/pdf',
+  doc: 'application/msword',
+  docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  zip: 'application/zip',
+};
+
+/**
+ * Parser for WordPress WXR (eXtended RSS) export documents.
+ *
+ * The class holds no state: every call to `parseXml` builds a fresh `WxrData`
+ * object from the XML text it is given.
+ */
+export class WxrParser {
+
+  /**
+   * Reads a WXR file from disk as UTF-8 and parses it.
+   *
+   * @param filePath - Path of the export file to read.
+   * @returns The parsed export data.
+   * @throws Rejects with the `fs.readFile` error when the file cannot be read,
+   *   or with the error thrown by `parseXml` when the document has no channel.
+   */
+  async parseFile(filePath: string): Promise<WxrData> {
+    const content = await fs.readFile(filePath, 'utf-8');
+    return this.parseXml(content);
+  }
+
+  /**
+   * Parses WXR XML text into site info, posts, pages, media, categories and tags.
+   *
+   * Items are routed by their `wp:post_type`: `attachment` becomes media,
+   * `page` becomes a page, and everything else (including custom post types)
+   * is collected as a post.
+   *
+   * @param xmlContent - The complete XML document as a string.
+   * @returns The parsed export data.
+   * @throws Error when the document contains no `<channel>` element.
+   */
+  parseXml(xmlContent: string): WxrData {
+    // readFile(..., 'utf-8') keeps a leading byte-order mark; it is not part of the XML.
+    const xml = xmlContent.charCodeAt(0) === 0xfeff ? xmlContent.slice(1) : xmlContent;
+    const doc = new DOMParser().parseFromString(xml, 'text/xml');
+    const channel = doc.getElementsByTagName('channel')[0];
+
+    if (!channel) {
+      throw new Error('Invalid WXR file: no <channel> element found');
+    }
+
+    const site = this.parseSiteInfo(channel);
+    const categories = this.parseChannelCategories(channel);
+    const tags = this.parseChannelTags(channel);
+
+    const posts: WxrPost[] = [];
+    const pages: WxrPost[] = [];
+    const media: WxrMedia[] = [];
+
+    const items = channel.getElementsByTagName('item');
+    for (let i = 0; i < items.length; i++) {
+      const item = items[i];
+      // Trimmed so that whitespace around the value (e.g. a re-indented export)
+      // cannot defeat the dispatch below.
+      const postType = this.getElementText(item, 'post_type', NS.wp).trim();
+
+      if (postType === 'attachment') {
+        media.push(this.parseMediaItem(item));
+      } else if (postType === 'page') {
+        pages.push(this.parsePostItem(item));
+      } else {
+        // 'post' and any other custom post types
+        posts.push(this.parsePostItem(item));
+      }
+    }
+
+    return { site, posts, pages, media, categories, tags };
+  }
+
+  /** Reads title, link, description and language from the direct children of the channel. */
+  private parseSiteInfo(channel: Element): WxrSiteInfo {
+    return {
+      title: this.getDirectChildText(channel, 'title'),
+      link: this.getDirectChildText(channel, 'link'),
+      description: this.getDirectChildText(channel, 'description'),
+      language: this.getDirectChildText(channel, 'language'),
+    };
+  }
+
+  /** Collects the `wp:category` definitions that are direct children of the channel. */
+  private parseChannelCategories(channel: Element): WxrCategory[] {
+    const categories: WxrCategory[] = [];
+    const elements = channel.getElementsByTagNameNS(NS.wp, 'category');
+
+    for (let i = 0; i < elements.length; i++) {
+      const el = elements[i];
+      // Only process direct children of channel (not item-level category elements)
+      if (el.parentNode !== channel) continue;
+
+      categories.push({
+        name: this.getElementText(el, 'cat_name', NS.wp),
+        slug: this.getElementText(el, 'category_nicename', NS.wp),
+        parent: this.getElementText(el, 'category_parent', NS.wp),
+      });
+    }
+
+    return categories;
+  }
+
+  /** Collects the `wp:tag` definitions that are direct children of the channel. */
+  private parseChannelTags(channel: Element): WxrTag[] {
+    const tags: WxrTag[] = [];
+    const elements = channel.getElementsByTagNameNS(NS.wp, 'tag');
+
+    for (let i = 0; i < elements.length; i++) {
+      const el = elements[i];
+      if (el.parentNode !== channel) continue;
+
+      tags.push({
+        name: this.getElementText(el, 'tag_name', NS.wp),
+        slug: this.getElementText(el, 'tag_slug', NS.wp),
+      });
+    }
+
+    return tags;
+  }
+
+  /**
+   * Converts one non-attachment `<item>` into a `WxrPost`.
+   *
+   * Category and tag names come from the item's direct, un-prefixed `<category>`
+   * children, split by their `domain` attribute.
+   */
+  private parsePostItem(item: Element): WxrPost {
+    const categories: string[] = [];
+    const tags: string[] = [];
+
+    // Item-level <category> elements (no namespace)
+    const catElements = item.getElementsByTagName('category');
+    for (let i = 0; i < catElements.length; i++) {
+      const el = catElements[i];
+      // Only direct children of item
+      if (el.parentNode !== item) continue;
+      const domain = el.getAttribute('domain');
+      const text = this.getTextContent(el);
+      if (domain === 'category' && text) {
+        categories.push(text);
+      } else if (domain === 'post_tag' && text) {
+        tags.push(text);
+      }
+    }
+
+    return {
+      wpId: this.parseIntegerText(this.getElementText(item, 'post_id', NS.wp)),
+      title: this.getDirectChildText(item, 'title'),
+      slug: this.getElementText(item, 'post_name', NS.wp).trim(),
+      content: this.getElementText(item, 'encoded', NS.content),
+      excerpt: this.getElementText(item, 'encoded', NS.excerpt),
+      pubDate: this.parseDate(this.getDirectChildText(item, 'pubDate')),
+      creator: this.getElementText(item, 'creator', NS.dc),
+      status: this.getElementText(item, 'status', NS.wp).trim(),
+      postType: this.getElementText(item, 'post_type', NS.wp).trim(),
+      categories,
+      tags,
+    };
+  }
+
+  /** Converts one attachment `<item>` into a `WxrMedia`, deriving file details from its URL. */
+  private parseMediaItem(item: Element): WxrMedia {
+    const url = this.getElementText(item, 'attachment_url', NS.wp).trim();
+    const filename = this.extractFilename(url);
+    const relativePath = this.extractRelativePath(url);
+
+    return {
+      wpId: this.parseIntegerText(this.getElementText(item, 'post_id', NS.wp)),
+      title: this.getDirectChildText(item, 'title'),
+      url,
+      filename,
+      relativePath,
+      pubDate: this.parseDate(this.getDirectChildText(item, 'pubDate')),
+      parentId: this.parseIntegerText(this.getElementText(item, 'post_parent', NS.wp)),
+      mimeType: this.inferMimeType(filename),
+      description: this.getElementText(item, 'encoded', NS.content),
+    };
+  }
+
+  /** Parses a base-10 integer; returns 0 for empty or non-numeric text instead of NaN. */
+  private parseIntegerText(value: string): number {
+    const parsed = parseInt(value, 10);
+    return Number.isNaN(parsed) ? 0 : parsed;
+  }
+
+  /** Parses a date string; returns null when the text is empty or not a valid date. */
+  private parseDate(value: string): Date | null {
+    const text = value.trim();
+    if (!text) return null;
+    const parsed = new Date(text);
+    return Number.isNaN(parsed.getTime()) ? null : parsed;
+  }
+
+  /**
+   * Returns the last path segment of a URL, percent-decoded.
+   *
+   * Falls back to plain string splitting when the value is not an absolute URL.
+   * Returns '' for an empty URL or a URL that ends in '/'.
+   */
+  private extractFilename(url: string): string {
+    if (!url) return '';
+    let pathname: string;
+    try {
+      pathname = new URL(url).pathname;
+    } catch {
+      // Not an absolute URL: remove the query string / fragment by hand.
+      pathname = this.stripQueryAndFragment(url);
+    }
+    // URL.pathname percent-encodes non-ASCII characters; decode so the name
+    // agrees with the one produced by extractRelativePath.
+    return this.decodePathSegment(pathname.split('/').pop() || '');
+  }
+
+  /**
+   * Returns the part of the URL path after 'wp-content/uploads/', percent-decoded
+   * and without query string or fragment. Falls back to the bare filename when
+   * the marker is not present in the path.
+   */
+  private extractRelativePath(url: string): string {
+    if (!url) return '';
+    // Extract path after wp-content/uploads/
+    const marker = 'wp-content/uploads/';
+    const path = this.stripQueryAndFragment(url);
+    const idx = path.indexOf(marker);
+    if (idx !== -1) {
+      return path
+        .substring(idx + marker.length)
+        .split('/')
+        .map((segment) => this.decodePathSegment(segment))
+        .join('/');
+    }
+    // Fallback: return filename only
+    return this.extractFilename(url);
+  }
+
+  /** Cuts a URL-like string at the first '?' or '#'. */
+  private stripQueryAndFragment(url: string): string {
+    const cut = url.search(/[?#]/);
+    return cut === -1 ? url : url.substring(0, cut);
+  }
+
+  /**
+   * Percent-decodes a single path segment.
+   *
+   * The raw segment is kept when decoding fails, or when the decoded text would
+   * change the path structure (contains a separator, or is '.' / '..').
+   */
+  private decodePathSegment(segment: string): string {
+    try {
+      const decoded = decodeURIComponent(segment);
+      const altersPath =
+        decoded.includes('/') || decoded.includes('\\') || decoded === '.' || decoded === '..';
+      return altersPath ? segment : decoded;
+    } catch {
+      // decodeURIComponent throws on malformed escape sequences; keep the raw text.
+      return segment;
+    }
+  }
+
+  /**
+   * Maps a filename to a MIME type by its extension (case-insensitive).
+   * Returns 'application/octet-stream' when there is no extension or it is unknown.
+   */
+  private inferMimeType(filename: string): string {
+    const dot = filename.lastIndexOf('.');
+    if (dot === -1) return 'application/octet-stream';
+    const ext = filename.substring(dot + 1).toLowerCase();
+    // Own-property check: the extension is untrusted text, and a plain lookup
+    // would resolve names such as 'constructor' through the prototype chain.
+    const mime = Object.prototype.hasOwnProperty.call(EXT_TO_MIME, ext)
+      ? EXT_TO_MIME[ext]
+      : undefined;
+    return mime || 'application/octet-stream';
+  }
+
+  /**
+   * Get text content of a namespaced child element.
+   *
+   * A direct child always wins; a grandchild is used only when no direct child
+   * matches. Returns '' when neither exists.
+   */
+  private getElementText(parent: Element, localName: string, nsUri: string): string {
+    const elements = parent.getElementsByTagNameNS(nsUri, localName);
+    let nestedText: string | null = null;
+    for (let i = 0; i < elements.length; i++) {
+      const el = elements[i];
+      if (el.parentNode === parent) {
+        return this.getTextContent(el);
+      }
+      // Remember the first grandchild, but keep scanning for a direct child.
+      if (nestedText === null && el.parentNode?.parentNode === parent) {
+        nestedText = this.getTextContent(el);
+      }
+    }
+    return nestedText ?? '';
+  }
+
+  /**
+   * Get text content of a direct child element (no namespace).
+   *
+   * An un-prefixed child named `tagName` is preferred. If there is none, the
+   * first direct child whose local name matches is used. Returns '' when no
+   * child matches.
+   */
+  private getDirectChildText(parent: Element, tagName: string): string {
+    const children = parent.childNodes;
+    let prefixedText: string | null = null;
+    for (let i = 0; i < children.length; i++) {
+      const child = children[i];
+      if (child.nodeType !== 1) continue;
+      const el = child as Element;
+      // nodeName is the qualified name, so equality means the element has no prefix.
+      if (el.nodeName === tagName) {
+        return this.getTextContent(el);
+      }
+      if (prefixedText === null && el.localName === tagName) {
+        prefixedText = this.getTextContent(el);
+      }
+    }
+    return prefixedText ?? '';
+  }
+
+  /** Safely extract text content, handling CDATA sections (direct text/CDATA children only). */
+  private getTextContent(el: Element): string {
+    let text = '';
+    const children = el.childNodes;
+    for (let i = 0; i < children.length; i++) {
+      const child = children[i];
+      if (child.nodeType === 3 || child.nodeType === 4) {
+        // Text node or CDATA section
+        text += child.nodeValue || '';
+      }
+    }
+    return text;
+  }
+}