import * as fs from 'node:fs/promises';
import * as path from 'node:path';

/** Outcome of comparing the URLs listed in a sitemap with the generated HTML tree. */
export interface SiteValidationDiffResult {
  /** Sitemap paths with no non-empty `index.html` on disk (sorted). */
  missingUrlPaths: string[];
  /** Paths that have an `index.html` on disk (empty or not) but are absent from the sitemap (sorted). */
  extraUrlPaths: string[];
  /** Post paths whose source file is newer than the generated HTML (sorted). */
  updatedPostUrlPaths: string[];
  /** Number of distinct normalized paths found in the sitemap. */
  expectedUrlCount: number;
  /** Number of distinct paths backed by a non-empty `index.html`. */
  existingHtmlUrlCount: number;
}

/** One post whose source modification time should be compared with its generated HTML. */
export interface PostTimestampCheck {
  /** Site-relative URL path of the post (normalized before use). */
  postUrlPath: string;
  /** Filesystem path of the post's source file. */
  postFilePath: string;
  /** Optional generation timestamp (ms); the later of this and the HTML mtime is used. */
  generatedUpdatedAtMs?: number;
}

interface CompareSitemapToHtmlParams {
  sitemapXml: string;
  baseUrl: string;
  htmlDir: string;
  postTimestampChecks?: PostTimestampCheck[];
}

/**
 * Normalizes a URL path to the form `/a/b`: surrounding whitespace, the query
 * string, the fragment, and leading/trailing slashes are removed.
 * Empty input and `/` both normalize to `/`.
 */
function normalizeUrlPath(urlPath: string): string {
  const trimmed = (urlPath || '').trim();
  if (!trimmed || trimmed === '/') {
    return '/';
  }

  const withoutQueryOrHash = trimmed.split('?')[0]?.split('#')[0] ?? '';
  const withoutSlashes = withoutQueryOrHash.replace(/^\/+|\/+$/g, '');
  return withoutSlashes ? `/${withoutSlashes}` : '/';
}

/**
 * Converts a sitemap `loc` value into a normalized, site-relative path by
 * removing the path portion of `baseUrl` when `loc` lives underneath it.
 *
 * - If `loc` is not an absolute URL, it is treated as a path and normalized.
 * - If `baseUrl` cannot be parsed, no prefix is stripped.
 * - The base path is stripped only on a path-segment boundary, so base `/blog`
 *   does not affect `/blogging/post`.
 */
function sitemapLocToProjectPath(loc: string, baseUrl: string): string {
  let locPath: string;
  try {
    locPath = new URL(loc).pathname.replace(/\/+$/, '');
  } catch {
    // Not an absolute URL; fall back to treating the raw value as a path.
    return normalizeUrlPath(loc);
  }

  let basePath = '';
  try {
    basePath = new URL(baseUrl).pathname.replace(/\/+$/, '');
  } catch {
    // Unparseable base URL: there is no prefix to strip.
  }

  const isUnderBase =
    basePath !== '' && (locPath === basePath || locPath.startsWith(`${basePath}/`));
  if (isUnderBase) {
    return normalizeUrlPath(locPath.slice(basePath.length) || '/');
  }
  return normalizeUrlPath(locPath || '/');
}

/**
 * Returns the trimmed, non-empty text content of every `<loc>` element in the
 * sitemap XML, in document order. Values may span multiple lines.
 */
function extractSitemapLocs(sitemapXml: string): string[] {
  const locs: string[] = [];
  for (const match of sitemapXml.matchAll(/<loc>([\s\S]*?)<\/loc>/g)) {
    const value = match[1]?.trim();
    if (value) {
      locs.push(value);
    }
  }
  return locs;
}

/**
 * Recursively walks `htmlDir` and maps every `index.html` file to its URL path
 * (`<htmlDir>/a/b/index.html` becomes `/a/b`; the top-level file becomes `/`).
 *
 * Files with a size of zero, or that cannot be stat'ed, are reported in
 * `zeroByteHtmlPathSet`; all others in `existingHtmlPathSet`. Directories that
 * cannot be read are skipped, so a missing `htmlDir` yields two empty sets.
 */
async function collectHtmlIndexPaths(htmlDir: string): Promise<{
  existingHtmlPathSet: Set<string>;
  zeroByteHtmlPathSet: Set<string>;
}> {
  const existingHtmlPathSet = new Set<string>();
  const zeroByteHtmlPathSet = new Set<string>();

  const collectIndexPaths = async (dir: string, relativePrefix = ''): Promise<void> => {
    let entries: Array<{ name: string; isDirectory: () => boolean; isFile: () => boolean }>;
    try {
      entries = await fs.readdir(dir, { withFileTypes: true, encoding: 'utf8' });
    } catch {
      // Best effort: an unreadable or missing directory contributes no paths.
      return;
    }

    for (const entry of entries) {
      const nextRelative = relativePrefix ? `${relativePrefix}/${entry.name}` : entry.name;
      const nextPath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        await collectIndexPaths(nextPath, nextRelative);
        continue;
      }
      if (!entry.isFile() || entry.name !== 'index.html') {
        continue;
      }

      const relativeDir = nextRelative.replace(/(^|\/)index\.html$/, '');
      const urlPath = normalizeUrlPath(relativeDir ? `/${relativeDir}` : '/');

      let isEmpty: boolean;
      try {
        isEmpty = (await fs.stat(nextPath)).size <= 0;
      } catch {
        // A file that cannot be stat'ed is treated like an empty one.
        isEmpty = true;
      }
      (isEmpty ? zeroByteHtmlPathSet : existingHtmlPathSet).add(urlPath);
    }
  };

  await collectIndexPaths(htmlDir);
  return { existingHtmlPathSet, zeroByteHtmlPathSet };
}

/**
 * Compares the URLs listed in a sitemap with the `index.html` files under
 * `htmlDir`, and optionally detects posts whose source is newer than their
 * generated HTML.
 *
 * @param params.sitemapXml - Raw sitemap XML; `<loc>` values are extracted from it.
 * @param params.baseUrl - Site base URL; its path prefix is stripped from each `loc`.
 * @param params.htmlDir - Root directory of the generated HTML output.
 * @param params.postTimestampChecks - Posts to check. A post is only checked when
 *   its path is in the sitemap and has a non-empty `index.html`; posts whose
 *   files cannot be stat'ed are skipped.
 * @returns Sorted lists of missing, extra and updated paths, plus the counts.
 */
export async function compareSitemapToHtml(
  params: CompareSitemapToHtmlParams,
): Promise<SiteValidationDiffResult> {
  const expectedPathSet = new Set<string>(
    extractSitemapLocs(params.sitemapXml).map((loc) =>
      sitemapLocToProjectPath(loc, params.baseUrl),
    ),
  );

  const { existingHtmlPathSet, zeroByteHtmlPathSet } = await collectHtmlIndexPaths(
    params.htmlDir,
  );

  const missingUrlPaths = [...expectedPathSet]
    .filter((urlPath) => !existingHtmlPathSet.has(urlPath))
    .sort();

  // Empty index.html files still count as "extra" when the sitemap does not list them.
  const extraUrlPaths = [...new Set<string>([...existingHtmlPathSet, ...zeroByteHtmlPathSet])]
    .filter((urlPath) => !expectedPathSet.has(urlPath))
    .sort();

  const updatedPostPathSet = new Set<string>();
  const postTimestampChecks = Array.isArray(params.postTimestampChecks)
    ? params.postTimestampChecks
    : [];

  for (const check of postTimestampChecks) {
    const postUrlPath = normalizeUrlPath(check.postUrlPath);

    // Only posts that are both expected and already generated can be "updated";
    // expected-but-not-generated posts are reported as missing instead.
    if (!expectedPathSet.has(postUrlPath) || !existingHtmlPathSet.has(postUrlPath)) {
      continue;
    }

    // For the root path the slice is '', so this resolves to <htmlDir>/index.html.
    const htmlPath = path.join(params.htmlDir, postUrlPath.slice(1), 'index.html');

    let htmlMtimeMs: number;
    let postMtimeMs: number;
    try {
      const [htmlStat, postStat] = await Promise.all([
        fs.stat(htmlPath),
        fs.stat(check.postFilePath),
      ]);
      htmlMtimeMs = htmlStat.mtimeMs;
      postMtimeMs = postStat.mtimeMs;
    } catch {
      // Either file is unavailable; nothing meaningful to compare.
      continue;
    }

    const generatedAt = check.generatedUpdatedAtMs;
    // A NaN timestamp would poison Math.max and disable the check, so treat it as absent.
    const generatedUpdatedAtMs =
      typeof generatedAt === 'number' && !Number.isNaN(generatedAt) ? generatedAt : 0;
    const effectiveGeneratedAtMs = Math.max(htmlMtimeMs, generatedUpdatedAtMs);

    if (postMtimeMs > effectiveGeneratedAtMs) {
      updatedPostPathSet.add(postUrlPath);
    }
  }

  return {
    missingUrlPaths,
    extraUrlPaths,
    updatedPostUrlPaths: [...updatedPostPathSet].sort(),
    expectedUrlCount: expectedPathSet.size,
    existingHtmlUrlCount: existingHtmlPathSet.size,
  };
}