From b4697aa1805216b34f5dc2d5e1e9e86d5400bc06 Mon Sep 17 00:00:00 2001 From: "Colby M. White" <3979735+colbywhite@users.noreply.github.com> Date: Mon, 24 Nov 2025 14:40:19 -0600 Subject: [PATCH] dx: use a single git cmd for lastmod calc (#26722) * dx: use a single git cmd for lastmod calc * fix: use commiter date for lastmod --- astro.config.ts | 137 +----------------------------------------- sitemap.serializer.ts | 126 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+), 134 deletions(-) create mode 100644 sitemap.serializer.ts diff --git a/astro.config.ts b/astro.config.ts index d172ba56de..28d08773a9 100644 --- a/astro.config.ts +++ b/astro.config.ts @@ -6,13 +6,11 @@ import liveCode from "astro-live-code"; import starlightLinksValidator from "starlight-links-validator"; import starlightScrollToTop from "starlight-scroll-to-top"; import icon from "astro-icon"; -import sitemap, { type SitemapItem } from "@astrojs/sitemap"; +import sitemap from "@astrojs/sitemap"; import react from "@astrojs/react"; import { readdir } from "fs/promises"; import { fileURLToPath } from "url"; -import { execSync } from "child_process"; -import { existsSync } from "fs"; import remarkValidateImages from "./src/plugins/remark/validate-images"; @@ -22,6 +20,7 @@ import rehypeAutolinkHeadings from "./src/plugins/rehype/autolink-headings.ts"; import rehypeExternalLinks from "./src/plugins/rehype/external-links.ts"; import rehypeHeadingSlugs from "./src/plugins/rehype/heading-slugs.ts"; import rehypeShiftHeadings from "./src/plugins/rehype/shift-headings.ts"; +import { createSitemapLastmodSerializer } from "./sitemap.serializer.ts"; async function autogenSections() { const sections = ( @@ -62,134 +61,6 @@ const customCss = await autogenStyles(); const RUN_LINK_CHECK = process.env.RUN_LINK_CHECK?.toLowerCase() === "true" || false; -/** - * Build a cache of all git last-modified dates in one batch - */ -function buildGitDateCache(): Map | null { - try { - 
console.time("[sitemap] Building git date cache"); - - // Use git log with --name-only and --diff-filter to get all files with their last commit - // The format outputs the commit date followed by the list of files changed in that commit - // e.g. - // 2025-10-01T12:34:56-07:00 - // src/content/docs/file1.mdx - // src/content/docs/file2.mdx - // - // 2025-09-25T09:15:30-07:00 - // src/content/docs/file3.mdx - - const result = execSync( - 'git log --pretty=format:"%cI" --name-only --diff-filter=AMR src/content/docs', - { - encoding: "utf-8", - maxBuffer: 100 * 1024 * 1024, - }, - ); - - const cache = new Map(); - const lines = result.split("\n"); - - let currentDate: string | null = null; - for (const line of lines) { - const trimmed = line.trim(); - if (!trimmed) { - continue; - } - // Lines are either dates or file paths - // Date lines match ISO format - if (/^\d{4}-\d{2}-\d{2}T/.test(trimmed)) { - currentDate = trimmed; - } else if (currentDate) { - const filePath = `./${trimmed}`; // fileURLToPath includes leading ./, so we do the same here - if (!cache.has(filePath)) { - cache.set(filePath, currentDate); // e.g., "src/content/docs/file.mdx" - } - } - } - - console.timeEnd("[sitemap] Building git date cache"); - console.log(`[sitemap] Loaded git dates for ${cache.size} files`); - return cache; - } catch (error) { - console.warn("[sitemap] Failed to build git date cache:", error); - return null; - } -} - -const gitDateCache = buildGitDateCache(); - -/** - * Get the last Git modification date for a file (from cache) - * @param filePath - Path to the file - * @returns ISO date string or null if not available - */ -function getGitLastModified(filePath: string): string | undefined { - if (!gitDateCache) { - console.warn("[sitemap] Git date cache is not initialized"); - return undefined; - } - - const result = gitDateCache.get(filePath); - - if (!result) { - console.log(`[sitemap] Last modified not found in git for: "${filePath}"`); - } - - return result ?? 
undefined; -} - -/** - * Convert a sitemap URL to the corresponding source file path - * @param url - The full URL from the sitemap - * @returns Absolute file path or null if not found - */ -function urlToFilePath(url: string): string | null { - try { - const urlObj = new URL(url); - const pathname = urlObj.pathname.replace(/\/$/, ""); // Remove trailing slash - - // Try different file extensions and paths - const possiblePaths = [ - `./src/content/docs${pathname}.md`, - `./src/content/docs${pathname}.mdx`, - `./src/content/docs${pathname}/index.md`, - `./src/content/docs${pathname}/index.mdx`, - ]; - - for (const path of possiblePaths) { - if (existsSync(path)) { - return path; - } - } - - return null; - } catch (_error) { - return null; - } -} - -function addLastModDate(item: SitemapItem) { - const filePath = urlToFilePath(item.url); - if (filePath) { - const gitDate = getGitLastModified(filePath); - if (gitDate) { - item.lastmod = gitDate; - } else { - console.warn( - `[sitemap] No git last mod date found for ${filePath} (${item.url}) - setting to now`, - ); - item.lastmod = new Date().toISOString(); - } - } else { - console.warn( - `[sitemap] Could not find source file for ${item.url} - setting last modified to now`, - ); - item.lastmod = new Date().toISOString(); - } - return item; -} - // https://astro.build/config export default defineConfig({ site: "https://developers.cloudflare.com", @@ -324,9 +195,7 @@ export default defineConfig({ return true; }, - serialize(item) { - return addLastModDate(item); - }, + serialize: createSitemapLastmodSerializer(), }), react(), ], diff --git a/sitemap.serializer.ts b/sitemap.serializer.ts new file mode 100644 index 0000000000..34ae6b24d8 --- /dev/null +++ b/sitemap.serializer.ts @@ -0,0 +1,126 @@ +import { spawn } from "node:child_process"; +import * as readline from "node:readline"; +import type { SitemapItem } from "@astrojs/sitemap"; +import { existsSync } from "fs"; +import { green, blue, dim } from "kleur/colors"; + 
/**
 * Formats a duration the way Astro prints durations in its build log
 * (i.e. build/util.ts from astro/core).
 *
 * @param duration - Elapsed time in milliseconds.
 * @returns Whole milliseconds (`"42ms"`) below one second, otherwise seconds
 * with two decimals (`"1.23s"`).
 */
export function readableMsDuration(duration: number): string {
  return duration < 1000
    ? `${Math.round(duration)}ms`
    : `${(duration / 1000).toFixed(2)}s`;
}

/**
 * Marker placed in front of every commit date by the `--pretty` format so a
 * date line can be told apart from a file path line.
 */
const GIT_DATE_PREFIX = "DATE: ";

/**
 * Creates an incremental parser for the output of
 * `git log --pretty=format:"DATE: %cI" --name-only`.
 *
 * The output is a sequence of `DATE: <ISO date>` lines, each followed by the
 * paths touched by that commit. Commits are listed newest first, so the first
 * date seen for a path is its last modification date; later (older) sightings
 * are ignored.
 *
 * @returns The map being filled (`lastmodMetadata`) and a `push` function to
 * feed one line of output at a time.
 */
function createGitLogParser() {
  const lastmodMetadata = new Map<string, Date>();
  let currentDate: Date | undefined;

  return {
    lastmodMetadata,
    push(rawLine: string): void {
      const line = rawLine.trim();
      if (!line) return;

      if (line.startsWith(GIT_DATE_PREFIX)) {
        // Let the Date class deal with the time zone offset.
        const parsed = new Date(line.slice(GIT_DATE_PREFIX.length));
        // An Invalid Date would make `toISOString()` throw later on, so the
        // files of such a commit are skipped instead of recorded.
        currentDate = Number.isNaN(parsed.getTime()) ? undefined : parsed;
        return;
      }

      if (currentDate !== undefined && !lastmodMetadata.has(line)) {
        lastmodMetadata.set(line, currentDate);
      }
    },
  };
}

/**
 * A helper function that uses git shell commands to get last modified dates.
 * Note: it is important that this is only called once with all relevant paths
 * as opposed to calling this separately for individual paths.
 *
 * @param dirs - Pathspecs (directories or globs) whose history is inspected.
 * @returns Map from repository-relative file path to the committer date of the
 * newest commit that added, modified or renamed that file.
 * @throws Rejects when git cannot be spawned or exits with a non-zero code.
 */
function getLastmodViaGitShell(...dirs: string[]): Promise<Map<string, Date>> {
  const gitArgs = [
    // Print non-ASCII paths verbatim instead of as quoted octal escapes, so
    // they can be compared with paths built from the sitemap URL.
    "-c",
    "core.quotepath=off",
    "log",
    `--pretty=format:${GIT_DATE_PREFIX}%cI`,
    "--diff-filter=AMR",
    "--name-only",
    // Everything after `--` is a pathspec, never a revision.
    "--",
    ...dirs,
  ];

  return new Promise<Map<string, Date>>((resolve, reject) => {
    const git = spawn("git", gitArgs, { cwd: process.cwd() });
    // Use readline to process output to reduce memory usage since output will be large.
    const rl = readline.createInterface({
      input: git.stdout,
      crlfDelay: Infinity,
    });

    const parser = createGitLogParser();
    rl.on("line", (rawLine) => parser.push(rawLine));

    let stderr = "";
    git.stderr.on("data", (chunk: Buffer | string) => {
      stderr += chunk.toString();
    });

    git.on("error", (error) => {
      rl.close();
      reject(error);
    });

    git.on("close", (code) => {
      rl.close();
      if (code !== 0) {
        reject(new Error(`git exited with code ${code}: ${stderr}`));
        return;
      }
      resolve(parser.lastmodMetadata);
    });
  });
}

/**
 * Percent-decodes a URL pathname, returning it unchanged when it contains a
 * malformed escape sequence.
 */
function decodePathname(pathname: string): string {
  try {
    return decodeURIComponent(pathname);
  } catch {
    return pathname;
  }
}

/**
 * Attempt to find corresponding source file path for a sitemap item.
 * This only looks for Starlight files in `src/content/docs`.
 * @todo Leverage Astro's IntegrationResolvedRoute.entrypoint to account for pages in `src/pages`.
 * @returns Relative file path or null if not found
 */
function getSourceFile(item: SitemapItem): string | null {
  let url: URL;
  try {
    url = new URL(item.url);
  } catch {
    // Not an absolute URL, so there is nothing to map to a file.
    return null;
  }

  // `URL.pathname` is percent-encoded while file names on disk are not.
  // Remove trailing slash
  const pathname = decodePathname(url.pathname).replace(/\/$/, "");

  // Try different file extensions and paths
  const possiblePaths = [
    `src/content/docs${pathname}.md`,
    `src/content/docs${pathname}.mdx`,
    `src/content/docs${pathname}/index.md`,
    `src/content/docs${pathname}/index.mdx`,
  ];

  return possiblePaths.find((path) => existsSync(path)) ?? null;
}

/**
 * Creates a `serialize` callback for `@astrojs/sitemap` that fills in
 * `lastmod` for every sitemap item.
 *
 * The git history is read once, on the first invocation, and shared by all
 * later (or overlapping) invocations. Items whose source file cannot be found,
 * or that have no entry in the git history, get the time at which the
 * serializer was created.
 *
 * @returns An async function that sets `item.lastmod` and returns the item.
 */
export function createSitemapLastmodSerializer() {
  // The promise is cached rather than its result, so calls that overlap with
  // the first one wait for the same git process instead of spawning another.
  let lastModMetadata: Promise<Map<string, Date>> | undefined = undefined;
  const currentDateString = new Date().toISOString();

  const loadLastmodMetadata = async (): Promise<Map<string, Date>> => {
    const startTime = performance.now();
    const metadata = await getLastmodViaGitShell(
      "src/content/docs",
      "src/pages/*.astro",
    );
    const endTime = performance.now();

    // Mimic the Astro logger output
    console.log(
      dim(new Date().toLocaleTimeString("en-US", { hour12: false })),
      blue("[@cloudflare/lastmod-serializer]"),
      green(
        `✓ Lastmod metadata calculated in ${readableMsDuration(endTime - startTime)}.`,
      ),
    );
    return metadata;
  };

  return async (item: SitemapItem): Promise<SitemapItem> => {
    const filePath = getSourceFile(item);
    // Only calculate metadata once
    if (lastModMetadata === undefined) {
      lastModMetadata = loadLastmodMetadata();
    }
    // Always await, even without a file path, so a git failure is reported to
    // the caller instead of becoming an unhandled rejection.
    const metadata = await lastModMetadata;
    const lastmod = filePath === null ? undefined : metadata.get(filePath);
    item.lastmod = lastmod?.toISOString() ?? currentDateString;
    return item;
  };
}