dx: use a single git cmd for lastmod calc (#26722)

* dx: use a single git cmd for lastmod calc

* fix: use committer date for lastmod
This commit is contained in:
Colby M. White 2025-11-24 14:40:19 -06:00 committed by GitHub
parent e201c05f5e
commit b4697aa180
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 129 additions and 134 deletions

View file

@ -6,13 +6,11 @@ import liveCode from "astro-live-code";
import starlightLinksValidator from "starlight-links-validator";
import starlightScrollToTop from "starlight-scroll-to-top";
import icon from "astro-icon";
import sitemap, { type SitemapItem } from "@astrojs/sitemap";
import sitemap from "@astrojs/sitemap";
import react from "@astrojs/react";
import { readdir } from "fs/promises";
import { fileURLToPath } from "url";
import { execSync } from "child_process";
import { existsSync } from "fs";
import remarkValidateImages from "./src/plugins/remark/validate-images";
@ -22,6 +20,7 @@ import rehypeAutolinkHeadings from "./src/plugins/rehype/autolink-headings.ts";
import rehypeExternalLinks from "./src/plugins/rehype/external-links.ts";
import rehypeHeadingSlugs from "./src/plugins/rehype/heading-slugs.ts";
import rehypeShiftHeadings from "./src/plugins/rehype/shift-headings.ts";
import { createSitemapLastmodSerializer } from "./sitemap.serializer.ts";
async function autogenSections() {
const sections = (
@ -62,134 +61,6 @@ const customCss = await autogenStyles();
// True only when the RUN_LINK_CHECK environment variable equals "true"
// (compared case-insensitively); false when it is unset or has any other value.
const RUN_LINK_CHECK = process.env.RUN_LINK_CHECK?.toLowerCase() === "true";
/**
 * Collect a last-commit date for every file under `src/content/docs` with a
 * single `git log` invocation.
 *
 * The command prints one ISO 8601 committer date (`%cI`) per commit, followed
 * by the paths that commit added, modified or renamed, e.g.
 *
 *   2025-10-01T12:34:56-07:00
 *   src/content/docs/file1.mdx
 *   src/content/docs/file2.mdx
 *
 *   2025-09-25T09:15:30-07:00
 *   src/content/docs/file3.mdx
 *
 * @returns A map from `./`-prefixed path (as printed by git) to the date string
 * of the first commit in the output that lists it, or `null` if the command
 * or the parsing throws.
 */
function buildGitDateCache(): Map<string, string> | null {
  // A line starting with an ISO date is a commit header; anything else is a path.
  const isoDateLine = /^\d{4}-\d{2}-\d{2}T/;

  try {
    console.time("[sitemap] Building git date cache");

    const output = execSync(
      'git log --pretty=format:"%cI" --name-only --diff-filter=AMR src/content/docs',
      {
        encoding: "utf-8",
        maxBuffer: 100 * 1024 * 1024,
      },
    );

    const dates = new Map<string, string>();
    let activeDate: string | null = null;

    for (const rawLine of output.split("\n")) {
      const entry = rawLine.trim();
      if (entry === "") {
        continue;
      }

      if (isoDateLine.test(entry)) {
        activeDate = entry;
        continue;
      }

      // Ignore any path that shows up before the first date line.
      if (activeDate === null) {
        continue;
      }

      // Keys carry a leading "./" so they match the paths urlToFilePath returns.
      const key = `./${entry}`;
      if (!dates.has(key)) {
        // First occurrence wins; later (older) commits never overwrite it.
        dates.set(key, activeDate);
      }
    }

    console.timeEnd("[sitemap] Building git date cache");
    console.log(`[sitemap] Loaded git dates for ${dates.size} files`);
    return dates;
  } catch (error) {
    console.warn("[sitemap] Failed to build git date cache:", error);
    return null;
  }
}
// Built once when this module is evaluated; holds null if the git command failed.
const gitDateCache = buildGitDateCache();
/**
 * Look up a file's last git modification date in the prebuilt cache.
 * @param filePath - Path used as the cache key (e.g. `./src/content/docs/foo.mdx`)
 * @returns The cached date string, or `undefined` when the cache is
 * unavailable or has no entry for the path
 */
function getGitLastModified(filePath: string): string | undefined {
  if (!gitDateCache) {
    console.warn("[sitemap] Git date cache is not initialized");
    return undefined;
  }

  const cachedDate = gitDateCache.get(filePath);
  if (cachedDate) {
    return cachedDate;
  }

  console.log(`[sitemap] Last modified not found in git for: "${filePath}"`);
  return cachedDate;
}
/**
 * Map a sitemap URL to the content source file it was presumably built from.
 *
 * The URL's pathname (minus one trailing slash) is probed under
 * `./src/content/docs` as `<path>.md`, `<path>.mdx`, `<path>/index.md` and
 * `<path>/index.mdx`, in that order.
 *
 * @param url - The full URL from the sitemap
 * @returns The first candidate that exists on disk, as a `./`-prefixed path
 * relative to the current working directory, or `null` when none exists or
 * the URL cannot be parsed
 */
function urlToFilePath(url: string): string | null {
  try {
    const { pathname: rawPathname } = new URL(url);
    const pathname = rawPathname.replace(/\/$/, ""); // Drop one trailing slash

    const candidates = [
      `./src/content/docs${pathname}.md`,
      `./src/content/docs${pathname}.mdx`,
      `./src/content/docs${pathname}/index.md`,
      `./src/content/docs${pathname}/index.mdx`,
    ];

    const match = candidates.find((candidate) => existsSync(candidate));
    return match ?? null;
  } catch (_error) {
    // An unparsable URL is treated the same as "no source file".
    return null;
  }
}
/**
 * Set `lastmod` on a sitemap item from the git date of its source file.
 *
 * When the source file cannot be located, or no git date is cached for it,
 * a warning is logged and `lastmod` is set to the current time instead.
 *
 * @param item - Sitemap entry to update; it is mutated in place
 * @returns The same `item` instance
 */
function addLastModDate(item: SitemapItem) {
  const sourcePath = urlToFilePath(item.url);
  if (!sourcePath) {
    console.warn(
      `[sitemap] Could not find source file for ${item.url} - setting last modified to now`,
    );
    item.lastmod = new Date().toISOString();
    return item;
  }

  const committedAt = getGitLastModified(sourcePath);
  if (!committedAt) {
    console.warn(
      `[sitemap] No git last mod date found for ${sourcePath} (${item.url}) - setting to now`,
    );
    item.lastmod = new Date().toISOString();
    return item;
  }

  item.lastmod = committedAt;
  return item;
}
// https://astro.build/config
export default defineConfig({
site: "https://developers.cloudflare.com",
@ -324,9 +195,7 @@ export default defineConfig({
return true;
},
serialize(item) {
return addLastModDate(item);
},
serialize: createSitemapLastmodSerializer(),
}),
react(),
],

126
sitemap.serializer.ts Normal file
View file

@ -0,0 +1,126 @@
import { spawn } from "node:child_process";
import * as readline from "node:readline";
import type { SitemapItem } from "@astrojs/sitemap";
import { existsSync } from "fs";
import { green, blue, dim } from "kleur/colors";
/**
 * Format a duration for log output, in the style Astro uses for its own
 * build timings (i.e. build/util.ts from astro/core).
 *
 * @param duration - Elapsed time in milliseconds
 * @returns Whole milliseconds (`"42ms"`) below one second, otherwise seconds
 * with two decimals (`"1.50s"`)
 */
export function readableMsDuration(duration: number) {
  if (duration < 1000) {
    const wholeMs = Math.round(duration);
    return `${wholeMs}ms`;
  }

  const seconds = duration / 1000;
  return `${seconds.toFixed(2)}s`;
}
/**
 * Run one `git log` over the given pathspecs and collect a last-commit date
 * per file.
 *
 * Each commit is printed as a `DATE: <ISO 8601 committer date>` line followed
 * by the paths it added, modified or renamed. Only the first date seen for a
 * path is recorded.
 *
 * Note: call this once with every relevant path rather than once per path;
 * each call spawns a git process and streams its whole output.
 *
 * @param dirs - Pathspecs passed through to `git log`
 * @returns A promise resolving to a map from path (as printed by git) to date.
 * It rejects if git cannot be spawned or exits with a non-zero code.
 */
async function getLastmodViaGitShell(...dirs: string[]) {
  const DATE_PREFIX = "DATE: ";
  const gitArgs = [
    "log",
    "--pretty=format:DATE: %cI",
    "--diff-filter=AMR",
    "--name-only",
    ...dirs,
  ];

  return new Promise<Map<string, Date>>((resolve, reject) => {
    const child = spawn("git", gitArgs, { cwd: process.cwd() });

    // Stream stdout line by line instead of buffering the whole (large) output.
    const lineReader = readline.createInterface({
      input: child.stdout!,
      crlfDelay: Infinity,
    });

    const datesByFile = new Map<string, Date>();
    let commitDate: Date | undefined;
    let stderrText = "";

    lineReader.on("line", (rawLine) => {
      const text = rawLine.trim();
      if (text === "") return;

      if (text.startsWith(DATE_PREFIX)) {
        // Strip the prefix and let Date parse the timestamp, offset included.
        commitDate = new Date(text.slice(DATE_PREFIX.length));
      } else if (commitDate && !datesByFile.has(text)) {
        // Any other line is a file path; keep only its first date.
        datesByFile.set(text, commitDate);
      }
    });

    child.stderr?.on("data", (chunk) => {
      stderrText += chunk.toString();
    });

    child.on("error", reject);

    child.on("close", (code) => {
      lineReader.close();
      if (code === 0) {
        resolve(datesByFile);
      } else {
        reject(new Error(`git exited with code ${code}: ${stderrText}`));
      }
    });
  });
}
/**
 * Try to locate the source file behind a sitemap item.
 *
 * Only Starlight content under `src/content/docs` is considered: the URL's
 * pathname (minus one trailing slash) is probed as `<path>.md`, `<path>.mdx`,
 * `<path>/index.md` and `<path>/index.mdx`, in that order.
 *
 * @todo Leverage Astro's IntegrationResolvedRoute.entrypoint to account for pages in `src/pages`.
 * @returns The first existing candidate as a path relative to the current
 * working directory, or null if none exists
 */
function getSourceFile(item: SitemapItem) {
  const { pathname: rawPathname } = new URL(item.url);
  // Drop one trailing slash so "/foo/" and "/foo" map to the same candidates
  const pathname = rawPathname.replace(/\/$/, "");

  const candidates = [
    `src/content/docs${pathname}.md`,
    `src/content/docs${pathname}.mdx`,
    `src/content/docs${pathname}/index.md`,
    `src/content/docs${pathname}/index.mdx`,
  ];

  const match = candidates.find((candidate) => existsSync(candidate));
  return match ?? null;
}
/**
 * Create a `serialize` callback for the sitemap integration that fills in
 * each item's `lastmod` from git history.
 *
 * The git metadata is loaded lazily on the first call and shared by all later
 * calls. The in-flight promise is cached rather than the resolved map, so
 * overlapping invocations of the callback still trigger only one git run.
 * If that run fails, every call rejects with the same error.
 *
 * @returns An async function that mutates and returns the given item. Items
 * whose source file is unknown, or has no git date, get the timestamp taken
 * when the serializer was created.
 */
export function createSitemapLastmodSerializer() {
  let lastmodMetadataPromise: Promise<Map<string, Date>> | undefined;
  // One fallback timestamp for the whole build keeps unresolved items consistent.
  const currentDateString = new Date().toISOString();

  // Runs the single git query and reports how long it took.
  const loadLastmodMetadata = async () => {
    const startTime = performance.now();
    const metadata = await getLastmodViaGitShell(
      "src/content/docs",
      "src/pages/*.astro",
    );
    const endTime = performance.now();

    // Mimic the Astro logger output
    console.log(
      dim(new Date().toLocaleTimeString("en-US", { hour12: false })),
      blue("[@cloudflare/lastmod-serializer]"),
      green(
        `✓ Lastmod metadata calculated in ${readableMsDuration(endTime - startTime)}.`,
      ),
    );

    return metadata;
  };

  return async (item: SitemapItem) => {
    const filePath = getSourceFile(item);

    // Assign the promise synchronously, before any await, so a second call
    // that arrives while git is still running reuses it.
    if (lastmodMetadataPromise === undefined) {
      lastmodMetadataPromise = loadLastmodMetadata();
    }
    const lastmodMetadata = await lastmodMetadataPromise;

    const lastmod = filePath ? lastmodMetadata.get(filePath) : undefined;
    item.lastmod = lastmod ? lastmod.toISOString() : currentDateString;

    return item;
  };
}