From 5e5028659e064bbbac170e3c6c58532bb163d713 Mon Sep 17 00:00:00 2001 From: Claas Augner <495429+caugner@users.noreply.github.com> Date: Fri, 21 Jun 2024 12:41:38 +0200 Subject: [PATCH] feat(cloud-function): redirect non-canonical URLs (#11151) Previously, we served the same content independent of capitalization: - https://developer.mozilla.org/en-US/docs/web (incorrect capitalization) - https://developer.mozilla.org/en-US/docs/Web (correct capitalization) Now, we build a map to match the requested against the canonical URL, redirecting if necessary. --- .github/workflows/prod-build.yml | 1 + .github/workflows/stage-build.yml | 1 + .github/workflows/xyz-build.yml | 1 + build/cli.ts | 28 +++---- build/sitemaps.ts | 76 +++++++++++++++---- build/spas.ts | 5 +- cloud-function/.gcloudignore | 1 + cloud-function/.gitignore | 1 + cloud-function/package.json | 1 + cloud-function/src/app.ts | 24 ++++-- cloud-function/src/build-canonicals.ts | 40 ++++++++++ .../middlewares/redirect-non-canonicals.ts | 46 +++++++++++ cloud-function/src/utils.ts | 4 + 13 files changed, 190 insertions(+), 39 deletions(-) create mode 100644 cloud-function/src/build-canonicals.ts create mode 100644 cloud-function/src/middlewares/redirect-non-canonicals.ts diff --git a/.github/workflows/prod-build.yml b/.github/workflows/prod-build.yml index 5a68d93e5507..19210702d04e 100644 --- a/.github/workflows/prod-build.yml +++ b/.github/workflows/prod-build.yml @@ -340,6 +340,7 @@ jobs: run: |- npm ci npm run build-redirects + npm run build-canonicals - name: Deploy Function if: ${{ ! vars.SKIP_FUNCTION }} diff --git a/.github/workflows/stage-build.yml b/.github/workflows/stage-build.yml index 7d9cf3b4fad3..c49aa7009736 100644 --- a/.github/workflows/stage-build.yml +++ b/.github/workflows/stage-build.yml @@ -356,6 +356,7 @@ jobs: run: | npm ci npm run build-redirects + npm run build-canonicals - name: Deploy Function if: ${{ ! vars.SKIP_FUNCTION }} diff --git a/.github/workflows/xyz-build.yml b/.github/workflows/xyz-build.yml index 4b86c2cb6581..2646a6a01b7d 100644 --- a/.github/workflows/xyz-build.yml +++ b/.github/workflows/xyz-build.yml @@ -237,6 +237,7 @@ jobs: run: | npm ci npm run build-redirects + npm run build-canonicals - name: Deploy Function if: ${{ ! vars.SKIP_FUNCTION }} diff --git a/build/cli.ts b/build/cli.ts index 6e44be356ed0..828da2ae248d 100644 --- a/build/cli.ts +++ b/build/cli.ts @@ -26,11 +26,10 @@ import { } from "./index.js"; import { Doc, DocMetadata, Flaws } from "../libs/types/document.js"; import SearchIndex from "./search-index.js"; -import { makeSitemapIndexXML, buildSitemap } from "./sitemaps.js"; +import { buildSitemapIndex, buildSitemap } from "./sitemaps.js"; import { humanFileSize } from "./utils.js"; import { initSentry } from "./sentry.js"; import { macroRenderTimes } from "../kumascript/src/render.js"; -import { fdir } from "fdir"; import { ssrDocument } from "./ssr.js"; import { HydrationData } from "../libs/types/hydration.js"; @@ -515,25 +514,16 @@ program if (!options.quiet) { console.log(chalk.yellow("Building sitemap index file...")); } - const sitemapsBuilt = new fdir() - .filter((p) => p.endsWith("/sitemap.xml.gz")) - .withFullPaths() - .crawl(path.join(BUILD_OUT_ROOT, "sitemaps")) - .sync() - .sort() - .map((fp) => fp.replace(BUILD_OUT_ROOT, "")); - const sitemapIndexFilePath = path.join(BUILD_OUT_ROOT, "sitemap.xml"); - fs.writeFileSync( - sitemapIndexFilePath, - makeSitemapIndexXML(sitemapsBuilt) - ); + const sitemapsBuilt = await buildSitemapIndex(); if (!options.quiet) { - console.log( - chalk.green( - `Wrote sitemap index referencing ${sitemapsBuilt.length} sitemaps:\n${sitemapsBuilt.map((s) => `- ${s}`).join("\n")}` - ) - ); + for (const sitemaps of sitemapsBuilt) { + console.log( + chalk.green( + `Wrote sitemap index referencing ${sitemaps.length} sitemaps:\n${sitemaps.map((s) => `- ${s}`).join("\n")}` + ) + ); + } } return; } diff --git a/build/sitemaps.ts b/build/sitemaps.ts index 59f1a028d22d..0bd1573da611 100644 --- a/build/sitemaps.ts +++ b/build/sitemaps.ts @@ -1,7 +1,9 @@ import { join } from "node:path"; -import { mkdir, writeFile } from "node:fs/promises"; +import { mkdir, readFile, writeFile } from "node:fs/promises"; import { gzipSync } from "node:zlib"; +import { fdir } from "fdir"; + import { BASE_URL, BUILD_OUT_ROOT } from "../libs/env/index.js"; const SITEMAP_BASE_URL = BASE_URL.replace(/\/$/, ""); @@ -18,10 +20,61 @@ export async function buildSitemap( pathSuffix = [], }: { slugPrefix?: string; pathSuffix?: string[] } ) { + const txt = entries.map(({ slug }) => `${slugPrefix}${slug}`).join("\n"); const xml = makeSitemapXML(slugPrefix, entries); - const path = await writeSitemap(xml, ...pathSuffix); - return path; + const dirPath = join( + BUILD_OUT_ROOT, + "sitemaps", + ...pathSuffix.map((p) => p.toLowerCase()) + ); + await mkdir(dirPath, { recursive: true }); + + const txtPath = join(dirPath, "sitemap.txt"); + const xmlPath = join(dirPath, "sitemap.xml.gz"); + + await Promise.all([ + writeFile(txtPath, txt, "utf-8"), + writeFile(xmlPath, gzipSync(xml)), + ]); + + return xmlPath; +} + +export async function buildSitemapIndex() { + return Promise.all([buildSitemapIndexTXT(), buildSitemapIndexXML()]); +} + +export async function buildSitemapIndexTXT() { + const sitemaps = new fdir() + .filter((p) => p.endsWith("/sitemap.txt")) + .withFullPaths() + .crawl(join(BUILD_OUT_ROOT, "sitemaps")) + .sync(); + + const file = join(BUILD_OUT_ROOT, "sitemap.txt"); + + const content = await makeSitemapIndexTXT(sitemaps); + + await writeFile(file, content, "utf-8"); + + return sitemaps.sort().map((fp) => fp.replace(BUILD_OUT_ROOT, "")); +} + +export async function buildSitemapIndexXML() { + const sitemaps = new fdir() + .filter((p) => p.endsWith("/sitemap.xml.gz")) + .withFullPaths() + .crawl(join(BUILD_OUT_ROOT, "sitemaps")) + .sync() + .sort() + .map((fp) => fp.replace(BUILD_OUT_ROOT, "")); + + const file = join(BUILD_OUT_ROOT, "sitemap.xml"); + + await writeFile(file, makeSitemapIndexXML(sitemaps)); + + return sitemaps.sort().map((fp) => fp.replace(BUILD_OUT_ROOT, "")); } function makeSitemapXML(prefix: string, docs: SitemapEntry[]) { @@ -62,16 +115,13 @@ export function makeSitemapIndexXML(paths: string[]) { ].join("\n"); } -async function writeSitemap(xml: string, ...paths: string[]) { - const dirPath = join( - BUILD_OUT_ROOT, - "sitemaps", - ...paths.map((p) => p.toLowerCase()) - ); - await mkdir(dirPath, { recursive: true }); +/** + * Creates a global text sitemap by merging all text sitemaps. + */ +export async function makeSitemapIndexTXT(paths: string[]) { + const maps = await Promise.all(paths.map((p) => readFile(p, "utf-8"))); - const filePath = join(dirPath, "sitemap.xml.gz"); - await writeFile(filePath, gzipSync(xml)); + const urls = maps.join("\n").split("\n").filter(Boolean); - return filePath; + return urls.sort().join("\n"); } diff --git a/build/spas.ts b/build/spas.ts index cd358e712fe2..3972c69b769e 100644 --- a/build/spas.ts +++ b/build/spas.ts @@ -135,7 +135,10 @@ export async function buildSPAs(options: { continue; } for (const pathLocale of fs.readdirSync(root)) { - if (!fs.statSync(path.join(root, pathLocale)).isDirectory()) { + if ( + !fs.statSync(path.join(root, pathLocale)).isDirectory() || + !isValidLocale(pathLocale) + ) { continue; } diff --git a/cloud-function/.gcloudignore b/cloud-function/.gcloudignore index 79b725a2ebad..7ddddcfa0d05 100644 --- a/cloud-function/.gcloudignore +++ b/cloud-function/.gcloudignore @@ -15,5 +15,6 @@ .gitignore #!include:.gitignore +!canonicals.json !redirects.json !src/internal/** diff --git a/cloud-function/.gitignore b/cloud-function/.gitignore index deb96650c7f6..57c8cd36b4ca 100644 --- a/cloud-function/.gitignore +++ b/cloud-function/.gitignore @@ -2,6 +2,7 @@ node_modules .env* !.env-dist *.log +canonicals.json redirects.json src/**/*.js src/internal diff --git a/cloud-function/package.json b/cloud-function/package.json index cd0c717f0430..b34347f0f95f 100644 --- a/cloud-function/package.json +++ b/cloud-function/package.json @@ -9,6 +9,7 @@ "main": "src/index.js", "scripts": { "build": "tsc -b", + "build-canonicals": "cross-env NODE_OPTIONS='--no-warnings=ExperimentalWarning --loader ts-node/esm' node src/build-canonicals.ts", "build-redirects": "cross-env NODE_OPTIONS='--no-warnings=ExperimentalWarning --loader ts-node/esm' node src/build-redirects.ts", "copy-internal": "rm -rf ./src/internal && cp -R ../libs ./src/internal", "gcp-build": "npm run build", diff --git a/cloud-function/src/app.ts b/cloud-function/src/app.ts index 66019a394427..1f7221f70d4f 100644 --- a/cloud-function/src/app.ts +++ b/cloud-function/src/app.ts @@ -10,6 +10,7 @@ import { handleStripePlans } from "./handlers/handle-stripe-plans.js"; import { proxyTelemetry } from "./handlers/proxy-telemetry.js"; import { lowercasePathname } from "./middlewares/lowercase-pathname.js"; import { resolveIndexHTML } from "./middlewares/resolve-index-html.js"; +import { redirectNonCanonicals } from "./middlewares/redirect-non-canonicals.js"; import { redirectLeadingSlash } from "./middlewares/redirect-leading-slash.js"; import { redirectMovedPages } from "./middlewares/redirect-moved-pages.js"; import { redirectEnforceTrailingSlash } from "./middlewares/redirect-enforce-trailing-slash.js"; @@ -26,43 +27,59 @@ import { proxyPong } from "./handlers/proxy-pong.js"; const router = Router(); router.use(stripForwardedHostHeaders); router.use(redirectLeadingSlash); +// MDN Plus plans. router.all( "/api/v1/stripe/plans", requireOrigin(Origin.main), handleStripePlans ); +// Backend. router.all( ["/api/*", "/admin-api/*", "/events/fxa", "/users/fxa/*"], requireOrigin(Origin.main), proxyApi ); +// Telemetry. router.all("/submit/mdn-yari/*", requireOrigin(Origin.main), proxyTelemetry); router.all("/pong/*", requireOrigin(Origin.main), express.json(), proxyPong); router.all("/pimg/*", requireOrigin(Origin.main), proxyPong); +// Playground. router.get( ["/[^/]+/docs/*/runner.html", "/[^/]+/blog/*/runner.html", "/runner.html"], requireOrigin(Origin.play), resolveRunnerHtml, proxyRunner ); +// Assets. router.get( ["/assets/*", "/sitemaps/*", "/static/*", "/[^/]+.[^/]+"], requireOrigin(Origin.main), proxyContent ); +router.get( + "/[^/]+/search-index.json", + requireOrigin(Origin.main), + lowercasePathname, + proxyContent +); +// Root. router.get("/", requireOrigin(Origin.main), redirectLocale); +// Live samples. router.get( ["/[^/]+/docs/*/_sample_.*.html", "/[^/]+/blog/*/_sample_.*.html"], requireOrigin(Origin.liveSamples), resolveIndexHTML, proxyContent ); +// Attachments. router.get( `/[^/]+/docs/*/*.(${ANY_ATTACHMENT_EXT.join("|")})`, requireOrigin(Origin.main, Origin.liveSamples, Origin.play), resolveIndexHTML, proxyContent ); +// Pages. +router.use(redirectNonCanonicals); router.get( "/[^/]+/docs/*", requireOrigin(Origin.main), @@ -81,12 +98,7 @@ router.get( resolveIndexHTML, proxyContent ); -router.get( - "/[^/]+/search-index.json", - requireOrigin(Origin.main), - lowercasePathname, - proxyContent -); +// MDN Plus, static pages, etc. router.get( "*", requireOrigin(Origin.main), diff --git a/cloud-function/src/build-canonicals.ts b/cloud-function/src/build-canonicals.ts new file mode 100644 index 000000000000..b5fdd6a484ee --- /dev/null +++ b/cloud-function/src/build-canonicals.ts @@ -0,0 +1,40 @@ +import { readFile, stat, writeFile } from "node:fs/promises"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +import * as dotenv from "dotenv"; + +import { normalizePath } from "./utils.js"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +const root = join(__dirname, "..", ".."); +dotenv.config({ + path: join(root, process.env["ENV_FILE"] || ".env"), +}); + +async function buildCanonicals() { + const { BUILD_OUT_ROOT = join(root, "client", "build") } = process.env; + + const sitemapPath = join(BUILD_OUT_ROOT, "sitemap.txt"); + + const content = await readFile(sitemapPath, "utf-8"); + const lines = content.split("\n"); + const pages = lines.filter((line) => line.startsWith("/")); + + const siteMap: Record = {}; + for (const page of pages) { + siteMap[normalizePath(page)] = page; + } + console.log(`- ${sitemapPath}: ${pages.length} pages`); + + const output = "canonicals.json"; + + await writeFile(output, JSON.stringify(siteMap)); + + const count = Object.keys(siteMap).length; + const kb = Math.round((await stat(output)).size / 1024); + console.log(`Wrote ${count} pages in ${kb} KB.`); +} + +await buildCanonicals(); diff --git a/cloud-function/src/middlewares/redirect-non-canonicals.ts b/cloud-function/src/middlewares/redirect-non-canonicals.ts new file mode 100644 index 000000000000..a2abc9514822 --- /dev/null +++ b/cloud-function/src/middlewares/redirect-non-canonicals.ts @@ -0,0 +1,46 @@ +import { createRequire } from "node:module"; + +import { NextFunction, Request, Response } from "express"; + +import { THIRTY_DAYS } from "../constants.js"; +import { normalizePath, redirect } from "../utils.js"; + +const require = createRequire(import.meta.url); +const REDIRECTS = require("../../canonicals.json"); +const REDIRECT_SUFFIXES = ["/index.json", "/bcd.json", ""]; + +export async function redirectNonCanonicals( + req: Request, + res: Response, + next: NextFunction +) { + const parsedUrl = new URL(req.url, `${req.protocol}://${req.headers.host}/`); + const { pathname } = parsedUrl; + + // Redirect to canonical version. + // Example: + // - Source: /en-US/docs/web/guide/ajax/getting_started + // - Target: /en-US/docs/Web/Guide/AJAX/Getting_Started + for (const suffix of REDIRECT_SUFFIXES) { + if (!pathname.endsWith(suffix)) { + continue; + } + const originalSource = pathname.substring( + 0, + pathname.length - suffix.length + ); + const source = normalizePath(originalSource); + if ( + typeof REDIRECTS[source] == "string" && + REDIRECTS[source] !== originalSource + ) { + const target = REDIRECTS[source] + suffix + parsedUrl.search; + return redirect(res, target, { + status: 301, + cacheControlSeconds: THIRTY_DAYS, + }); + } + } + + next(); +} diff --git a/cloud-function/src/utils.ts b/cloud-function/src/utils.ts index 34f0060d6608..893570e09736 100644 --- a/cloud-function/src/utils.ts +++ b/cloud-function/src/utils.ts @@ -59,3 +59,7 @@ const ANY_ATTACHMENT_REGEXP = createRegExpFromExtensions( export function isAsset(url: string) { return ANY_ATTACHMENT_REGEXP.test(url); } + +export function normalizePath(path: string) { + return path.toLowerCase().replace(/\/$/, ""); +}