Skip to content

Commit

Permalink
feat(cloud-function): redirect non-canonical URLs (#11151)
Browse files Browse the repository at this point in the history
Previously, we served the same content independent of capitalization:
- https://developer.mozilla.org/en-US/docs/web (incorrect capitalization)
- https://developer.mozilla.org/en-US/docs/Web (correct capitalization)

Now, we build a map to match the requested against the canonical URL,
redirecting if necessary.
  • Loading branch information
caugner authored Jun 21, 2024
1 parent 118f71b commit 8568a5e
Show file tree
Hide file tree
Showing 13 changed files with 190 additions and 39 deletions.
1 change: 1 addition & 0 deletions .github/workflows/prod-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ jobs:
run: |-
npm ci
npm run build-redirects
npm run build-canonicals
- name: Deploy Function
if: ${{ ! vars.SKIP_FUNCTION }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/stage-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ jobs:
run: |
npm ci
npm run build-redirects
npm run build-canonicals
- name: Deploy Function
if: ${{ ! vars.SKIP_FUNCTION }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/xyz-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ jobs:
run: |
npm ci
npm run build-redirects
npm run build-canonicals
- name: Deploy Function
if: ${{ ! vars.SKIP_FUNCTION }}
Expand Down
28 changes: 9 additions & 19 deletions build/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,10 @@ import {
} from "./index.js";
import { Doc, DocMetadata, Flaws } from "../libs/types/document.js";
import SearchIndex from "./search-index.js";
import { makeSitemapIndexXML, buildSitemap } from "./sitemaps.js";
import { buildSitemapIndex, buildSitemap } from "./sitemaps.js";
import { humanFileSize } from "./utils.js";
import { initSentry } from "./sentry.js";
import { macroRenderTimes } from "../kumascript/src/render.js";
import { fdir } from "fdir";
import { ssrDocument } from "./ssr.js";
import { HydrationData } from "../libs/types/hydration.js";

Expand Down Expand Up @@ -515,25 +514,16 @@ program
if (!options.quiet) {
console.log(chalk.yellow("Building sitemap index file..."));
}
const sitemapsBuilt = new fdir()
.filter((p) => p.endsWith("/sitemap.xml.gz"))
.withFullPaths()
.crawl(path.join(BUILD_OUT_ROOT, "sitemaps"))
.sync()
.sort()
.map((fp) => fp.replace(BUILD_OUT_ROOT, ""));
const sitemapIndexFilePath = path.join(BUILD_OUT_ROOT, "sitemap.xml");
fs.writeFileSync(
sitemapIndexFilePath,
makeSitemapIndexXML(sitemapsBuilt)
);
const sitemapsBuilt = await buildSitemapIndex();

if (!options.quiet) {
console.log(
chalk.green(
`Wrote sitemap index referencing ${sitemapsBuilt.length} sitemaps:\n${sitemapsBuilt.map((s) => `- ${s}`).join("\n")}`
)
);
for (const sitemaps of sitemapsBuilt) {
console.log(
chalk.green(
`Wrote sitemap index referencing ${sitemaps.length} sitemaps:\n${sitemaps.map((s) => `- ${s}`).join("\n")}`
)
);
}
}
return;
}
Expand Down
76 changes: 63 additions & 13 deletions build/sitemaps.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import { join } from "node:path";
import { mkdir, writeFile } from "node:fs/promises";
import { mkdir, readFile, writeFile } from "node:fs/promises";
import { gzipSync } from "node:zlib";

import { fdir } from "fdir";

import { BASE_URL, BUILD_OUT_ROOT } from "../libs/env/index.js";

const SITEMAP_BASE_URL = BASE_URL.replace(/\/$/, "");
Expand All @@ -18,10 +20,61 @@ export async function buildSitemap(
pathSuffix = [],
}: { slugPrefix?: string; pathSuffix?: string[] }
) {
const txt = entries.map(({ slug }) => `${slugPrefix}${slug}`).join("\n");
const xml = makeSitemapXML(slugPrefix, entries);
const path = await writeSitemap(xml, ...pathSuffix);

return path;
const dirPath = join(
BUILD_OUT_ROOT,
"sitemaps",
...pathSuffix.map((p) => p.toLowerCase())
);
await mkdir(dirPath, { recursive: true });

const txtPath = join(dirPath, "sitemap.txt");
const xmlPath = join(dirPath, "sitemap.xml.gz");

await Promise.all([
writeFile(txtPath, txt, "utf-8"),
writeFile(xmlPath, gzipSync(xml)),
]);

return xmlPath;
}

export async function buildSitemapIndex() {
return Promise.all([buildSitemapIndexTXT(), buildSitemapIndexXML()]);
}

export async function buildSitemapIndexTXT() {
const sitemaps = new fdir()
.filter((p) => p.endsWith("/sitemap.txt"))
.withFullPaths()
.crawl(join(BUILD_OUT_ROOT, "sitemaps"))
.sync();

const file = join(BUILD_OUT_ROOT, "sitemap.txt");

const content = await makeSitemapIndexTXT(sitemaps);

await writeFile(file, content, "utf-8");

return sitemaps.sort().map((fp) => fp.replace(BUILD_OUT_ROOT, ""));
}

export async function buildSitemapIndexXML() {
const sitemaps = new fdir()
.filter((p) => p.endsWith("/sitemap.xml.gz"))
.withFullPaths()
.crawl(join(BUILD_OUT_ROOT, "sitemaps"))
.sync()
.sort()
.map((fp) => fp.replace(BUILD_OUT_ROOT, ""));

const file = join(BUILD_OUT_ROOT, "sitemap.xml");

await writeFile(file, makeSitemapIndexXML(sitemaps));

return sitemaps.sort().map((fp) => fp.replace(BUILD_OUT_ROOT, ""));
}

function makeSitemapXML(prefix: string, docs: SitemapEntry[]) {
Expand Down Expand Up @@ -62,16 +115,13 @@ export function makeSitemapIndexXML(paths: string[]) {
].join("\n");
}

async function writeSitemap(xml: string, ...paths: string[]) {
const dirPath = join(
BUILD_OUT_ROOT,
"sitemaps",
...paths.map((p) => p.toLowerCase())
);
await mkdir(dirPath, { recursive: true });
/**
* Creates a global text sitemap by merging all text sitemaps.
*/
export async function makeSitemapIndexTXT(paths: string[]) {
const maps = await Promise.all(paths.map((p) => readFile(p, "utf-8")));

const filePath = join(dirPath, "sitemap.xml.gz");
await writeFile(filePath, gzipSync(xml));
const urls = maps.join("\n").split("\n").filter(Boolean);

return filePath;
return urls.sort().join("\n");
}
5 changes: 4 additions & 1 deletion build/spas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,10 @@ export async function buildSPAs(options: {
continue;
}
for (const pathLocale of fs.readdirSync(root)) {
if (!fs.statSync(path.join(root, pathLocale)).isDirectory()) {
if (
!fs.statSync(path.join(root, pathLocale)).isDirectory() ||
!isValidLocale(pathLocale)
) {
continue;
}

Expand Down
1 change: 1 addition & 0 deletions cloud-function/.gcloudignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@
.gitignore

#!include:.gitignore
!canonicals.json
!redirects.json
!src/internal/**
1 change: 1 addition & 0 deletions cloud-function/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ node_modules
.env*
!.env-dist
*.log
canonicals.json
redirects.json
src/**/*.js
src/internal
1 change: 1 addition & 0 deletions cloud-function/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"main": "src/index.js",
"scripts": {
"build": "tsc -b",
"build-canonicals": "cross-env NODE_OPTIONS='--no-warnings=ExperimentalWarning --loader ts-node/esm' node src/build-canonicals.ts",
"build-redirects": "cross-env NODE_OPTIONS='--no-warnings=ExperimentalWarning --loader ts-node/esm' node src/build-redirects.ts",
"copy-internal": "rm -rf ./src/internal && cp -R ../libs ./src/internal",
"gcp-build": "npm run build",
Expand Down
24 changes: 18 additions & 6 deletions cloud-function/src/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { handleStripePlans } from "./handlers/handle-stripe-plans.js";
import { proxyTelemetry } from "./handlers/proxy-telemetry.js";
import { lowercasePathname } from "./middlewares/lowercase-pathname.js";
import { resolveIndexHTML } from "./middlewares/resolve-index-html.js";
import { redirectNonCanonicals } from "./middlewares/redirect-non-canonicals.js";
import { redirectLeadingSlash } from "./middlewares/redirect-leading-slash.js";
import { redirectMovedPages } from "./middlewares/redirect-moved-pages.js";
import { redirectEnforceTrailingSlash } from "./middlewares/redirect-enforce-trailing-slash.js";
Expand All @@ -26,43 +27,59 @@ import { proxyPong } from "./handlers/proxy-pong.js";
const router = Router();
router.use(stripForwardedHostHeaders);
router.use(redirectLeadingSlash);
// MDN Plus plans.
router.all(
"/api/v1/stripe/plans",
requireOrigin(Origin.main),
handleStripePlans
);
// Backend.
router.all(
["/api/*", "/admin-api/*", "/events/fxa", "/users/fxa/*"],
requireOrigin(Origin.main),
proxyApi
);
// Telemetry.
router.all("/submit/mdn-yari/*", requireOrigin(Origin.main), proxyTelemetry);
router.all("/pong/*", requireOrigin(Origin.main), express.json(), proxyPong);
router.all("/pimg/*", requireOrigin(Origin.main), proxyPong);
// Playground.
router.get(
["/[^/]+/docs/*/runner.html", "/[^/]+/blog/*/runner.html", "/runner.html"],
requireOrigin(Origin.play),
resolveRunnerHtml,
proxyRunner
);
// Assets.
router.get(
["/assets/*", "/sitemaps/*", "/static/*", "/[^/]+.[^/]+"],
requireOrigin(Origin.main),
proxyContent
);
router.get(
"/[^/]+/search-index.json",
requireOrigin(Origin.main),
lowercasePathname,
proxyContent
);
// Root.
router.get("/", requireOrigin(Origin.main), redirectLocale);
// Live samples.
router.get(
["/[^/]+/docs/*/_sample_.*.html", "/[^/]+/blog/*/_sample_.*.html"],
requireOrigin(Origin.liveSamples),
resolveIndexHTML,
proxyContent
);
// Attachments.
router.get(
`/[^/]+/docs/*/*.(${ANY_ATTACHMENT_EXT.join("|")})`,
requireOrigin(Origin.main, Origin.liveSamples, Origin.play),
resolveIndexHTML,
proxyContent
);
// Pages.
router.use(redirectNonCanonicals);
router.get(
"/[^/]+/docs/*",
requireOrigin(Origin.main),
Expand All @@ -81,12 +98,7 @@ router.get(
resolveIndexHTML,
proxyContent
);
router.get(
"/[^/]+/search-index.json",
requireOrigin(Origin.main),
lowercasePathname,
proxyContent
);
// MDN Plus, static pages, etc.
router.get(
"*",
requireOrigin(Origin.main),
Expand Down
40 changes: 40 additions & 0 deletions cloud-function/src/build-canonicals.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import { readFile, stat, writeFile } from "node:fs/promises";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";

import * as dotenv from "dotenv";

import { normalizePath } from "./utils.js";

const __dirname = dirname(fileURLToPath(import.meta.url));

const root = join(__dirname, "..", "..");
dotenv.config({
path: join(root, process.env["ENV_FILE"] || ".env"),
});

async function buildCanonicals() {
const { BUILD_OUT_ROOT = join(root, "client", "build") } = process.env;

const sitemapPath = join(BUILD_OUT_ROOT, "sitemap.txt");

const content = await readFile(sitemapPath, "utf-8");
const lines = content.split("\n");
const pages = lines.filter((line) => line.startsWith("/"));

const siteMap: Record<string, string> = {};
for (const page of pages) {
siteMap[normalizePath(page)] = page;
}
console.log(`- ${sitemapPath}: ${pages.length} pages`);

const output = "canonicals.json";

await writeFile(output, JSON.stringify(siteMap));

const count = Object.keys(siteMap).length;
const kb = Math.round((await stat(output)).size / 1024);
console.log(`Wrote ${count} pages in ${kb} KB.`);
}

await buildCanonicals();
46 changes: 46 additions & 0 deletions cloud-function/src/middlewares/redirect-non-canonicals.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import { createRequire } from "node:module";

import { NextFunction, Request, Response } from "express";

import { THIRTY_DAYS } from "../constants.js";
import { normalizePath, redirect } from "../utils.js";

const require = createRequire(import.meta.url);
const REDIRECTS = require("../../canonicals.json");
const REDIRECT_SUFFIXES = ["/index.json", "/bcd.json", ""];

export async function redirectNonCanonicals(
req: Request,
res: Response,
next: NextFunction
) {
const parsedUrl = new URL(req.url, `${req.protocol}://${req.headers.host}/`);
const { pathname } = parsedUrl;

// Redirect to canonical version.
// Example:
// - Source: /en-US/docs/web/guide/ajax/getting_started
// - Target: /en-US/docs/Web/Guide/AJAX/Getting_Started
for (const suffix of REDIRECT_SUFFIXES) {
if (!pathname.endsWith(suffix)) {
continue;
}
const originalSource = pathname.substring(
0,
pathname.length - suffix.length
);
const source = normalizePath(originalSource);
if (
typeof REDIRECTS[source] == "string" &&
REDIRECTS[source] !== originalSource
) {
const target = REDIRECTS[source] + suffix + parsedUrl.search;
return redirect(res, target, {
status: 301,
cacheControlSeconds: THIRTY_DAYS,
});
}
}

next();
}
4 changes: 4 additions & 0 deletions cloud-function/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,7 @@ const ANY_ATTACHMENT_REGEXP = createRegExpFromExtensions(
export function isAsset(url: string) {
return ANY_ATTACHMENT_REGEXP.test(url);
}

export function normalizePath(path: string) {
return path.toLowerCase().replace(/\/$/, "");
}

0 comments on commit 8568a5e

Please sign in to comment.