From 299d1fad13fc8fb925ea78dbab5be39e19b0443f Mon Sep 17 00:00:00 2001
From: Oliver Byford
Date: Fri, 21 May 2021 11:33:34 +0100
Subject: [PATCH] =?UTF-8?q?Update=20robots.txt=20to=20ensure=20review=20ap?=
 =?UTF-8?q?p=20isn=E2=80=99t=20indexed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The review app does two things to prevent pages from being indexed – it
adds an `X-Robots-Tag` header to responses, and it also serves a
`/robots.txt` file which disallows all robots from crawling the site.

However, the `robots.txt` disallow statement actually prevents robots
from ever seeing the X-Robots-Tag, which means that although pages from
the review app can't be crawled, they can still appear in search
indexes.

From Google's own documentation [1] [2]:

> Important: For the noindex directive to be effective, the page must
> not be blocked by a robots.txt file. If the page is blocked by a
> robots.txt file, the crawler will never see the noindex directive,
> and the page can still appear in search results, for example if other
> pages link to it.

> While Google won't crawl or index the content blocked by robots.txt,
> we might still find and index a disallowed URL if it is linked from
> other places on the web. As a result, the URL address and,
> potentially, other publicly available information such as anchor text
> in links to the page can still appear in Google search results. To
> properly prevent your URL from appearing in Google Search results,
> you should password-protect the files on your server or use the
> noindex meta tag or response header (or remove the page entirely).

Update the robots.txt to allow crawling by all user agents, move the
route so that it’s closer to the code that sets the `X-Robots-Tag`
header and add a test to check that the robots.txt file matches the
expected contents.

[1]: https://developers.google.com/search/docs/advanced/crawling/block-indexing
[2]: https://developers.google.com/search/docs/advanced/robots/intro#understand-the-limitations-of-robots.txt
---
 app/app.js      | 15 ++++++++++-----
 app/app.test.js |  9 +++++++++
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/app/app.js b/app/app.js
index 352fcdb359..90a1efea7f 100644
--- a/app/app.js
+++ b/app/app.js
@@ -54,6 +54,16 @@ module.exports = (options) => {
     next()
   })
 
+  // Ensure robots are still able to crawl the pages.
+  //
+  // This might seem like a mistake, but it's not. If a page is blocked by
+  // robots.txt, the crawler will never see the noindex directive, and so the
+  // page can still appear in search results.
+  app.get('/robots.txt', function (req, res) {
+    res.type('text/plain')
+    res.send('User-agent: *\nAllow: /')
+  })
+
   // Set up middleware to serve static assets
   app.use('/public', express.static(configPaths.public))
 
@@ -195,10 +205,5 @@ module.exports = (options) => {
   // Full page example views
   require('./full-page-examples.js')(app)
 
-  app.get('/robots.txt', function (req, res) {
-    res.type('text/plain')
-    res.send('User-agent: *\nDisallow: /')
-  })
-
   return app
 }
diff --git a/app/app.test.js b/app/app.test.js
index ba12b47c8b..02d77cf13c 100644
--- a/app/app.test.js
+++ b/app/app.test.js
@@ -66,6 +66,15 @@ describe(`http://localhost:${PORT}`, () => {
     })
   })
 
+  describe('/robots.txt', () => {
+    it('should allow crawling by robots', done => {
+      requestPath('/robots.txt', (err, res) => {
+        expect(res.body).toMatch(/^Allow: \/$/m)
+        done(err)
+      })
+    })
+  })
+
   describe('/examples/template-custom', () => {
     const templatePath = '/examples/template-custom'