From e8f9e6c45bd79c4e903cbfc1ccf885e7c6f1ad70 Mon Sep 17 00:00:00 2001
From: Jonathan Goldwasser <jogold@users.noreply.github.com>
Date: Mon, 30 Aug 2021 17:46:22 +0200
Subject: [PATCH] feat(core): normalize line endings in asset hash calculation
 (#16276)

Replace CRLF with LF in non-binary files so asset hashes are identical across platforms.

The hash still includes the size but it is now the size after converting
line endings.

Addresses #14555 (closes it?)

----

*By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license*
---
 packages/@aws-cdk/core/lib/fs/fingerprint.ts  | 52 ++++++++++++++++---
 packages/@aws-cdk/core/test/fs/eol/lf.txt     |  2 +
 .../core/test/fs/fs-fingerprint.test.ts       | 24 +++++++++
 3 files changed, 71 insertions(+), 7 deletions(-)
 create mode 100644 packages/@aws-cdk/core/test/fs/eol/lf.txt

diff --git a/packages/@aws-cdk/core/lib/fs/fingerprint.ts b/packages/@aws-cdk/core/lib/fs/fingerprint.ts
index 50a81fd53982d..7ba7214109b2c 100644
--- a/packages/@aws-cdk/core/lib/fs/fingerprint.ts
+++ b/packages/@aws-cdk/core/lib/fs/fingerprint.ts
@@ -9,14 +9,18 @@ const BUFFER_SIZE = 8 * 1024;
 const CTRL_SOH = '\x01';
 const CTRL_SOT = '\x02';
 const CTRL_ETX = '\x03';
+const CR = '\r';
+const LF = '\n';
+const CRLF = `${CR}${LF}`;
 
 /**
  * Produces fingerprint based on the contents of a single file or an entire directory tree.
  *
+ * Line endings of non-binary files are converted from CRLF to LF before hashing.
+ *
  * The fingerprint will also include:
  * 1. An extra string if defined in `options.extra`.
- * 2. The set of exclude patterns, if defined in `options.exclude`
- * 3. The symlink follow mode value.
+ * 2. The symlink follow mode value.
  *
  * @param fileOrDirectory The directory or file to fingerprint
  * @param options Fingerprinting options
@@ -60,7 +64,7 @@ export function fingerprint(fileOrDirectory: string, options: FingerprintOptions
         _hashField(hash, `link:${relativePath}`, linkTarget);
       }
     } else if (stat.isFile()) {
-      _hashField(hash, `file:${relativePath}`, _contentFingerprint(realPath, stat));
+      _hashField(hash, `file:${relativePath}`, contentFingerprint(realPath));
     } else if (stat.isDirectory()) {
       for (const item of fs.readdirSync(realPath).sort()) {
         _processFileOrDirectory(path.join(symbolicPath, item), false, path.join(realPath, item));
@@ -71,20 +75,83 @@ export function fingerprint(fileOrDirectory: string, options: FingerprintOptions
   }
 }
 
-function _contentFingerprint(file: string, stat: fs.Stats): string {
+/**
+ * Produces a fingerprint of the contents of a single file, formatted as
+ * `<size>:<sha256 hex digest>`.
+ *
+ * A file is considered binary when its first chunk (at most `BUFFER_SIZE`
+ * bytes) contains a null byte; binary files are hashed as is. For any other
+ * file every CRLF sequence is replaced with LF before hashing, so that the
+ * fingerprint is the same regardless of the line endings used on disk. The
+ * size is the number of bytes fed to the hash, i.e. the size after line
+ * endings normalization.
+ *
+ * @param file Path of the file to fingerprint
+ * @returns The fingerprint of the file contents
+ */
+export function contentFingerprint(file: string): string {
   const hash = crypto.createHash('sha256');
   const buffer = Buffer.alloc(BUFFER_SIZE);
+  // Scratch space for normalized output; one extra byte for a CR carried
+  // over from the previous chunk.
+  const normalized = Buffer.alloc(BUFFER_SIZE + 1);
+  const crlfBytes = Buffer.from(CRLF);
+  const crByte = crlfBytes[0];
+  const lfByte = crlfBytes[1];
   // eslint-disable-next-line no-bitwise
   const fd = fs.openSync(file, fs.constants.O_DSYNC | fs.constants.O_RDONLY | fs.constants.O_SYNC);
+  let size = 0;
+  let isBinary = false;
+  let isFirstChunk = true;
+  // Set when the last byte read was a CR that has not been hashed yet: the
+  // LF completing the pair may only show up in the next chunk.
+  let pendingCr = false;
+  let read = 0;
   try {
-    let read = 0;
     while ((read = fs.readSync(fd, buffer, 0, BUFFER_SIZE, null)) !== 0) {
-      hash.update(buffer.slice(0, read));
+      const chunk = buffer.slice(0, read);
+
+      // Detect if file is binary by checking the first chunk for the null
+      // character (git like implementation)
+      if (isFirstChunk) {
+        isBinary = chunk.indexOf(0) !== -1;
+        isFirstChunk = false;
+      }
+
+      if (isBinary) {
+        size += chunk.length;
+        hash.update(chunk);
+        continue;
+      }
+
+      // Line endings normalization (CRLF -> LF) on the raw bytes. Decoding
+      // to a string first would corrupt multi-byte characters split across
+      // two chunks, and CR/LF never occur inside a UTF-8 multi-byte sequence.
+      let length = 0;
+      for (let i = 0; i < chunk.length; i++) {
+        const byte = chunk[i];
+        if (pendingCr && byte !== lfByte) {
+          normalized[length++] = crByte; // previous CR was not part of CRLF
+        }
+        pendingCr = byte === crByte;
+        if (!pendingCr) {
+          normalized[length++] = byte;
+        }
+      }
+
+      size += length;
+      hash.update(normalized.slice(0, length));
+    }
+
+    // A CR at the very end of the file is not followed by LF: keep it.
+    if (pendingCr) {
+      size += 1;
+      hash.update(crlfBytes.slice(0, 1));
     }
   } finally {
     fs.closeSync(fd);
   }
-  return `${stat.size}:${hash.digest('hex')}`;
+  return `${size}:${hash.digest('hex')}`;
 }
 
 function _hashField(hash: crypto.Hash, header: string, value: string | Buffer | DataView) {
diff --git a/packages/@aws-cdk/core/test/fs/eol/lf.txt b/packages/@aws-cdk/core/test/fs/eol/lf.txt
new file mode 100644
index 0000000000000..f41bc690ba927
--- /dev/null
+++ b/packages/@aws-cdk/core/test/fs/eol/lf.txt
@@ -0,0 +1,2 @@
+hello word
+this a new line!
diff --git a/packages/@aws-cdk/core/test/fs/fs-fingerprint.test.ts b/packages/@aws-cdk/core/test/fs/fs-fingerprint.test.ts
index 6a589b3ba159e..be093c32cbffc 100644
--- a/packages/@aws-cdk/core/test/fs/fs-fingerprint.test.ts
+++ b/packages/@aws-cdk/core/test/fs/fs-fingerprint.test.ts
@@ -3,6 +3,7 @@ import * as os from 'os';
 import * as path from 'path';
 import { nodeunitShim, Test } from 'nodeunit-shim';
 import { FileSystem, SymlinkFollowMode } from '../../lib/fs';
+import { contentFingerprint } from '../../lib/fs/fingerprint';
 
 nodeunitShim({
   files: {
@@ -155,4 +156,31 @@ nodeunitShim({
       test.done();
     },
   },
+
+  eol: {
+    'normalizes line endings'(test: Test) {
+      // GIVEN
+      const lf = path.join(__dirname, 'eol', 'lf.txt');
+      const crlf = path.join(__dirname, 'eol', 'crlf.txt');
+      fs.writeFileSync(crlf, fs.readFileSync(lf, 'utf8').replace(/\n/g, '\r\n'));
+
+      try {
+        const lfStat = fs.statSync(lf);
+        const crlfStat = fs.statSync(crlf);
+
+        // WHEN
+        const crlfHash = contentFingerprint(crlf);
+        const lfHash = contentFingerprint(lf);
+
+        // THEN
+        test.notEqual(crlfStat.size, lfStat.size); // Difference in size due to different line endings
+        test.deepEqual(crlfHash, lfHash); // Same hash
+      } finally {
+        // Always remove the generated file, even if anything above throws
+        fs.unlinkSync(crlf);
+      }
+
+      test.done();
+    },
+  },
 });