Skip to content

Commit

Permalink
SRT/VTT timestamp validation to deny single digit hour in hh:mm... (#546
Browse files Browse the repository at this point in the history
)

* SRT/VTT timestamp validation to deny single digit hour in hh:mm...

* From code review: break loop once an invalid timestamp is found
  • Loading branch information
Dananji authored Jul 2, 2024
1 parent 03edd24 commit 7465d96
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 24 deletions.
27 changes: 19 additions & 8 deletions src/components/Transcript/Transcript.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import './Transcript.scss';
const NO_TRANSCRIPTS_MSG = 'No valid Transcript(s) found, please check again.';
const INVALID_URL_MSG = 'Invalid URL for transcript, please check again.';
const INVALID_VTT = 'Invalid WebVTT file, please check again.';
const INVALID_TIMESTAMP = 'Invalid timestamp format in cue(s), please check again.';
const NO_SUPPORT = 'Transcript format is not supported, please check again.';

const buildSpeakerText = (item) => {
Expand Down Expand Up @@ -431,14 +432,24 @@ const Transcript = ({ playerID, manifestUrl, showNotes = false, search = {}, tra
if (value != null) {
const { tData, tUrl, tType, tFileExt } = value;
let newError = '';
if (tType === TRANSCRIPT_TYPES.invalid) {
newError = INVALID_URL_MSG;
} else if (tType === TRANSCRIPT_TYPES.noTranscript) {
newError = NO_TRANSCRIPTS_MSG;
} else if (tType === TRANSCRIPT_TYPES.noSupport) {
newError = NO_SUPPORT;
} else if (tType === TRANSCRIPT_TYPES.invalidTimedText) {
newError = INVALID_VTT;
switch (tType) {
case TRANSCRIPT_TYPES.invalid:
newError = INVALID_URL_MSG;
break;
case TRANSCRIPT_TYPES.noTranscript:
newError = NO_TRANSCRIPTS_MSG;
break;
case TRANSCRIPT_TYPES.noSupport:
newError = NO_SUPPORT;
break;
case TRANSCRIPT_TYPES.invalidVTT:
newError = INVALID_VTT;
break;
case TRANSCRIPT_TYPES.invalidTimestamp:
newError = INVALID_TIMESTAMP;
break;
default:
break;
}
setTranscript(tData);
setTranscriptInfo({ title, filename, id, isMachineGen, tType, tUrl, tFileExt, tError: newError });
Expand Down
53 changes: 51 additions & 2 deletions src/components/Transcript/Transcript.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ describe('Transcript component', () => {
initialManifestState: { manifest: lunchroomManners, canvasIndex: 0 },
initialPlayerState: {},
...props,
showNotes: true,
showNotes: true,
});

render(
Expand Down Expand Up @@ -782,7 +782,7 @@ describe('Transcript component', () => {
.mockReturnValue({
tData: [],
tUrl: 'https://example.com/lunchroom_manners.vtt',
tType: transcriptParser.TRANSCRIPT_TYPES.invalidTimedText,
tType: transcriptParser.TRANSCRIPT_TYPES.invalidVTT,
});

const TranscriptWithState = withManifestAndPlayerProvider(Transcript, {
Expand All @@ -809,6 +809,55 @@ describe('Transcript component', () => {
);
});
});

test('WebVTT file with invalid timestamps', async () => {
const props = {
playerID: 'player-id',
transcripts: [
{
canvasId: 0,
items: [
{
title: 'WebVTT Transcript',
url: 'https://example.com/lunchroom_manners.vtt',
},
],
},
],
};

const parseTranscriptMock = jest
.spyOn(transcriptParser, 'parseTranscriptData')
.mockReturnValue({
tData: [],
tUrl: 'https://example.com/lunchroom_manners.vtt',
tType: transcriptParser.TRANSCRIPT_TYPES.invalidTimestamp,
});

const TranscriptWithState = withManifestAndPlayerProvider(Transcript, {
initialManifestState: { manifest: lunchroomManners, canvasIndex: 0 },
initialPlayerState: {},
...props
});

render(
<React.Fragment>
<video id="player-id" />
<TranscriptWithState />
</React.Fragment>
);

await act(() => Promise.resolve());

await waitFor(() => {
expect(parseTranscriptMock).toHaveBeenCalled();
expect(screen.queryByTestId('transcript_content_-4')).toBeInTheDocument();
expect(screen.queryByTestId('no-transcript')).toBeInTheDocument();
expect(screen.getByTestId('no-transcript')).toHaveTextContent(
'Invalid timestamp format in cue(s), please check again.'
);
});
});
});

describe('with props', () => {
Expand Down
31 changes: 22 additions & 9 deletions src/services/transcript-parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ const TRANSCRIPT_MIME_EXTENSIONS = [

// ENum for describing transcript types include invalid and no transcript info
export const TRANSCRIPT_TYPES = {
invalidTimedText: -3,
invalidTimestamp: -4,
invalidVTT: -3,
noSupport: -2,
invalid: -1,
noTranscript: 0,
Expand Down Expand Up @@ -557,23 +558,35 @@ export function parseTimedText(fileData, isSRT = false) {
const { valid, cue_lines, notes } = validateWebVTT(lines);
if (!valid) {
console.error('Invalid WebVTT file');
return { tData: [], tType: TRANSCRIPT_TYPES.invalidTimedText };
return { tData: [], tType: TRANSCRIPT_TYPES.invalidVTT };
}
cueLines = cue_lines;
noteLines = notes;
}

const groups = groupTimedTextLines(cueLines);

// Add back the NOTE(s) in the header block
groups.unshift(...noteLines);
groups.map((t) => {
let line = parseTimedTextLine(t, isSRT);
if (line) {

let hasInvalidTimestamp = false;
for (let i = 0; i < groups.length;) {
let line = parseTimedTextLine(groups[i], isSRT);
if (!line) {
hasInvalidTimestamp ||= true;
break;
} else {
tData.push(line);
i++;
}
});
}

return { tData, tType: TRANSCRIPT_TYPES.timedText };
return {
tData: hasInvalidTimestamp ? null : tData,
tType: hasInvalidTimestamp
? TRANSCRIPT_TYPES.invalidTimestamp
: TRANSCRIPT_TYPES.timedText
};
}

/**
Expand Down Expand Up @@ -719,9 +732,9 @@ function parseTimedTextLine({ times, line, tag }, isSRT) {
let timestampRegex;
if (isSRT) {
// SRT allows using comma for milliseconds while WebVTT does not
timestampRegex = /([0-9]*:){1,2}([0-9]{2})(\.|\,)[0-9]{2,3}/g;
timestampRegex = /^(?:\d{2}:)?\d{2}:\d{2}(?:[.,]\d+)/g;
} else {
timestampRegex = /([0-9]*:){1,2}([0-9]{2})\.[0-9]{2,3}/g;
timestampRegex = /^(?:\d{2}:)?\d{2}:\d{2}(?:\.\d+)/;
}

switch (tag) {
Expand Down
27 changes: 22 additions & 5 deletions src/services/transcript-parser.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,7 @@ describe('transcript-parser', () => {
expect(tData).toHaveLength(0);
expect(console.error).toHaveBeenCalledTimes(1);
expect(console.error).toHaveBeenCalledWith('Invalid WebVTT file');
expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimedText);
expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidVTT);
});

test('with random text in the header', () => {
Expand All @@ -866,24 +866,41 @@ describe('transcript-parser', () => {
expect(tData).toHaveLength(0);
expect(console.error).toHaveBeenCalledTimes(1);
expect(console.error).toHaveBeenCalledWith('Invalid WebVTT file');
expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimedText);
expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidVTT);
});

test('with incorrect timestamp', () => {
test('with incorrect timestamp with missing seconds group in hh:mm:ss format', () => {
// mock console.error
console.error = jest.fn();
const mockResponse =
'WEBVTT\r\n\r\n1\r\n00:00:01.200 --> 00:00:.000\n[music]\n\r\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n';

const { tData, tType } = transcriptParser.parseTimedText(mockResponse);

expect(tData).toHaveLength(4);
expect(tData).toBeNull();
expect(console.error).toHaveBeenCalledTimes(1);
expect(console.error).toHaveBeenCalledWith(
'Invalid timestamp in line with text; ',
'[music]'
);
expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.timedText);
expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimestamp);
});

test('with incorrect timestamp with a single digit for hour in hh:mm:ss format', () => {
// mock console.error
console.error = jest.fn();
const mockResponse =
'WEBVTT\r\n\r\n1\r\n0:00:01.200 --> 0:00:00.000\n[music]\n\r\n2\r\n0:00:22.200 --> 0:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n0:00:26.700 --> 0:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n0:00:31.600 --> 0:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n0:00:36.100 --> 0:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n';

const { tData, tType } = transcriptParser.parseTimedText(mockResponse);

expect(tData).toBeNull();
expect(console.error).toHaveBeenCalledTimes(1);
expect(console.error).toHaveBeenCalledWith(
'Invalid timestamp in line with text; ',
'[music]'
);
expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimestamp);
});
});
});
Expand Down

0 comments on commit 7465d96

Please sign in to comment.