diff --git a/packages/api/src/export.ts b/packages/api/src/export.ts index e9c0879c..bbdf6b12 100644 --- a/packages/api/src/export.ts +++ b/packages/api/src/export.ts @@ -3,13 +3,13 @@ import { isS3Failure, logger, TranscriptionConfig, - TranscriptionDynamoItem, } from '@guardian/transcription-service-backend-common'; import { ExportItems, ExportStatus, ExportStatuses, ExportType, + TranscriptionDynamoItem, } from '@guardian/transcription-service-common'; import { uploadToGoogleDocs } from './services/googleDrive'; import { S3Client } from '@aws-sdk/client-s3'; diff --git a/packages/backend-common/src/sqs.ts b/packages/backend-common/src/sqs.ts index 15277279..f1b52f9e 100644 --- a/packages/backend-common/src/sqs.ts +++ b/packages/backend-common/src/sqs.ts @@ -10,9 +10,9 @@ import { OutputBucketUrls, DestinationService, TranscriptionJob, - LanguageCode, TranscriptionOutput, TranscriptionEngine, + InputLanguageCode, } from '@guardian/transcription-service-common'; import { getSignedUploadUrl, @@ -67,7 +67,7 @@ export const generateOutputSignedUrlAndSendMessage = async ( userEmail: string, originalFilename: string, inputSignedUrl: string, - languageCode: LanguageCode, + languageCode: InputLanguageCode, translationRequested: boolean, diarizationRequested: boolean, ): Promise => { diff --git a/packages/common/src/languages.ts b/packages/common/src/languages.ts index a3991005..35b5d409 100644 --- a/packages/common/src/languages.ts +++ b/packages/common/src/languages.ts @@ -1,10 +1,9 @@ -import { z } from 'zod'; +import { getKeys } from './types'; // languages supported by whisper.cpp // copied from // https://github.com/ggerganov/whisper.cpp/blob/25d313b38b1f562200f915cd5952555613cd0110/whisper.cpp#L251 // and values transformed to title case -export const languageCodeToLanguage = Object.freeze({ - auto: 'Auto-detect language (not yet recommended)', +const languageCodeToLanguage = { en: 'English', zh: 'Chinese', de: 'German', @@ -105,16 +104,18 @@ export const languageCodeToLanguage = Object.freeze({ jw: 'Javanese', su: 'Sundanese', yue: 'Cantonese', -}); +}; -type LanguageCodeToLanguage = typeof languageCodeToLanguage; +export const languageCodeToLanguageWithAuto = Object.freeze({ + auto: 'Auto-detect language (not yet recommended)', + ...languageCodeToLanguage, +}); -// There doesn't seem to be a way to get the object keys that has a type which -// is an array of a union of string literals rather than an array of strings. -// https://www.charpeni.com/blog/properly-type-object-keys-and-object-entries -export const languageCodes = Object.keys(languageCodeToLanguage) as [ - keyof LanguageCodeToLanguage, -]; +export const languageCodeToLanguageWithUnknown = Object.freeze({ + UNKNOWN: 'Detected language not recognised', + ...languageCodeToLanguage, +}); -export const LanguageCode = z.enum(languageCodes); -export type LanguageCode = z.infer; +export const languageCodes = getKeys(languageCodeToLanguage); +export const inputLanguageCodes = getKeys(languageCodeToLanguageWithAuto); +export const outputLanguageCodes = getKeys(languageCodeToLanguageWithUnknown); diff --git a/packages/common/src/types.ts b/packages/common/src/types.ts index c34799b7..6301a751 100644 --- a/packages/common/src/types.ts +++ b/packages/common/src/types.ts @@ -1,5 +1,5 @@ import { z } from 'zod'; -import { LanguageCode, languageCodeToLanguage } from './languages'; +import { inputLanguageCodes, outputLanguageCodes } from './languages'; // thanks https://github.com/colinhacks/zod/discussions/2125#discussioncomment-7452235 // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -7,7 +7,15 @@ export function getKeys>(obj: T) { return Object.keys(obj) as [keyof typeof obj]; } -const zodLanguageCode = z.enum(getKeys(languageCodeToLanguage)); +export const InputLanguageCode = z.enum(inputLanguageCodes); +export type InputLanguageCode = z.infer; + +export const OutputLanguageCode = z.enum(outputLanguageCodes); +export type OutputLanguageCode = z.infer; + +export const inputToOutputLanguageCode = ( + c: InputLanguageCode, +): OutputLanguageCode => (c === 'auto' ? 'UNKNOWN' : c); export enum DestinationService { TranscriptionService = 'TranscriptionService', @@ -41,7 +49,7 @@ export const MediaDownloadJob = z.object({ id: z.string(), url: z.string(), userEmail: z.string(), - languageCode: zodLanguageCode, + languageCode: InputLanguageCode, translationRequested: z.boolean(), diarizationRequested: z.boolean(), }); @@ -61,7 +69,7 @@ export const TranscriptionJob = z.object({ userEmail: z.string(), transcriptDestinationService: z.nativeEnum(DestinationService), outputBucketUrls: OutputBucketUrls, - languageCode: zodLanguageCode, + languageCode: InputLanguageCode, translate: z.boolean(), diarize: z.boolean(), engine: z.nativeEnum(TranscriptionEngine), @@ -80,7 +88,7 @@ const TranscriptionOutputBase = z.object({ export const TranscriptionOutputSuccess = TranscriptionOutputBase.extend({ status: z.literal('SUCCESS'), - languageCode: z.string(), + languageCode: OutputLanguageCode, outputBucketKeys: OutputBucketKeys, // we can get rid of this when we switch to using a zip translationOutputBucketKeys: z.optional(OutputBucketKeys), @@ -215,7 +223,7 @@ export type CreateFolderRequest = z.infer; export const transcribeUrlRequestBody = z.object({ url: z.string(), - languageCode: zodLanguageCode, + languageCode: InputLanguageCode, translationRequested: z.boolean(), diarizationRequested: z.boolean(), }); @@ -225,7 +233,7 @@ export type TranscribeUrlRequestBody = z.infer; export const transcribeFileRequestBody = z.object({ s3Key: z.string(), fileName: z.string(), - languageCode: zodLanguageCode, + languageCode: InputLanguageCode, translationRequested: z.boolean(), diarizationRequested: z.boolean(), }); @@ -263,7 +271,7 @@ export const TranscriptionDynamoItem = z.object({ userEmail: z.string(), completedAt: z.optional(z.string()), // dynamodb can't handle dates so we need to use an ISO date isTranslation: z.boolean(), - languageCode: z.optional(LanguageCode), + languageCode: z.optional(OutputLanguageCode), exportStatuses: z.optional(ExportStatuses), }); diff --git a/packages/output-handler/src/index.ts b/packages/output-handler/src/index.ts index 9242e2cf..ed84cee7 100644 --- a/packages/output-handler/src/index.ts +++ b/packages/output-handler/src/index.ts @@ -18,7 +18,6 @@ import { TranscriptionOutputFailure, transcriptionOutputIsTranscriptionFailure, TranscriptionDynamoItem, - LanguageCode, } from '@guardian/transcription-service-common'; import { MetricsService, @@ -64,14 +63,6 @@ const handleTranscriptionSuccess = async ( metrics: MetricsService, sourceMediaDownloadUrl: string, ) => { - const languageCode = LanguageCode.safeParse(transcriptionOutput.languageCode); - if (!languageCode.success) { - logger.error('Failed to parse language code', { - languageCode: transcriptionOutput.languageCode, - }); - await metrics.putMetric(FailureMetric); - return; - } const dynamoItem: TranscriptionDynamoItem = { id: transcriptionOutput.id, originalFilename: transcriptionOutput.originalFilename, @@ -83,7 +74,7 @@ const handleTranscriptionSuccess = async ( userEmail: transcriptionOutput.userEmail, completedAt: new Date().toISOString(), isTranslation: transcriptionOutput.isTranslation, - languageCode: languageCode.data, + languageCode: transcriptionOutput.languageCode, }; try { diff --git a/packages/worker/src/index.ts b/packages/worker/src/index.ts index 98eb32b8..4075ad7b 100644 --- a/packages/worker/src/index.ts +++ b/packages/worker/src/index.ts @@ -17,6 +17,7 @@ import { import { DestinationService, OutputBucketKeys, + OutputLanguageCode, TranscriptionJob, TranscriptionOutputFailure, type TranscriptionOutputSuccess, @@ -348,6 +349,11 @@ const pollTranscriptionQueue = async ( job.engine === 'whisperx', ); + const languageCode: OutputLanguageCode = + job.languageCode === 'auto' + ? transcriptResult.metadata.detectedLanguageCode + : job.languageCode; + // if we've received an interrupt signal we don't want to perform a half-finished transcript upload/publish as // this may, for example, result in duplicate emails to the user. Here we assume that we can upload some text // files to s3 and make a single request to SNS and SQS within 20 seconds @@ -385,10 +391,7 @@ const pollTranscriptionQueue = async ( const transcriptionOutput: TranscriptionOutputSuccess = { id: job.id, status: 'SUCCESS', - languageCode: - job.languageCode === 'auto' - ? transcriptResult.metadata.detectedLanguageCode || 'UNKNOWN' - : job.languageCode, + languageCode, userEmail: job.userEmail, originalFilename: job.originalFilename, outputBucketKeys, diff --git a/packages/worker/src/transcribe.ts b/packages/worker/src/transcribe.ts index 538c2ddf..47a39f03 100644 --- a/packages/worker/src/transcribe.ts +++ b/packages/worker/src/transcribe.ts @@ -2,8 +2,10 @@ import path from 'path'; import { readFile } from '@guardian/transcription-service-backend-common'; import { logger } from '@guardian/transcription-service-backend-common'; import { - LanguageCode, + InputLanguageCode, + inputToOutputLanguageCode, languageCodes, + OutputLanguageCode, TranscriptionEngine, } from '@guardian/transcription-service-common'; import { runSpawnCommand } from '@guardian/transcription-service-backend-common/src/process'; @@ -21,7 +23,7 @@ export interface Transcripts { } type TranscriptionMetadata = { - detectedLanguageCode?: string; + detectedLanguageCode: OutputLanguageCode; loadTimeMs?: number; totalTimeMs?: number; }; @@ -133,7 +135,7 @@ const getDuration = (ffmpegOutput: string) => { const runTranscription = async ( whisperBaseParams: WhisperBaseParams, - languageCode: LanguageCode, + languageCode: InputLanguageCode, translate: boolean, whisperX: boolean, ) => { @@ -181,7 +183,7 @@ const runTranscription = async ( const getLanguageCode = async ( whisperBaseParams: WhisperBaseParams, whisperX: boolean, -): Promise => { +): Promise => { // whisperx is so slow to start up let's not even bother pre-detecting the language and just let it run detection // for both transcription and translation if (whisperX) { @@ -214,7 +216,8 @@ const transcribeAndTranslate = async ( // we only run language detection once, // so need to override the detected language of future whisper runs - transcription.metadata.detectedLanguageCode = languageCode; + transcription.metadata.detectedLanguageCode = + inputToOutputLanguageCode(languageCode); const translation = languageCode === 'en' ? null @@ -241,7 +244,7 @@ const transcribeAndTranslate = async ( export const getTranscriptionText = async ( whisperBaseParams: WhisperBaseParams, - languageCode: LanguageCode, + languageCode: InputLanguageCode, translate: boolean, combineTranscribeAndTranslate: boolean, whisperX: boolean, @@ -257,11 +260,16 @@ const regexExtract = (text: string, regex: RegExp): string | undefined => { return regexResult ? regexResult[1] : undefined; }; +const parseLanguageCodeString = (languageCode?: string): OutputLanguageCode => + languageCodes.find((c) => c === languageCode) || 'UNKNOWN'; + const extractWhisperXStderrData = (stderr: string): TranscriptionMetadata => { //Detected language: en (0.99) in first 30s of audio... const languageRegex = /Detected language: ([a-zA-Z]{2})/; const detectedLanguageCode = regexExtract(stderr, languageRegex); - return { detectedLanguageCode }; + return { + detectedLanguageCode: parseLanguageCodeString(detectedLanguageCode), + }; }; const extractWhisperStderrData = (stderr: string): TranscriptionMetadata => { @@ -275,7 +283,7 @@ const extractWhisperStderrData = (stderr: string): TranscriptionMetadata => { const loadTime = regexExtract(stderr, loadTimeRegex); return { - detectedLanguageCode: detectedLanguageCode, + detectedLanguageCode: parseLanguageCodeString(detectedLanguageCode), loadTimeMs: loadTime ? parseInt(loadTime) : undefined, totalTimeMs: totalTime ? parseInt(totalTime) : undefined, }; @@ -284,7 +292,7 @@ const extractWhisperStderrData = (stderr: string): TranscriptionMetadata => { const whisperParams = ( detectLanguageOnly: boolean, file: string, - languageCode: LanguageCode = 'auto', + languageCode: InputLanguageCode = 'auto', translate: boolean = false, ) => { if (detectLanguageOnly) { @@ -308,7 +316,7 @@ const whisperParams = ( export const runWhisperX = async ( whisperBaseParams: WhisperBaseParams, - languageCode: LanguageCode, + languageCode: InputLanguageCode, translate: boolean, ) => { const { wavPath, diarize, stage } = whisperBaseParams;