Skip to content

Commit

Permalink
Refactor language types again - introduce input/output types
Browse files Browse the repository at this point in the history
  • Loading branch information
philmcmahon committed Feb 3, 2025
1 parent dabfc8f commit 888fbad
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 48 deletions.
2 changes: 1 addition & 1 deletion packages/api/src/export.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ import {
isS3Failure,
logger,
TranscriptionConfig,
TranscriptionDynamoItem,
} from '@guardian/transcription-service-backend-common';
import {
ExportItems,
ExportStatus,
ExportStatuses,
ExportType,
TranscriptionDynamoItem,
} from '@guardian/transcription-service-common';
import { uploadToGoogleDocs } from './services/googleDrive';
import { S3Client } from '@aws-sdk/client-s3';
Expand Down
4 changes: 2 additions & 2 deletions packages/backend-common/src/sqs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ import {
OutputBucketUrls,
DestinationService,
TranscriptionJob,
LanguageCode,
TranscriptionOutput,
TranscriptionEngine,
InputLanguageCode,
} from '@guardian/transcription-service-common';
import {
getSignedUploadUrl,
Expand Down Expand Up @@ -67,7 +67,7 @@ export const generateOutputSignedUrlAndSendMessage = async (
userEmail: string,
originalFilename: string,
inputSignedUrl: string,
languageCode: LanguageCode,
languageCode: InputLanguageCode,
translationRequested: boolean,
diarizationRequested: boolean,
): Promise<SendResult> => {
Expand Down
27 changes: 14 additions & 13 deletions packages/common/src/languages.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import { z } from 'zod';
import { getKeys } from './types';
// languages supported by whisper.cpp
// copied from
// https://github.com/ggerganov/whisper.cpp/blob/25d313b38b1f562200f915cd5952555613cd0110/whisper.cpp#L251
// and values transformed to title case
export const languageCodeToLanguage = Object.freeze({
auto: 'Auto-detect language (not yet recommended)',
const languageCodeToLanguage = {
en: 'English',
zh: 'Chinese',
de: 'German',
Expand Down Expand Up @@ -105,16 +104,18 @@ export const languageCodeToLanguage = Object.freeze({
jw: 'Javanese',
su: 'Sundanese',
yue: 'Cantonese',
});
};

type LanguageCodeToLanguage = typeof languageCodeToLanguage;
// Lookup of code -> display name made of an `auto` entry followed by every
// entry of languageCodeToLanguage. Frozen so the exported object cannot be
// modified after creation.
export const languageCodeToLanguageWithAuto = Object.freeze({
auto: 'Auto-detect language (not yet recommended)',
...languageCodeToLanguage,
});

// There doesn't seem to be a way to get the object keys that has a type which
// is an array of a union of string literals rather than an array of strings.
// https://www.charpeni.com/blog/properly-type-object-keys-and-object-entries
export const languageCodes = Object.keys(languageCodeToLanguage) as [
keyof LanguageCodeToLanguage,
];
// Lookup of code -> display name made of an `UNKNOWN` entry followed by every
// entry of languageCodeToLanguage. Frozen so the exported object cannot be
// modified after creation.
export const languageCodeToLanguageWithUnknown = Object.freeze({
UNKNOWN: 'Detected language not recognised',
...languageCodeToLanguage,
});

export const LanguageCode = z.enum(languageCodes);
export type LanguageCode = z.infer<typeof LanguageCode>;
// Key lists derived from the lookup objects in this file via getKeys.
// Keys of languageCodeToLanguage.
export const languageCodes = getKeys(languageCodeToLanguage);
// Keys of languageCodeToLanguageWithAuto, i.e. the list that includes `auto`.
export const inputLanguageCodes = getKeys(languageCodeToLanguageWithAuto);
// Keys of languageCodeToLanguageWithUnknown, i.e. the list that includes `UNKNOWN`.
export const outputLanguageCodes = getKeys(languageCodeToLanguageWithUnknown);
24 changes: 16 additions & 8 deletions packages/common/src/types.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
import { z } from 'zod';
import { LanguageCode, languageCodeToLanguage } from './languages';
import { inputLanguageCodes, outputLanguageCodes } from './languages';

/**
 * Returns the own enumerable string keys of `obj`, typed as a tuple of the
 * object's key union instead of `string[]` so the result can be handed to
 * `z.enum`.
 *
 * Note that the static type is a one-element tuple; the runtime array holds
 * one entry per key of `obj`.
 *
 * @param obj - object whose keys are listed
 * @returns the result of `Object.keys(obj)`, asserted to `[keyof T]`
 */
// thanks https://github.com/colinhacks/zod/discussions/2125#discussioncomment-7452235
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function getKeys<T extends Record<string, any>>(obj: T) {
  const keys = Object.keys(obj);
  return keys as [keyof T];
}

const zodLanguageCode = z.enum(getKeys(languageCodeToLanguage));
// Zod enum built from inputLanguageCodes. Used in this file for the
// languageCode field of job and request-body schemas.
export const InputLanguageCode = z.enum(inputLanguageCodes);
// Static type inferred from the schema; it shares the schema's name.
export type InputLanguageCode = z.infer<typeof InputLanguageCode>;

// Zod enum built from outputLanguageCodes. Used in this file for the
// languageCode field of TranscriptionOutputSuccess and TranscriptionDynamoItem.
export const OutputLanguageCode = z.enum(outputLanguageCodes);
// Static type inferred from the schema; it shares the schema's name.
export type OutputLanguageCode = z.infer<typeof OutputLanguageCode>;

/**
 * Converts an input language code into an output language code.
 *
 * @param c - input language code
 * @returns `'UNKNOWN'` when `c` is `'auto'`, otherwise `c` unchanged
 */
export const inputToOutputLanguageCode = (
  c: InputLanguageCode,
): OutputLanguageCode => {
  if (c === 'auto') {
    return 'UNKNOWN';
  }
  return c;
};

export enum DestinationService {
TranscriptionService = 'TranscriptionService',
Expand Down Expand Up @@ -41,7 +49,7 @@ export const MediaDownloadJob = z.object({
id: z.string(),
url: z.string(),
userEmail: z.string(),
languageCode: zodLanguageCode,
languageCode: InputLanguageCode,
translationRequested: z.boolean(),
diarizationRequested: z.boolean(),
});
Expand All @@ -61,7 +69,7 @@ export const TranscriptionJob = z.object({
userEmail: z.string(),
transcriptDestinationService: z.nativeEnum(DestinationService),
outputBucketUrls: OutputBucketUrls,
languageCode: zodLanguageCode,
languageCode: InputLanguageCode,
translate: z.boolean(),
diarize: z.boolean(),
engine: z.nativeEnum(TranscriptionEngine),
Expand All @@ -80,7 +88,7 @@ const TranscriptionOutputBase = z.object({

export const TranscriptionOutputSuccess = TranscriptionOutputBase.extend({
status: z.literal('SUCCESS'),
languageCode: z.string(),
languageCode: OutputLanguageCode,
outputBucketKeys: OutputBucketKeys,
// we can get rid of this when we switch to using a zip
translationOutputBucketKeys: z.optional(OutputBucketKeys),
Expand Down Expand Up @@ -215,7 +223,7 @@ export type CreateFolderRequest = z.infer<typeof CreateFolderRequest>;

export const transcribeUrlRequestBody = z.object({
url: z.string(),
languageCode: zodLanguageCode,
languageCode: InputLanguageCode,
translationRequested: z.boolean(),
diarizationRequested: z.boolean(),
});
Expand All @@ -225,7 +233,7 @@ export type TranscribeUrlRequestBody = z.infer<typeof transcribeUrlRequestBody>;
export const transcribeFileRequestBody = z.object({
s3Key: z.string(),
fileName: z.string(),
languageCode: zodLanguageCode,
languageCode: InputLanguageCode,
translationRequested: z.boolean(),
diarizationRequested: z.boolean(),
});
Expand Down Expand Up @@ -263,7 +271,7 @@ export const TranscriptionDynamoItem = z.object({
userEmail: z.string(),
completedAt: z.optional(z.string()), // dynamodb can't handle dates so we need to use an ISO date
isTranslation: z.boolean(),
languageCode: z.optional(LanguageCode),
languageCode: z.optional(OutputLanguageCode),
exportStatuses: z.optional(ExportStatuses),
});

Expand Down
11 changes: 1 addition & 10 deletions packages/output-handler/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import {
TranscriptionOutputFailure,
transcriptionOutputIsTranscriptionFailure,
TranscriptionDynamoItem,
LanguageCode,
} from '@guardian/transcription-service-common';
import {
MetricsService,
Expand Down Expand Up @@ -64,14 +63,6 @@ const handleTranscriptionSuccess = async (
metrics: MetricsService,
sourceMediaDownloadUrl: string,
) => {
const languageCode = LanguageCode.safeParse(transcriptionOutput.languageCode);
if (!languageCode.success) {
logger.error('Failed to parse language code', {
languageCode: transcriptionOutput.languageCode,
});
await metrics.putMetric(FailureMetric);
return;
}
const dynamoItem: TranscriptionDynamoItem = {
id: transcriptionOutput.id,
originalFilename: transcriptionOutput.originalFilename,
Expand All @@ -83,7 +74,7 @@ const handleTranscriptionSuccess = async (
userEmail: transcriptionOutput.userEmail,
completedAt: new Date().toISOString(),
isTranslation: transcriptionOutput.isTranslation,
languageCode: languageCode.data,
languageCode: transcriptionOutput.languageCode,
};

try {
Expand Down
11 changes: 7 additions & 4 deletions packages/worker/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
import {
DestinationService,
OutputBucketKeys,
OutputLanguageCode,
TranscriptionJob,
TranscriptionOutputFailure,
type TranscriptionOutputSuccess,
Expand Down Expand Up @@ -348,6 +349,11 @@ const pollTranscriptionQueue = async (
job.engine === 'whisperx',
);

const languageCode: OutputLanguageCode =
job.languageCode === 'auto'
? transcriptResult.metadata.detectedLanguageCode
: job.languageCode;

// if we've received an interrupt signal we don't want to perform a half-finished transcript upload/publish as
// this may, for example, result in duplicate emails to the user. Here we assume that we can upload some text
// files to s3 and make a single request to SNS and SQS within 20 seconds
Expand Down Expand Up @@ -385,10 +391,7 @@ const pollTranscriptionQueue = async (
const transcriptionOutput: TranscriptionOutputSuccess = {
id: job.id,
status: 'SUCCESS',
languageCode:
job.languageCode === 'auto'
? transcriptResult.metadata.detectedLanguageCode || 'UNKNOWN'
: job.languageCode,
languageCode,
userEmail: job.userEmail,
originalFilename: job.originalFilename,
outputBucketKeys,
Expand Down
28 changes: 18 additions & 10 deletions packages/worker/src/transcribe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ import path from 'path';
import { readFile } from '@guardian/transcription-service-backend-common';
import { logger } from '@guardian/transcription-service-backend-common';
import {
LanguageCode,
InputLanguageCode,
inputToOutputLanguageCode,
languageCodes,
OutputLanguageCode,
TranscriptionEngine,
} from '@guardian/transcription-service-common';
import { runSpawnCommand } from '@guardian/transcription-service-backend-common/src/process';
Expand All @@ -21,7 +23,7 @@ export interface Transcripts {
}

type TranscriptionMetadata = {
detectedLanguageCode?: string;
detectedLanguageCode: OutputLanguageCode;
loadTimeMs?: number;
totalTimeMs?: number;
};
Expand Down Expand Up @@ -133,7 +135,7 @@ const getDuration = (ffmpegOutput: string) => {

const runTranscription = async (
whisperBaseParams: WhisperBaseParams,
languageCode: LanguageCode,
languageCode: InputLanguageCode,
translate: boolean,
whisperX: boolean,
) => {
Expand Down Expand Up @@ -181,7 +183,7 @@ const runTranscription = async (
const getLanguageCode = async (
whisperBaseParams: WhisperBaseParams,
whisperX: boolean,
): Promise<LanguageCode> => {
): Promise<InputLanguageCode> => {
// whisperx is so slow to start up let's not even bother pre-detecting the language and just let it run detection
// for both transcription and translation
if (whisperX) {
Expand Down Expand Up @@ -214,7 +216,8 @@ const transcribeAndTranslate = async (

// we only run language detection once,
// so need to override the detected language of future whisper runs
transcription.metadata.detectedLanguageCode = languageCode;
transcription.metadata.detectedLanguageCode =
inputToOutputLanguageCode(languageCode);
const translation =
languageCode === 'en'
? null
Expand All @@ -241,7 +244,7 @@ const transcribeAndTranslate = async (

export const getTranscriptionText = async (
whisperBaseParams: WhisperBaseParams,
languageCode: LanguageCode,
languageCode: InputLanguageCode,
translate: boolean,
combineTranscribeAndTranslate: boolean,
whisperX: boolean,
Expand All @@ -257,11 +260,16 @@ const regexExtract = (text: string, regex: RegExp): string | undefined => {
return regexResult ? regexResult[1] : undefined;
};

/**
 * Maps a raw string onto an OutputLanguageCode.
 *
 * @param languageCode - candidate code, possibly undefined
 * @returns the entry of `languageCodes` strictly equal to `languageCode`, or
 *   `'UNKNOWN'` when no entry matches
 */
const parseLanguageCodeString = (
  languageCode?: string,
): OutputLanguageCode => {
  const known = languageCodes.find((candidate) => candidate === languageCode);
  return known || 'UNKNOWN';
};

/**
 * Extracts the detected language from WhisperX stderr output.
 *
 * @param stderr - stderr text to search
 * @returns metadata whose `detectedLanguageCode` is the captured code passed
 *   through `parseLanguageCodeString`; that helper yields `'UNKNOWN'` when
 *   nothing was captured or the code is not in `languageCodes`
 */
const extractWhisperXStderrData = (stderr: string): TranscriptionMetadata => {
  //Detected language: en (0.99) in first 30s of audio...
  // Language codes are not all two letters long (e.g. `yue`), so capture two
  // or three letters. Capturing exactly two truncated such codes, which then
  // failed the lookup and were reported as UNKNOWN.
  const languageRegex = /Detected language: ([a-zA-Z]{2,3})/;
  const detectedLanguageCode = regexExtract(stderr, languageRegex);
  return {
    detectedLanguageCode: parseLanguageCodeString(detectedLanguageCode),
  };
};

const extractWhisperStderrData = (stderr: string): TranscriptionMetadata => {
Expand All @@ -275,7 +283,7 @@ const extractWhisperStderrData = (stderr: string): TranscriptionMetadata => {
const loadTime = regexExtract(stderr, loadTimeRegex);

return {
detectedLanguageCode: detectedLanguageCode,
detectedLanguageCode: parseLanguageCodeString(detectedLanguageCode),
loadTimeMs: loadTime ? parseInt(loadTime) : undefined,
totalTimeMs: totalTime ? parseInt(totalTime) : undefined,
};
Expand All @@ -284,7 +292,7 @@ const extractWhisperStderrData = (stderr: string): TranscriptionMetadata => {
const whisperParams = (
detectLanguageOnly: boolean,
file: string,
languageCode: LanguageCode = 'auto',
languageCode: InputLanguageCode = 'auto',
translate: boolean = false,
) => {
if (detectLanguageOnly) {
Expand All @@ -308,7 +316,7 @@ const whisperParams = (

export const runWhisperX = async (
whisperBaseParams: WhisperBaseParams,
languageCode: LanguageCode,
languageCode: InputLanguageCode,
translate: boolean,
) => {
const { wavPath, diarize, stage } = whisperBaseParams;
Expand Down

0 comments on commit 888fbad

Please sign in to comment.