From 5a4422b72e3bbb369d715d0949d962d075250e51 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Sun, 23 Feb 2025 10:38:20 +0000 Subject: [PATCH 1/3] Use --print to store output file location in a file rather than inferring from info.json --- packages/media-download/src/yt-dlp.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/packages/media-download/src/yt-dlp.ts b/packages/media-download/src/yt-dlp.ts index a0783574..82351afa 100644 --- a/packages/media-download/src/yt-dlp.ts +++ b/packages/media-download/src/yt-dlp.ts @@ -2,6 +2,7 @@ import fs from 'node:fs'; import { runSpawnCommand } from '@guardian/transcription-service-backend-common/src/process'; import { logger } from '@guardian/transcription-service-backend-common'; import { MEDIA_DOWNLOAD_WORKING_DIRECTORY } from './index'; +import path from 'path'; export type MediaMetadata = { title: string; @@ -11,13 +12,16 @@ export type MediaMetadata = { duration: number; }; -const extractInfoJson = (infoJsonPath: string): MediaMetadata => { +const extractInfoJson = ( + infoJsonPath: string, + outputFilePath: string, +): MediaMetadata => { const file = fs.readFileSync(infoJsonPath, 'utf8'); const json = JSON.parse(file); return { title: json.title, extension: json.ext, - filename: json.filename, + filename: path.basename(outputFilePath), mediaPath: `${json.filename}`, duration: parseInt(json.duration), }; @@ -70,12 +74,14 @@ export const downloadMedia = async ( ) => { const proxyParams = proxyUrl ? ['--proxy', proxyUrl] : []; try { + const filepathLocation = `${destinationDirectoryPath}/${id}.txt`; await runSpawnCommand( 'downloadMedia', 'yt-dlp', [ '--write-info-json', '--no-clean-info-json', + `--print-to-file after_move:filepath ${filepathLocation}`, '--newline', '-o', `${destinationDirectoryPath}/${id}.%(ext)s`, @@ -84,8 +90,10 @@ export const downloadMedia = async ( ], true, ); + const outputPath = fs.readFileSync(filepathLocation, 'utf8').trim(); const metadata = extractInfoJson( `${destinationDirectoryPath}/${id}.info.json`, + outputPath, ); return metadata; From ff514eb1ce48b55b95974e7e249b3407ed2b493e Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Mon, 24 Feb 2025 10:40:04 +0000 Subject: [PATCH 2/3] Tidy up: fix yt-dlp command syntax, make metadata simpler/more resilient, fix running locally --- packages/cdk/lib/transcription-service.ts | 2 +- packages/media-download/src/index.ts | 9 ++++-- packages/media-download/src/yt-dlp.ts | 35 ++++++++++++----------- scripts/trigger-media-download-service.sh | 4 +-- 4 files changed, 28 insertions(+), 22 deletions(-) mode change 100644 => 100755 scripts/trigger-media-download-service.sh diff --git a/packages/cdk/lib/transcription-service.ts b/packages/cdk/lib/transcription-service.ts index 68c453f4..e489c786 100644 --- a/packages/cdk/lib/transcription-service.ts +++ b/packages/cdk/lib/transcription-service.ts @@ -789,7 +789,7 @@ export class TranscriptionService extends GuStack { mediaDownloadTask.taskDefinition.addVolume(tempVolume); mediaDownloadTask.containerDefinition.addMountPoints({ sourceVolume: downloadVolume.name, - containerPath: '/media-download', // needs to match DOWNLOAD_DIRECTORY in media-download index.ts + containerPath: '/media-download', // needs to match ECS_MEDIA_DOWNLOAD_WORKING_DIRECTORY in media-download index.ts readOnly: false, }); mediaDownloadTask.containerDefinition.addMountPoints({ diff --git a/packages/media-download/src/index.ts b/packages/media-download/src/index.ts index 64ba023d..d932e419 100644 --- a/packages/media-download/src/index.ts +++ b/packages/media-download/src/index.ts @@ -20,7 +20,8 @@ import { MediaDownloadJob, } from '@guardian/transcription-service-common'; -export const MEDIA_DOWNLOAD_WORKING_DIRECTORY = '/media-download'; +// This needs to be kept in sync with CDK downloadVolume +export const ECS_MEDIA_DOWNLOAD_WORKING_DIRECTORY = '/media-download'; const uploadToS3 = async ( s3Client: S3Client, @@ -138,17 +139,21 @@ const main = async () => { const useProxy = config.app.stage !== 'DEV' || process.env['USE_PROXY'] === 'true'; + const workingDirectory = + config.app.stage === 'DEV' ? '/tmp' : ECS_MEDIA_DOWNLOAD_WORKING_DIRECTORY; + const proxyUrl = useProxy ? await startProxyTunnel( await config.app.mediaDownloadProxySSHKey(), config.app.mediaDownloadProxyIpAddress, config.app.mediaDownloadProxyPort, + workingDirectory, ) : undefined; const metadata = await downloadMedia( job.url, - MEDIA_DOWNLOAD_WORKING_DIRECTORY, + workingDirectory, job.id, proxyUrl, ); diff --git a/packages/media-download/src/yt-dlp.ts b/packages/media-download/src/yt-dlp.ts index 82351afa..52dac31e 100644 --- a/packages/media-download/src/yt-dlp.ts +++ b/packages/media-download/src/yt-dlp.ts @@ -1,13 +1,10 @@ import fs from 'node:fs'; import { runSpawnCommand } from '@guardian/transcription-service-backend-common/src/process'; import { logger } from '@guardian/transcription-service-backend-common'; -import { MEDIA_DOWNLOAD_WORKING_DIRECTORY } from './index'; -import path from 'path'; export type MediaMetadata = { title: string; extension: string; - filename: string; mediaPath: string; duration: number; }; @@ -20,9 +17,8 @@ const extractInfoJson = ( const json = JSON.parse(file); return { title: json.title, - extension: json.ext, - filename: path.basename(outputFilePath), - mediaPath: `${json.filename}`, + extension: json.ext || json.entries[0]?.ext, + mediaPath: outputFilePath, duration: parseInt(json.duration), }; }; @@ -31,13 +27,12 @@ export const startProxyTunnel = async ( key: string, ip: string, port: number, + workingDirectory: string, ): Promise => { try { - fs.writeFileSync( - `${MEDIA_DOWNLOAD_WORKING_DIRECTORY}/media_download`, - key + '\n', - { mode: 0o600 }, - ); + fs.writeFileSync(`${workingDirectory}/media_download`, key + '\n', { + mode: 0o600, + }); const result = await runSpawnCommand( 'startProxyTunnel', 'ssh', @@ -53,7 +48,7 @@ export const startProxyTunnel = async ( '-N', '-f', '-i', - `${MEDIA_DOWNLOAD_WORKING_DIRECTORY}/media_download`, + `${workingDirectory}/media_download`, `media_download@${ip}`, ], true, @@ -68,23 +63,26 @@ export const startProxyTunnel = async ( export const downloadMedia = async ( url: string, - destinationDirectoryPath: string, + workingDirectory: string, id: string, proxyUrl?: string, ) => { const proxyParams = proxyUrl ? ['--proxy', proxyUrl] : []; try { - const filepathLocation = `${destinationDirectoryPath}/${id}.txt`; + const filepathLocation = `${workingDirectory}/${id}.txt`; + fs.writeFileSync(filepathLocation, ''); await runSpawnCommand( 'downloadMedia', 'yt-dlp', [ '--write-info-json', '--no-clean-info-json', - `--print-to-file after_move:filepath ${filepathLocation}`, + '--print-to-file', + 'after_move:filepath', + `${filepathLocation}`, '--newline', '-o', - `${destinationDirectoryPath}/${id}.%(ext)s`, + `${workingDirectory}/${id}.%(ext)s`, ...proxyParams, url, ], @@ -92,9 +90,12 @@ export const downloadMedia = async ( ); const outputPath = fs.readFileSync(filepathLocation, 'utf8').trim(); const metadata = extractInfoJson( - `${destinationDirectoryPath}/${id}.info.json`, + `${workingDirectory}/${id}.info.json`, outputPath, ); + logger.info( + `Download complete, extracted metadata: ${JSON.stringify(metadata)}`, + ); return metadata; } catch (error) { diff --git a/scripts/trigger-media-download-service.sh b/scripts/trigger-media-download-service.sh old mode 100644 new mode 100755 index 518728ad..94bd2bf9 --- a/scripts/trigger-media-download-service.sh +++ b/scripts/trigger-media-download-service.sh @@ -8,5 +8,5 @@ if [ -z "$URL" ] || [ -z "$USER_EMAIL" ]; then exit 1 fi -export MESSAGE_BODY="{\"id\":\"a168f62d-e179-46d5-9a9e-ff519551e0ee\",\"url\":\"${URL}\",\"languageCode\":\"en\",\"translationRequested\":false,\"userEmail\":\"${USER_EMAIL}\"}" -npm run media-download::start \ No newline at end of file +export MESSAGE_BODY="{\"id\":\"a168f62d-e179-46d5-9a9e-ff519551e0ee\",\"url\":\"${URL}\",\"languageCode\":\"en\",\"translationRequested\":false,\"diarizationRequested\":false,\"userEmail\":\"${USER_EMAIL}\"}" +npm run media-download::start From 57aa5248bf0dc4119eb629dbf8ed233e5ee28fc7 Mon Sep 17 00:00:00 2001 From: philmcmahon Date: Mon, 24 Feb 2025 12:27:05 +0000 Subject: [PATCH 3/3] Document why we wipe the file --- packages/media-download/src/yt-dlp.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/media-download/src/yt-dlp.ts b/packages/media-download/src/yt-dlp.ts index 52dac31e..687a5811 100644 --- a/packages/media-download/src/yt-dlp.ts +++ b/packages/media-download/src/yt-dlp.ts @@ -70,6 +70,7 @@ export const downloadMedia = async ( const proxyParams = proxyUrl ? ['--proxy', proxyUrl] : []; try { const filepathLocation = `${workingDirectory}/${id}.txt`; + // yt-dlp --print-to-file appends to the file, so wipe it first fs.writeFileSync(filepathLocation, ''); await runSpawnCommand( 'downloadMedia',