diff --git a/packages/cdk/lib/transcription-service.ts b/packages/cdk/lib/transcription-service.ts
index 68c453f4..e489c786 100644
--- a/packages/cdk/lib/transcription-service.ts
+++ b/packages/cdk/lib/transcription-service.ts
@@ -789,7 +789,7 @@ export class TranscriptionService extends GuStack {
 		mediaDownloadTask.taskDefinition.addVolume(tempVolume);
 		mediaDownloadTask.containerDefinition.addMountPoints({
 			sourceVolume: downloadVolume.name,
-			containerPath: '/media-download', // needs to match DOWNLOAD_DIRECTORY in media-download index.ts
+			containerPath: '/media-download', // needs to match ECS_MEDIA_DOWNLOAD_WORKING_DIRECTORY in media-download index.ts
 			readOnly: false,
 		});
 		mediaDownloadTask.containerDefinition.addMountPoints({
diff --git a/packages/media-download/src/index.ts b/packages/media-download/src/index.ts
index 64ba023d..d932e419 100644
--- a/packages/media-download/src/index.ts
+++ b/packages/media-download/src/index.ts
@@ -20,7 +20,8 @@ import {
 	MediaDownloadJob,
 } from '@guardian/transcription-service-common';
 
-export const MEDIA_DOWNLOAD_WORKING_DIRECTORY = '/media-download';
+// This needs to be kept in sync with CDK downloadVolume
+export const ECS_MEDIA_DOWNLOAD_WORKING_DIRECTORY = '/media-download';
 
 const uploadToS3 = async (
 	s3Client: S3Client,
@@ -138,17 +139,21 @@ const main = async () => {
 	const useProxy =
 		config.app.stage !== 'DEV' || process.env['USE_PROXY'] === 'true';
 
+	const workingDirectory =
+		config.app.stage === 'DEV' ? '/tmp' : ECS_MEDIA_DOWNLOAD_WORKING_DIRECTORY;
+
 	const proxyUrl = useProxy
 		? await startProxyTunnel(
 				await config.app.mediaDownloadProxySSHKey(),
 				config.app.mediaDownloadProxyIpAddress,
 				config.app.mediaDownloadProxyPort,
+				workingDirectory,
 			)
 		: undefined;
 
 	const metadata = await downloadMedia(
 		job.url,
-		MEDIA_DOWNLOAD_WORKING_DIRECTORY,
+		workingDirectory,
 		job.id,
 		proxyUrl,
 	);
diff --git a/packages/media-download/src/yt-dlp.ts b/packages/media-download/src/yt-dlp.ts
index a0783574..687a5811 100644
--- a/packages/media-download/src/yt-dlp.ts
+++ b/packages/media-download/src/yt-dlp.ts
@@ -1,24 +1,24 @@
 import fs from 'node:fs';
 import { runSpawnCommand } from '@guardian/transcription-service-backend-common/src/process';
 import { logger } from '@guardian/transcription-service-backend-common';
-import { MEDIA_DOWNLOAD_WORKING_DIRECTORY } from './index';
 
 export type MediaMetadata = {
 	title: string;
 	extension: string;
-	filename: string;
 	mediaPath: string;
 	duration: number;
 };
 
-const extractInfoJson = (infoJsonPath: string): MediaMetadata => {
+const extractInfoJson = (
+	infoJsonPath: string,
+	outputFilePath: string,
+): MediaMetadata => {
 	const file = fs.readFileSync(infoJsonPath, 'utf8');
 	const json = JSON.parse(file);
 	return {
 		title: json.title,
-		extension: json.ext,
-		filename: json.filename,
-		mediaPath: `${json.filename}`,
+		extension: json.ext || json.entries?.[0]?.ext,
+		mediaPath: outputFilePath,
 		duration: parseInt(json.duration),
 	};
 };
@@ -27,13 +27,12 @@ export const startProxyTunnel = async (
 	key: string,
 	ip: string,
 	port: number,
+	workingDirectory: string,
 ): Promise => {
 	try {
-		fs.writeFileSync(
-			`${MEDIA_DOWNLOAD_WORKING_DIRECTORY}/media_download`,
-			key + '\n',
-			{ mode: 0o600 },
-		);
+		fs.writeFileSync(`${workingDirectory}/media_download`, key + '\n', {
+			mode: 0o600,
+		});
 		const result = await runSpawnCommand(
 			'startProxyTunnel',
 			'ssh',
@@ -49,7 +48,7 @@ export const startProxyTunnel = async (
 				'-N',
 				'-f',
 				'-i',
-				`${MEDIA_DOWNLOAD_WORKING_DIRECTORY}/media_download`,
+				`${workingDirectory}/media_download`,
 				`media_download@${ip}`,
 			],
 			true,
@@ -64,28 +63,39 @@ export const startProxyTunnel = async (
 
 export const downloadMedia = async (
 	url: string,
-	destinationDirectoryPath: string,
+	workingDirectory: string,
 	id: string,
 	proxyUrl?: string,
 ) => {
 	const proxyParams = proxyUrl ? ['--proxy', proxyUrl] : [];
 	try {
+		const filepathLocation = `${workingDirectory}/${id}.txt`;
+		// yt-dlp --print-to-file appends to the file, so wipe it first
+		fs.writeFileSync(filepathLocation, '');
 		await runSpawnCommand(
 			'downloadMedia',
 			'yt-dlp',
 			[
 				'--write-info-json',
 				'--no-clean-info-json',
+				'--print-to-file',
+				'after_move:filepath',
+				filepathLocation,
 				'--newline',
 				'-o',
-				`${destinationDirectoryPath}/${id}.%(ext)s`,
+				`${workingDirectory}/${id}.%(ext)s`,
 				...proxyParams,
 				url,
 			],
 			true,
 		);
+		const outputPath = fs.readFileSync(filepathLocation, 'utf8').trim();
 		const metadata = extractInfoJson(
-			`${destinationDirectoryPath}/${id}.info.json`,
+			`${workingDirectory}/${id}.info.json`,
+			outputPath,
+		);
+		logger.info(
+			`Download complete, extracted metadata: ${JSON.stringify(metadata)}`,
 		);
 
 		return metadata;
diff --git a/scripts/trigger-media-download-service.sh b/scripts/trigger-media-download-service.sh
old mode 100644
new mode 100755
index 518728ad..94bd2bf9
--- a/scripts/trigger-media-download-service.sh
+++ b/scripts/trigger-media-download-service.sh
@@ -8,5 +8,5 @@ if [ -z "$URL" ] || [ -z "$USER_EMAIL" ]; then
   exit 1
 fi
 
-export MESSAGE_BODY="{\"id\":\"a168f62d-e179-46d5-9a9e-ff519551e0ee\",\"url\":\"${URL}\",\"languageCode\":\"en\",\"translationRequested\":false,\"userEmail\":\"${USER_EMAIL}\"}"
-npm run media-download::start
\ No newline at end of file
+export MESSAGE_BODY="{\"id\":\"a168f62d-e179-46d5-9a9e-ff519551e0ee\",\"url\":\"${URL}\",\"languageCode\":\"en\",\"translationRequested\":false,\"diarizationRequested\":false,\"userEmail\":\"${USER_EMAIL}\"}"
+npm run media-download::start