Skip to content

Commit

Permalink
Merge pull request #142 from guardian/pm-print-filepath-ytdlp
Browse files Browse the repository at this point in the history
Media download - stop inferring filename from info.json, use yt-dlp's --print-to-file template (after_move:filepath) instead
  • Loading branch information
philmcmahon authored Feb 25, 2025
2 parents 0889e0a + 57aa524 commit 6f6e025
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 20 deletions.
2 changes: 1 addition & 1 deletion packages/cdk/lib/transcription-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -789,7 +789,7 @@ export class TranscriptionService extends GuStack {
mediaDownloadTask.taskDefinition.addVolume(tempVolume);
mediaDownloadTask.containerDefinition.addMountPoints({
sourceVolume: downloadVolume.name,
containerPath: '/media-download', // needs to match DOWNLOAD_DIRECTORY in media-download index.ts
containerPath: '/media-download', // needs to match ECS_MEDIA_DOWNLOAD_WORKING_DIRECTORY in media-download index.ts
readOnly: false,
});
mediaDownloadTask.containerDefinition.addMountPoints({
Expand Down
9 changes: 7 additions & 2 deletions packages/media-download/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ import {
MediaDownloadJob,
} from '@guardian/transcription-service-common';

export const MEDIA_DOWNLOAD_WORKING_DIRECTORY = '/media-download';
// Must match the containerPath of the downloadVolume mount point in
// packages/cdk/lib/transcription-service.ts (DEV runs use /tmp instead)
export const ECS_MEDIA_DOWNLOAD_WORKING_DIRECTORY = '/media-download';

const uploadToS3 = async (
s3Client: S3Client,
Expand Down Expand Up @@ -138,17 +139,21 @@ const main = async () => {
const useProxy =
config.app.stage !== 'DEV' || process.env['USE_PROXY'] === 'true';

const workingDirectory =
config.app.stage === 'DEV' ? '/tmp' : ECS_MEDIA_DOWNLOAD_WORKING_DIRECTORY;

const proxyUrl = useProxy
? await startProxyTunnel(
await config.app.mediaDownloadProxySSHKey(),
config.app.mediaDownloadProxyIpAddress,
config.app.mediaDownloadProxyPort,
workingDirectory,
)
: undefined;

const metadata = await downloadMedia(
job.url,
MEDIA_DOWNLOAD_WORKING_DIRECTORY,
workingDirectory,
job.id,
proxyUrl,
);
Expand Down
40 changes: 25 additions & 15 deletions packages/media-download/src/yt-dlp.ts
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
import fs from 'node:fs';
import { runSpawnCommand } from '@guardian/transcription-service-backend-common/src/process';
import { logger } from '@guardian/transcription-service-backend-common';
import { MEDIA_DOWNLOAD_WORKING_DIRECTORY } from './index';

export type MediaMetadata = {
title: string;
extension: string;
filename: string;
mediaPath: string;
duration: number;
};

const extractInfoJson = (infoJsonPath: string): MediaMetadata => {
const extractInfoJson = (
infoJsonPath: string,
outputFilePath: string,
): MediaMetadata => {
const file = fs.readFileSync(infoJsonPath, 'utf8');
const json = JSON.parse(file);
return {
title: json.title,
extension: json.ext,
filename: json.filename,
mediaPath: `${json.filename}`,
extension: json.ext || json.entries[0]?.ext,
mediaPath: outputFilePath,
duration: parseInt(json.duration),
};
};
Expand All @@ -27,13 +27,12 @@ export const startProxyTunnel = async (
key: string,
ip: string,
port: number,
workingDirectory: string,
): Promise<string> => {
try {
fs.writeFileSync(
`${MEDIA_DOWNLOAD_WORKING_DIRECTORY}/media_download`,
key + '\n',
{ mode: 0o600 },
);
fs.writeFileSync(`${workingDirectory}/media_download`, key + '\n', {
mode: 0o600,
});
const result = await runSpawnCommand(
'startProxyTunnel',
'ssh',
Expand All @@ -49,7 +48,7 @@ export const startProxyTunnel = async (
'-N',
'-f',
'-i',
`${MEDIA_DOWNLOAD_WORKING_DIRECTORY}/media_download`,
`${workingDirectory}/media_download`,
`media_download@${ip}`,
],
true,
Expand All @@ -64,28 +63,39 @@ export const startProxyTunnel = async (

export const downloadMedia = async (
url: string,
destinationDirectoryPath: string,
workingDirectory: string,
id: string,
proxyUrl?: string,
) => {
const proxyParams = proxyUrl ? ['--proxy', proxyUrl] : [];
try {
const filepathLocation = `${workingDirectory}/${id}.txt`;
// yt-dlp --print-to-file appends to the file rather than overwriting it, so
// empty it first - the whole file is read back below as the output path
fs.writeFileSync(filepathLocation, '');
await runSpawnCommand(
'downloadMedia',
'yt-dlp',
[
'--write-info-json',
'--no-clean-info-json',
'--print-to-file',
'after_move:filepath',
`${filepathLocation}`,
'--newline',
'-o',
`${destinationDirectoryPath}/${id}.%(ext)s`,
`${workingDirectory}/${id}.%(ext)s`,
...proxyParams,
url,
],
true,
);
const outputPath = fs.readFileSync(filepathLocation, 'utf8').trim();
const metadata = extractInfoJson(
`${destinationDirectoryPath}/${id}.info.json`,
`${workingDirectory}/${id}.info.json`,
outputPath,
);
logger.info(
`Download complete, extracted metadata: ${JSON.stringify(metadata)}`,
);

return metadata;
Expand Down
4 changes: 2 additions & 2 deletions scripts/trigger-media-download-service.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ if [ -z "$URL" ] || [ -z "$USER_EMAIL" ]; then
exit 1
fi

export MESSAGE_BODY="{\"id\":\"a168f62d-e179-46d5-9a9e-ff519551e0ee\",\"url\":\"${URL}\",\"languageCode\":\"en\",\"translationRequested\":false,\"userEmail\":\"${USER_EMAIL}\"}"
npm run media-download::start
export MESSAGE_BODY="{\"id\":\"a168f62d-e179-46d5-9a9e-ff519551e0ee\",\"url\":\"${URL}\",\"languageCode\":\"en\",\"translationRequested\":false,\"diarizationRequested\":false,\"userEmail\":\"${USER_EMAIL}\"}"
npm run media-download::start

0 comments on commit 6f6e025

Please sign in to comment.