From df3f7860327e2bd21599c95eb39309bf7a7f7ad6 Mon Sep 17 00:00:00 2001
From: Migush
Date: Mon, 21 Oct 2024 10:11:38 +0200
Subject: [PATCH] [WIP] add metadata options base

---
 src/MsEdgeTTS.spec.ts |  36 +++++++----
 src/MsEdgeTTS.ts      | 145 ++++++++++++++++++++++++++++--------------
 2 files changed, 124 insertions(+), 57 deletions(-)

diff --git a/src/MsEdgeTTS.spec.ts b/src/MsEdgeTTS.spec.ts
index 79394fc..d7c7e7a 100644
--- a/src/MsEdgeTTS.spec.ts
+++ b/src/MsEdgeTTS.spec.ts
@@ -2,7 +2,7 @@ import "jest"
 import {MsEdgeTTS} from "./MsEdgeTTS"
 import {OUTPUT_FORMAT} from "./OUTPUT_FORMAT"
 import {readFileSync, mkdirSync} from "fs"
-import {existsSync, unlinkSync} from "node:fs"
+import {existsSync} from "node:fs"
 import {tmpdir} from "os"
 import {join} from "path"
 import randomBytes from "randombytes"
@@ -23,14 +23,28 @@
     })
 
     it("should write audio to file", async () => {
-        const filePath = await tts.toFile(join(tmpPath, "./example_audio.webm"), "Hi, how are you?")
-        console.log("Done!", filePath)
+        const {audioFilePath} = await tts.toFile(join(tmpPath), "Hi, how are you doing today hello hello hello?")
+        console.log("Done!", audioFilePath)
 
-        expect(filePath).toBeDefined()
-        expect(filePath).toMatch(/example_audio.webm/)
+        expect(audioFilePath).toBeDefined()
+        expect(audioFilePath).toMatch(/example_audio.webm/)
         expect(Object.keys(tts["_streams"]).length).toBe(0)
         // have content
-        expect(readFileSync(filePath).length).toBeGreaterThan(0)
+        expect(readFileSync(audioFilePath).length).toBeGreaterThan(0)
+    })
+
+    it("should write metadata to file", async () => {
+        await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS, {
+            sentenceBoundaryEnabled: true,
+        })
+        const {metadataFilePath} = await tts.toFile(join(tmpPath), "Hi, how are you doing today hello hello hello?")
+        console.log("Done!", metadataFilePath)
+
+        expect(metadataFilePath).toBeDefined()
+        expect(metadataFilePath).toMatch(/.json$/)
+        expect(Object.keys(tts["_streams"]).length).toBe(0)
+        // have content
+        expect(readFileSync(metadataFilePath).length).toBeGreaterThan(0)
     })
 
     it("should handle multiple streams simultaneously", async () => {
@@ -75,14 +89,14 @@
 //    })
 
     it("should return different audio when a pitch is applied", async () => {
-        const filePath = await tts.toFile(join(tmpPath, `./example_audio4.webm`), "Hi, how are you?", {pitch: "+10Hz"})
-        console.log("Done!", filePath)
+        const {audioFilePath} = await tts.toFile(join(tmpPath), "Hi, how are you?", {pitch: "+10Hz"})
+        console.log("Done!", audioFilePath)
 
-        expect(filePath).toBeDefined()
-        expect(filePath).toMatch(/example_audio4.webm/)
+        expect(audioFilePath).toBeDefined()
+        expect(audioFilePath).toMatch(/example_audio.webm/)
         expect(Object.keys(tts["_streams"]).length).toBe(0)
         // have content
-        expect(readFileSync(filePath).length).toBeGreaterThan(0)
+        expect(readFileSync(audioFilePath).length).toBeGreaterThan(0)
     })
 
     afterEach(() => {
diff --git a/src/MsEdgeTTS.ts b/src/MsEdgeTTS.ts
index 95cebe7..7b47cbe 100644
--- a/src/MsEdgeTTS.ts
+++ b/src/MsEdgeTTS.ts
@@ -41,6 +41,22 @@ export class ProsodyOptions {
     volume?: VOLUME | string | number = 100.0
 }
 
+export class MetadataOptions {
+    /**
+     * (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`.
+     * Changing the voiceName will reset the voiceLocale.
+     */
+    voiceLocale?: string
+    /**
+     * (optional) whether to enable sentence boundary metadata. Default is `false`
+     */
+    sentenceBoundaryEnabled?: boolean = false
+    /**
+     * (optional) whether to enable word boundary metadata. Default is `false`
+     */
+    wordBoundaryEnabled?: boolean = false
+}
+
 enum messageTypes {
     TURN_START = "turn.start",
     TURN_END = "turn.end",
@@ -63,8 +79,8 @@ export class MsEdgeTTS {
     private readonly _isBrowser: boolean
     private _ws: WebSocket
     private _voice
-    private _voiceLocale
     private _outputFormat
+    private _metadataOptions: MetadataOptions = new MetadataOptions()
     private _streams: { [key: string]: { audio: Readable, metadata: Readable } } = {}
     private _startTime = 0
     private readonly _agent: Agent
@@ -115,8 +131,8 @@
             "synthesis": {
                 "audio": {
                     "metadataoptions": {
-                        "sentenceBoundaryEnabled": "false",
-                        "wordBoundaryEnabled": "false"
+                        "sentenceBoundaryEnabled": "${this._metadataOptions.sentenceBoundaryEnabled}",
+                        "wordBoundaryEnabled": "${this._metadataOptions.wordBoundaryEnabled}"
                     },
                     "outputFormat": "${this._outputFormat}"
                 }
@@ -182,7 +198,7 @@
     private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
         // in case future updates to the edge API block these elements, we'll be concatenating strings.
         options = {...new ProsodyOptions(), ...options}
-        return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
+        return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._metadataOptions.voiceLocale}">
             <voice name="${this._voice}">
                 <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
                     ${input}
@@ -208,33 +224,36 @@
      * Must be called at least once before text can be synthesised.
      * Saved in this instance. Can be called at any time to update the metadata.
      *
-     * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
+     * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices). Changing the voiceName will reset the voiceLocale.
      * @param outputFormat any {@link OUTPUT_FORMAT}
-     * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
+     * @param metadataOptions (optional) {@link MetadataOptions}
      */
-    async setMetadata(voiceName: string, outputFormat: OUTPUT_FORMAT, voiceLocale?: string) {
+    async setMetadata(voiceName: string, outputFormat: OUTPUT_FORMAT, metadataOptions?: MetadataOptions): Promise<void> {
         const oldVoice = this._voice
-        const oldVoiceLocale = this._voiceLocale
         const oldOutputFormat = this._outputFormat
+        const oldOptions = JSON.stringify(this._metadataOptions)
 
         this._voice = voiceName
-        this._voiceLocale = voiceLocale
-        if (!this._voiceLocale) {
+        if (!this._metadataOptions.voiceLocale || (!metadataOptions?.voiceLocale && oldVoice !== this._voice)) {
             const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice)
-            if (!voiceLangMatch) throw new Error("Could not infer voiceLocale from voiceName!")
-            this._voiceLocale = voiceLangMatch[0]
+            if (!voiceLangMatch) throw new Error("Could not infer voiceLocale from voiceName, and no voiceLocale was specified!")
+            this._metadataOptions.voiceLocale = voiceLangMatch[0]
         }
         this._outputFormat = outputFormat
+        Object.assign(this._metadataOptions, metadataOptions)
+
         const changed = oldVoice !== this._voice
-            || oldVoiceLocale !== this._voiceLocale
             || oldOutputFormat !== this._outputFormat
+            || oldOptions !== JSON.stringify(this._metadataOptions)
 
-        // create new client
-        if (changed || this._ws.readyState !== this._ws.OPEN) {
-            this._startTime = Date.now()
-            await this._initClient()
+        if (!changed && this._ws.readyState === this._ws.OPEN) {
+            return
         }
+
+        // create new client
+        this._startTime = Date.now()
+        await this._initClient()
     }
 
     private _metadataCheck() {
@@ -252,13 +271,16 @@
     /**
      * Writes raw audio synthesised from text to a file. Uses a basic {@link _SSMLTemplate SSML template}.
     *
-     * @param path a valid output path, including a filename and file extension.
+     * @param dirPath a valid output directory path
      * @param input the input to synthesise
      * @param options (optional) {@link ProsodyOptions}
-     * @returns {Promise<string>} - a `Promise` with the full filepath
+     * @returns {Promise<{audioFilePath: string, metadataFilePath: string}>} - a `Promise` with the full filepaths
      */
-    toFile(path: string, input: string, options?: ProsodyOptions): Promise<string> {
-        return this._rawSSMLRequestToFile(path, this._SSMLTemplate(input, options))
+    toFile(dirPath: string, input: string, options?: ProsodyOptions): Promise<{
+        audioFilePath: string,
+        metadataFilePath: string
+    }> {
+        return this._rawSSMLRequestToFile(dirPath, this._SSMLTemplate(input, options))
     }
 
     /**
@@ -276,12 +298,12 @@
     /**
      * Writes raw audio synthesised from text to a file. Has no SSML template. Basic SSML should be provided in the request.
      *
-     * @param path a valid output path, including a filename and file extension.
+     * @param dirPath a valid output directory path.
      * @param requestSSML the SSML to send. SSML elements required in order to work.
-     * @returns {Promise<string>} - a `Promise` with the full filepath
+     * @returns {Promise<{audioFilePath: string, metadataFilePath: string}>} - a `Promise` with the full filepaths
      */
-    rawToFile(path: string, requestSSML: string): Promise<string> {
-        return this._rawSSMLRequestToFile(path, requestSSML)
+    rawToFile(dirPath: string, requestSSML: string): Promise<{ audioFilePath: string, metadataFilePath: string }> {
+        return this._rawSSMLRequestToFile(dirPath, requestSSML)
     }
 
     /**
@@ -295,29 +317,55 @@
         return audioStream
     }
 
-    private _rawSSMLRequestToFile(path: string, requestSSML: string): Promise<string> {
-        return new Promise(async (resolve, reject) => {
-            const {audioStream, requestId} = this._rawSSMLRequest(requestSSML)
-
-            const writableFile = audioStream.pipe(fs.createWriteStream(path))
-
-            writableFile.once("close", async () => {
-                if (writableFile.bytesWritten > 0) {
-                    resolve(path)
-                } else {
-                    fs.unlinkSync(path)
-                    reject("No audio data received")
-                }
-            })
-
-            audioStream.on("error", (e) => {
-                audioStream.destroy()
-                reject(e)
-            })
-        })
+    private async _rawSSMLRequestToFile(dirPath: string, requestSSML: string): Promise<{
+        audioFilePath: string,
+        metadataFilePath: string
+    }> {
+        const {audioStream, metadataStream, requestId} = this._rawSSMLRequest(requestSSML)
+
+        try {
+            const [audioFilePath, metadataFilePath] = await Promise.all([
+                new Promise((resolve, reject) => {
+                    const writableAudioFile = audioStream.pipe(fs.createWriteStream(dirPath + "/example_audio.webm"))
+                    writableAudioFile.once("close", async () => {
+                        if (writableAudioFile.bytesWritten > 0) {
+                            resolve(dirPath + "/example_audio.webm")
+                        } else {
+                            reject("No audio data received")
+                        }
+                    })
+                    audioStream.once("error", reject)
+                }) as Promise<string>,
+                new Promise((resolve, reject) => {
+                    // get metadata from buffer and combine all MetaData root elements
+                    const metadataItems: any[] = []
+                    metadataStream.on("data", (chunk: Buffer) => {
+                        const chunkObj = JSON.parse(chunk.toString())
+                        // .Metadata is an array of objects, just combine them
+                        metadataItems.push(...chunkObj["Metadata"])
+                    })
+                    metadataStream.on("close", () => {
+                        // write all collected metadata items to a single JSON file
+                        const metadataFilePath = dirPath + "/example_metadata.json"
+                        fs.writeFileSync(metadataFilePath, JSON.stringify(metadataItems, null, 2))
+                        resolve(metadataFilePath)
+                    })
+                    metadataStream.once("error", reject)
+                }) as Promise<string>,
+            ])
+            return {audioFilePath, metadataFilePath}
+        } catch (e) {
+            audioStream.destroy()
+            metadataStream.destroy()
+            throw e
+        }
    }
 
-    private _rawSSMLRequest(requestSSML: string): { audioStream: Readable, metadataStream: Readable, requestId: string } {
+    private _rawSSMLRequest(requestSSML: string): {
+        audioStream: Readable,
+        metadataStream: Readable,
+        requestId: string
+    } {
         this._metadataCheck()
 
         const requestId = randomBytes(16).toString("hex")
@@ -336,6 +384,11 @@
             read() {
             },
         })
+
+        audioStream.on("error", () => {
+            audioStream.destroy()
+            metadataStream.destroy()
+        })
         audioStream.once("close", () => {
             audioStream.destroy()
             metadataStream.destroy()
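
Usage sketch for review, not part of the patch. It shows how the API introduced here is meant to be called; the "./tmp" directory and the demo() wrapper are illustrative, while "example_audio.webm" and "example_metadata.json" are the output names currently hardcoded by this WIP in _rawSSMLRequestToFile:

    import {MsEdgeTTS} from "./MsEdgeTTS"
    import {OUTPUT_FORMAT} from "./OUTPUT_FORMAT"

    async function demo() {
        const tts = new MsEdgeTTS()
        // voiceLocale is inferred from the voice name ("en-US") unless set explicitly
        await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS, {
            sentenceBoundaryEnabled: true,
            wordBoundaryEnabled: true,
        })
        // toFile now takes a directory and resolves with both file paths
        const {audioFilePath, metadataFilePath} = await tts.toFile("./tmp", "Hi, how are you?")
        console.log(audioFilePath)    // ./tmp/example_audio.webm
        console.log(metadataFilePath) // ./tmp/example_metadata.json
    }

    demo()

The items collected into the JSON file are the "Metadata" entries of the service's audio.metadata messages (SentenceBoundary/WordBoundary objects with offsets and durations); their exact shape is defined by the Edge speech service, not by this patch.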