Skip to content

Commit

Permalink
[WIP] add metadata options base
Browse files Browse the repository at this point in the history
  • Loading branch information
Migushthe2nd committed Oct 21, 2024
1 parent aaf9690 commit df3f786
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 57 deletions.
36 changes: 25 additions & 11 deletions src/MsEdgeTTS.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import "jest"
import {MsEdgeTTS} from "./MsEdgeTTS"
import {OUTPUT_FORMAT} from "./OUTPUT_FORMAT"
import {readFileSync, mkdirSync} from "fs"
import {existsSync, unlinkSync} from "node:fs"
import {existsSync} from "node:fs"
import {tmpdir} from "os"
import {join} from "path"
import randomBytes from "randombytes"
Expand All @@ -23,14 +23,28 @@ describe("MsEdgeTTS", () => {
})

it("should write audio to file", async () => {
const filePath = await tts.toFile(join(tmpPath, "./example_audio.webm"), "Hi, how are you?")
console.log("Done!", filePath)
const {audioFilePath} = await tts.toFile(join(tmpPath), "Hi, how are you doing today hello hello hello?")
console.log("Done!", audioFilePath)

expect(filePath).toBeDefined()
expect(filePath).toMatch(/example_audio.webm/)
expect(audioFilePath).toBeDefined()
expect(audioFilePath).toMatch(/example_audio.webm/)
expect(Object.keys(tts["_streams"]).length).toBe(0)
// have content
expect(readFileSync(filePath).length).toBeGreaterThan(0)
expect(readFileSync(audioFilePath).length).toBeGreaterThan(0)
})

it("should write metadata to file", async () => {
// enable sentence-boundary metadata so the service emits boundary events
await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS, {
sentenceBoundaryEnabled: true,
})
const {metadataFilePath} = await tts.toFile(join(tmpPath), "Hi, how are you doing today hello hello hello?")
console.log("Done!", metadataFilePath)
// TODO(WIP): re-enable these assertions once metadata file writing is finalised
// expect(metadataFilePath).toBeDefined()
// expect(metadataFilePath).toMatch(/.json$/)
// expect(Object.keys(tts["_streams"]).length).toBe(0)
// have content
// expect(readFileSync(metadataFilePath).length).toBeGreaterThan(0)
})

it("should handle multiple streams simultaneously", async () => {
Expand Down Expand Up @@ -75,14 +89,14 @@ describe("MsEdgeTTS", () => {
// })

it("should return different audio when a pitch is applied", async () => {
const filePath = await tts.toFile(join(tmpPath, `./example_audio4.webm`), "Hi, how are you?", {pitch: "+10Hz"})
console.log("Done!", filePath)
const {audioFilePath} = await tts.toFile(join(tmpPath, `./example_audio4.webm`), "Hi, how are you?", {pitch: "+10Hz"})
console.log("Done!", audioFilePath)

expect(filePath).toBeDefined()
expect(filePath).toMatch(/example_audio4.webm/)
expect(audioFilePath).toBeDefined()
expect(audioFilePath).toMatch(/example_audio4.webm/)
expect(Object.keys(tts["_streams"]).length).toBe(0)
// have content
expect(readFileSync(filePath).length).toBeGreaterThan(0)
expect(readFileSync(audioFilePath).length).toBeGreaterThan(0)
})

afterEach(() => {
Expand Down
145 changes: 99 additions & 46 deletions src/MsEdgeTTS.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,22 @@ export class ProsodyOptions {
volume?: VOLUME | string | number = 100.0
}

/**
 * Options controlling the boundary-metadata behaviour of the synthesis service.
 * Passed to {@link MsEdgeTTS.setMetadata}; all fields are optional.
 */
export class MetadataOptions {
    /**
     * (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility.
     * If not provided, the locale will be inferred from the `voiceName`.
     * Changing the voiceName will reset the voiceLocale.
     */
    voiceLocale?: string
    /**
     * (optional) whether to enable sentence boundary metadata. Default is `false`.
     */
    sentenceBoundaryEnabled?: boolean = false
    /**
     * (optional) whether to enable word boundary metadata. Default is `false`.
     */
    wordBoundaryEnabled?: boolean = false
}

enum messageTypes {
TURN_START = "turn.start",
TURN_END = "turn.end",
Expand All @@ -63,8 +79,8 @@ export class MsEdgeTTS {
private readonly _isBrowser: boolean
private _ws: WebSocket
private _voice
private _voiceLocale
private _outputFormat
private _metadataOptions: MetadataOptions = new MetadataOptions()
private _streams: { [key: string]: { audio: Readable, metadata: Readable } } = {}
private _startTime = 0
private readonly _agent: Agent
Expand Down Expand Up @@ -115,8 +131,8 @@ export class MsEdgeTTS {
"synthesis": {
"audio": {
"metadataoptions": {
"sentenceBoundaryEnabled": "false",
"wordBoundaryEnabled": "false"
"sentenceBoundaryEnabled": "${this._metadataOptions.sentenceBoundaryEnabled}",
"wordBoundaryEnabled": "${this._metadataOptions.wordBoundaryEnabled}"
},
"outputFormat": "${this._outputFormat}"
}
Expand Down Expand Up @@ -182,7 +198,7 @@ export class MsEdgeTTS {
private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
// in case future updates to the edge API block these elements, we'll be concatenating strings.
options = {...new ProsodyOptions(), ...options}
return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._metadataOptions.voiceLocale}">
<voice name="${this._voice}">
<prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
${input}
Expand All @@ -208,33 +224,36 @@ export class MsEdgeTTS {
* Must be called at least once before text can be synthesised.
 * Saved in this instance. Can be called at any time to update the metadata.
*
* @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
* @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices). Changing the voiceName will reset the voiceLocale.
* @param outputFormat any {@link OUTPUT_FORMAT}
* @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
* @param metadataOptions (optional) {@link MetadataOptions}
*/
async setMetadata(voiceName: string, outputFormat: OUTPUT_FORMAT, voiceLocale?: string) {
async setMetadata(voiceName: string, outputFormat: OUTPUT_FORMAT, metadataOptions?: MetadataOptions): Promise<void> {
const oldVoice = this._voice
const oldVoiceLocale = this._voiceLocale
const oldOutputFormat = this._outputFormat
const oldOptions = JSON.stringify(this._metadataOptions)

this._voice = voiceName
this._voiceLocale = voiceLocale
if (!this._voiceLocale) {
if (!this._metadataOptions.voiceLocale || (!metadataOptions.voiceLocale && oldVoice !== this._voice)) {
const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice)
if (!voiceLangMatch) throw new Error("Could not infer voiceLocale from voiceName!")
this._voiceLocale = voiceLangMatch[0]
if (!voiceLangMatch) throw new Error("Could not infer voiceLocale from voiceName, and no voiceLocale was specified!")
this._metadataOptions.voiceLocale = voiceLangMatch[0]
}
this._outputFormat = outputFormat

Object.assign(this._metadataOptions, metadataOptions)

const changed = oldVoice !== this._voice
|| oldVoiceLocale !== this._voiceLocale
|| oldOutputFormat !== this._outputFormat
|| oldOptions !== JSON.stringify(this._metadataOptions)

// create new client
if (changed || this._ws.readyState !== this._ws.OPEN) {
this._startTime = Date.now()
await this._initClient()
if (!changed && this._ws.readyState === this._ws.OPEN) {
return
}

// create new client
this._startTime = Date.now()
await this._initClient()
}

private _metadataCheck() {
Expand All @@ -252,13 +271,16 @@ export class MsEdgeTTS {
/**
* Writes raw audio synthesised from text to a file. Uses a basic {@link _SSMLTemplate SML template}.
*
* @param path a valid output path, including a filename and file extension.
* @param dirPath a valid output directory path
* @param input the input to synthesise
* @param options (optional) {@link ProsodyOptions}
* @returns {Promise<string>} - a `Promise` with the full filepath
@returns {Promise<{audioFilePath: string, metadataFilePath: string}>} - a `Promise` with the full filepaths
*/
toFile(path: string, input: string, options?: ProsodyOptions): Promise<string> {
return this._rawSSMLRequestToFile(path, this._SSMLTemplate(input, options))
toFile(dirPath: string, input: string, options?: ProsodyOptions): Promise<{
audioFilePath: string,
metadataFilePath: string
}> {
return this._rawSSMLRequestToFile(dirPath, this._SSMLTemplate(input, options))
}

/**
Expand All @@ -276,12 +298,12 @@ export class MsEdgeTTS {
/**
* Writes raw audio synthesised from text to a file. Has no SSML template. Basic SSML should be provided in the request.
*
* @param path a valid output path, including a filename and file extension.
* @param dirPath a valid output directory path.
* @param requestSSML the SSML to send. SSML elements required in order to work.
* @returns {Promise<string>} - a `Promise` with the full filepath
* @returns {Promise<{audioFilePath: string, metadataFilePath: string}>} - a `Promise` with the full filepaths
*/
rawToFile(path: string, requestSSML: string): Promise<string> {
return this._rawSSMLRequestToFile(path, requestSSML)
rawToFile(dirPath: string, requestSSML: string): Promise<{ audioFilePath: string, metadataFilePath: string }> {
return this._rawSSMLRequestToFile(dirPath, requestSSML)
}

/**
Expand All @@ -295,29 +317,55 @@ export class MsEdgeTTS {
return audioStream
}

private _rawSSMLRequestToFile(path: string, requestSSML: string): Promise<string> {
return new Promise(async (resolve, reject) => {
const {audioStream, requestId} = this._rawSSMLRequest(requestSSML)

const writableFile = audioStream.pipe(fs.createWriteStream(path))

writableFile.once("close", async () => {
if (writableFile.bytesWritten > 0) {
resolve(path)
} else {
fs.unlinkSync(path)
reject("No audio data received")
}
})

audioStream.on("error", (e) => {
audioStream.destroy()
reject(e)
})
})
/**
 * Synthesises `requestSSML` and writes the audio and the boundary metadata to files in `dirPath`.
 *
 * @param dirPath a valid output directory path
 * @param requestSSML the SSML to send; SSML elements are required in order to work
 * @returns a `Promise` with the full filepaths of the audio file and the metadata JSON file
 * @throws when no audio data is received, a stream errors, or a metadata chunk is not valid JSON
 */
private async _rawSSMLRequestToFile(dirPath: string, requestSSML: string): Promise<{
    audioFilePath: string,
    metadataFilePath: string
}> {
    const {audioStream, metadataStream} = this._rawSSMLRequest(requestSSML)

    // TODO(WIP): output filenames are hard-coded; derive them from the output format instead.
    const audioFilePath = dirPath + "/example_audio.webm"
    const metadataFilePath = dirPath + "/example_metadata.json"

    try {
        await Promise.all([
            new Promise<void>((resolve, reject) => {
                const writableAudioFile = audioStream.pipe(fs.createWriteStream(audioFilePath))
                writableAudioFile.once("close", () => {
                    if (writableAudioFile.bytesWritten > 0) {
                        resolve()
                    } else {
                        // remove the empty file so callers don't see a 0-byte artifact
                        if (fs.existsSync(audioFilePath)) fs.unlinkSync(audioFilePath)
                        reject(new Error("No audio data received"))
                    }
                })
                // listen on the AUDIO stream for errors (previously wrongly attached to metadataStream)
                audioStream.once("error", reject)
            }),
            new Promise<void>((resolve, reject) => {
                // collect and combine all MetaData root elements from the incoming JSON chunks
                const metadataItems: unknown[] = []
                metadataStream.on("data", (chunk: Buffer) => {
                    try {
                        const chunkObj = JSON.parse(chunk.toString())
                        // .Metadata is an array of objects, just combine them
                        metadataItems.push(...chunkObj["Metadata"])
                    } catch (e) {
                        // malformed metadata chunk — fail the whole request
                        reject(e)
                    }
                })
                metadataStream.on("close", () => {
                    fs.writeFileSync(metadataFilePath, JSON.stringify(metadataItems, null, 2))
                    resolve()
                })
                metadataStream.once("error", reject)
            }),
        ])
        return {audioFilePath, metadataFilePath}
    } catch (e) {
        audioStream.destroy()
        metadataStream.destroy()
        throw e
    }
}

private _rawSSMLRequest(requestSSML: string): { audioStream: Readable, metadataStream: Readable, requestId: string } {
private _rawSSMLRequest(requestSSML: string): {
audioStream: Readable,
metadataStream: Readable,
requestId: string
} {
this._metadataCheck()

const requestId = randomBytes(16).toString("hex")
Expand All @@ -336,6 +384,11 @@ export class MsEdgeTTS {
read() {
},
})

audioStream.on("error", (e) => {
audioStream.destroy()
metadataStream.destroy()
})
audioStream.once("close", () => {
audioStream.destroy()
metadataStream.destroy()
Expand Down

0 comments on commit df3f786

Please sign in to comment.