From c060389d457da4474dea10c61d8d94249a17cb80 Mon Sep 17 00:00:00 2001
From: Bryan Anderson <bryan@play.ht>
Date: Thu, 9 Jan 2025 02:33:05 +0000
Subject: [PATCH] Explicitly specify protocol (http, ws, grpc) when calling
 tts(); include backward compatibility

---
 README.md            |  31 ++++-----
 pyht/async_client.py |  27 ++++----
 pyht/client.py       |  44 ++++++------
 pyht/utils.py        | 159 ++++++++++++++++++++++++++++++++++++-------
 4 files changed, 188 insertions(+), 73 deletions(-)

diff --git a/README.md b/README.md
index ee3ff23..9521108 100644
--- a/README.md
+++ b/README.md
@@ -79,17 +79,14 @@ The `tts` method takes the following arguments:
 - `text`: The text to be converted to speech; a string or list of strings.
 - `options`: The options to use for the TTS request; a `TTSOptions` object [(see below)](#ttsoptions).
 - `voice_engine`: The voice engine to use for the TTS request; a string (default `Play3.0-mini-http`).
-    - `PlayDialog-*`: Our large, expressive English model, which also supports multi-turn two-speaker dialogues.
-        - `PlayDialog-http`: Streaming and non-streaming audio over HTTP.
-        - `PlayDialog-ws`: Streaming audio over WebSockets.
-    - `PlayDialogMultilingual-*`: Our large, expressive multilingual model, which also supports multi-turn two-speaker dialogues.
-        - `PlayDialogMultilingual-http`: Streaming and non-streaming audio over HTTP.
-        - `PlayDialogMultilingual-ws`: Streaming audio over WebSockets.
-    - `Play3.0-mini-*`: Our small, fast multilingual model.
-        - `Play3.0-mini-http`: Streaming and non-streaming audio over HTTP.
-        - `Play3.0-mini-ws`: Streaming audio over WebSockets.
-        - `Play3.0-mini-grpc`: Streaming audio over gRPC. NOTE: This voice engine is ONLY available for Play On-Prem customers.
-    - `PlayHT2.0-turbo`: Our legacy English-only model, streaming audio over gRPC.
+    - `PlayDialog`: Our large, expressive English model, which also supports multi-turn two-speaker dialogues.
+    - `PlayDialogMultilingual`: Our large, expressive multilingual model, which also supports multi-turn two-speaker dialogues.
+    - `Play3.0-mini`: Our small, fast multilingual model.
+    - `PlayHT2.0-turbo`: Our legacy English-only model.
+- `protocol`: The protocol to use to communicate with the Play API; a string (default `http`, except for `PlayHT2.0-turbo`, which defaults to `grpc`).
+    - `http`: Streaming and non-streaming audio over HTTP (supports `Play3.0-mini`, `PlayDialog`, and `PlayDialogMultilingual`).
+    - `ws`: Streaming audio over WebSockets (supports `Play3.0-mini`, `PlayDialog`, and `PlayDialogMultilingual`).
+    - `grpc`: Streaming audio over gRPC (supports `PlayHT2.0-turbo` for all customers, and `Play3.0-mini` ONLY for Play On-Prem customers).
 - `streaming`: Whether or not to stream the audio in chunks (default True); non-streaming is only enabled for HTTP endpoints.
 
 ### TTSOptions
@@ -117,12 +114,12 @@ The `TTSOptions` class is used to specify the options for the TTS request. It ha
 - The following options are inference-time hyperparameters of the text-to-speech model; if unset, the model will use default values chosen by Play.
     - `temperature` (all models): The temperature of the model, a float.
     - `top_p` (all models): The top_p of the model, a float.
-    - `text_guidance` (`Play3.0-mini-*` and `PlayHT2.0-turbo` only): The text_guidance of the model, a float.
-    - `voice_guidance` (`Play3.0-mini-*` and `PlayHT2.0-turbo` only): The voice_guidance of the model, a float.
-    - `style_guidance` (`Play3.0-mini-*` only): The style_guidance of the model, a float.
-    - `repetition_penalty` (`Play3.0-mini-*` and `PlayHT2.0-turbo` only): The repetition_penalty of the model, a float.
+    - `text_guidance` (`Play3.0-mini` and `PlayHT2.0-turbo` only): The text_guidance of the model, a float.
+    - `voice_guidance` (`Play3.0-mini` and `PlayHT2.0-turbo` only): The voice_guidance of the model, a float.
+    - `style_guidance` (`Play3.0-mini` only): The style_guidance of the model, a float.
+    - `repetition_penalty` (`Play3.0-mini` and `PlayHT2.0-turbo` only): The repetition_penalty of the model, a float.
 - `disable_stabilization` (`PlayHT2.0-turbo` only): Disable the audio stabilization process, a boolean (default `False`).
-- `language` (`Play3.0-*` and `PlayDialogMultilingual-*` only): The language of the text to be spoken, a `Language` enum value or `None` (default `ENGLISH`).
+- `language` (`Play3.0-mini` and `PlayDialogMultilingual` only): The language of the text to be spoken, a `Language` enum value or `None` (default `ENGLISH`).
     - `AFRIKAANS`
     - `ALBANIAN`
     - `AMHARIC`
@@ -160,7 +157,7 @@ The `TTSOptions` class is used to specify the options for the TTS request. It ha
     - `UKRAINIAN`
     - `URDU`
     - `XHOSA`
-- The following options are additional inference-time hyperparameters which only apply to the `PlayDialog-*` and `PlayDialogMultilingual-*` models; if unset, the model will use default values chosen by Play.
+- The following options are additional inference-time hyperparameters which only apply to the `PlayDialog` and `PlayDialogMultilingual` models; if unset, the model will use default values chosen by Play.
     - `voice_2` (multi-turn dialogue only): The second voice to use for a multi-turn TTS request; a string.
         - A URL pointing to a Play voice manifest file.
     - `turn_prefix` (multi-turn dialogue only): The prefix for the first speaker's turns in a multi-turn TTS request; a string.
diff --git a/pyht/async_client.py b/pyht/async_client.py
index 72a02e7..444f0ab 100644
--- a/pyht/async_client.py
+++ b/pyht/async_client.py
@@ -230,6 +230,8 @@ async def stream_tts_input(
         text_stream: Union[AsyncGenerator[str, None], AsyncIterable[str]],
         options: TTSOptions,
         voice_engine: Optional[str] = None,
+        protocol: Optional[str] = None,
+        streaming: bool = True
     ):
         """Stream input to Play via the text_stream object."""
         buffer = io.StringIO()
@@ -239,12 +241,12 @@ async def stream_tts_input(
             buffer.write(" ")  # normalize word spacing.
             if SENTENCE_END_REGEX.match(t) is None:
                 continue
-            async for data in self.tts(buffer.getvalue(), options, voice_engine):
+            async for data in self.tts(buffer.getvalue(), options, voice_engine, protocol, streaming):
                 yield data
             buffer = io.StringIO()
         # If text_stream closes, send all remaining text, regardless of sentence structure.
         if buffer.tell() > 0:
-            async for data in self.tts(buffer.getvalue(), options, voice_engine):
+            async for data in self.tts(buffer.getvalue(), options, voice_engine, protocol, streaming):
                 yield data
 
     def tts(
@@ -252,24 +254,23 @@ def tts(
         text: Union[str, list[str]],
         options: TTSOptions,
         voice_engine: Optional[str] = None,
+        protocol: Optional[str] = None,
         streaming: bool = True
     ) -> AsyncIterable[bytes]:
         metrics = self._telemetry.start("tts-request")
         try:
-            voice_engine, protocol = get_voice_engine_and_protocol(voice_engine)
+            voice_engine, protocol = get_voice_engine_and_protocol(voice_engine, protocol)
 
             if protocol == "http":
                 return self._tts_http(text, options, voice_engine, metrics, streaming)
             elif protocol == "ws":
-                if streaming:
-                    return self._tts_ws(text, options, voice_engine, metrics)
-                else:
+                if not streaming:
                     raise ValueError("Non-streaming is not supported for WebSocket API")
+                return self._tts_ws(text, options, voice_engine, metrics)
             elif protocol == "grpc":
-                if streaming:
-                    return self._tts_grpc(text, options, voice_engine, metrics)
-                else:
+                if not streaming:
                     raise ValueError("Non-streaming is not supported for gRPC API")
+                return self._tts_grpc(text, options, voice_engine, metrics)
             else:
                 raise ValueError(f"Unknown protocol {protocol}")
         except Exception as e:
@@ -489,7 +490,8 @@ async def _tts_ws(
     def get_stream_pair(
         self,
         options: TTSOptions,
-        voice_engine: Optional[str] = None
+        voice_engine: Optional[str] = None,
+        protocol: Optional[str] = None
     ) -> tuple['_InputStream', '_OutputStream']:
         """Get a linked pair of (input, output) streams.
 
@@ -498,7 +500,7 @@ def get_stream_pair(
         """
         shared_q = asyncio.Queue()
         return (
-            _InputStream(self, options, shared_q, voice_engine),
+            _InputStream(self, options, shared_q, voice_engine, protocol),
             _OutputStream(shared_q)
         )
 
@@ -587,11 +589,12 @@ def __init__(
         options: TTSOptions,
         q: asyncio.Queue[Optional[bytes]],
         voice_engine: Optional[str],
+        protocol: Optional[str] = None
     ):
         self._input = TextStream()
 
         async def listen():
-            async for output in client.stream_tts_input(self._input, options, voice_engine):
+            async for output in client.stream_tts_input(self._input, options, voice_engine, protocol):
                 await q.put(output)
             await q.put(None)
 
diff --git a/pyht/client.py b/pyht/client.py
index ec54ffc..0f8cb42 100644
--- a/pyht/client.py
+++ b/pyht/client.py
@@ -64,7 +64,7 @@ class HTTPFormat(Enum):
     FORMAT_PCM = "pcm"
 
 
-# PlayDialog-* and PlayDialogMultilingual-* only
+# PlayDialog and PlayDialogMultilingual only
 class CandidateRankingMethod(Enum):
     # non-streaming only
     DescriptionASRWithMeanProbRank = "description_asr_with_mean_prob"
@@ -185,21 +185,21 @@ class TTSOptions:
     temperature: Optional[float] = None
     top_p: Optional[float] = None
 
-    # only applies to Play3.0-* and PlayHT2.0-turbo
+    # only apply to Play3.0 and PlayHT2.0-turbo
     text_guidance: Optional[float] = None
     voice_guidance: Optional[float] = None
     repetition_penalty: Optional[float] = None
 
-    # only applies to Play3.0-*
+    # only applies to Play3.0
     style_guidance: Optional[float] = None
 
-    # only applies to PlayHT2.0-*
+    # only applies to PlayHT2.0
     disable_stabilization: Optional[bool] = None
 
-    # only applies to Play3.0-* and PlayDialogMultilingual-*
+    # only applies to Play3.0 and PlayDialogMultilingual
     language: Optional[Language] = None
 
-    # only applies to PlayDialog-* and PlayDialogMultilingual-*
+    # only apply to PlayDialog and PlayDialogMultilingual
     # leave the _2 params None if generating single-speaker audio
     voice_2: Optional[str] = None
     turn_prefix: Optional[str] = None
@@ -293,7 +293,7 @@ def http_prepare_dict(text: List[str], options: TTSOptions, voice_engine: str) -
         "language": options.language.value if options.language is not None else None,
         "version": version,
 
-        # PlayDialog-* and PlayDialogMultilingual-*
+        # PlayDialog and PlayDialogMultilingual
         # leave the _2 params None if generating single-speaker audio
         "voice_2": options.voice_2,
         "turn_prefix": options.turn_prefix,
@@ -506,7 +506,9 @@ def stream_tts_input(
         self,
         text_stream: Union[Generator[str, None, None], Iterable[str]],
         options: TTSOptions,
-        voice_engine: Optional[str] = None
+        voice_engine: Optional[str] = None,
+        protocol: Optional[str] = None,
+        streaming: bool = True
     ) -> Iterable[bytes]:
         """Stream input to Play.ht via the text_stream object."""
         buffer = io.StringIO()
@@ -516,35 +518,34 @@ def stream_tts_input(
             buffer.write(" ")  # normalize word spacing.
             if SENTENCE_END_REGEX.match(t) is None:
                 continue
-            yield from self.tts(buffer.getvalue(), options, voice_engine)
+            yield from self.tts(buffer.getvalue(), options, voice_engine, protocol, streaming)
             buffer = io.StringIO()
         # If text_stream closes, send all remaining text, regardless of sentence structure.
         if buffer.tell() > 0:
-            yield from self.tts(buffer.getvalue(), options, voice_engine)
+            yield from self.tts(buffer.getvalue(), options, voice_engine, protocol, streaming)
 
     def tts(
             self,
             text: Union[str, List[str]],
             options: TTSOptions,
             voice_engine: Optional[str] = None,
+            protocol: Optional[str] = None,
             streaming: bool = True
     ) -> Iterable[bytes]:
         metrics = self._telemetry.start("tts-request")
         try:
-            voice_engine, protocol = get_voice_engine_and_protocol(voice_engine)
+            voice_engine, protocol = get_voice_engine_and_protocol(voice_engine, protocol)
 
             if protocol == "http":
                 return self._tts_http(text, options, voice_engine, metrics, streaming)
             elif protocol == "ws":
-                if streaming:
-                    return self._tts_ws(text, options, voice_engine, metrics)
-                else:
+                if not streaming:
                     raise ValueError("Non-streaming is not supported for WebSocket API")
+                return self._tts_ws(text, options, voice_engine, metrics)
             elif protocol == "grpc":
-                if streaming:
-                    return self._tts_grpc(text, options, voice_engine, metrics)
-                else:
+                if not streaming:
                     raise ValueError("Non-streaming is not supported for gRPC API")
+                return self._tts_grpc(text, options, voice_engine, metrics)
             else:
                 raise ValueError(f"Unknown protocol {protocol}")
         except Exception as e:
@@ -757,7 +758,8 @@ def _tts_ws(
     def get_stream_pair(
         self,
         options: TTSOptions,
-        voice_engine: Optional[str] = None
+        voice_engine: Optional[str] = None,
+        protocol: Optional[str] = None
     ) -> Tuple['_InputStream', '_OutputStream']:
         """Get a linked pair of (input, output) streams.
 
@@ -765,7 +767,7 @@ def get_stream_pair(
         """
         shared_q = queue.Queue()
         return (
-            _InputStream(self, options, shared_q, voice_engine),
+            _InputStream(self, options, shared_q, voice_engine, protocol),
             _OutputStream(shared_q)
         )
 
@@ -818,11 +820,11 @@ class _InputStream:
        input_stream.done()
     """
     def __init__(self, client: Client, options: TTSOptions, q: queue.Queue[Optional[bytes]],
-                 voice_engine: Optional[str]):
+                 voice_engine: Optional[str], protocol: Optional[str] = None):
         self._input = TextStream()
 
         def listen():
-            for output in client.stream_tts_input(self._input, options, voice_engine):
+            for output in client.stream_tts_input(self._input, options, voice_engine, protocol):
                 q.put(output)
             q.put(None)
 
diff --git a/pyht/utils.py b/pyht/utils.py
index 02d1d7b..fc9778c 100644
--- a/pyht/utils.py
+++ b/pyht/utils.py
@@ -15,30 +15,143 @@ def prepare_text(text: Union[str, List[str]], remove_ssml_tags: bool = True) ->
     return text
 
 
-def get_voice_engine_and_protocol(voice_engine: Optional[str]) -> Tuple[str, str]:
-    if voice_engine is None:
-        logging.warning("No voice engine specified; using Play3.0-mini-http")
-        voice_engine = "Play3.0-mini"
-        protocol = "http"
+def _convert_deprecated_voice_engine(voice_engine: str, protocol: Optional[str]) -> Tuple[str, str]:
+    """Split a deprecated '<engine>-<protocol>' name; raise ValueError if `protocol` disagrees with the suffix."""
+    _voice_engine, _protocol = voice_engine.rsplit("-", 1)
+    if protocol and protocol != _protocol:
+        raise ValueError(f"Got voice engine of deprecated format {voice_engine} "
+                         f"as well as mismatched protocol {protocol}.")
+    logging.warning(f"Voice engine {_voice_engine}-{_protocol} is deprecated; "
+                    f"separately pass voice_engine='{_voice_engine}' and protocol='{_protocol}'.")
+    return _voice_engine, _protocol
+
+
+def get_voice_engine_and_protocol(voice_engine: Optional[str], protocol: Optional[str]) -> Tuple[str, str]:
+    """Resolve a (voice_engine, protocol) pair: fill in defaults for missing values, accept deprecated
+    '<engine>-<protocol>' engine names, and raise ValueError for unknown or incompatible combinations."""
+    if protocol and protocol not in ["http", "ws", "grpc"]:
+        raise ValueError(f"Invalid protocol: {protocol} (must be http, ws, or grpc).")
+
+    if not voice_engine:
+        if not protocol:
+            logging.warning("No voice engine or protocol specified; using Play3.0-mini-http.")
+            voice_engine = "Play3.0-mini"
+            protocol = "http"
+        elif protocol in ["http", "ws"]:
+            logging.warning(f"No voice engine specified and protocol is {protocol}; using Play3.0-mini-{protocol}.")
+            voice_engine = "Play3.0-mini"
+        elif protocol == "grpc":
+            logging.warning("No voice engine specified and protocol is grpc; using PlayHT2.0-turbo.")
+            voice_engine = "PlayHT2.0-turbo"
+        else:
+            raise ValueError(f"No voice engine specified and invalid protocol {protocol} (must be http, ws, or grpc).")
+
     elif voice_engine == "PlayHT2.0-turbo":
-        protocol = "grpc"
-    elif voice_engine == "Play3.0":
-        logging.warning("Voice engine Play3.0 is deprecated; use Play3.0-mini-http or Play3.0-mini-ws instead.")
-        logging.warning("No protocol specified; using HTTP (if not desired, append '-ws' to the voice engine)")
-        voice_engine = "Play3.0-mini"
-        protocol = "http"
-    elif voice_engine == "Play3.0-http":
-        logging.warning("Voice engine Play3.0-http is deprecated; use Play3.0-mini-http instead.")
-        voice_engine = "Play3.0-mini"
-        protocol = "http"
-    elif voice_engine == "Play3.0-ws":
-        logging.warning("Voice engine Play3.0-ws is deprecated; use Play3.0-mini-ws instead.")
-        voice_engine = "Play3.0-mini"
-        protocol = "ws"
-    elif voice_engine == "Play3.0-mini" or voice_engine == "PlayDialog" or voice_engine == "PlayDialogMultilingual":
-        logging.warning("No protocol specified; using HTTP (if not desired, append '-ws' to the voice engine)")
-        protocol = "http"
+        if not protocol:
+            protocol = "grpc"
+        if protocol != "grpc":
+            raise ValueError(f"Voice engine PlayHT2.0-turbo does not support protocol {protocol} (must be grpc).")
+
+    elif voice_engine in ["Play3.0-mini", "Play3.0-mini-http", "Play3.0-mini-ws", "Play3.0-mini-grpc",
+                          "Play3.0", "Play3.0-http", "Play3.0-ws", "Play3.0-grpc"]:
+        if "mini" not in voice_engine:
+            logging.warning("Voice engine Play3.0 is deprecated; use Play3.0-mini.")
+            voice_engine = voice_engine.replace("Play3.0", "Play3.0-mini")
+        if voice_engine == "Play3.0-mini":
+            if not protocol:
+                logging.warning("No protocol specified; using http")
+                protocol = "http"
+            if protocol not in ["http", "ws", "grpc"]:
+                raise ValueError(f"Voice engine Play3.0-mini does not support protocol {protocol} "
+                                 "(must be http, ws, or grpc [grpc for on-prem customers only]).")
+        else:
+            voice_engine, protocol = _convert_deprecated_voice_engine(voice_engine, protocol)
+
+    elif voice_engine in ["PlayDialog", "PlayDialog-http", "PlayDialog-ws", "PlayDialogMultilingual",
+                          "PlayDialogMultilingual-http", "PlayDialogMultilingual-ws"]:
+        if voice_engine in ["PlayDialog", "PlayDialogMultilingual"]:
+            if not protocol:
+                logging.warning("No protocol specified; using http")
+                protocol = "http"
+            if protocol not in ["http", "ws"]:
+                raise ValueError(f"Voice engine {voice_engine} does not support protocol {protocol} "
+                                 "(must be http or ws).")
+        else:
+            voice_engine, protocol = _convert_deprecated_voice_engine(voice_engine, protocol)
+
     else:
-        voice_engine, protocol = voice_engine.rsplit("-", 1)
+        raise ValueError(f"Invalid voice engine: {voice_engine} (must be Play3.0-mini, PlayDialog, "
+                         "PlayDialogMultilingual, or PlayHT2.0-turbo).")
 
     return voice_engine, protocol
+
+
+def main():  # Self-test of get_voice_engine_and_protocol; note asserts are stripped under `python -O`.
+    assert get_voice_engine_and_protocol(None, "http") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("", "http") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol(None, "ws") == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol("", "ws") == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol(None, None) == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("", None) == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol(None, "") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("", "") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-mini", "http") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-mini", "ws") == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol("Play3.0-mini", "grpc") == ("Play3.0-mini", "grpc")
+    assert get_voice_engine_and_protocol("Play3.0-mini", None) == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-mini", "") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-mini-http", "http") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-mini-http", None) == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-mini-http", "") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-mini-ws", "ws") == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol("Play3.0-mini-ws", None) == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol("Play3.0-mini-ws", "") == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol("Play3.0-mini-grpc", "grpc") == ("Play3.0-mini", "grpc")
+    assert get_voice_engine_and_protocol("Play3.0-mini-grpc", None) == ("Play3.0-mini", "grpc")
+    assert get_voice_engine_and_protocol("Play3.0-mini-grpc", "") == ("Play3.0-mini", "grpc")
+    assert get_voice_engine_and_protocol("Play3.0", "http") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0", "ws") == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol("Play3.0", "grpc") == ("Play3.0-mini", "grpc")
+    assert get_voice_engine_and_protocol("Play3.0", None) == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0", "") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-http", "http") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-http", None) == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-http", "") == ("Play3.0-mini", "http")
+    assert get_voice_engine_and_protocol("Play3.0-ws", "ws") == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol("Play3.0-ws", None) == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol("Play3.0-ws", "") == ("Play3.0-mini", "ws")
+    assert get_voice_engine_and_protocol("Play3.0-grpc", "grpc") == ("Play3.0-mini", "grpc")
+    assert get_voice_engine_and_protocol("Play3.0-grpc", None) == ("Play3.0-mini", "grpc")
+    assert get_voice_engine_and_protocol("Play3.0-grpc", "") == ("Play3.0-mini", "grpc")
+    # PlayDialog: bare name defaults to http; "-http"/"-ws" suffixed names are split into (engine, protocol).
+    assert get_voice_engine_and_protocol("PlayDialog", "http") == ("PlayDialog", "http")
+    assert get_voice_engine_and_protocol("PlayDialog", "ws") == ("PlayDialog", "ws")
+    assert get_voice_engine_and_protocol("PlayDialog", None) == ("PlayDialog", "http")
+    assert get_voice_engine_and_protocol("PlayDialog", "") == ("PlayDialog", "http")
+    assert get_voice_engine_and_protocol("PlayDialog-http", "http") == ("PlayDialog", "http")
+    assert get_voice_engine_and_protocol("PlayDialog-http", None) == ("PlayDialog", "http")
+    assert get_voice_engine_and_protocol("PlayDialog-http", "") == ("PlayDialog", "http")
+    assert get_voice_engine_and_protocol("PlayDialog-ws", "ws") == ("PlayDialog", "ws")
+    assert get_voice_engine_and_protocol("PlayDialog-ws", None) == ("PlayDialog", "ws")
+    assert get_voice_engine_and_protocol("PlayDialog-ws", "") == ("PlayDialog", "ws")
+    # PlayDialogMultilingual: same expectations as the PlayDialog cases.
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual", "http") == ("PlayDialogMultilingual", "http")
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual", "ws") == ("PlayDialogMultilingual", "ws")
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual", None) == ("PlayDialogMultilingual", "http")
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual", "") == ("PlayDialogMultilingual", "http")
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual-http", "http") == ("PlayDialogMultilingual", "http")
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual-http", None) == ("PlayDialogMultilingual", "http")
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual-http", "") == ("PlayDialogMultilingual", "http")
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual-ws", "ws") == ("PlayDialogMultilingual", "ws")
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual-ws", None) == ("PlayDialogMultilingual", "ws")
+    assert get_voice_engine_and_protocol("PlayDialogMultilingual-ws", "") == ("PlayDialogMultilingual", "ws")
+    # gRPC defaults: no engine + grpc selects PlayHT2.0-turbo; PlayHT2.0-turbo with no protocol selects grpc.
+    assert get_voice_engine_and_protocol(None, "grpc") == ("PlayHT2.0-turbo", "grpc")
+    assert get_voice_engine_and_protocol("", "grpc") == ("PlayHT2.0-turbo", "grpc")
+    assert get_voice_engine_and_protocol("PlayHT2.0-turbo", "grpc") == ("PlayHT2.0-turbo", "grpc")
+    assert get_voice_engine_and_protocol("PlayHT2.0-turbo", None) == ("PlayHT2.0-turbo", "grpc")
+    assert get_voice_engine_and_protocol("PlayHT2.0-turbo", "") == ("PlayHT2.0-turbo", "grpc")
+
+
+if __name__ == "__main__":  # run the self-test only when this module is executed as a script
+    main()