[feature] [CUDA solver] Add multi-GPU and ask for CUDA during btcli run (#893)

* added cuda solver

* boost versions to fix pip error

* allow choosing device id

* fix solution check to use keccak

* adds params for cuda and dev_id to register

* list devices by name during selection

* add block number logging

* fix calculation of hashrate

* fix update interval default

* add --TPB arg to register

* add update_interval flag

* switch back to old looping/work structure

* change typing

* device count is a function

* stop early if wallet registered

* add update interval and num proc flag

* add better number output

* optimize multiproc cpu reg
keeping proc until solution

* fix test

* change import to cubit

* fix import and default

* up default
should have default in CLI call

* add comments about params

* fix config var access

* add cubit as extra

* handle stale pow differently
check registration after failure

* restrict number of processes for integration test

* fix stale check

* use wallet.is_registered instead

* attempt to fix test issue

* fix my test

* oops typo

* typo again ugh

* remove print out

* fix partly reg test

* fix if solution None

* fix test?

* fix patch

* add args for cuda to subtensor

* add cuda args to reregister call

* add to wallet register the cuda args

* fix refs and tests

* add for val test also

* fix tests with rereg

* fix patch for tests

* add mock_register to subtensor passed instead

* move register under the check for isregistered

* use patch obj instead

* fix patch object

* fix prompt

* remove unneeded if

* modify POW submit to use rolling submit again

* add backoff to block get from network

* add test for backoff get block

* suppress the dev id flag if not set

* remove dest so it uses first arg

* fix pow submit loop

* move registration status with

* fix max attempts check

* remove status in subtensor.register

* add submit status

* change to neuron get instead

* fix count

* try to patch live display

* fix patch

* .

* separate test cases

* add POWNotStale and tests

* add more test cases for block get with retry

* fix return to None

* fix arg order

* fix indent

* add test to verify solution is submitted

* fix mock call

* patch hex bytes instead

* typo :/

* fix print out for unstake

* fix indexing into mock call

* call indexing

* access dict not with dot

* fix other indent

* add CUDAException for cubit

* up cubit version

* [Feature] ask cuda during btcli run (#890)

* add ask for cuda reg config in btcli run

* suppress unset arg

* [Feature] [cuda solver] multi gpu (#891)

* change diff display out

* remove logging

* check cubit support in the check config

* allow 1 or more devices in flag

* cuda flag should be suppress

* modify how cpu count is found

* make a solver base class

* add a solverbase for CUDA

* use multi process kernel launching, one per GPU

* move check under dot get accessor

* Feature/cuda solver multi gpu (#892)

* change diff display out

* remove logging

* check cubit support in the check config

* allow 1 or more devices in flag

* cuda flag should be suppress

* modify how cpu count is found

* make a solver base class

* add a solverbase for CUDA

* use multi process kernel launching, one per GPU

* move check under dot get accessor

* add All gpus specification

* continue trying reg after Stale

* catch for OSX

* don't use qsize

* add test for continue after being stale

* patch get_nowait instead of qsize
Cameron Fairchild authored Sep 9, 2022 · 1 parent 0afe907 · commit 4bfb69b
Showing 9 changed files with 476 additions and 186 deletions.
69 changes: 47 additions & 22 deletions bittensor/_cli/__init__.py
@@ -26,7 +26,7 @@

import bittensor
import torch
-from rich.prompt import Confirm, Prompt
+from rich.prompt import Confirm, Prompt, PromptBase

from . import cli_impl

@@ -823,6 +823,36 @@ def check_overview_config( config: 'bittensor.Config' ):
    wallet_name = Prompt.ask("Enter wallet name", default = bittensor.defaults.wallet.name)
    config.wallet.name = str(wallet_name)

+def _check_for_cuda_reg_config( config: 'bittensor.Config' ) -> None:
+    """Checks, when CUDA is available, if the user would like to register with their CUDA device."""
+    if torch.cuda.is_available():
+        if config.subtensor.register.cuda.get('use_cuda') is None:
+            # Ask about cuda registration only if a CUDA device is available.
+            cuda = Confirm.ask("Detected CUDA device, use CUDA for registration?\n")
+            config.subtensor.register.cuda.use_cuda = cuda
+
+        # Only ask about which CUDA device if the user has more than one CUDA device.
+        if config.subtensor.register.cuda.use_cuda and config.subtensor.register.cuda.get('dev_id') is None and torch.cuda.device_count() > 0:
+            devices: List[str] = [str(x) for x in range(torch.cuda.device_count())]
+            device_names: List[str] = [torch.cuda.get_device_name(x) for x in range(torch.cuda.device_count())]
+            console.print("Available CUDA devices:")
+            choices_str: str = ""
+            for i, device in enumerate(devices):
+                choices_str += (" {}: {}\n".format(device, device_names[i]))
+            console.print(choices_str)
+            dev_id = IntListPrompt.ask("Which GPU(s) would you like to use? Please list one, or comma-separated", choices=devices, default='All')
+            if dev_id == 'All':
+                dev_id = list(range(torch.cuda.device_count()))
+            else:
+                try:
+                    # Replace the commas with spaces, then split on whitespace,
+                    # then strip the whitespace and convert to ints.
+                    dev_id = [int(dev_id.strip()) for dev_id in dev_id.replace(',', ' ').split()]
+                except ValueError:
+                    console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str))
+                    sys.exit(1)
+            config.subtensor.register.cuda.dev_id = dev_id

def check_register_config( config: 'bittensor.Config' ):
    if config.subtensor.get('network') == bittensor.defaults.subtensor.network and not config.no_prompt:
        config.subtensor.network = Prompt.ask("Enter subtensor network", choices=bittensor.__networks__, default = bittensor.defaults.subtensor.network)
@@ -835,27 +865,8 @@ def check_register_config( config: 'bittensor.Config' ):
    hotkey = Prompt.ask("Enter hotkey name", default = bittensor.defaults.wallet.hotkey)
    config.wallet.hotkey = str(hotkey)

-    if not config.no_prompt and config.subtensor.register.cuda.use_cuda == bittensor.defaults.subtensor.register.cuda.use_cuda:
-        # Ask about cuda registration only if a CUDA device is available.
-        if torch.cuda.is_available():
-            cuda = Confirm.ask("Detected CUDA device, use CUDA for registration?\n")
-            config.subtensor.register.cuda.use_cuda = cuda
-            # Only ask about which CUDA device if the user has more than one CUDA device.
-            if cuda and config.subtensor.register.cuda.get('dev_id') is None and torch.cuda.device_count() > 0:
-                devices: List[str] = [str(x) for x in range(torch.cuda.device_count())]
-                device_names: List[str] = [torch.cuda.get_device_name(x) for x in range(torch.cuda.device_count())]
-                console.print("Available CUDA devices:")
-                choices_str: str = ""
-                for i, device in enumerate(devices):
-                    choices_str += (" {}: {}\n".format(device, device_names[i]))
-                console.print(choices_str)
-                dev_id = Prompt.ask("Which GPU would you like to use?", choices=devices, default=str(bittensor.defaults.subtensor.register.cuda.dev_id))
-                try:
-                    dev_id = int(dev_id)
-                except ValueError:
-                    console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str))
-                    sys.exit(1)
-                config.subtensor.register.cuda.dev_id = dev_id
+    if not config.no_prompt:
+        cli._check_for_cuda_reg_config(config)

def check_new_coldkey_config( config: 'bittensor.Config' ):
    if config.wallet.get('name') == bittensor.defaults.wallet.name and not config.no_prompt:
@@ -931,6 +942,10 @@ def check_run_config( config: 'bittensor.Config' ):
    if 'server' in config.model and not config.no_prompt:
        synapse = Prompt.ask('Enter synapse', choices = list(bittensor.synapse.__synapses_types__), default = 'All')
        config.synapse = synapse
+
+    # Don't need to ask about registration if they don't want to reregister the wallet.
+    if config.wallet.get('reregister', bittensor.defaults.wallet.reregister) and not config.no_prompt:
+        cli._check_for_cuda_reg_config(config)

def check_help_config( config: 'bittensor.Config'):
    if config.model == 'None':
@@ -941,3 +956,13 @@ def check_update_config( config: 'bittensor.Config'):
    if not config.no_prompt:
        answer = Prompt.ask('This will update the local bittensor package', choices = ['Y','N'], default = 'Y')
        config.answer = answer
+
+class IntListPrompt(PromptBase):
+    """ Prompt for a list of integers. """
+
+    def check_choice( self, value: str ) -> bool:
+        assert self.choices is not None
+        # check if value is a valid choice or all the values in a list of ints are valid choices
+        return value == "All" or \
+            value in self.choices or \
+            all( val.strip() in self.choices for val in value.replace(',', ' ').split() )
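Note: the device-selection parsing added above can be sketched standalone. In this hypothetical snippet, the parse_dev_id helper name and the device count of 4 are illustrative only (not part of the commit); the string handling mirrors the committed branch exactly.

def parse_dev_id(value: str, device_count: int) -> list:
    # 'All' expands to every available device index.
    if value == 'All':
        return list(range(device_count))
    # Otherwise replace commas with spaces, split on whitespace,
    # strip each token, and convert to ints.
    return [int(token.strip()) for token in value.replace(',', ' ').split()]

assert parse_dev_id('All', 4) == [0, 1, 2, 3]   # use every GPU
assert parse_dev_id('0, 2', 4) == [0, 2]        # comma-separated list
assert parse_dev_id('1 3', 4) == [1, 3]         # space-separated also accepted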
2 changes: 1 addition & 1 deletion bittensor/_cli/cli_impl.py
@@ -309,7 +309,7 @@ def unstake( self ):
    if not self.config.no_prompt:
        if not Confirm.ask("Do you want to unstake from the following keys:\n" + \
            "".join([
-                f"  [bold white]- {wallet.hotkey_str}: {amount.tao}𝜏[/bold white]\n" for wallet, amount in zip(final_wallets, final_amounts)
+                f"  [bold white]- {wallet.hotkey_str}: {amount}𝜏[/bold white]\n" for wallet, amount in zip(final_wallets, final_amounts)
            ])
        ):
            return None
33 changes: 20 additions & 13 deletions bittensor/_subtensor/__init__.py
@@ -15,22 +15,16 @@
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
import argparse
+import copy
import os

-import random
-import time
-import psutil
-import subprocess
-from sys import platform
-
import bittensor
-import copy
from loguru import logger
from substrateinterface import SubstrateInterface
+from torch.cuda import is_available as is_cuda_available

-from . import subtensor_impl
-from . import subtensor_mock
+from . import subtensor_impl, subtensor_mock

-from loguru import logger
logger = logger.opt(colors=True)

__type_registery__ = {
@@ -193,8 +187,9 @@ def add_args(cls, parser: argparse.ArgumentParser, prefix: str = None ):
        parser.add_argument('--' + prefix_str + 'subtensor.register.num_processes', '-n', dest='subtensor.register.num_processes', help="Number of processors to use for registration", type=int, default=bittensor.defaults.subtensor.register.num_processes)
        parser.add_argument('--' + prefix_str + 'subtensor.register.update_interval', '--' + prefix_str + 'subtensor.register.cuda.update_interval', '--' + prefix_str + 'cuda.update_interval', '-u', help="The number of nonces to process before checking for next block during registration", type=int, default=bittensor.defaults.subtensor.register.update_interval)
        # registration args. Used for register and re-register and anything that calls register.
-        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=bittensor.defaults.subtensor.register.cuda.use_cuda, help='''Set true to use CUDA.''', action='store_true', required=False )
-        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, default=argparse.SUPPRESS, help='''Set the CUDA device id. Goes by the order of speed. (i.e. 0 is the fastest).''', required=False )
+        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=argparse.SUPPRESS, help='''Set true to use CUDA.''', action='store_true', required=False )
+        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, nargs='+', default=argparse.SUPPRESS, help='''Set the CUDA device id(s). Goes by the order of speed. (i.e. 0 is the fastest).''', required=False )
+
        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.TPB', '--' + prefix_str + 'cuda.TPB', type=int, default=bittensor.defaults.subtensor.register.cuda.TPB, help='''Set the number of Threads Per Block for CUDA.''', required=False )

    except argparse.ArgumentError:
@@ -215,14 +210,26 @@ def add_defaults(cls, defaults ):
    defaults.subtensor.register.update_interval = os.getenv('BT_SUBTENSOR_REGISTER_UPDATE_INTERVAL') if os.getenv('BT_SUBTENSOR_REGISTER_UPDATE_INTERVAL') != None else 50_000

    defaults.subtensor.register.cuda = bittensor.Config()
-    defaults.subtensor.register.cuda.dev_id = 0
+    defaults.subtensor.register.cuda.dev_id = [0]
    defaults.subtensor.register.cuda.use_cuda = False
    defaults.subtensor.register.cuda.TPB = 256

@staticmethod
def check_config( config: 'bittensor.Config' ):
    assert config.subtensor
    #assert config.subtensor.network != None
+    if config.subtensor.get('register') and config.subtensor.register.get('cuda'):
+        assert all((isinstance(x, int) or isinstance(x, str) and x.isnumeric() ) for x in config.subtensor.register.cuda.get('dev_id', []))
+
+        if config.subtensor.register.cuda.get('use_cuda', False):
+            try:
+                import cubit
+            except ImportError:
+                raise ImportError('CUDA registration is enabled but cubit is not installed. Please install cubit.')
+
+            if not is_cuda_available():
+                raise RuntimeError('CUDA registration is enabled but no CUDA devices are detected.')

@staticmethod
def determine_chain_endpoint(network: str):
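Note: taken together, the new arguments let a user opt into CUDA registration and pin one or more devices from the command line. A hypothetical invocation (exact subcommand syntax depends on the bittensor release) might look like:

btcli register --cuda --cuda.dev_id 0 1 --cuda.TPB 256

which maps to config.subtensor.register.cuda.use_cuda = True and dev_id = [0, 1]. If CUDA registration is requested without cubit installed, the new check_config raises an ImportError up front rather than failing mid-registration.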
(Diffs for the remaining 6 changed files are not shown here.)
