[feature] [CUDA solver] Add multi-GPU and ask for CUDA during btcli run (#893)

* added cuda solver

* boost versions to fix pip error

* allow choosing device id

* fix solution check to use keccak

* adds params for cuda and dev_id to register

* list devices by name during selection

* add block number logging

* fix calculation of hashrate

* fix update interval default

* add --TPB arg to register

* add update_interval flag

* switch back to old looping/work structure

* change typing

* device count is a function

* stop early if wallet registered

* add update interval and num proc flag

* add better number output

* optimize multiproc cpu reg
keeping proc until solution

* fix test

* change import to cubit

* fix import and default

* up default
should have default in CLI call

* add comments about params

* fix config var access

* add cubit as extra

* handle stale pow differently
check registration after failure

* restrict number of processes for integration test

* fix stale check

* use wallet.is_registered instead

* attempt to fix test issue

* fix my test

* oops typo

* typo again ugh

* remove print out

* fix partly reg test

* fix if solution None

* fix test?

* fix patch

* add args for cuda to subtensor

* add cuda args to reregister call

* add to wallet register the cuda args

* fix refs and tests

* add for val test also

* fix tests with rereg

* fix patch for tests

* add mock_register to subtensor passed instead

* move register under the check for isregistered

* use patch obj instead

* fix patch object

* fix prompt

* remove unneeded if

* modify POW submit to use rolling submit again

* add backoff to block get from network

* add test for backoff get block

* suppress the dev id flag if not set

* remove dest so it uses first arg

* fix pow submit loop

* move registration status with

* fix max attempts check

* remove status in subtensor.register

* add submit status

* change to neuron get instead

* fix count

* try to patch live display

* fix patch

* .

* separate test cases

* add POWNotStale and tests

* add more test cases for block get with retry

* fix return to None

* fix arg order

* fix indent

* add test to verify solution is submitted

* fix mock call

* patch hex bytes instead

* typo :/

* fix print out for unstake

* fix indexing into mock call

* call indexing

* access dict not with dot

* fix other indent

* add CUDAException for cubit

* up cubit version

* [Feature] ask cuda during btcli run (#890)

* add ask for cuda reg config in btcli run

* suppress unset arg

* [Feature] [cuda solver] multi gpu (#891)

* change diff display out

* remove logging

* check cubit support in the check config

* allow 1 or more devices in flag

* cuda flag should be suppress

* modify how cpu count is found

* make a solver base class

* add a solverbase for CUDA

* use multi process kernel launching, one per GPU

* move check under dot get accessor

* Feature/cuda solver multi gpu (#892)

* change diff display out

* remove logging

* check cubit support in the check config

* allow 1 or more devices in flag

* cuda flag should be suppress

* modify how cpu count is found

* make a solver base class

* add a solverbase for CUDA

* use multi process kernel launching, one per GPU

* move check under dot get accessor

* add All gpus specification

* continue trying reg after Stale

* catch for OSX

* don't use qsize

* add test for continue after being stale

* patch get_nowait instead of qsize
Cameron Fairchild authored Sep 9, 2022 · 1 parent 0afe907 · commit 4bfb69b
Showing 9 changed files with 476 additions and 186 deletions.
69 changes: 47 additions & 22 deletions bittensor/_cli/__init__.py
@@ -26,7 +26,7 @@

import bittensor
import torch
-from rich.prompt import Confirm, Prompt
+from rich.prompt import Confirm, Prompt, PromptBase

from . import cli_impl

@@ -823,6 +823,36 @@ def check_overview_config( config: 'bittensor.Config' ):
    wallet_name = Prompt.ask("Enter wallet name", default = bittensor.defaults.wallet.name)
    config.wallet.name = str(wallet_name)

+def _check_for_cuda_reg_config( config: 'bittensor.Config' ) -> None:
+    """Checks, when CUDA is available, if the user would like to register with their CUDA device."""
+    if torch.cuda.is_available():
+        if config.subtensor.register.cuda.get('use_cuda') is None:
+            # Ask about cuda registration only if a CUDA device is available.
+            cuda = Confirm.ask("Detected CUDA device, use CUDA for registration?\n")
+            config.subtensor.register.cuda.use_cuda = cuda
+
+        # Only ask about which CUDA device if the user has more than one CUDA device.
+        if config.subtensor.register.cuda.use_cuda and config.subtensor.register.cuda.get('dev_id') is None and torch.cuda.device_count() > 0:
+            devices: List[str] = [str(x) for x in range(torch.cuda.device_count())]
+            device_names: List[str] = [torch.cuda.get_device_name(x) for x in range(torch.cuda.device_count())]
+            console.print("Available CUDA devices:")
+            choices_str: str = ""
+            for i, device in enumerate(devices):
+                choices_str += (" {}: {}\n".format(device, device_names[i]))
+            console.print(choices_str)
+            dev_id = IntListPrompt.ask("Which GPU(s) would you like to use? Please list one, or comma-separated", choices=devices, default='All')
+            if dev_id == 'All':
+                dev_id = list(range(torch.cuda.device_count()))
+            else:
+                try:
+                    # Replace the commas with spaces, then split on whitespace,
+                    # then strip the whitespace and convert to ints.
+                    dev_id = [int(dev_id.strip()) for dev_id in dev_id.replace(',', ' ').split()]
+                except ValueError:
+                    console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str))
+                    sys.exit(1)
+            config.subtensor.register.cuda.dev_id = dev_id

def check_register_config( config: 'bittensor.Config' ):
    if config.subtensor.get('network') == bittensor.defaults.subtensor.network and not config.no_prompt:
        config.subtensor.network = Prompt.ask("Enter subtensor network", choices=bittensor.__networks__, default = bittensor.defaults.subtensor.network)
@@ -835,27 +865,8 @@ def check_register_config( config: 'bittensor.Config' ):
    hotkey = Prompt.ask("Enter hotkey name", default = bittensor.defaults.wallet.hotkey)
    config.wallet.hotkey = str(hotkey)

-    if not config.no_prompt and config.subtensor.register.cuda.use_cuda == bittensor.defaults.subtensor.register.cuda.use_cuda:
-        # Ask about cuda registration only if a CUDA device is available.
-        if torch.cuda.is_available():
-            cuda = Confirm.ask("Detected CUDA device, use CUDA for registration?\n")
-            config.subtensor.register.cuda.use_cuda = cuda
-            # Only ask about which CUDA device if the user has more than one CUDA device.
-            if cuda and config.subtensor.register.cuda.get('dev_id') is None and torch.cuda.device_count() > 0:
-                devices: List[str] = [str(x) for x in range(torch.cuda.device_count())]
-                device_names: List[str] = [torch.cuda.get_device_name(x) for x in range(torch.cuda.device_count())]
-                console.print("Available CUDA devices:")
-                choices_str: str = ""
-                for i, device in enumerate(devices):
-                    choices_str += (" {}: {}\n".format(device, device_names[i]))
-                console.print(choices_str)
-                dev_id = Prompt.ask("Which GPU would you like to use?", choices=devices, default=str(bittensor.defaults.subtensor.register.cuda.dev_id))
-                try:
-                    dev_id = int(dev_id)
-                except ValueError:
-                    console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str))
-                    sys.exit(1)
-                config.subtensor.register.cuda.dev_id = dev_id
+    if not config.no_prompt:
+        cli._check_for_cuda_reg_config(config)

def check_new_coldkey_config( config: 'bittensor.Config' ):
    if config.wallet.get('name') == bittensor.defaults.wallet.name and not config.no_prompt:
@@ -931,6 +942,10 @@ def check_run_config( config: 'bittensor.Config' ):
    if 'server' in config.model and not config.no_prompt:
        synapse = Prompt.ask('Enter synapse', choices = list(bittensor.synapse.__synapses_types__), default = 'All')
        config.synapse = synapse
+
+    # Don't need to ask about registration if they don't want to reregister the wallet.
+    if config.wallet.get('reregister', bittensor.defaults.wallet.reregister) and not config.no_prompt:
+        cli._check_for_cuda_reg_config(config)

def check_help_config( config: 'bittensor.Config'):
    if config.model == 'None':
@@ -941,3 +956,13 @@ def check_update_config( config: 'bittensor.Config'):
    if not config.no_prompt:
        answer = Prompt.ask('This will update the local bittensor package', choices = ['Y','N'], default = 'Y')
        config.answer = answer
+
+class IntListPrompt(PromptBase):
+    """ Prompt for a list of integers. """
+
+    def check_choice( self, value: str ) -> bool:
+        assert self.choices is not None
+        # check if value is a valid choice or all the values in a list of ints are valid choices
+        return value == "All" or \
+            value in self.choices or \
+            all( val.strip() in self.choices for val in value.replace(',', ' ').split() )
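Note: the device-selection parsing added above can be sketched standalone. In this hypothetical snippet, the parse_dev_id helper name and the device count of 4 are illustrative only (not part of the commit); the string handling mirrors the committed branch exactly.

def parse_dev_id(value: str, device_count: int) -> list:
    # 'All' expands to every available device index.
    if value == 'All':
        return list(range(device_count))
    # Otherwise replace commas with spaces, split on whitespace,
    # strip each token, and convert to ints.
    return [int(token.strip()) for token in value.replace(',', ' ').split()]

assert parse_dev_id('All', 4) == [0, 1, 2, 3]   # use every GPU
assert parse_dev_id('0, 2', 4) == [0, 2]        # comma-separated list
assert parse_dev_id('1 3', 4) == [1, 3]         # space-separated also accepted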
2 changes: 1 addition & 1 deletion bittensor/_cli/cli_impl.py
@@ -309,7 +309,7 @@ def unstake( self ):
    if not self.config.no_prompt:
        if not Confirm.ask("Do you want to unstake from the following keys:\n" + \
            "".join([
-                f"  [bold white]- {wallet.hotkey_str}: {amount.tao}𝜏[/bold white]\n" for wallet, amount in zip(final_wallets, final_amounts)
+                f"  [bold white]- {wallet.hotkey_str}: {amount}𝜏[/bold white]\n" for wallet, amount in zip(final_wallets, final_amounts)
            ])
        ):
            return None
33 changes: 20 additions & 13 deletions bittensor/_subtensor/__init__.py
@@ -15,22 +15,16 @@
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
import argparse
+import copy
import os

-import random
-import time
-import psutil
-import subprocess
-from sys import platform
-
import bittensor
-import copy
from loguru import logger
from substrateinterface import SubstrateInterface
+from torch.cuda import is_available as is_cuda_available

-from . import subtensor_impl
-from . import subtensor_mock
+from . import subtensor_impl, subtensor_mock

-from loguru import logger
logger = logger.opt(colors=True)

__type_registery__ = {
@@ -193,8 +187,9 @@ def add_args(cls, parser: argparse.ArgumentParser, prefix: str = None ):
        parser.add_argument('--' + prefix_str + 'subtensor.register.num_processes', '-n', dest='subtensor.register.num_processes', help="Number of processors to use for registration", type=int, default=bittensor.defaults.subtensor.register.num_processes)
        parser.add_argument('--' + prefix_str + 'subtensor.register.update_interval', '--' + prefix_str + 'subtensor.register.cuda.update_interval', '--' + prefix_str + 'cuda.update_interval', '-u', help="The number of nonces to process before checking for next block during registration", type=int, default=bittensor.defaults.subtensor.register.update_interval)
        # registration args. Used for register and re-register and anything that calls register.
-        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=bittensor.defaults.subtensor.register.cuda.use_cuda, help='''Set true to use CUDA.''', action='store_true', required=False )
-        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, default=argparse.SUPPRESS, help='''Set the CUDA device id. Goes by the order of speed. (i.e. 0 is the fastest).''', required=False )
+        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=argparse.SUPPRESS, help='''Set true to use CUDA.''', action='store_true', required=False )
+        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, nargs='+', default=argparse.SUPPRESS, help='''Set the CUDA device id(s). Goes by the order of speed. (i.e. 0 is the fastest).''', required=False )
+
        parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.TPB', '--' + prefix_str + 'cuda.TPB', type=int, default=bittensor.defaults.subtensor.register.cuda.TPB, help='''Set the number of Threads Per Block for CUDA.''', required=False )

    except argparse.ArgumentError:
@@ -215,14 +210,26 @@ def add_defaults(cls, defaults ):
    defaults.subtensor.register.update_interval = os.getenv('BT_SUBTENSOR_REGISTER_UPDATE_INTERVAL') if os.getenv('BT_SUBTENSOR_REGISTER_UPDATE_INTERVAL') != None else 50_000

    defaults.subtensor.register.cuda = bittensor.Config()
-    defaults.subtensor.register.cuda.dev_id = 0
+    defaults.subtensor.register.cuda.dev_id = [0]
    defaults.subtensor.register.cuda.use_cuda = False
    defaults.subtensor.register.cuda.TPB = 256

@staticmethod
def check_config( config: 'bittensor.Config' ):
    assert config.subtensor
    #assert config.subtensor.network != None
+    if config.subtensor.get('register') and config.subtensor.register.get('cuda'):
+        assert all((isinstance(x, int) or isinstance(x, str) and x.isnumeric() ) for x in config.subtensor.register.cuda.get('dev_id', []))
+
+        if config.subtensor.register.cuda.get('use_cuda', False):
+            try:
+                import cubit
+            except ImportError:
+                raise ImportError('CUDA registration is enabled but cubit is not installed. Please install cubit.')
+
+            if not is_cuda_available():
+                raise RuntimeError('CUDA registration is enabled but no CUDA devices are detected.')

@staticmethod
def determine_chain_endpoint(network: str):
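Note: taken together, the new arguments let a user opt into CUDA registration and pin one or more devices from the command line. A hypothetical invocation (exact subcommand syntax depends on the bittensor release) might look like:

btcli register --cuda --cuda.dev_id 0 1 --cuda.TPB 256

which maps to config.subtensor.register.cuda.use_cuda = True and dev_id = [0, 1]. If CUDA registration is requested without cubit installed, the new check_config raises an ImportError up front rather than failing mid-registration.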
(Diffs for the remaining 6 changed files are not shown here.)
