From f1c58fd0d7e5723990e41267f8f819e5d7b46cfe Mon Sep 17 00:00:00 2001
From: tankya2
Date: Wed, 24 Jan 2024 17:10:03 +0800
Subject: [PATCH] Remove test codes

---
 src/qibotn/cutn.py | 330 +++++++--------------------------------------
 1 file changed, 52 insertions(+), 278 deletions(-)

diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py
index aca33ff1..e016570f 100644
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -8,53 +8,29 @@ from qibotn.QiboCircuitToMPS import QiboCircuitToMPS
 from qibotn.mps_contraction_helper import MPSContractionHelper
 
-
 def eval(qibo_circ, datatype):
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
     return contract(*myconvertor.state_vector_operands())
 
 
-
-def eval_expectation(qibo_circ, datatype):
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    return contract(
-        *myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
-    )
-
-
 def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
+    from mpi4py import MPI
     from cuquantum import Network
 
-    # Get the hostname
-    # hostname = socket.gethostname()
-
     root = 0
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
     device_id = rank % getDeviceCount()
 
     # Perform circuit conversion
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
-    operands = myconvertor.state_vector_operands()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
-    # Broadcast the operand data.
-    # operands = comm.bcast(operands, root)
+
+    operands = myconvertor.state_vector_operands()
 
     # Assign the device for each process.
     device_id = rank % getDeviceCount()
 
-    # dev = cp.cuda.Device(device_id)
-    # free_mem, total_mem = dev.mem_info
-    # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
-
     # Create network object.
     network = Network(*operands, options={"device_id": device_id})
 
@@ -62,14 +38,10 @@ def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8):
     path, info = network.contract_path(
         optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
     )
-    # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
 
     # Select the best path from all ranks.
     opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-    # if rank == root:
-    #     print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
-
     # Broadcast info from the sender to all other ranks.
     info = comm.bcast(info, sender)
 
@@ -87,34 +59,24 @@ def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8):
     )
     slices = range(slice_begin, slice_end)
 
-    # print(f"Process {rank} is processing slice range: {slices}.")
-
     # Contract the group of slices the process is responsible for.
     result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")
 
     # Sum the partial contribution from each process on root.
     result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
 
     return result, rank
 
 
-
 def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
+    from mpi4py import MPI
     from cuquantum import Network
     from cupy.cuda import nccl
 
-    # Get the hostname
-    # hostname = socket.gethostname()
-
     root = 0
     comm_mpi = MPI.COMM_WORLD
     rank = comm_mpi.Get_rank()
     size = comm_mpi.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
     device_id = rank % getDeviceCount()
 
     cp.cuda.Device(device_id).use()
@@ -126,11 +88,8 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
 
     # Perform circuit conversion
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
+
     operands = myconvertor.state_vector_operands()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
 
     network = Network(*operands)
 
@@ -139,13 +98,9 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
         optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
     )
 
-    # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
-
     # Select the best path from all ranks.
     opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-    # if rank == root:
-    #     print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
 
     # Broadcast info from the sender to all other ranks.
     info = comm_mpi.bcast(info, sender)
 
@@ -164,12 +119,8 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
     )
     slices = range(slice_begin, slice_end)
 
-    # print(f"Process {rank} is processing slice range: {slices}.")
-
     # Contract the group of slices the process is responsible for.
     result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")
 
     # Sum the partial contribution from each process on root.
     stream_ptr = cp.cuda.get_current_stream().ptr
@@ -185,57 +136,44 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
 
     return result, rank
 
+def eval_expectation(qibo_circ, datatype):
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    return contract(
+        *myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
+    )
 
-def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
+def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8):
     from mpi4py import MPI  # this line initializes MPI
-    import socket
     from cuquantum import Network
-    from cupy.cuda import nccl
-
-    # Get the hostname
-    # hostname = socket.gethostname()
 
     root = 0
-    comm_mpi = MPI.COMM_WORLD
-    rank = comm_mpi.Get_rank()
-    size = comm_mpi.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
-    device_id = rank % getDeviceCount()
-
-    cp.cuda.Device(device_id).use()
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
 
-    # Set up the NCCL communicator.
-    nccl_id = nccl.get_unique_id() if rank == root else None
-    nccl_id = comm_mpi.bcast(nccl_id, root)
-    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
+    device_id = rank % getDeviceCount()
 
     # Perform circuit conversion
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
+
     operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
+    # Assign the device for each process.
+    device_id = rank % getDeviceCount()
 
-    network = Network(*operands)
+    # Create network object.
+    network = Network(*operands, options={"device_id": device_id})
 
     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
     path, info = network.contract_path(
         optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
     )
-    # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
-
     # Select the best path from all ranks.
-    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-
-    # if rank == root:
-    #     print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
+    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
 
     # Broadcast info from the sender to all other ranks.
-    info = comm_mpi.bcast(info, sender)
+    info = comm.bcast(info, sender)
 
     # Set path and slices.
     path, info = network.contract_path(
@@ -251,79 +189,50 @@ def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
     )
     slices = range(slice_begin, slice_end)
 
-    # print(f"Process {rank} is processing slice range: {slices}.")
-
     # Contract the group of slices the process is responsible for.
     result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")
 
     # Sum the partial contribution from each process on root.
-    stream_ptr = cp.cuda.get_current_stream().ptr
-    comm_nccl.reduce(
-        result.data.ptr,
-        result.data.ptr,
-        result.size,
-        nccl.NCCL_FLOAT64,
-        nccl.NCCL_SUM,
-        root,
-        stream_ptr,
-    )
+    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
 
     return result, rank
 
-
-def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
+def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
+    from mpi4py import MPI
     from cuquantum import Network
-
-    # Get the hostname
-    # hostname = socket.gethostname()
+    from cupy.cuda import nccl
 
     root = 0
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+    comm_mpi = MPI.COMM_WORLD
+    rank = comm_mpi.Get_rank()
+    size = comm_mpi.Get_size()
+
     device_id = rank % getDeviceCount()
 
-    # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
-    operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
+    cp.cuda.Device(device_id).use()
 
-    # Broadcast the operand data.
-    # operands = comm.bcast(operands, root)
+    # Set up the NCCL communicator.
+    nccl_id = nccl.get_unique_id() if rank == root else None
+    nccl_id = comm_mpi.bcast(nccl_id, root)
+    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
 
-    # Assign the device for each process.
-    device_id = rank % getDeviceCount()
+    # Perform circuit conversion
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
 
-    dev = cp.cuda.Device(device_id)
-    free_mem, total_mem = dev.mem_info
-    print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
+    operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
 
-    # Create network object.
-    network = Network(*operands, options={"device_id": device_id})
+    network = Network(*operands)
 
     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
     path, info = network.contract_path(
         optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
     )
 
-    # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
-
     # Select the best path from all ranks.
-    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-
-    # if rank == root:
-    #     print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
+    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
 
     # Broadcast info from the sender to all other ranks.
-    info = comm.bcast(info, sender)
+    info = comm_mpi.bcast(info, sender)
 
     # Set path and slices.
     path, info = network.contract_path(
@@ -339,157 +248,23 @@ def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8):
     )
     slices = range(slice_begin, slice_end)
 
-    # print(f"Process {rank} is processing slice range: {slices}.")
-
     # Contract the group of slices the process is responsible for.
     result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")
 
     # Sum the partial contribution from each process on root.
-    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
-
-    return result, rank
-
-
-def eval_tn_MPI_expectation(qibo_circ, datatype, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
-
-    # Get the hostname
-    # hostname = socket.gethostname()
-
-    ncpu_threads = multiprocessing.cpu_count() // 2
-
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
-    device_id = rank % getDeviceCount()
-    cp.cuda.Device(device_id).use()
-
-    handle = cutn.create()
-    network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft network opts",mem_avail, "rank =",rank)
-    cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
-    # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    operands_interleave = myconvertor.expectation_operands(
-        PauliStringGen(qibo_circ.nqubits)
-    )
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
-
-    # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object.
-    network = cutn.Network(*operands_interleave, options=network_opts)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
-    path, opt_info = network.contract_path(
-        optimize={
-            "samples": n_samples,
-            "threads": ncpu_threads,
-            "slicing": {"min_slices": max(16, size)},
-        }
-    )
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft contract path",mem_avail, "rank =",rank)
-    # Execution: To execute the contraction using the optimal path found previously
-    # print("opt_cost",opt_info.opt_cost, "Process =",rank)
-
-    num_slices = opt_info.num_slices  # Andy
-    chunk, extra = num_slices // size, num_slices % size  # Andy
-    slice_begin = rank * chunk + min(rank, extra)  # Andy
-    slice_end = (
-        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
-    )  # Andy
-    slices = range(slice_begin, slice_end)  # Andy
-    result = network.contract(slices=slices)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft contract",mem_avail, "rank =",rank)
-    cutn.destroy(handle)
-
-    return result, rank
-
-
-def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
-    """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI.
-    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
-    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
-    After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
-    """
-
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
-
-    # Get the hostname
-    # hostname = socket.gethostname()
-
-    ncpu_threads = multiprocessing.cpu_count() // 2
-
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
-    device_id = rank % getDeviceCount()
-    cp.cuda.Device(device_id).use()
-
-    handle = cutn.create()
-    network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft network opts",mem_avail, "rank =",rank)
-    cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
-    # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
-    operands_interleave = myconvertor.state_vector_operands()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
-
-    # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object.
-    network = cutn.Network(*operands_interleave, options=network_opts)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
-    network.contract_path(
-        optimize={
-            "samples": n_samples,
-            "threads": ncpu_threads,
-            "slicing": {"min_slices": max(16, size)},
-        }
-    )
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft contract path",mem_avail, "rank =",rank)
-    # Execution: To execute the contraction using the optimal path found previously
-    # print("opt_cost",opt_info.opt_cost, "Process =",rank)
-
-    """
-    path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
-
-    num_slices = opt_info.num_slices#Andy
-    chunk, extra = num_slices // size, num_slices % size#Andy
-    slice_begin = rank * chunk + min(rank, extra)#Andy
-    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
-    slices = range(slice_begin, slice_end)#Andy
-    result = network.contract(slices=slices)
-    """
-    result = network.contract()
-
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft contract",mem_avail, "rank =",rank)
-    cutn.destroy(handle)
+    stream_ptr = cp.cuda.get_current_stream().ptr
+    comm_nccl.reduce(
+        result.data.ptr,
+        result.data.ptr,
+        result.size,
+        nccl.NCCL_FLOAT64,
+        nccl.NCCL_SUM,
+        root,
+        stream_ptr,
+    )
 
     return result, rank
 
 
-
 def eval_mps(qibo_circ, gate_algo, datatype):
     myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
     mps_helper = MPSContractionHelper(myconvertor.num_qubits)
@@ -498,12 +273,11 @@ def eval_mps(qibo_circ, gate_algo, datatype):
         myconvertor.mps_tensors, {"handle": myconvertor.handle}
     )
 
-
+# To be improved
 def PauliStringGen(nqubits):
     if nqubits <= 0:
         return "Invalid input. N should be a positive integer."
 
-    # characters = 'IXYZ'
     characters = "XXXZ"
 
     result = ""
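Illustrative usage sketch (not part of the patch): the distributed entry points kept by this change are intended to be launched under MPI, e.g. `mpirun -n 4 python run_tn.py`, with one rank per GPU. The driver script name, the qibo circuit construction, and the "complex128" datatype below are assumptions for the example, not code from this commit.

    # run_tn.py -- hypothetical driver for the MPI-based contraction (sketch only)
    from qibo import Circuit, gates  # assumed qibo API for building a test circuit
    from qibotn import cutn

    # Build a small example circuit; any qibo circuit would do here.
    circ = Circuit(4)
    for q in range(4):
        circ.add(gates.H(q))

    # eval_tn_MPI_2 slices the contraction across ranks and reduces onto rank 0,
    # so only the root rank receives the final dense result.
    result, rank = cutn.eval_tn_MPI_2(circ, "complex128", n_samples=8)
    if rank == 0:
        print(result)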