From 214fdf60bde5766a8a08afa78da3c42636f15bf3 Mon Sep 17 00:00:00 2001 From: tankya2 Date: Wed, 24 Jan 2024 11:40:51 +0800 Subject: [PATCH] Updates to include expectation calculation --- src/qibotn/QiboCircuitConvertor.py | 113 ++++++++ src/qibotn/QiboCircuitToMPS.py | 2 +- src/qibotn/backends.py | 53 ++++ src/qibotn/cutn.py | 406 ++++++++++++++++++++++++++++- 4 files changed, 567 insertions(+), 7 deletions(-) diff --git a/src/qibotn/QiboCircuitConvertor.py b/src/qibotn/QiboCircuitConvertor.py index d72a09cb..11aaa716 100644 --- a/src/qibotn/QiboCircuitConvertor.py +++ b/src/qibotn/QiboCircuitConvertor.py @@ -21,6 +21,7 @@ def __init__(self, circuit, dtype="complex128"): self.dtype = getattr(self.backend, dtype) self.init_basis_map(self.backend, dtype) self.init_intermediate_circuit(circuit) + self.circuit = circuit def state_vector_operands(self): input_bitstring = "0" * len(self.active_qubits) @@ -109,3 +110,115 @@ def init_basis_map(self, backend, dtype): state_1 = asarray([0, 1], dtype=dtype) self.basis_map = {"0": state_0, "1": state_1} + + + def init_inverse_circuit(self, circuit): + self.gate_tensors_inverse = [] + gates_qubits_inverse = [] + + for gate in circuit.queue: + gate_qubits = gate.control_qubits + gate.target_qubits + gates_qubits_inverse.extend(gate_qubits) + + # self.gate_tensors is to extract into a list the gate matrix together with the qubit id that it is acting on + # https://github.com/NVIDIA/cuQuantum/blob/6b6339358f859ea930907b79854b90b2db71ab92/python/cuquantum/cutensornet/_internal/circuit_parser_utils_cirq.py#L32 + required_shape = self.op_shape_from_qubits(len(gate_qubits)) + self.gate_tensors_inverse.append( + ( + cp.asarray(gate.matrix()).reshape(required_shape), + gate_qubits, + ) + ) + + # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit. + self.active_qubits_inverse = np.unique(gates_qubits_inverse) + + + def get_pauli_gates(self, pauli_map, dtype='complex128', backend=cp): + """ + Populate the gates for all pauli operators. + + Args: + pauli_map: A dictionary mapping qubits to pauli operators. + dtype: Data type for the tensor operands. + backend: The package the tensor operands belong to. + + Returns: + A sequence of pauli gates. + """ + asarray = backend.asarray + pauli_i = asarray([[1,0], [0,1]], dtype=dtype) + pauli_x = asarray([[0,1], [1,0]], dtype=dtype) + pauli_y = asarray([[0,-1j], [1j,0]], dtype=dtype) + pauli_z = asarray([[1,0], [0,-1]], dtype=dtype) + + operand_map = {'I': pauli_i, + 'X': pauli_x, + 'Y': pauli_y, + 'Z': pauli_z} + gates = [] + for qubit, pauli_char in pauli_map.items(): + operand = operand_map.get(pauli_char) + if operand is None: + raise ValueError('pauli string character must be one of I/X/Y/Z') + gates.append((operand, (qubit,))) + return gates + + def expectation_operands(self, pauli_string): + #assign pauli string to qubit + #_get_forward_inverse_metadata() + input_bitstring = "0" * self.circuit.nqubits #Need all qubits! + + input_operands = self._get_bitstring_tensors(input_bitstring) + pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string)) + pauli_map = pauli_string + coned_qubits = pauli_map.keys() + + ( + mode_labels, + qubits_frontier, + next_frontier, + ) = self._init_mode_labels_from_qubits(range(self.circuit.nqubits)) + + gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands( + self.gate_tensors, qubits_frontier, next_frontier + ) + + operands = input_operands + gate_operands + mode_labels += gate_mode_labels + + self.init_inverse_circuit(self.circuit.invert()) + + + next_frontier = max(qubits_frontier.values()) + 1 + + #input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits) + + pauli_gates = self.get_pauli_gates(pauli_map, dtype=self.dtype, backend=self.backend) + + + gates_inverse = pauli_gates + self.gate_tensors_inverse + + gate_mode_labels_inverse, gate_operands_inverse = self._parse_gates_to_mode_labels_operands( + gates_inverse, qubits_frontier, next_frontier + ) + mode_labels = mode_labels + gate_mode_labels_inverse + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)] + operands = operands + gate_operands_inverse + operands[:self.circuit.nqubits] + + operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] + + #expec = contract(*operand_exp_interleave) + #print(expec) + + ''' + gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates, + qubits_frontier, + next_frontier) + + mode_labels = input_mode_labels + gate_mode_labels + [[qubits_frontier[ix]] for ix in self.qubits] + operands = input_operands + gate_operands + input_operands[:n_qubits] + + output_mode_labels = [] + expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels) + ''' + return operand_exp_interleave \ No newline at end of file diff --git a/src/qibotn/QiboCircuitToMPS.py b/src/qibotn/QiboCircuitToMPS.py index d51093f5..816b17c0 100644 --- a/src/qibotn/QiboCircuitToMPS.py +++ b/src/qibotn/QiboCircuitToMPS.py @@ -21,7 +21,7 @@ def __init__( self.handle = cutn.create() self.dtype = dtype self.mps_tensors = initial(self.num_qubits, dtype=dtype) - circuitconvertor = QiboCircuitToEinsum(circ_qibo) + circuitconvertor = QiboCircuitToEinsum(circ_qibo, dtype=dtype) for gate, qubits in circuitconvertor.gate_tensors: # mapping from qubits to qubit indices diff --git a/src/qibotn/backends.py b/src/qibotn/backends.py index 9d399731..4b28431d 100644 --- a/src/qibotn/backends.py +++ b/src/qibotn/backends.py @@ -14,6 +14,13 @@ def __init__(self, platform): platform == "cu_tensornet" or platform == "cu_mps" or platform == "qu_tensornet" + or platform == "cu_tensornet_mpi" + or platform == "cu_tensornet_mpi_expectation" + or platform == "cu_tensornet_expectation" + or platform == "cu_tensornet_nccl" + or platform == "cu_tensornet_nccl_expectation" + + ): # pragma: no cover self.platform = platform else: @@ -71,6 +78,52 @@ def execute_circuit( init_state = np.zeros(2**circuit.nqubits, dtype=self.dtype) init_state[0] = 1.0 state = quimb.eval(circuit.to_qasm(), init_state, backend="numpy") + + if self.platform == "cu_tensornet_mpi": + if initial_state is not None: + raise_error(NotImplementedError, "QiboTN cannot support initial state.") + + #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + state, rank = cutn.eval_tn_MPI_2(circuit, self.dtype,32) + if rank > 0: + state = np.array(0) + + if self.platform == "cu_tensornet_nccl": + if initial_state is not None: + raise_error(NotImplementedError, "QiboTN cannot support initial state.") + + #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + state, rank = cutn.eval_tn_nccl(circuit, self.dtype,32) + if rank > 0: + state = np.array(0) + + if self.platform == "cu_tensornet_expectation": + if initial_state is not None: + raise_error(NotImplementedError, "QiboTN cannot support initial state.") + + state = cutn.eval_expectation(circuit, self.dtype) + + if self.platform == "cu_tensornet_mpi_expectation": + if initial_state is not None: + raise_error(NotImplementedError, "QiboTN cannot support initial state.") + + #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + #state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) + state, rank = cutn.eval_tn_MPI_2_expectation(circuit, self.dtype,32) + + if rank > 0: + state = np.array(0) + + if self.platform == "cu_tensornet_nccl_expectation": + if initial_state is not None: + raise_error(NotImplementedError, "QiboTN cannot support initial state.") + + #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) + #state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) + state, rank = cutn.eval_tn_nccl_expectation(circuit, self.dtype,32) + + if rank > 0: + state = np.array(0) if return_array: return state.flatten() diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py index eb0e0d49..67d70c49 100644 --- a/src/qibotn/cutn.py +++ b/src/qibotn/cutn.py @@ -13,6 +13,354 @@ def eval(qibo_circ, datatype): myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) return contract(*myconvertor.state_vector_operands()) +def eval_expectation(qibo_circ, datatype): + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + return contract(*myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))) + +def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8): + from mpi4py import MPI # this line initializes MPI + import socket + from cuquantum import Network + + # Get the hostname + #hostname = socket.gethostname() + + root = 0 + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + device_id = rank % getDeviceCount() + + + # Perform circuit conversion + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + operands = myconvertor.state_vector_operands() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + + # Broadcast the operand data. + #operands = comm.bcast(operands, root) + + # Assign the device for each process. + device_id = rank % getDeviceCount() + + #dev = cp.cuda.Device(device_id) + #free_mem, total_mem = dev.mem_info + #print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) + + # Create network object. + network = Network(*operands, options={'device_id' : device_id}) + + # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. + path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) + #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + + # Select the best path from all ranks. + opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) + + #if rank == root: + # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") + + # Broadcast info from the sender to all other ranks. + info = comm.bcast(info, sender) + + # Set path and slices. + path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + + # Calculate this process's share of the slices. + num_slices = info.num_slices + chunk, extra = num_slices // size, num_slices % size + slice_begin = rank * chunk + min(rank, extra) + slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slices = range(slice_begin, slice_end) + + #print(f"Process {rank} is processing slice range: {slices}.") + + # Contract the group of slices the process is responsible for. + result = network.contract(slices=slices) + #print(f"Process {rank} result shape is : {result.shape}.") + #print(f"Process {rank} result size is : {result.nbytes}.") + + # Sum the partial contribution from each process on root. + result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) + + return result, rank + +def eval_tn_nccl(qibo_circ, datatype, n_samples=8): + from mpi4py import MPI # this line initializes MPI + import socket + from cuquantum import Network + from cupy.cuda import nccl + + # Get the hostname + #hostname = socket.gethostname() + + root = 0 + comm_mpi = MPI.COMM_WORLD + rank = comm_mpi.Get_rank() + size = comm_mpi.Get_size() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + device_id = rank % getDeviceCount() + + cp.cuda.Device(device_id).use() + + # Set up the NCCL communicator. + nccl_id = nccl.get_unique_id() if rank == root else None + nccl_id = comm_mpi.bcast(nccl_id, root) + comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank) + + # Perform circuit conversion + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + operands = myconvertor.state_vector_operands() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + + network = Network(*operands) + + # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. + path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) + + #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + + # Select the best path from all ranks. + opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) + + #if rank == root: + # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") + + # Broadcast info from the sender to all other ranks. + info = comm_mpi.bcast(info, sender) + + # Set path and slices. + path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + + # Calculate this process's share of the slices. + num_slices = info.num_slices + chunk, extra = num_slices // size, num_slices % size + slice_begin = rank * chunk + min(rank, extra) + slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slices = range(slice_begin, slice_end) + + #print(f"Process {rank} is processing slice range: {slices}.") + + # Contract the group of slices the process is responsible for. + result = network.contract(slices=slices) + #print(f"Process {rank} result shape is : {result.shape}.") + #print(f"Process {rank} result size is : {result.nbytes}.") + + # Sum the partial contribution from each process on root. + stream_ptr = cp.cuda.get_current_stream().ptr + comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr) + + return result, rank + +def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8): + from mpi4py import MPI # this line initializes MPI + import socket + from cuquantum import Network + from cupy.cuda import nccl + + # Get the hostname + #hostname = socket.gethostname() + + root = 0 + comm_mpi = MPI.COMM_WORLD + rank = comm_mpi.Get_rank() + size = comm_mpi.Get_size() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + device_id = rank % getDeviceCount() + + cp.cuda.Device(device_id).use() + + # Set up the NCCL communicator. + nccl_id = nccl.get_unique_id() if rank == root else None + nccl_id = comm_mpi.bcast(nccl_id, root) + comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank) + + # Perform circuit conversion + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) + + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + + network = Network(*operands) + + # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. + path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) + + #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + + # Select the best path from all ranks. + opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) + + #if rank == root: + # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") + + # Broadcast info from the sender to all other ranks. + info = comm_mpi.bcast(info, sender) + + # Set path and slices. + path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + + # Calculate this process's share of the slices. + num_slices = info.num_slices + chunk, extra = num_slices // size, num_slices % size + slice_begin = rank * chunk + min(rank, extra) + slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slices = range(slice_begin, slice_end) + + #print(f"Process {rank} is processing slice range: {slices}.") + + # Contract the group of slices the process is responsible for. + result = network.contract(slices=slices) + #print(f"Process {rank} result shape is : {result.shape}.") + #print(f"Process {rank} result size is : {result.nbytes}.") + + # Sum the partial contribution from each process on root. + stream_ptr = cp.cuda.get_current_stream().ptr + comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr) + + return result, rank + + +def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8): + from mpi4py import MPI # this line initializes MPI + import socket + from cuquantum import Network + + # Get the hostname + #hostname = socket.gethostname() + + root = 0 + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + device_id = rank % getDeviceCount() + + + # Perform circuit conversion + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + + # Broadcast the operand data. + #operands = comm.bcast(operands, root) + + # Assign the device for each process. + device_id = rank % getDeviceCount() + + #dev = cp.cuda.Device(device_id) + #free_mem, total_mem = dev.mem_info + #print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) + + # Create network object. + network = Network(*operands, options={'device_id' : device_id}) + + # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. + path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) + #print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") + + # Select the best path from all ranks. + opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) + + #if rank == root: + # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") + + # Broadcast info from the sender to all other ranks. + info = comm.bcast(info, sender) + + # Set path and slices. + path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) + + # Calculate this process's share of the slices. + num_slices = info.num_slices + chunk, extra = num_slices // size, num_slices % size + slice_begin = rank * chunk + min(rank, extra) + slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) + slices = range(slice_begin, slice_end) + + #print(f"Process {rank} is processing slice range: {slices}.") + + # Contract the group of slices the process is responsible for. + result = network.contract(slices=slices) + #print(f"Process {rank} result shape is : {result.shape}.") + #print(f"Process {rank} result size is : {result.nbytes}.") + + # Sum the partial contribution from each process on root. + result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) + + return result, rank + + +def eval_tn_MPI_expectation(qibo_circ, datatype, n_samples=8): + from mpi4py import MPI # this line initializes MPI + import socket + # Get the hostname + #hostname = socket.gethostname() + + ncpu_threads = multiprocessing.cpu_count() // 2 + + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + device_id = rank % getDeviceCount() + cp.cuda.Device(device_id).use() + + handle = cutn.create() + network_opts = cutn.NetworkOptions(handle=handle, blocking="auto") + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft network opts",mem_avail, "rank =",rank) + cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm)) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) + # Perform circuit conversion + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + operands_interleave = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft convetor",mem_avail, "rank =",rank) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) + + # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object. + network = cutn.Network(*operands_interleave, options=network_opts) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) + path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft contract path",mem_avail, "rank =",rank) + # Execution: To execute the contraction using the optimal path found previously + #print("opt_cost",opt_info.opt_cost, "Process =",rank) + + + num_slices = opt_info.num_slices#Andy + chunk, extra = num_slices // size, num_slices % size#Andy + slice_begin = rank * chunk + min(rank, extra)#Andy + slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy + slices = range(slice_begin, slice_end)#Andy + result = network.contract(slices=slices) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft contract",mem_avail, "rank =",rank) + cutn.destroy(handle) + + return result, rank def eval_tn_MPI(qibo_circ, datatype, n_samples=8): """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI. @@ -22,29 +370,59 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8): """ from mpi4py import MPI # this line initializes MPI - + import socket + # Get the hostname + #hostname = socket.gethostname() + ncpu_threads = multiprocessing.cpu_count() // 2 - + comm = MPI.COMM_WORLD rank = comm.Get_rank() + size = comm.Get_size() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) device_id = rank % getDeviceCount() cp.cuda.Device(device_id).use() handle = cutn.create() - cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm)) network_opts = cutn.NetworkOptions(handle=handle, blocking="auto") - + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft network opts",mem_avail, "rank =",rank) + cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm)) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands_interleave = myconvertor.state_vector_operands() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object. network = cutn.Network(*operands_interleave, options=network_opts) - network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads}) - + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) + network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft contract path",mem_avail, "rank =",rank) # Execution: To execute the contraction using the optimal path found previously + #print("opt_cost",opt_info.opt_cost, "Process =",rank) + + ''' + path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) + + num_slices = opt_info.num_slices#Andy + chunk, extra = num_slices // size, num_slices % size#Andy + slice_begin = rank * chunk + min(rank, extra)#Andy + slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy + slices = range(slice_begin, slice_end)#Andy + result = network.contract(slices=slices) + ''' result = network.contract() + #mem_avail = cp.cuda.Device().mem_info[0] + #print("Mem avail: aft contract",mem_avail, "rank =",rank) cutn.destroy(handle) return result, rank @@ -57,3 +435,19 @@ def eval_mps(qibo_circ, gate_algo, datatype): return mps_helper.contract_state_vector( myconvertor.mps_tensors, {"handle": myconvertor.handle} ) + +def PauliStringGen(nqubits): + + if nqubits <= 0: + return "Invalid input. N should be a positive integer." + + #characters = 'IXYZ' + characters = 'XXXZ' + + result = '' + + for i in range(nqubits): + char_to_add = characters[i % len(characters)] + result += char_to_add + + return result \ No newline at end of file