From 214fdf60bde5766a8a08afa78da3c42636f15bf3 Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Wed, 24 Jan 2024 11:40:51 +0800
Subject: [PATCH] Updates to include expectation calculation

---
 src/qibotn/QiboCircuitConvertor.py | 113 ++++++++
 src/qibotn/QiboCircuitToMPS.py     |   2 +-
 src/qibotn/backends.py             |  53 ++++
 src/qibotn/cutn.py                 | 406 ++++++++++++++++++++++++++++-
 4 files changed, 567 insertions(+), 7 deletions(-)

diff --git a/src/qibotn/QiboCircuitConvertor.py b/src/qibotn/QiboCircuitConvertor.py
index d72a09cb..11aaa716 100644
--- a/src/qibotn/QiboCircuitConvertor.py
+++ b/src/qibotn/QiboCircuitConvertor.py
@@ -21,6 +21,7 @@ def __init__(self, circuit, dtype="complex128"):
         self.dtype = getattr(self.backend, dtype)
         self.init_basis_map(self.backend, dtype)
         self.init_intermediate_circuit(circuit)
+        self.circuit = circuit
 
     def state_vector_operands(self):
         input_bitstring = "0" * len(self.active_qubits)
@@ -109,3 +110,115 @@ def init_basis_map(self, backend, dtype):
         state_1 = asarray([0, 1], dtype=dtype)
 
         self.basis_map = {"0": state_0, "1": state_1}
+
+
+    def init_inverse_circuit(self, circuit):
+        """Collect the gate tensors and the touched qubits of ``circuit``.
+
+        Fills ``self.gate_tensors_inverse`` with one ``(tensor, qubits)``
+        pair per queued gate, following cuQuantum's circuit parser:
+        https://github.com/NVIDIA/cuQuantum/blob/6b6339358f859ea930907b79854b90b2db71ab92/python/cuquantum/cutensornet/_internal/circuit_parser_utils_cirq.py#L32
+        ``self.active_qubits_inverse`` holds the unique qubit ids seen.
+        """
+        self.gate_tensors_inverse = []
+        touched_qubits = []
+
+        for gate in circuit.queue:
+            qubit_ids = gate.control_qubits + gate.target_qubits
+            touched_qubits.extend(qubit_ids)
+            # Shape is dictated by the number of qubits the gate acts on.
+            shape = self.op_shape_from_qubits(len(qubit_ids))
+            tensor = cp.asarray(gate.matrix()).reshape(shape)
+            self.gate_tensors_inverse.append((tensor, qubit_ids))
+
+        self.active_qubits_inverse = np.unique(touched_qubits)
+        
+        
+    def get_pauli_gates(self, pauli_map, dtype='complex128', backend=cp):
+        """
+        Build one single-qubit gate per entry of a Pauli assignment.
+
+        Args:
+            pauli_map: Mapping from qubit to a character in I/X/Y/Z.
+            dtype: Data type of the generated 2x2 operands.
+            backend: Array package providing ``asarray``.
+
+        Returns:
+            A list of ``(operand, (qubit,))`` tuples in the iteration
+            order of ``pauli_map``.
+
+        Raises:
+            ValueError: If a character is not one of I/X/Y/Z.
+        """
+        matrices = {
+            'I': [[1, 0], [0, 1]],
+            'X': [[0, 1], [1, 0]],
+            'Y': [[0, -1j], [1j, 0]],
+            'Z': [[1, 0], [0, -1]],
+        }
+        operand_map = {k: backend.asarray(m, dtype=dtype) for k, m in matrices.items()}
+        gates = []
+        for qubit, pauli_char in pauli_map.items():
+            if pauli_char not in operand_map:
+                raise ValueError('pauli string character must be one of I/X/Y/Z')
+            gates.append((operand_map[pauli_char], (qubit,)))
+        return gates
+
+    def expectation_operands(self, pauli_string):
+        """Build the interleaved operand list for a Pauli expectation value.
+
+        Operands are emitted in this order: input tensors of the all-"0"
+        bitstring for every qubit, the circuit gates, the Pauli gates, the
+        gates of the inverted circuit, and the input tensors once more to
+        close the open modes.
+
+        Args:
+            pauli_string: Sequence of Pauli characters (I/X/Y/Z); entry
+                ``i`` acts on qubit ``i``. ``zip`` stops at the shorter of
+                the string and ``range(nqubits)``.
+
+        Returns:
+            A flat list ``[operand, modes, operand, modes, ...]``.
+        """
+        nqubits = self.circuit.nqubits
+
+        # Every qubit needs an input tensor, not only the active ones.
+        input_operands = self._get_bitstring_tensors("0" * nqubits)
+        pauli_map = dict(zip(range(nqubits), pauli_string))
+
+        (
+            mode_labels,
+            qubits_frontier,
+            next_frontier,
+        ) = self._init_mode_labels_from_qubits(range(nqubits))
+
+        # Forward circuit.
+        forward_labels, forward_operands = self._parse_gates_to_mode_labels_operands(
+            self.gate_tensors, qubits_frontier, next_frontier
+        )
+        operands = input_operands + forward_operands
+        mode_labels += forward_labels
+
+        # Inverted circuit; this fills self.gate_tensors_inverse.
+        self.init_inverse_circuit(self.circuit.invert())
+
+        # Continue numbering after the largest frontier label in use.
+        next_frontier = max(qubits_frontier.values()) + 1
+
+        pauli_gates = self.get_pauli_gates(pauli_map, dtype=self.dtype, backend=self.backend)
+
+        # Pauli gates are listed ahead of the inverse-circuit gates.
+        closing_gates = pauli_gates + self.gate_tensors_inverse
+        closing_labels, closing_operands = self._parse_gates_to_mode_labels_operands(
+            closing_gates, qubits_frontier, next_frontier
+        )
+
+        # Close each qubit line with the same tensors used as input.
+        final_labels = [[qubits_frontier[q]] for q in range(nqubits)]
+        mode_labels = mode_labels + closing_labels + final_labels
+        operands = operands + closing_operands + operands[:nqubits]
+
+        interleaved = []
+        for operand, labels in zip(operands, mode_labels):
+            interleaved.extend((operand, labels))
+        return interleaved
\ No newline at end of file
diff --git a/src/qibotn/QiboCircuitToMPS.py b/src/qibotn/QiboCircuitToMPS.py
index d51093f5..816b17c0 100644
--- a/src/qibotn/QiboCircuitToMPS.py
+++ b/src/qibotn/QiboCircuitToMPS.py
@@ -21,7 +21,7 @@ def __init__(
         self.handle = cutn.create()
         self.dtype = dtype
         self.mps_tensors = initial(self.num_qubits, dtype=dtype)
-        circuitconvertor = QiboCircuitToEinsum(circ_qibo)
+        circuitconvertor = QiboCircuitToEinsum(circ_qibo, dtype=dtype)
 
         for gate, qubits in circuitconvertor.gate_tensors:
             # mapping from qubits to qubit indices
diff --git a/src/qibotn/backends.py b/src/qibotn/backends.py
index 9d399731..4b28431d 100644
--- a/src/qibotn/backends.py
+++ b/src/qibotn/backends.py
@@ -14,6 +14,13 @@ def __init__(self, platform):
             platform == "cu_tensornet"
             or platform == "cu_mps"
             or platform == "qu_tensornet"
+            or platform == "cu_tensornet_mpi"
+            or platform == "cu_tensornet_mpi_expectation"
+            or platform == "cu_tensornet_expectation"
+            or platform == "cu_tensornet_nccl"
+            or platform == "cu_tensornet_nccl_expectation"
+
+
         ):  # pragma: no cover
             self.platform = platform
         else:
@@ -71,6 +78,52 @@ def execute_circuit(
             init_state = np.zeros(2**circuit.nqubits, dtype=self.dtype)
             init_state[0] = 1.0
             state = quimb.eval(circuit.to_qasm(), init_state, backend="numpy")
+            
+        if self.platform == "cu_tensornet_mpi":
+            if initial_state is not None:
+                raise_error(NotImplementedError, "QiboTN cannot support initial state.")
+
+            #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32)
+            state, rank = cutn.eval_tn_MPI_2(circuit, self.dtype,32)
+            if rank > 0:
+                state = np.array(0)
+             
+        if self.platform == "cu_tensornet_nccl":
+            if initial_state is not None:
+                raise_error(NotImplementedError, "QiboTN cannot support initial state.")
+
+            #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32)
+            state, rank = cutn.eval_tn_nccl(circuit, self.dtype,32)
+            if rank > 0:
+                state = np.array(0)
+        
+        if self.platform == "cu_tensornet_expectation":
+            if initial_state is not None:
+                raise_error(NotImplementedError, "QiboTN cannot support initial state.")
+                
+            state = cutn.eval_expectation(circuit, self.dtype)
+        
+        if self.platform == "cu_tensornet_mpi_expectation":
+            if initial_state is not None:
+                raise_error(NotImplementedError, "QiboTN cannot support initial state.")
+
+            #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32)
+            #state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32)
+            state, rank = cutn.eval_tn_MPI_2_expectation(circuit, self.dtype,32)
+            
+            if rank > 0:
+                state = np.array(0)
+
+        if self.platform == "cu_tensornet_nccl_expectation":
+            if initial_state is not None:
+                raise_error(NotImplementedError, "QiboTN cannot support initial state.")
+
+            #state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32)
+            #state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32)
+            state, rank = cutn.eval_tn_nccl_expectation(circuit, self.dtype,32)
+            
+            if rank > 0:
+                state = np.array(0)
 
         if return_array:
             return state.flatten()
diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py
index eb0e0d49..67d70c49 100644
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -13,6 +13,354 @@ def eval(qibo_circ, datatype):
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
     return contract(*myconvertor.state_vector_operands())
 
+def eval_expectation(qibo_circ, datatype):
+    # Contract the expectation network for the Pauli string from PauliStringGen.
+    return contract(*QiboCircuitToEinsum(qibo_circ, dtype=datatype).expectation_operands(PauliStringGen(qibo_circ.nqubits)))
+
+def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8):
+    """Contract the state-vector network with slices spread over MPI ranks.
+
+    Every rank searches for a contraction path, the path with the lowest
+    ``opt_cost`` is broadcast to all ranks, each rank contracts its own
+    range of slices and the partial results are summed on the root rank.
+    Slicing is forced to at least ``max(32, size)`` slices so that the
+    contraction can be split across the ranks.
+
+    Args:
+        qibo_circ: Circuit handed to ``QiboCircuitToEinsum``.
+        datatype: Value forwarded as ``dtype`` to ``QiboCircuitToEinsum``.
+        n_samples: Number of hyperoptimizer samples for the path search.
+
+    Returns:
+        Tuple ``(result, rank)``. ``result`` is the summed tensor on rank 0
+        and ``None`` on the other ranks, as returned by ``comm.reduce``.
+    """
+    from mpi4py import MPI  # this line initializes MPI
+    from cuquantum import Network
+
+    root = 0
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    # Assign the device for each process.
+    device_id = rank % getDeviceCount()
+
+    # Perform circuit conversion on every rank; operands are not broadcast.
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    operands = myconvertor.state_vector_operands()
+
+    # Create network object.
+    network = Network(*operands, options={'device_id': device_id})
+
+    # Compute the path on all ranks with ``n_samples`` hyperoptimization
+    # samples (this value used to be hard-coded to 8, which silently
+    # ignored the argument). Force slicing to enable parallel contraction.
+    path, info = network.contract_path(
+        optimize={
+            'samples': n_samples,
+            'slicing': {'min_slices': max(32, size)},
+        }
+    )
+
+    # Select the best path from all ranks.
+    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
+
+    # Broadcast info from the sender to all other ranks.
+    info = comm.bcast(info, sender)
+
+    # Set path and slices.
+    path, info = network.contract_path(
+        optimize={'path': info.path, 'slicing': info.slices}
+    )
+
+    # Calculate this process's share of the slices: the first ``extra``
+    # ranks receive one slice more than the remaining ones.
+    num_slices = info.num_slices
+    chunk, extra = divmod(num_slices, size)
+    slice_begin = rank * chunk + min(rank, extra)
+    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    slices = range(slice_begin, slice_end)
+
+    # Contract the group of slices the process is responsible for.
+    result = network.contract(slices=slices)
+
+    # Sum the partial contribution from each process on root.
+    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
+
+    return result, rank
+
+def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
+    """Contract the state-vector network in slices and reduce with NCCL.
+
+    MPI is used to pick the cheapest contraction path among the ranks and
+    to distribute the NCCL id; the partial results are summed on the root
+    rank with an in-place NCCL reduce.
+
+    Args:
+        qibo_circ: Circuit handed to ``QiboCircuitToEinsum``.
+        datatype: Value forwarded as ``dtype`` to ``QiboCircuitToEinsum``.
+        n_samples: Number of hyperoptimizer samples for the path search.
+
+    Returns:
+        Tuple ``(result, rank)``; only the root rank holds the full sum.
+    """
+    from mpi4py import MPI  # this line initializes MPI
+    from cuquantum import Network
+    from cupy.cuda import nccl
+
+    root = 0
+    comm_mpi = MPI.COMM_WORLD
+    rank = comm_mpi.Get_rank()
+    size = comm_mpi.Get_size()
+    device_id = rank % getDeviceCount()
+    cp.cuda.Device(device_id).use()
+
+    # Set up the NCCL communicator.
+    nccl_id = nccl.get_unique_id() if rank == root else None
+    nccl_id = comm_mpi.bcast(nccl_id, root)
+    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
+
+    # Perform circuit conversion
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    operands = myconvertor.state_vector_operands()
+    network = Network(*operands)
+
+    # Compute the path on all ranks. Force slicing to enable parallel contraction.
+    path, info = network.contract_path(optimize={'samples': n_samples, 'slicing': {'min_slices': max(32, size)}})
+
+    # Select the best path from all ranks.
+    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
+
+    # Broadcast info from the sender to all other ranks.
+    info = comm_mpi.bcast(info, sender)
+
+    # Set path and slices.
+    path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices})
+
+    # Calculate this process's share of the slices.
+    num_slices = info.num_slices
+    chunk, extra = divmod(num_slices, size)
+    slice_begin = rank * chunk + min(rank, extra)
+    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    slices = range(slice_begin, slice_end)
+
+    # Contract the group of slices the process is responsible for.
+    result = network.contract(slices=slices)
+
+    # NCCL has no complex data type: send complex values as (re, im) pairs
+    # of the matching real type, i.e. twice as many elements.
+    is_complex = result.dtype.kind == 'c'
+    count = 2 * result.size if is_complex else result.size
+    real_itemsize = result.dtype.itemsize // 2 if is_complex else result.dtype.itemsize
+    nccl_dtype = nccl.NCCL_FLOAT32 if real_itemsize == 4 else nccl.NCCL_FLOAT64
+
+    # Sum the partial contribution from each process on root.
+    stream_ptr = cp.cuda.get_current_stream().ptr
+    comm_nccl.reduce(result.data.ptr, result.data.ptr, count, nccl_dtype, nccl.NCCL_SUM, root, stream_ptr)
+
+    return result, rank
+
+def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
+    """Contract the Pauli expectation network in slices, reduce with NCCL.
+
+    MPI is used to pick the cheapest contraction path among the ranks and
+    to distribute the NCCL id; the partial results are summed on the root
+    rank with an in-place NCCL reduce.
+
+    Args:
+        qibo_circ: Circuit handed to ``QiboCircuitToEinsum``.
+        datatype: Value forwarded as ``dtype`` to ``QiboCircuitToEinsum``.
+        n_samples: Number of hyperoptimizer samples for the path search.
+
+    Returns:
+        Tuple ``(result, rank)``; only the root rank holds the full sum.
+    """
+    from mpi4py import MPI  # this line initializes MPI
+    from cuquantum import Network
+    from cupy.cuda import nccl
+
+    root = 0
+    comm_mpi = MPI.COMM_WORLD
+    rank = comm_mpi.Get_rank()
+    size = comm_mpi.Get_size()
+    device_id = rank % getDeviceCount()
+    cp.cuda.Device(device_id).use()
+
+    # Set up the NCCL communicator.
+    nccl_id = nccl.get_unique_id() if rank == root else None
+    nccl_id = comm_mpi.bcast(nccl_id, root)
+    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
+
+    # Perform circuit conversion
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
+
+    network = Network(*operands)
+
+    # Compute the path on all ranks. Force slicing to enable parallel contraction.
+    path, info = network.contract_path(optimize={'samples': n_samples, 'slicing': {'min_slices': max(32, size)}})
+
+    # Select the best path from all ranks.
+    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
+
+    # Broadcast info from the sender to all other ranks.
+    info = comm_mpi.bcast(info, sender)
+
+    # Set path and slices.
+    path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices})
+
+    # Calculate this process's share of the slices.
+    num_slices = info.num_slices
+    chunk, extra = divmod(num_slices, size)
+    slice_begin = rank * chunk + min(rank, extra)
+    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    slices = range(slice_begin, slice_end)
+
+    # Contract the group of slices the process is responsible for.
+    result = network.contract(slices=slices)
+
+    # NCCL has no complex data type: send complex values as (re, im) pairs
+    # of the matching real type, i.e. twice as many elements.
+    is_complex = result.dtype.kind == 'c'
+    count = 2 * result.size if is_complex else result.size
+    real_itemsize = result.dtype.itemsize // 2 if is_complex else result.dtype.itemsize
+    nccl_dtype = nccl.NCCL_FLOAT32 if real_itemsize == 4 else nccl.NCCL_FLOAT64
+
+    # Sum the partial contribution from each process on root.
+    stream_ptr = cp.cuda.get_current_stream().ptr
+    comm_nccl.reduce(result.data.ptr, result.data.ptr, count, nccl_dtype, nccl.NCCL_SUM, root, stream_ptr)
+
+    return result, rank
+
+
+def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8):
+    """Contract the Pauli expectation network with slices spread over ranks.
+
+    Every rank searches for a contraction path, the path with the lowest
+    ``opt_cost`` is broadcast to all ranks, each rank contracts its own
+    range of slices and the partial results are summed on the root rank.
+    Slicing is forced to at least ``max(32, size)`` slices so that the
+    contraction can be split across the ranks.
+
+    Args:
+        qibo_circ: Circuit handed to ``QiboCircuitToEinsum``.
+        datatype: Value forwarded as ``dtype`` to ``QiboCircuitToEinsum``.
+        n_samples: Number of hyperoptimizer samples for the path search.
+
+    Returns:
+        Tuple ``(result, rank)``. ``result`` is the summed value on rank 0
+        and ``None`` on the other ranks, as returned by ``comm.reduce``.
+    """
+    from mpi4py import MPI  # this line initializes MPI
+    from cuquantum import Network
+
+    root = 0
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    # Assign the device for each process.
+    device_id = rank % getDeviceCount()
+
+    # Perform circuit conversion on every rank; operands are not broadcast.
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
+
+    # Create network object.
+    network = Network(*operands, options={'device_id': device_id})
+
+    # Compute the path on all ranks with ``n_samples`` hyperoptimization
+    # samples (this value used to be hard-coded to 8, which silently
+    # ignored the argument). Force slicing to enable parallel contraction.
+    path, info = network.contract_path(
+        optimize={
+            'samples': n_samples,
+            'slicing': {'min_slices': max(32, size)},
+        }
+    )
+
+    # Select the best path from all ranks.
+    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
+
+    # Broadcast info from the sender to all other ranks.
+    info = comm.bcast(info, sender)
+
+    # Set path and slices.
+    path, info = network.contract_path(
+        optimize={'path': info.path, 'slicing': info.slices}
+    )
+
+    # Calculate this process's share of the slices: the first ``extra``
+    # ranks receive one slice more than the remaining ones.
+    num_slices = info.num_slices
+    chunk, extra = divmod(num_slices, size)
+    slice_begin = rank * chunk + min(rank, extra)
+    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    slices = range(slice_begin, slice_end)
+
+    # Contract the group of slices the process is responsible for.
+    result = network.contract(slices=slices)
+
+    # Sum the partial contribution from each process on root.
+    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
+
+    return result, rank
+
+
+def eval_tn_MPI_expectation(qibo_circ, datatype, n_samples=8):
+    """Contract this rank's slices of the Pauli expectation network.
+
+    A cuTensorNet handle is configured with the MPI communicator, the
+    contraction path is searched with forced slicing, and each rank contracts
+    its own range of slices. The handle is destroyed even if a step fails.
+
+    NOTE(review): no reduction over ranks is visible here, so ``result``
+    looks like a per-rank partial value -- confirm against the caller.
+
+    Args:
+        qibo_circ: Circuit handed to ``QiboCircuitToEinsum``.
+        datatype: Value forwarded as ``dtype`` to ``QiboCircuitToEinsum``.
+        n_samples: Number of hyperoptimizer samples for the path search.
+
+    Returns:
+        Tuple ``(result, rank)``.
+    """
+    from mpi4py import MPI  # this line initializes MPI
+
+    ncpu_threads = multiprocessing.cpu_count() // 2
+
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    device_id = rank % getDeviceCount()
+    cp.cuda.Device(device_id).use()
+
+    handle = cutn.create()
+    try:
+        network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
+        cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
+
+        # Perform circuit conversion
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+        operands_interleave = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
+
+        # Pathfinder: search for the contraction path, forcing at least
+        # max(16, size) slices.
+        network = cutn.Network(*operands_interleave, options=network_opts)
+        path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
+
+        # Execution: contract the slice range assigned to this rank.
+        num_slices = opt_info.num_slices
+        chunk, extra = divmod(num_slices, size)
+        slice_begin = rank * chunk + min(rank, extra)
+        slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+        result = network.contract(slices=range(slice_begin, slice_end))
+    finally:
+        # Release the handle on every exit path.
+        cutn.destroy(handle)
+
+    return result, rank
 
 def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
     """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI.
@@ -22,29 +370,59 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
     """
 
     from mpi4py import MPI  # this line initializes MPI
-
+    import socket
+    # Get the hostname
+    #hostname = socket.gethostname()
+    
     ncpu_threads = multiprocessing.cpu_count() // 2
-
+    
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
+    size = comm.Get_size()
+    #mem_avail = cp.cuda.Device().mem_info[0]
+    #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
     device_id = rank % getDeviceCount()
     cp.cuda.Device(device_id).use()
 
     handle = cutn.create()
-    cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
     network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
-
+    #mem_avail = cp.cuda.Device().mem_info[0]
+    #print("Mem avail: aft network opts",mem_avail, "rank =",rank)
+    cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
+    #mem_avail = cp.cuda.Device().mem_info[0]
+    #print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
     # Perform circuit conversion
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    #mem_avail = cp.cuda.Device().mem_info[0]
+    #print("Mem avail: aft convetor",mem_avail, "rank =",rank)
     operands_interleave = myconvertor.state_vector_operands()
+    #mem_avail = cp.cuda.Device().mem_info[0]
+    #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
 
     # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object.
     network = cutn.Network(*operands_interleave, options=network_opts)
-    network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads})
-
+    #mem_avail = cp.cuda.Device().mem_info[0]
+    #print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
+    network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
+    #mem_avail = cp.cuda.Device().mem_info[0]
+    #print("Mem avail: aft contract path",mem_avail, "rank =",rank)
     # Execution: To execute the contraction using the optimal path found previously
+    #print("opt_cost",opt_info.opt_cost, "Process =",rank)
+
+    '''
+    path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
+
+    num_slices = opt_info.num_slices#Andy
+    chunk, extra = num_slices // size, num_slices % size#Andy
+    slice_begin = rank * chunk + min(rank, extra)#Andy
+    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
+    slices = range(slice_begin, slice_end)#Andy
+    result = network.contract(slices=slices)
+    '''
     result = network.contract()
 
+    #mem_avail = cp.cuda.Device().mem_info[0]
+    #print("Mem avail: aft contract",mem_avail, "rank =",rank)
     cutn.destroy(handle)
 
     return result, rank
@@ -57,3 +435,19 @@ def eval_mps(qibo_circ, gate_algo, datatype):
     return mps_helper.contract_state_vector(
         myconvertor.mps_tensors, {"handle": myconvertor.handle}
     )
+
+def PauliStringGen(nqubits):
+    """Return the Pauli string of length ``nqubits`` cycling through 'XXXZ'.
+
+    Raises:
+        ValueError: If ``nqubits`` is zero or negative.
+    """
+    if nqubits <= 0:
+        raise ValueError("Invalid input. N should be a positive integer.")
+
+    # Repeating pattern; swap in 'IXYZ' to cycle through all four Paulis.
+    characters = 'XXXZ'
+
+    # Repeat the pattern often enough, then cut to the requested length.
+    repeats = -(-nqubits // len(characters))
+    return (characters * repeats)[:nqubits]
\ No newline at end of file