QuEST-Kit · thedaemon-wizard · Jun 11, 2026
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
@@ -17,6 +17,7 @@
 #include "quest/src/core/autodeployer.hpp"
 #include "quest/src/core/validation.hpp"
 #include "quest/src/core/randomiser.hpp"
+#include "quest/src/core/accelerator.hpp"
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/cpu/cpu_config.hpp"
 #include "quest/src/gpu/gpu_config.hpp"
@@ -459,6 +460,9 @@ void finalizeQuESTEnv() {
     // calling this will not automatically
     // free the memory of existing Quregs
 
+    // free the persistent fused-multi-swap staging workspace (host or device), if any
+    accel_clearFusedSwapSendCache();
+
     if (global_envPtr->isGpuAccelerated)
         gpu_clearCache(); // syncs first
 

diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
@@ -242,6 +242,45 @@ void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
 }
 
 
+void exchangeArraysWithMultiplePartners(
+    qcomp* send, qcomp* recv,
+    const vector<int>& partnerRanks,
+    const vector<qindex>& sendInds, const vector<qindex>& recvInds,
+    qindex numAmpsPerBlock
+) {
+#if QUEST_COMPILE_MPI
+
+    MPI_Comm mpiComm = comm_getMpiComm();
+
+    // each per-partner block is divided into power-of-2 messages; we issue ALL of every
+    // partner's asynchronous send/receives up-front (a personalised all-to-all) and wait
+    // once, so that a fused multi-swap completes in a single communication round
+    auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(numAmpsPerBlock);
+
+    vector<MPI_Request> requests(2 * partnerRanks.size() * numMessages, MPI_REQUEST_NULL);
+    size_t r = 0;
+
+    for (size_t b=0; b<partnerRanks.size(); b++) {
+        int pairRank = partnerRanks[b];
+
+        // messages to/from distinct partners share tags safely (distinguished by rank);
+        // within a partner, unique tags permit out-of-order arrival (UCX adaptive-routing)
+        for (qindex m=0; m<numMessages; m++) {
+            int tag = static_cast<int>(m);
+            MPI_Irecv(&recv[recvInds[b] + m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[r++]);
+            MPI_Isend(&send[sendInds[b] + m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[r++]);
+        }
+    }
+
+    // single wait completes the whole round (MPI will automatically free the request memory)
+    MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
+
+#else
+    error_commButEnvNotDistributed();
+#endif
+}
+
+
 
 /*
  * PRIVATE ASYNC SEND AND RECEIVE
@@ -528,11 +567,50 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) {
 
     if (qureg.isGpuAccelerated)
         exchangeGpuSubBuffers(qureg, numAmps, pairRank);
-    else 
+    else
         exchangeArrays(&qureg.cpuCommBuffer[sendInd], &qureg.cpuCommBuffer[recvInd], numAmps, pairRank);
 }
 
 
+void comm_exchangeAmpsToBuffersForFusedSwap(
+    Qureg qureg, qcomp* sendBuf,
+    vector<int> partnerRanks, vector<qindex> blockSendInds, vector<qindex> blockRecvInds,
+    qindex numAmpsPerBlock
+) {
+    assert_commQuregIsDistributed(qureg);
+    for (int pairRank : partnerRanks)
+        assert_pairRankIsDistinct(qureg, pairRank);
+
+    // 'sendBuf' is a contiguous staging buffer (in the qureg's memory space) holding one
+    // packed block per subcube partner; received blocks are written into the qureg's
+    // commBuffer. The total staged payload is < numAmpsPerNode (the self-block never moves).
+    qindex sendSpan = 0;
+    qindex recvSpan = 0;
+    for (size_t b=0; b<partnerRanks.size(); b++) {
+        sendSpan = std::max(sendSpan, blockSendInds[b] + numAmpsPerBlock);
+        recvSpan = std::max(recvSpan, blockRecvInds[b] + numAmpsPerBlock);
+    }
+
+    // non-GPU quregs exchange host staging buffer to host commBuffer directly
+    if (!qureg.isGpuAccelerated) {
+        exchangeArraysWithMultiplePartners(sendBuf, qureg.cpuCommBuffer, partnerRanks, blockSendInds, blockRecvInds, numAmpsPerBlock);
+        return;
+    }
+
+    // GPU quregs exchange VRAM directly when supported
+    if (gpu_isDirectGpuCommPossible()) {
+        gpu_sync();
+        exchangeArraysWithMultiplePartners(sendBuf, qureg.gpuCommBuffer, partnerRanks, blockSendInds, blockRecvInds, numAmpsPerBlock);
+        return;
+    }
+
+    // otherwise route VRAM through RAM, reusing cpuAmps (mere host mirror in GPU mode) as send scratch
+    gpu_copyGpuToCpu(qureg, sendBuf, qureg.cpuAmps, sendSpan);
+    exchangeArraysWithMultiplePartners(qureg.cpuAmps, qureg.cpuCommBuffer, partnerRanks, blockSendInds, blockRecvInds, numAmpsPerBlock);
+    gpu_copyCpuToGpu(qureg, qureg.cpuCommBuffer, qureg.gpuCommBuffer, recvSpan);
+}
+
+
 void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank) {
 
     auto [sendInd, recvInd] = getSubBufferSendRecvInds(qureg);

diff --git a/quest/src/comm/comm_routines.hpp b/quest/src/comm/comm_routines.hpp
@@ -31,6 +31,8 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank);
 
 void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank);
 
+void comm_exchangeAmpsToBuffersForFusedSwap(Qureg qureg, qcomp* sendBuf, vector<int> partnerRanks, vector<qindex> blockSendInds, vector<qindex> blockRecvInds, qindex numAmpsPerBlock);
+
 void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank);
 
 void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank);

diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
@@ -246,6 +246,71 @@ qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int
 }
 
 
+// persistent, lazily-grown staging workspace reused across fused multi-swaps, so we
+// avoid a large (de)allocation (host malloc or cudaMalloc) on every call. This mirrors
+// QuEST's existing 'gpuCache' for the dense-matrix kernel, and the persistent workspaces
+// of cuStateVec / mpiQulacs. It is freed by accel_clearFusedSwapSendCache(), invoked at
+// environment teardown (finalizeQuESTEnv).
+static qcomp* fusedSwapSendCache    = nullptr;
+static qindex fusedSwapSendCacheLen = 0;
+static bool   fusedSwapSendCacheIsGpu = false;
+
+
+qcomp* accel_allocFusedSwapSendBuffer(Qureg qureg, qindex numAmps) {
+
+    // discard a stale cache if the memory space (RAM vs VRAM) has changed
+    if (fusedSwapSendCache != nullptr && fusedSwapSendCacheIsGpu != qureg.isGpuAccelerated)
+        accel_clearFusedSwapSendCache();
+
+    // reuse the existing workspace when already large enough
+    if (numAmps <= fusedSwapSendCacheLen)
+        return fusedSwapSendCache;
+
+    // otherwise grow it (freeing the old, smaller buffer first)
+    if (fusedSwapSendCache != nullptr)
+        (fusedSwapSendCacheIsGpu)? gpu_deallocArray(fusedSwapSendCache) : cpu_deallocArray(fusedSwapSendCache);
+
+    fusedSwapSendCacheIsGpu = qureg.isGpuAccelerated;
+    fusedSwapSendCache = (fusedSwapSendCacheIsGpu)? gpu_allocArray(numAmps) : cpu_allocArray(numAmps);
+    fusedSwapSendCacheLen = numAmps;
+    return fusedSwapSendCache;
+}
+
+
+void accel_deallocFusedSwapSendBuffer(Qureg qureg, qcomp* buffer) {
+
+    // no-op: the staging workspace persists and is reused by subsequent fused
+    // multi-swaps; it is released at environment teardown. Kept for call-site symmetry.
+    (void) qureg;
+    (void) buffer;
+}
+
+
+void accel_clearFusedSwapSendCache() {
+
+    if (fusedSwapSendCache == nullptr)
+        return;
+
+    (fusedSwapSendCacheIsGpu)? gpu_deallocArray(fusedSwapSendCache) : cpu_deallocArray(fusedSwapSendCache);
+    fusedSwapSendCache = nullptr;
+    fusedSwapSendCacheLen = 0;
+}
+
+
+void accel_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset) {
+
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsForFusedSwap, qureg, qubits.size() );
+    func(qureg, qubits, qubitStates, sendBuf, sendOffset);
+}
+
+
+void accel_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset) {
+
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_unpackAmpsForFusedSwap, qureg, qubits.size() );
+    func(qureg, qubits, qubitStates, recvOffset);
+}
+
+
 
 /*
  * SWAPS

diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
@@ -173,6 +173,13 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL
 
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
+qcomp* accel_allocFusedSwapSendBuffer(Qureg qureg, qindex numAmps);
+void accel_deallocFusedSwapSendBuffer(Qureg qureg, qcomp* buffer);
+void accel_clearFusedSwapSendCache();
+
+void accel_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset);
+void accel_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset);
+
 
 /*
  * SWAPS

diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
@@ -34,6 +34,7 @@
 #include <complex>
 #include <algorithm>
 #include <unordered_map>
+#include <cstdlib>
 
 using std::vector;
 using std::tuple;
@@ -893,32 +894,149 @@ void localiser_statevec_anyCtrlSwap(Qureg qureg, ConstList64 ctrls, ConstList64
  */
 
 
+void fusedMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 prefixTargs, ConstList64 suffixTargs) {
+
+    // we fuse 'numPairs' (>= 2) disjoint prefix<->suffix SWAPs into a single communication
+    // round. The 2^numPairs ranks differing from ours only in the swapped prefix qubits'
+    // rank-bits form a subcube; identifying a rank by the 'numPairs'-bit address of those
+    // rank-bits, the multi-swap relocates a local amp with suffix-target bit-pattern 'v' on
+    // rank-address 'a' to rank-address 'v' with new pattern 'a'. Hence the block with v==a
+    // stays put, and for each of the 2^numPairs - 1 partners we swap the block whose suffix
+    // pattern equals the partner's address. These index sets are disjoint, so all partner
+    // exchanges proceed concurrently. See arXiv:2311.01512, arXiv:quant-ph/0608239.
+
+    int numPairs = prefixTargs.size();
+
+    // rank-bit positions of the prefix (global) targets, and our rank's bit values there
+    vector<int> rankBitInds(numPairs);
+    vector<int> myAddrBits(numPairs);
+    for (int i=0; i<numPairs; i++) {
+        rankBitInds[i] = util_getPrefixInd(prefixTargs[i], qureg);
+        myAddrBits[i]  = util_getRankBitOfQubit(prefixTargs[i], qureg);
+    }
+
+    // each control and each swap-target halves the per-partner block of swapped local amps
+    qindex numAmpsPerBlock = qureg.numAmpsPerNode / powerOf2(numPairs + ctrls.size());
+
+    // packing constrains the swap-targets (to a partner-specific pattern) and the controls
+    List64 packQubits = ctrls;
+    for (int i=0; i<numPairs; i++)
+        packQubits.push_back(suffixTargs[i]);
+
+    // there are 2^numPairs - 1 partners (non-empty subsets of flipped rank-bits)
+    int numPartners = powerOf2(numPairs) - 1;
+    vector<int>    partnerRanks(numPartners);
+    vector<qindex> blockInds(numPartners);
+
+    // stage all partners' send-blocks contiguously in a temporary buffer (< one node's state)
+    qcomp* sendBuf = accel_allocFusedSwapSendBuffer(qureg, numPartners * numAmpsPerBlock);
+
+    // pack each partner's block, identified by subset 'sub' of flipped rank-bits
+    for (int sub=1; sub<=numPartners; sub++) {
+
+        int partner = sub - 1;
+        blockInds[partner] = partner * numAmpsPerBlock;
+
+        int pairRank = qureg.rank;
+        List64 packStates = ctrlStates;
+
+        for (int i=0; i<numPairs; i++) {
+            int flip = (sub >> i) & 1;
+            if (flip)
+                pairRank = flipBit(pairRank, rankBitInds[i]);
+
+            // partner's block holds local amps whose suffix-target[i] == partner's address bit i
+            packStates.push_back(myAddrBits[i] ^ flip);
+        }
+
+        partnerRanks[partner] = pairRank;
+        accel_statevec_packAmpsForFusedSwap(qureg, packQubits, packStates, sendBuf, blockInds[partner]);
+    }
+
+    // single all-to-all round: send each block to its partner, receive theirs into commBuffer
+    comm_exchangeAmpsToBuffersForFusedSwap(qureg, sendBuf, partnerRanks, blockInds, blockInds, numAmpsPerBlock);
+
+    // scatter each received block back into the same local indices it was packed from
+    for (int sub=1; sub<=numPartners; sub++) {
+
+        int partner = sub - 1;
+        List64 packStates = ctrlStates;
+        for (int i=0; i<numPairs; i++)
+            packStates.push_back(myAddrBits[i] ^ ((sub >> i) & 1));
+
+        accel_statevec_unpackAmpsForFusedSwap(qureg, packQubits, packStates, blockInds[partner]);
+    }
+
+    accel_deallocFusedSwapSendBuffer(qureg, sendBuf);
+}
+
+
+void multiSwapSequentially(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 prefixTargs, ConstList64 suffixTargs) {
+
+    // reference (pre-fusion) behaviour: perform each disjoint prefix<->suffix SWAP one-at-a-time,
+    // wastefully relocating amplitudes once per swap. Retained for correctness comparison and
+    // benchmarking against the fused single-round path; never the default at runtime.
+    for (size_t i=0; i<prefixTargs.size(); i++)
+        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
+}
+
+
+bool localiser_isFusedSwapDisabled() {
+
+    // honour an optional environment toggle (QUEST_DISABLE_SWAP_FUSION=1) so benchmarks can
+    // compare the fused single-round path against the legacy sequential path without recompiling.
+    // read once and cached; default (unset) keeps fusion enabled.
+    static int cached = -1;
+    if (cached == -1) {
+        const char* env = std::getenv("QUEST_DISABLE_SWAP_FUSION");
+        cached = (env != nullptr && env[0] == '1') ? 1 : 0;
+    }
+    return cached == 1;
+}
+
+
 void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targsA, ConstList64 targsB) {
 
     // this is an internal function called by the below routines which require
     // performing a sequence of SWAPs to reorder qubits, or move them into suffix.
     // the SWAPs act on unique qubit pairs and so commute.
 
     /// @todo
-    ///   - the sequence of pair-wise full-swaps should be more efficient as a
-    ///     "single" sequence of smaller messages sending amps directly to their
-    ///     final destination node. This could use a new "multiSwap" function.
     ///   - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
-    ///     multiSwap function should use custatevecSwapIndexBits() if local,
-    ///     or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
+    ///     multiSwap could instead use custatevecSwapIndexBits() if local, or
+    ///     custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
     ///     although the latter requires substantially more work like setting up
     ///     a communicator which may be inelegant alongside our own distribution scheme.
 
-    // perform necessary swaps to move all targets into suffix, each of which invokes communication
-    for (size_t i=0; i<targsA.size(); i++) {
+    // collect the disjoint pairs needing communication; for each, the suffix (local)
+    // qubit is the smaller index and the prefix (global) qubit is the larger
+    List64 prefixTargs = lists_getEmptyList64();
+    List64 suffixTargs = lists_getEmptyList64();
 
+    for (size_t i=0; i<targsA.size(); i++) {
         if (targsA[i] == targsB[i])
             continue;
+        suffixTargs.push_back(std::min(targsA[i], targsB[i]));
+        prefixTargs.push_back(std::max(targsA[i], targsB[i]));
+    }
 
-        int suffixTarg = std::min(targsA[i], targsB[i]);
-        int prefixTarg = std::max(targsA[i], targsB[i]);
-        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
+    int numPairs = prefixTargs.size();
+
+    // nothing to do, or a single swap reduces to the existing routine (needs no staging buffer)
+    if (numPairs == 0)
+        return;
+    if (numPairs == 1) {
+        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[0], prefixTargs[0]);
+        return;
     }
+
+    // multiple swaps are fused, sending each amplitude directly to its final node in a single
+    // round, rather than wastefully relocating it once per sequential swap. The benchmark toggle
+    // permits comparing against the legacy sequential behaviour.
+    if (localiser_isFusedSwapDisabled())
+        multiSwapSequentially(qureg, ctrls, ctrlStates, prefixTargs, suffixTargs);
+    else
+        fusedMultiSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, prefixTargs, suffixTargs);
 }