Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions quest/src/api/environment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "quest/src/core/autodeployer.hpp"
#include "quest/src/core/validation.hpp"
#include "quest/src/core/randomiser.hpp"
#include "quest/src/core/accelerator.hpp"
#include "quest/src/comm/comm_config.hpp"
#include "quest/src/cpu/cpu_config.hpp"
#include "quest/src/gpu/gpu_config.hpp"
Expand Down Expand Up @@ -459,6 +460,9 @@ void finalizeQuESTEnv() {
// calling this will not automatically
// free the memory of existing Quregs

// free the persistent fused-multi-swap staging workspace (host or device), if any
accel_clearFusedSwapSendCache();

if (global_envPtr->isGpuAccelerated)
gpu_clearCache(); // syncs first

Expand Down
80 changes: 79 additions & 1 deletion quest/src/comm/comm_routines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,45 @@ void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
}


void exchangeArraysWithMultiplePartners(
qcomp* send, qcomp* recv,
const vector<int>& partnerRanks,
const vector<qindex>& sendInds, const vector<qindex>& recvInds,
qindex numAmpsPerBlock
) {
#if QUEST_COMPILE_MPI

MPI_Comm mpiComm = comm_getMpiComm();

// each per-partner block is divided into power-of-2 messages; we issue ALL of every
// partner's asynchronous send/receives up-front (a personalised all-to-all) and wait
// once, so that a fused multi-swap completes in a single communication round
auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(numAmpsPerBlock);

vector<MPI_Request> requests(2 * partnerRanks.size() * numMessages, MPI_REQUEST_NULL);
size_t r = 0;

for (size_t b=0; b<partnerRanks.size(); b++) {
int pairRank = partnerRanks[b];

// messages to/from distinct partners share tags safely (distinguished by rank);
// within a partner, unique tags permit out-of-order arrival (UCX adaptive-routing)
for (qindex m=0; m<numMessages; m++) {
int tag = static_cast<int>(m);
MPI_Irecv(&recv[recvInds[b] + m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[r++]);
MPI_Isend(&send[sendInds[b] + m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[r++]);
}
}

// single wait completes the whole round (MPI will automatically free the request memory)
MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);

#else
error_commButEnvNotDistributed();
#endif
}



/*
* PRIVATE ASYNC SEND AND RECEIVE
Expand Down Expand Up @@ -528,11 +567,50 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) {

if (qureg.isGpuAccelerated)
exchangeGpuSubBuffers(qureg, numAmps, pairRank);
else
else
exchangeArrays(&qureg.cpuCommBuffer[sendInd], &qureg.cpuCommBuffer[recvInd], numAmps, pairRank);
}


void comm_exchangeAmpsToBuffersForFusedSwap(
Qureg qureg, qcomp* sendBuf,
vector<int> partnerRanks, vector<qindex> blockSendInds, vector<qindex> blockRecvInds,
qindex numAmpsPerBlock
) {
assert_commQuregIsDistributed(qureg);
for (int pairRank : partnerRanks)
assert_pairRankIsDistinct(qureg, pairRank);

// 'sendBuf' is a contiguous staging buffer (in the qureg's memory space) holding one
// packed block per subcube partner; received blocks are written into the qureg's
// commBuffer. The total staged payload is < numAmpsPerNode (the self-block never moves).
qindex sendSpan = 0;
qindex recvSpan = 0;
for (size_t b=0; b<partnerRanks.size(); b++) {
sendSpan = std::max(sendSpan, blockSendInds[b] + numAmpsPerBlock);
recvSpan = std::max(recvSpan, blockRecvInds[b] + numAmpsPerBlock);
}

// non-GPU quregs exchange host staging buffer to host commBuffer directly
if (!qureg.isGpuAccelerated) {
exchangeArraysWithMultiplePartners(sendBuf, qureg.cpuCommBuffer, partnerRanks, blockSendInds, blockRecvInds, numAmpsPerBlock);
return;
}

// GPU quregs exchange VRAM directly when supported
if (gpu_isDirectGpuCommPossible()) {
gpu_sync();
exchangeArraysWithMultiplePartners(sendBuf, qureg.gpuCommBuffer, partnerRanks, blockSendInds, blockRecvInds, numAmpsPerBlock);
return;
}

// otherwise route VRAM through RAM, reusing cpuAmps (mere host mirror in GPU mode) as send scratch
gpu_copyGpuToCpu(qureg, sendBuf, qureg.cpuAmps, sendSpan);
exchangeArraysWithMultiplePartners(qureg.cpuAmps, qureg.cpuCommBuffer, partnerRanks, blockSendInds, blockRecvInds, numAmpsPerBlock);
gpu_copyCpuToGpu(qureg, qureg.cpuCommBuffer, qureg.gpuCommBuffer, recvSpan);
}


void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank) {

auto [sendInd, recvInd] = getSubBufferSendRecvInds(qureg);
Expand Down
2 changes: 2 additions & 0 deletions quest/src/comm/comm_routines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank);

void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank);

void comm_exchangeAmpsToBuffersForFusedSwap(Qureg qureg, qcomp* sendBuf, vector<int> partnerRanks, vector<qindex> blockSendInds, vector<qindex> blockRecvInds, qindex numAmpsPerBlock);

void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank);

void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank);
Expand Down
65 changes: 65 additions & 0 deletions quest/src/core/accelerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,71 @@ qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int
}


// persistent, lazily-grown staging workspace reused across fused multi-swaps, so we
// avoid a large (de)allocation (host malloc or cudaMalloc) on every call. This mirrors
// QuEST's existing 'gpuCache' for the dense-matrix kernel, and the persistent workspaces
// of cuStateVec / mpiQulacs. It is freed by accel_clearFusedSwapSendCache(), invoked at
// environment teardown (finalizeQuESTEnv).
static qcomp* fusedSwapSendCache = nullptr;
static qindex fusedSwapSendCacheLen = 0;
static bool fusedSwapSendCacheIsGpu = false;


qcomp* accel_allocFusedSwapSendBuffer(Qureg qureg, qindex numAmps) {

// discard a stale cache if the memory space (RAM vs VRAM) has changed
if (fusedSwapSendCache != nullptr && fusedSwapSendCacheIsGpu != qureg.isGpuAccelerated)
accel_clearFusedSwapSendCache();

// reuse the existing workspace when already large enough
if (numAmps <= fusedSwapSendCacheLen)
return fusedSwapSendCache;

// otherwise grow it (freeing the old, smaller buffer first)
if (fusedSwapSendCache != nullptr)
(fusedSwapSendCacheIsGpu)? gpu_deallocArray(fusedSwapSendCache) : cpu_deallocArray(fusedSwapSendCache);

fusedSwapSendCacheIsGpu = qureg.isGpuAccelerated;
fusedSwapSendCache = (fusedSwapSendCacheIsGpu)? gpu_allocArray(numAmps) : cpu_allocArray(numAmps);
fusedSwapSendCacheLen = numAmps;
return fusedSwapSendCache;
}


void accel_deallocFusedSwapSendBuffer(Qureg qureg, qcomp* buffer) {

// no-op: the staging workspace persists and is reused by subsequent fused
// multi-swaps; it is released at environment teardown. Kept for call-site symmetry.
(void) qureg;
(void) buffer;
}


void accel_clearFusedSwapSendCache() {

if (fusedSwapSendCache == nullptr)
return;

(fusedSwapSendCacheIsGpu)? gpu_deallocArray(fusedSwapSendCache) : cpu_deallocArray(fusedSwapSendCache);
fusedSwapSendCache = nullptr;
fusedSwapSendCacheLen = 0;
}


void accel_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset) {

GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsForFusedSwap, qureg, qubits.size() );
func(qureg, qubits, qubitStates, sendBuf, sendOffset);
}


void accel_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset) {

GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_unpackAmpsForFusedSwap, qureg, qubits.size() );
func(qureg, qubits, qubitStates, recvOffset);
}



/*
* SWAPS
Expand Down
7 changes: 7 additions & 0 deletions quest/src/core/accelerator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,13 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstL

qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);

qcomp* accel_allocFusedSwapSendBuffer(Qureg qureg, qindex numAmps);
void accel_deallocFusedSwapSendBuffer(Qureg qureg, qcomp* buffer);
void accel_clearFusedSwapSendCache();

void accel_statevec_packAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qcomp* sendBuf, qindex sendOffset);
void accel_statevec_unpackAmpsForFusedSwap(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates, qindex recvOffset);


/*
* SWAPS
Expand Down
138 changes: 128 additions & 10 deletions quest/src/core/localiser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <complex>
#include <algorithm>
#include <unordered_map>
#include <cstdlib>

using std::vector;
using std::tuple;
Expand Down Expand Up @@ -893,32 +894,149 @@ void localiser_statevec_anyCtrlSwap(Qureg qureg, ConstList64 ctrls, ConstList64
*/


void fusedMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 prefixTargs, ConstList64 suffixTargs) {

// we fuse 'numPairs' (>= 2) disjoint prefix<->suffix SWAPs into a single communication
// round. The 2^numPairs ranks differing from ours only in the swapped prefix qubits'
// rank-bits form a subcube; identifying a rank by the 'numPairs'-bit address of those
// rank-bits, the multi-swap relocates a local amp with suffix-target bit-pattern 'v' on
// rank-address 'a' to rank-address 'v' with new pattern 'a'. Hence the block with v==a
// stays put, and for each of the 2^numPairs - 1 partners we swap the block whose suffix
// pattern equals the partner's address. These index sets are disjoint, so all partner
// exchanges proceed concurrently. See arXiv:2311.01512, arXiv:quant-ph/0608239.

int numPairs = prefixTargs.size();

// rank-bit positions of the prefix (global) targets, and our rank's bit values there
vector<int> rankBitInds(numPairs);
vector<int> myAddrBits(numPairs);
for (int i=0; i<numPairs; i++) {
rankBitInds[i] = util_getPrefixInd(prefixTargs[i], qureg);
myAddrBits[i] = util_getRankBitOfQubit(prefixTargs[i], qureg);
}

// each control and each swap-target halves the per-partner block of swapped local amps
qindex numAmpsPerBlock = qureg.numAmpsPerNode / powerOf2(numPairs + ctrls.size());

// packing constrains the swap-targets (to a partner-specific pattern) and the controls
List64 packQubits = ctrls;
for (int i=0; i<numPairs; i++)
packQubits.push_back(suffixTargs[i]);

// there are 2^numPairs - 1 partners (non-empty subsets of flipped rank-bits)
int numPartners = powerOf2(numPairs) - 1;
vector<int> partnerRanks(numPartners);
vector<qindex> blockInds(numPartners);

// stage all partners' send-blocks contiguously in a temporary buffer (< one node's state)
qcomp* sendBuf = accel_allocFusedSwapSendBuffer(qureg, numPartners * numAmpsPerBlock);

// pack each partner's block, identified by subset 'sub' of flipped rank-bits
for (int sub=1; sub<=numPartners; sub++) {

int partner = sub - 1;
blockInds[partner] = partner * numAmpsPerBlock;

int pairRank = qureg.rank;
List64 packStates = ctrlStates;

for (int i=0; i<numPairs; i++) {
int flip = (sub >> i) & 1;
if (flip)
pairRank = flipBit(pairRank, rankBitInds[i]);

// partner's block holds local amps whose suffix-target[i] == partner's address bit i
packStates.push_back(myAddrBits[i] ^ flip);
}

partnerRanks[partner] = pairRank;
accel_statevec_packAmpsForFusedSwap(qureg, packQubits, packStates, sendBuf, blockInds[partner]);
}

// single all-to-all round: send each block to its partner, receive theirs into commBuffer
comm_exchangeAmpsToBuffersForFusedSwap(qureg, sendBuf, partnerRanks, blockInds, blockInds, numAmpsPerBlock);

// scatter each received block back into the same local indices it was packed from
for (int sub=1; sub<=numPartners; sub++) {

int partner = sub - 1;
List64 packStates = ctrlStates;
for (int i=0; i<numPairs; i++)
packStates.push_back(myAddrBits[i] ^ ((sub >> i) & 1));

accel_statevec_unpackAmpsForFusedSwap(qureg, packQubits, packStates, blockInds[partner]);
}

accel_deallocFusedSwapSendBuffer(qureg, sendBuf);
}


void multiSwapSequentially(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 prefixTargs, ConstList64 suffixTargs) {

// reference (pre-fusion) behaviour: perform each disjoint prefix<->suffix SWAP one-at-a-time,
// wastefully relocating amplitudes once per swap. Retained for correctness comparison and
// benchmarking against the fused single-round path; never the default at runtime.
for (size_t i=0; i<prefixTargs.size(); i++)
anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
}


bool localiser_isFusedSwapDisabled() {

// honour an optional environment toggle (QUEST_DISABLE_SWAP_FUSION=1) so benchmarks can
// compare the fused single-round path against the legacy sequential path without recompiling.
// read once and cached; default (unset) keeps fusion enabled.
static int cached = -1;
if (cached == -1) {
const char* env = std::getenv("QUEST_DISABLE_SWAP_FUSION");
cached = (env != nullptr && env[0] == '1') ? 1 : 0;
}
return cached == 1;
}


void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targsA, ConstList64 targsB) {

// this is an internal function called by the below routines which require
// performing a sequence of SWAPs to reorder qubits, or move them into suffix.
// the SWAPs act on unique qubit pairs and so commute.

/// @todo
/// - the sequence of pair-wise full-swaps should be more efficient as a
/// "single" sequence of smaller messages sending amps directly to their
/// final destination node. This could use a new "multiSwap" function.
/// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
/// multiSwap function should use custatevecSwapIndexBits() if local,
/// or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
/// multiSwap could instead use custatevecSwapIndexBits() if local, or
/// custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
/// although the latter requires substantially more work like setting up
/// a communicator which may be inelegant alongside our own distribution scheme.

// perform necessary swaps to move all targets into suffix, each of which invokes communication
for (size_t i=0; i<targsA.size(); i++) {
// collect the disjoint pairs needing communication; for each, the suffix (local)
// qubit is the smaller index and the prefix (global) qubit is the larger
List64 prefixTargs = lists_getEmptyList64();
List64 suffixTargs = lists_getEmptyList64();

for (size_t i=0; i<targsA.size(); i++) {
if (targsA[i] == targsB[i])
continue;
suffixTargs.push_back(std::min(targsA[i], targsB[i]));
prefixTargs.push_back(std::max(targsA[i], targsB[i]));
}

int suffixTarg = std::min(targsA[i], targsB[i]);
int prefixTarg = std::max(targsA[i], targsB[i]);
anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTarg, prefixTarg);
int numPairs = prefixTargs.size();

// nothing to do, or a single swap reduces to the existing routine (needs no staging buffer)
if (numPairs == 0)
return;
if (numPairs == 1) {
anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[0], prefixTargs[0]);
return;
}

// multiple swaps are fused, sending each amplitude directly to its final node in a single
// round, rather than wastefully relocating it once per sequential swap. The benchmark toggle
// permits comparing against the legacy sequential behaviour.
if (localiser_isFusedSwapDisabled())
multiSwapSequentially(qureg, ctrls, ctrlStates, prefixTargs, suffixTargs);
else
fusedMultiSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, prefixTargs, suffixTargs);
}


Expand Down
Loading