Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions quest/src/comm/comm_routines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,56 @@ void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
}


void exchangeSubBufferChunks(Qureg qureg, const vector<int>& pairRanks, const vector<qindex>& recvTagBases, qindex sendTagBase, qindex chunkSize) {
#if QUEST_COMPILE_MPI

    // with no partner ranks, there is nothing to send or receive
    if (pairRanks.empty())
        return;

    MPI_Comm communicator = comm_getMpiComm();

    // chunk c occupies [c*chunkSize, (c+1)*chunkSize) relative to these starts
    qindex sendStart = getSubBufferSendInd(qureg);
    qindex recvStart = getBufferRecvInd();

    // every chunk is divided into the same number of equally-sized messages
    auto [ampsPerMessage, messagesPerChunk] = dividePow2PayloadIntoMessages(chunkSize);

    // tags are (tagBase * messagesPerChunk + messageIndex), so the largest
    // tag base (among the send base and all receive bases) bounds the tags used
    qindex largestTagBase = sendTagBase;
    for (size_t i=0; i<recvTagBases.size(); i++)
        if (recvTagBases[i] > largestTagBase)
            largestTagBase = recvTagBases[i];

    qindex numDistinctTags = messagesPerChunk * (largestTagBase + 1);
    if (numDistinctTags > getMaxNumMessages())
        error_commNumMessagesExceedTagMax();

    // one receive and one send request per message of every chunk
    qindex numChunks = (qindex) pairRanks.size();
    vector<MPI_Request> requests(2 * messagesPerChunk * numChunks, MPI_REQUEST_NULL);
    MPI_Request* nextRequest = requests.data();

    qindex sendTagStart = sendTagBase * messagesPerChunk;

    for (qindex chunk=0; chunk<numChunks; chunk++) {

        int partner = pairRanks[chunk];
        qindex recvTagStart = recvTagBases[chunk] * messagesPerChunk;

        for (qindex msg=0; msg<messagesPerChunk; msg++) {

            qindex offset = chunk*chunkSize + msg*ampsPerMessage;
            int recvTag = static_cast<int>(recvTagStart + msg);
            int sendTag = static_cast<int>(sendTagStart + msg);

            // post the receive before the matching send, both non-blocking
            MPI_Irecv(
                &qureg.cpuCommBuffer[recvStart + offset],
                ampsPerMessage, MPI_QCOMP, partner, recvTag, communicator, nextRequest++);

            MPI_Isend(
                &qureg.cpuCommBuffer[sendStart + offset],
                ampsPerMessage, MPI_QCOMP, partner, sendTag, communicator, nextRequest++);
        }
    }

    // block until every posted send and receive has completed
    MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);

#else
    error_commButEnvNotDistributed();
#endif
}



/*
* PRIVATE ASYNC SEND AND RECEIVE
Expand Down Expand Up @@ -533,6 +583,25 @@ void comm_exchangeSubBuffers(Qureg qureg, qindex numAmps, int pairRank) {
}


void comm_exchangeSubBufferChunks(Qureg qureg, const vector<int>& pairRanks, const vector<qindex>& recvTagBases, qindex sendTagBase, qindex chunkSize) {

    // the chunks are contiguous, so the exchanged region spans one chunk per pair rank
    qindex sendStart = getSubBufferSendInd(qureg);
    qindex recvStart = getBufferRecvInd();
    qindex numAmpsExchanged = chunkSize * pairRanks.size();

    // validate the buffer regions and the Qureg before any communication
    assert_commBoundsAreValid(qureg, sendStart, recvStart, numAmpsExchanged);
    assert_bufferSendRecvDoesNotOverlap(sendStart, recvStart, numAmpsExchanged);
    assert_commQuregIsDistributed(qureg);

    // every pair rank needs exactly one receive tag base
    if (recvTagBases.size() != pairRanks.size())
        error_commGivenInconsistentNumSubArraysANodes();

    for (size_t i=0; i<pairRanks.size(); i++) {
        int pairRank = pairRanks[i];
        assert_pairRankIsDistinct(qureg, pairRank);
    }

    exchangeSubBufferChunks(qureg, pairRanks, recvTagBases, sendTagBase, chunkSize);
}


void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank) {

auto [sendInd, recvInd] = getSubBufferSendRecvInds(qureg);
Expand Down
4 changes: 3 additions & 1 deletion quest/src/comm/comm_routines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank);

void comm_exchangeSubBuffers(Qureg qureg, qindex numAmpsAndRecvInd, int pairRank);

// Exchanges one chunk of chunkSize buffer elements with each rank in pairRanks,
// where chunk c (at offset c*chunkSize in the send and receive sub-buffers) is
// paired with pairRanks[c]. Outgoing messages are tagged from sendTagBase, and
// messages from pairRanks[c] are matched using recvTagBases[c]; the two vectors
// must have equal length.
void comm_exchangeSubBufferChunks(Qureg qureg, const vector<int>& pairRanks, const vector<qindex>& recvTagBases, qindex sendTagBase, qindex chunkSize);

void comm_asynchSendSubBuffer(Qureg qureg, qindex numElems, int pairRank);

void comm_receiveArrayToBuffer(Qureg qureg, qindex numElems, int pairRank);
Expand Down Expand Up @@ -81,4 +83,4 @@ vector<std::string> comm_gatherStringsToRoot(char* localChars, int maxNumLocalCh



#endif // COMM_ROUTINES_HPP
#endif // COMM_ROUTINES_HPP
110 changes: 105 additions & 5 deletions quest/src/core/localiser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@
#include "quest/src/core/localiser.hpp"
#include "quest/src/core/accelerator.hpp"
#include "quest/src/comm/comm_config.hpp"
#include "quest/src/comm/comm_indices.hpp"
#include "quest/src/comm/comm_routines.hpp"
#include "quest/src/cpu/cpu_config.hpp"
#include "quest/src/cpu/cpu_subroutines.hpp"
#include "quest/src/gpu/gpu_config.hpp"

#include <tuple>
Expand Down Expand Up @@ -893,32 +895,130 @@ void localiser_statevec_anyCtrlSwap(Qureg qureg, ConstList64 ctrls, ConstList64
*/


qindex getBitMaskOfQubitsInPattern(ConstList64 qubits, qindex pattern) {

    // the i-th bit of pattern is written to bit position qubits[i] of the
    // result; all other bits of the result stay zero
    qindex result = 0;

    for (size_t ind=0; ind<qubits.size(); ind++) {
        auto patternBit = getBit(pattern, ind);
        result = setBit(result, qubits[ind], patternBit);
    }

    return result;
}


qindex getRankPatternInPrefixInds(int rank, ConstList64 prefixInds) {

    // gathers the bits of rank found at positions prefixInds[0], prefixInds[1], ...
    // into the contiguous low-order bits 0, 1, ... of the returned pattern
    qindex gathered = 0;

    for (size_t ind=0; ind<prefixInds.size(); ind++) {
        auto rankBit = getBit(rank, prefixInds[ind]);
        gathered = setBit(gathered, ind, rankBit);
    }

    return gathered;
}


int getRankWithPrefixIndsInPattern(int rank, ConstList64 prefixInds, qindex pattern) {

    // scatters the low-order bits 0, 1, ... of pattern into rank at bit
    // positions prefixInds[0], prefixInds[1], ..., leaving other bits of rank as given
    for (size_t ind=0; ind<prefixInds.size(); ind++) {
        auto patternBit = getBit(pattern, ind);
        rank = setBit(rank, prefixInds[ind], patternBit);
    }

    return rank;
}


void multiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 suffixTargs, ConstList64 prefixInds) {

    // suffixTargs[i] is paired with prefixInds[i]. A "pattern" assigns one bit
    // to each pair; this node's own pattern is read from the bits of its rank
    // at prefixInds. Amps whose suffixTargs bits form pattern p are exchanged
    // directly with the node whose prefixInds bits form p.

    auto orderedSuffixTargs = util_getSorted(suffixTargs);
    qindex myPattern = getRankPatternInPrefixInds(qureg.rank, prefixInds);
    qindex patternCount = powerOf2(suffixTargs.size());

    // enumerate every pattern other than our own, alongside the rank holding it
    vector<qindex> otherPatterns;
    vector<int> otherRanks;

    for (qindex p=0; p<patternCount; p++)
        if (p != myPattern) {
            otherPatterns.push_back(p);
            otherRanks.push_back(getRankWithPrefixIndsInPattern(qureg.rank, prefixInds, p));
        }

    // each pattern selects an equal share of the local amps; at most half of
    // the patterns are exchanged per wave
    qindex ampsPerChunk = qureg.numAmpsPerNode / patternCount;
    qindex waveCapacity = patternCount / 2;
    qindex sendStart = getSubBufferSendInd(qureg);
    qindex recvStart = getBufferRecvInd();
    qindex numOthers = (qindex) otherPatterns.size();

    for (qindex waveStart=0; waveStart<numOthers; waveStart += waveCapacity) {

        qindex waveSize = std::min(waveCapacity, numOthers - waveStart);

        // the partner ranks of this wave, and their patterns (which double as receive tag bases)
        vector<int> waveRanks(otherRanks.begin() + waveStart, otherRanks.begin() + waveStart + waveSize);
        vector<qindex> wavePatterns(otherPatterns.begin() + waveStart, otherPatterns.begin() + waveStart + waveSize);

        // pack the amps destined for each partner into consecutive send chunks,
        // remembering each mask for the unpacking below
        vector<qindex> waveMasks;
        for (qindex c=0; c<waveSize; c++) {
            waveMasks.push_back(getBitMaskOfQubitsInPattern(suffixTargs, wavePatterns[c]));
            cpu_statevec_packAmpsIntoBufferAtOffset(qureg, orderedSuffixTargs, waveMasks[c], sendStart + c*ampsPerChunk);
        }

        // our messages are tagged by our own pattern; partners tag theirs likewise
        comm_exchangeSubBufferChunks(qureg, waveRanks, wavePatterns, myPattern, ampsPerChunk);

        // overwrite the amps we sent with those received from the same partner
        for (qindex c=0; c<waveSize; c++)
            cpu_statevec_unpackAmpsFromBufferAtOffset(qureg, orderedSuffixTargs, waveMasks[c], recvStart + c*ampsPerChunk);
    }
}


void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targsA, ConstList64 targsB) {

    // this is an internal function called by the below routines which require
    // performing a sequence of SWAPs to reorder qubits, or move them into suffix.
    // the SWAPs act on unique qubit pairs and so commute.

    /// @todo
    /// - the direct "multiSwap" below is only used for uncontrolled, CPU-only,
    ///   distributed Quregs with at least two non-trivial pairs; all other
    ///   cases still perform a sequence of pair-wise full-swaps.
    /// - if the user has compiled cuQuantum, and Qureg is GPU-accelerated, the
    ///   multiSwap function should use custatevecSwapIndexBits() if local,
    ///   or custatevecDistIndexBitSwapSchedulerSetIndexBitSwaps() if distributed,
    ///   although the latter requires substantially more work like setting up
    ///   a communicator which may be inelegant alongside our own distribution scheme.

    // collect the non-trivial pairs WITHOUT swapping them yet; each pair must be
    // swapped exactly once, by either the multi-swap or the fallback loop below
    List64 suffixTargs = lists_getEmptyList64();
    List64 prefixTargs = lists_getEmptyList64();

    for (size_t i=0; i<targsA.size(); i++) {

        // swapping a qubit with itself is a no-op
        if (targsA[i] == targsB[i])
            continue;

        // the smaller index of each pair is treated as the suffix qubit
        int suffixTarg = std::min(targsA[i], targsB[i]);
        int prefixTarg = std::max(targsA[i], targsB[i]);

        suffixTargs.push_back(suffixTarg);
        prefixTargs.push_back(prefixTarg);
    }

    // where supported, effect all swaps at once by sending amps directly to their destination node
    if (
        suffixTargs.size() >= 2 &&
        ctrls.empty() &&
        qureg.isDistributed &&
        !qureg.isGpuAccelerated
    ) {
        List64 prefixInds = lists_getEmptyList64();
        for (int prefixTarg : prefixTargs)
            prefixInds.push_back(util_getPrefixInd(prefixTarg, qureg));

        multiSwapBetweenPrefixAndSuffix(qureg, suffixTargs, prefixInds);
        return;
    }

    // otherwise, fall back to per-SWAP communication
    for (size_t i=0; i<suffixTargs.size(); i++)
        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, suffixTargs[i], prefixTargs[i]);
}


Expand Down
32 changes: 32 additions & 0 deletions quest/src/cpu/cpu_subroutines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,38 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const
}


void cpu_statevec_packAmpsIntoBufferAtOffset(Qureg qureg, ConstList64 sortedQubits, qindex qubitStateMask, qindex bufferOffset) {

    // copies selected local amps into consecutive buffer elements beginning at
    // bufferOffset; the n-th copied amp has the local index obtained by inserting
    // bits into n at sortedQubits, with values taken from qubitStateMask

    int numInserted = sortedQubits.size();
    qindex numToPack = qureg.numAmpsPerNode / powerOf2(sortedQubits.size());

    cpu_qcomp* localAmps = getCpuQcompPtr(qureg.cpuAmps);
    cpu_qcomp* dest = getCpuQcompPtr(qureg.cpuCommBuffer) + bufferOffset;

    #pragma omp parallel for if(qureg.isMultithreaded)
    for (qindex n=0; n<numToPack; n++) {
        qindex ampInd = insertBitsWithMaskedValues(n, sortedQubits.data(), numInserted, qubitStateMask);
        dest[n] = localAmps[ampInd];
    }
}


void cpu_statevec_unpackAmpsFromBufferAtOffset(Qureg qureg, ConstList64 sortedQubits, qindex qubitStateMask, qindex bufferOffset) {

    // overwrites selected local amps with consecutive buffer elements beginning
    // at bufferOffset; the n-th element is written to the local index obtained by
    // inserting bits into n at sortedQubits, with values taken from qubitStateMask

    int numInserted = sortedQubits.size();
    qindex numToUnpack = qureg.numAmpsPerNode / powerOf2(sortedQubits.size());

    cpu_qcomp* localAmps = getCpuQcompPtr(qureg.cpuAmps);
    cpu_qcomp* src = getCpuQcompPtr(qureg.cpuCommBuffer) + bufferOffset;

    #pragma omp parallel for if(qureg.isMultithreaded)
    for (qindex n=0; n<numToUnpack; n++) {
        qindex ampInd = insertBitsWithMaskedValues(n, sortedQubits.data(), numInserted, qubitStateMask);
        localAmps[ampInd] = src[n];
    }
}


qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {

assert_bufferPackerGivenIncreasingQubits(qubit1, qubit2, qubit3);
Expand Down
6 changes: 5 additions & 1 deletion quest/src/cpu/cpu_subroutines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp

template <int NumQubits> qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);

// Copies numAmpsPerNode / 2^(number of sortedQubits) local amps into the CPU
// communication buffer, contiguously from bufferOffset. The amps are those at
// the indices formed by inserting bits at sortedQubits with values drawn from
// qubitStateMask (presumably the mask's bits at those qubit positions).
void cpu_statevec_packAmpsIntoBufferAtOffset(Qureg qureg, ConstList64 sortedQubits, qindex qubitStateMask, qindex bufferOffset);

// Overwrites numAmpsPerNode / 2^(number of sortedQubits) local amps with the
// CPU communication buffer's elements read contiguously from bufferOffset. The
// amps written are those at the indices formed by inserting bits at sortedQubits
// with values drawn from qubitStateMask (presumably the mask's bits at those
// qubit positions).
void cpu_statevec_unpackAmpsFromBufferAtOffset(Qureg qureg, ConstList64 sortedQubits, qindex qubitStateMask, qindex bufferOffset);

qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);


Expand Down Expand Up @@ -202,4 +206,4 @@ void cpu_statevec_initDebugState_sub(Qureg qureg);
void cpu_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qureg);


#endif // CPU_SUBROUTINES_HPP
#endif // CPU_SUBROUTINES_HPP
Loading