_static/doxygen/qengine__cuda_8hpp_source.html

 //

 // (C) Daniel Strano and the Qrack contributors 2017-2023. All rights reserved.

 //

 // This is a multithreaded, universal quantum register simulation, allowing

 // (nonphysical) register cloning and direct measurement of probability and

 // phase, to leverage what advantages classical emulation of qubits can have.

 //

 // Licensed under the GNU Lesser General Public License V3.

 // See LICENSE.md in the project root or https://www.gnu.org/licenses/lgpl-3.0.en.html

 // for details.


 #pragma once


 #include "common/cudaengine.cuh"

 #include "qengine.hpp"

 #include "qengine_gpu_util.hpp"


 #if !ENABLE_CUDA

 #error CUDA has not been enabled

 #endif


 #include <list>


 // Element count of the bitCapIntOcl argument buffer ("ulongBuffer") that each PoolItem allocates;
 // also the array bound of the "bciArgs" parameter of ArithmeticCall()/CArithmeticCall().
 #define BCI_ARG_LEN 10

 // Element count of the complex argument buffer ("cmplxBuffer") that each PoolItem allocates.
 #define CMPLX_NORM_LEN 6

 // Element count of the real1 argument buffer ("realBuffer") that each PoolItem allocates.
 #define REAL_ARG_LEN 2


 namespace Qrack {


 // Bit-field type for the CL_MAP_* flags below (used by LockSync()).
 typedef unsigned long cl_map_flags;

 // Bit-field type for the CL_MEM_* flags below (used by MakeBuffer()/AllocRaw()/FreeRaw()).
 typedef unsigned long cl_mem_flags;


 // clang-format off

 // Mapping-direction flags; LockSync() defaults to (CL_MAP_READ | CL_MAP_WRITE).
 // NOTE(review): names and values appear to mirror the OpenCL constants of the same names - confirm.
 #define CL_MAP_READ                                 (1 << 0)

 #define CL_MAP_WRITE                                (1 << 1)


 // Buffer-creation flags. Within this header, only CL_MEM_USE_HOST_PTR and CL_MEM_COPY_HOST_PTR are
 // inspected (see QEngineCUDA::AllocRaw() and QEngineCUDA::FreeRaw()).
 #define CL_MEM_READ_WRITE                           (1 << 0)

 #define CL_MEM_WRITE_ONLY                           (1 << 1)

 #define CL_MEM_READ_ONLY                            (1 << 2)

 // AllocRaw() registers the caller's host pointer with cudaHostRegister() instead of calling cudaMalloc().
 #define CL_MEM_USE_HOST_PTR                         (1 << 3)

 // AllocRaw() copies "size" bytes from the caller's host pointer into the new allocation.
 // NOTE(review): bit 4 is skipped, presumably to stay aligned with OpenCL's CL_MEM_ALLOC_HOST_PTR - confirm.
 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)

 // clang-format on


 // Type-erased, reference-counted handle to a buffer; the shared_ptr deleter releases the memory.
 typedef std::shared_ptr<void> BufferPtr;


 class QEngineCUDA;

 typedef std::shared_ptr<QEngineCUDA> QEngineCUDAPtr;


 /**
  * One entry of a QEngineCUDA wait queue (see QEngineCUDA::AddQueueItem()).
  *
  * An item is built in one of three shapes, selected by constructor:
  *  - an API call (api_call, work sizes, buffers), with both "isSet*" flags false;
  *  - a "set doNorm" request (isSetDoNorm == true, doNorm carries the value);
  *  - a "set runningNorm" request (isSetRunningNorm == true, runningNorm carries the value).
  *
  * How the queue consumer interprets these fields is not visible in this header.
  */
 struct QueueItem {
     // Fields default to the "empty" item state; each constructor overrides only what it needs.
     OCLAPI api_call = OCLAPI();
     size_t workItemCount = 0U;
     size_t localGroupSize = 0U;
     size_t deallocSize = 0U;
     std::vector<BufferPtr> buffers;
     size_t localBuffSize = 0U;
     bool isSetDoNorm = false;
     bool isSetRunningNorm = false;
     bool doNorm = false;
     real1 runningNorm{ ONE_R1 };

     /**
      * Default item: flagged as a "set runningNorm" request carrying ONE_R1.
      * NOTE(review): isSetRunningNorm is true here, unlike the API-call constructor - confirm intended.
      */
     QueueItem()
         : isSetRunningNorm(true)
     {
     }

     /**
      * API-call item.
      *
      * @param ac  API call identifier
      * @param wic work item count
      * @param lgs local group size
      * @param ds  value stored in deallocSize
      * @param b   buffers passed to the call (copied into the item)
      * @param lbs local buffer size
      */
     QueueItem(OCLAPI ac, size_t wic, size_t lgs, size_t ds, std::vector<BufferPtr> b, size_t lbs)
         : api_call(ac)
         , workItemCount(wic)
         , localGroupSize(lgs)
         , deallocSize(ds)
         , buffers(b)
         , localBuffSize(lbs)
     {
     }

     /** "Set doNorm" item carrying the requested flag value. */
     QueueItem(bool doNrm)
         : isSetDoNorm(true)
         , doNorm(doNrm)
     {
     }

     /** "Set runningNorm" item carrying the requested value (converted from real1_f to real1). */
     QueueItem(real1_f runningNrm)
         : isSetRunningNorm(true)
         , runningNorm(runningNrm)
     {
     }
 };


 class PoolItem {

 public:

     BufferPtr cmplxBuffer;

     BufferPtr realBuffer;

     BufferPtr ulongBuffer;


     std::shared_ptr<real1> probArray;

     std::shared_ptr<real1> angleArray;


     PoolItem()

         : probArray(NULL)

         , angleArray(NULL)

     {

         cmplxBuffer = MakeBuffer(sizeof(complex) * CMPLX_NORM_LEN);

         realBuffer = MakeBuffer(sizeof(real1) * REAL_ARG_LEN);

         ulongBuffer = MakeBuffer(sizeof(bitCapIntOcl) * BCI_ARG_LEN);

     }


     ~PoolItem() {}


 protected:

     BufferPtr MakeBuffer(size_t size)

     {

         cudaError_t error;


         BufferPtr toRet = std::shared_ptr<void>(AllocRaw(size, &error), [](void* c) { cudaFree(c); });


         if (error != cudaSuccess) {

             throw std::runtime_error("CUDA error code on buffer allocation attempt: " + std::to_string(error));

         }


         return toRet;

     }


     void* AllocRaw(size_t size, cudaError_t* errorPtr)

     {

         void* toRet;

         *errorPtr = cudaMalloc(&toRet, size);


         return toRet;

     }

 };


 typedef std::shared_ptr<PoolItem> PoolItemPtr;


 class QEngineCUDA : public QEngine {

 protected:

     bool didInit;

     bool usingHostRam;

     bool unlockHostMem;

     size_t nrmGroupCount;

     size_t nrmGroupSize;

     size_t totalOclAllocSize;

     int64_t deviceID;

     cl_map_flags lockSyncFlags;

     complex permutationAmp;

     std::shared_ptr<complex> stateVec;

     std::mutex queue_mutex;

     // stateBuffer is allocated as a shared_ptr, because it's the only buffer that will be acted on outside of

     // QEngineCUDA itself, specifically by QEngineCUDAMulti.

     BufferPtr stateBuffer;

     BufferPtr nrmBuffer;

     DeviceContextPtr device_context;

     std::list<QueueItem> wait_queue_items;

     std::vector<PoolItemPtr> poolItems;

     std::unique_ptr<real1[], void (*)(real1*)> nrmArray;


     // For std::function, cudaError_t use might discard int qualifiers.

     void tryCuda(std::string message, std::function<cudaError_t()> oclCall)

     {

         if (oclCall() == cudaSuccess) {

             // Success

             return;

         }


         // Soft finish (just for this QEngineCUDA)

         clFinish();


         if (oclCall() == cudaSuccess) {

             // Success after clearing QEngineCUDA queue

             return;

         }


         // Hard finish (for the unique OpenCL device)

         clFinish(true);


         cudaError_t error = oclCall();

         if (error == cudaSuccess) {

             // Success after clearing all queues for the OpenCL device

             return;

         }


         wait_queue_items.clear();


         // We're fatally blocked. Throw to exit.

         throw std::runtime_error(message + ", error code: " + std::to_string(error));

     }


 public:

     static const bitCapIntOcl OclMemDenom = 3U;


     QEngineCUDA(bitLenInt qBitCount, bitCapInt initState, qrack_rand_gen_ptr rgp = nullptr,

         complex phaseFac = CMPLX_DEFAULT_ARG, bool doNorm = false, bool randomGlobalPhase = true,

         bool useHostMem = false, int64_t devID = -1, bool useHardwareRNG = true, bool ignored = false,

         real1_f norm_thresh = REAL1_EPSILON, std::vector<int64_t> ignored2 = {}, bitLenInt ignored4 = 0U,

         real1_f ignored3 = FP_NORM_EPSILON_F);


     ~QEngineCUDA()

     {

         // Make sure we track device allocation.

         FreeAll();

     }


     virtual bool isOpenCL() { return true; }


     bool IsZeroAmplitude() { return !stateBuffer; }

     /**
      * Returns ZERO_R1_F when there is no state buffer; otherwise defers to
      * QInterface::FirstNonzeroPhase().
      */
     real1_f FirstNonzeroPhase()
     {
         return stateBuffer ? QInterface::FirstNonzeroPhase() : ZERO_R1_F;
     }


     void SwitchHostPtr(bool useHostMem)

     {

         if (useHostMem == usingHostRam) {

             return;

         }


         std::shared_ptr<complex> copyVec = AllocStateVec(maxQPowerOcl, true);

         GetQuantumState(copyVec.get());


         if (useHostMem) {

             stateVec = copyVec;

             stateBuffer = MakeStateVecBuffer(stateVec);

         } else {

             stateVec = NULL;

             stateBuffer = MakeStateVecBuffer(stateVec);

             clFinish();

             tryCuda("Failed to write buffer", [&] {

                 return cudaMemcpy(

                     stateBuffer.get(), (void*)(copyVec.get()), sizeof(complex) * maxQPowerOcl, cudaMemcpyHostToDevice);

             });

             copyVec.reset();

         }


         usingHostRam = useHostMem;

     }


     void FreeAll();

     void ZeroAmplitudes();

     void CopyStateVec(QEnginePtr src);


     void GetAmplitudePage(complex* pagePtr, bitCapIntOcl offset, bitCapIntOcl length);

     void SetAmplitudePage(const complex* pagePtr, bitCapIntOcl offset, bitCapIntOcl length);

     void SetAmplitudePage(

         QEnginePtr pageEnginePtr, bitCapIntOcl srcOffset, bitCapIntOcl dstOffset, bitCapIntOcl length);

     void ShuffleBuffers(QEnginePtr engine);

     QEnginePtr CloneEmpty();


     void QueueSetDoNormalize(bool doNorm) { AddQueueItem(QueueItem(doNorm)); }

     void QueueSetRunningNorm(real1_f runningNrm) { AddQueueItem(QueueItem(runningNrm)); }

     /**
      * Appends an item to the wait queue under queue_mutex, then calls DispatchQueue().
      *
      * @param item queue entry to append (copied)
      */
     void AddQueueItem(const QueueItem& item)
     {
         {
             // Hold the lock only while the list is modified; it is released before DispatchQueue() runs.
             std::lock_guard<std::mutex> guard(queue_mutex);
             wait_queue_items.push_back(item);
         }

         DispatchQueue();
     }

     /**
      * Queues an API call for this engine.
      *
      * @param api_call       API call identifier
      * @param workItemCount  work item count
      * @param localGroupSize local group size
      * @param args           buffers passed to the call
      * @param localBuffSize  requested local memory size; must not exceed device_context->GetLocalSize()
      * @param deallocSize    value stored in the queue item's deallocSize
      * @throws bad_alloc if localBuffSize exceeds the device's local memory size
      */
     void QueueCall(OCLAPI api_call, size_t workItemCount, size_t localGroupSize, std::vector<BufferPtr> args,
         size_t localBuffSize = 0U, size_t deallocSize = 0U)
     {
         const bool fitsLocalMemory = !(localBuffSize > device_context->GetLocalSize());
         if (!fitsLocalMemory) {
             throw bad_alloc("Local memory limits exceeded in QEngineCUDA::QueueCall()");
         }

         // Wait for the parameter stream before the call is queued.
         // NOTE(review): the cudaStreamSynchronize() status is not checked - confirm this is intentional.
         cudaStreamSynchronize(device_context->params_queue);

         const QueueItem pendingCall(api_call, workItemCount, localGroupSize, deallocSize, args, localBuffSize);
         AddQueueItem(pendingCall);
     }


     bitCapIntOcl GetMaxSize() { return device_context->GetMaxAlloc() / sizeof(complex); };


     void SetPermutation(bitCapInt perm, complex phaseFac = CMPLX_DEFAULT_ARG);


     using QEngine::UniformlyControlledSingleBit;

     void UniformlyControlledSingleBit(const std::vector<bitLenInt>& controls, bitLenInt qubitIndex,

         complex const* mtrxs, const std::vector<bitCapInt>& mtrxSkipPowers, bitCapInt mtrxSkipValueMask);

     void UniformParityRZ(bitCapInt mask, real1_f angle);

     void CUniformParityRZ(const std::vector<bitLenInt>& controls, bitCapInt mask, real1_f angle);


     using QEngine::X;

     void X(bitLenInt target);

     using QEngine::Z;

     void Z(bitLenInt target);

     using QEngine::Invert;

     void Invert(complex topRight, complex bottomLeft, bitLenInt qubitIndex);

     using QEngine::Phase;

     void Phase(complex topLeft, complex bottomRight, bitLenInt qubitIndex);


     void XMask(bitCapInt mask);

     void PhaseParity(real1_f radians, bitCapInt mask);


     using QEngine::Compose;

     bitLenInt Compose(QEngineCUDAPtr toCopy);

     bitLenInt Compose(QInterfacePtr toCopy) { return Compose(std::dynamic_pointer_cast<QEngineCUDA>(toCopy)); }

     bitLenInt Compose(QEngineCUDAPtr toCopy, bitLenInt start);

     bitLenInt Compose(QInterfacePtr toCopy, bitLenInt start)

     {

         return Compose(std::dynamic_pointer_cast<QEngineCUDA>(toCopy), start);

     }

     using QEngine::Decompose;

     void Decompose(bitLenInt start, QInterfacePtr dest);

     void Dispose(bitLenInt start, bitLenInt length);

     void Dispose(bitLenInt start, bitLenInt length, bitCapInt disposedPerm);

     using QEngine::Allocate;

     bitLenInt Allocate(bitLenInt start, bitLenInt length);


     void ROL(bitLenInt shift, bitLenInt start, bitLenInt length);


 #if ENABLE_ALU

     void INC(bitCapInt toAdd, bitLenInt start, bitLenInt length);

     void CINC(bitCapInt toAdd, bitLenInt inOutStart, bitLenInt length, const std::vector<bitLenInt>& controls);

     void INCS(bitCapInt toAdd, bitLenInt start, bitLenInt length, bitLenInt carryIndex);

 #if ENABLE_BCD

     void INCBCD(bitCapInt toAdd, bitLenInt start, bitLenInt length);

 #endif

     void MUL(bitCapInt toMul, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length);

     void DIV(bitCapInt toDiv, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length);

     void MULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length);

     void IMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length);

     void POWModNOut(bitCapInt base, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length);

     void CMUL(bitCapInt toMul, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length,

         const std::vector<bitLenInt>& controls);

     void CDIV(bitCapInt toDiv, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length,

         const std::vector<bitLenInt>& controls);

     void CMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length,

         const std::vector<bitLenInt>& controls);

     void CIMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length,

         const std::vector<bitLenInt>& controls);

     void CPOWModNOut(bitCapInt base, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length,

         const std::vector<bitLenInt>& controls);

     void FullAdd(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut);

     void IFullAdd(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut);


     bitCapInt IndexedLDA(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength,

         const unsigned char* values, bool resetValue = true);

     bitCapInt IndexedADC(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength,

         bitLenInt carryIndex, const unsigned char* values);

     bitCapInt IndexedSBC(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength,

         bitLenInt carryIndex, const unsigned char* values);

     void Hash(bitLenInt start, bitLenInt length, const unsigned char* values);


     void CPhaseFlipIfLess(bitCapInt greaterPerm, bitLenInt start, bitLenInt length, bitLenInt flagIndex);

     void PhaseFlipIfLess(bitCapInt greaterPerm, bitLenInt start, bitLenInt length);

 #endif


     real1_f Prob(bitLenInt qubit);

     real1_f CtrlOrAntiProb(bool controlState, bitLenInt control, bitLenInt target);

     real1_f ProbReg(bitLenInt start, bitLenInt length, bitCapInt permutation);

     void ProbRegAll(bitLenInt start, bitLenInt length, real1* probsArray);

     real1_f ProbMask(bitCapInt mask, bitCapInt permutation);

     void ProbMaskAll(bitCapInt mask, real1* probsArray);

     real1_f ProbParity(bitCapInt mask);

     bool ForceMParity(bitCapInt mask, bool result, bool doForce = true);

     real1_f ExpectationBitsAll(const std::vector<bitLenInt>& bits, bitCapInt offset = 0);


     void SetDevice(int64_t dID);

     int64_t GetDevice() { return deviceID; }


     void SetQuantumState(complex const* inputState);

     void GetQuantumState(complex* outputState);

     void GetProbs(real1* outputProbs);

     bitCapInt MAll();

     complex GetAmplitude(bitCapInt perm);

     void SetAmplitude(bitCapInt perm, complex amp);


     real1_f SumSqrDiff(QInterfacePtr toCompare)

     {

         return SumSqrDiff(std::dynamic_pointer_cast<QEngineCUDA>(toCompare));

     }

     real1_f SumSqrDiff(QEngineCUDAPtr toCompare);


     void NormalizeState(

         real1_f nrm = REAL1_DEFAULT_ARG, real1_f norm_thresh = REAL1_DEFAULT_ARG, real1_f phaseArg = ZERO_R1_F);

     ;

     void UpdateRunningNorm(real1_f norm_thresh = REAL1_DEFAULT_ARG);

     void Finish() { clFinish(); };

     bool isFinished() { return !wait_queue_items.size(); };


     QInterfacePtr Clone();


     void PopQueue();

     void DispatchQueue();


 protected:

     /**
      * Records "size" bytes against both the per-device total kept by CUDAEngine and this engine's
      * own totalOclAllocSize.
      *
      * If a device context exists and the new per-device total exceeds its global allocation limit,
      * the per-device total is rolled back and nothing is added to totalOclAllocSize.
      *
      * @param size number of bytes being allocated
      * @throws bad_alloc if the device's global allocation limit would be exceeded
      */
     void AddAlloc(size_t size)
     {
         const size_t deviceTotal = CUDAEngine::Instance().AddToActiveAllocSize(deviceID, size);
         const bool overLimit = device_context && (deviceTotal > device_context->GetGlobalAllocLimit());

         if (!overLimit) {
             totalOclAllocSize += size;
             return;
         }

         // Undo the tentative reservation before reporting the failure.
         CUDAEngine::Instance().SubtractFromActiveAllocSize(deviceID, size);
         throw bad_alloc("VRAM limits exceeded in QEngineCUDA::AddAlloc()");
     }

     // Counterpart of AddAlloc(): removes "size" bytes from both the per-device total kept by
     // CUDAEngine and this engine's totalOclAllocSize.
     // NOTE(review): there is no underflow check; this presumably relies on callers passing only
     // sizes previously added with AddAlloc() - confirm.
     void SubtractAlloc(size_t size)

     {

         CUDAEngine::Instance().SubtractFromActiveAllocSize(deviceID, size);

         totalOclAllocSize -= size;

     }


     /**
      * Allocates (or host-registers) a buffer and returns an owning handle to it.
      *
      * Up to three attempts are made: a plain attempt, one after clFinish(), and one after
      * clFinish(true).
      *
      * The pointer is wrapped in a BufferPtr only after an attempt has succeeded, so the deleter never
      * runs on memory that was not actually allocated or registered. The deleter deliberately does not
      * capture "this": a BufferPtr may be shared outside this engine and could outlive it.
      *
      * @param flags    CL_MEM_* flags; CL_MEM_USE_HOST_PTR selects host registration instead of cudaMalloc()
      * @param size     buffer size in bytes
      * @param host_ptr host memory to register or copy from, depending on flags
      * @return owning handle; the deleter calls cudaHostUnregister() for host-registered buffers and
      *         cudaFree() otherwise
      * @throws std::runtime_error if all three attempts fail
      */
     BufferPtr MakeBuffer(cl_mem_flags flags, size_t size, void* host_ptr = NULL)
     {
         cudaError_t error = cudaSuccess;
         void* raw = AllocRaw(flags, host_ptr, size, &error);

         if (error != cudaSuccess) {
             // Soft finish (just for this QEngineCUDA), then retry.
             clFinish();
             raw = AllocRaw(flags, host_ptr, size, &error);
         }

         if (error != cudaSuccess) {
             // Hard finish (for the whole device), then make the final attempt.
             clFinish(true);
             raw = AllocRaw(flags, host_ptr, size, &error);
         }

         if (error != cudaSuccess) {
             throw std::runtime_error("CUDA error code on buffer allocation attempt: " + std::to_string(error));
         }

         // Same release rule as FreeRaw(), captured by value so the deleter is self-contained.
         const bool isHostRegistered = (flags & CL_MEM_USE_HOST_PTR) != 0;

         return BufferPtr(raw, [isHostRegistered](void* c) {
             if (!c) {
                 return;
             }
             if (isHostRegistered) {
                 cudaHostUnregister(c);
             } else {
                 cudaFree(c);
             }
         });
     }


     /**
      * Performs one raw allocation attempt.
      *
      * With CL_MEM_USE_HOST_PTR, "host_ptr" is registered via cudaHostRegister() and returned as the
      * buffer; otherwise "size" bytes are obtained from cudaMalloc(). With CL_MEM_COPY_HOST_PTR, "size"
      * bytes are then copied from "host_ptr" into the buffer.
      *
      * @param flags    CL_MEM_* flags
      * @param host_ptr host memory to register and/or copy from
      * @param size     size in bytes
      * @param errorPtr receives the status of the first failing CUDA call, or cudaSuccess
      * @return the buffer pointer on success; NULL on any failure (nothing is left allocated or registered)
      */
     void* AllocRaw(cl_mem_flags flags, void* host_ptr, size_t size, cudaError_t* errorPtr)
     {
         const bool useHostPtr = (flags & CL_MEM_USE_HOST_PTR) != 0;

         void* toRet = useHostPtr ? host_ptr : NULL;
         *errorPtr =
             useHostPtr ? cudaHostRegister(host_ptr, size, cudaHostRegisterDefault) : cudaMalloc(&toRet, size);

         if (*errorPtr != cudaSuccess) {
             // Never hand back a pointer the caller might later try to free or unregister.
             return NULL;
         }

         if (flags & CL_MEM_COPY_HOST_PTR) {
             *errorPtr = cudaMemcpy(toRet, host_ptr, size, cudaMemcpyHostToDevice);

             if (*errorPtr != cudaSuccess) {
                 // The initial contents could not be written: release the buffer and report the copy
                 // error instead of returning an uninitialized allocation as a success.
                 FreeRaw(flags, toRet);
                 return NULL;
             }
         }

         return toRet;
     }


     /**
      * Releases a pointer obtained from AllocRaw() with the same flags.
      *
      * @param flags CL_MEM_* flags used at allocation time
      * @param c     pointer to release: unregistered with cudaHostUnregister() when
      *              CL_MEM_USE_HOST_PTR is set, freed with cudaFree() otherwise
      */
     void FreeRaw(cl_mem_flags flags, void* c)
     {
         if (!(flags & CL_MEM_USE_HOST_PTR)) {
             // Device allocation made by cudaMalloc().
             cudaFree(c);
             return;
         }

         // Caller-owned host memory: only undo the registration.
         cudaHostUnregister(c);
     }


     real1_f GetExpectation(bitLenInt valueStart, bitLenInt valueLength);


     std::shared_ptr<complex> AllocStateVec(bitCapInt elemCount, bool doForceAlloc = false);

     void FreeStateVec() { stateVec = NULL; }

     void ResetStateBuffer(BufferPtr nStateBuffer);

     BufferPtr MakeStateVecBuffer(std::shared_ptr<complex> nStateVec);

     void ReinitBuffer();


     void Compose(OCLAPI apiCall, const bitCapIntOcl* bciArgs, QEngineCUDAPtr toCopy);


     void InitOCL(int64_t devID);

     PoolItemPtr GetFreePoolItem();


     real1_f ParSum(real1* toSum, bitCapIntOcl maxI);


     void LockSync(cl_map_flags flags = (CL_MAP_READ | CL_MAP_WRITE));

     void UnlockSync();


     void clFinish(bool doHard = false);


     void clDump();


     /**
      * Clamps a requested work item count.
      *
      * @param maxI upper bound (per the original comment, guaranteed to be a power of two)
      * @param wic  requested work item count
      * @return maxI if wic exceeds it; otherwise pow2(log2(wic)), i.e. wic clamped to a power of two
      */
     size_t FixWorkItemCount(size_t maxI, size_t wic)
     {
         return (wic > maxI) ? maxI : (size_t)pow2(log2(wic));
     }


     /**
      * Adjusts a requested group size against a work item count.
      *
      * @param wic work item count
      * @param gs  requested group size
      * @return wic if gs exceeds it; otherwise gs minus the remainder of wic / gs (which is gs itself
      *         whenever gs divides wic evenly)
      *
      * NOTE(review): gs == 0 with wic == 0 reaches the modulo and divides by zero; presumably callers
      * always pass a nonzero group size - confirm.
      */
     size_t FixGroupSize(size_t wic, size_t gs)
     {
         return (gs > wic) ? wic : (gs - (wic % gs));
     }


     void DecomposeDispose(bitLenInt start, bitLenInt length, QEngineCUDAPtr dest);


     using QEngine::Apply2x2;

     void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const* mtrx, bitLenInt bitCount,

         const bitCapIntOcl* qPowersSorted, bool doCalcNorm, real1_f norm_thresh = REAL1_DEFAULT_ARG)

     {

         Apply2x2(offset1, offset2, mtrx, bitCount, qPowersSorted, doCalcNorm, SPECIAL_2X2::NONE, norm_thresh);

     }

     void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const* mtrx, bitLenInt bitCount,

         const bitCapIntOcl* qPowersSorted, bool doCalcNorm, SPECIAL_2X2 special,

         real1_f norm_thresh = REAL1_DEFAULT_ARG);


     void BitMask(bitCapIntOcl mask, OCLAPI api_call, real1_f phase = (real1_f)PI_R1);


     void ApplyM(bitCapInt mask, bool result, complex nrm);

     void ApplyM(bitCapInt mask, bitCapInt result, complex nrm);


     /* Utility functions used by the operations above. */

     void WaitCall(OCLAPI api_call, size_t workItemCount, size_t localGroupSize, std::vector<BufferPtr> args,

         size_t localBuffSize = 0U);

     EventVecPtr ResetWaitEvents(bool waitQueue = true);

     void ApplyMx(OCLAPI api_call, const bitCapIntOcl* bciArgs, complex nrm);

     real1_f Probx(OCLAPI api_call, const bitCapIntOcl* bciArgs);


     void ArithmeticCall(OCLAPI api_call, const bitCapIntOcl (&bciArgs)[BCI_ARG_LEN], const unsigned char* values = NULL,

         bitCapIntOcl valuesLength = 0U);

     void CArithmeticCall(OCLAPI api_call, const bitCapIntOcl (&bciArgs)[BCI_ARG_LEN], bitCapIntOcl* controlPowers,

         bitLenInt controlLen, const unsigned char* values = NULL, bitCapIntOcl valuesLength = 0U);

     void ROx(OCLAPI api_call, bitLenInt shift, bitLenInt start, bitLenInt length);


 #if ENABLE_ALU

     void INCDECC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);

     void INCDECSC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);

     void INCDECSC(

         bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex, bitLenInt carryIndex);

 #if ENABLE_BCD

     void INCDECBCDC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);

 #endif


     void INT(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length);

     void CINT(

         OCLAPI api_call, bitCapIntOcl toMod, bitLenInt start, bitLenInt length, const std::vector<bitLenInt>& controls);

     void INTC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);

     void INTS(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex);

     void INTSC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);

     void INTSC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex,

         bitLenInt carryIndex);

 #if ENABLE_BCD

     void INTBCD(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length);

     void INTBCDC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);

 #endif

     void xMULx(OCLAPI api_call, const bitCapIntOcl* bciArgs, BufferPtr controlBuffer);

     void MULx(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length);

     void MULModx(OCLAPI api_call, bitCapIntOcl toMod, bitCapIntOcl modN, bitLenInt inOutStart, bitLenInt carryStart,

         bitLenInt length);

     void CMULx(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length,

         const std::vector<bitLenInt>& controls);

     void CMULModx(OCLAPI api_call, bitCapIntOcl toMod, bitCapIntOcl modN, bitLenInt inOutStart, bitLenInt carryStart,

         bitLenInt length, const std::vector<bitLenInt>& controls);

     void FullAdx(

         bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut, OCLAPI api_call);

     void PhaseFlipX(OCLAPI api_call, const bitCapIntOcl* bciArgs);


     bitCapIntOcl OpIndexed(OCLAPI api_call, bitCapIntOcl carryIn, bitLenInt indexStart, bitLenInt indexLength,

         bitLenInt valueStart, bitLenInt valueLength, bitLenInt carryIndex, const unsigned char* values);

 #endif


     void ClearBuffer(BufferPtr buff, bitCapIntOcl offset, bitCapIntOcl size);

 };


 } // namespace Qrack

Qrack::PoolItem
Definition: qengine_cuda.hpp:119

Qrack::PoolItem::ulongBuffer
BufferPtr ulongBuffer
Definition: qengine_cuda.hpp:123

Qrack::PoolItem::~PoolItem
~PoolItem()
Definition: qengine_cuda.hpp:137

Qrack::PoolItem::cmplxBuffer
BufferPtr cmplxBuffer
Definition: qengine_cuda.hpp:121

Qrack::PoolItem::MakeBuffer
BufferPtr MakeBuffer(size_t size)
Definition: qengine_cuda.hpp:140

Qrack::PoolItem::realBuffer
BufferPtr realBuffer
Definition: qengine_cuda.hpp:122

Qrack::PoolItem::angleArray
std::shared_ptr< real1 > angleArray
Definition: qengine_cuda.hpp:126

Qrack::PoolItem::PoolItem
PoolItem()
Definition: qengine_cuda.hpp:128

Qrack::PoolItem::AllocRaw
void * AllocRaw(size_t size, cudaError_t *errorPtr)
Definition: qengine_cuda.hpp:153

Qrack::PoolItem::probArray
std::shared_ptr< real1 > probArray
Definition: qengine_cuda.hpp:125

Qrack::QEngineCUDA
CUDA-enhanced QEngineCPU implementation.
Definition: qengine_cuda.hpp:182

Qrack::QEngineCUDA::Prob
real1_f Prob(bitLenInt qubit)
Direct measure of bit probability to be in |1> state.

Qrack::QEngineCUDA::Apply2x2
virtual void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const *mtrx, bitLenInt bitCount, bitCapIntOcl const *qPowersSorted, bool doCalcNorm, real1_f norm_thresh=REAL1_DEFAULT_ARG)=0

Qrack::QEngineCUDA::SumSqrDiff
real1_f SumSqrDiff(QInterfacePtr toCompare)
Definition: qengine_cuda.hpp:437

Qrack::QEngineCUDA::MUL
void MUL(bitCapInt toMul, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length)
Multiply by integer.

Qrack::QEngineCUDA::Compose
void Compose(OCLAPI apiCall, const bitCapIntOcl *bciArgs, QEngineCUDAPtr toCopy)

Qrack::QEngineCUDA::UniformlyControlledSingleBit
void UniformlyControlledSingleBit(const std::vector< bitLenInt > &controls, bitLenInt qubitIndex, complex const *mtrxs, const std::vector< bitCapInt > &mtrxSkipPowers, bitCapInt mtrxSkipValueMask)

Qrack::QEngineCUDA::INCBCD
void INCBCD(bitCapInt toAdd, bitLenInt start, bitLenInt length)
Add classical BCD integer (without sign)

Qrack::QEngineCUDA::isOpenCL
virtual bool isOpenCL()
Returns "true" if current simulation is OpenCL-based.
Definition: qengine_cuda.hpp:270

Qrack::QEngineCUDA::QueueCall
void QueueCall(OCLAPI api_call, size_t workItemCount, size_t localGroupSize, std::vector< BufferPtr > args, size_t localBuffSize=0U, size_t deallocSize=0U)
Definition: qengine_cuda.hpp:331

Qrack::QEngineCUDA::Decompose
void Decompose(bitLenInt start, QInterfacePtr dest)
Minimally decompose a set of contiguous bits from the separably composed unit, into "destination".

Qrack::QEngineCUDA::Allocate
bitLenInt Allocate(bitLenInt start, bitLenInt length)
Allocate new "length" count of |0> state qubits at specified qubit index start position.

Qrack::QEngineCUDA::FirstNonzeroPhase
real1_f FirstNonzeroPhase()
Get phase of lowest permutation nonzero amplitude.
Definition: qengine_cuda.hpp:273

Qrack::QEngineCUDA::ApplyM
void ApplyM(bitCapInt mask, bool result, complex nrm)

Qrack::QEngineCUDA::didInit
bool didInit
Definition: qengine_cuda.hpp:184

Qrack::QEngineCUDA::ShuffleBuffers
void ShuffleBuffers(QEnginePtr engine)
Swap the high half of this engine with the low half of another.

Qrack::QEngineCUDA::Apply2x2
void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const *mtrx, bitLenInt bitCount, const bitCapIntOcl *qPowersSorted, bool doCalcNorm, real1_f norm_thresh=REAL1_DEFAULT_ARG)
Definition: qengine_cuda.hpp:602

Qrack::QEngineCUDA::Finish
void Finish()
If asynchronous work is still running, block until it finishes.
Definition: qengine_cuda.hpp:447

Qrack::QEngineCUDA::Phase
void Phase(complex topLeft, complex bottomRight, bitLenInt qubitIndex)
Apply a single bit transformation that only affects phase.

Qrack::QEngineCUDA::CUniformParityRZ
void CUniformParityRZ(const std::vector< bitLenInt > &controls, bitCapInt mask, real1_f angle)
If the controls are set and the target qubit set parity is odd, this applies a phase factor of $e^{i \cdot \mathrm{angle}}$.

Qrack::QEngineCUDA::ProbReg
real1_f ProbReg(bitLenInt start, bitLenInt length, bitCapInt permutation)
Direct measure of register permutation probability.

Qrack::QEngineCUDA::CPOWModNOut
void CPOWModNOut(bitCapInt base, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Controlled, raise a classical base to a quantum power, modulo N, (out of place)

Qrack::QEngineCUDA::INTBCD
void INTBCD(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length)

Qrack::QEngineCUDA::Compose
bitLenInt Compose(QEngineCUDAPtr toCopy)

Qrack::QEngineCUDA::CDIV
void CDIV(bitCapInt toDiv, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Controlled division by power of integer.

Qrack::QEngineCUDA::PhaseFlipX
void PhaseFlipX(OCLAPI api_call, const bitCapIntOcl *bciArgs)

Qrack::QEngineCUDA::PhaseFlipIfLess
void PhaseFlipIfLess(bitCapInt greaterPerm, bitLenInt start, bitLenInt length)
This is an expedient for an adaptive Grover's search for a function's global minimum.

Qrack::QEngineCUDA::SubtractAlloc
void SubtractAlloc(size_t size)
Definition: qengine_cuda.hpp:465

Qrack::QEngineCUDA::FullAdx
void FullAdx(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut, OCLAPI api_call)

Qrack::QEngineCUDA::stateBuffer
BufferPtr stateBuffer
Definition: qengine_cuda.hpp:197

Qrack::QEngineCUDA::INCDECSC
void INCDECSC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex, bitLenInt carryIndex)
Common driver method behind INCSC and DECSC (with overflow flag)

Qrack::QEngineCUDA::tryCuda
void tryCuda(std::string message, std::function< cudaError_t()> oclCall)
Definition: qengine_cuda.hpp:205

Qrack::QEngineCUDA::Dispose
void Dispose(bitLenInt start, bitLenInt length)
Minimally decompose a set of contiguous bits from the separably composed unit, and discard the separa...

Qrack::QEngineCUDA::FullAdd
void FullAdd(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut)
Quantum analog of classical "Full Adder" gate.

Qrack::QEngineCUDA::SetPermutation
void SetPermutation(bitCapInt perm, complex phaseFac=CMPLX_DEFAULT_ARG)
Set to a specific permutation of all qubits.

Qrack::QEngineCUDA::stateVec
std::shared_ptr< complex > stateVec
Definition: qengine_cuda.hpp:193

Qrack::QEngineCUDA::IndexedSBC
bitCapInt IndexedSBC(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength, bitLenInt carryIndex, const unsigned char *values)
Subtract from an entangled 8 bit register state with a superposed index-offset-based read from classi...

Qrack::QEngineCUDA::QueueSetRunningNorm
void QueueSetRunningNorm(real1_f runningNrm)
Add an operation to the (OpenCL) queue, to set the value of runningNorm, which is the normalization c...
Definition: qengine_cuda.hpp:320

Qrack::QEngineCUDA::NormalizeState
void NormalizeState(real1_f nrm=REAL1_DEFAULT_ARG, real1_f norm_thresh=REAL1_DEFAULT_ARG, real1_f phaseArg=ZERO_R1_F)
Apply the normalization factor found by UpdateRunningNorm() or on the fly by a single bit gate.

Qrack::QEngineCUDA::ROL
void ROL(bitLenInt shift, bitLenInt start, bitLenInt length)
Circular shift left - shift bits left, and carry last bits.

Qrack::QEngineCUDA::nrmArray
std::unique_ptr< real1[], void(*)(real1 *)> nrmArray
Definition: qengine_cuda.hpp:202

Qrack::QEngineCUDA::OclMemDenom
static const bitCapIntOcl OclMemDenom
1 / OclMemDenom is the maximum fraction of total OCL device RAM that a single state vector should occ...
Definition: qengine_cuda.hpp:238

Qrack::QEngineCUDA::GetDevice
int64_t GetDevice()
Get GPU device ID.
Definition: qengine_cuda.hpp:428

Qrack::QEngineCUDA::UnlockSync
void UnlockSync()
Unlocks synchronization between the state vector buffer and general RAM, so the state vector can be o...

Qrack::QEngineCUDA::CPhaseFlipIfLess
void CPhaseFlipIfLess(bitCapInt greaterPerm, bitLenInt start, bitLenInt length, bitLenInt flagIndex)
The 6502 uses its carry flag also as a greater-than/less-than flag, for the CMP operation.

Qrack::QEngineCUDA::AddAlloc
void AddAlloc(size_t size)
Definition: qengine_cuda.hpp:456

Qrack::QEngineCUDA::ArithmeticCall
void ArithmeticCall(OCLAPI api_call, const bitCapIntOcl(&bciArgs)[BCI_ARG_LEN], const unsigned char *values=NULL, bitCapIntOcl valuesLength=0U)

Qrack::QEngineCUDA::ClearBuffer
void ClearBuffer(BufferPtr buff, bitCapIntOcl offset, bitCapIntOcl size)

Qrack::QEngineCUDA::INC
void INC(bitCapInt toAdd, bitLenInt start, bitLenInt length)
Add integer (without sign)

Qrack::QEngineCUDA::FixGroupSize
size_t FixGroupSize(size_t wic, size_t gs)
Definition: qengine_cuda.hpp:590

Qrack::QEngineCUDA::totalOclAllocSize
size_t totalOclAllocSize
Definition: qengine_cuda.hpp:189

Qrack::QEngineCUDA::X
void X(bitLenInt target)

Qrack::QEngineCUDA::PopQueue
void PopQueue()

Qrack::QEngineCUDA::GetProbs
void GetProbs(real1 *outputProbs)
Get the probabilities of all permutations, written to outputProbs.

Qrack::QEngineCUDA::CINC
void CINC(bitCapInt toAdd, bitLenInt inOutStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Add integer (without sign, with controls)

Qrack::QEngineCUDA::CloneEmpty
QEnginePtr CloneEmpty()
Clone this QEngine's settings, with a zeroed state vector.

Qrack::QEngineCUDA::GetQuantumState
void GetQuantumState(complex *outputState)
Get the pure quantum state representation.

Qrack::QEngineCUDA::SetAmplitudePage
void SetAmplitudePage(const complex *pagePtr, bitCapIntOcl offset, bitCapIntOcl length)
Copy a "page" of amplitudes from pagePtr into this QEngine's internal state.

Qrack::QEngineCUDA::Compose
bitLenInt Compose(QEngineCUDAPtr toCopy, bitLenInt start)

Qrack::QEngineCUDA::~QEngineCUDA
~QEngineCUDA()
Definition: qengine_cuda.hpp:264

Qrack::QEngineCUDA::Compose
bitLenInt Compose(QInterfacePtr toCopy, bitLenInt start)
Definition: qengine_cuda.hpp:367

Qrack::QEngineCUDA::ProbMask
real1_f ProbMask(bitCapInt mask, bitCapInt permutation)
Direct measure of masked permutation probability.

Qrack::QEngineCUDA::SetQuantumState
void SetQuantumState(complex const *inputState)
Set an arbitrary pure quantum state representation.

Qrack::QEngineCUDA::MULModNOut
void MULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length)
Multiplication modulo N by integer, (out of place)

Qrack::QEngineCUDA::IFullAdd
void IFullAdd(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut)
Inverse of FullAdd.

Qrack::QEngineCUDA::DispatchQueue
void DispatchQueue()

Qrack::QEngineCUDA::SetAmplitude
void SetAmplitude(bitCapInt perm, complex amp)
Sets the representational amplitude of a full permutation.

Qrack::QEngineCUDA::QEngineCUDA
QEngineCUDA(bitLenInt qBitCount, bitCapInt initState, qrack_rand_gen_ptr rgp=nullptr, complex phaseFac=CMPLX_DEFAULT_ARG, bool doNorm=false, bool randomGlobalPhase=true, bool useHostMem=false, int64_t devID=-1, bool useHardwareRNG=true, bool ignored=false, real1_f norm_thresh=REAL1_EPSILON, std::vector< int64_t > ignored2={}, bitLenInt ignored4=0U, real1_f ignored3=FP_NORM_EPSILON_F)
Initialize a Qrack::QEngineCUDA object.

Qrack::QEngineCUDA::deviceID
int64_t deviceID
Definition: qengine_cuda.hpp:190

Qrack::QEngineCUDA::INTBCDC
void INTBCDC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)

Qrack::QEngineCUDA::INCS
void INCS(bitCapInt toAdd, bitLenInt start, bitLenInt length, bitLenInt carryIndex)
Add a classical integer to the register, with sign and without carry.

Qrack::QEngineCUDA::INCDECSC
void INCDECSC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)
Common driver method behind INCSC and DECSC (without overflow flag)

Qrack::QEngineCUDA::ApplyM
void ApplyM(bitCapInt mask, bitCapInt result, complex nrm)

Qrack::QEngineCUDA::ROx
void ROx(OCLAPI api_call, bitLenInt shift, bitLenInt start, bitLenInt length)

Qrack::QEngineCUDA::queue_mutex
std::mutex queue_mutex
Definition: qengine_cuda.hpp:194

Qrack::QEngineCUDA::CMULx
void CMULx(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length, const std::vector< bitLenInt > &controls)

Qrack::QEngineCUDA::isFinished
bool isFinished()
Returns "false" if asynchronous work is still running, and "true" if all previously dispatched asynch...
Definition: qengine_cuda.hpp:448

Qrack::QEngineCUDA::FreeAll
void FreeAll()

Qrack::QEngineCUDA::ResetStateBuffer
void ResetStateBuffer(BufferPtr nStateBuffer)

Qrack::QEngineCUDA::IndexedLDA
bitCapInt IndexedLDA(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength, const unsigned char *values, bool resetValue=true)
Set 8 bit register bits by a superposed index-offset-based read from classical memory.

Qrack::QEngineCUDA::Z
void Z(bitLenInt target)
Z gate.

Qrack::QEngineCUDA::INT
void INT(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length)

Qrack::QEngineCUDA::FreeStateVec
void FreeStateVec()
Definition: qengine_cuda.hpp:531

Qrack::QEngineCUDA::permutationAmp
complex permutationAmp
Definition: qengine_cuda.hpp:192

Qrack::QEngineCUDA::WaitCall
void WaitCall(OCLAPI api_call, size_t workItemCount, size_t localGroupSize, std::vector< BufferPtr > args, size_t localBuffSize=0U)

Qrack::QEngineCUDA::ZeroAmplitudes
void ZeroAmplitudes()
Set all amplitudes to 0, and optionally temporarily deallocate state vector RAM.

Qrack::QEngineCUDA::OpIndexed
bitCapIntOcl OpIndexed(OCLAPI api_call, bitCapIntOcl carryIn, bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength, bitLenInt carryIndex, const unsigned char *values)

Qrack::QEngineCUDA::SetAmplitudePage
void SetAmplitudePage(QEnginePtr pageEnginePtr, bitCapIntOcl srcOffset, bitCapIntOcl dstOffset, bitCapIntOcl length)
Copy a "page" of amplitudes from another QEngine, pointed to by pageEnginePtr, into this QEngine's in...

Qrack::QEngineCUDA::nrmBuffer
BufferPtr nrmBuffer
Definition: qengine_cuda.hpp:198

Qrack::QEngineCUDA::CMUL
void CMUL(bitCapInt toMul, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Controlled multiplication by integer.

Qrack::QEngineCUDA::usingHostRam
bool usingHostRam
Definition: qengine_cuda.hpp:185

Qrack::QEngineCUDA::wait_queue_items
std::list< QueueItem > wait_queue_items
Definition: qengine_cuda.hpp:200

Qrack::QEngineCUDA::PhaseParity
void PhaseParity(real1_f radians, bitCapInt mask)
Parity phase gate.

Qrack::QEngineCUDA::Invert
void Invert(complex topRight, complex bottomLeft, bitLenInt qubitIndex)
Apply a single bit transformation that reverses bit probability and might affect phase.

Qrack::QEngineCUDA::device_context
DeviceContextPtr device_context
Definition: qengine_cuda.hpp:199

Qrack::QEngineCUDA::ForceMParity
bool ForceMParity(bitCapInt mask, bool result, bool doForce=true)
Act as if a measurement of parity of the masked set of qubits was applied, except force the (usual...

Qrack::QEngineCUDA::XMask
void XMask(bitCapInt mask)
Masked X gate.

Qrack::QEngineCUDA::CINT
void CINT(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt start, bitLenInt length, const std::vector< bitLenInt > &controls)

Qrack::QEngineCUDA::ProbRegAll
void ProbRegAll(bitLenInt start, bitLenInt length, real1 *probsArray)

Qrack::QEngineCUDA::MakeStateVecBuffer
BufferPtr MakeStateVecBuffer(std::shared_ptr< complex > nStateVec)

Qrack::QEngineCUDA::ExpectationBitsAll
real1_f ExpectationBitsAll(const std::vector< bitLenInt > &bits, bitCapInt offset=0)
Get permutation expectation value of bits.

Qrack::QEngineCUDA::INCDECBCDC
void INCDECBCDC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)
Common driver method behind INCBCDC and DECBCDC (binary-coded decimal, with carry)

Qrack::QEngineCUDA::IndexedADC
bitCapInt IndexedADC(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength, bitLenInt carryIndex, const unsigned char *values)
Add to entangled 8 bit register state with a superposed index-offset-based read from classical memory...

Qrack::QEngineCUDA::INTS
void INTS(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex)

Qrack::QEngineCUDA::CtrlOrAntiProb
real1_f CtrlOrAntiProb(bool controlState, bitLenInt control, bitLenInt target)

Qrack::QEngineCUDA::ProbParity
real1_f ProbParity(bitCapInt mask)
Overall probability of any odd permutation of the masked set of bits.

Qrack::QEngineCUDA::INCDECC
void INCDECC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)
Common driver method behind INCC and DECC (without sign, with carry)

Qrack::QEngineCUDA::POWModNOut
void POWModNOut(bitCapInt base, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length)
Raise a classical base to a quantum power, modulo N, (out of place)

Qrack::QEngineCUDA::QueueSetDoNormalize
void QueueSetDoNormalize(bool doNorm)
Add an operation to the (OpenCL) queue, to set the value of doNormalize, which controls whether to au...
Definition: qengine_cuda.hpp:319

Qrack::QEngineCUDA::clDump
void clDump()
Dumps the remaining asynchronous wait event list or queue of OpenCL events, for the current queue.

Qrack::QEngineCUDA::MULModx
void MULModx(OCLAPI api_call, bitCapIntOcl toMod, bitCapIntOcl modN, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length)

Qrack::QEngineCUDA::Compose
bitLenInt Compose(QInterfacePtr toCopy)
Combine another QInterface with this one, after the last bit index of this one.
Definition: qengine_cuda.hpp:365

Qrack::QEngineCUDA::ParSum
real1_f ParSum(real1 *toSum, bitCapIntOcl maxI)

Qrack::QEngineCUDA::GetAmplitudePage
void GetAmplitudePage(complex *pagePtr, bitCapIntOcl offset, bitCapIntOcl length)
Copy a "page" of amplitudes from this QEngine's internal state, into pagePtr.

Qrack::QEngineCUDA::MAll
bitCapInt MAll()
Measure permutation state of all coherent bits.

Qrack::QEngineCUDA::LockSync
void LockSync(cl_map_flags flags=(CL_MAP_READ|CL_MAP_WRITE))
Locks synchronization between the state vector buffer and general RAM, so the state vector can be dir...

Qrack::QEngineCUDA::INTSC
void INTSC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex, bitLenInt carryIndex)

Qrack::QEngineCUDA::GetAmplitude
complex GetAmplitude(bitCapInt perm)
Get the representational amplitude of a full permutation.

Qrack::QEngineCUDA::FixWorkItemCount
size_t FixWorkItemCount(size_t maxI, size_t wic)
Definition: qengine_cuda.hpp:579

Qrack::QEngineCUDA::ReinitBuffer
void ReinitBuffer()

Qrack::QEngineCUDA::xMULx
void xMULx(OCLAPI api_call, const bitCapIntOcl *bciArgs, BufferPtr controlBuffer)

Qrack::QEngineCUDA::INTC
void INTC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)

Qrack::QEngineCUDA::SumSqrDiff
real1_f SumSqrDiff(QEngineCUDAPtr toCompare)

Qrack::QEngineCUDA::DIV
void DIV(bitCapInt toDiv, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length)
Divide by integer.

Qrack::QEngineCUDA::AllocStateVec
std::shared_ptr< complex > AllocStateVec(bitCapInt elemCount, bool doForceAlloc=false)

Qrack::QEngineCUDA::MakeBuffer
BufferPtr MakeBuffer(cl_mem_flags flags, size_t size, void *host_ptr=NULL)
Definition: qengine_cuda.hpp:471

Qrack::QEngineCUDA::InitOCL
void InitOCL(int64_t devID)

Qrack::QEngineCUDA::IsZeroAmplitude
bool IsZeroAmplitude()
Returns "true" only if amplitudes are all totally 0.
Definition: qengine_cuda.hpp:272

Qrack::QEngineCUDA::INTSC
void INTSC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)

Qrack::QEngineCUDA::MULx
void MULx(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length)

Qrack::QEngineCUDA::FreeRaw
void FreeRaw(cl_mem_flags flags, void *c)
Definition: qengine_cuda.hpp:519

Qrack::QEngineCUDA::nrmGroupSize
size_t nrmGroupSize
Definition: qengine_cuda.hpp:188

Qrack::QEngineCUDA::UniformParityRZ
void UniformParityRZ(bitCapInt mask, real1_f angle)
If the target qubit set parity is odd, this applies a phase factor of $e^{i \cdot angle}$.

Qrack::QEngineCUDA::UpdateRunningNorm
void UpdateRunningNorm(real1_f norm_thresh=REAL1_DEFAULT_ARG)
Force a calculation of the norm of the state vector, in order to make it unit length before the next ...

Qrack::QEngineCUDA::ResetWaitEvents
EventVecPtr ResetWaitEvents(bool waitQueue=true)

Qrack::QEngineCUDA::AllocRaw
void * AllocRaw(cl_mem_flags flags, void *host_ptr, size_t size, cudaError_t *errorPtr)
Definition: qengine_cuda.hpp:507

Qrack::QEngineCUDA::nrmGroupCount
size_t nrmGroupCount
Definition: qengine_cuda.hpp:187

Qrack::QEngineCUDA::ApplyMx
void ApplyMx(OCLAPI api_call, const bitCapIntOcl *bciArgs, complex nrm)

Qrack::QEngineCUDA::unlockHostMem
bool unlockHostMem
Definition: qengine_cuda.hpp:186

Qrack::QEngineCUDA::CMULModx
void CMULModx(OCLAPI api_call, bitCapIntOcl toMod, bitCapIntOcl modN, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length, const std::vector< bitLenInt > &controls)

Qrack::QEngineCUDA::lockSyncFlags
cl_map_flags lockSyncFlags
Definition: qengine_cuda.hpp:191

Qrack::QEngineCUDA::SwitchHostPtr
void SwitchHostPtr(bool useHostMem)
Switch to/from host/device state vector buffer.
Definition: qengine_cuda.hpp:282

Qrack::QEngineCUDA::CIMULModNOut
void CIMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Inverse of controlled multiplication modulo N by integer, (out of place)

Qrack::QEngineCUDA::CopyStateVec
void CopyStateVec(QEnginePtr src)
Exactly copy the state vector of a different QEngine instance.

Qrack::QEngineCUDA::ProbMaskAll
void ProbMaskAll(bitCapInt mask, real1 *probsArray)
Direct measure of masked permutation probability.

Qrack::QEngineCUDA::BitMask
void BitMask(bitCapIntOcl mask, OCLAPI api_call, real1_f phase=(real1_f) PI_R1)

Qrack::QEngineCUDA::DecomposeDispose
void DecomposeDispose(bitLenInt start, bitLenInt length, QEngineCUDAPtr dest)

Qrack::QEngineCUDA::IMULModNOut
void IMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length)
Inverse of multiplication modulo N by integer, (out of place)

Qrack::QEngineCUDA::Probx
real1_f Probx(OCLAPI api_call, const bitCapIntOcl *bciArgs)

Qrack::QEngineCUDA::GetFreePoolItem
PoolItemPtr GetFreePoolItem()

Qrack::QEngineCUDA::Hash
void Hash(bitLenInt start, bitLenInt length, const unsigned char *values)
Transform a length of qubit register via lookup through a hash table.

Qrack::QEngineCUDA::GetMaxSize
bitCapIntOcl GetMaxSize()
Definition: qengine_cuda.hpp:341

Qrack::QEngineCUDA::GetExpectation
real1_f GetExpectation(bitLenInt valueStart, bitLenInt valueLength)

Qrack::QEngineCUDA::clFinish
void clFinish(bool doHard=false)
Finishes the asynchronous wait event list or queue of OpenCL events.

Qrack::QEngineCUDA::Dispose
void Dispose(bitLenInt start, bitLenInt length, bitCapInt disposedPerm)
Dispose a contiguous set of qubits that are already in a permutation eigenstate.

Qrack::QEngineCUDA::poolItems
std::vector< PoolItemPtr > poolItems
Definition: qengine_cuda.hpp:201

Qrack::QEngineCUDA::AddQueueItem
void AddQueueItem(const QueueItem &item)
Definition: qengine_cuda.hpp:321

Qrack::QEngineCUDA::CArithmeticCall
void CArithmeticCall(OCLAPI api_call, const bitCapIntOcl(&bciArgs)[BCI_ARG_LEN], bitCapIntOcl *controlPowers, bitLenInt controlLen, const unsigned char *values=NULL, bitCapIntOcl valuesLength=0U)

Qrack::QEngineCUDA::Apply2x2
void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const *mtrx, bitLenInt bitCount, const bitCapIntOcl *qPowersSorted, bool doCalcNorm, SPECIAL_2X2 special, real1_f norm_thresh=REAL1_DEFAULT_ARG)

Qrack::QEngineCUDA::CMULModNOut
void CMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Controlled multiplication modulo N by integer, (out of place)

Qrack::QEngineCUDA::Clone
QInterfacePtr Clone()
Clone this QInterface.

Qrack::QEngineCUDA::SetDevice
void SetDevice(int64_t dID)
Set GPU device ID.

Qrack::QEngine
Abstract QEngine implementation, for all "Schroedinger method" engines.
Definition: qengine.hpp:31

Qrack::QEngine::Apply2x2
virtual void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const *mtrx, bitLenInt bitCount, bitCapIntOcl const *qPowersSorted, bool doCalcNorm, real1_f norm_thresh=REAL1_DEFAULT_ARG)=0

Qrack::QEngine::maxQPowerOcl
bitCapIntOcl maxQPowerOcl
Definition: qengine.hpp:40

Qrack::QEngine::Decompose
virtual void Decompose(bitLenInt start, QInterfacePtr dest)=0
Minimally decompose a set of contiguous bits from the separably composed unit, into "destination".

Qrack::QEngine::X
virtual void X(bitLenInt qubit)
X gate.
Definition: qinterface.hpp:1054

Qrack::QInterface::Allocate
virtual bitLenInt Allocate(bitLenInt length)
Allocate new "length" count of |0> state qubits at end of qubit index position.
Definition: qinterface.hpp:434

Qrack::QInterface::Compose
virtual bitLenInt Compose(QInterfacePtr toCopy)
Combine another QInterface with this one, after the last bit index of this one.
Definition: qinterface.hpp:338

Qrack::bad_alloc
Definition: qengine_gpu_util.hpp:21

half_float::half
Half-precision floating-point type.
Definition: half.hpp:2222

Qrack::QInterface::Invert
virtual void Invert(const complex topRight, const complex bottomLeft, bitLenInt qubitIndex)
Apply a single bit transformation that reverses bit probability and might affect phase.
Definition: qinterface.hpp:493

Qrack::QInterface::UniformlyControlledSingleBit
virtual void UniformlyControlledSingleBit(const std::vector< bitLenInt > &controls, bitLenInt qubitIndex, const complex *mtrxs)
Apply a "uniformly controlled" arbitrary single bit unitary transformation.
Definition: qinterface.hpp:590

Qrack::QInterface::Z
virtual void Z(bitLenInt qubit)
Z gate.
Definition: qinterface.hpp:1087

Qrack::QInterface::U
virtual void U(bitLenInt target, real1_f theta, real1_f phi, real1_f lambda)
General unitary gate.
Definition: rotational.cpp:18

Qrack::QInterface::Phase
virtual void Phase(const complex topLeft, const complex bottomRight, bitLenInt qubitIndex)
Apply a single bit transformation that only affects phase.
Definition: qinterface.hpp:480

Qrack::QInterface::FirstNonzeroPhase
virtual real1_f FirstNonzeroPhase()
Get phase of lowest permutation nonzero amplitude.
Definition: qinterface.hpp:2709

Qrack
Definition: complex16x2simd.hpp:25

Qrack::complex
std::complex< half_float::half > complex
Definition: qrack_types.hpp:62

Qrack::QEnginePtr
std::shared_ptr< QEngine > QEnginePtr
Definition: qrack_types.hpp:141

Qrack::DeviceContextPtr
std::shared_ptr< OCLDeviceContext > DeviceContextPtr
Definition: oclengine.hpp:47

Qrack::QInterfacePtr
std::shared_ptr< QInterface > QInterfacePtr
Definition: qinterface.hpp:28

Qrack::ZERO_R1_F
constexpr real1_f ZERO_R1_F
Definition: qrack_types.hpp:152

Qrack::EventVecPtr
std::shared_ptr< EventVec > EventVecPtr
Definition: oclengine.hpp:51

Qrack::FP_NORM_EPSILON_F
constexpr real1_f FP_NORM_EPSILON_F
Definition: qrack_types.hpp:245

Qrack::ONE_R1
const real1 ONE_R1
Definition: qrack_types.hpp:153

Qrack::cl_map_flags
unsigned long cl_map_flags
Definition: qengine_cuda.hpp:31

Qrack::pow2
bitCapInt pow2(const bitLenInt &p)
Definition: qrack_functions.hpp:22

Qrack::REAL1_DEFAULT_ARG
const real1 REAL1_DEFAULT_ARG
Definition: qrack_types.hpp:155

Qrack::PI_R1
const real1 PI_R1
Definition: qrack_types.hpp:158

Qrack::real1_f
float real1_f
Definition: qrack_types.hpp:64

Qrack::CMPLX_DEFAULT_ARG
QRACK_CONST complex CMPLX_DEFAULT_ARG
Definition: qrack_types.hpp:242

Qrack::QEngineCUDAPtr
std::shared_ptr< QEngineCUDA > QEngineCUDAPtr
Definition: qengine_cuda.hpp:47

Qrack::PoolItemPtr
std::shared_ptr< PoolItem > PoolItemPtr
Definition: qengine_cuda.hpp:162

Qrack::SPECIAL_2X2
SPECIAL_2X2
Definition: qengine_gpu_util.hpp:19

Qrack::NONE
@ NONE
Definition: qengine_gpu_util.hpp:19

Qrack::OCLAPI
OCLAPI
Definition: oclapi.hpp:19

Qrack::REAL1_EPSILON
const real1 REAL1_EPSILON
Definition: qrack_types.hpp:157

Qrack::BufferPtr
std::shared_ptr< void > BufferPtr
Definition: qengine_cuda.hpp:45

Qrack::cl_mem_flags
unsigned long cl_mem_flags
Definition: qengine_cuda.hpp:32

Qrack::log2
bitLenInt log2(bitCapInt n)
Definition: qrack_functions.hpp:26

U
MICROSOFT_QUANTUM_DECL void U(_In_ uintq sid, _In_ uintq q, _In_ double theta, _In_ double phi, _In_ double lambda)
(External API) 3-parameter unitary gate
Definition: pinvoke_api.cpp:1362

qengine.hpp

CL_MAP_WRITE
#define CL_MAP_WRITE
Definition: qengine_cuda.hpp:36

BCI_ARG_LEN
#define BCI_ARG_LEN
Definition: qengine_cuda.hpp:25

CL_MEM_USE_HOST_PTR
#define CL_MEM_USE_HOST_PTR
Definition: qengine_cuda.hpp:41

CL_MEM_COPY_HOST_PTR
#define CL_MEM_COPY_HOST_PTR
Definition: qengine_cuda.hpp:42

CMPLX_NORM_LEN
#define CMPLX_NORM_LEN
Definition: qengine_cuda.hpp:26

CL_MAP_READ
#define CL_MAP_READ
Definition: qengine_cuda.hpp:35

REAL_ARG_LEN
#define REAL_ARG_LEN
Definition: qengine_cuda.hpp:27

qengine_gpu_util.hpp

bitLenInt
#define bitLenInt
Definition: qrack_types.hpp:44

qrack_rand_gen_ptr
#define qrack_rand_gen_ptr
Definition: qrack_types.hpp:146

bitCapInt
#define bitCapInt
Definition: qrack_types.hpp:105

bitCapIntOcl
#define bitCapIntOcl
Definition: qrack_types.hpp:91

Qrack::QueueItem
Definition: qengine_cuda.hpp:50

Qrack::QueueItem::QueueItem
QueueItem(OCLAPI ac, size_t wic, size_t lgs, size_t ds, std::vector< BufferPtr > b, size_t lbs)
Definition: qengine_cuda.hpp:76

Qrack::QueueItem::QueueItem
QueueItem(real1_f runningNrm)
Definition: qengine_cuda.hpp:104

Qrack::QueueItem::doNorm
bool doNorm
Definition: qengine_cuda.hpp:59

Qrack::QueueItem::workItemCount
size_t workItemCount
Definition: qengine_cuda.hpp:52

Qrack::QueueItem::buffers
std::vector< BufferPtr > buffers
Definition: qengine_cuda.hpp:55

Qrack::QueueItem::deallocSize
size_t deallocSize
Definition: qengine_cuda.hpp:54

Qrack::QueueItem::QueueItem
QueueItem()
Definition: qengine_cuda.hpp:62

Qrack::QueueItem::isSetRunningNorm
bool isSetRunningNorm
Definition: qengine_cuda.hpp:58

Qrack::QueueItem::QueueItem
QueueItem(bool doNrm)
Definition: qengine_cuda.hpp:90

Qrack::QueueItem::localBuffSize
size_t localBuffSize
Definition: qengine_cuda.hpp:56

Qrack::QueueItem::api_call
OCLAPI api_call
Definition: qengine_cuda.hpp:51

Qrack::QueueItem::isSetDoNorm
bool isSetDoNorm
Definition: qengine_cuda.hpp:57

Qrack::QueueItem::localGroupSize
size_t localGroupSize
Definition: qengine_cuda.hpp:53

Qrack::QueueItem::runningNorm
real1 runningNorm
Definition: qengine_cuda.hpp:60