Qrack  9.0
General classical-emulating-quantum development framework
qengine_cuda.hpp
Go to the documentation of this file.
1 //
3 // (C) Daniel Strano and the Qrack contributors 2017-2023. All rights reserved.
4 //
5 // This is a multithreaded, universal quantum register simulation, allowing
6 // (nonphysical) register cloning and direct measurement of probability and
7 // phase, to leverage what advantages classical emulation of qubits can have.
8 //
9 // Licensed under the GNU Lesser General Public License V3.
10 // See LICENSE.md in the project root or https://www.gnu.org/licenses/lgpl-3.0.en.html
11 // for details.
12 
13 #pragma once
14 
15 #include "common/cudaengine.cuh"
16 #include "qengine.hpp"
17 #include "qengine_gpu_util.hpp"
18 
19 #if !ENABLE_CUDA
20 #error CUDA has not been enabled
21 #endif
22 
23 #include <list>
24 
25 #define BCI_ARG_LEN 10
26 #define CMPLX_NORM_LEN 6
27 #define REAL_ARG_LEN 2
28 
29 namespace Qrack {
30 
31 typedef unsigned long cl_map_flags;
32 typedef unsigned long cl_mem_flags;
33 
34 // clang-format off
35 #define CL_MAP_READ (1 << 0)
36 #define CL_MAP_WRITE (1 << 1)
37 
38 #define CL_MEM_READ_WRITE (1 << 0)
39 #define CL_MEM_WRITE_ONLY (1 << 1)
40 #define CL_MEM_READ_ONLY (1 << 2)
41 #define CL_MEM_USE_HOST_PTR (1 << 3)
42 #define CL_MEM_COPY_HOST_PTR (1 << 5)
43 // clang-format on
44 
45 typedef std::shared_ptr<void> BufferPtr;
46 
47 class QEngineCUDA;
48 typedef std::shared_ptr<QEngineCUDA> QEngineCUDAPtr;
49 
50 struct QueueItem {
52  size_t workItemCount;
54  size_t deallocSize;
55  std::vector<BufferPtr> buffers;
56  size_t localBuffSize;
59  bool doNorm;
61 
63  : api_call()
64  , workItemCount(0U)
65  , localGroupSize(0U)
66  , deallocSize(0U)
67  , buffers()
68  , localBuffSize(0U)
69  , isSetDoNorm(false)
70  , isSetRunningNorm(true)
71  , doNorm(false)
73  {
74  }
75 
76  QueueItem(OCLAPI ac, size_t wic, size_t lgs, size_t ds, std::vector<BufferPtr> b, size_t lbs)
77  : api_call(ac)
78  , workItemCount(wic)
79  , localGroupSize(lgs)
80  , deallocSize(ds)
81  , buffers(b)
82  , localBuffSize(lbs)
83  , isSetDoNorm(false)
84  , isSetRunningNorm(false)
85  , doNorm(false)
87  {
88  }
89 
90  QueueItem(bool doNrm)
91  : api_call()
92  , workItemCount(0U)
93  , localGroupSize(0U)
94  , deallocSize(0U)
95  , buffers()
96  , localBuffSize(0U)
97  , isSetDoNorm(true)
98  , isSetRunningNorm(false)
99  , doNorm(doNrm)
101  {
102  }
103 
104  QueueItem(real1_f runningNrm)
105  : api_call()
106  , workItemCount(0U)
107  , localGroupSize(0U)
108  , deallocSize(0U)
109  , buffers()
110  , localBuffSize(0U)
111  , isSetDoNorm(false)
112  , isSetRunningNorm(true)
113  , doNorm(false)
114  , runningNorm(runningNrm)
115  {
116  }
117 };
118 
119 class PoolItem {
120 public:
124 
125  std::shared_ptr<real1> probArray;
126  std::shared_ptr<real1> angleArray;
127 
129  : probArray(NULL)
130  , angleArray(NULL)
131  {
135  }
136 
138 
139 protected:
140  BufferPtr MakeBuffer(size_t size)
141  {
142  cudaError_t error;
143 
144  BufferPtr toRet = std::shared_ptr<void>(AllocRaw(size, &error), [](void* c) { cudaFree(c); });
145 
146  if (error != cudaSuccess) {
147  throw std::runtime_error("CUDA error code on buffer allocation attempt: " + std::to_string(error));
148  }
149 
150  return toRet;
151  }
152 
153  void* AllocRaw(size_t size, cudaError_t* errorPtr)
154  {
155  void* toRet;
156  *errorPtr = cudaMalloc(&toRet, size);
157 
158  return toRet;
159  }
160 };
161 
162 typedef std::shared_ptr<PoolItem> PoolItemPtr;
163 
182 class QEngineCUDA : public QEngine {
183 protected:
184  bool didInit;
188  size_t nrmGroupSize;
190  int64_t deviceID;
193  std::shared_ptr<complex> stateVec;
194  std::mutex queue_mutex;
195  // stateBuffer is allocated as a shared_ptr, because it's the only buffer that will be acted on outside of
196  // QEngineCUDA itself, specifically by QEngineCUDAMulti.
200  std::list<QueueItem> wait_queue_items;
201  std::vector<PoolItemPtr> poolItems;
202  std::unique_ptr<real1[], void (*)(real1*)> nrmArray;
203 
204  // For std::function, cudaError_t use might discard int qualifiers.
205  void tryCuda(std::string message, std::function<cudaError_t()> oclCall)
206  {
207  if (oclCall() == cudaSuccess) {
208  // Success
209  return;
210  }
211 
212  // Soft finish (just for this QEngineCUDA)
213  clFinish();
214 
215  if (oclCall() == cudaSuccess) {
216  // Success after clearing QEngineCUDA queue
217  return;
218  }
219 
220  // Hard finish (for the unique OpenCL device)
221  clFinish(true);
222 
223  cudaError_t error = oclCall();
224  if (error == cudaSuccess) {
225  // Success after clearing all queues for the OpenCL device
226  return;
227  }
228 
229  wait_queue_items.clear();
230 
231  // We're fatally blocked. Throw to exit.
232  throw std::runtime_error(message + ", error code: " + std::to_string(error));
233  }
234 
235 public:
238  static const bitCapIntOcl OclMemDenom = 3U;
239 
258  QEngineCUDA(bitLenInt qBitCount, bitCapInt initState, qrack_rand_gen_ptr rgp = nullptr,
259  complex phaseFac = CMPLX_DEFAULT_ARG, bool doNorm = false, bool randomGlobalPhase = true,
260  bool useHostMem = false, int64_t devID = -1, bool useHardwareRNG = true, bool ignored = false,
261  real1_f norm_thresh = REAL1_EPSILON, std::vector<int64_t> ignored2 = {}, bitLenInt ignored4 = 0U,
262  real1_f ignored3 = FP_NORM_EPSILON_F);
263 
265  {
266  // Make sure we track device allocation.
267  FreeAll();
268  }
269 
270  virtual bool isOpenCL() { return true; }
271 
272  bool IsZeroAmplitude() { return !stateBuffer; }
274  {
275  if (!stateBuffer) {
276  return ZERO_R1_F;
277  }
278 
280  }
281 
282  void SwitchHostPtr(bool useHostMem)
283  {
284  if (useHostMem == usingHostRam) {
285  return;
286  }
287 
288  std::shared_ptr<complex> copyVec = AllocStateVec(maxQPowerOcl, true);
289  GetQuantumState(copyVec.get());
290 
291  if (useHostMem) {
292  stateVec = copyVec;
294  } else {
295  stateVec = NULL;
297  clFinish();
298  tryCuda("Failed to write buffer", [&] {
299  return cudaMemcpy(
300  stateBuffer.get(), (void*)(copyVec.get()), sizeof(complex) * maxQPowerOcl, cudaMemcpyHostToDevice);
301  });
302  copyVec.reset();
303  }
304 
305  usingHostRam = useHostMem;
306  }
307 
308  void FreeAll();
311 
312  void GetAmplitudePage(complex* pagePtr, bitCapIntOcl offset, bitCapIntOcl length);
313  void SetAmplitudePage(const complex* pagePtr, bitCapIntOcl offset, bitCapIntOcl length);
315  QEnginePtr pageEnginePtr, bitCapIntOcl srcOffset, bitCapIntOcl dstOffset, bitCapIntOcl length);
318 
319  void QueueSetDoNormalize(bool doNorm) { AddQueueItem(QueueItem(doNorm)); }
320  void QueueSetRunningNorm(real1_f runningNrm) { AddQueueItem(QueueItem(runningNrm)); }
321  void AddQueueItem(const QueueItem& item)
322  {
323  // For lock_guard:
324  if (true) {
325  std::lock_guard<std::mutex> lock(queue_mutex);
326  wait_queue_items.push_back(item);
327  }
328 
329  DispatchQueue();
330  }
331  void QueueCall(OCLAPI api_call, size_t workItemCount, size_t localGroupSize, std::vector<BufferPtr> args,
332  size_t localBuffSize = 0U, size_t deallocSize = 0U)
333  {
334  if (localBuffSize > device_context->GetLocalSize()) {
335  throw bad_alloc("Local memory limits exceeded in QEngineCUDA::QueueCall()");
336  }
337  cudaStreamSynchronize(device_context->params_queue);
338  AddQueueItem(QueueItem(api_call, workItemCount, localGroupSize, deallocSize, args, localBuffSize));
339  }
340 
341  bitCapIntOcl GetMaxSize() { return device_context->GetMaxAlloc() / sizeof(complex); };
342 
344 
346  void UniformlyControlledSingleBit(const std::vector<bitLenInt>& controls, bitLenInt qubitIndex,
347  complex const* mtrxs, const std::vector<bitCapInt>& mtrxSkipPowers, bitCapInt mtrxSkipValueMask);
348  void UniformParityRZ(bitCapInt mask, real1_f angle);
349  void CUniformParityRZ(const std::vector<bitLenInt>& controls, bitCapInt mask, real1_f angle);
350 
351  using QEngine::X;
352  void X(bitLenInt target);
353  using QEngine::Z;
354  void Z(bitLenInt target);
355  using QEngine::Invert;
356  void Invert(complex topRight, complex bottomLeft, bitLenInt qubitIndex);
357  using QEngine::Phase;
358  void Phase(complex topLeft, complex bottomRight, bitLenInt qubitIndex);
359 
360  void XMask(bitCapInt mask);
361  void PhaseParity(real1_f radians, bitCapInt mask);
362 
363  using QEngine::Compose;
365  bitLenInt Compose(QInterfacePtr toCopy) { return Compose(std::dynamic_pointer_cast<QEngineCUDA>(toCopy)); }
368  {
369  return Compose(std::dynamic_pointer_cast<QEngineCUDA>(toCopy), start);
370  }
371  using QEngine::Decompose;
372  void Decompose(bitLenInt start, QInterfacePtr dest);
373  void Dispose(bitLenInt start, bitLenInt length);
374  void Dispose(bitLenInt start, bitLenInt length, bitCapInt disposedPerm);
375  using QEngine::Allocate;
377 
378  void ROL(bitLenInt shift, bitLenInt start, bitLenInt length);
379 
380 #if ENABLE_ALU
381  void INC(bitCapInt toAdd, bitLenInt start, bitLenInt length);
382  void CINC(bitCapInt toAdd, bitLenInt inOutStart, bitLenInt length, const std::vector<bitLenInt>& controls);
383  void INCS(bitCapInt toAdd, bitLenInt start, bitLenInt length, bitLenInt carryIndex);
384 #if ENABLE_BCD
385  void INCBCD(bitCapInt toAdd, bitLenInt start, bitLenInt length);
386 #endif
387  void MUL(bitCapInt toMul, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length);
388  void DIV(bitCapInt toDiv, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length);
389  void MULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length);
390  void IMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length);
391  void POWModNOut(bitCapInt base, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length);
392  void CMUL(bitCapInt toMul, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length,
393  const std::vector<bitLenInt>& controls);
394  void CDIV(bitCapInt toDiv, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length,
395  const std::vector<bitLenInt>& controls);
396  void CMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length,
397  const std::vector<bitLenInt>& controls);
398  void CIMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length,
399  const std::vector<bitLenInt>& controls);
400  void CPOWModNOut(bitCapInt base, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length,
401  const std::vector<bitLenInt>& controls);
402  void FullAdd(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut);
403  void IFullAdd(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut);
404 
405  bitCapInt IndexedLDA(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength,
406  const unsigned char* values, bool resetValue = true);
407  bitCapInt IndexedADC(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength,
408  bitLenInt carryIndex, const unsigned char* values);
409  bitCapInt IndexedSBC(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength,
410  bitLenInt carryIndex, const unsigned char* values);
411  void Hash(bitLenInt start, bitLenInt length, const unsigned char* values);
412 
413  void CPhaseFlipIfLess(bitCapInt greaterPerm, bitLenInt start, bitLenInt length, bitLenInt flagIndex);
414  void PhaseFlipIfLess(bitCapInt greaterPerm, bitLenInt start, bitLenInt length);
415 #endif
416 
418  real1_f CtrlOrAntiProb(bool controlState, bitLenInt control, bitLenInt target);
419  real1_f ProbReg(bitLenInt start, bitLenInt length, bitCapInt permutation);
420  void ProbRegAll(bitLenInt start, bitLenInt length, real1* probsArray);
421  real1_f ProbMask(bitCapInt mask, bitCapInt permutation);
422  void ProbMaskAll(bitCapInt mask, real1* probsArray);
424  bool ForceMParity(bitCapInt mask, bool result, bool doForce = true);
425  real1_f ExpectationBitsAll(const std::vector<bitLenInt>& bits, bitCapInt offset = 0);
426 
427  void SetDevice(int64_t dID);
428  int64_t GetDevice() { return deviceID; }
429 
430  void SetQuantumState(complex const* inputState);
431  void GetQuantumState(complex* outputState);
432  void GetProbs(real1* outputProbs);
435  void SetAmplitude(bitCapInt perm, complex amp);
436 
438  {
439  return SumSqrDiff(std::dynamic_pointer_cast<QEngineCUDA>(toCompare));
440  }
442 
444  real1_f nrm = REAL1_DEFAULT_ARG, real1_f norm_thresh = REAL1_DEFAULT_ARG, real1_f phaseArg = ZERO_R1_F);
445  ;
447  void Finish() { clFinish(); };
448  bool isFinished() { return !wait_queue_items.size(); };
449 
451 
452  void PopQueue();
454 
455 protected:
456  void AddAlloc(size_t size)
457  {
458  size_t currentAlloc = CUDAEngine::Instance().AddToActiveAllocSize(deviceID, size);
459  if (device_context && (currentAlloc > device_context->GetGlobalAllocLimit())) {
460  CUDAEngine::Instance().SubtractFromActiveAllocSize(deviceID, size);
461  throw bad_alloc("VRAM limits exceeded in QEngineCUDA::AddAlloc()");
462  }
463  totalOclAllocSize += size;
464  }
465  void SubtractAlloc(size_t size)
466  {
467  CUDAEngine::Instance().SubtractFromActiveAllocSize(deviceID, size);
468  totalOclAllocSize -= size;
469  }
470 
471  BufferPtr MakeBuffer(cl_mem_flags flags, size_t size, void* host_ptr = NULL)
472  {
473  cudaError_t error;
474 
475  BufferPtr toRet = std::shared_ptr<void>(
476  AllocRaw(flags, host_ptr, size, &error), [this, flags](void* c) { FreeRaw(flags, c); });
477 
478  if (error == cudaSuccess) {
479  // Success
480  return toRet;
481  }
482 
483  // Soft finish (just for this QEngineCUDA)
484  clFinish();
485 
486  toRet = std::shared_ptr<void>(
487  AllocRaw(flags, host_ptr, size, &error), [this, flags](void* c) { FreeRaw(flags, c); });
488 
489  if (error == cudaSuccess) {
490  // Success after clearing QEngineCUDA queue
491  return toRet;
492  }
493 
494  // Hard finish (for the unique OpenCL device)
495  clFinish(true);
496 
497  toRet = std::shared_ptr<void>(
498  AllocRaw(flags, host_ptr, size, &error), [this, flags](void* c) { FreeRaw(flags, c); });
499 
500  if (error != cudaSuccess) {
501  throw std::runtime_error("CUDA error code on buffer allocation attempt: " + std::to_string(error));
502  }
503 
504  return toRet;
505  }
506 
507  void* AllocRaw(cl_mem_flags flags, void* host_ptr, size_t size, cudaError_t* errorPtr)
508  {
509  void* toRet = host_ptr;
510  *errorPtr = (flags & CL_MEM_USE_HOST_PTR) ? cudaHostRegister(host_ptr, size, cudaHostRegisterDefault)
511  : cudaMalloc(&toRet, size);
512  if ((*errorPtr == cudaSuccess) && (flags & CL_MEM_COPY_HOST_PTR)) {
513  cudaMemcpy(toRet, host_ptr, size, cudaMemcpyHostToDevice);
514  }
515 
516  return toRet;
517  }
518 
519  void FreeRaw(cl_mem_flags flags, void* c)
520  {
521  if (flags & CL_MEM_USE_HOST_PTR) {
522  cudaHostUnregister(c);
523  } else {
524  cudaFree(c);
525  }
526  }
527 
528  real1_f GetExpectation(bitLenInt valueStart, bitLenInt valueLength);
529 
530  std::shared_ptr<complex> AllocStateVec(bitCapInt elemCount, bool doForceAlloc = false);
531  void FreeStateVec() { stateVec = NULL; }
532  void ResetStateBuffer(BufferPtr nStateBuffer);
533  BufferPtr MakeStateVecBuffer(std::shared_ptr<complex> nStateVec);
534  void ReinitBuffer();
535 
536  void Compose(OCLAPI apiCall, const bitCapIntOcl* bciArgs, QEngineCUDAPtr toCopy);
537 
538  void InitOCL(int64_t devID);
540 
542 
564  void UnlockSync();
565 
572  void clFinish(bool doHard = false);
573 
577  void clDump();
578 
579  size_t FixWorkItemCount(size_t maxI, size_t wic)
580  {
581  if (wic > maxI) {
582  // Guaranteed to be a power of two
583  return maxI;
584  }
585 
586  // Otherwise, clamp to a power of two
587  return (size_t)pow2(log2(wic));
588  }
589 
/**
 * Adjust a group size against a work item count.
 *
 * @param wic Work item count.
 * @param gs  Requested group size.
 * @return `wic` when `gs > wic`; otherwise `gs - (wic % gs)`, which equals `gs` whenever `gs` divides `wic`.
 */
size_t FixGroupSize(size_t wic, size_t gs)
{
    // The modulo is only evaluated on the branch where gs <= wic.
    return (gs > wic) ? wic : (gs - (wic % gs));
}
598 
600 
601  using QEngine::Apply2x2;
602  void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const* mtrx, bitLenInt bitCount,
603  const bitCapIntOcl* qPowersSorted, bool doCalcNorm, real1_f norm_thresh = REAL1_DEFAULT_ARG)
604  {
605  Apply2x2(offset1, offset2, mtrx, bitCount, qPowersSorted, doCalcNorm, SPECIAL_2X2::NONE, norm_thresh);
606  }
607  void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const* mtrx, bitLenInt bitCount,
608  const bitCapIntOcl* qPowersSorted, bool doCalcNorm, SPECIAL_2X2 special,
609  real1_f norm_thresh = REAL1_DEFAULT_ARG);
610 
611  void BitMask(bitCapIntOcl mask, OCLAPI api_call, real1_f phase = (real1_f)PI_R1);
612 
613  void ApplyM(bitCapInt mask, bool result, complex nrm);
614  void ApplyM(bitCapInt mask, bitCapInt result, complex nrm);
615 
616  /* Utility functions used by the operations above. */
617  void WaitCall(OCLAPI api_call, size_t workItemCount, size_t localGroupSize, std::vector<BufferPtr> args,
618  size_t localBuffSize = 0U);
619  EventVecPtr ResetWaitEvents(bool waitQueue = true);
620  void ApplyMx(OCLAPI api_call, const bitCapIntOcl* bciArgs, complex nrm);
621  real1_f Probx(OCLAPI api_call, const bitCapIntOcl* bciArgs);
622 
623  void ArithmeticCall(OCLAPI api_call, const bitCapIntOcl (&bciArgs)[BCI_ARG_LEN], const unsigned char* values = NULL,
624  bitCapIntOcl valuesLength = 0U);
625  void CArithmeticCall(OCLAPI api_call, const bitCapIntOcl (&bciArgs)[BCI_ARG_LEN], bitCapIntOcl* controlPowers,
626  bitLenInt controlLen, const unsigned char* values = NULL, bitCapIntOcl valuesLength = 0U);
627  void ROx(OCLAPI api_call, bitLenInt shift, bitLenInt start, bitLenInt length);
628 
629 #if ENABLE_ALU
630  void INCDECC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);
631  void INCDECSC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);
632  void INCDECSC(
633  bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex, bitLenInt carryIndex);
634 #if ENABLE_BCD
635  void INCDECBCDC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);
636 #endif
637 
638  void INT(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length);
639  void CINT(
640  OCLAPI api_call, bitCapIntOcl toMod, bitLenInt start, bitLenInt length, const std::vector<bitLenInt>& controls);
641  void INTC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);
642  void INTS(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex);
643  void INTSC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);
644  void INTSC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex,
645  bitLenInt carryIndex);
646 #if ENABLE_BCD
647  void INTBCD(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length);
648  void INTBCDC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex);
649 #endif
650  void xMULx(OCLAPI api_call, const bitCapIntOcl* bciArgs, BufferPtr controlBuffer);
651  void MULx(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length);
652  void MULModx(OCLAPI api_call, bitCapIntOcl toMod, bitCapIntOcl modN, bitLenInt inOutStart, bitLenInt carryStart,
653  bitLenInt length);
654  void CMULx(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length,
655  const std::vector<bitLenInt>& controls);
656  void CMULModx(OCLAPI api_call, bitCapIntOcl toMod, bitCapIntOcl modN, bitLenInt inOutStart, bitLenInt carryStart,
657  bitLenInt length, const std::vector<bitLenInt>& controls);
658  void FullAdx(
659  bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut, OCLAPI api_call);
660  void PhaseFlipX(OCLAPI api_call, const bitCapIntOcl* bciArgs);
661 
662  bitCapIntOcl OpIndexed(OCLAPI api_call, bitCapIntOcl carryIn, bitLenInt indexStart, bitLenInt indexLength,
663  bitLenInt valueStart, bitLenInt valueLength, bitLenInt carryIndex, const unsigned char* values);
664 #endif
665 
666  void ClearBuffer(BufferPtr buff, bitCapIntOcl offset, bitCapIntOcl size);
667 };
668 
669 } // namespace Qrack
Definition: qengine_cuda.hpp:119
BufferPtr ulongBuffer
Definition: qengine_cuda.hpp:123
~PoolItem()
Definition: qengine_cuda.hpp:137
BufferPtr cmplxBuffer
Definition: qengine_cuda.hpp:121
BufferPtr MakeBuffer(size_t size)
Definition: qengine_cuda.hpp:140
BufferPtr realBuffer
Definition: qengine_cuda.hpp:122
std::shared_ptr< real1 > angleArray
Definition: qengine_cuda.hpp:126
PoolItem()
Definition: qengine_cuda.hpp:128
void * AllocRaw(size_t size, cudaError_t *errorPtr)
Definition: qengine_cuda.hpp:153
std::shared_ptr< real1 > probArray
Definition: qengine_cuda.hpp:125
CUDA-enhanced QEngineCPU implementation.
Definition: qengine_cuda.hpp:182
real1_f Prob(bitLenInt qubit)
Direct measure of bit probability to be in |1> state.
virtual void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const *mtrx, bitLenInt bitCount, bitCapIntOcl const *qPowersSorted, bool doCalcNorm, real1_f norm_thresh=REAL1_DEFAULT_ARG)=0
real1_f SumSqrDiff(QInterfacePtr toCompare)
Definition: qengine_cuda.hpp:437
void MUL(bitCapInt toMul, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length)
Multiply by integer.
void Compose(OCLAPI apiCall, const bitCapIntOcl *bciArgs, QEngineCUDAPtr toCopy)
void UniformlyControlledSingleBit(const std::vector< bitLenInt > &controls, bitLenInt qubitIndex, complex const *mtrxs, const std::vector< bitCapInt > &mtrxSkipPowers, bitCapInt mtrxSkipValueMask)
void INCBCD(bitCapInt toAdd, bitLenInt start, bitLenInt length)
Add classical BCD integer (without sign)
virtual bool isOpenCL()
Returns "true" if current simulation is OpenCL-based.
Definition: qengine_cuda.hpp:270
void QueueCall(OCLAPI api_call, size_t workItemCount, size_t localGroupSize, std::vector< BufferPtr > args, size_t localBuffSize=0U, size_t deallocSize=0U)
Definition: qengine_cuda.hpp:331
void Decompose(bitLenInt start, QInterfacePtr dest)
Minimally decompose a set of contiguous bits from the separably composed unit, into "destination".
bitLenInt Allocate(bitLenInt start, bitLenInt length)
Allocate new "length" count of |0> state qubits at specified qubit index start position.
real1_f FirstNonzeroPhase()
Get phase of lowest permutation nonzero amplitude.
Definition: qengine_cuda.hpp:273
void ApplyM(bitCapInt mask, bool result, complex nrm)
bool didInit
Definition: qengine_cuda.hpp:184
void ShuffleBuffers(QEnginePtr engine)
Swap the high half of this engine with the low half of another.
void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const *mtrx, bitLenInt bitCount, const bitCapIntOcl *qPowersSorted, bool doCalcNorm, real1_f norm_thresh=REAL1_DEFAULT_ARG)
Definition: qengine_cuda.hpp:602
void Finish()
If asynchronous work is still running, block until it finishes.
Definition: qengine_cuda.hpp:447
void Phase(complex topLeft, complex bottomRight, bitLenInt qubitIndex)
Apply a single bit transformation that only affects phase.
void CUniformParityRZ(const std::vector< bitLenInt > &controls, bitCapInt mask, real1_f angle)
If the controls are set and the target qubit set parity is odd, this applies a phase factor of $e^{i \cdot \mathrm{angle}}$.
real1_f ProbReg(bitLenInt start, bitLenInt length, bitCapInt permutation)
Direct measure of register permutation probability.
void CPOWModNOut(bitCapInt base, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Controlled, raise a classical base to a quantum power, modulo N, (out of place)
void INTBCD(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length)
bitLenInt Compose(QEngineCUDAPtr toCopy)
void CDIV(bitCapInt toDiv, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Controlled division by power of integer.
void PhaseFlipX(OCLAPI api_call, const bitCapIntOcl *bciArgs)
void PhaseFlipIfLess(bitCapInt greaterPerm, bitLenInt start, bitLenInt length)
This is an expedient for an adaptive Grover's search for a function's global minimum.
void SubtractAlloc(size_t size)
Definition: qengine_cuda.hpp:465
void FullAdx(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut, OCLAPI api_call)
BufferPtr stateBuffer
Definition: qengine_cuda.hpp:197
void INCDECSC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex, bitLenInt carryIndex)
Common driver method behind INCSC and DECSC (with overflow flag)
void tryCuda(std::string message, std::function< cudaError_t()> oclCall)
Definition: qengine_cuda.hpp:205
void Dispose(bitLenInt start, bitLenInt length)
Minimally decompose a set of contiguous bits from the separably composed unit, and discard the separa...
void FullAdd(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut)
Quantum analog of classical "Full Adder" gate.
void SetPermutation(bitCapInt perm, complex phaseFac=CMPLX_DEFAULT_ARG)
Set to a specific permutation of all qubits.
std::shared_ptr< complex > stateVec
Definition: qengine_cuda.hpp:193
bitCapInt IndexedSBC(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength, bitLenInt carryIndex, const unsigned char *values)
Subtract from an entangled 8 bit register state with a superposed index-offset-based read from classi...
void QueueSetRunningNorm(real1_f runningNrm)
Add an operation to the (OpenCL) queue, to set the value of runningNorm, which is the normalization c...
Definition: qengine_cuda.hpp:320
void NormalizeState(real1_f nrm=REAL1_DEFAULT_ARG, real1_f norm_thresh=REAL1_DEFAULT_ARG, real1_f phaseArg=ZERO_R1_F)
Apply the normalization factor found by UpdateRunningNorm() or on the fly by a single bit gate.
void ROL(bitLenInt shift, bitLenInt start, bitLenInt length)
Circular shift left - shift bits left, and carry last bits.
std::unique_ptr< real1[], void(*)(real1 *)> nrmArray
Definition: qengine_cuda.hpp:202
static const bitCapIntOcl OclMemDenom
1 / OclMemDenom is the maximum fraction of total OCL device RAM that a single state vector should occ...
Definition: qengine_cuda.hpp:238
int64_t GetDevice()
Get GPU device ID.
Definition: qengine_cuda.hpp:428
void UnlockSync()
Unlocks synchronization between the state vector buffer and general RAM, so the state vector can be o...
void CPhaseFlipIfLess(bitCapInt greaterPerm, bitLenInt start, bitLenInt length, bitLenInt flagIndex)
The 6502 uses its carry flag also as a greater-than/less-than flag, for the CMP operation.
void AddAlloc(size_t size)
Definition: qengine_cuda.hpp:456
void ArithmeticCall(OCLAPI api_call, const bitCapIntOcl(&bciArgs)[BCI_ARG_LEN], const unsigned char *values=NULL, bitCapIntOcl valuesLength=0U)
void ClearBuffer(BufferPtr buff, bitCapIntOcl offset, bitCapIntOcl size)
void INC(bitCapInt toAdd, bitLenInt start, bitLenInt length)
Add integer (without sign)
size_t FixGroupSize(size_t wic, size_t gs)
Definition: qengine_cuda.hpp:590
size_t totalOclAllocSize
Definition: qengine_cuda.hpp:189
void X(bitLenInt target)
void GetProbs(real1 *outputProbs)
Get the probabilities of the pure quantum state representation.
void CINC(bitCapInt toAdd, bitLenInt inOutStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Add integer (without sign, with controls)
QEnginePtr CloneEmpty()
Clone this QEngine's settings, with a zeroed state vector.
void GetQuantumState(complex *outputState)
Get the pure quantum state representation.
void SetAmplitudePage(const complex *pagePtr, bitCapIntOcl offset, bitCapIntOcl length)
Copy a "page" of amplitudes from pagePtr into this QEngine's internal state.
bitLenInt Compose(QEngineCUDAPtr toCopy, bitLenInt start)
~QEngineCUDA()
Definition: qengine_cuda.hpp:264
bitLenInt Compose(QInterfacePtr toCopy, bitLenInt start)
Definition: qengine_cuda.hpp:367
real1_f ProbMask(bitCapInt mask, bitCapInt permutation)
Direct measure of masked permutation probability.
void SetQuantumState(complex const *inputState)
Set an arbitrary pure quantum state representation.
void MULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length)
Multiplication modulo N by integer, (out of place)
void IFullAdd(bitLenInt inputBit1, bitLenInt inputBit2, bitLenInt carryInSumOut, bitLenInt carryOut)
Inverse of FullAdd.
void SetAmplitude(bitCapInt perm, complex amp)
Sets the representational amplitude of a full permutation.
QEngineCUDA(bitLenInt qBitCount, bitCapInt initState, qrack_rand_gen_ptr rgp=nullptr, complex phaseFac=CMPLX_DEFAULT_ARG, bool doNorm=false, bool randomGlobalPhase=true, bool useHostMem=false, int64_t devID=-1, bool useHardwareRNG=true, bool ignored=false, real1_f norm_thresh=REAL1_EPSILON, std::vector< int64_t > ignored2={}, bitLenInt ignored4=0U, real1_f ignored3=FP_NORM_EPSILON_F)
Initialize a Qrack::QEngineCUDA object.
int64_t deviceID
Definition: qengine_cuda.hpp:190
void INTBCDC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)
void INCS(bitCapInt toAdd, bitLenInt start, bitLenInt length, bitLenInt carryIndex)
Add a classical integer to the register, with sign and without carry.
void INCDECSC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)
Common driver method behind INCSC and DECSC (without overflow flag)
void ApplyM(bitCapInt mask, bitCapInt result, complex nrm)
void ROx(OCLAPI api_call, bitLenInt shift, bitLenInt start, bitLenInt length)
std::mutex queue_mutex
Definition: qengine_cuda.hpp:194
void CMULx(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length, const std::vector< bitLenInt > &controls)
bool isFinished()
Returns "false" if asynchronous work is still running, and "true" if all previously dispatched asynch...
Definition: qengine_cuda.hpp:448
void ResetStateBuffer(BufferPtr nStateBuffer)
bitCapInt IndexedLDA(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength, const unsigned char *values, bool resetValue=true)
Set 8 bit register bits by a superposed index-offset-based read from classical memory.
void Z(bitLenInt target)
Z gate.
void INT(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length)
void FreeStateVec()
Definition: qengine_cuda.hpp:531
complex permutationAmp
Definition: qengine_cuda.hpp:192
void WaitCall(OCLAPI api_call, size_t workItemCount, size_t localGroupSize, std::vector< BufferPtr > args, size_t localBuffSize=0U)
void ZeroAmplitudes()
Set all amplitudes to 0, and optionally temporarily deallocate state vector RAM.
bitCapIntOcl OpIndexed(OCLAPI api_call, bitCapIntOcl carryIn, bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength, bitLenInt carryIndex, const unsigned char *values)
void SetAmplitudePage(QEnginePtr pageEnginePtr, bitCapIntOcl srcOffset, bitCapIntOcl dstOffset, bitCapIntOcl length)
Copy a "page" of amplitudes from another QEngine, pointed to by pageEnginePtr, into this QEngine's in...
BufferPtr nrmBuffer
Definition: qengine_cuda.hpp:198
void CMUL(bitCapInt toMul, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Controlled multiplication by integer.
bool usingHostRam
Definition: qengine_cuda.hpp:185
std::list< QueueItem > wait_queue_items
Definition: qengine_cuda.hpp:200
void PhaseParity(real1_f radians, bitCapInt mask)
Parity phase gate.
void Invert(complex topRight, complex bottomLeft, bitLenInt qubitIndex)
Apply a single bit transformation that reverses bit probability and might affect phase.
DeviceContextPtr device_context
Definition: qengine_cuda.hpp:199
bool ForceMParity(bitCapInt mask, bool result, bool doForce=true)
Act as if a measurement of parity of the masked set of qubits was applied, except force the (usual...
void XMask(bitCapInt mask)
Masked X gate.
void CINT(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt start, bitLenInt length, const std::vector< bitLenInt > &controls)
void ProbRegAll(bitLenInt start, bitLenInt length, real1 *probsArray)
BufferPtr MakeStateVecBuffer(std::shared_ptr< complex > nStateVec)
real1_f ExpectationBitsAll(const std::vector< bitLenInt > &bits, bitCapInt offset=0)
Get permutation expectation value of bits.
void INCDECBCDC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)
Common driver method behind INCSC and DECSC (without overflow flag)
bitCapInt IndexedADC(bitLenInt indexStart, bitLenInt indexLength, bitLenInt valueStart, bitLenInt valueLength, bitLenInt carryIndex, const unsigned char *values)
Add to entangled 8 bit register state with a superposed index-offset-based read from classical memory...
void INTS(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex)
real1_f CtrlOrAntiProb(bool controlState, bitLenInt control, bitLenInt target)
real1_f ProbParity(bitCapInt mask)
Overall probability of any odd permutation of the masked set of bits.
void INCDECC(bitCapInt toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)
Common driver method behind INCC and DECC (without sign, with carry)
void POWModNOut(bitCapInt base, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length)
Raise a classical base to a quantum power, modulo N, (out of place)
void QueueSetDoNormalize(bool doNorm)
Add an operation to the (OpenCL) queue, to set the value of doNormalize, which controls whether to au...
Definition: qengine_cuda.hpp:319
void clDump()
Dumps the remaining asynchronous wait event list or queue of OpenCL events, for the current queue.
void MULModx(OCLAPI api_call, bitCapIntOcl toMod, bitCapIntOcl modN, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length)
bitLenInt Compose(QInterfacePtr toCopy)
Combine another QInterface with this one, after the last bit index of this one.
Definition: qengine_cuda.hpp:365
real1_f ParSum(real1 *toSum, bitCapIntOcl maxI)
void GetAmplitudePage(complex *pagePtr, bitCapIntOcl offset, bitCapIntOcl length)
Copy a "page" of amplitudes from this QEngine's internal state, into pagePtr.
bitCapInt MAll()
Measure permutation state of all coherent bits.
void LockSync(cl_map_flags flags=(CL_MAP_READ|CL_MAP_WRITE))
Locks synchronization between the state vector buffer and general RAM, so the state vector can be dir...
void INTSC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt overflowIndex, bitLenInt carryIndex)
complex GetAmplitude(bitCapInt perm)
Get the representational amplitude of a full permutation.
size_t FixWorkItemCount(size_t maxI, size_t wic)
Definition: qengine_cuda.hpp:579
void xMULx(OCLAPI api_call, const bitCapIntOcl *bciArgs, BufferPtr controlBuffer)
void INTC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)
real1_f SumSqrDiff(QEngineCUDAPtr toCompare)
void DIV(bitCapInt toDiv, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length)
Divide by integer.
std::shared_ptr< complex > AllocStateVec(bitCapInt elemCount, bool doForceAlloc=false)
BufferPtr MakeBuffer(cl_mem_flags flags, size_t size, void *host_ptr=NULL)
Definition: qengine_cuda.hpp:471
void InitOCL(int64_t devID)
bool IsZeroAmplitude()
Returns "true" only if amplitudes are all totally 0.
Definition: qengine_cuda.hpp:272
void INTSC(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt length, bitLenInt carryIndex)
void MULx(OCLAPI api_call, bitCapIntOcl toMod, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length)
void FreeRaw(cl_mem_flags flags, void *c)
Definition: qengine_cuda.hpp:519
size_t nrmGroupSize
Definition: qengine_cuda.hpp:188
void UniformParityRZ(bitCapInt mask, real1_f angle)
If the target qubit set parity is odd, this applies a phase factor of $e^{i \cdot \mathrm{angle}}$.
void UpdateRunningNorm(real1_f norm_thresh=REAL1_DEFAULT_ARG)
Force a calculation of the norm of the state vector, in order to make it unit length before the next ...
EventVecPtr ResetWaitEvents(bool waitQueue=true)
void * AllocRaw(cl_mem_flags flags, void *host_ptr, size_t size, cudaError_t *errorPtr)
Definition: qengine_cuda.hpp:507
size_t nrmGroupCount
Definition: qengine_cuda.hpp:187
void ApplyMx(OCLAPI api_call, const bitCapIntOcl *bciArgs, complex nrm)
bool unlockHostMem
Definition: qengine_cuda.hpp:186
void CMULModx(OCLAPI api_call, bitCapIntOcl toMod, bitCapIntOcl modN, bitLenInt inOutStart, bitLenInt carryStart, bitLenInt length, const std::vector< bitLenInt > &controls)
cl_map_flags lockSyncFlags
Definition: qengine_cuda.hpp:191
void SwitchHostPtr(bool useHostMem)
Switch to/from host/device state vector buffer.
Definition: qengine_cuda.hpp:282
void CIMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Inverse of controlled multiplication modulo N by integer, (out of place)
void CopyStateVec(QEnginePtr src)
Exactly copy the state vector of a different QEngine instance.
void ProbMaskAll(bitCapInt mask, real1 *probsArray)
Direct measure of masked permutation probability.
void BitMask(bitCapIntOcl mask, OCLAPI api_call, real1_f phase=(real1_f) PI_R1)
void DecomposeDispose(bitLenInt start, bitLenInt length, QEngineCUDAPtr dest)
void IMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length)
Inverse of multiplication modulo N by integer, (out of place)
real1_f Probx(OCLAPI api_call, const bitCapIntOcl *bciArgs)
PoolItemPtr GetFreePoolItem()
void Hash(bitLenInt start, bitLenInt length, const unsigned char *values)
Transform a length of qubit register via lookup through a hash table.
bitCapIntOcl GetMaxSize()
Definition: qengine_cuda.hpp:341
real1_f GetExpectation(bitLenInt valueStart, bitLenInt valueLength)
void clFinish(bool doHard=false)
Finishes the asynchronous wait event list or queue of OpenCL events.
void Dispose(bitLenInt start, bitLenInt length, bitCapInt disposedPerm)
Dispose a contiguous set of qubits that are already in a permutation eigenstate.
std::vector< PoolItemPtr > poolItems
Definition: qengine_cuda.hpp:201
void AddQueueItem(const QueueItem &item)
Definition: qengine_cuda.hpp:321
void CArithmeticCall(OCLAPI api_call, const bitCapIntOcl(&bciArgs)[BCI_ARG_LEN], bitCapIntOcl *controlPowers, bitLenInt controlLen, const unsigned char *values=NULL, bitCapIntOcl valuesLength=0U)
void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const *mtrx, bitLenInt bitCount, const bitCapIntOcl *qPowersSorted, bool doCalcNorm, SPECIAL_2X2 special, real1_f norm_thresh=REAL1_DEFAULT_ARG)
void CMULModNOut(bitCapInt toMul, bitCapInt modN, bitLenInt inStart, bitLenInt outStart, bitLenInt length, const std::vector< bitLenInt > &controls)
Controlled multiplication modulo N by integer, (out of place)
QInterfacePtr Clone()
Clone this QInterface.
void SetDevice(int64_t dID)
Set GPU device ID.
Abstract QEngine implementation, for all "Schroedinger method" engines.
Definition: qengine.hpp:31
virtual void Apply2x2(bitCapIntOcl offset1, bitCapIntOcl offset2, complex const *mtrx, bitLenInt bitCount, bitCapIntOcl const *qPowersSorted, bool doCalcNorm, real1_f norm_thresh=REAL1_DEFAULT_ARG)=0
bitCapIntOcl maxQPowerOcl
Definition: qengine.hpp:40
virtual void Decompose(bitLenInt start, QInterfacePtr dest)=0
Minimally decompose a set of contiguous bits from the separably composed unit, into "destination".
virtual void X(bitLenInt qubit)
X gate.
Definition: qinterface.hpp:1054
virtual bitLenInt Allocate(bitLenInt length)
Allocate new "length" count of |0> state qubits at end of qubit index position.
Definition: qinterface.hpp:434
virtual bitLenInt Compose(QInterfacePtr toCopy)
Combine another QInterface with this one, after the last bit index of this one.
Definition: qinterface.hpp:338
Definition: qengine_gpu_util.hpp:21
Half-precision floating-point type.
Definition: half.hpp:2222
virtual void Invert(const complex topRight, const complex bottomLeft, bitLenInt qubitIndex)
Apply a single bit transformation that reverses bit probability and might affect phase.
Definition: qinterface.hpp:493
virtual void UniformlyControlledSingleBit(const std::vector< bitLenInt > &controls, bitLenInt qubitIndex, const complex *mtrxs)
Apply a "uniformly controlled" arbitrary single bit unitary transformation.
Definition: qinterface.hpp:590
virtual void Z(bitLenInt qubit)
Z gate.
Definition: qinterface.hpp:1087
virtual void U(bitLenInt target, real1_f theta, real1_f phi, real1_f lambda)
General unitary gate.
Definition: rotational.cpp:18
virtual void Phase(const complex topLeft, const complex bottomRight, bitLenInt qubitIndex)
Apply a single bit transformation that only affects phase.
Definition: qinterface.hpp:480
virtual real1_f FirstNonzeroPhase()
Get phase of lowest permutation nonzero amplitude.
Definition: qinterface.hpp:2709
Definition: complex16x2simd.hpp:25
std::complex< half_float::half > complex
Definition: qrack_types.hpp:62
std::shared_ptr< QEngine > QEnginePtr
Definition: qrack_types.hpp:141
std::shared_ptr< OCLDeviceContext > DeviceContextPtr
Definition: oclengine.hpp:47
std::shared_ptr< QInterface > QInterfacePtr
Definition: qinterface.hpp:28
constexpr real1_f ZERO_R1_F
Definition: qrack_types.hpp:152
std::shared_ptr< EventVec > EventVecPtr
Definition: oclengine.hpp:51
constexpr real1_f FP_NORM_EPSILON_F
Definition: qrack_types.hpp:245
const real1 ONE_R1
Definition: qrack_types.hpp:153
unsigned long cl_map_flags
Definition: qengine_cuda.hpp:31
bitCapInt pow2(const bitLenInt &p)
Definition: qrack_functions.hpp:22
const real1 REAL1_DEFAULT_ARG
Definition: qrack_types.hpp:155
const real1 PI_R1
Definition: qrack_types.hpp:158
float real1_f
Definition: qrack_types.hpp:64
QRACK_CONST complex CMPLX_DEFAULT_ARG
Definition: qrack_types.hpp:242
std::shared_ptr< QEngineCUDA > QEngineCUDAPtr
Definition: qengine_cuda.hpp:47
std::shared_ptr< PoolItem > PoolItemPtr
Definition: qengine_cuda.hpp:162
SPECIAL_2X2
Definition: qengine_gpu_util.hpp:19
@ NONE
Definition: qengine_gpu_util.hpp:19
OCLAPI
Definition: oclapi.hpp:19
const real1 REAL1_EPSILON
Definition: qrack_types.hpp:157
std::shared_ptr< void > BufferPtr
Definition: qengine_cuda.hpp:45
unsigned long cl_mem_flags
Definition: qengine_cuda.hpp:32
bitLenInt log2(bitCapInt n)
Definition: qrack_functions.hpp:26
MICROSOFT_QUANTUM_DECL void U(_In_ uintq sid, _In_ uintq q, _In_ double theta, _In_ double phi, _In_ double lambda)
(External API) 3-parameter unitary gate
Definition: pinvoke_api.cpp:1362
#define CL_MAP_WRITE
Definition: qengine_cuda.hpp:36
#define BCI_ARG_LEN
Definition: qengine_cuda.hpp:25
#define CL_MEM_USE_HOST_PTR
Definition: qengine_cuda.hpp:41
#define CL_MEM_COPY_HOST_PTR
Definition: qengine_cuda.hpp:42
#define CMPLX_NORM_LEN
Definition: qengine_cuda.hpp:26
#define CL_MAP_READ
Definition: qengine_cuda.hpp:35
#define REAL_ARG_LEN
Definition: qengine_cuda.hpp:27
#define bitLenInt
Definition: qrack_types.hpp:44
#define qrack_rand_gen_ptr
Definition: qrack_types.hpp:146
#define bitCapInt
Definition: qrack_types.hpp:105
#define bitCapIntOcl
Definition: qrack_types.hpp:91
Definition: qengine_cuda.hpp:50
QueueItem(OCLAPI ac, size_t wic, size_t lgs, size_t ds, std::vector< BufferPtr > b, size_t lbs)
Definition: qengine_cuda.hpp:76
QueueItem(real1_f runningNrm)
Definition: qengine_cuda.hpp:104
bool doNorm
Definition: qengine_cuda.hpp:59
size_t workItemCount
Definition: qengine_cuda.hpp:52
std::vector< BufferPtr > buffers
Definition: qengine_cuda.hpp:55
size_t deallocSize
Definition: qengine_cuda.hpp:54
QueueItem()
Definition: qengine_cuda.hpp:62
bool isSetRunningNorm
Definition: qengine_cuda.hpp:58
QueueItem(bool doNrm)
Definition: qengine_cuda.hpp:90
size_t localBuffSize
Definition: qengine_cuda.hpp:56
OCLAPI api_call
Definition: qengine_cuda.hpp:51
bool isSetDoNorm
Definition: qengine_cuda.hpp:57
size_t localGroupSize
Definition: qengine_cuda.hpp:53
real1 runningNorm
Definition: qengine_cuda.hpp:60