Qrack  10.0
General classical-emulating-quantum development framework
oclengine.hpp
Go to the documentation of this file.
1 //
3 // (C) Daniel Strano and the Qrack contributors 2017-2023. All rights reserved.
4 //
5 // This is a multithreaded, universal quantum register simulation, allowing
6 // (nonphysical) register cloning and direct measurement of probability and
7 // phase, to leverage what advantages classical emulation of qubits can have.
8 //
9 // Licensed under the GNU Lesser General Public License V3.
10 // See LICENSE.md in the project root or https://www.gnu.org/licenses/lgpl-3.0.en.html
11 // for details.
12 
13 #pragma once
14 
15 #include "oclapi.hpp"
16 
17 #if !ENABLE_OPENCL
18 #error OpenCL has not been enabled
19 #endif
20 
21 #if defined(_WIN32) && !defined(__CYGWIN__)
22 #include <direct.h>
23 #endif
24 
25 #include <cstdint>
26 #include <map>
27 #include <memory>
28 #include <mutex>
29 #include <string>
30 #include <sys/stat.h>
31 
32 #if ENABLE_SNUCL
33 #include <CL/cl.hpp>
34 #include <stdexcept>
35 #elif defined(OPENCL_V3)
36 #include <CL/opencl.hpp>
37 #elif defined(__APPLE__)
38 #define CL_SILENCE_DEPRECATION
39 #include <CL/opencl.hpp>
40 #elif defined(_WIN32)
41 #include <CL/cl.hpp>
42 #else
43 #include <CL/cl2.hpp>
44 #endif
45 
46 namespace Qrack {
47 
48 class OCLDeviceCall;
49 
50 class OCLDeviceContext;
51 
52 typedef std::shared_ptr<OCLDeviceContext> DeviceContextPtr;
53 typedef std::vector<cl::Event> EventVec;
54 typedef std::shared_ptr<EventVec> EventVecPtr;
55 
58  std::string kernelname;
59 
60  OCLKernelHandle(OCLAPI o, std::string kn)
61  : oclapi(o)
62  , kernelname(kn)
63  {
64  }
65 };
66 
68 protected:
69  std::lock_guard<std::mutex> guard;
70 
71 public:
72  // A cl::Kernel is a unique object which should always be taken by reference, or the OCLDeviceContext will lose
73  // ownership.
74  cl::Kernel& call;
76 
77 protected:
78  OCLDeviceCall(std::mutex& m, cl::Kernel& c)
79  : guard(m)
80  , call(c)
81  {
82  }
83 
84  friend class OCLDeviceContext;
85 
86 private:
88 };
89 
91 public:
92  const cl::Platform platform;
93  const cl::Device device;
94  const cl::Context context;
95  const int64_t context_id;
96  const int64_t device_id;
97  const bool is_gpu;
98  const bool is_cpu;
99  const bool use_host_mem;
100  cl::CommandQueue queue;
102 
103 protected:
104  std::mutex waitEventsMutex;
105  std::map<OCLAPI, cl::Kernel> calls;
106  std::map<OCLAPI, std::unique_ptr<std::mutex>> mutexes;
107 
108 private:
109  const size_t procElemCount = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
110  const size_t maxWorkItems = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
111  const size_t maxWorkGroupSize = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
112  const size_t maxAlloc = device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
113  const size_t globalSize = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
114  const size_t localSize = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
115  size_t globalLimit;
118 
119 public:
120  OCLDeviceContext(cl::Platform& p, cl::Device& d, cl::Context& c, int64_t dev_id, int64_t cntxt_id, int64_t maxAlloc,
121  bool isGpu, bool isCpu, bool useHostMem)
122  : platform(p)
123  , device(d)
124  , context(c)
125  , context_id(cntxt_id)
126  , device_id(dev_id)
127  , is_gpu(isGpu)
128  , is_cpu(isCpu)
129  , use_host_mem(useHostMem)
130  , wait_events(new EventVec())
131 #if ENABLE_OCL_MEM_GUARDS
133 #else
134  , globalLimit((maxAlloc >= 0) ? maxAlloc : -1)
135 #endif
138  {
139  cl_int error;
140 #if ENABLE_OOO_OCL
141 #if ENABLE_ENV_VARS
142  if (getenv("DISABLE_OOO_OCL")) {
143  queue = cl::CommandQueue(c, d, 0, &error);
144  if (error != CL_SUCCESS) {
145  throw std::runtime_error("Failed to create OpenCL command queue!");
146  }
147  } else {
148  queue = cl::CommandQueue(c, d, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
149  if (error != CL_SUCCESS) {
150  queue = cl::CommandQueue(c, d, 0, &error);
151  if (error != CL_SUCCESS) {
152  throw std::runtime_error("Failed to create OpenCL command queue!");
153  }
154  }
155  }
156 #else
157  queue = cl::CommandQueue(c, d, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
158  if (error != CL_SUCCESS) {
159  queue = cl::CommandQueue(c, d, 0, &error);
160  if (error != CL_SUCCESS) {
161  throw std::runtime_error("Failed to create OpenCL command queue!");
162  }
163  }
164 #endif
165 #else
166  queue = cl::CommandQueue(c, d, 0, &error);
167  if (error != CL_SUCCESS) {
168  throw std::runtime_error("Failed to create OpenCL command queue!");
169  }
170 #endif
171  }
172 
173  OCLDeviceCall Reserve(OCLAPI call) { return OCLDeviceCall(*(mutexes[call]), calls[call]); }
174 
176  {
177  std::lock_guard<std::mutex> guard(waitEventsMutex);
178  EventVecPtr waitVec = std::move(wait_events);
180  return waitVec;
181  }
182 
183  template <typename Fn> void EmplaceEvent(Fn fn)
184  {
185  std::lock_guard<std::mutex> guard(waitEventsMutex);
186  wait_events->emplace_back();
187  fn(wait_events->back());
188  }
189 
191  {
192  std::lock_guard<std::mutex> guard(waitEventsMutex);
193  if ((wait_events.get())->size()) {
194  cl::Event::waitForEvents((const EventVec&)*(wait_events.get()));
195  wait_events->clear();
196  }
197  }
198 
200  {
201  return preferredSizeMultiple
204  calls[OCL_API_APPLY2X2_NORM_SINGLE].getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(
205  device);
206  }
207 
209  {
210  if (preferredConcurrency) {
211  return preferredConcurrency;
212  }
213 
214  int hybridOffset = 3U;
215 #if ENABLE_ENV_VARS
216  if (getenv("QRACK_GPU_OFFSET_QB")) {
217  hybridOffset = std::stoi(std::string(getenv("QRACK_GPU_OFFSET_QB")));
218  }
219 #endif
220 
221  const size_t pc = procElemCount * GetPreferredSizeMultiple();
223  while (preferredConcurrency < pc) {
224  preferredConcurrency <<= 1U;
225  }
227  hybridOffset > 0 ? (preferredConcurrency << hybridOffset) : (preferredConcurrency >> -hybridOffset);
228  if (preferredConcurrency < 1U) {
230  }
231 
232  return preferredConcurrency;
233  }
234 
235  size_t GetProcElementCount() { return procElemCount; }
236  size_t GetMaxWorkItems() { return maxWorkItems; }
238  size_t GetMaxAlloc() { return maxAlloc; }
239  size_t GetGlobalSize() { return globalSize; }
240  size_t GetLocalSize() { return localSize; }
241  size_t GetGlobalAllocLimit() { return globalLimit; }
242 
243  friend class OCLEngine;
244 };
245 
247  std::vector<DeviceContextPtr> all_dev_contexts;
249 
251  : all_dev_contexts()
252  , default_dev_context{ nullptr }
253  {
254  // Intentionally left blank
255  }
256 
257  InitOClResult(std::vector<DeviceContextPtr> adc, DeviceContextPtr ddc)
258  : all_dev_contexts(adc)
259  , default_dev_context(ddc)
260  {
261  // Intentionally left blank
262  }
263 };
264 
266 class OCLEngine {
267 public:
268  // See https://stackoverflow.com/questions/1008019/c-singleton-design-pattern
270  static OCLEngine& Instance()
271  {
272  static OCLEngine instance;
273  return instance;
274  }
/// Get default location for precompiled binaries.
///
/// When ENABLE_ENV_VARS is on and QRACK_OCL_PATH is set to a non-empty value,
/// that value is returned with a trailing path separator appended if it does
/// not already end in '/' or '\\'. Otherwise the result is "<home>/.qrack/"
/// ("%HOMEDRIVE%%HOMEPATH%\\.qrack\\" on native Windows), where any unset
/// environment variable contributes an empty string.
///
/// @return Directory path ending in a path separator.
static std::string GetDefaultBinaryPath()
{
#if ENABLE_ENV_VARS
    // An empty QRACK_OCL_PATH is treated as unset: calling back() on an empty
    // string is undefined behavior, and an empty override carries no path.
    const char* oclPathEnv = getenv("QRACK_OCL_PATH");
    if (oclPathEnv && *oclPathEnv) {
        std::string toRet(oclPathEnv);
        if ((toRet.back() != '/') && (toRet.back() != '\\')) {
#if defined(_WIN32) && !defined(__CYGWIN__)
            toRet += "\\";
#else
            toRet += "/";
#endif
        }

        return toRet;
    }
#endif

#if defined(_WIN32) && !defined(__CYGWIN__)
    const char* homeDrive = getenv("HOMEDRIVE");
    const char* homePath = getenv("HOMEPATH");
    return std::string(homeDrive ? homeDrive : "") + std::string(homePath ? homePath : "") + "\\.qrack\\";
#else
    const char* home = getenv("HOME");
    return std::string(home ? home : "") + "/.qrack/";
#endif
}
303  static InitOClResult InitOCL(bool buildFromSource = false, bool saveBinaries = false, std::string home = "*",
304  std::vector<int64_t> maxAllocVec = { -1 });
305 
307  DeviceContextPtr GetDeviceContextPtr(const int64_t& dev = -1);
309  std::vector<DeviceContextPtr> GetDeviceContextPtrVector();
314  void SetDeviceContextPtrVector(std::vector<DeviceContextPtr> vec, DeviceContextPtr dcp = nullptr);
316  int GetDeviceCount() { return all_device_contexts.size(); }
318  size_t GetDefaultDeviceID() { return default_device_context->device_id; }
321 
322  size_t GetActiveAllocSize(const int64_t& dev)
323  {
324  if (dev > ((int64_t)activeAllocSizes.size())) {
325  throw std::invalid_argument("OCLEngine::GetActiveAllocSize device ID is too high!");
326  }
327 
328  return (dev < 0) ? activeAllocSizes[GetDefaultDeviceID()] : activeAllocSizes[(size_t)dev];
329  }
330  size_t AddToActiveAllocSize(const int64_t& dev, size_t size)
331  {
332  if (dev > ((int64_t)activeAllocSizes.size())) {
333  throw std::invalid_argument("OCLEngine::GetActiveAllocSize device ID is too high!");
334  }
335 
336  const size_t lDev = (dev < 0) ? GetDefaultDeviceID() : dev;
337 
338  if (size == 0) {
339  return activeAllocSizes[lDev];
340  }
341 
342  std::lock_guard<std::mutex> lock(allocMutex);
343  activeAllocSizes[lDev] += size;
344 
345  return activeAllocSizes[lDev];
346  }
347  size_t SubtractFromActiveAllocSize(const int64_t& dev, size_t size)
348  {
349  if (dev > ((int64_t)activeAllocSizes.size())) {
350  throw std::invalid_argument("OCLEngine::GetActiveAllocSize device ID is too high!");
351  }
352 
353  const size_t lDev = (dev < 0) ? GetDefaultDeviceID() : dev;
354 
355  if (size == 0) {
356  return activeAllocSizes[lDev];
357  }
358 
359  std::lock_guard<std::mutex> lock(allocMutex);
360  if (size < activeAllocSizes[lDev]) {
361  activeAllocSizes[lDev] -= size;
362  } else {
363  activeAllocSizes[lDev] = 0;
364  }
365 
366  return activeAllocSizes[lDev];
367  }
368  void ResetActiveAllocSize(const int64_t& dev)
369  {
370  if (dev > ((int64_t)activeAllocSizes.size())) {
371  throw std::invalid_argument("OCLEngine::GetActiveAllocSize device ID is too high!");
372  }
373  const size_t lDev = (dev < 0) ? GetDefaultDeviceID() : dev;
374  std::lock_guard<std::mutex> lock(allocMutex);
375  // User code should catch std::bad_alloc and reset:
376  activeAllocSizes[lDev] = 0;
377  }
378 
379  OCLEngine(OCLEngine const&) = delete;
380  void operator=(OCLEngine const&) = delete;
381 
382 private:
383  static const std::vector<OCLKernelHandle> kernelHandles;
384  static const std::string binary_file_prefix;
385  static const std::string binary_file_ext;
386 
387  std::vector<size_t> activeAllocSizes;
388  std::vector<int64_t> maxActiveAllocSizes;
389  std::mutex allocMutex;
390  std::vector<DeviceContextPtr> all_device_contexts;
392 
393  OCLEngine(); // Private so that it can not be called
394 
396  static cl::Program MakeProgram(bool buildFromSource, std::string path, std::shared_ptr<OCLDeviceContext> devCntxt);
398  static void SaveBinary(cl::Program program, std::string path, std::string fileName);
399 };
400 
401 } // namespace Qrack
Definition: oclengine.hpp:67
std::lock_guard< std::mutex > guard
Definition: oclengine.hpp:69
OCLDeviceCall & operator=(const OCLDeviceCall &)=delete
cl::Kernel & call
Definition: oclengine.hpp:74
OCLDeviceCall(std::mutex &m, cl::Kernel &c)
Definition: oclengine.hpp:78
OCLDeviceCall(const OCLDeviceCall &)
Definition: oclengine.hpp:90
OCLDeviceContext(cl::Platform &p, cl::Device &d, cl::Context &c, int64_t dev_id, int64_t cntxt_id, int64_t maxAlloc, bool isGpu, bool isCpu, bool useHostMem)
Definition: oclengine.hpp:120
const cl::Device device
Definition: oclengine.hpp:93
const size_t globalSize
Definition: oclengine.hpp:113
size_t globalLimit
Definition: oclengine.hpp:115
size_t GetPreferredSizeMultiple()
Definition: oclengine.hpp:199
const size_t localSize
Definition: oclengine.hpp:114
std::mutex waitEventsMutex
Definition: oclengine.hpp:104
const bool use_host_mem
Definition: oclengine.hpp:99
const int64_t context_id
Definition: oclengine.hpp:95
std::map< OCLAPI, cl::Kernel > calls
Definition: oclengine.hpp:105
void EmplaceEvent(Fn fn)
Definition: oclengine.hpp:183
size_t GetGlobalSize()
Definition: oclengine.hpp:239
size_t GetMaxAlloc()
Definition: oclengine.hpp:238
std::map< OCLAPI, std::unique_ptr< std::mutex > > mutexes
Definition: oclengine.hpp:106
const bool is_cpu
Definition: oclengine.hpp:98
OCLDeviceCall Reserve(OCLAPI call)
Definition: oclengine.hpp:173
size_t GetMaxWorkGroupSize()
Definition: oclengine.hpp:237
void WaitOnAllEvents()
Definition: oclengine.hpp:190
size_t GetLocalSize()
Definition: oclengine.hpp:240
const int64_t device_id
Definition: oclengine.hpp:96
const size_t maxWorkGroupSize
Definition: oclengine.hpp:111
cl::CommandQueue queue
Definition: oclengine.hpp:100
size_t preferredConcurrency
Definition: oclengine.hpp:117
size_t GetMaxWorkItems()
Definition: oclengine.hpp:236
const size_t procElemCount
Definition: oclengine.hpp:109
EventVecPtr ResetWaitEvents()
Definition: oclengine.hpp:175
const cl::Context context
Definition: oclengine.hpp:94
size_t GetPreferredConcurrency()
Definition: oclengine.hpp:208
size_t preferredSizeMultiple
Definition: oclengine.hpp:116
const bool is_gpu
Definition: oclengine.hpp:97
size_t GetGlobalAllocLimit()
Definition: oclengine.hpp:241
size_t GetProcElementCount()
Definition: oclengine.hpp:235
EventVecPtr wait_events
Definition: oclengine.hpp:101
const size_t maxWorkItems
Definition: oclengine.hpp:110
const cl::Platform platform
Definition: oclengine.hpp:92
const size_t maxAlloc
Definition: oclengine.hpp:112
"Qrack::OCLEngine" manages the single OpenCL context.
Definition: oclengine.hpp:266
size_t AddToActiveAllocSize(const int64_t &dev, size_t size)
Definition: oclengine.hpp:330
static const std::string binary_file_ext
Definition: oclengine.hpp:385
std::vector< int64_t > maxActiveAllocSizes
Definition: oclengine.hpp:388
int GetDeviceCount()
Get the count of devices in the current list.
Definition: oclengine.hpp:316
std::vector< DeviceContextPtr > all_device_contexts
Definition: oclengine.hpp:390
OCLEngine(OCLEngine const &)=delete
std::vector< size_t > activeAllocSizes
Definition: oclengine.hpp:387
void ResetActiveAllocSize(const int64_t &dev)
Definition: oclengine.hpp:368
void operator=(OCLEngine const &)=delete
static const std::vector< OCLKernelHandle > kernelHandles
Definition: oclengine.hpp:383
std::vector< DeviceContextPtr > GetDeviceContextPtrVector()
Get the list of all available devices (and their supporting objects).
Definition: oclengine.cpp:153
size_t SubtractFromActiveAllocSize(const int64_t &dev, size_t size)
Definition: oclengine.hpp:347
OCLEngine()
Definition: oclengine.cpp:447
static const std::string binary_file_prefix
Definition: oclengine.hpp:384
std::mutex allocMutex
Definition: oclengine.hpp:389
static OCLEngine & Instance()
Get a pointer to the Instance of the singleton. (The instance will be instantiated,...
Definition: oclengine.hpp:270
static void SaveBinary(cl::Program program, std::string path, std::string fileName)
Save the program binary:
Definition: oclengine.cpp:243
static InitOClResult InitOCL(bool buildFromSource=false, bool saveBinaries=false, std::string home="*", std::vector< int64_t > maxAllocVec={ -1 })
Initialize the OCL environment, with the option to save the generated binaries.
Definition: oclengine.cpp:281
void SetDefaultDeviceContext(DeviceContextPtr dcp)
Pick a default device, for QEngineOCL instances that don't specify a preferred device.
Definition: oclengine.cpp:162
DeviceContextPtr default_device_context
Definition: oclengine.hpp:391
size_t GetActiveAllocSize(const int64_t &dev)
Definition: oclengine.hpp:322
static cl::Program MakeProgram(bool buildFromSource, std::string path, std::shared_ptr< OCLDeviceContext > devCntxt)
Make the program, from either source or binary.
Definition: oclengine.cpp:164
static std::string GetDefaultBinaryPath()
Get default location for precompiled binaries:
Definition: oclengine.hpp:276
size_t GetDefaultDeviceID()
Get default device ID.
Definition: oclengine.hpp:318
DeviceContextPtr GetDeviceContextPtr(const int64_t &dev=-1)
Get a pointer to one of the available OpenCL contexts, by its index in the list of all contexts.
Definition: oclengine.cpp:54
void SetDeviceContextPtrVector(std::vector< DeviceContextPtr > vec, DeviceContextPtr dcp=nullptr)
Set the list of DeviceContextPtr objects available for use.
Definition: oclengine.cpp:154
GLOSSARY: bitLenInt - "bit-length integer" - unsigned integer ID of qubit position in register bitCap...
Definition: complex16x2simd.hpp:25
std::shared_ptr< OCLDeviceContext > DeviceContextPtr
Definition: oclengine.hpp:50
std::shared_ptr< EventVec > EventVecPtr
Definition: oclengine.hpp:54
void U(quid sid, bitLenInt q, real1_f theta, real1_f phi, real1_f lambda)
(External API) 3-parameter unitary gate
Definition: wasm_api.cpp:1199
OCLAPI
Definition: oclapi.hpp:19
@ OCL_API_APPLY2X2_NORM_SINGLE
Definition: oclapi.hpp:23
std::vector< cl::Event > EventVec
Definition: oclengine.hpp:53
Definition: oclengine.hpp:246
std::vector< DeviceContextPtr > all_dev_contexts
Definition: oclengine.hpp:247
InitOClResult()
Definition: oclengine.hpp:250
InitOClResult(std::vector< DeviceContextPtr > adc, DeviceContextPtr ddc)
Definition: oclengine.hpp:257
DeviceContextPtr default_dev_context
Definition: oclengine.hpp:248
Definition: oclengine.hpp:56
std::string kernelname
Definition: oclengine.hpp:58
OCLAPI oclapi
Definition: oclengine.hpp:57
OCLKernelHandle(OCLAPI o, std::string kn)
Definition: oclengine.hpp:60