Ginkgo version 1.8.0, generated from the develop branch
A numerical linear algebra library targeting many-core architectures
executor.hpp
1// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
2//
3// SPDX-License-Identifier: BSD-3-Clause
4
5#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
7
8
9#include <array>
10#include <atomic>
11#include <iostream>
12#include <memory>
13#include <mutex>
14#include <sstream>
15#include <string>
16#include <tuple>
17#include <type_traits>
18#include <vector>
19
20
21#include <ginkgo/core/base/device.hpp>
22#include <ginkgo/core/base/fwd_decls.hpp>
23#include <ginkgo/core/base/machine_topology.hpp>
24#include <ginkgo/core/base/memory.hpp>
25#include <ginkgo/core/base/scoped_device_id_guard.hpp>
26#include <ginkgo/core/base/types.hpp>
27#include <ginkgo/core/log/logger.hpp>
28#include <ginkgo/core/synthesizer/containers.hpp>
29
30
31namespace gko {
32
33
35enum class log_propagation_mode {
41 never,
47 automatic
48};
49
50
63enum class allocation_mode { device, unified_global, unified_host };
64
65
66#ifdef NDEBUG
67
68// When in release, prefer device allocations
69constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
70
71constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
72
73#else
74
75// When in debug, always use UM allocations.
76constexpr allocation_mode default_cuda_alloc_mode =
77 allocation_mode::unified_global;
78
79#if (GINKGO_HIP_PLATFORM_HCC == 1)
80
81// HIP on AMD GPUs does not support UM, so always prefer device allocations.
82constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
83
84#else
85
86// HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
87constexpr allocation_mode default_hip_alloc_mode =
88 allocation_mode::unified_global;
89
90#endif
91
92#endif
93
94
95} // namespace gko
96
97
102enum class dpcpp_queue_property {
106 in_order = 1,
107
111 enable_profiling = 2
112};
113
114GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
115 dpcpp_queue_property b)
116{
117 return static_cast<dpcpp_queue_property>(static_cast<int>(a) |
118 static_cast<int>(b));
119}
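// --- Illustrative sketch (not part of this header) ---
// Since the enumerators are distinct bits, properties can be combined with
// the overloaded operator| when requesting a queue, e.g.:
//
//     auto props = dpcpp_queue_property::in_order |
//                  dpcpp_queue_property::enable_profiling;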
120
121
122namespace gko {
123
124
125#define GKO_FORWARD_DECLARE(_type, ...) class _type
126
127GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
128
129#undef GKO_FORWARD_DECLARE
130
131
132class ReferenceExecutor;
133
134
135namespace detail {
136
137
138template <typename>
139class ExecutorBase;
140
141
142} // namespace detail
143
144
259class Operation {
260public:
261#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
262 virtual void run(std::shared_ptr<const _type>) const
263
264 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
265
266#undef GKO_DECLARE_RUN_OVERLOAD
267
268 // ReferenceExecutor overload can be defaulted to OmpExecutor's
269 virtual void run(std::shared_ptr<const ReferenceExecutor> executor) const;
270
276 virtual const char* get_name() const noexcept;
277};
278
279
280namespace detail {
281
282
292template <typename Closure>
293class RegisteredOperation : public Operation {
294public:
301 RegisteredOperation(const char* name, Closure op)
302 : name_(name), op_(std::move(op))
303 {}
304
305 const char* get_name() const noexcept override { return name_; }
306
307 void run(std::shared_ptr<const ReferenceExecutor> exec) const override
308 {
309 op_(exec);
310 }
311
312 void run(std::shared_ptr<const OmpExecutor> exec) const override
313 {
314 op_(exec);
315 }
316
317 void run(std::shared_ptr<const CudaExecutor> exec) const override
318 {
319 op_(exec);
320 }
321
322 void run(std::shared_ptr<const HipExecutor> exec) const override
323 {
324 op_(exec);
325 }
326
327 void run(std::shared_ptr<const DpcppExecutor> exec) const override
328 {
329 op_(exec);
330 }
331
332private:
333 const char* name_;
334 Closure op_;
335};
336
337
338template <typename Closure>
339RegisteredOperation<Closure> make_register_operation(const char* name,
340 Closure op)
341{
342 return RegisteredOperation<Closure>{name, std::move(op)};
343}
344
345
346} // namespace detail
347
348
420#define GKO_REGISTER_OPERATION(_name, _kernel) \
421 template <typename... Args> \
422 auto make_##_name(Args&&... args) \
423 { \
424 return ::gko::detail::make_register_operation( \
425 #_kernel, [&args...](auto exec) { \
426 using exec_type = decltype(exec); \
427 if (std::is_same< \
428 exec_type, \
429 std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
430 value) { \
431 ::gko::kernels::reference::_kernel( \
432 std::dynamic_pointer_cast< \
433 const ::gko::ReferenceExecutor>(exec), \
434 std::forward<Args>(args)...); \
435 } else if (std::is_same< \
436 exec_type, \
437 std::shared_ptr<const ::gko::OmpExecutor>>:: \
438 value) { \
439 ::gko::kernels::omp::_kernel( \
440 std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
441 exec), \
442 std::forward<Args>(args)...); \
443 } else if (std::is_same< \
444 exec_type, \
445 std::shared_ptr<const ::gko::CudaExecutor>>:: \
446 value) { \
447 ::gko::kernels::cuda::_kernel( \
448 std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
449 exec), \
450 std::forward<Args>(args)...); \
451 } else if (std::is_same< \
452 exec_type, \
453 std::shared_ptr<const ::gko::HipExecutor>>:: \
454 value) { \
455 ::gko::kernels::hip::_kernel( \
456 std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
457 exec), \
458 std::forward<Args>(args)...); \
459 } else if (std::is_same< \
460 exec_type, \
461 std::shared_ptr<const ::gko::DpcppExecutor>>:: \
462 value) { \
463 ::gko::kernels::dpcpp::_kernel( \
464 std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
465 exec), \
466 std::forward<Args>(args)...); \
467 } else { \
468 GKO_NOT_IMPLEMENTED; \
469 } \
470 }); \
471 } \
472 static_assert(true, \
473 "This assert is used to counter the false positive extra " \
474 "semi-colon warnings")
475
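// --- Illustrative usage sketch (not part of this header) ---
// GKO_REGISTER_OPERATION(_name, _kernel) generates a make_##_name factory
// whose result is passed to Executor::run. Assuming a kernel my_kernel
// (hypothetical) is implemented in each kernels::<backend> namespace with
// the signature void my_kernel(std::shared_ptr<const DefaultExecutor>, int):
//
//     GKO_REGISTER_OPERATION(my_op, my_kernel);
//
//     auto exec = gko::OmpExecutor::create();
//     exec->run(make_my_op(5));  // dispatches to gko::kernels::omp::my_kernel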
476
514#define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
515 template <typename... Args> \
516 auto make_##_name(Args&&... args) \
517 { \
518 return ::gko::detail::make_register_operation( \
519 #_kernel, \
520 [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
521 } \
522 static_assert(true, \
523 "This assert is used to counter the false positive extra " \
524 "semi-colon warnings")
525
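// --- Illustrative usage sketch (not part of this header) ---
// The host variant wraps a single host-side function, so the call still goes
// through Executor::run and remains visible to operation loggers. Assuming a
// free function void compute_residual_norm(const double* r, int n, double*
// out) (hypothetical):
//
//     GKO_REGISTER_HOST_OPERATION(residual_norm, compute_residual_norm);
//
//     exec->run(make_residual_norm(r, n, &norm));  // runs on the host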
526
527#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
528
616class Executor : public log::EnableLogging<Executor> {
617 template <typename T>
618 friend class detail::ExecutorBase;
619
620 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
621 friend class ReferenceExecutor;
622
623public:
624 virtual ~Executor() = default;
625
626 Executor() = default;
627 Executor(Executor&) = delete;
628 Executor(Executor&&) = delete;
629 Executor& operator=(Executor&) = delete;
630 Executor& operator=(Executor&&) = delete;
631
637 virtual void run(const Operation& op) const = 0;
638
653 template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
654 typename ClosureDpcpp>
655 void run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
656 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp) const
657 {
658 LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
659 op_omp, op_cuda, op_hip, op_dpcpp);
660 this->run(op);
661 }
662
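// --- Illustrative usage sketch (not part of this header) ---
// One closure per backend; a ReferenceExecutor reuses the OpenMP closure.
//
//     exec->run(
//         [&] { /* OpenMP / reference implementation */ },
//         [&] { /* CUDA implementation */ },
//         [&] { /* HIP implementation */ },
//         [&] { /* DPC++ implementation */ });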
674 template <typename T>
675 T* alloc(size_type num_elems) const
676 {
677 this->template log<log::Logger::allocation_started>(
678 this, num_elems * sizeof(T));
679 T* allocated = static_cast<T*>(this->raw_alloc(num_elems * sizeof(T)));
680 this->template log<log::Logger::allocation_completed>(
681 this, num_elems * sizeof(T), reinterpret_cast<uintptr>(allocated));
682 return allocated;
683 }
684
692 void free(void* ptr) const noexcept
693 {
694 this->template log<log::Logger::free_started>(
695 this, reinterpret_cast<uintptr>(ptr));
696 this->raw_free(ptr);
697 this->template log<log::Logger::free_completed>(
698 this, reinterpret_cast<uintptr>(ptr));
699 }
700
713 template <typename T>
714 void copy_from(ptr_param<const Executor> src_exec, size_type num_elems,
715 const T* src_ptr, T* dest_ptr) const
716 {
717 const auto src_loc = reinterpret_cast<uintptr>(src_ptr);
718 const auto dest_loc = reinterpret_cast<uintptr>(dest_ptr);
719 this->template log<log::Logger::copy_started>(
720 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
721 if (this != src_exec.get()) {
722 src_exec->template log<log::Logger::copy_started>(
723 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
724 }
725 try {
726 this->raw_copy_from(src_exec.get(), num_elems * sizeof(T), src_ptr,
727 dest_ptr);
728 } catch (NotSupported&) {
729#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
730 // Unoptimized copy. Try to go through the masters.
731 // output to log when verbose >= 1 and debug build
732 std::clog << "Not direct copy. Try to copy data from the masters."
733 << std::endl;
734#endif
735 auto src_master = src_exec->get_master().get();
736 if (num_elems > 0 && src_master != src_exec.get()) {
737 auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
738 src_master->copy_from<T>(src_exec, num_elems, src_ptr,
739 master_ptr);
740 this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
741 src_master->free(master_ptr);
742 }
743 }
744 this->template log<log::Logger::copy_completed>(
745 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
746 if (this != src_exec.get()) {
747 src_exec->template log<log::Logger::copy_completed>(
748 src_exec.get(), this, src_loc, dest_loc, num_elems * sizeof(T));
749 }
750 }
751
763 template <typename T>
764 void copy(size_type num_elems, const T* src_ptr, T* dest_ptr) const
765 {
766 this->copy_from(this, num_elems, src_ptr, dest_ptr);
767 }
768
778 template <typename T>
779 T copy_val_to_host(const T* ptr) const
780 {
781 T out{};
782 this->get_master()->copy_from(this, 1, ptr, &out);
783 return out;
784 }
785
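// --- Illustrative usage sketch (not part of this header) ---
// Typical host-to-device round trip; host_data and n are assumed to be
// prepared by the caller, and a CUDA device 0 is assumed to exist:
//
//     auto host = gko::OmpExecutor::create();
//     auto cuda = gko::CudaExecutor::create(0, host);
//     auto* dev = cuda->alloc<double>(n);
//     cuda->copy_from(host, n, host_data, dev);    // host -> device
//     double first = cuda->copy_val_to_host(dev);  // device -> host scalar
//     cuda->free(dev);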
790 virtual std::shared_ptr<Executor> get_master() noexcept = 0;
791
795 virtual std::shared_ptr<const Executor> get_master() const noexcept = 0;
796
800 virtual void synchronize() const = 0;
801
808 void add_logger(std::shared_ptr<const log::Logger> logger) override
809 {
810 this->propagating_logger_refcount_.fetch_add(
811 logger->needs_propagation() ? 1 : 0);
812 this->EnableLogging<Executor>::add_logger(logger);
813 }
814
821 void remove_logger(const log::Logger* logger) override
822 {
823 this->propagating_logger_refcount_.fetch_sub(
824 logger->needs_propagation() ? 1 : 0);
825 this->EnableLogging<Executor>::remove_logger(logger);
826 }
827
828 using EnableLogging<Executor>::remove_logger;
829
837 void set_log_propagation_mode(log_propagation_mode mode)
838 {
839 log_propagation_mode_ = mode;
840 }
841
849 bool should_propagate_log() const
850 {
851 return this->propagating_logger_refcount_.load() > 0 &&
852 log_propagation_mode_ == log_propagation_mode::automatic;
853 }
854
862 bool memory_accessible(const std::shared_ptr<const Executor>& other) const
863 {
864 return this->verify_memory_from(other.get());
865 }
866
867 virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0;
868
869protected:
874 struct exec_info {
878 int device_id = -1;
879
883 std::string device_type;
884
888 int numa_node = -1;
889
898 int num_computing_units = -1;
899
911 int num_pu_per_cu = -1;
912
921 std::vector<int> subgroup_sizes{};
922
931 int max_subgroup_size = -1;
932
943 std::vector<int> max_workitem_sizes{};
944
954 int max_workgroup_size;
955
959 int major = -1;
960
964 int minor = -1;
965
971 std::string pci_bus_id = std::string(13, 'x');
972
983 std::vector<int> closest_pu_ids{};
984 };
985
991 const exec_info& get_exec_info() const { return this->exec_info_; }
992
1002 virtual void* raw_alloc(size_type size) const = 0;
1003
1011 virtual void raw_free(void* ptr) const noexcept = 0;
1012
1023 virtual void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1024 const void* src_ptr, void* dest_ptr) const = 0;
1025
1035#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1036 virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1037 const void* src_ptr, void* dest_ptr) const = 0
1038
1039 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1040
1041#undef GKO_ENABLE_RAW_COPY_TO
1042
1050 virtual bool verify_memory_from(const Executor* src_exec) const = 0;
1051
1061#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1062 virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1063
1064 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1065
1066 GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1067
1068#undef GKO_ENABLE_VERIFY_MEMORY_TO
1069
1076 virtual void populate_exec_info(const machine_topology* mach_topo) = 0;
1077
1083 exec_info& get_exec_info() { return this->exec_info_; }
1084
1085 exec_info exec_info_;
1086
1087 log_propagation_mode log_propagation_mode_{log_propagation_mode::automatic};
1088
1089 std::atomic<int> propagating_logger_refcount_{};
1090
1091private:
1106 template <typename ClosureOmp, typename ClosureCuda, typename ClosureHip,
1107 typename ClosureDpcpp>
1108 class LambdaOperation : public Operation {
1109 public:
1120 LambdaOperation(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
1121 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
1122 : op_omp_(op_omp),
1123 op_cuda_(op_cuda),
1124 op_hip_(op_hip),
1125 op_dpcpp_(op_dpcpp)
1126 {}
1127
1128 void run(std::shared_ptr<const OmpExecutor>) const override
1129 {
1130 op_omp_();
1131 }
1132
1133 void run(std::shared_ptr<const ReferenceExecutor>) const override
1134 {
1135 op_omp_();
1136 }
1137
1138 void run(std::shared_ptr<const CudaExecutor>) const override
1139 {
1140 op_cuda_();
1141 }
1142
1143 void run(std::shared_ptr<const HipExecutor>) const override
1144 {
1145 op_hip_();
1146 }
1147
1148 void run(std::shared_ptr<const DpcppExecutor>) const override
1149 {
1150 op_dpcpp_();
1151 }
1152
1153 private:
1154 ClosureOmp op_omp_;
1155 ClosureCuda op_cuda_;
1156 ClosureHip op_hip_;
1157 ClosureDpcpp op_dpcpp_;
1158 };
1159};
1160
1161
1170template <typename T>
1171class executor_deleter {
1172public:
1173 using pointer = T*;
1174
1180 explicit executor_deleter(std::shared_ptr<const Executor> exec)
1181 : exec_{exec}
1182 {}
1183
1189 void operator()(pointer ptr) const
1190 {
1191 if (exec_) {
1192 exec_->free(ptr);
1193 }
1194 }
1195
1196private:
1197 std::shared_ptr<const Executor> exec_;
1198};
1199
1200// a specialization for arrays
1201template <typename T>
1202class executor_deleter<T[]> {
1203public:
1204 using pointer = T[];
1205
1206 explicit executor_deleter(std::shared_ptr<const Executor> exec)
1207 : exec_{exec}
1208 {}
1209
1210 void operator()(pointer ptr) const
1211 {
1212 if (exec_) {
1213 exec_->free(ptr);
1214 }
1215 }
1216
1217private:
1218 std::shared_ptr<const Executor> exec_;
1219};
1220
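// --- Illustrative usage sketch (not part of this header) ---
// executor_deleter makes raw executor allocations safe to hold in a
// std::unique_ptr; exec and n are assumed to be provided by the caller:
//
//     std::unique_ptr<double[], gko::executor_deleter<double[]>> data(
//         exec->alloc<double>(n), gko::executor_deleter<double[]>{exec});
//     // memory is released via exec->free() when data goes out of scope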
1221
1222namespace detail {
1223
1224
1225template <typename ConcreteExecutor>
1226class ExecutorBase : public Executor {
1227 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
1228 friend class ReferenceExecutor;
1229
1230public:
1231 using Executor::run;
1232
1233 void run(const Operation& op) const override
1234 {
1235 this->template log<log::Logger::operation_launched>(this, &op);
1236 auto scope_guard = get_scoped_device_id_guard();
1237 op.run(self()->shared_from_this());
1238 this->template log<log::Logger::operation_completed>(this, &op);
1239 }
1240
1241protected:
1242 void raw_copy_from(const Executor* src_exec, size_type n_bytes,
1243 const void* src_ptr, void* dest_ptr) const override
1244 {
1245 src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1246 }
1247
1248 virtual bool verify_memory_from(const Executor* src_exec) const override
1249 {
1250 return src_exec->verify_memory_to(self());
1251 }
1252
1253private:
1254 ConcreteExecutor* self() noexcept
1255 {
1256 return static_cast<ConcreteExecutor*>(this);
1257 }
1258
1259 const ConcreteExecutor* self() const noexcept
1260 {
1261 return static_cast<const ConcreteExecutor*>(this);
1262 }
1263};
1264
1265#undef GKO_DECLARE_EXECUTOR_FRIEND
1266
1267
1275class EnableDeviceReset {
1276public:
1282 GKO_DEPRECATED(
1283 "device_reset is no longer supported, call "
1284 "cudaDeviceReset/hipDeviceReset manually")
1285 void set_device_reset(bool device_reset) {}
1286
1292 GKO_DEPRECATED(
1293 "device_reset is no longer supported, call "
1294 "cudaDeviceReset/hipDeviceReset manually")
1295 bool get_device_reset() { return false; }
1296
1297protected:
1303 EnableDeviceReset() {}
1304
1305 GKO_DEPRECATED(
1306 "device_reset is no longer supported, call "
1307 "cudaDeviceReset/hipDeviceReset manually")
1308 EnableDeviceReset(bool device_reset) {}
1309};
1310
1311
1312} // namespace detail
1313
1314
1315#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1316 void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1317 const void* src_ptr, void* dest_ptr) const override
1318
1319
1320#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1321 virtual bool verify_memory_to(const dest_* other) const override \
1322 { \
1323 return bool_; \
1324 } \
1325 static_assert(true, \
1326 "This assert is used to counter the false positive extra " \
1327 "semi-colon warnings")
1328
1329
1337class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
1338 public std::enable_shared_from_this<OmpExecutor> {
1339 friend class detail::ExecutorBase<OmpExecutor>;
1340
1341public:
1345 static std::shared_ptr<OmpExecutor> create(
1346 std::shared_ptr<CpuAllocatorBase> alloc =
1347 std::make_shared<CpuAllocator>())
1348 {
1349 return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
1350 }
1351
1352 std::shared_ptr<Executor> get_master() noexcept override;
1353
1354 std::shared_ptr<const Executor> get_master() const noexcept override;
1355
1356 void synchronize() const override;
1357
1358 int get_num_cores() const
1359 {
1360 return this->get_exec_info().num_computing_units;
1361 }
1362
1363 int get_num_threads_per_core() const
1364 {
1365 return this->get_exec_info().num_pu_per_cu;
1366 }
1367
1368 static int get_num_omp_threads();
1369
1370 scoped_device_id_guard get_scoped_device_id_guard() const override;
1371
1372protected:
1373 OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1374 : alloc_{std::move(alloc)}
1375 {
1376 this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
1377 }
1378
1379 void populate_exec_info(const machine_topology* mach_topo) override;
1380
1381 void* raw_alloc(size_type size) const override;
1382
1383 void raw_free(void* ptr) const noexcept override;
1384
1385 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1386
1387 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);
1388
1389 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1390
1391 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1392
1393 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1394
1395 bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
1396
1397 std::shared_ptr<CpuAllocatorBase> alloc_;
1398};
1399
1400
1401namespace kernels {
1402namespace omp {
1403using DefaultExecutor = OmpExecutor;
1404} // namespace omp
1405} // namespace kernels
1406
1407
1415class ReferenceExecutor : public OmpExecutor {
1416public:
1417 static std::shared_ptr<ReferenceExecutor> create(
1418 std::shared_ptr<CpuAllocatorBase> alloc =
1419 std::make_shared<CpuAllocator>())
1420 {
1421 return std::shared_ptr<ReferenceExecutor>(
1422 new ReferenceExecutor(std::move(alloc)));
1423 }
1424
1425 scoped_device_id_guard get_scoped_device_id_guard() const override
1426 {
1427 return {this, 0};
1428 }
1429
1430 void run(const Operation& op) const override
1431 {
1432 this->template log<log::Logger::operation_launched>(this, &op);
1433 op.run(std::static_pointer_cast<const ReferenceExecutor>(
1434 this->shared_from_this()));
1435 this->template log<log::Logger::operation_completed>(this, &op);
1436 }
1437
1438protected:
1439 ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1440 : OmpExecutor{std::move(alloc)}
1441 {
1442 this->ReferenceExecutor::populate_exec_info(
1443 machine_topology::get_instance());
1444 }
1445
1446 void populate_exec_info(const machine_topology*) override
1447 {
1448 this->get_exec_info().device_id = -1;
1449 this->get_exec_info().num_computing_units = 1;
1450 this->get_exec_info().num_pu_per_cu = 1;
1451 }
1452
1453 bool verify_memory_from(const Executor* src_exec) const override
1454 {
1455 return src_exec->verify_memory_to(this);
1456 }
1457
1458 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
1459
1460 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1461
1462 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1463
1464 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
1465
1466 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
1467};
1468
1469
1470namespace kernels {
1471namespace reference {
1472using DefaultExecutor = ReferenceExecutor;
1473} // namespace reference
1474} // namespace kernels
1475
1476
1483class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
1484 public std::enable_shared_from_this<CudaExecutor>,
1485 public detail::EnableDeviceReset {
1486 friend class detail::ExecutorBase<CudaExecutor>;
1487
1488public:
1500 GKO_DEPRECATED(
1501 "device_reset is deprecated entirely, call cudaDeviceReset directly. "
1502 "alloc_mode was replaced by the Allocator type "
1503 "hierarchy.")
1504 static std::shared_ptr<CudaExecutor> create(
1505 int device_id, std::shared_ptr<Executor> master, bool device_reset,
1506 allocation_mode alloc_mode = default_cuda_alloc_mode,
1507 CUstream_st* stream = nullptr);
1508
1518 static std::shared_ptr<CudaExecutor> create(
1519 int device_id, std::shared_ptr<Executor> master,
1520 std::shared_ptr<CudaAllocatorBase> alloc =
1521 std::make_shared<CudaAllocator>(),
1522 CUstream_st* stream = nullptr);
1523
1524 std::shared_ptr<Executor> get_master() noexcept override;
1525
1526 std::shared_ptr<const Executor> get_master() const noexcept override;
1527
1528 void synchronize() const override;
1529
1530 scoped_device_id_guard get_scoped_device_id_guard() const override;
1531
1535 int get_device_id() const noexcept
1536 {
1537 return this->get_exec_info().device_id;
1538 }
1539
1543 static int get_num_devices();
1544
1548 int get_num_warps_per_sm() const noexcept
1549 {
1550 return this->get_exec_info().num_pu_per_cu;
1551 }
1552
1556 int get_num_multiprocessor() const noexcept
1557 {
1558 return this->get_exec_info().num_computing_units;
1559 }
1560
1564 int get_num_warps() const noexcept
1565 {
1566 return this->get_exec_info().num_computing_units *
1567 this->get_exec_info().num_pu_per_cu;
1568 }
1569
1573 int get_warp_size() const noexcept
1574 {
1575 return this->get_exec_info().max_subgroup_size;
1576 }
1577
1581 int get_major_version() const noexcept
1582 {
1583 return this->get_exec_info().major;
1584 }
1585
1589 int get_minor_version() const noexcept
1590 {
1591 return this->get_exec_info().minor;
1592 }
1593
1599 cublasContext* get_cublas_handle() const { return cublas_handle_.get(); }
1600
1606 cusparseContext* get_cusparse_handle() const
1607 {
1608 return cusparse_handle_.get();
1609 }
1610
1616 std::vector<int> get_closest_pus() const
1617 {
1618 return this->get_exec_info().closest_pu_ids;
1619 }
1620
1626 int get_closest_numa() const { return this->get_exec_info().numa_node; }
1627
1634 CUstream_st* get_stream() const { return stream_; }
1635
1636protected:
1637 void set_gpu_property();
1638
1639 void init_handles();
1640
1641 CudaExecutor(int device_id, std::shared_ptr<Executor> master,
1642 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1643 : alloc_{std::move(alloc)}, master_(master), stream_{stream}
1644 {
1645 this->get_exec_info().device_id = device_id;
1646 this->get_exec_info().num_computing_units = 0;
1647 this->get_exec_info().num_pu_per_cu = 0;
1648 this->CudaExecutor::populate_exec_info(
1649 machine_topology::get_instance());
1650 this->set_gpu_property();
1651 this->init_handles();
1652 }
1653
1654 void* raw_alloc(size_type size) const override;
1655
1656 void raw_free(void* ptr) const noexcept override;
1657
1658 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1659
1660 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1661
1662 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1663
1664 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1665
1666 bool verify_memory_to(const HipExecutor* dest_exec) const override;
1667
1668 bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1669
1670 void populate_exec_info(const machine_topology* mach_topo) override;
1671
1672private:
1673 std::shared_ptr<Executor> master_;
1674
1675 template <typename T>
1676 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1677 handle_manager<cublasContext> cublas_handle_;
1678 handle_manager<cusparseContext> cusparse_handle_;
1679 std::shared_ptr<CudaAllocatorBase> alloc_;
1680 CUstream_st* stream_;
1681};
1682
1683
1684namespace kernels {
1685namespace cuda {
1686using DefaultExecutor = CudaExecutor;
1687} // namespace cuda
1688} // namespace kernels
1689
1690
1697class HipExecutor : public detail::ExecutorBase<HipExecutor>,
1698 public std::enable_shared_from_this<HipExecutor>,
1699 public detail::EnableDeviceReset {
1700 friend class detail::ExecutorBase<HipExecutor>;
1701
1702public:
1714 GKO_DEPRECATED(
1715 "device_reset is deprecated entirely, call hipDeviceReset directly. "
1716 "alloc_mode was replaced by the Allocator type "
1717 "hierarchy.")
1718 static std::shared_ptr<HipExecutor> create(
1719 int device_id, std::shared_ptr<Executor> master, bool device_reset,
1720 allocation_mode alloc_mode = default_hip_alloc_mode,
1721 GKO_HIP_STREAM_STRUCT* stream = nullptr);
1722
1723 static std::shared_ptr<HipExecutor> create(
1724 int device_id, std::shared_ptr<Executor> master,
1725 std::shared_ptr<HipAllocatorBase> alloc =
1726 std::make_shared<HipAllocator>(),
1727 GKO_HIP_STREAM_STRUCT* stream = nullptr);
1728
1729 std::shared_ptr<Executor> get_master() noexcept override;
1730
1731 std::shared_ptr<const Executor> get_master() const noexcept override;
1732
1733 void synchronize() const override;
1734
1735 scoped_device_id_guard get_scoped_device_id_guard() const override;
1736
1740 int get_device_id() const noexcept
1741 {
1742 return this->get_exec_info().device_id;
1743 }
1744
1748 static int get_num_devices();
1749
1753 int get_num_warps_per_sm() const noexcept
1754 {
1755 return this->get_exec_info().num_pu_per_cu;
1756 }
1757
1761 int get_num_multiprocessor() const noexcept
1762 {
1763 return this->get_exec_info().num_computing_units;
1764 }
1765
1769 int get_major_version() const noexcept
1770 {
1771 return this->get_exec_info().major;
1772 }
1773
1777 int get_minor_version() const noexcept
1778 {
1779 return this->get_exec_info().minor;
1780 }
1781
1785 int get_num_warps() const noexcept
1786 {
1787 return this->get_exec_info().num_computing_units *
1788 this->get_exec_info().num_pu_per_cu;
1789 }
1790
1794 int get_warp_size() const noexcept
1795 {
1796 return this->get_exec_info().max_subgroup_size;
1797 }
1798
1804 hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); }
1805
1811 hipsparseContext* get_hipsparse_handle() const
1812 {
1813 return hipsparse_handle_.get();
1814 }
1815
1821 int get_closest_numa() const { return this->get_exec_info().numa_node; }
1822
1828 std::vector<int> get_closest_pus() const
1829 {
1830 return this->get_exec_info().closest_pu_ids;
1831 }
1832
1833 GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }
1834
1835protected:
1836 void set_gpu_property();
1837
1838 void init_handles();
1839
1840 HipExecutor(int device_id, std::shared_ptr<Executor> master,
1841 std::shared_ptr<HipAllocatorBase> alloc,
1842 GKO_HIP_STREAM_STRUCT* stream)
1843 : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
1844 {
1845 this->get_exec_info().device_id = device_id;
1846 this->get_exec_info().num_computing_units = 0;
1847 this->get_exec_info().num_pu_per_cu = 0;
1848 this->HipExecutor::populate_exec_info(machine_topology::get_instance());
1849 this->set_gpu_property();
1850 this->init_handles();
1851 }
1852
1853 void* raw_alloc(size_type size) const override;
1854
1855 void raw_free(void* ptr) const noexcept override;
1856
1857 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1858
1859 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
1860
1861 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
1862
1863 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
1864
1865 bool verify_memory_to(const CudaExecutor* dest_exec) const override;
1866
1867 bool verify_memory_to(const HipExecutor* dest_exec) const override;
1868
1869 void populate_exec_info(const machine_topology* mach_topo) override;
1870
1871private:
1872 std::shared_ptr<Executor> master_;
1873
1874 template <typename T>
1875 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1876 handle_manager<hipblasContext> hipblas_handle_;
1877 handle_manager<hipsparseContext> hipsparse_handle_;
1878 std::shared_ptr<HipAllocatorBase> alloc_;
1879 GKO_HIP_STREAM_STRUCT* stream_;
1880};
1881
1882
1883namespace kernels {
1884namespace hip {
1885using DefaultExecutor = HipExecutor;
1886} // namespace hip
1887} // namespace kernels
1888
1889
1896class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
1897 public std::enable_shared_from_this<DpcppExecutor> {
1898 friend class detail::ExecutorBase<DpcppExecutor>;
1899
1900public:
1910 static std::shared_ptr<DpcppExecutor> create(
1911 int device_id, std::shared_ptr<Executor> master,
1912 std::string device_type = "all",
1913 dpcpp_queue_property property = dpcpp_queue_property::in_order);
1914
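// --- Illustrative creation sketch (not part of this header) ---
// Device id 0 and the "gpu" device type filter are assumptions:
//
//     auto master = gko::OmpExecutor::create();
//     auto dpcpp = gko::DpcppExecutor::create(0, master, "gpu");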
1915 std::shared_ptr<Executor> get_master() noexcept override;
1916
1917 std::shared_ptr<const Executor> get_master() const noexcept override;
1918
1919 void synchronize() const override;
1920
1921 scoped_device_id_guard get_scoped_device_id_guard() const override;
1922
1928 int get_device_id() const noexcept
1929 {
1930 return this->get_exec_info().device_id;
1931 }
1932
1933 sycl::queue* get_queue() const { return queue_.get(); }
1934
1942 static int get_num_devices(std::string device_type);
1943
1949 const std::vector<int>& get_subgroup_sizes() const noexcept
1950 {
1951 return this->get_exec_info().subgroup_sizes;
1952 }
1953
1959 int get_num_computing_units() const noexcept
1960 {
1961 return this->get_exec_info().num_computing_units;
1962 }
1963
1967 int get_num_subgroups() const noexcept
1968 {
1969 return this->get_exec_info().num_computing_units *
1970 this->get_exec_info().num_pu_per_cu;
1971 }
1972
1978 const std::vector<int>& get_max_workitem_sizes() const noexcept
1979 {
1980 return this->get_exec_info().max_workitem_sizes;
1981 }
1982
1988 int get_max_workgroup_size() const noexcept
1989 {
1990 return this->get_exec_info().max_workgroup_size;
1991 }
1992
1998 int get_max_subgroup_size() const noexcept
1999 {
2000 return this->get_exec_info().max_subgroup_size;
2001 }
2002
2008 std::string get_device_type() const noexcept
2009 {
2010 return this->get_exec_info().device_type;
2011 }
2012
2013protected:
2014 void set_device_property(
2015 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2016
2017 DpcppExecutor(
2018 int device_id, std::shared_ptr<Executor> master,
2019 std::string device_type = "all",
2020 dpcpp_queue_property property = dpcpp_queue_property::in_order)
2021 : master_(master)
2022 {
2023 std::for_each(device_type.begin(), device_type.end(),
2024 [](char& c) { c = std::tolower(c); });
2025 this->get_exec_info().device_type = std::string(device_type);
2026 this->get_exec_info().device_id = device_id;
2027 this->set_device_property(property);
2028 }
2029
2030 void populate_exec_info(const machine_topology* mach_topo) override;
2031
2032 void* raw_alloc(size_type size) const override;
2033
2034 void raw_free(void* ptr) const noexcept override;
2035
2036 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2037
2038 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
2039
2040 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
2041
2042 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
2043
2044 bool verify_memory_to(const OmpExecutor* dest_exec) const override;
2045
2046 bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
2047
2048private:
2049 std::shared_ptr<Executor> master_;
2050
2051 template <typename T>
2052 using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2053 queue_manager<sycl::queue> queue_;
2054};
2055
2056
2057namespace kernels {
2058namespace dpcpp {
2059using DefaultExecutor = DpcppExecutor;
2060} // namespace dpcpp
2061} // namespace kernels
2062
2063
2064#undef GKO_OVERRIDE_RAW_COPY_TO
2065
2066
2067} // namespace gko
2068
2069
2070#endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_