33#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ 
   34#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ 
   49#include <ginkgo/core/base/device.hpp> 
   50#include <ginkgo/core/base/fwd_decls.hpp> 
   51#include <ginkgo/core/base/machine_topology.hpp> 
   52#include <ginkgo/core/base/memory.hpp> 
   53#include <ginkgo/core/base/scoped_device_id_guard.hpp> 
   54#include <ginkgo/core/base/types.hpp> 
   55#include <ginkgo/core/log/logger.hpp> 
   56#include <ginkgo/core/synthesizer/containers.hpp> 
   97constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
 
   99constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
 
  105    allocation_mode::unified_global;
 
  107#if (GINKGO_HIP_PLATFORM_HCC == 1) 
  110constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
 
  116    allocation_mode::unified_global;
 
  130enum class dpcpp_queue_property {
 
  142GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
 
  143                                                         dpcpp_queue_property b)
 
  145    return static_cast<dpcpp_queue_property
>(
static_cast<int>(a) |
 
  146                                             static_cast<int>(b));
 
  // Forward-declares the executor classes. GKO_FORWARD_DECLARE expands to
  // `class _type` (any further macro arguments are ignored) and is applied
  // through GKO_ENABLE_FOR_ALL_EXECUTORS, which is defined outside this
  // listing. The helper macro is undefined again right after its single use.
  153#define GKO_FORWARD_DECLARE(_type, ...) class _type 
  155GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
 
  157#undef GKO_FORWARD_DECLARE 
  160class ReferenceExecutor;
 
  // Declares one `virtual void run(std::shared_ptr<const _type>) const`
  // overload for each executor type that GKO_ENABLE_FOR_ALL_EXECUTORS
  // (defined outside this listing) forwards to the helper macro. The helper
  // is local to this spot and is undefined immediately afterwards.
  289#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \ 
  290    virtual void run(std::shared_ptr<const _type>) const 
  292    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
 
  294#undef GKO_DECLARE_RUN_OVERLOAD 
  297    virtual void run(std::shared_ptr<const ReferenceExecutor> 
executor) 
const;
 
 
  320template <
typename Closure>
 
  321class RegisteredOperation : 
public Operation {
 
  329    RegisteredOperation(
const char* name, Closure op)
 
  330        : name_(name), op_(std::move(op))
 
  // Returns the operation name that was passed to the constructor and is
  // kept in name_. The pointer is handed back as-is; no copy is made.
  333    const char* get_name() 
const noexcept override { 
return name_; }
 
  335    void run(std::shared_ptr<const ReferenceExecutor> exec)
 const override 
  340    void run(std::shared_ptr<const OmpExecutor> exec)
 const override 
  345    void run(std::shared_ptr<const CudaExecutor> exec)
 const override 
  350    void run(std::shared_ptr<const HipExecutor> exec)
 const override 
  355    void run(std::shared_ptr<const DpcppExecutor> exec)
 const override 
  366template <
typename Closure>
 
  448#define GKO_REGISTER_OPERATION(_name, _kernel)                                 \ 
  449    template <typename... Args>                                                \ 
  450    auto make_##_name(Args&&... args)                                          \ 
  452        return ::gko::detail::make_register_operation(                         \ 
  453            #_kernel, [&args...](auto exec) {                                  \ 
  454                using exec_type = decltype(exec);                              \ 
  457                        std::shared_ptr<const ::gko::ReferenceExecutor>>::     \ 
  459                    ::gko::kernels::reference::_kernel(                        \ 
  460                        std::dynamic_pointer_cast<                             \ 
  461                            const ::gko::ReferenceExecutor>(exec),             \ 
  462                        std::forward<Args>(args)...);                          \ 
  463                } else if (std::is_same<                                       \ 
  465                               std::shared_ptr<const ::gko::OmpExecutor>>::    \ 
  467                    ::gko::kernels::omp::_kernel(                              \ 
  468                        std::dynamic_pointer_cast<const ::gko::OmpExecutor>(   \ 
  470                        std::forward<Args>(args)...);                          \ 
  471                } else if (std::is_same<                                       \ 
  473                               std::shared_ptr<const ::gko::CudaExecutor>>::   \ 
  475                    ::gko::kernels::cuda::_kernel(                             \ 
  476                        std::dynamic_pointer_cast<const ::gko::CudaExecutor>(  \ 
  478                        std::forward<Args>(args)...);                          \ 
  479                } else if (std::is_same<                                       \ 
  481                               std::shared_ptr<const ::gko::HipExecutor>>::    \ 
  483                    ::gko::kernels::hip::_kernel(                              \ 
  484                        std::dynamic_pointer_cast<const ::gko::HipExecutor>(   \ 
  486                        std::forward<Args>(args)...);                          \ 
  487                } else if (std::is_same<                                       \ 
  489                               std::shared_ptr<const ::gko::DpcppExecutor>>::  \ 
  491                    ::gko::kernels::dpcpp::_kernel(                            \ 
  492                        std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \ 
  494                        std::forward<Args>(args)...);                          \ 
  496                    GKO_NOT_IMPLEMENTED;                                       \ 
  500    static_assert(true,                                                        \ 
  501                  "This assert is used to counter the false positive extra "   \ 
  502                  "semi-colon warnings") 
 
  542#define GKO_REGISTER_HOST_OPERATION(_name, _kernel)                          \ 
  543    template <typename... Args>                                              \ 
  544    auto make_##_name(Args&&... args)                                        \ 
  546        return ::gko::detail::make_register_operation(                       \ 
  548            [&args...](auto) { _kernel(std::forward<Args>(args)...); });     \ 
  550    static_assert(true,                                                      \ 
  551                  "This assert is used to counter the false positive extra " \ 
  552                  "semi-colon warnings") 
 
  // Helper macro that expands to `friend class _type` (further arguments are
  // ignored). It is applied through GKO_ENABLE_FOR_ALL_EXECUTORS inside the
  // classes below and is only undefined much later in this header, so it
  // stays available to every class in between.
  555#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type 
  645    template <
typename T>
 
  646    friend class detail::ExecutorBase;
 
  648    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
 
  681    template <
typename ClosureOmp, 
typename ClosureCuda, 
typename ClosureHip,
 
  682              typename ClosureDpcpp>
 
  683    void run(
const ClosureOmp& op_omp, 
const ClosureCuda& op_cuda,
 
  684             const ClosureHip& op_hip, 
const ClosureDpcpp& op_dpcpp)
 const 
  686        LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
 
  687            op_omp, op_cuda, op_hip, op_dpcpp);
 
 
  702    template <
typename T>
 
  705        this->
template log<log::Logger::allocation_started>(
 
  706            this, num_elems * 
sizeof(T));
 
  707        T* allocated = 
static_cast<T*
>(this->raw_alloc(num_elems * 
sizeof(T)));
 
  708        this->
template log<log::Logger::allocation_completed>(
 
  709            this, num_elems * 
sizeof(T), 
reinterpret_cast<uintptr>(allocated));
 
 
  720    void free(
void* ptr) 
const noexcept 
  722        this->
template log<log::Logger::free_started>(
 
  723            this, 
reinterpret_cast<uintptr>(ptr));
 
  725        this->
template log<log::Logger::free_completed>(
 
  726            this, 
reinterpret_cast<uintptr>(ptr));
 
 
  741    template <
typename T>
 
  743                   const T* src_ptr, T* dest_ptr)
 const 
  745        const auto src_loc = 
reinterpret_cast<uintptr>(src_ptr);
 
  746        const auto dest_loc = 
reinterpret_cast<uintptr>(dest_ptr);
 
  747        this->
template log<log::Logger::copy_started>(
 
  748            src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  749        if (
this != src_exec.
get()) {
 
  750            src_exec->template log<log::Logger::copy_started>(
 
  751                src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  754            this->raw_copy_from(src_exec.
get(), num_elems * 
sizeof(T), src_ptr,
 
  757#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG) 
  760            std::clog << 
"Not direct copy. Try to copy data from the masters." 
  763            auto src_master = src_exec->get_master().
get();
 
  764            if (num_elems > 0 && src_master != src_exec.
get()) {
 
  765                auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
 
  766                src_master->copy_from<T>(src_exec, num_elems, src_ptr,
 
  768                this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
 
  769                src_master->free(master_ptr);
 
  772        this->
template log<log::Logger::copy_completed>(
 
  773            src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
  774        if (
this != src_exec.
get()) {
 
  775            src_exec->template log<log::Logger::copy_completed>(
 
  776                src_exec.
get(), 
this, src_loc, dest_loc, num_elems * 
sizeof(T));
 
 
  791    template <
typename T>
 
  794        this->copy_from(
this, num_elems, src_ptr, dest_ptr);
 
 
  806    template <
typename T>
 
  810        this->get_master()->copy_from(
this, 1, ptr, &out);
 
 
  823    virtual std::shared_ptr<const 
Executor> get_master() const noexcept = 0;
 
  828    virtual 
void synchronize() const = 0;
 
  836    void add_logger(std::shared_ptr<const log::Logger> logger)
 override 
  838        this->propagating_logger_refcount_.fetch_add(
 
  839            logger->needs_propagation() ? 1 : 0);
 
  840        this->EnableLogging<Executor>::add_logger(logger);
 
 
  851        this->propagating_logger_refcount_.fetch_sub(
 
  853        this->EnableLogging<Executor>::remove_logger(logger);
 
 
  856    using EnableLogging<
Executor>::remove_logger;
 
  867        log_propagation_mode_ = mode;
 
 
  879        return this->propagating_logger_refcount_.load() > 0 &&
 
  880               log_propagation_mode_ == log_propagation_mode::automatic;
 
 
  892        return this->verify_memory_from(other.get());
 
 
  911        std::string device_type;
 
  926        int num_computing_units = -1;
 
  939        int num_pu_per_cu = -1;
 
  949        std::vector<int> subgroup_sizes{};
 
  959        int max_subgroup_size = -1;
 
  971        std::vector<int> max_workitem_sizes{};
 
  982        int max_workgroup_size;
 
  999        std::string pci_bus_id = std::string(13, 
'x');
 
 1011        std::vector<int> closest_pu_ids{};
 
 // Read-only accessor: returns a const reference to this executor's
 // exec_info_ record.
 1019    const exec_info& get_exec_info()
 const { 
return this->exec_info_; }
 
 1030    virtual void* raw_alloc(size_type size) 
const = 0;
 
 1039    virtual void raw_free(
void* ptr) 
const noexcept = 0;
 
 1051    virtual void raw_copy_from(
const Executor* src_exec, size_type n_bytes,
 
 1052                               const void* src_ptr, 
void* dest_ptr) 
const = 0;
 
 // Declares one pure virtual
 // `raw_copy_to(const _exec_type* dest_exec, size_type n_bytes,
 //              const void* src_ptr, void* dest_ptr) const`
 // overload for each executor type that GKO_ENABLE_FOR_ALL_EXECUTORS
 // (defined outside this listing) forwards to the helper macro. The helper
 // is undefined again right after use.
 1063#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...)                              \ 
 1064    virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \ 
 1065                             const void* src_ptr, void* dest_ptr) const = 0 
 1067    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
 
 1069#undef GKO_ENABLE_RAW_COPY_TO 
 1078    virtual bool verify_memory_from(
const Executor* src_exec) 
const = 0;
 
 // Declares one pure virtual
 // `verify_memory_to(const _exec_type* dest_exec) const` overload per
 // executor type. Besides the types forwarded by
 // GKO_ENABLE_FOR_ALL_EXECUTORS, the macro is instantiated explicitly for
 // ReferenceExecutor.
 // NOTE(review): presumably ReferenceExecutor is not part of the
 // GKO_ENABLE_FOR_ALL_EXECUTORS list (that macro is defined outside this
 // listing) - confirm before relying on it.
 1089#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \ 
 1090    virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0 
 1092    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
 
 1094    GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
 
 1096#undef GKO_ENABLE_VERIFY_MEMORY_TO 
 1104    virtual void populate_exec_info(
const machine_topology* mach_topo) = 0;
 
 // Mutable accessor: returns a non-const reference to exec_info_, so the
 // caller can write individual fields of the record in place.
 1111    exec_info& get_exec_info() { 
return this->exec_info_; }
 
 1113    exec_info exec_info_;
 
 1117    std::atomic<int> propagating_logger_refcount_{};
 
 1134    template <
typename ClosureOmp, 
typename ClosureCuda, 
typename ClosureHip,
 
 1135              typename ClosureDpcpp>
 
 1136    class LambdaOperation : 
public Operation {
 
 1148        LambdaOperation(
const ClosureOmp& op_omp, 
const ClosureCuda& op_cuda,
 
 1149                        const ClosureHip& op_hip, 
const ClosureDpcpp& op_dpcpp)
 
 1156        void run(std::shared_ptr<const OmpExecutor>)
 const override 
 1161        void run(std::shared_ptr<const ReferenceExecutor>)
 const override 
 1166        void run(std::shared_ptr<const CudaExecutor>)
 const override 
 1171        void run(std::shared_ptr<const HipExecutor>)
 const override 
 1176        void run(std::shared_ptr<const DpcppExecutor>)
 const override 
 1183        ClosureCuda op_cuda_;
 
 1185        ClosureDpcpp op_dpcpp_;
 
 
 1198template <
typename T>
 
 1225    std::shared_ptr<const Executor> exec_;
 
 
 1229template <
typename T>
 
 1232    using pointer = T[];
 
 1238    void operator()(pointer ptr)
 const 
 1246    std::shared_ptr<const Executor> exec_;
 
 
 1253template <
typename ConcreteExecutor>
 
 1254class ExecutorBase : 
public Executor {
 
 1255    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
 
 1259    using Executor::run;
 
 1261    void run(
const Operation& op)
 const override 
 1263        this->
template log<log::Logger::operation_launched>(
this, &op);
 
 1264        auto scope_guard = get_scoped_device_id_guard();
 
 1265        op.run(self()->shared_from_this());
 
 1266        this->
template log<log::Logger::operation_completed>(
this, &op);
 
 1270    void raw_copy_from(
const Executor* src_exec, size_type n_bytes,
 
 1271                       const void* src_ptr, 
void* dest_ptr)
 const override 
 1273        src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
 
 1276    virtual bool verify_memory_from(
const Executor* src_exec)
 const override 
 1278        return src_exec->verify_memory_to(self());
 
 1282    ConcreteExecutor* self() noexcept
 
 1284        return static_cast<ConcreteExecutor*
>(
this);
 
 1287    const ConcreteExecutor* self() const noexcept
 
 1289        return static_cast<const ConcreteExecutor*
>(
this);
 
 1293#undef GKO_DECLARE_EXECUTOR_FRIEND 
 1303class EnableDeviceReset {
 
 1311        "device_reset is no longer supported, call " 
 1312        "cudaDeviceReset/hipDeviceReset manually")
 
 1313    void set_device_reset(
bool device_reset) {}
 
 1321        "device_reset is no longer supported, call " 
 1322        "cudaDeviceReset/hipDeviceReset manually")
 
 1323    bool get_device_reset() { 
return false; }
 
 1331    EnableDeviceReset() {}
 
 1334        "device_reset is no longer supported, call " 
 1335        "cudaDeviceReset/hipDeviceReset manually")
 
 1336    EnableDeviceReset(
bool device_reset) {}
 
 // Declares (without defining) an overriding
 // `raw_copy_to(const _executor_type* dest_exec, size_type n_bytes,
 //              const void* src_ptr, void* dest_ptr) const`.
 // The concrete executor classes below apply it through
 // GKO_ENABLE_FOR_ALL_EXECUTORS to declare one override per destination
 // executor type. The macro is undefined near the end of this header.
 1343#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...)                    \ 
 1344    void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \ 
 1345                     const void* src_ptr, void* dest_ptr) const override 
 1348#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                     \ 
 1349    virtual bool verify_memory_to(const dest_* other) const override         \ 
 1353    static_assert(true,                                                      \ 
 1354                  "This assert is used to counter the false positive extra " \ 
 1355                  "semi-colon warnings") 
 1366                    public std::enable_shared_from_this<OmpExecutor> {
 
 1367    friend class detail::ExecutorBase<OmpExecutor>;
 
 1374        std::shared_ptr<CpuAllocatorBase> alloc =
 
 1375            std::make_shared<CpuAllocator>())
 
 1377        return std::shared_ptr<OmpExecutor>(
new OmpExecutor(std::move(alloc)));
 
 
 1386    int get_num_cores()
 const 
 1388        return this->get_exec_info().num_computing_units;
 
 1391    int get_num_threads_per_core()
 const 
 1393        return this->get_exec_info().num_pu_per_cu;
 
 1396    static int get_num_omp_threads();
 
 1398    scoped_device_id_guard get_scoped_device_id_guard() 
const override;
 
 1401    OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
 
 1402        : alloc_{std::
move(alloc)}
 
 1407    void populate_exec_info(
const machine_topology* 
mach_topo) 
override;
 
 1409    void* raw_alloc(
size_type size) 
const override;
 
 1411    void raw_free(
void* ptr) 
const noexcept override;
 
 1413    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 1415    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
true);
 
 1417    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 1419    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, 
false);
 
 1421    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, 
false);
 
 1423    bool verify_memory_to(
const DpcppExecutor* 
dest_exec) 
const override;
 
 1425    std::shared_ptr<CpuAllocatorBase> alloc_;
 
 
 1431using DefaultExecutor = OmpExecutor;
 
 1445    static std::shared_ptr<ReferenceExecutor> create(
 
 1446        std::shared_ptr<CpuAllocatorBase> alloc =
 
 1447            std::make_shared<CpuAllocator>())
 
 1449        return std::shared_ptr<ReferenceExecutor>(
 
 1461        op.run(std::static_pointer_cast<const ReferenceExecutor>(
 
 
 1470        this->ReferenceExecutor::populate_exec_info(
 
 1474    void populate_exec_info(
const machine_topology*)
 override 
 1476        this->get_exec_info().device_id = -1;
 
 1477        this->get_exec_info().num_computing_units = 1;
 
 1478        this->get_exec_info().num_pu_per_cu = 1;
 
 1481    bool verify_memory_from(
const Executor* 
src_exec)
 const override 
 1483        return src_exec->verify_memory_to(
this);
 
 1486    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
true);
 
 1488    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
false);
 
 1490    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, 
false);
 
 1492    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, 
false);
 
 1494    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, 
false);
 
 
 1499namespace reference {
 
 1500using DefaultExecutor = ReferenceExecutor;
 
 1512                     public std::enable_shared_from_this<CudaExecutor>,
 
 1513                     public detail::EnableDeviceReset {
 
 1514    friend class detail::ExecutorBase<CudaExecutor>;
 
 1529        "device_reset is deprecated entirely, call cudaDeviceReset directly. " 
 1530        "alloc_mode was replaced by the Allocator type " 
 1565        return this->get_exec_info().device_id;
 
 
 1578        return this->get_exec_info().num_pu_per_cu;
 
 
 1586        return this->get_exec_info().num_computing_units;
 
 
 1594        return this->get_exec_info().num_computing_units *
 
 1595               this->get_exec_info().num_pu_per_cu;
 
 
 1603        return this->get_exec_info().max_subgroup_size;
 
 
 1611        return this->get_exec_info().major;
 
 
 1619        return this->get_exec_info().minor;
 
 
 1636        return cusparse_handle_.get();
 
 
 1646        return this->get_exec_info().closest_pu_ids;
 
 
 1665    void set_gpu_property();
 
 1667    void init_handles();
 
 1673        this->get_exec_info().device_id = device_id;
 
 1674        this->get_exec_info().num_computing_units = 0;
 
 1675        this->get_exec_info().num_pu_per_cu = 0;
 
 1676        this->CudaExecutor::populate_exec_info(
 
 1678        this->set_gpu_property();
 
 1679        this->init_handles();
 
 1682    void* raw_alloc(
size_type size) 
const override;
 
 1684    void raw_free(
void* ptr) 
const noexcept override;
 
 1686    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 1688    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
false);
 
 1690    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 1692    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, 
false);
 
 1694    bool verify_memory_to(
const HipExecutor* 
dest_exec) 
const override;
 
 1696    bool verify_memory_to(
const CudaExecutor* 
dest_exec) 
const override;
 
 1698    void populate_exec_info(
const machine_topology* 
mach_topo) 
override;
 
 1701    std::shared_ptr<Executor> master_;
 
 1703    template <
typename T>
 
 1704    using handle_manager = std::unique_ptr<
T, std::function<
void(
T*)>>;
 
 1707    std::shared_ptr<CudaAllocatorBase> alloc_;
 
 
 1714using DefaultExecutor = CudaExecutor;
 
 1726                    public std::enable_shared_from_this<HipExecutor>,
 
 1727                    public detail::EnableDeviceReset {
 
 1743        "device_reset is deprecated entirely, call hipDeviceReset directly. " 
 1744        "alloc_mode was replaced by the Allocator type " 
 1747        int device_id, std::shared_ptr<
Executor> master, 
bool device_reset,
 
 1749        GKO_HIP_STREAM_STRUCT* stream = 
nullptr);
 
 1752        int device_id, std::shared_ptr<
Executor> master,
 
 1755        GKO_HIP_STREAM_STRUCT* stream = 
nullptr);
 
 1757    std::shared_ptr<
Executor> get_master() noexcept override;
 
 1759    std::shared_ptr<const 
Executor> get_master() const noexcept override;
 
 1761    void synchronize() const override;
 
 1768    int get_device_id() const noexcept
 
 1770        return this->get_exec_info().device_id;
 
 
 1783        return this->get_exec_info().num_pu_per_cu;
 
 
 1791        return this->get_exec_info().num_computing_units;
 
 
 1799        return this->get_exec_info().major;
 
 
 1807        return this->get_exec_info().minor;
 
 
 1815        return this->get_exec_info().num_computing_units *
 
 1816               this->get_exec_info().num_pu_per_cu;
 
 
 1824        return this->get_exec_info().max_subgroup_size;
 
 
 1841        return hipsparse_handle_.get();
 
 
 1858        return this->get_exec_info().closest_pu_ids;
 
 
 // Returns the stream pointer held in stream_, which the constructor
 // initializes from its `stream` argument.
 1861    GKO_HIP_STREAM_STRUCT* get_stream()
 const { 
return stream_; }
 
 1864    void set_gpu_property();
 
 1866    void init_handles();
 
 1868    HipExecutor(
int device_id, std::shared_ptr<Executor> master,
 
 1869                std::shared_ptr<HipAllocatorBase> alloc,
 
 1870                GKO_HIP_STREAM_STRUCT* stream)
 
 1871        : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
 
 1873        this->get_exec_info().device_id = device_id;
 
 1874        this->get_exec_info().num_computing_units = 0;
 
 1875        this->get_exec_info().num_pu_per_cu = 0;
 
 1876        this->HipExecutor::populate_exec_info(machine_topology::get_instance());
 
 1877        this->set_gpu_property();
 
 1878        this->init_handles();
 
 1881    void* raw_alloc(size_type size) 
const override;
 
 1883    void raw_free(
void* ptr) 
const noexcept override;
 
 1885    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 1887    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, 
false);
 
 1889    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 1891    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, 
false);
 
 1893    bool verify_memory_to(
const CudaExecutor* dest_exec) 
const override;
 
 1895    bool verify_memory_to(
const HipExecutor* dest_exec) 
const override;
 
 1897    void populate_exec_info(
const machine_topology* mach_topo) 
override;
 
 1900    std::shared_ptr<Executor> master_;
 
 1902    template <
typename T>
 
 1903    using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
 
 1904    handle_manager<hipblasContext> hipblas_handle_;
 
 1905    handle_manager<hipsparseContext> hipsparse_handle_;
 
 1906    std::shared_ptr<HipAllocatorBase> alloc_;
 
 1907    GKO_HIP_STREAM_STRUCT* stream_;
 
 
 1913using DefaultExecutor = HipExecutor;
 
 1925                      public std::enable_shared_from_this<DpcppExecutor> {
 
 1938    static std::shared_ptr<DpcppExecutor> 
create(
 
 1939        int device_id, std::shared_ptr<Executor> master,
 
 1940        std::string device_type = 
"all",
 
 1941        dpcpp_queue_property property = dpcpp_queue_property::in_order);
 
 1945    std::shared_ptr<const 
Executor> get_master() const noexcept override;
 
 1947    void synchronize() const override;
 
 1956    int get_device_id() const noexcept
 
 1958        return this->get_exec_info().device_id;
 
 
 // Returns a raw, non-owning pointer to the sycl::queue managed by queue_.
 // Ownership stays with queue_, which is a std::unique_ptr with a custom
 // deleter.
 1961    sycl::queue* get_queue()
 const { 
return queue_.get(); }
 
 1979        return this->get_exec_info().subgroup_sizes;
 
 
 1989        return this->get_exec_info().num_computing_units;
 
 
 1997        return this->get_exec_info().num_computing_units *
 
 1998               this->get_exec_info().num_pu_per_cu;
 
 
 2008        return this->get_exec_info().max_workitem_sizes;
 
 
 2018        return this->get_exec_info().max_workgroup_size;
 
 
 2028        return this->get_exec_info().max_subgroup_size;
 
 
 2038        return this->get_exec_info().device_type;
 
 
 2042    void set_device_property(
 
 2043        dpcpp_queue_property property = dpcpp_queue_property::in_order);
 
 2046        int device_id, std::shared_ptr<Executor> master,
 
 2047        std::string device_type = 
"all",
 
 2048        dpcpp_queue_property property = dpcpp_queue_property::in_order)
 
 2051        std::for_each(device_type.begin(), device_type.end(),
 
 2052                      [](
char& c) { c = std::tolower(c); });
 
 2053        this->get_exec_info().device_type = std::string(device_type);
 
 2054        this->get_exec_info().device_id = device_id;
 
 2055        this->set_device_property(property);
 
 2058    void populate_exec_info(
const machine_topology* mach_topo) 
override;
 
 2060    void* raw_alloc(size_type size) 
const override;
 
 2062    void raw_free(
void* ptr) 
const noexcept override;
 
 2064    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
 2066    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, 
false);
 
 2068    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, 
false);
 
 2070    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, 
false);
 
 2072    bool verify_memory_to(
const OmpExecutor* dest_exec) 
const override;
 
 2074    bool verify_memory_to(
const DpcppExecutor* dest_exec) 
const override;
 
 2077    std::shared_ptr<Executor> master_;
 
 2079    template <
typename T>
 
 2080    using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
 
 2081    queue_manager<sycl::queue> queue_;
 
 
 2087using DefaultExecutor = DpcppExecutor;
 
 2092#undef GKO_OVERRIDE_RAW_COPY_TO 
Implement this interface to provide an allocator for CudaExecutor.
Definition memory.hpp:68
Allocator using cudaMalloc.
Definition memory.hpp:130
This is the Executor subclass which represents the CUDA device.
Definition executor.hpp:1513
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1644
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1654
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1576
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1609
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition executor.hpp:1584
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1601
cusparseContext * get_cusparse_handle() const
Get the cusparse handle for this executor.
Definition executor.hpp:1634
CUstream_st * get_stream() const
Returns the CUDA stream used by this executor.
Definition executor.hpp:1662
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1617
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1592
cublasContext * get_cublas_handle() const
Get the cublas handle for this executor.
Definition executor.hpp:1627
static int get_num_devices()
Get the number of devices present on the system.
This is the Executor subclass which represents a DPC++ enhanced device.
Definition executor.hpp:1925
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition executor.hpp:1977
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition executor.hpp:1987
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition executor.hpp:2016
int get_num_subgroups() const noexcept
Get the number of subgroups of this executor.
Definition executor.hpp:1995
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition executor.hpp:2006
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition executor.hpp:2026
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition executor.hpp:2036
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
The first step in using the Ginkgo library consists of creating an executor.
Definition executor.hpp:644
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition executor.hpp:720
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition executor.hpp:683
bool should_propagate_log() const
Returns true iff events occurring at an object created on this executor should be logged at propagating loggers attached to this executor, and there is at least one such propagating logger.
Definition executor.hpp:877
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition executor.hpp:890
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition executor.hpp:792
void copy_from(ptr_param< const Executor > src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition executor.hpp:742
void set_log_propagation_mode(log_propagation_mode mode)
Sets the logger event propagation mode for the executor.
Definition executor.hpp:865
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition executor.hpp:703
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition executor.hpp:807
void remove_logger(const log::Logger *logger) override
Definition executor.hpp:849
Implement this interface to provide an allocator for HipExecutor.
Definition memory.hpp:93
Definition memory.hpp:200
This is the Executor subclass which represents the HIP enhanced device.
Definition executor.hpp:1727
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1781
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1797
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1856
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1805
static int get_num_devices()
Get the number of devices present on the system.
int get_num_multiprocessor() const noexcept
Get the number of multiprocessors of this executor.
Definition executor.hpp:1789
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1813
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition executor.hpp:1839
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1849
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1822
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition executor.hpp:1832
NotSupported is thrown in case it is not possible to perform the requested operation on the given object type.
Definition exception.hpp:156
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition executor.hpp:1366
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1373
Operations can be used to define functionalities whose implementations differ among devices.
Definition executor.hpp:287
virtual const char * get_name() const noexcept
Returns the operation's name.
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels used for debugging purposes.
Definition executor.hpp:1443
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition executor.hpp:1458
This is a deleter that uses an executor's free method to deallocate the data.
Definition executor.hpp:1199
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition executor.hpp:1208
void operator()(pointer ptr) const
Deletes the object.
Definition executor.hpp:1217
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition logger.hpp:777
Definition logger.hpp:104
virtual bool needs_propagation() const
Returns true if this logger, when attached to an Executor, needs to be forwarded all events from objects on this executor.
Definition logger.hpp:671
static machine_topology * get_instance()
Returns an instance of the machine_topology object.
Definition machine_topology.hpp:211
This class is used for function parameters in the place of raw pointers.
Definition utils_helper.hpp:71
T * get() const
Definition utils_helper.hpp:105
This move-only class uses RAII to set the device id within a scoped block, if necessary.
Definition scoped_device_id_guard.hpp:104
The Ginkgo namespace.
Definition abstract_factory.hpp:48
constexpr T one()
Returns the multiplicative identity for T.
Definition math.hpp:803
std::uintptr_t uintptr
Unsigned integer type capable of holding a pointer to void.
Definition types.hpp:172
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:120
log_propagation_mode
How Logger events are propagated to their Executor.
Definition executor.hpp:63
@ automatic
Events get reported to loggers attached to the triggering object and propagating loggers (Logger::needs_propagation() returns true) attached to its Executor.
@ never
Events only get reported at loggers attached to the triggering object.
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition executor.hpp:91