
/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/

#ifndef hypre_UTILITIES_HPP
#define hypre_UTILITIES_HPP


#ifdef __cplusplus
extern "C++" {
#endif

/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

#ifndef HYPRE_FUNCTORS_H
#define HYPRE_FUNCTORS_H

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/*--------------------------------------------------------------------------
 * hypreFunctor_DenseMatrixIdentity
 *
 * Functor for generating a dense identity matrix.
 * This assumes that the input array "a" is zeros everywhere
 *--------------------------------------------------------------------------*/

struct hypreFunctor_DenseMatrixIdentity
{
   HYPRE_Int   n_;
   HYPRE_Real *a_;

   hypreFunctor_DenseMatrixIdentity(HYPRE_Int n, HYPRE_Real *a)
   {
      n_ = n;
      a_ = a;
   }

   __host__ __device__ void operator()(HYPRE_Int i)
   {
      a_[i * n_ + i] = 1.0;
   }
};

/*--------------------------------------------------------------------------
 * hypreFunctor_ArrayStridedAccess
 *
 * Functor for performing strided data access on a templated array.
 *
 * The stride interval "s_" is used to access every "s_"-th element
 * from the source array "a_".
 *
 * It is templated to support various data types for the array.
 *--------------------------------------------------------------------------*/

template <typename T>
struct hypreFunctor_ArrayStridedAccess
{
   HYPRE_Int  s_;
   T         *a_;

   hypreFunctor_ArrayStridedAccess(HYPRE_Int s, T *a) : s_(s), a_(a) {}

   __host__ __device__ T operator()(HYPRE_Int i)
   {
      return a_[i * s_];
   }
};

/*--------------------------------------------------------------------------
 * hypreFunctor_IndexStrided
 *
 * This functor multiplies a given index "i" by a specified stride "s_".
 *
 * It is templated to support various data types for the index and stride.
 *--------------------------------------------------------------------------*/

template <typename T>
struct hypreFunctor_IndexStrided
{
   T s_;

   hypreFunctor_IndexStrided(T s) : s_(s) {}

   __host__ __device__ T operator()(const T i) const
   {
      return i * s_;
   }
};

/*--------------------------------------------------------------------------
 * hypreFunctor_IndexCycle
 *--------------------------------------------------------------------------*/

struct hypreFunctor_IndexCycle
{
   HYPRE_Int cycle_length;

   hypreFunctor_IndexCycle(HYPRE_Int _cycle_length) : cycle_length(_cycle_length) {}

   __host__ __device__ HYPRE_Int operator()(HYPRE_Int i) const
   {
      return i % cycle_length;
   }
};

#endif /* if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
#endif /* ifndef HYPRE_FUNCTORS_H */
/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

#ifndef HYPRE_PREDICATES_H
#define HYPRE_PREDICATES_H

/******************************************************************************
 *
 * Header file defining predicates for thrust used throughout hypre
 *
 *****************************************************************************/

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/*--------------------------------------------------------------------------
 * hyprePred_StridedAccess
 *
 * This struct defines a predicate for strided access in array-like data.
 *
 * It is used to determine if an element at a given index should be processed
 * or not, based on a specified stride. The operator() returns true when the
 * index is a multiple of the stride, indicating the element at that index
 * is part of the strided subset.
 *--------------------------------------------------------------------------*/

struct hyprePred_StridedAccess
{
   HYPRE_Int  s_;

   hyprePred_StridedAccess(HYPRE_Int s) : s_(s) {}

   __host__ __device__ HYPRE_Int operator()(const HYPRE_Int i) const
   {
      return (!(i % s_));
   }
};

#endif /* if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
#endif /* ifndef HYPRE_PREDICATES_H */
/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

#ifndef DEVICE_ALLOCATOR_H
#define DEVICE_ALLOCATOR_H

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/* C++ style memory allocator for the device using the abstract memory model */
struct hypre_device_allocator
{
   typedef char value_type;

   hypre_device_allocator()
   {
      // constructor
   }

   ~hypre_device_allocator()
   {
      // destructor
   }

   char *allocate(std::ptrdiff_t num_bytes)
   {
      return hypre_TAlloc(char, num_bytes, HYPRE_MEMORY_DEVICE);
   }

   void deallocate(char *ptr, size_t n)
   {
      HYPRE_UNUSED_VAR(n);
      hypre_TFree(ptr, HYPRE_MEMORY_DEVICE);
   }
};

#endif /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */

#endif
/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

#ifndef HYPRE_DEVICE_UTILS_H
#define HYPRE_DEVICE_UTILS_H

#if defined(HYPRE_USING_GPU)

/* Data types depending on GPU architecture */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
typedef hypre_uint          hypre_mask;
#define hypre_mask_one      1U

#elif defined(HYPRE_USING_HIP)
typedef hypre_ulonglongint  hypre_mask;
#define hypre_mask_one      1ULL

#endif

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *                          cuda includes
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_CUDA)
using hypre_DeviceItem = void*;
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

#if defined(HYPRE_USING_CURAND)
#include <curand.h>
#endif

#if defined(HYPRE_USING_CUBLAS)
#include <cublas_v2.h>
#endif

#if defined(HYPRE_USING_CUSPARSE)
/* Note (VPM) : As of cuSPARSE 12, ILU functionalities have been marked as deprecated.
   The following definition avoids compilation warnings regarding the use of ILU routines */
#define DISABLE_CUSPARSE_DEPRECATED
#include <cusparse.h>
#endif

#if defined(HYPRE_USING_CUSOLVER)
#include <cusolverDn.h>
#endif

#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif

#ifndef CUDA_VERSION
#error CUDA_VERSION Undefined!
#endif

#if CUDA_VERSION >= 11000
#define THRUST_IGNORE_DEPRECATED_CPP11
#define CUB_IGNORE_DEPRECATED_CPP11
#define THRUST_IGNORE_DEPRECATED_CPP_DIALECT
#define CUB_IGNORE_DEPRECATED_CPP_DIALECT
#endif

#ifndef CUSPARSE_VERSION
#if defined(CUSPARSE_VER_MAJOR) && defined(CUSPARSE_VER_MINOR) && defined(CUSPARSE_VER_PATCH)
#define CUSPARSE_VERSION (CUSPARSE_VER_MAJOR * 1000 + CUSPARSE_VER_MINOR *  100 + CUSPARSE_VER_PATCH)
#else
#define CUSPARSE_VERSION CUDA_VERSION
#endif
#endif

#define CUSPARSE_NEWAPI_VERSION 11000
#define CUSPARSE_NEWSPMM_VERSION 11401
#define CUDA_MALLOCASYNC_VERSION 11020
#define CUDA_THRUST_NOSYNC_VERSION 12000

#define CUSPARSE_SPSV_VERSION 11600
#if CUSPARSE_VERSION >= CUSPARSE_SPSV_VERSION
#define hypre_cusparseSpSVDescr         cusparseSpSVDescr_t
#define hypre_cusparseSpSV_createDescr  cusparseSpSV_createDescr
#define hypre_cusparseSpSV_destroyDescr cusparseSpSV_destroyDescr
#else
#define hypre_cusparseSpSVDescr         csrsv2Info_t
#define hypre_cusparseSpSV_createDescr  cusparseCreateCsrsv2Info
#define hypre_cusparseSpSV_destroyDescr cusparseDestroyCsrsv2Info
#endif

#define CUSPARSE_SPSM_VERSION 11600
#if CUSPARSE_VERSION >= CUSPARSE_SPSM_VERSION
#define hypre_cusparseSpSMDescr         cusparseSpSMDescr_t
#define hypre_cusparseSpSM_createDescr  cusparseSpSM_createDescr
#define hypre_cusparseSpSM_destroyDescr cusparseSpSM_destroyDescr
#else
#define hypre_cusparseSpSMDescr         csrsm2Info_t
#define hypre_cusparseSpSM_createDescr  cusparseCreateCsrsm2Info
#define hypre_cusparseSpSM_destroyDescr cusparseDestroyCsrsm2Info
#endif

#if defined(HYPRE_USING_DEVICE_MALLOC_ASYNC)
#if CUDA_VERSION < CUDA_MALLOCASYNC_VERSION
#error cudaMalloc/FreeAsync needs CUDA 11.2
#endif
#endif

#if defined(HYPRE_USING_THRUST_NOSYNC)
#if CUDA_VERSION < CUDA_THRUST_NOSYNC_VERSION
#error thrust::cuda::par_nosync needs CUDA 12
#endif
#define HYPRE_THRUST_EXECUTION thrust::cuda::par_nosync
#else
#define HYPRE_THRUST_EXECUTION thrust::cuda::par
#endif

#endif /* defined(HYPRE_USING_CUDA) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *                          hip includes
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_HIP)

using hypre_DeviceItem = void*;
#include <hip/hip_runtime.h>

#if defined(HYPRE_USING_ROCBLAS)
#include <rocblas/rocblas.h>
#endif

#if defined(HYPRE_USING_ROCSPARSE)
#include <rocsparse/rocsparse.h>
#if !defined(ROCSPARSE_VERSION)
#define ROCSPARSE_VERSION (ROCSPARSE_VERSION_MAJOR * 100000 + ROCSPARSE_VERSION_MINOR * 100 + ROCSPARSE_VERSION_PATCH)
#endif
#endif

#if defined(HYPRE_USING_ROCSOLVER)
#include <rocsolver/rocsolver.h>
#endif

#if defined(HYPRE_USING_ROCRAND)
#include <rocrand/rocrand.h>
#endif

#endif /* defined(HYPRE_USING_HIP) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *                          thrust includes
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

#include <thrust/execution_policy.h>
#if defined(HYPRE_USING_CUDA)
#include <thrust/system/cuda/execution_policy.h>
#elif defined(HYPRE_USING_HIP)
#include <thrust/system/hip/execution_policy.h>
#endif
#include <thrust/count.h>
#include <thrust/device_ptr.h>
#include <thrust/unique.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <thrust/fill.h>
#include <thrust/adjacent_difference.h>
#include <thrust/inner_product.h>
#include <thrust/logical.h>
#include <thrust/replace.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <thrust/remove.h>
#include <thrust/version.h>

/* VPM: this is needed to support cuda 10. not_fn is the correct replacement going forward. */
#define THRUST_VERSION_NOTFN 200700
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
#define HYPRE_THRUST_NOT(pred) thrust::not1(pred)
#else
#define HYPRE_THRUST_NOT(pred) thrust::not_fn(pred)
#endif

using namespace thrust::placeholders;
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *                          sycl includes
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_SYCL)

#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#if defined(HYPRE_USING_ONEMKLSPARSE)
#include <oneapi/mkl/spblas.hpp>
#endif
#if defined(HYPRE_USING_ONEMKLBLAS)
#include <oneapi/mkl/blas.hpp>
#include <oneapi/mkl/lapack.hpp>
#endif
#if defined(HYPRE_USING_ONEMKLRAND)
#include <oneapi/mkl/rng.hpp>
#endif

/* The following definitions facilitate code reuse and limits
 * if/def-ing when unifying cuda/hip code with sycl code */
using dim3 = sycl::range<3>;
using hypre_DeviceItem = sycl::nd_item<3>;
#define __global__
#define __host__
#define __device__
#define __forceinline__ __inline__ __attribute__((always_inline))

#endif /* defined(HYPRE_USING_SYCL) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      device defined values
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#define HYPRE_MAX_NTHREADS_BLOCK 1024

// HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
#define HYPRE_WARP_SIZE       32
#define HYPRE_WARP_BITSHIFT   5
#define HYPRE_WARP_FULL_MASK  0xFFFFFFFF
#elif defined(HYPRE_USING_HIP)
#define HYPRE_WARP_SIZE       64
#define HYPRE_WARP_BITSHIFT   6
#define HYPRE_WARP_FULL_MASK  0xFFFFFFFFFFFFFFF
#endif

#define HYPRE_MAX_NUM_WARPS   (64 * 64 * 32)
#define HYPRE_FLT_LARGE       1e30
#define HYPRE_1D_BLOCK_SIZE   512
#define HYPRE_MAX_NUM_STREAMS 10
#define HYPRE_SPGEMM_MAX_NBIN 10

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *       macro for launching GPU kernels
 *       NOTE: IN HYPRE'S DEFAULT STREAM
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_DEBUG)
#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(); hypre_GetDeviceLastError(); }
#else
#define GPU_LAUNCH_SYNC
#endif

/* cuda/hip version */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...)                                                        \
{                                                                                                                                   \
   if ( gridsize.x  == 0 || gridsize.y  == 0 || gridsize.z  == 0 ||                                                                 \
        blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 )                                                                  \
   {                                                                                                                                \
      /* printf("Warning %s %d: Zero CUDA grid/block (%d %d %d) (%d %d %d)\n",                                                      \
                 __FILE__, __LINE__,                                                                                                \
                 gridsize.x, gridsize.y, gridsize.z, blocksize.x, blocksize.y, blocksize.z); */                                     \
   }                                                                                                                                \
   else                                                                                                                             \
   {                                                                                                                                \
      hypre_DeviceItem item = NULL;                                                                                                 \
      (kernel_name) <<< (gridsize), (blocksize), shmem_size, hypre_HandleComputeStream(hypre_handle()) >>> (item, __VA_ARGS__);     \
      GPU_LAUNCH_SYNC;                                                                                                              \
   }                                                                                                                                \
}

#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */

/* sycl version */
#if defined(HYPRE_USING_SYCL)

#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...)                              \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item, __VA_ARGS__);                                           \
         });                                                                                 \
      }).wait_and_throw();                                                                   \
   }                                                                                         \
}

#define HYPRE_CPU_LAUNCH(kernel_name, gridsize, blocksize, ...)                              \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      HPYRE_Int compute_stream = hypre_HandleComputeStreamNum(hypre_handle());               \
      hypre_HandleComputeStreamNum(hypre_handle) = HYPRE_MAX_NUM_STREAMS - 1;                \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item, __VA_ARGS__);                                           \
         });                                                                                 \
      }).wait_and_throw();                                                                   \
      hypre_HandleComputeStreamNum(hypre_handle) = compute_stream;                           \
   }                                                                                         \
}

#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...)                 \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         sycl::range<1> shmem_range(shmem_size);                                             \
         sycl::local_accessor<char, 1> shmem_accessor(shmem_range, cgh);                     \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item,                                                         \
                  shmem_accessor.get_multi_ptr<sycl::access::decorated::yes>(), __VA_ARGS__);\
         });                                                                                 \
      }).wait_and_throw();                                                                   \
   }                                                                                         \
}

#define HYPRE_GPU_DEBUG_LAUNCH(kernel_name, gridsize, blocksize, ...)                        \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         auto debug_stream = sycl::stream(4096, 1024, cgh);                                  \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item, debug_stream, __VA_ARGS__);                             \
         });                                                                                 \
      }).wait_and_throw();                                                                   \
   }                                                                                         \
}

#define HYPRE_GPU_DEBUG_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...)           \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         auto debug_stream = sycl::stream(4096, 1024, cgh);                                  \
         sycl::range<1> shmem_range(shmem_size);                                             \
         sycl::accessor<char, 1, sycl::access_mode::read_write,                              \
            sycl::target::local> shmem_accessor(shmem_range, cgh);                           \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item, debug_stream,                                           \
                  shmem_accessor.get_multi_ptr<sycl::access::decorated::yes>(), __VA_ARGS__);\
         });                                                                                 \
      }).wait_and_throw();                                                                   \
   }                                                                                         \
}
#endif /* defined(HYPRE_USING_SYCL) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      macros for wrapping cuda/hip/sycl calls for error reporting
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_CUDA)
#define HYPRE_CUDA_CALL(call) do {                                                           \
   cudaError_t err = call;                                                                   \
   if (cudaSuccess != err) {                                                                 \
      printf("CUDA ERROR (code = %d, %s) at %s:%d\n", err, cudaGetErrorString(err),          \
                   __FILE__, __LINE__);                                                      \
      hypre_assert(0);                                                                       \
   } } while(0)

#elif defined(HYPRE_USING_HIP)
#define HYPRE_HIP_CALL(call) do {                                                            \
   hipError_t err = call;                                                                    \
   if (hipSuccess != err) {                                                                  \
      printf("HIP ERROR (code = %d, %s) at %s:%d\n", err, hipGetErrorString(err),            \
                   __FILE__, __LINE__);                                                      \
      hypre_assert(0);                                                                       \
   } } while(0)

#elif defined(HYPRE_USING_SYCL)
#define HYPRE_SYCL_CALL(call)                                                                \
   try                                                                                       \
   {                                                                                         \
      call;                                                                                  \
   }                                                                                         \
   catch (sycl::exception const &ex)                                                         \
   {                                                                                         \
      hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(),                           \
                     __FILE__, __LINE__);                                                    \
      assert(0);                                                                             \
   }                                                                                         \
   catch(std::runtime_error const& ex)                                                       \
   {                                                                                         \
      hypre_printf("STD ERROR (code = %s) at %s:%d\n", ex.what(),                            \
                   __FILE__, __LINE__);                                                      \
      assert(0);                                                                             \
   }
#endif

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      macros for wrapping vendor library calls for error reporting
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_COMPLEX) /* Double Complex */
#error "GPU build does not support complex numbers!"

#elif defined(HYPRE_SINGLE) /* Single */
#if defined(HYPRE_USING_CUDA)
/* cuBLAS */
#define hypre_cublas_scal                      cublasSscal
#define hypre_cublas_axpy                      cublasSaxpy
#define hypre_cublas_dot                       cublasSdot
#define hypre_cublas_gemv                      cublasSgemv
#define hypre_cublas_getrfBatched              cublasSgetrfBatched
#define hypre_cublas_getriBatched              cublasSgetriBatched

/* cuSPARSE */
#define hypre_cusparse_csru2csr_bufferSizeExt  cusparseScsru2csr_bufferSizeExt
#define hypre_cusparse_csru2csr                cusparseScsru2csr
#define hypre_cusparse_csrsv2_bufferSize       cusparseScsrsv2_bufferSize
#define hypre_cusparse_csrsv2_analysis         cusparseScsrsv2_analysis
#define hypre_cusparse_csrsv2_solve            cusparseScsrsv2_solve
#define hypre_cusparse_csrmv                   cusparseScsrmv
#define hypre_cusparse_csrgemm                 cusparseScsrgemm
#define hypre_cusparse_csr2csc                 cusparseScsr2csc
#define hypre_cusparse_csrilu02_bufferSize     cusparseScsrilu02_bufferSize
#define hypre_cusparse_csrilu02_analysis       cusparseScsrilu02_analysis
#define hypre_cusparse_csrilu02                cusparseScsrilu02
#define hypre_cusparse_csrsm2_bufferSizeExt    cusparseScsrsm2_bufferSizeExt
#define hypre_cusparse_csrsm2_analysis         cusparseScsrsm2_analysis
#define hypre_cusparse_csrsm2_solve            cusparseScsrsm2_solve

/* cuSOLVER */
#define hypre_cusolver_dngetrf                 cusolverDnSgetrf
#define hypre_cusolver_dngetrf_bs              cusolverDnSgetrf_bufferSize
#define hypre_cusolver_dngetrs                 cusolverDnSgetrs

#elif defined(HYPRE_USING_HIP)
/* rocSPARSE */
#define hypre_rocsparse_csrsv_buffer_size      rocsparse_scsrsv_buffer_size
#define hypre_rocsparse_csrsv_analysis         rocsparse_scsrsv_analysis
#define hypre_rocsparse_csrsv_solve            rocsparse_scsrsv_solve
#define hypre_rocsparse_gthr                   rocsparse_sgthr
#define hypre_rocsparse_csrmv_analysis         rocsparse_scsrmv_analysis
#define hypre_rocsparse_csrmv                  rocsparse_scsrmv
#define hypre_rocsparse_csrgemm_buffer_size    rocsparse_scsrgemm_buffer_size
#define hypre_rocsparse_csrgemm                rocsparse_scsrgemm
#define hypre_rocsparse_csr2csc                rocsparse_scsr2csc
#define hypre_rocsparse_csrilu0_buffer_size    rocsparse_scsrilu0_buffer_size
#define hypre_rocsparse_csrilu0_analysis       rocsparse_scsrilu0_analysis
#define hypre_rocsparse_csrilu0                rocsparse_scsrilu0
#define hypre_rocsparse_csritilu0_compute      rocsparse_scsritilu0_compute
#define hypre_rocsparse_csritilu0_history      rocsparse_scsritilu0_history

/* rocSOLVER */

/**************
 * TODO (VPM) *
 **************/

#endif /* if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */

#elif defined(HYPRE_LONG_DOUBLE) /* Long Double */
#error "GPU build does not support Long Double numbers!"

#else /* Double */
#if defined(HYPRE_USING_CUDA)
/* cuBLAS */
#define hypre_cublas_scal                      cublasDscal
#define hypre_cublas_axpy                      cublasDaxpy
#define hypre_cublas_dot                       cublasDdot
#define hypre_cublas_gemv                      cublasDgemv
#define hypre_cublas_getrfBatched              cublasDgetrfBatched
#define hypre_cublas_getriBatched              cublasDgetriBatched

/* cuSPARSE */
#define hypre_cusparse_csru2csr_bufferSizeExt  cusparseDcsru2csr_bufferSizeExt
#define hypre_cusparse_csru2csr                cusparseDcsru2csr
#define hypre_cusparse_csrsv2_bufferSize       cusparseDcsrsv2_bufferSize
#define hypre_cusparse_csrsv2_analysis         cusparseDcsrsv2_analysis
#define hypre_cusparse_csrsv2_solve            cusparseDcsrsv2_solve
#define hypre_cusparse_csrmv                   cusparseDcsrmv
#define hypre_cusparse_csrgemm                 cusparseDcsrgemm
#define hypre_cusparse_csr2csc                 cusparseDcsr2csc
#define hypre_cusparse_csrilu02_bufferSize     cusparseDcsrilu02_bufferSize
#define hypre_cusparse_csrilu02_analysis       cusparseDcsrilu02_analysis
#define hypre_cusparse_csrilu02                cusparseDcsrilu02
#define hypre_cusparse_csrsm2_bufferSizeExt    cusparseDcsrsm2_bufferSizeExt
#define hypre_cusparse_csrsm2_analysis         cusparseDcsrsm2_analysis
#define hypre_cusparse_csrsm2_solve            cusparseDcsrsm2_solve

/* cuSOLVER */
#define hypre_cusolver_dngetrf                 cusolverDnDgetrf
#define hypre_cusolver_dngetrf_bs              cusolverDnDgetrf_bufferSize
#define hypre_cusolver_dngetrs                 cusolverDnDgetrs

#elif defined(HYPRE_USING_HIP)
/* rocSPARSE */
#define hypre_rocsparse_csrsv_buffer_size      rocsparse_dcsrsv_buffer_size
#define hypre_rocsparse_csrsv_analysis         rocsparse_dcsrsv_analysis
#define hypre_rocsparse_csrsv_solve            rocsparse_dcsrsv_solve
#define hypre_rocsparse_gthr                   rocsparse_dgthr
#define hypre_rocsparse_csrmv_analysis         rocsparse_dcsrmv_analysis
#define hypre_rocsparse_csrmv                  rocsparse_dcsrmv
#define hypre_rocsparse_csrgemm_buffer_size    rocsparse_dcsrgemm_buffer_size
#define hypre_rocsparse_csrgemm                rocsparse_dcsrgemm
#define hypre_rocsparse_csr2csc                rocsparse_dcsr2csc
#define hypre_rocsparse_csrilu0_buffer_size    rocsparse_dcsrilu0_buffer_size
#define hypre_rocsparse_csrilu0_analysis       rocsparse_dcsrilu0_analysis
#define hypre_rocsparse_csrilu0                rocsparse_dcsrilu0
#define hypre_rocsparse_csritilu0_compute      rocsparse_dcsritilu0_compute
#define hypre_rocsparse_csritilu0_history      rocsparse_dcsritilu0_history

/* rocSOLVER */

/**************
 * TODO (VPM) *
 **************/

#endif /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)*/
#endif /* #if defined(HYPRE_COMPLEX) || defined(HYPRE_SINGLE) || defined(HYPRE_LONG_DOUBLE) */

#define HYPRE_CUBLAS_CALL(call) do {                                                         \
   cublasStatus_t err = call;                                                                \
   if (CUBLAS_STATUS_SUCCESS != err) {                                                       \
      printf("CUBLAS ERROR (code = %d, %d) at %s:%d\n",                                      \
            err, err == CUBLAS_STATUS_EXECUTION_FAILED, __FILE__, __LINE__);                 \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)

#define HYPRE_ROCBLAS_CALL(call) do {                                                        \
   rocblas_status err = call;                                                                \
   if (rocblas_status_success != err) {                                                      \
      printf("rocBLAS ERROR (code = %d, %s) at %s:%d\n",                                     \
             err, rocblas_status_to_string(err), __FILE__, __LINE__);                        \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)

#if CUSPARSE_VERSION >= 10300
#define HYPRE_CUSPARSE_CALL(call) do {                                                       \
   cusparseStatus_t err = call;                                                              \
   if (CUSPARSE_STATUS_SUCCESS != err) {                                                     \
      printf("CUSPARSE ERROR (code = %d, %s) at %s:%d\n",                                    \
            err, cusparseGetErrorString(err), __FILE__, __LINE__);                           \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)
#else
#define HYPRE_CUSPARSE_CALL(call) do {                                                       \
   cusparseStatus_t err = call;                                                              \
   if (CUSPARSE_STATUS_SUCCESS != err) {                                                     \
      printf("CUSPARSE ERROR (code = %d) at %s:%d\n",                                        \
            err, __FILE__, __LINE__);                                                        \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)
#endif

#define HYPRE_ROCSPARSE_CALL(call) do {                                                      \
   rocsparse_status err = call;                                                              \
   if (rocsparse_status_success != err) {                                                    \
      printf("rocSPARSE ERROR (code = %d) at %s:%d\n",                                       \
            err, __FILE__, __LINE__);                                                        \
      assert(0); exit(1);                                                                    \
   } } while(0)

#define HYPRE_CUSOLVER_CALL(call) do {                                                       \
   cusolverStatus_t err = call;                                                              \
   if (CUSOLVER_STATUS_SUCCESS != err) {                                                     \
      printf("cuSOLVER ERROR (code = %d) at %s:%d\n",                                        \
            err, __FILE__, __LINE__);                                                        \
      hypre_assert(0);                                                                       \
   } } while(0)

#define HYPRE_ROCSOLVER_CALL(call) do {                                                      \
   rocblas_status err = call;                                                                \
   if (rocblas_status_success != err) {                                                      \
      printf("rocSOLVER ERROR (code = %d, %s) at %s:%d\n",                                   \
             err, rocblas_status_to_string(err), __FILE__, __LINE__);                        \
   } } while(0)

#define HYPRE_CURAND_CALL(call) do {                                                         \
   curandStatus_t err = call;                                                                \
   if (CURAND_STATUS_SUCCESS != err) {                                                       \
      printf("CURAND ERROR (code = %d) at %s:%d\n", err, __FILE__, __LINE__);                \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)

#define HYPRE_ROCRAND_CALL(call) do {                                                        \
   rocrand_status err = call;                                                                \
   if (ROCRAND_STATUS_SUCCESS != err) {                                                      \
      printf("ROCRAND ERROR (code = %d) at %s:%d\n", err, __FILE__, __LINE__);               \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)

#define HYPRE_ONEMKL_CALL(call)                                                              \
   try                                                                                       \
   {                                                                                         \
      call;                                                                                  \
   }                                                                                         \
   catch (oneapi::mkl::exception const &ex)                                                  \
   {                                                                                         \
      hypre_printf("ONEMKL ERROR (code = %s) at %s:%d\n", ex.what(),                         \
                   __FILE__, __LINE__);                                                      \
      assert(0); exit(1);                                                                    \
   }

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      macros for wrapping thrust/oneDPL calls for error reporting
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right
 * The following one works OK for now */

#if defined(HYPRE_USING_CUDA)
#define HYPRE_THRUST_CALL(func_name, ...) \
   thrust::func_name(HYPRE_THRUST_EXECUTION(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
#elif defined(HYPRE_USING_HIP)
#define HYPRE_THRUST_CALL(func_name, ...) \
   thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);

#elif defined(HYPRE_USING_SYCL)

#define HYPRE_ONEDPL_CALL(func_name, ...)                                                    \
  func_name(oneapi::dpl::execution::make_device_policy(                                      \
           *hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);

#define HYPRE_ONEDPL_CPU_CALL(func_name, ...)                                                \
   func_name(oneapi::dpl::execution::make_device_policy(                                     \
             *hypre_DeviceDataStream(hypre_HandleDeviceData(hypre_handle()), HYPRE_MAX_NUM_STREAMS - 1)), __VA_ARGS__);

#endif

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      device info data structures
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

struct hypre_cub_CachingDeviceAllocator;
typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator;

#if defined(HYPRE_USING_CUSOLVER)
typedef cusolverDnHandle_t vendorSolverHandle_t;
#elif defined(HYPRE_USING_ROCSOLVER)
typedef rocblas_handle     vendorSolverHandle_t;
#endif

struct hypre_DeviceData
{
#if defined(HYPRE_USING_CURAND)
   curandGenerator_t                 curand_generator;
#endif

#if defined(HYPRE_USING_ROCRAND)
   rocrand_generator                 curand_generator;
#endif

#if defined(HYPRE_USING_CUBLAS)
   cublasHandle_t                    cublas_handle;
#endif

#if defined(HYPRE_USING_CUSPARSE)
   cusparseHandle_t                  cusparse_handle;
#endif

#if defined(HYPRE_USING_ROCSPARSE)
   rocsparse_handle                  cusparse_handle;
#endif

#if defined(HYPRE_USING_CUSOLVER) || defined(HYPRE_USING_ROCSOLVER)
   vendorSolverHandle_t              vendor_solver_handle;
#endif

   /* TODO (VPM): Change to HYPRE_USING_GPU_STREAMS*/
#if defined(HYPRE_USING_CUDA_STREAMS)
#if defined(HYPRE_USING_CUDA)
   cudaStream_t                      streams[HYPRE_MAX_NUM_STREAMS];
#elif defined(HYPRE_USING_HIP)
   hipStream_t                       streams[HYPRE_MAX_NUM_STREAMS];
#elif defined(HYPRE_USING_SYCL)
   sycl::queue*                      streams[HYPRE_MAX_NUM_STREAMS] = {NULL};
#endif
#endif

#if defined(HYPRE_USING_DEVICE_POOL)
   hypre_uint                        cub_bin_growth;
   hypre_uint                        cub_min_bin;
   hypre_uint                        cub_max_bin;
   size_t                            cub_max_cached_bytes;
   hypre_cub_CachingDeviceAllocator *cub_dev_allocator;
   hypre_cub_CachingDeviceAllocator *cub_uvm_allocator;
#endif

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   hypre_device_allocator            device_allocator;
#endif
#if defined(HYPRE_USING_SYCL)
   sycl::device                     *device;
   HYPRE_Int                         device_max_work_group_size;
#else
   HYPRE_Int                         device;
#endif
   hypre_int                         device_max_shmem_per_block[3];
   /* by default, hypre puts GPU computations in this stream
    * Do not be confused with the default (null) stream */
   HYPRE_Int                         compute_stream_num;
   /* work space for hypre's device reducer */
   void                             *reduce_buffer;
   /* device spgemm options */
   HYPRE_Int                         spgemm_algorithm;
   HYPRE_Int                         spgemm_binned;
   HYPRE_Int                         spgemm_num_bin;
   /* the highest bins for symbl [0] and numer [1]
    * which are not necessary to be `spgemm_num_bin' due to shmem limit on GPUs */
   HYPRE_Int                         spgemm_highest_bin[2];
   /* for bin i: ([0][i], [2][i]) = (max #block to launch, block dimension) for symbl
    *            ([1][i], [3][i]) = (max #block to launch, block dimension) for numer */
   HYPRE_Int                         spgemm_block_num_dim[4][HYPRE_SPGEMM_MAX_NBIN + 1];
   HYPRE_Int                         spgemm_rownnz_estimate_method;
   HYPRE_Int                         spgemm_rownnz_estimate_nsamples;
   float                             spgemm_rownnz_estimate_mult_factor;
   /* cusparse */
   HYPRE_Int                         spmv_use_vendor;
   HYPRE_Int                         sptrans_use_vendor;
   HYPRE_Int                         spgemm_use_vendor;
   /* PMIS RNG */
   HYPRE_Int                         use_gpu_rand;
};

#define hypre_DeviceDataCubBinGrowth(data)                   ((data) -> cub_bin_growth)
#define hypre_DeviceDataCubMinBin(data)                      ((data) -> cub_min_bin)
#define hypre_DeviceDataCubMaxBin(data)                      ((data) -> cub_max_bin)
#define hypre_DeviceDataCubMaxCachedBytes(data)              ((data) -> cub_max_cached_bytes)
#define hypre_DeviceDataCubDevAllocator(data)                ((data) -> cub_dev_allocator)
#define hypre_DeviceDataCubUvmAllocator(data)                ((data) -> cub_uvm_allocator)
#define hypre_DeviceDataDevice(data)                         ((data) -> device)
#define hypre_DeviceDataDeviceMaxWorkGroupSize(data)         ((data) -> device_max_work_group_size)
#define hypre_DeviceDataDeviceMaxShmemPerBlock(data)         ((data) -> device_max_shmem_per_block)
#define hypre_DeviceDataDeviceMaxShmemPerBlockInited(data)  (((data) -> device_max_shmem_per_block)[2])
#define hypre_DeviceDataComputeStreamNum(data)               ((data) -> compute_stream_num)
#define hypre_DeviceDataReduceBuffer(data)                   ((data) -> reduce_buffer)
#define hypre_DeviceDataSpgemmUseVendor(data)                ((data) -> spgemm_use_vendor)
#define hypre_DeviceDataSpMVUseVendor(data)                  ((data) -> spmv_use_vendor)
#define hypre_DeviceDataSpTransUseVendor(data)               ((data) -> sptrans_use_vendor)
#define hypre_DeviceDataSpgemmAlgorithm(data)                ((data) -> spgemm_algorithm)
#define hypre_DeviceDataSpgemmBinned(data)                   ((data) -> spgemm_binned)
#define hypre_DeviceDataSpgemmNumBin(data)                   ((data) -> spgemm_num_bin)
#define hypre_DeviceDataSpgemmHighestBin(data)               ((data) -> spgemm_highest_bin)
#define hypre_DeviceDataSpgemmBlockNumDim(data)              ((data) -> spgemm_block_num_dim)
#define hypre_DeviceDataSpgemmRownnzEstimateMethod(data)     ((data) -> spgemm_rownnz_estimate_method)
#define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data)   ((data) -> spgemm_rownnz_estimate_nsamples)
#define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor)
#define hypre_DeviceDataDeviceAllocator(data)                ((data) -> device_allocator)
#define hypre_DeviceDataUseGpuRand(data)                     ((data) -> use_gpu_rand)

hypre_DeviceData*     hypre_DeviceDataCreate();
void                  hypre_DeviceDataDestroy(hypre_DeviceData* data);

#if defined(HYPRE_USING_CURAND)
curandGenerator_t     hypre_DeviceDataCurandGenerator(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_ROCRAND)
rocrand_generator     hypre_DeviceDataCurandGenerator(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_CUBLAS)
cublasHandle_t        hypre_DeviceDataCublasHandle(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_CUSPARSE)
cusparseHandle_t      hypre_DeviceDataCusparseHandle(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_ROCSPARSE)
rocsparse_handle      hypre_DeviceDataCusparseHandle(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_CUSOLVER) || defined(HYPRE_USING_ROCSOLVER)
vendorSolverHandle_t  hypre_DeviceDataVendorSolverHandle(hypre_DeviceData *data);
#endif

/* TODO (VPM): Create a deviceStream_t to encapsulate all stream types below */
#if defined(HYPRE_USING_CUDA)
cudaStream_t          hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i);
cudaStream_t          hypre_DeviceDataComputeStream(hypre_DeviceData *data);
#elif defined(HYPRE_USING_HIP)
hipStream_t           hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i);
hipStream_t           hypre_DeviceDataComputeStream(hypre_DeviceData *data);
#elif defined(HYPRE_USING_SYCL)
sycl::queue*          hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i);
sycl::queue*          hypre_DeviceDataComputeStream(hypre_DeviceData *data);
#endif

/* Data structure and accessor routines for Sparse Triangular Matrices */
struct hypre_CsrsvData
{
#if defined(HYPRE_USING_CUSPARSE)
   hypre_cusparseSpSVDescr   info_L;
   hypre_cusparseSpSVDescr   info_U;
   cusparseSolvePolicy_t     analysis_policy;
   cusparseSolvePolicy_t     solve_policy;
#elif defined(HYPRE_USING_ROCSPARSE)
   rocsparse_mat_info        info_L;
   rocsparse_mat_info        info_U;
   rocsparse_analysis_policy analysis_policy;
   rocsparse_solve_policy    solve_policy;
#endif

#if defined(HYPRE_USING_CUSPARSE) && (CUSPARSE_VERSION >= CUSPARSE_SPSV_VERSION)
   size_t                    buffer_size_L;
   size_t                    buffer_size_U;
   char                     *buffer_L;
   char                     *buffer_U;
#else
   hypre_int                 buffer_size;
   char                     *buffer;
#endif

   /* Temporary array to save matrix values with modified diagonal */
   HYPRE_Complex            *mat_data;

   /* Flags for checking whether the analysis phase has been executed or not */
   HYPRE_Int                 analyzed_L;
   HYPRE_Int                 analyzed_U;
};

#define hypre_CsrsvDataInfoL(data)          ((data) -> info_L)
#define hypre_CsrsvDataInfoU(data)          ((data) -> info_U)
#define hypre_CsrsvDataAnalyzedL(data)      ((data) -> analyzed_L)
#define hypre_CsrsvDataAnalyzedU(data)      ((data) -> analyzed_U)
#define hypre_CsrsvDataSolvePolicy(data)    ((data) -> solve_policy)
#define hypre_CsrsvDataAnalysisPolicy(data) ((data) -> analysis_policy)
#if defined(HYPRE_USING_CUSPARSE) && (CUSPARSE_VERSION >= CUSPARSE_SPSV_VERSION)
#define hypre_CsrsvDataBufferSizeL(data)    ((data) -> buffer_size_L)
#define hypre_CsrsvDataBufferSizeU(data)    ((data) -> buffer_size_U)
#define hypre_CsrsvDataBufferL(data)        ((data) -> buffer_L)
#define hypre_CsrsvDataBufferU(data)        ((data) -> buffer_U)
#else
#define hypre_CsrsvDataBufferSize(data)     ((data) -> buffer_size)
#define hypre_CsrsvDataBuffer(data)         ((data) -> buffer)
#endif
#define hypre_CsrsvDataMatData(data)        ((data) -> mat_data)

struct hypre_GpuMatData
{
#if defined(HYPRE_USING_CUSPARSE)
   cusparseMatDescr_t                   mat_descr;
   char                                *spmv_buffer;

#elif defined(HYPRE_USING_ROCSPARSE)
   rocsparse_mat_descr                  mat_descr;
   rocsparse_mat_info                   mat_info;

#elif defined(HYPRE_USING_ONEMKLSPARSE)
   oneapi::mkl::sparse::matrix_handle_t mat_handle;
#endif
};

#define hypre_GpuMatDataMatDescr(data)    ((data) -> mat_descr)
#define hypre_GpuMatDataMatInfo(data)     ((data) -> mat_info)
#define hypre_GpuMatDataMatHandle(data)   ((data) -> mat_handle)
#define hypre_GpuMatDataSpMVBuffer(data)  ((data) -> spmv_buffer)

#endif /* if defined(HYPRE_USING_GPU) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      generic device functions (cuda/hip/sycl)
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_GPU)
template <typename T>
static __device__ __forceinline__
T read_only_load( const T *ptr )
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
   return __ldg( ptr );
#else
   return *ptr;
#endif
}

static __device__ __forceinline__
hypre_int next_power_of_2(hypre_int n)
{
   if (n <= 0)
   {
      return 0;
   }

   /* if n is power of 2, return itself */
   if ( (n & (n - 1)) == 0 )
   {
      return n;
   }

   n |= (n >>  1);
   n |= (n >>  2);
   n |= (n >>  4);
   n |= (n >>  8);
   n |= (n >> 16);
   n ^= (n >>  1);
   n  = (n <<  1);

   return n;
}

/* Flip n-th bit of bitmask (0 becomes 1. 1 becomes 0) */
static __device__ __forceinline__
hypre_mask hypre_mask_flip_at(hypre_mask bitmask, hypre_int n)
{
   return bitmask ^ (hypre_mask_one << n);
}

#endif // defined(HYPRE_USING_GPU)

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      cuda/hip functions
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/* return the number of threads in block */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_threads(hypre_DeviceItem &item)
{
   switch (dim)
   {
      case 1:
         return (blockDim.x);
      case 2:
         return (blockDim.x * blockDim.y);
      case 3:
         return (blockDim.x * blockDim.y * blockDim.z);
   }

   return -1;
}

/* return the flattened thread id in block */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_thread_id(hypre_DeviceItem &item)
{
   switch (dim)
   {
      case 1:
         return (threadIdx.x);
      case 2:
         return (threadIdx.y * blockDim.x + threadIdx.x);
      case 3:
         return (threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x +
                 threadIdx.x);
   }

   return -1;
}

/* return the number of warps in block */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_warps(hypre_DeviceItem &item)
{
   return hypre_gpu_get_num_threads<dim>(item) >> HYPRE_WARP_BITSHIFT;
}

/* return the warp id in block */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_warp_id(hypre_DeviceItem &item)
{
   return hypre_gpu_get_thread_id<dim>(item) >> HYPRE_WARP_BITSHIFT;
}

/* return the thread lane id in warp */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_lane_id(hypre_DeviceItem &item)
{
   return hypre_gpu_get_thread_id<dim>(item) & (HYPRE_WARP_SIZE - 1);
}

/* return the num of blocks in grid */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_blocks()
{
   switch (dim)
   {
      case 1:
         return (gridDim.x);
      case 2:
         return (gridDim.x * gridDim.y);
      case 3:
         return (gridDim.x * gridDim.y * gridDim.z);
   }

   return -1;
}

/* return the flattened block id in grid */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_block_id(hypre_DeviceItem &item)
{
   switch (dim)
   {
      case 1:
         return (blockIdx.x);
      case 2:
         return (blockIdx.y * gridDim.x + blockIdx.x);
      case 3:
         return (blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x +
                 blockIdx.x);
   }

   return -1;
}

/* return the number of threads in grid */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_num_threads(hypre_DeviceItem &item)
{
   return hypre_gpu_get_num_blocks<gdim>() * hypre_gpu_get_num_threads<bdim>(item);
}

/* return the flattened thread id in grid */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_thread_id(hypre_DeviceItem &item)
{
   return hypre_gpu_get_block_id<gdim>(item) * hypre_gpu_get_num_threads<bdim>(item) +
          hypre_gpu_get_thread_id<bdim>(item);
}

/* return the number of warps in grid */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_num_warps(hypre_DeviceItem &item)
{
   return hypre_gpu_get_num_blocks<gdim>() * hypre_gpu_get_num_warps<bdim>(item);
}

/* return the flattened warp id in grid */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_warp_id(hypre_DeviceItem &item)
{
   return hypre_gpu_get_block_id<gdim>(item) * hypre_gpu_get_num_warps<bdim>(item) +
          hypre_gpu_get_warp_id<bdim>(item);
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static __device__ __forceinline__
hypre_double atomicAdd(hypre_double* address, hypre_double val)
{
   hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address;
   hypre_ulonglongint old = *address_as_ull, assumed;

   do
   {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed,
                      __double_as_longlong(val +
                                           __longlong_as_double(assumed)));

      // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
   }
   while (assumed != old);

   return __longlong_as_double(old);
}
#endif

// There are no *_sync functions in HIP
#if defined(HYPRE_USING_HIP) || (CUDA_VERSION < 9000)

template <typename T>
static __device__ __forceinline__
T __shfl_sync(hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
{
   return __shfl(val, src_line, width);
}

template <typename T>
static __device__ __forceinline__
T __shfl_up_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
{
   return __shfl_up(val, delta, width);
}

template <typename T>
static __device__ __forceinline__
T __shfl_down_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
{
   return __shfl_down(val, delta, width);
}

template <typename T>
static __device__ __forceinline__
T __shfl_xor_sync(hypre_mask mask, T val, hypre_int lanemask, hypre_int width = HYPRE_WARP_SIZE)
{
   return __shfl_xor(val, lanemask, width);
}

static __device__ __forceinline__
void __syncwarp()
{
}

#endif // #if defined(HYPRE_USING_HIP) || (CUDA_VERSION < 9000)

static __device__ __forceinline__
hypre_mask hypre_ballot_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
#if defined(HYPRE_USING_CUDA)
   return __ballot_sync(mask, predicate);
#else
   return __ballot(predicate);
#endif
}

static __device__ __forceinline__
HYPRE_Int hypre_popc(hypre_mask mask)
{
#if defined(HYPRE_USING_CUDA)
   return (HYPRE_Int) __popc(mask);
#else
   return (HYPRE_Int) __popcll(mask);
#endif
}

static __device__ __forceinline__
HYPRE_Int hypre_ffs(hypre_mask mask)
{
#if defined(HYPRE_USING_CUDA)
   return (HYPRE_Int) __ffs(mask);
#else
   return (HYPRE_Int) __ffsll(mask);
#endif
}

#if defined(HYPRE_USING_HIP)
static __device__ __forceinline__
hypre_int __any_sync(unsigned mask, hypre_int predicate)
{
   return __any(predicate);
}
#endif

/* sync the thread block */
static __device__ __forceinline__
void block_sync(hypre_DeviceItem &item)
{
   __syncthreads();
}

/* sync the warp */
static __device__ __forceinline__
void warp_sync(hypre_DeviceItem &item)
{
   __syncwarp();
}

/* exclusive prefix scan */
template <typename T>
static __device__ __forceinline__
T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
{
#pragma unroll
   for (hypre_int d = 2; d <= HYPRE_WARP_SIZE; d <<= 1)
   {
      T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1);
      if ( (lane_id & (d - 1)) == (d - 1) )
      {
         in += t;
      }
   }

   all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE - 1);

   if (lane_id == HYPRE_WARP_SIZE - 1)
   {
      in = 0;
   }

#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);

      if ( (lane_id & (d - 1)) == (d - 1))
      {
         if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) )
         {
            in += t;
         }
         else
         {
            in = t;
         }
      }
   }
   return in;
}

static __device__ __forceinline__
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
   return __any_sync(mask, predicate);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
                    hypre_int width = HYPRE_WARP_SIZE)
{
   return __shfl_sync(mask, val, src_line, width);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
                       hypre_int width = HYPRE_WARP_SIZE)
{
   return __shfl_up_sync(mask, val, delta, width);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
                         hypre_int width = HYPRE_WARP_SIZE)
{
   return __shfl_down_sync(mask, val, delta, width);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
                        hypre_int width = HYPRE_WARP_SIZE)
{
   return __shfl_xor_sync(mask, val, lane_mask, width);
}

template <typename T>
static __device__ __forceinline__
T warp_reduce_sum(hypre_DeviceItem &item, T in)
{
#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in += __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d);
   }
   return in;
}

template <typename T>
static __device__ __forceinline__
T warp_allreduce_sum(hypre_DeviceItem &item, T in)
{
#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);
   }
   return in;
}

template <typename T>
static __device__ __forceinline__
T warp_reduce_max(hypre_DeviceItem &item, T in)
{
#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
   }
   return in;
}

template <typename T>
static __device__ __forceinline__
T warp_allreduce_max(hypre_DeviceItem &item, T in)
{
#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
   }
   return in;
}

template <typename T>
static __device__ __forceinline__
T warp_reduce_min(hypre_DeviceItem &item, T in)
{
#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in = min(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
   }
   return in;
}

template <typename T>
static __device__ __forceinline__
T warp_allreduce_min(hypre_DeviceItem &item, T in)
{
#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
   }
   return in;
}

template<typename T1, typename T2>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct type_cast : public thrust::unary_function<T1, T2>
#else
struct type_cast
#endif
{
   __host__ __device__ T2 operator()(const T1 &x) const
   {
      return (T2) x;
   }
};

template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct absolute_value : public thrust::unary_function<T, T>
#else
struct absolute_value
#endif
{
   __host__ __device__ T operator()(const T &x) const
   {
      return x < T(0) ? -x : x;
   }
};

template<typename T1, typename T2>
struct TupleComp2
{
   typedef thrust::tuple<T1, T2> Tuple;

   __host__ __device__ bool operator()(const Tuple& t1, const Tuple& t2)
   {
      if (thrust::get<0>(t1) < thrust::get<0>(t2))
      {
         return true;
      }
      if (thrust::get<0>(t1) > thrust::get<0>(t2))
      {
         return false;
      }
      return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2));
   }
};

template<typename T1, typename T2>
struct TupleComp3
{
   typedef thrust::tuple<T1, T2> Tuple;

   __host__ __device__ bool operator()(const Tuple& t1, const Tuple& t2)
   {
      if (thrust::get<0>(t1) < thrust::get<0>(t2))
      {
         return true;
      }
      if (thrust::get<0>(t1) > thrust::get<0>(t2))
      {
         return false;
      }
      if (thrust::get<0>(t2) == thrust::get<1>(t2))
      {
         return false;
      }
      return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2);
   }
};

template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct is_negative : public thrust::unary_function<T, bool>
#else
struct is_negative
#endif
{
   __host__ __device__ bool operator()(const T &x)
   {
      return (x < 0);
   }
};

template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct is_positive : public thrust::unary_function<T, bool>
#else
struct is_positive
#endif
{
   __host__ __device__ bool operator()(const T &x)
   {
      return (x > 0);
   }
};

template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct is_nonnegative : public thrust::unary_function<T, bool>
#else
struct is_nonnegative
#endif
{
   __host__ __device__ bool operator()(const T &x)
   {
      return (x >= 0);
   }
};

template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct in_range : public thrust::unary_function<T, bool>
#else
struct in_range
#endif
{
   T low, up;

   in_range(T low_, T up_) { low = low_; up = up_; }

   __host__ __device__ bool operator()(const T &x)
   {
      return (x >= low && x <= up);
   }
};

template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct out_of_range : public thrust::unary_function<T, bool>
#else
struct out_of_range
#endif
{
   T low, up;

   out_of_range(T low_, T up_) { low = low_; up = up_; }

   __host__ __device__ bool operator()(const T &x)
   {
      return (x < low || x > up);
   }
};

template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct less_than : public thrust::unary_function<T, bool>
#else
struct less_than
#endif
{
   T val;

   less_than(T val_) { val = val_; }

   __host__ __device__ bool operator()(const T &x)
   {
      return (x < val);
   }
};

template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct modulo : public thrust::unary_function<T, T>
#else
struct modulo
#endif
{
   T val;

   modulo(T val_) { val = val_; }

   __host__ __device__ T operator()(const T &x)
   {
      return (x % val);
   }
};

template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct equal : public thrust::unary_function<T, bool>
#else
struct equal
#endif
{
   T val;

   equal(T val_) { val = val_; }

   __host__ __device__ bool operator()(const T &x)
   {
      return (x == val);
   }
};

struct print_functor
{
   __host__ __device__ void operator()(HYPRE_Real val)
   {
      printf("%f\n", val);
   }
};

#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      sycl functions
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_SYCL)

/* return the number of threads in block */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_threads(hypre_DeviceItem &item)
{
   return static_cast<hypre_int>(item.get_local_range().size());
}

/* return the flattened thread id in block */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_thread_id(hypre_DeviceItem &item)
{
   return static_cast<hypre_int>(item.get_local_linear_id());
}

/* return the number of warps in block */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_warps(hypre_DeviceItem &item)
{
   return hypre_gpu_get_num_threads<dim>(item) >> HYPRE_WARP_BITSHIFT;
}

/* return the warp id in block */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_warp_id(hypre_DeviceItem &item)
{
   return hypre_gpu_get_thread_id<dim>(item) >> HYPRE_WARP_BITSHIFT;
}

/* return the thread lane id in warp */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_lane_id(hypre_DeviceItem &item)
{
   return static_cast<hypre_int>(item.get_sub_group().get_local_linear_id());
}

/* return the num of blocks in grid */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_blocks(hypre_DeviceItem &item)
{
   return static_cast<hypre_int>(item.get_group_range().size());
}

/* return the flattened block id in grid */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_block_id(hypre_DeviceItem &item)
{
   return item.get_group_linear_id();
}

/* return the number of threads in grid */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_num_threads(hypre_DeviceItem &item)
{
   return hypre_gpu_get_num_blocks<gdim>(item) * hypre_gpu_get_num_threads<bdim>(item);
}

/* return the flattened thread id in grid */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_thread_id(hypre_DeviceItem &item)
{
   return hypre_gpu_get_block_id<gdim>(item) * hypre_gpu_get_num_threads<bdim>(item) +
          hypre_gpu_get_thread_id<bdim>(item);
}

/* return the number of warps in grid */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_num_warps(hypre_DeviceItem &item)
{
   return hypre_gpu_get_num_blocks<gdim>(item) * hypre_gpu_get_num_warps<bdim>(item);
}

/* return the flattened warp id in nd_range */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_warp_id(hypre_DeviceItem &item)
{
   return hypre_gpu_get_block_id<gdim>(item) * hypre_gpu_get_num_warps<bdim>(item) +
          hypre_gpu_get_warp_id<bdim>(item);
}

/* sync the thread block */
static __device__ __forceinline__
void block_sync(hypre_DeviceItem &item)
{
   sycl::group_barrier(item.get_group());
}

/* sync the warp */
static __device__ __forceinline__
void warp_sync(hypre_DeviceItem &item)
{
   sycl::group_barrier(item.get_sub_group());
}

/* exclusive prefix scan */
template <typename T>
static __device__ __forceinline__
T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
{
#pragma unroll
   for (hypre_int d = 2; d <= HYPRE_WARP_SIZE; d <<= 1)
   {
      T t = sycl::shift_group_right(item.get_sub_group(), in, d >> 1);
      if ( (lane_id & (d - 1)) == (d - 1) )
      {
         in += t;
      }
   }

   all_sum = sycl::group_broadcast(item.get_sub_group(), in, HYPRE_WARP_SIZE - 1);

   if (lane_id == HYPRE_WARP_SIZE - 1)
   {
      in = 0;
   }

#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      T t = sycl::permute_group_by_xor(item.get_sub_group(), in, d);

      if ( (lane_id & (d - 1)) == (d - 1))
      {
         if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) )
         {
            in += t;
         }
         else
         {
            in = t;
         }
      }
   }
   return in;
}

static __device__ __forceinline__
hypre_mask hypre_ballot_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
   return sycl::reduce_over_group(
             item.get_sub_group(),
             (mask & (0x1 << item.get_sub_group().get_local_linear_id())) &&
             predicate ? (0x1 << item.get_sub_group().get_local_linear_id()) : 0,
             sycl::ext::oneapi::plus<>());
}

static __device__ __forceinline__
HYPRE_Int hypre_popc(hypre_mask mask)
{
   return (HYPRE_Int) sycl::popcount(mask);
}

static __device__ __forceinline__
HYPRE_Int hypre_ffs(hypre_mask mask)
{
   return (HYPRE_Int)dpct::ffs<HYPRE_Int>(mask);
}

static __device__ __forceinline__
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
   return sycl::any_of_group(item.get_sub_group(), predicate);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line)
{
   return sycl::group_broadcast(item.get_sub_group(), val, src_line);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
                    hypre_int width)
{
   hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
   hypre_int group_start = (lane_id / width) * width;
   hypre_int src_in_warp = group_start + (src_line % width);
   return sycl::select_from_group(item.get_sub_group(), val, src_in_warp);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
{
   return sycl::shift_group_right(item.get_sub_group(), val, delta);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
                       hypre_int width)
{
   hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
   hypre_int group_start = (lane_id / width) * width;
   hypre_int src_in_warp = lane_id - delta >= group_start ? lane_id - delta : lane_id;
   return sycl::select_from_group(item.get_sub_group(), val, src_in_warp);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
{
   return sycl::shift_group_left(item.get_sub_group(), val, delta);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
                         hypre_int width)
{
   hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
   hypre_int group_end = ((lane_id / width) + 1) * width - 1;
   hypre_int src_in_warp = lane_id + delta <= group_end ? lane_id + delta : lane_id;
   return sycl::select_from_group(item.get_sub_group(), val, src_in_warp);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask)
{
   return sycl::permute_group_by_xor(item.get_sub_group(), val, lane_mask);
}

template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
                        hypre_int width)
{
   hypre_int lane_id = hypre_gpu_get_lane_id<1>(item);
   hypre_int group_end = ((lane_id / width) + 1) * width - 1;
   hypre_int src_in_warp = lane_id ^ lane_mask;
   src_in_warp = src_in_warp > group_end ? lane_id : src_in_warp;
   return sycl::select_from_group(item.get_sub_group(), val, src_in_warp);
}

template <typename T>
static __forceinline__
T warp_reduce_sum(hypre_DeviceItem &item, T in)
{
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in += sycl::shift_group_left(item.get_sub_group(), in, d);
   }
   return in;
}

template <typename T>
static __forceinline__
T warp_allreduce_sum(hypre_DeviceItem &item, T in)
{
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in += sycl::permute_group_by_xor(item.get_sub_group(), in, d);
   }
   return in;
}

template <typename T>
static __forceinline__
T warp_reduce_max(hypre_DeviceItem &item, T in)
{
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in = std::max(in, sycl::shift_group_left(item.get_sub_group(), in, d));
   }
   return in;
}

template <typename T>
static __forceinline__
T warp_allreduce_max(hypre_DeviceItem &item, T in)
{
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in = std::max(in, sycl::permute_group_by_xor(item.get_sub_group(), in, d));
   }
   return in;
}

template <typename T>
static __forceinline__
T warp_reduce_min(hypre_DeviceItem &item, T in)
{
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in = std::min(in, sycl::shift_group_left(item.get_sub_group(), in, d));
   }
   return in;
}

template <typename T>
static __forceinline__
T warp_allreduce_min(hypre_DeviceItem &item, T in)
{
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      in = std::min(in, sycl::permute_group_by_xor(item.get_sub_group(), in, d));
   }
   return in;
}

template<typename T>
struct is_negative
{
   is_negative() {}

   constexpr bool operator()(const T &x = T()) const { return (x < 0); }
};

template<typename T>
struct is_positive
{
   is_positive() {}

   constexpr bool operator()(const T &x = T()) const { return (x > 0); }
};

template<typename T>
struct is_nonnegative
{
   is_nonnegative() {}

   constexpr bool operator()(const T &x = T()) const { return (x >= 0); }
};

template<typename T>
struct in_range
{
   T low, high;
   in_range(T low_ = T(), T high_ = T()) { low = low_; high = high_; }

   constexpr bool operator()(const T &x) const { return (x >= low && x <= high); }
};

template<typename T>
struct out_of_range
{
   T low, high;
   out_of_range(T low_ = T(), T high_ = T()) { low = low_; high = high_; }

   constexpr bool operator()(const T &x) const { return (x < low || x > high); }
};

template<typename T>
struct less_than
{
   T val;
   less_than(T val_ = T()) { val = val_; }

   constexpr bool operator()(const T &x) const { return (x < val); }
};

template<typename T>
struct modulo
{
   T val;
   modulo(T val_ = T()) { val = val_; }

   constexpr T operator()(const T &x) const { return (x % val); }
};

template<typename T>
struct equal
{
   T val;
   equal(T val_ = T()) { val = val_; }

   constexpr bool operator()(const T &x) const { return (x == val); }
};

template<typename T1, typename T2>
struct type_cast
{
   constexpr T2 operator()(const T1 &x = T1()) const { return (T2) x; }
};

template<typename T>
struct absolute_value
{
   constexpr T operator()(const T &x) const { return x < T(0) ? -x : x; }
};

template<typename... T>
struct TupleComp2
{
   typedef std::tuple<T...> Tuple;
   bool operator()(const Tuple& t1, const Tuple& t2)
   {
      if (std::get<0>(t1) < std::get<0>(t2))
      {
         return true;
      }
      if (std::get<0>(t1) > std::get<0>(t2))
      {
         return false;
      }
      return hypre_abs(std::get<1>(t1)) > hypre_abs(std::get<1>(t2));
   }
};

template<typename... T>
struct TupleComp3
{
   typedef std::tuple<T...> Tuple;
   bool operator()(const Tuple& t1, const Tuple& t2)
   {
      if (std::get<0>(t1) < std::get<0>(t2))
      {
         return true;
      }
      if (std::get<0>(t1) > std::get<0>(t2))
      {
         return false;
      }
      if (std::get<0>(t2) == std::get<1>(t2))
      {
         return false;
      }
      return std::get<0>(t1) == std::get<1>(t1) || std::get<1>(t1) < std::get<1>(t2);
   }
};

#endif // #if defined(HYPRE_USING_SYCL)

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      end of functions defined here
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* device_utils.c */
#if defined(HYPRE_USING_GPU)
dim3 hypre_GetDefaultDeviceBlockDimension();

dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim );

dim3 hypre_dim3(HYPRE_Int x);
dim3 hypre_dim3(HYPRE_Int x, HYPRE_Int y);
dim3 hypre_dim3(HYPRE_Int x, HYPRE_Int y, HYPRE_Int z);

template <typename T1, typename T2, typename T3>
HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2,
                                           T3 *vals, HYPRE_Int opt);

template <typename T1, typename T2, typename T3, typename T4>
HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2,
                                                T3 *vals1, T4 *vals2, HYPRE_Int opt);

template <typename T1, typename T2, typename T3>
HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in,
                                       T3 *vals_in, T1 *keys1_out, T2 *keys2_out,
                                       T3 *vals_out);

template <typename T>
HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v);

HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map,
                                    HYPRE_Real *y, char *work);

template <typename T>
HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz,
                                                    HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind);

#endif

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

#if defined(HYPRE_USING_CUDA)
cudaError_t hypre_CachingMallocDevice(void **ptr, size_t nbytes);

cudaError_t hypre_CachingMallocManaged(void **ptr, size_t nbytes);

cudaError_t hypre_CachingFreeDevice(void *ptr);

cudaError_t hypre_CachingFreeManaged(void *ptr);
#endif

hypre_cub_CachingDeviceAllocator * hypre_DeviceDataCubCachingAllocatorCreate(hypre_uint bin_growth,
                                                                             hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, bool skip_cleanup, bool debug,
                                                                             bool use_managed_memory);

void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data);

#endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

#if defined(HYPRE_USING_CUSPARSE)

cudaDataType hypre_HYPREComplexToCudaDataType();

#if CUSPARSE_VERSION >= CUSPARSE_NEWAPI_VERSION
cusparseIndexType_t hypre_HYPREIntToCusparseIndexType();
#endif

#endif // #if defined(HYPRE_USING_CUSPARSE)

#endif /* #ifndef HYPRE_CUDA_UTILS_H */
/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

/* CUDA reducer class */

#ifndef HYPRE_CUDA_REDUCER_H
#define HYPRE_CUDA_REDUCER_H

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
#if !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS)

template<typename T> void OneBlockReduce(T *d_arr, HYPRE_Int N, T *h_out);

struct HYPRE_double4
{
   HYPRE_Real x, y, z, w;

   __host__ __device__
   HYPRE_double4() {}

   __host__ __device__
   HYPRE_double4(HYPRE_Real x1, HYPRE_Real x2, HYPRE_Real x3, HYPRE_Real x4)
   {
      x = x1;
      y = x2;
      z = x3;
      w = x4;
   }

   __host__ __device__
   void operator=(HYPRE_Real val)
   {
      x = y = z = w = val;
   }

   __host__ __device__
   void operator+=(HYPRE_double4 rhs)
   {
      x += rhs.x;
      y += rhs.y;
      z += rhs.z;
      w += rhs.w;
   }

};

struct HYPRE_double6
{
   HYPRE_Real x, y, z, w, u, v;

   __host__ __device__
   HYPRE_double6() {}

   __host__ __device__
   HYPRE_double6(HYPRE_Real x1, HYPRE_Real x2, HYPRE_Real x3, HYPRE_Real x4,
                 HYPRE_Real x5, HYPRE_Real x6)
   {
      x = x1;
      y = x2;
      z = x3;
      w = x4;
      u = x5;
      v = x6;
   }

   __host__ __device__
   void operator=(HYPRE_Real val)
   {
      x = y = z = w = u = v = val;
   }

   __host__ __device__
   void operator+=(HYPRE_double6 rhs)
   {
      x += rhs.x;
      y += rhs.y;
      z += rhs.z;
      w += rhs.w;
      u += rhs.u;
      v += rhs.v;
   }

};

/* reduction within a warp */
__inline__ __host__ __device__
HYPRE_Real warpReduceSum(HYPRE_Real val)
{
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   for (HYPRE_Int offset = warpSize / 2; offset > 0; offset /= 2)
   {
      val += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val, offset);
   }
#endif
   return val;
}

__inline__ __host__ __device__
HYPRE_double4 warpReduceSum(HYPRE_double4 val)
{
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   for (HYPRE_Int offset = warpSize / 2; offset > 0; offset /= 2)
   {
      val.x += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.x, offset);
      val.y += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.y, offset);
      val.z += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.z, offset);
      val.w += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.w, offset);
   }
#endif
   return val;
}

__inline__ __host__ __device__
HYPRE_double6 warpReduceSum(HYPRE_double6 val)
{
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   for (HYPRE_Int offset = warpSize / 2; offset > 0; offset /= 2)
   {
      val.x += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.x, offset);
      val.y += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.y, offset);
      val.z += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.z, offset);
      val.w += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.w, offset);
      val.u += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.u, offset);
      val.v += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.v, offset);
   }
#endif
   return val;
}

/* reduction within a block */
template <typename T>
__inline__ __host__ __device__
T blockReduceSum(T val)
{
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   // Shared mem for HYPRE_WARP_SIZE partial sums
   __shared__ T shared[HYPRE_WARP_SIZE];

   HYPRE_Int lane = threadIdx.x & (warpSize - 1);
   HYPRE_Int wid  = threadIdx.x >> HYPRE_WARP_BITSHIFT;

   // Each warp performs partial reduction
   val = warpReduceSum(val);

   // Write reduced value to shared memory
   if (lane == 0)
   {
      shared[wid] = val;
   }

   // Wait for all partial reductions
   __syncthreads();

   // read from shared memory only if that warp existed
   if (threadIdx.x < (blockDim.x >> HYPRE_WARP_BITSHIFT))
   {
      val = shared[lane];
   }
   else
   {
      val = 0.0;
   }

   // Final reduce within first warp
   if (wid == 0)
   {
      val = warpReduceSum(val);
   }

#endif
   return val;
}

template<typename T>
__global__ void
OneBlockReduceKernel(hypre_DeviceItem &item,
                     T                *arr,
                     HYPRE_Int         N)
{
   T sum;

   sum = 0.0;

   if (threadIdx.x < N)
   {
      sum = arr[threadIdx.x];
   }

   sum = blockReduceSum(sum);

   if (threadIdx.x == 0)
   {
      arr[0] = sum;
   }
}

/* Reducer class */
template <typename T>
struct ReduceSum
{
   using value_type = T;

   T init;                    /* initial value passed in */
   mutable T __thread_sum;    /* place to hold local sum of a thread,
                                 and partial sum of a block */
   T *d_buf;                  /* place to store partial sum within blocks
                                 in the 1st round, used in the 2nd round */
   HYPRE_Int nblocks;         /* number of blocks used in the 1st round */

   /* constructor
    * val is the initial value (added to the reduced sum) */
   __host__
   ReduceSum(T val)
   {
      init = val;
      __thread_sum = 0.0;
      nblocks = -1;
   }

   /* copy constructor */
   __host__ __device__
   ReduceSum(const ReduceSum<T>& other)
   {
      *this = other;
   }

   __host__ void
   Allocate2ndPhaseBuffer()
   {
      if (hypre_HandleReduceBuffer(hypre_handle()) == NULL)
      {
         /* allocate for the max size for reducing double6 type */
         hypre_HandleReduceBuffer(hypre_handle()) =
            hypre_TAlloc(HYPRE_double6, HYPRE_MAX_NTHREADS_BLOCK, HYPRE_MEMORY_DEVICE);
      }

      d_buf = (T*) hypre_HandleReduceBuffer(hypre_handle());
   }

   /* reduction within blocks */
   __host__ __device__
   void BlockReduce() const
   {
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
      __thread_sum = blockReduceSum(__thread_sum);
      if (threadIdx.x == 0)
      {
         d_buf[blockIdx.x] = __thread_sum;
      }
#endif
   }

   __host__ __device__
   void operator+=(T val) const
   {
      __thread_sum += val;
   }

   /* invoke the 2nd reduction at the time want the sum from the reducer */
   __host__
   operator T()
   {
      T val;

      const HYPRE_MemoryLocation memory_location = hypre_HandleMemoryLocation(hypre_handle());
      const HYPRE_ExecutionPolicy exec_policy = hypre_GetExecPolicy1(memory_location);

      if (exec_policy == HYPRE_EXEC_HOST)
      {
         val = __thread_sum;
         val += init;
      }
      else
      {
         /* 2nd reduction with only *one* block */
         hypre_assert(nblocks >= 0 && nblocks <= HYPRE_MAX_NTHREADS_BLOCK);
         const dim3 gDim(1), bDim(HYPRE_MAX_NTHREADS_BLOCK);
         HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
         hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
         val += init;
      }

      return val;
   }

   /* destructor */
   __host__ __device__
   ~ReduceSum<T>()
   {
   }
};

#endif /* #if !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) */
#endif /* #if defined(HYPRE_USING_CUDA)  || defined(HYPRE_USING_HIP) */
#endif /* #ifndef HYPRE_CUDA_REDUCER_H */
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

#ifndef HYPRE_CUB_ALLOCATOR_HEADER
#define HYPRE_CUB_ALLOCATOR_HEADER

#if defined(HYPRE_USING_CUDA) && defined(HYPRE_USING_DEVICE_POOL)

#include <set>
#include <map>

#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
    #include <mutex>
#else
    #if defined(_WIN32) || defined(_WIN64)
        #include <intrin.h>

        #define WIN32_LEAN_AND_MEAN
        #define NOMINMAX
        #include <windows.h>
        #undef WIN32_LEAN_AND_MEAN
        #undef NOMINMAX

        /**
         * Compiler read/write barrier
         */
        #pragma intrinsic(_ReadWriteBarrier)

    #endif
#endif

/**
 * Simple portable mutex
 *   - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms)
 *   - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++)
 */
struct hypre_cub_Mutex
{
#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)

    std::mutex mtx;

    void Lock()
    {
        mtx.lock();
    }

    void Unlock()
    {
        mtx.unlock();
    }

    void TryLock()
    {
        mtx.try_lock();
    }

#else       //__cplusplus > 199711L

    #if defined(_MSC_VER)

        // Microsoft VC++
        typedef hypre_longint Spinlock;

    #else

        // GNU g++
        typedef hypre_int Spinlock;

        /**
         * Compiler read/write barrier
         */
        __forceinline__ void _ReadWriteBarrier()
        {
            __sync_synchronize();
        }

        /**
         * Atomic exchange
         */
        __forceinline__ hypre_longint _InterlockedExchange(volatile hypre_int * const Target, const hypre_int Value)
        {
            // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
            _ReadWriteBarrier();
            return __sync_lock_test_and_set(Target, Value);
        }

        /**
         * Pause instruction to prevent excess processor bus usage
         */
        __forceinline__ void YieldProcessor()
        {
        }

    #endif  // defined(_MSC_VER)

        /// Lock member
        volatile Spinlock lock;

        /**
         * Constructor
         */
        hypre_cub_Mutex() : lock(0) {}

        /**
         * Return when the specified spinlock has been acquired
         */
        __forceinline__ void Lock()
        {
            while (1)
            {
                if (!_InterlockedExchange(&lock, 1)) return;
                while (lock) YieldProcessor();
            }
        }


        /**
         * Release the specified spinlock
         */
        __forceinline__ void Unlock()
        {
            _ReadWriteBarrier();
            lock = 0;
        }

#endif      // __cplusplus > 199711L

};

#include <math.h>

/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

/**
 * \brief A simple caching allocator for device memory allocations.
 *
 * \par Overview
 * The allocator is thread-safe and stream-safe and is capable of managing cached
 * device allocations on multiple devices.  It behaves as follows:
 *
 * \par
 * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
 *   the allocation becomes available immediately for reuse within the \p active_stream
 *   with which it was associated with during allocation, and it becomes available for
 *   reuse within other streams when all prior work submitted to \p active_stream has completed.
 * - Allocations are categorized and cached by bin size.  A new allocation request of
 *   a given size will only consider cached allocations within the corresponding bin.
 * - Bin limits progress geometrically in accordance with the growth factor
 *   \p bin_growth provided during construction.  Unused device allocations within
 *   a larger bin cache are not reused for allocation requests that categorize to
 *   smaller bin sizes.
 * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
 *   (\p bin_growth ^ \p min_bin).
 * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
 *   bin and are simply freed when they are deallocated instead of being returned
 *   to a bin-cache.
 * - %If the total storage of cached allocations on a given device will exceed
 *   \p max_cached_bytes, allocations for that device are simply freed when they are
 *   deallocated instead of being returned to their bin-cache.
 *
 * \par
 * For example, the default-constructed CachingDeviceAllocator is configured with:
 * - \p bin_growth          = 8
 * - \p min_bin             = 3
 * - \p max_bin             = 7
 * - \p max_cached_bytes    = 6MB - 1B
 *
 * \par
 * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
 * and sets a maximum of 6,291,455 cached bytes per device
 *
 */
struct hypre_cub_CachingDeviceAllocator
{
   typedef char value_type;

   //---------------------------------------------------------------------
   // Constants
   //---------------------------------------------------------------------

   /// Out-of-bounds bin
   static const hypre_uint INVALID_BIN = (hypre_uint) -1;

   /// Invalid size
   static const size_t INVALID_SIZE = (size_t) -1;

   /// Invalid device ordinal
   static const hypre_int INVALID_DEVICE_ORDINAL = -1;

   //---------------------------------------------------------------------
   // Type definitions and helper types
   //---------------------------------------------------------------------

   /**
    * Descriptor for device memory allocations
    */
   struct BlockDescriptor
   {
      void*           d_ptr;              // Device pointer
      size_t          bytes;              // Size of allocation in bytes
      hypre_uint      bin;                // Bin enumeration
      hypre_int       device;             // device ordinal
      cudaStream_t    associated_stream;  // Associated associated_stream
      cudaEvent_t     ready_event;        // Signal when associated stream has run to the point at which this block was freed

      // Constructor (suitable for searching maps for a specific block, given its pointer and device)
      BlockDescriptor(void *d_ptr, hypre_int device) :
         d_ptr(d_ptr),
         bytes(0),
         bin(INVALID_BIN),
         device(device),
         associated_stream(0),
         ready_event(0)
      {}

      // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
      BlockDescriptor(hypre_int device) :
         d_ptr(NULL),
         bytes(0),
         bin(INVALID_BIN),
         device(device),
         associated_stream(0),
         ready_event(0)
      {}

      // Comparison functor for comparing device pointers
      static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
      {
         if (a.device == b.device)
            return (a.d_ptr < b.d_ptr);
         else
            return (a.device < b.device);
      }

      // Comparison functor for comparing allocation sizes
      static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
      {
         if (a.device == b.device)
            return (a.bytes < b.bytes);
         else
            return (a.device < b.device);
      }
   };

   /// BlockDescriptor comparator function interface
   typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

   class TotalBytes {
      public:
         size_t free;
         size_t live;
         TotalBytes() { free = live = 0; }
   };

   /// Set type for cached blocks (ordered by size)
   typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

   /// Set type for live blocks (ordered by ptr)
   typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

   /// Map type of device ordinals to the number of cached bytes cached by each device
   typedef std::map<hypre_int, TotalBytes> GpuCachedBytes;


   //---------------------------------------------------------------------
   // Utility functions
   //---------------------------------------------------------------------

   /**
    * Integer pow function for unsigned base and exponent
    */
   static hypre_uint IntPow(
         hypre_uint base,
         hypre_uint exp)
   {
      hypre_uint retval = 1;
      while (exp > 0)
      {
         if (exp & 1) {
            retval = retval * base;        // multiply the result by the current base
         }
         base = base * base;                // square the base
         exp = exp >> 1;                    // divide the exponent in half
      }
      return retval;
   }


   /**
    * Round up to the nearest power-of
    */
   void NearestPowerOf(
         hypre_uint      &power,
         size_t          &rounded_bytes,
         hypre_uint       base,
         size_t           value)
   {
      power = 0;
      rounded_bytes = 1;

      if (value * base < value)
      {
         // Overflow
         power = sizeof(size_t) * 8;
         rounded_bytes = size_t(0) - 1;
         return;
      }

      while (rounded_bytes < value)
      {
         rounded_bytes *= base;
         power++;
      }
   }


   //---------------------------------------------------------------------
   // Fields
   //---------------------------------------------------------------------

   hypre_cub_Mutex mutex;              /// Mutex for thread-safety

   hypre_uint      bin_growth;         /// Geometric growth factor for bin-sizes
   hypre_uint      min_bin;            /// Minimum bin enumeration
   hypre_uint      max_bin;            /// Maximum bin enumeration

   size_t          min_bin_bytes;      /// Minimum bin size
   size_t          max_bin_bytes;      /// Maximum bin size
   size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device

   const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
   bool            debug;              /// Whether or not to print (de)allocation events to stdout

   GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
   CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
   BusyBlocks      live_blocks;        /// Set of live device allocations currently in use

   bool            use_managed_memory; /// Whether to use managed memory or device memory

   //---------------------------------------------------------------------
   // Methods
   //---------------------------------------------------------------------

   /**
    * \brief Constructor.
    */
   hypre_cub_CachingDeviceAllocator(
         hypre_uint      bin_growth,                             ///< Geometric growth factor for bin-sizes
         hypre_uint      min_bin             = 1,                ///< Minimum bin (default is bin_growth ^ 1)
         hypre_uint      max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
         size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
         bool            skip_cleanup        = false,            ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
         bool            debug               = false,            ///< Whether or not to print (de)allocation events to stdout (default is no stderr output)
         bool            use_managed_memory  = false)            ///< Whether to use managed memory or device memory
      :
         bin_growth(bin_growth),
         min_bin(min_bin),
         max_bin(max_bin),
         min_bin_bytes(IntPow(bin_growth, min_bin)),
         max_bin_bytes(IntPow(bin_growth, max_bin)),
         max_cached_bytes(max_cached_bytes),
         skip_cleanup(skip_cleanup),
         debug(debug),
         use_managed_memory(use_managed_memory),
         cached_blocks(BlockDescriptor::SizeCompare),
         live_blocks(BlockDescriptor::PtrCompare)
   {}


   /**
    * \brief Default constructor.
    *
    * Configured with:
    * \par
    * - \p bin_growth          = 8
    * - \p min_bin             = 3
    * - \p max_bin             = 7
    * - \p max_cached_bytes    = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
    *
    * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
    * sets a maximum of 6,291,455 cached bytes per device
    */
   hypre_cub_CachingDeviceAllocator(
         bool skip_cleanup = false,
         bool debug = false,
         bool use_managed_memory = false)
      :
         bin_growth(8),
         min_bin(3),
         max_bin(7),
         min_bin_bytes(IntPow(bin_growth, min_bin)),
         max_bin_bytes(IntPow(bin_growth, max_bin)),
         max_cached_bytes((max_bin_bytes * 3) - 1),
         skip_cleanup(skip_cleanup),
         debug(debug),
         use_managed_memory(use_managed_memory),
         cached_blocks(BlockDescriptor::SizeCompare),
         live_blocks(BlockDescriptor::PtrCompare)
   {}


   /**
    * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
    *
    * Changing the ceiling of cached bytes does not cause any allocations (in-use or
    * cached-in-reserve) to be freed.  See \p FreeAllCached().
    */
   cudaError_t SetMaxCachedBytes(
         size_t max_cached_bytes)
   {
      // Lock
      mutex.Lock();

      if (debug) printf("Changing max_cached_bytes (%zu -> %zu)\n", this->max_cached_bytes, max_cached_bytes);

      this->max_cached_bytes = max_cached_bytes;

      // Unlock
      mutex.Unlock();

      return cudaSuccess;
   }


   /**
    * \brief Provides a suitable allocation of device memory for the given size on the specified device.
    *
    * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
    * with which it was associated with during allocation, and it becomes available for reuse within other
    * streams when all prior work submitted to \p active_stream has completed.
    */
   cudaError_t DeviceAllocate(
         hypre_int       device,             ///< [in] Device on which to place the allocation
         void            **d_ptr,            ///< [out] Reference to pointer to the allocation
         size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
         cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
   {
      *d_ptr                          = NULL;
      hypre_int entrypoint_device     = INVALID_DEVICE_ORDINAL;
      cudaError_t error               = cudaSuccess;

      if (device == INVALID_DEVICE_ORDINAL)
      {
         if ((error = cudaGetDevice(&entrypoint_device))) return error;
         device = entrypoint_device;
      }

      // Create a block descriptor for the requested allocation
      bool found = false;
      BlockDescriptor search_key(device);
      search_key.associated_stream = active_stream;
      NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

      if (search_key.bin > max_bin)
      {
         // Bin is greater than our maximum bin: allocate the request
         // exactly and give out-of-bounds bin.  It will not be cached
         // for reuse when returned.
         search_key.bin      = INVALID_BIN;
         search_key.bytes    = bytes;
      }
      else
      {
         // Search for a suitable cached allocation: lock
         mutex.Lock();

         if (search_key.bin < min_bin)
         {
            // Bin is less than minimum bin: round up
            search_key.bin      = min_bin;
            search_key.bytes    = min_bin_bytes;
         }

         // Iterate through the range of cached blocks on the same device in the same bin
         CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
         while ((block_itr != cached_blocks.end())
               && (block_itr->device == device)
               && (block_itr->bin == search_key.bin))
         {
            // To prevent races with reusing blocks returned by the host but still
            // in use by the device, only consider cached blocks that are
            // either (from the active stream) or (from an idle stream)
            if ((active_stream == block_itr->associated_stream) ||
                  (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
            {
               // Reuse existing cache block.  Insert into live blocks.
               found = true;
               search_key = *block_itr;
               search_key.associated_stream = active_stream;
               live_blocks.insert(search_key);

               // Remove from free blocks
               cached_bytes[device].free -= search_key.bytes;
               cached_bytes[device].live += search_key.bytes;

               if (debug) printf("\tDevice %d reused cached block at %p (%zu bytes) for stream %p (previously associated with stream %p).\n",
                     device, search_key.d_ptr, search_key.bytes, search_key.associated_stream, block_itr->associated_stream);

               cached_blocks.erase(block_itr);

               break;
            }
            block_itr++;
         }

         // Done searching: unlock
         mutex.Unlock();
      }

      // Allocate the block if necessary
      if (!found)
      {
         // Set runtime's current device to specified device (entrypoint may not be set)
         if (device != entrypoint_device)
         {
            if ((error = cudaGetDevice(&entrypoint_device))) return error;
            if ((error = cudaSetDevice(device))) return error;
         }

         // Attempt to allocate

         if (use_managed_memory)
         {
            error = cudaMallocManaged(&search_key.d_ptr, search_key.bytes);
         }
         else
         {
            error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
         }
         if ((error) == cudaErrorMemoryAllocation)
         {
            // The allocation attempt failed: free all cached blocks on device and retry
            if (debug) printf("\tDevice %d failed to allocate %zu bytes for stream %p, retrying after freeing cached allocations",
                  device, search_key.bytes, search_key.associated_stream);

            error = cudaSuccess;    // Reset the error we will return
            cudaGetLastError();     // Reset CUDART's error

            // Lock
            mutex.Lock();

            // Iterate the range of free blocks on the same device
            BlockDescriptor free_key(device);
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

            while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
            {
               // No need to worry about synchronization with the device: cudaFree is
               // blocking and will synchronize across all kernels executing
               // on the current device

               // Free device memory and destroy stream event.
               if ((error = cudaFree(block_itr->d_ptr))) break;
               if ((error = cudaEventDestroy(block_itr->ready_event))) break;

               // Reduce balance and erase entry
               cached_bytes[device].free -= block_itr->bytes;

               if (debug) printf("\tDevice %d freed %zu bytes.\n\t\t  %zu available blocks cached (%zu bytes), %zu live blocks (%zu bytes) outstanding.\n",
                     device, block_itr->bytes, cached_blocks.size(), cached_bytes[device].free, live_blocks.size(), cached_bytes[device].live);

               cached_blocks.erase(block_itr);

               block_itr++;
            }

            // Unlock
            mutex.Unlock();

            // Return under error
            if (error) return error;

            // Try to allocate again

            if (use_managed_memory)
            {
               error = cudaMallocManaged(&search_key.d_ptr, search_key.bytes);
            }
            else
            {
               error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
            }
            if ((error)) return error;
         }

         // Create ready event
         if ((error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
            return error;

         // Insert into live blocks
         mutex.Lock();
         live_blocks.insert(search_key);
         cached_bytes[device].live += search_key.bytes;
         mutex.Unlock();

         if (debug) printf("\tDevice %d allocated new device block at %p (%zu bytes associated with stream %p).\n",
               device, search_key.d_ptr, search_key.bytes, search_key.associated_stream);

         // Attempt to revert back to previous device if necessary
         if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
         {
            if ((error = cudaSetDevice(entrypoint_device))) return error;
         }
      }

      // Copy device pointer to output parameter
      *d_ptr = search_key.d_ptr;

      if (debug) printf("\t\t%zu available blocks cached (%zu bytes), %zu live blocks outstanding(%zu bytes).\n",
            cached_blocks.size(), cached_bytes[device].free, live_blocks.size(), cached_bytes[device].live);

      return error;
   }


   /**
    * \brief Provides a suitable allocation of device memory for the given size on the current device.
    *
    * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
    * with which it was associated with during allocation, and it becomes available for reuse within other
    * streams when all prior work submitted to \p active_stream has completed.
    */
   cudaError_t DeviceAllocate(
         void            **d_ptr,            ///< [out] Reference to pointer to the allocation
         size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
         cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
   {
      return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
   }

   char * allocate(size_t bytes)
   {
      char *ptr;
      DeviceAllocate((void **) &ptr, bytes);

      return ptr;
   }


   /**
    * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
    *
    * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
    * with which it was associated with during allocation, and it becomes available for reuse within other
    * streams when all prior work submitted to \p active_stream has completed.
    */
   cudaError_t DeviceFree(
         hypre_int       device,
         void*           d_ptr)
   {
      hypre_int entrypoint_device     = INVALID_DEVICE_ORDINAL;
      cudaError_t error               = cudaSuccess;

      if (device == INVALID_DEVICE_ORDINAL)
      {
         if ((error = cudaGetDevice(&entrypoint_device)))
            return error;
         device = entrypoint_device;
      }

      // Lock
      mutex.Lock();

      // Find corresponding block descriptor
      bool recached = false;
      BlockDescriptor search_key(d_ptr, device);
      BusyBlocks::iterator block_itr = live_blocks.find(search_key);
      if (block_itr != live_blocks.end())
      {
         // Remove from live blocks
         search_key = *block_itr;
         live_blocks.erase(block_itr);
         cached_bytes[device].live -= search_key.bytes;

         // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
         if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
         {
            // Insert returned allocation into free blocks
            recached = true;
            cached_blocks.insert(search_key);
            cached_bytes[device].free += search_key.bytes;

            if (debug) printf("\tDevice %d returned %zu bytes from associated stream %p.\n\t\t %zu available blocks cached (%zu bytes), %zu live blocks outstanding. (%zu bytes)\n",
                  device, search_key.bytes, search_key.associated_stream, cached_blocks.size(),
                  cached_bytes[device].free, live_blocks.size(), cached_bytes[device].live);
         }
      }

      // Unlock
      mutex.Unlock();

      // First set to specified device (entrypoint may not be set)
      if (device != entrypoint_device)
      {
         if ((error = cudaGetDevice(&entrypoint_device))) return error;
         if ((error = cudaSetDevice(device))) return error;
      }

      if (recached)
      {
         // Insert the ready event in the associated stream (must have current device set properly)
         if ((error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
      }
      else
      {
         // Free the allocation from the runtime and cleanup the event.
         if ((error = cudaFree(d_ptr))) return error;
         if ((error = cudaEventDestroy(search_key.ready_event))) return error;

         if (debug) printf("\tDevice %d freed %zu bytes from associated stream %p.\n\t\t  %zu available blocks cached (%zu bytes), %zu live blocks (%zu bytes) outstanding.\n",
               device, search_key.bytes, search_key.associated_stream, cached_blocks.size(), cached_bytes[device].free, live_blocks.size(), cached_bytes[device].live);
      }

      // Reset device
      if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
      {
         if ((error = cudaSetDevice(entrypoint_device))) return error;
      }

      return error;
   }

   /**
    * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
    *
    * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
    * with which it was associated with during allocation, and it becomes available for reuse within other
    * streams when all prior work submitted to \p active_stream has completed.
    */
   cudaError_t DeviceFree(
         void*           d_ptr)
   {
      return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
   }

   void deallocate(char *ptr, size_t)
   {
      DeviceFree((void *) ptr);
   }

   /**
    * \brief Frees all cached device allocations on all devices
    */
   cudaError_t FreeAllCached()
   {
      cudaError_t error           = cudaSuccess;
      hypre_int entrypoint_device = INVALID_DEVICE_ORDINAL;
      hypre_int current_device    = INVALID_DEVICE_ORDINAL;

      mutex.Lock();

      while (!cached_blocks.empty())
      {
         // Get first block
         CachedBlocks::iterator begin = cached_blocks.begin();

         // Get entry-point device ordinal if necessary
         if (entrypoint_device == INVALID_DEVICE_ORDINAL)
         {
            if ((error = cudaGetDevice(&entrypoint_device))) break;
         }

         // Set current device ordinal if necessary
         if (begin->device != current_device)
         {
            if ((error = cudaSetDevice(begin->device))) break;
            current_device = begin->device;
         }

         // Free device memory
         if ((error = cudaFree(begin->d_ptr))) break;
         if ((error = cudaEventDestroy(begin->ready_event))) break;

         // Reduce balance and erase entry
         cached_bytes[current_device].free -= begin->bytes;

         if (debug) printf("\tDevice %d freed %zu bytes.\n\t\t  %zu available blocks cached (%zu bytes), %zu live blocks (%zu bytes) outstanding.\n",
               current_device, begin->bytes, cached_blocks.size(), cached_bytes[current_device].free, live_blocks.size(), cached_bytes[current_device].live);

         cached_blocks.erase(begin);
      }

      mutex.Unlock();

      // Attempt to revert back to entry-point device if necessary
      if (entrypoint_device != INVALID_DEVICE_ORDINAL)
      {
         if ((error = cudaSetDevice(entrypoint_device))) return error;
      }

      return error;
   }


   /**
    * \brief Destructor
    */
   virtual ~hypre_cub_CachingDeviceAllocator()
   {
      if (!skip_cleanup)
         FreeAllCached();
   }
};

#endif // #if defined(HYPRE_USING_CUDA) && defined(HYPRE_USING_DEVICE_POOL)
#endif // #ifndef HYPRE_CUB_ALLOCATOR_HEADER


#ifdef __cplusplus
}
#endif

#endif

