20#ifdef DFTFE_WITH_DEVICE
21# ifndef dftfeDeviceKernelLauncherHelpers_h
22# define dftfeDeviceKernelLauncherHelpers_h
24# ifdef DFTFE_WITH_DEVICE_NVIDIA
29 static const int DEVICE_WARP_SIZE = 32;
30 static const int DEVICE_MAX_BLOCK_SIZE = 1024;
31 static const int DEVICE_BLOCK_SIZE = 256;
36# elif DFTFE_WITH_DEVICE_AMD
42 static const int DEVICE_WARP_SIZE = 64;
43 static const int DEVICE_MAX_BLOCK_SIZE = 1024;
44 static const int DEVICE_BLOCK_SIZE = 512;
49# elif DFTFE_WITH_DEVICE_INTEL
55 static const int DEVICE_WARP_SIZE = 32;
56 static const int DEVICE_MAX_BLOCK_SIZE = 1024;
57 static const int DEVICE_BLOCK_SIZE = 256;
63# ifdef DFTFE_WITH_DEVICE_LANG_CUDA
64# define DFTFE_LAUNCH_KERNEL(kernel, grid, block, stream, ...) \
67 kernel<<<grid, block, 0, stream>>>(__VA_ARGS__); \
69# elif defined(DFTFE_WITH_DEVICE_LANG_HIP)
70# define DFTFE_LAUNCH_KERNEL(kernel, grid, block, stream, ...) \
74 HIP_KERNEL_NAME(kernel), grid, block, 0, stream, __VA_ARGS__); \
76# elif defined(DFTFE_WITH_DEVICE_LANG_SYCL)
77# define DFTFE_LAUNCH_KERNEL(kernel, grid, block, stream, ...) \
80 dftfe::utils::queueRegistry.find(stream)->second.parallel_for( \
81 sycl::nd_range<1>((grid) * (block), block), \
82 [=](sycl::nd_item<1> ind) { kernel(ind, __VA_ARGS__); }); \
86 "No device backend defined (DFTFE_WITH_DEVICE_LANG_CUDA or DFTFE_WITH_DEVICE_LANG_HIP or DFTFE_WITH_DEVICE_LANG_SYCL)"
89# ifdef DFTFE_WITH_DEVICE_LANG_CUDA
90# define DFTFE_LAUNCH_KERNEL_SMEM_D( \
91 kernel, grid, block, smemtype, smemcount, stream, ...) \
94 kernel<<<grid, block, smemcount * sizeof(smemtype), stream>>>( \
97# elif defined(DFTFE_WITH_DEVICE_LANG_HIP)
98# define DFTFE_LAUNCH_KERNEL_SMEM_D( \
99 kernel, grid, block, smemtype, smemcount, stream, ...) \
102 hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), \
105 smemcount * sizeof(smemtype), \
109# elif defined(DFTFE_WITH_DEVICE_LANG_SYCL)
110# define DFTFE_LAUNCH_KERNEL_SMEM_D( \
111 kernel, grid, block, smemtype, smemcount, stream, ...) \
114 dftfe::utils::queueRegistry.find(stream)->second.submit( \
115 [=](sycl::handler &cgh) { \
116 sycl::local_accessor<smemtype, 1> SMem_acc(smemcount, cgh); \
117 cgh.parallel_for(sycl::nd_range<1>((grid) * (block), block), \
118 [=](sycl::nd_item<1> ind) { \
120 SMem_acc.get_pointer(), \
127 "No device backend defined (DFTFE_WITH_DEVICE_LANG_CUDA or DFTFE_WITH_DEVICE_LANG_HIP or DFTFE_WITH_DEVICE_LANG_SYCL)"
130# ifdef DFTFE_WITH_DEVICE_LANG_CUDA
131# define DFTFE_LAUNCH_KERNEL_SMEM_S( \
132 kernel, grid, block, smemtype, smemcount, stream, ...) \
135 kernel<<<grid, block, 0, stream>>>(__VA_ARGS__); \
137# elif defined(DFTFE_WITH_DEVICE_LANG_HIP)
138# define DFTFE_LAUNCH_KERNEL_SMEM_S( \
139 kernel, grid, block, smemtype, smemcount, stream, ...) \
142 hipLaunchKernelGGL( \
143 HIP_KERNEL_NAME(kernel), grid, block, 0, stream, __VA_ARGS__); \
145# elif defined(DFTFE_WITH_DEVICE_LANG_SYCL)
146# define DFTFE_LAUNCH_KERNEL_SMEM_S( \
147 kernel, grid, block, smemtype, smemcount, stream, ...) \
150 dftfe::utils::queueRegistry.find(stream)->second.submit( \
151 [=](sycl::handler &cgh) { \
152 sycl::local_accessor<smemtype, 1> SMem_acc(smemcount, cgh); \
153 cgh.parallel_for(sycl::nd_range<1>((grid) * (block), block), \
154 [=](sycl::nd_item<1> ind) { \
156 SMem_acc.get_pointer(), \
163 "No device backend defined (DFTFE_WITH_DEVICE_LANG_CUDA or DFTFE_WITH_DEVICE_LANG_HIP or DFTFE_WITH_DEVICE_LANG_SYCL)"
167# define DFTFE_KERNEL_NAME(...) __VA_ARGS__
170# if defined(DFTFE_WITH_DEVICE_LANG_CUDA) || \
171 defined(DFTFE_WITH_DEVICE_LANG_HIP)
172# define DFTFE_CREATE_KERNEL(RET, NAME, BODY, ...) \
173 __global__ RET NAME(__VA_ARGS__) \
175 const dftfe::uInt globalThreadId = \
176 blockIdx.x * blockDim.x + threadIdx.x; \
177 const dftfe::uInt nThreadsPerBlock = blockDim.x; \
178 const dftfe::uInt nThreadBlock = gridDim.x; \
181# elif defined(DFTFE_WITH_DEVICE_LANG_SYCL)
182# define DFTFE_CREATE_KERNEL(RET, NAME, BODY, ...) \
183 RET NAME(sycl::nd_item<1> ind, __VA_ARGS__) \
185 const dftfe::uInt globalThreadId = ind.get_global_id(0); \
186 const dftfe::uInt nThreadsPerBlock = ind.get_local_range(0); \
187 const dftfe::uInt nThreadBlock = ind.get_group_range(0); \
192 "No device backend defined (DFTFE_WITH_DEVICE_LANG_CUDA or DFTFE_WITH_DEVICE_LANG_HIP or DFTFE_WITH_DEVICE_LANG_SYCL)"
195# if defined(DFTFE_WITH_DEVICE_LANG_CUDA) || \
196 defined(DFTFE_WITH_DEVICE_LANG_HIP)
197# define DFTFE_CREATE_KERNEL_SMEM_D(SMEMTYPE, RET, NAME, BODY, ...) \
198 __global__ RET NAME(__VA_ARGS__) \
200 extern __shared__ SMEMTYPE smem[]; \
201 const dftfe::uInt globalThreadId = \
202 blockIdx.x * blockDim.x + threadIdx.x; \
203 const dftfe::uInt threadId = threadIdx.x; \
204 const dftfe::uInt blockId = blockIdx.x; \
205 const dftfe::uInt nThreadsPerBlock = blockDim.x; \
206 const dftfe::uInt nThreadBlock = gridDim.x; \
209# elif defined(DFTFE_WITH_DEVICE_LANG_SYCL)
210# define DFTFE_CREATE_KERNEL_SMEM_D(SMEMTYPE, RET, NAME, BODY, ...) \
211 RET NAME(sycl::nd_item<1> ind, SMEMTYPE *smem, __VA_ARGS__) \
213 const dftfe::uInt globalThreadId = ind.get_global_id(0); \
214 const dftfe::uInt threadId = ind.get_local_id(0); \
215 const dftfe::uInt blockId = ind.get_group(0); \
216 const dftfe::uInt nThreadsPerBlock = ind.get_local_range(0); \
217 const dftfe::uInt nThreadBlock = ind.get_group_range(0); \
222 "No device backend defined (DFTFE_WITH_DEVICE_LANG_CUDA or DFTFE_WITH_DEVICE_LANG_HIP or DFTFE_WITH_DEVICE_LANG_SYCL)"
225# if defined(DFTFE_WITH_DEVICE_LANG_CUDA) || \
226 defined(DFTFE_WITH_DEVICE_LANG_HIP)
227# define DFTFE_CREATE_KERNEL_SMEM_S( \
228 SMEMTYPE, SMEMCOUNT, RET, NAME, BODY, ...) \
229 __global__ RET NAME(__VA_ARGS__) \
231 __shared__ SMEMTYPE smem[SMEMCOUNT]; \
232 const dftfe::uInt globalThreadId = \
233 blockIdx.x * blockDim.x + threadIdx.x; \
234 const dftfe::uInt threadId = threadIdx.x; \
235 const dftfe::uInt blockId = blockIdx.x; \
236 const dftfe::uInt nThreadsPerBlock = blockDim.x; \
237 const dftfe::uInt nThreadBlock = gridDim.x; \
240# elif defined(DFTFE_WITH_DEVICE_LANG_SYCL)
241# define DFTFE_CREATE_KERNEL_SMEM_S( \
242 SMEMTYPE, SMEMCOUNT, RET, NAME, BODY, ...) \
243 RET NAME(sycl::nd_item<1> ind, SMEMTYPE *smem, __VA_ARGS__) \
245 const dftfe::uInt globalThreadId = ind.get_global_id(0); \
246 const dftfe::uInt threadId = ind.get_local_id(0); \
247 const dftfe::uInt blockId = ind.get_group(0); \
248 const dftfe::uInt nThreadsPerBlock = ind.get_local_range(0); \
249 const dftfe::uInt nThreadBlock = ind.get_group_range(0); \
254 "No device backend defined (DFTFE_WITH_DEVICE_LANG_CUDA or DFTFE_WITH_DEVICE_LANG_HIP or DFTFE_WITH_DEVICE_LANG_SYCL)"
258# if defined(DFTFE_WITH_DEVICE_LANG_CUDA) || \
259 defined(DFTFE_WITH_DEVICE_LANG_HIP)
260# define SYNCTHREADS __syncthreads()
261# elif defined(DFTFE_WITH_DEVICE_LANG_SYCL)
262# define SYNCTHREADS sycl::group_barrier(ind.get_group());
265 "No device backend defined (DFTFE_WITH_DEVICE_LANG_CUDA or DFTFE_WITH_DEVICE_LANG_HIP or DFTFE_WITH_DEVICE_LANG_SYCL)"
Definition pseudoPotentialToDftfeConverter.cc:34