27#if defined(DFTFE_WITH_DEVICE)
34 template <dftfe::utils::MemorySpace memorySpace>
43 template <
typename ValueType>
50 template <
typename ValueType>
79 const std::complex<float> *alpha,
80 const std::complex<float> *A,
82 const std::complex<float> *B,
84 const std::complex<float> *beta,
85 std::complex<float> *C,
112 const std::complex<double> *alpha,
113 const std::complex<double> *A,
115 const std::complex<double> *B,
117 const std::complex<double> *beta,
118 std::complex<double> *C,
151 const std::complex<double> *alpha,
152 const std::complex<double> *A,
154 const std::complex<double> *x,
156 const std::complex<double> *beta,
157 std::complex<double> *y,
164 const std::complex<float> *alpha,
165 const std::complex<float> *A,
167 const std::complex<float> *x,
169 const std::complex<float> *beta,
170 std::complex<float> *y,
174 template <
typename ValueType1,
typename ValueType2>
187 template <
typename ValueType>
191 const ValueType *input1,
192 const ValueType *input2,
203 const MPI_Comm &mpi_communicator,
210 const std::complex<double> *x,
212 const MPI_Comm &mpi_communicator,
237 const MPI_Comm &mpi_communicator,
243 const std::complex<double> *X,
245 const std::complex<double> *Y,
247 std::complex<double> *result);
251 const std::complex<float> *X,
253 const std::complex<float> *Y,
255 std::complex<float> *result);
259 const std::complex<double> *X,
261 const std::complex<double> *Y,
263 const MPI_Comm &mpi_communicator,
264 std::complex<double> *result);
268 template <
typename ValueType>
274 const ValueType *onesVec,
275 ValueType *tempVector,
276 ValueType *tempResults,
280 template <
typename ValueType>
286 const ValueType *onesVec,
287 ValueType *tempVector,
288 ValueType *tempResults,
289 const MPI_Comm &mpi_communicator,
305 const std::complex<double> *alpha,
306 const std::complex<double> *x,
308 std::complex<double> *y,
323 const std::complex<float> *alpha,
324 const std::complex<float> *x,
326 std::complex<float> *y,
340 const std::complex<double> *x,
342 std::complex<double> *y,
356 const std::complex<float> *x,
358 std::complex<float> *y,
396 const std::complex<double> *alpha,
397 const std::complex<double> *A[],
399 const std::complex<double> *B[],
401 const std::complex<double> *beta,
402 std::complex<double> *C[],
429 const std::complex<float> *alpha,
430 const std::complex<float> *A[],
432 const std::complex<float> *B[],
434 const std::complex<float> *beta,
435 std::complex<float> *C[],
449 long long int strideA,
452 long long int strideB,
456 long long int strideC,
465 const std::complex<double> *alpha,
466 const std::complex<double> *A,
468 long long int strideA,
469 const std::complex<double> *B,
471 long long int strideB,
472 const std::complex<double> *beta,
473 std::complex<double> *C,
475 long long int strideC,
484 const std::complex<float> *alpha,
485 const std::complex<float> *A,
487 long long int strideA,
488 const std::complex<float> *B,
490 long long int strideB,
491 const std::complex<float> *beta,
492 std::complex<float> *C,
494 long long int strideC,
506 long long int strideA,
509 long long int strideB,
513 long long int strideC,
516 template <
typename ValueTypeComplex,
typename ValueTypeReal>
519 const ValueTypeComplex *complexArr,
520 ValueTypeReal *realArr,
521 ValueTypeReal *imagArr);
524 template <
typename ValueTypeComplex,
typename ValueTypeReal>
527 const ValueTypeReal *realArr,
528 const ValueTypeReal *imagArr,
529 ValueTypeComplex *complexArr);
531 template <
typename ValueType1,
typename ValueType2>
534 const ValueType1 *valueType1Arr,
535 ValueType2 *valueType2Arr);
538 template <
typename ValueType1,
typename ValueType2>
543 const ValueType1 *copyFromVec,
544 ValueType2 *copyToVecBlock,
545 const dftfe::uInt *copyFromVecStartingContiguousBlockIds);
548 template <
typename ValueType1,
typename ValueType2>
554 const ValueType1 *copyFromVec,
555 ValueType2 *copyToVecBlock,
556 const dftfe::uInt *copyFromVecStartingContiguousBlockIds);
558 template <
typename ValueType1,
typename ValueType2>
563 const ValueType1 *copyFromVecBlock,
564 ValueType2 *copyToVec,
565 const dftfe::uInt *copyFromVecStartingContiguousBlockIds);
567 template <
typename ValueType1,
typename ValueType2>
573 const ValueType1 *copyFromVec,
574 ValueType2 *copyToVec);
577 template <
typename ValueType1,
typename ValueType2>
585 const ValueType1 *copyFromVec,
586 ValueType2 *copyToVec);
589 template <
typename ValueType1,
typename ValueType2>
595 const ValueType1 *copyFromVec,
596 ValueType2 *copyToVec);
598 template <
typename ValueType1,
typename ValueType2>
602 const ValueType1 *addFromVec,
603 const ValueType2 *scalingVector,
605 ValueType1 *addToVec);
608 template <
typename ValueType1,
typename ValueType2>
612 const ValueType1 *addFromVec,
613 const ValueType2 *scalingVector,
616 ValueType1 *addToVec);
617 template <
typename ValueType1,
typename ValueType2>
620 const ValueType2 alpha,
622 const ValueType2 beta,
624 template <
typename ValueType0,
632 const ValueType0 alpha,
638 template <
typename ValueType>
643 const ValueType *addFromVec,
645 const dftfe::uInt *addToVecStartingContiguousBlockIds);
647 template <
typename ValueType1,
typename ValueType2,
typename ValueType3>
654 const ValueType2 *addFromVec,
655 ValueType3 *addToVec,
656 const dftfe::uInt *addToVecStartingContiguousBlockIds);
657 template <
typename ValueType1,
typename ValueType2,
typename ValueType3>
663 const ValueType2 *addFromVec,
664 ValueType3 *addToVec,
665 const dftfe::uInt *addToVecStartingContiguousBlockIds);
667 template <
typename ValueType1,
typename ValueType2>
675 template <
typename ValueType1,
typename ValueType2>
682 const ValueType2 *copyFromVec,
683 ValueType2 *copyToVecBlock,
684 const dftfe::uInt *copyFromVecStartingContiguousBlockIds);
686 template <
typename ValueType>
690 const ValueType *beta,
693 template <
typename ValueType>
698 const ValueType *beta,
701 template <
typename ValueType>
707 const ValueType *alpha,
709 const ValueType *beta,
712 template <
typename ValueType1,
typename ValueType2>
721#if defined(DFTFE_WITH_DEVICE)
723 enum class tensorOpDataType
737 template <
typename ValueType1,
typename ValueType2>
739 copyValueType1ArrToValueType2ArrDeviceCall(
741 const ValueType1 *valueType1Arr,
742 ValueType2 *valueType2Arr,
745 template <
typename ValueType>
752 template <
typename ValueType>
761 xgemm(
const char transA,
776 xgemm(
const char transA,
781 const std::complex<float> *alpha,
782 const std::complex<float> *A,
784 const std::complex<float> *B,
786 const std::complex<float> *beta,
787 std::complex<float> *C,
792 xgemm(
const char transA,
809 xgemm(
const char transA,
814 const std::complex<double> *alpha,
815 const std::complex<double> *A,
817 const std::complex<double> *B,
819 const std::complex<double> *beta,
820 std::complex<double> *C,
825 xgemv(
const char transA,
838 xgemv(
const char transA,
851 xgemv(
const char transA,
854 const std::complex<double> *alpha,
855 const std::complex<double> *A,
857 const std::complex<double> *x,
859 const std::complex<double> *beta,
860 std::complex<double> *y,
864 xgemv(
const char transA,
867 const std::complex<float> *alpha,
868 const std::complex<float> *A,
870 const std::complex<float> *x,
872 const std::complex<float> *beta,
873 std::complex<float> *y,
876 template <
typename ValueType>
878 addVecOverContinuousIndex(
const dftfe::uInt numContiguousBlocks,
880 const ValueType *input1,
881 const ValueType *input2,
886 template <
typename ValueType1,
typename ValueType2>
888 xscal(ValueType1 *x,
const ValueType2 alpha,
const dftfe::uInt n);
897 const MPI_Comm &mpi_communicator,
904 const std::complex<double> *x,
906 const MPI_Comm &mpi_communicator,
933 const MPI_Comm &mpi_communicator,
939 const std::complex<double> *X,
941 const std::complex<double> *Y,
943 std::complex<double> *result);
947 const std::complex<float> *X,
949 const std::complex<float> *Y,
951 std::complex<float> *result);
955 const std::complex<double> *X,
957 const std::complex<double> *Y,
959 const MPI_Comm &mpi_communicator,
960 std::complex<double> *result);
963 template <
typename ValueType>
965 MultiVectorXDot(
const dftfe::uInt contiguousBlockSize,
969 const ValueType *onesVec,
970 ValueType *tempVector,
971 ValueType *tempResults,
974 template <
typename ValueType>
976 MultiVectorXDot(
const dftfe::uInt contiguousBlockSize,
980 const ValueType *onesVec,
981 ValueType *tempVector,
982 ValueType *tempResults,
983 const MPI_Comm &mpi_communicator,
998 const std::complex<double> *alpha,
999 const std::complex<double> *x,
1001 std::complex<double> *y,
1015 const std::complex<double> *x,
1017 std::complex<double> *y,
1031 const std::complex<float> *x,
1033 std::complex<float> *y,
1038 xsymv(
const char UPLO,
1040 const double *alpha,
1050 xgemmBatched(
const char transA,
1055 const double *alpha,
1066 xgemmBatched(
const char transA,
1071 const std::complex<double> *alpha,
1072 const std::complex<double> *A[],
1074 const std::complex<double> *B[],
1076 const std::complex<double> *beta,
1077 std::complex<double> *C[],
1082 xgemmBatched(
const char transA,
1098 xgemmBatched(
const char transA,
1103 const std::complex<float> *alpha,
1104 const std::complex<float> *A[],
1106 const std::complex<float> *B[],
1108 const std::complex<float> *beta,
1109 std::complex<float> *C[],
1114 xgemmStridedBatched(
const char transA,
1119 const double *alpha,
1122 long long int strideA,
1125 long long int strideB,
1129 long long int strideC,
1133 xgemmStridedBatched(
const char transA,
1138 const std::complex<double> *alpha,
1139 const std::complex<double> *A,
1141 long long int strideA,
1142 const std::complex<double> *B,
1144 long long int strideB,
1145 const std::complex<double> *beta,
1146 std::complex<double> *C,
1148 long long int strideC,
1152 xgemmStridedBatched(
const char transA,
1157 const std::complex<float> *alpha,
1158 const std::complex<float> *A,
1160 long long int strideA,
1161 const std::complex<float> *B,
1163 long long int strideB,
1164 const std::complex<float> *beta,
1165 std::complex<float> *C,
1167 long long int strideC,
1171 xgemmStridedBatched(
const char transA,
1179 long long int strideA,
1182 long long int strideB,
1186 long long int strideC,
1189 template <
typename ValueTypeComplex,
typename ValueTypeReal>
1192 const ValueTypeComplex *complexArr,
1193 ValueTypeReal *realArr,
1194 ValueTypeReal *imagArr);
1197 template <
typename ValueTypeComplex,
typename ValueTypeReal>
1200 const ValueTypeReal *realArr,
1201 const ValueTypeReal *imagArr,
1202 ValueTypeComplex *complexArr);
1204 template <
typename ValueType1,
typename ValueType2>
1206 copyValueType1ArrToValueType2Arr(
const dftfe::uInt size,
1207 const ValueType1 *valueType1Arr,
1208 ValueType2 *valueType2Arr);
1211 template <
typename ValueType1,
typename ValueType2>
1216 const ValueType1 *copyFromVec,
1217 ValueType2 *copyToVecBlock,
1218 const dftfe::uInt *copyFromVecStartingContiguousBlockIds);
1220 template <
typename ValueType1,
typename ValueType2>
1226 const ValueType1 *copyFromVec,
1227 ValueType2 *copyToVecBlock,
1228 const dftfe::uInt *copyFromVecStartingContiguousBlockIds);
1231 template <
typename ValueType1,
typename ValueType2>
1233 stridedCopyFromBlock(
1236 const ValueType1 *copyFromVecBlock,
1237 ValueType2 *copyToVec,
1238 const dftfe::uInt *copyFromVecStartingContiguousBlockIds);
1240 template <
typename ValueType1,
typename ValueType2>
1242 stridedCopyToBlockConstantStride(
const dftfe::uInt blockSizeTo,
1246 const ValueType1 *copyFromVec,
1247 ValueType2 *copyToVec);
1250 template <
typename ValueType1,
typename ValueType2>
1252 stridedCopyConstantStride(
const dftfe::uInt blockSize,
1258 const ValueType1 *copyFromVec,
1259 ValueType2 *copyToVec);
1262 template <
typename ValueType1,
typename ValueType2>
1264 stridedCopyFromBlockConstantStride(
const dftfe::uInt blockSizeTo,
1268 const ValueType1 *copyFromVec,
1269 ValueType2 *copyToVec);
1270 template <
typename ValueType1,
typename ValueType2>
1273 const ValueType2 alpha,
1274 const ValueType1 *x,
1275 const ValueType2 beta,
1278 template <
typename ValueType1,
typename ValueType2>
1280 stridedBlockAxpy(
const dftfe::uInt contiguousBlockSize,
1282 const ValueType1 *addFromVec,
1283 const ValueType2 *scalingVector,
1285 ValueType1 *addToVec);
1286 template <
typename ValueType1,
typename ValueType2>
1288 stridedBlockAxpBy(
const dftfe::uInt contiguousBlockSize,
1290 const ValueType1 *addFromVec,
1291 const ValueType2 *scalingVector,
1294 ValueType1 *addToVec);
1296 template <
typename ValueType0,
1297 typename ValueType1,
1298 typename ValueType2,
1299 typename ValueType3,
1300 typename ValueType4>
1304 const ValueType0 alpha,
1305 const ValueType1 *A,
1306 const ValueType2 *B,
1307 const ValueType3 *D,
1311 template <
typename ValueType>
1313 axpyStridedBlockAtomicAdd(
1316 const ValueType *addFromVec,
1317 ValueType *addToVec,
1318 const dftfe::uInt *addToVecStartingContiguousBlockIds);
1320 template <
typename ValueType1,
typename ValueType2,
typename ValueType3>
1322 axpyStridedBlockAtomicAdd(
1326 const ValueType1 *s,
1327 const ValueType2 *addFromVec,
1328 ValueType3 *addToVec,
1329 const dftfe::uInt *addToVecStartingContiguousBlockIds);
1330 template <
typename ValueType1,
typename ValueType2,
typename ValueType3>
1332 axpyStridedBlockAtomicAdd(
1336 const ValueType2 *addFromVec,
1337 ValueType3 *addToVec,
1338 const dftfe::uInt *addToVecStartingContiguousBlockIds);
1340 template <
typename ValueType1,
typename ValueType2>
1342 stridedBlockScale(
const dftfe::uInt contiguousBlockSize,
1345 const ValueType1 *s,
1347 template <
typename ValueType1,
typename ValueType2>
1349 stridedBlockScaleCopy(
1353 const ValueType1 *s,
1354 const ValueType2 *copyFromVec,
1355 ValueType2 *copyToVecBlock,
1356 const dftfe::uInt *copyFromVecStartingContiguousBlockIds);
1358 template <
typename ValueType>
1360 stridedBlockScaleColumnWise(
const dftfe::uInt contiguousBlockSize,
1362 const ValueType *beta,
1365 template <
typename ValueType>
1367 stridedBlockScaleAndAddColumnWise(
const dftfe::uInt contiguousBlockSize,
1370 const ValueType *beta,
1373 template <
typename ValueType>
1375 stridedBlockScaleAndAddTwoVecColumnWise(
1379 const ValueType *alpha,
1381 const ValueType *beta,
1384 template <
typename ValueType1,
typename ValueType2>
1386 rightDiagonalScale(
const dftfe::uInt numberofVectors,
1392 getDeviceBlasHandle();
1395 template <
typename ValueType1,
typename ValueType2>
1397 copyBlockDiagonalValueType1OffDiagonalValueType2FromValueType1Arr(
1401 const ValueType1 *valueType1SrcArray,
1402 ValueType1 *valueType1DstArray,
1403 ValueType2 *valueType2DstArray);
1406 setTensorOpDataType(tensorOpDataType opType)
1418# ifdef DFTFE_WITH_DEVICE_AMD
1424 tensorOpDataType d_opType;
void xaxpy(const dftfe::uInt n, const std::complex< float > *alpha, const std::complex< float > *x, const dftfe::uInt incx, std::complex< float > *y, const dftfe::uInt incy)
void hadamardProduct(const dftfe::uInt m, const ValueType *X, const ValueType *Y, ValueType *output)
void stridedCopyToBlock(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const dftfe::uInt startingVecId, const ValueType1 *copyFromVec, ValueType2 *copyToVecBlock, const dftfe::uInt *copyFromVecStartingContiguousBlockIds)
void xgemmStridedBatched(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const float *alpha, const float *A, const dftfe::uInt lda, long long int strideA, const float *B, const dftfe::uInt ldb, long long int strideB, const float *beta, float *C, const dftfe::uInt ldc, long long int strideC, const dftfe::Int batchCount)
void xgemmStridedBatched(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const std::complex< float > *alpha, const std::complex< float > *A, const dftfe::uInt lda, long long int strideA, const std::complex< float > *B, const dftfe::uInt ldb, long long int strideB, const std::complex< float > *beta, std::complex< float > *C, const dftfe::uInt ldc, long long int strideC, const dftfe::Int batchCount)
void xcopy(const dftfe::uInt n, const float *x, const dftfe::uInt incx, float *y, const dftfe::uInt incy)
void xgemmBatched(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const std::complex< float > *alpha, const std::complex< float > *A[], const dftfe::uInt lda, const std::complex< float > *B[], const dftfe::uInt ldb, const std::complex< float > *beta, std::complex< float > *C[], const dftfe::uInt ldc, const dftfe::Int batchCount)
void xcopy(const dftfe::uInt n, const std::complex< float > *x, const dftfe::uInt incx, std::complex< float > *y, const dftfe::uInt incy)
void stridedBlockScaleCopy(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType1 a, const ValueType1 *s, const ValueType2 *copyFromVec, ValueType2 *copyToVecBlock, const dftfe::uInt *copyFromVecStartingContiguousBlockIds)
void xcopy(const dftfe::uInt n, const std::complex< double > *x, const dftfe::uInt incx, std::complex< double > *y, const dftfe::uInt incy)
void xdot(const dftfe::uInt N, const double *X, const dftfe::uInt INCX, const double *Y, const dftfe::uInt INCY, double *result)
void axpby(const dftfe::uInt n, const ValueType2 alpha, const ValueType1 *x, const ValueType2 beta, ValueType1 *y)
void xdot(const dftfe::uInt N, const std::complex< double > *X, const dftfe::uInt INCX, const std::complex< double > *Y, const dftfe::uInt INCY, std::complex< double > *result)
void xdot(const dftfe::uInt N, const float *X, const dftfe::uInt INCX, const float *Y, const dftfe::uInt INCY, float *result)
void copyValueType1ArrToValueType2Arr(const dftfe::uInt size, const ValueType1 *valueType1Arr, ValueType2 *valueType2Arr)
void xgemv(const char transA, const dftfe::uInt m, const dftfe::uInt n, const std::complex< double > *alpha, const std::complex< double > *A, const dftfe::uInt lda, const std::complex< double > *x, const dftfe::uInt incx, const std::complex< double > *beta, std::complex< double > *y, const dftfe::uInt incy)
void xgemv(const char transA, const dftfe::uInt m, const dftfe::uInt n, const float *alpha, const float *A, const dftfe::uInt lda, const float *x, const dftfe::uInt incx, const float *beta, float *y, const dftfe::uInt incy)
void MultiVectorXDot(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType *X, const ValueType *Y, const ValueType *onesVec, ValueType *tempVector, ValueType *tempResults, ValueType *result)
void xnrm2(const dftfe::uInt n, const std::complex< double > *x, const dftfe::uInt incx, const MPI_Comm &mpi_communicator, double *result)
void axpyStridedBlockAtomicAdd(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType1 a, const ValueType2 *addFromVec, ValueType3 *addToVec, const dftfe::uInt *addToVecStartingContiguousBlockIds)
void axpyStridedBlockAtomicAdd(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType1 a, const ValueType1 *s, const ValueType2 *addFromVec, ValueType3 *addToVec, const dftfe::uInt *addToVecStartingContiguousBlockIds)
void xnrm2(const dftfe::uInt n, const double *x, const dftfe::uInt incx, const MPI_Comm &mpi_communicator, double *result)
void xaxpy(const dftfe::uInt n, const double *alpha, const double *x, const dftfe::uInt incx, double *y, const dftfe::uInt incy)
void xgemmStridedBatched(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const double *alpha, const double *A, const dftfe::uInt lda, long long int strideA, const double *B, const dftfe::uInt ldb, long long int strideB, const double *beta, double *C, const dftfe::uInt ldc, long long int strideC, const dftfe::Int batchCount)
void axpyStridedBlockAtomicAdd(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType *addFromVec, ValueType *addToVec, const dftfe::uInt *addToVecStartingContiguousBlockIds)
void copyComplexArrToRealArrs(const dftfe::uInt size, const ValueTypeComplex *complexArr, ValueTypeReal *realArr, ValueTypeReal *imagArr)
void stridedBlockScale(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType1 a, const ValueType1 *s, ValueType2 *x)
void xgemmStridedBatched(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const std::complex< double > *alpha, const std::complex< double > *A, const dftfe::uInt lda, long long int strideA, const std::complex< double > *B, const dftfe::uInt ldb, long long int strideB, const std::complex< double > *beta, std::complex< double > *C, const dftfe::uInt ldc, long long int strideC, const dftfe::Int batchCount)
void xgemm(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const float *alpha, const float *A, const dftfe::uInt lda, const float *B, const dftfe::uInt ldb, const float *beta, float *C, const dftfe::uInt ldc)
void xgemv(const char transA, const dftfe::uInt m, const dftfe::uInt n, const std::complex< float > *alpha, const std::complex< float > *A, const dftfe::uInt lda, const std::complex< float > *x, const dftfe::uInt incx, const std::complex< float > *beta, std::complex< float > *y, const dftfe::uInt incy)
void xaxpy(const dftfe::uInt n, const std::complex< double > *alpha, const std::complex< double > *x, const dftfe::uInt incx, std::complex< double > *y, const dftfe::uInt incy)
void stridedBlockAxpBy(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType1 *addFromVec, const ValueType2 *scalingVector, const ValueType2 a, const ValueType2 b, ValueType1 *addToVec)
void xscal(ValueType1 *x, const ValueType2 alpha, const dftfe::uInt n)
void stridedCopyToBlock(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType1 *copyFromVec, ValueType2 *copyToVecBlock, const dftfe::uInt *copyFromVecStartingContiguousBlockIds)
void stridedCopyFromBlock(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType1 *copyFromVecBlock, ValueType2 *copyToVec, const dftfe::uInt *copyFromVecStartingContiguousBlockIds)
void xdot(const dftfe::uInt N, const std::complex< double > *X, const dftfe::uInt INCX, const std::complex< double > *Y, const dftfe::uInt INCY, const MPI_Comm &mpi_communicator, std::complex< double > *result)
void stridedBlockAxpy(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType1 *addFromVec, const ValueType2 *scalingVector, const ValueType2 a, ValueType1 *addToVec)
void stridedBlockScaleAndAddTwoVecColumnWise(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType *x, const ValueType *alpha, const ValueType *y, const ValueType *beta, ValueType *z)
void xgemm(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const std::complex< double > *alpha, const std::complex< double > *A, const dftfe::uInt lda, const std::complex< double > *B, const dftfe::uInt ldb, const std::complex< double > *beta, std::complex< double > *C, const dftfe::uInt ldc)
void hadamardProductWithConj(const dftfe::uInt m, const ValueType *X, const ValueType *Y, ValueType *output)
void xcopy(const dftfe::uInt n, const double *x, const dftfe::uInt incx, double *y, const dftfe::uInt incy)
void MultiVectorXDot(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType *X, const ValueType *Y, const ValueType *onesVec, ValueType *tempVector, ValueType *tempResults, const MPI_Comm &mpi_communicator, ValueType *result)
void xaxpy(const dftfe::uInt n, const float *alpha, const float *x, const dftfe::uInt incx, float *y, const dftfe::uInt incy)
void stridedBlockScaleColumnWise(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType *beta, ValueType *x)
void ApaBD(const dftfe::uInt m, const dftfe::uInt n, const ValueType0 alpha, const ValueType1 *A, const ValueType2 *B, const ValueType3 *D, ValueType4 *C)
void stridedCopyFromBlockConstantStride(const dftfe::uInt blockSizeTo, const dftfe::uInt blockSizeFrom, const dftfe::uInt numBlocks, const dftfe::uInt startingId, const ValueType1 *copyFromVec, ValueType2 *copyToVec)
void xdot(const dftfe::uInt N, const std::complex< float > *X, const dftfe::uInt INCX, const std::complex< float > *Y, const dftfe::uInt INCY, std::complex< float > *result)
void stridedBlockScaleAndAddColumnWise(const dftfe::uInt contiguousBlockSize, const dftfe::uInt numContiguousBlocks, const ValueType *x, const ValueType *beta, ValueType *y)
void xgemm(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const double *alpha, const double *A, const dftfe::uInt lda, const double *B, const dftfe::uInt ldb, const double *beta, double *C, const dftfe::uInt ldc)
void stridedCopyConstantStride(const dftfe::uInt blockSize, const dftfe::uInt strideTo, const dftfe::uInt strideFrom, const dftfe::uInt numBlocks, const dftfe::uInt startingToId, const dftfe::uInt startingFromId, const ValueType1 *copyFromVec, ValueType2 *copyToVec)
void xgemv(const char transA, const dftfe::uInt m, const dftfe::uInt n, const double *alpha, const double *A, const dftfe::uInt lda, const double *x, const dftfe::uInt incx, const double *beta, double *y, const dftfe::uInt incy)
void addVecOverContinuousIndex(const dftfe::uInt numContiguousBlocks, const dftfe::uInt contiguousBlockSize, const ValueType *input1, const ValueType *input2, ValueType *output)
void xgemmBatched(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const double *alpha, const double *A[], const dftfe::uInt lda, const double *B[], const dftfe::uInt ldb, const double *beta, double *C[], const dftfe::uInt ldc, const dftfe::Int batchCount)
void xsymv(const char UPLO, const dftfe::uInt N, const double *alpha, const double *A, const dftfe::uInt LDA, const double *X, const dftfe::uInt INCX, const double *beta, double *C, const dftfe::uInt INCY)
void stridedCopyToBlockConstantStride(const dftfe::uInt blockSizeTo, const dftfe::uInt blockSizeFrom, const dftfe::uInt numBlocks, const dftfe::uInt startingId, const ValueType1 *copyFromVec, ValueType2 *copyToVec)
void xgemmBatched(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const float *alpha, const float *A[], const dftfe::uInt lda, const float *B[], const dftfe::uInt ldb, const float *beta, float *C[], const dftfe::uInt ldc, const dftfe::Int batchCount)
void xdot(const dftfe::uInt N, const double *X, const dftfe::uInt INCX, const double *Y, const dftfe::uInt INCY, const MPI_Comm &mpi_communicator, double *result)
void rightDiagonalScale(const dftfe::uInt numberofVectors, const dftfe::uInt sizeOfVector, ValueType1 *X, ValueType2 *D)
void xgemm(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const std::complex< float > *alpha, const std::complex< float > *A, const dftfe::uInt lda, const std::complex< float > *B, const dftfe::uInt ldb, const std::complex< float > *beta, std::complex< float > *C, const dftfe::uInt ldc)
void copyRealArrsToComplexArr(const dftfe::uInt size, const ValueTypeReal *realArr, const ValueTypeReal *imagArr, ValueTypeComplex *complexArr)
void xgemmBatched(const char transA, const char transB, const dftfe::uInt m, const dftfe::uInt n, const dftfe::uInt k, const std::complex< double > *alpha, const std::complex< double > *A[], const dftfe::uInt lda, const std::complex< double > *B[], const dftfe::uInt ldb, const std::complex< double > *beta, std::complex< double > *C[], const dftfe::uInt ldc, const dftfe::Int batchCount)
Definition BLASWrapper.h:35
Definition BLASWrapper.h:33
cudaStream_t deviceStream_t
Definition DeviceTypeConfig.cu.h:27
cublasHandle_t deviceBlasHandle_t
Definition DeviceTypeConfig.cu.h:36
@ HOST
Definition MemorySpaceType.h:34
@ DEVICE
Definition MemorySpaceType.h:36
cublasStatus_t deviceBlasStatus_t
Definition DeviceTypeConfig.cu.h:38
static cudaStream_t defaultStream
Definition DeviceTypeConfig.cu.h:62
Definition pseudoPotentialToDftfeConverter.cc:34
std::uint32_t uInt
Definition TypeConfig.h:10
@ LDA
Definition ExcSSDFunctionalBaseClass.h:35
std::int32_t Int
Definition TypeConfig.h:11