DFT-FE 1.1.0-pre
Density Functional Theory With Finite-Elements
Loading...
Searching...
No Matches
BLASWrapper.h
Go to the documentation of this file.
1// ---------------------------------------------------------------------
2//
3// Copyright (c) 2017-2025 The Regents of the University of Michigan and DFT-FE
4// authors.
5//
6// This file is part of the DFT-FE code.
7//
8// The DFT-FE code is free software; you can use it, redistribute
9// it, and/or modify it under the terms of the GNU Lesser General
10// Public License as published by the Free Software Foundation; either
11// version 2.1 of the License, or (at your option) any later version.
12// The full text of the license can be found in the file LICENSE at
13// the top level of the DFT-FE distribution.
14//
15// ---------------------------------------------------------------------
16//
17
18#ifndef BLASWrapper_h
19#define BLASWrapper_h
20
21#include <dftfeDataTypes.h>
22#include <MemorySpaceType.h>
23#include <complex>
24#include <TypeConfig.h>
25#include <DeviceTypeConfig.h>
26#include <cmath>
27#if defined(DFTFE_WITH_DEVICE)
28# include "Exceptions.h"
29#endif
30namespace dftfe
31{
32 namespace linearAlgebra
33 {
34 template <dftfe::utils::MemorySpace memorySpace>
36
37 template <>
39 {
40 public:
42
// Hadamard (element-wise) product of the arrays X and Y, written to output.
// Presumably output[i] = X[i] * Y[i] for i in [0, m) -- TODO confirm against
// the implementation.
43 template <typename ValueType>
44 void
45 hadamardProduct(const unsigned int m,
46 const ValueType * X,
47 const ValueType * Y,
48 ValueType * output) const;
49
// Hadamard product variant that involves complex conjugation, written to
// output. NOTE(review): which operand (X or Y) is conjugated is not visible
// from this declaration -- confirm against the implementation.
50 template <typename ValueType>
51 void
52 hadamardProductWithConj(const unsigned int m,
53 const ValueType * X,
54 const ValueType * Y,
55 ValueType * output) const;
56
57 // Real-Single Precision GEMM
58 void
59 xgemm(const char transA,
60 const char transB,
61 const unsigned int m,
62 const unsigned int n,
63 const unsigned int k,
64 const float * alpha,
65 const float * A,
66 const unsigned int lda,
67 const float * B,
68 const unsigned int ldb,
69 const float * beta,
70 float * C,
71 const unsigned int ldc) const;
72 // Complex-Single Precision GEMM
73 void
74 xgemm(const char transA,
75 const char transB,
76 const unsigned int m,
77 const unsigned int n,
78 const unsigned int k,
79 const std::complex<float> *alpha,
80 const std::complex<float> *A,
81 const unsigned int lda,
82 const std::complex<float> *B,
83 const unsigned int ldb,
84 const std::complex<float> *beta,
85 std::complex<float> * C,
86 const unsigned int ldc) const;
87
88 // Real-double precision GEMM
89 void
90 xgemm(const char transA,
91 const char transB,
92 const unsigned int m,
93 const unsigned int n,
94 const unsigned int k,
95 const double * alpha,
96 const double * A,
97 const unsigned int lda,
98 const double * B,
99 const unsigned int ldb,
100 const double * beta,
101 double * C,
102 const unsigned int ldc) const;
103
104
105 // Complex-double precision GEMM
106 void
107 xgemm(const char transA,
108 const char transB,
109 const unsigned int m,
110 const unsigned int n,
111 const unsigned int k,
112 const std::complex<double> *alpha,
113 const std::complex<double> *A,
114 const unsigned int lda,
115 const std::complex<double> *B,
116 const unsigned int ldb,
117 const std::complex<double> *beta,
118 std::complex<double> * C,
119 const unsigned int ldc) const;
120
// Real-double precision GEMV
121 void
122 xgemv(const char transA,
123 const unsigned int m,
124 const unsigned int n,
125 const double * alpha,
126 const double * A,
127 const unsigned int lda,
128 const double * x,
129 const unsigned int incx,
130 const double * beta,
131 double * y,
132 const unsigned int incy) const;
133
// Real-single precision GEMV
134 void
135 xgemv(const char transA,
136 const unsigned int m,
137 const unsigned int n,
138 const float * alpha,
139 const float * A,
140 const unsigned int lda,
141 const float * x,
142 const unsigned int incx,
143 const float * beta,
144 float * y,
145 const unsigned int incy) const;
146
// Complex-double precision GEMV
147 void
148 xgemv(const char transA,
149 const unsigned int m,
150 const unsigned int n,
151 const std::complex<double> *alpha,
152 const std::complex<double> *A,
153 const unsigned int lda,
154 const std::complex<double> *x,
155 const unsigned int incx,
156 const std::complex<double> *beta,
157 std::complex<double> * y,
158 const unsigned int incy) const;
159
// Complex-single precision GEMV
160 void
161 xgemv(const char transA,
162 const unsigned int m,
163 const unsigned int n,
164 const std::complex<float> *alpha,
165 const std::complex<float> *A,
166 const unsigned int lda,
167 const std::complex<float> *x,
168 const unsigned int incx,
169 const std::complex<float> *beta,
170 std::complex<float> * y,
171 const unsigned int incy) const;
172
173
// Scaling of the n-entry array x by the scalar alpha. The array and the
// scalar may have different value types (ValueType1 vs ValueType2).
// Presumably x is updated in place (x[i] *= alpha) -- TODO confirm.
174 template <typename ValueType1, typename ValueType2>
175 void
176 xscal(ValueType1 * x,
177 const ValueType2 alpha,
178 const dftfe::size_type n) const;
179
180 // Brief
181 // for ( i = 0; i < numContiguousBlocks; i ++)
182 // {
183 // for( j = 0 ; j < contiguousBlockSize; j++)
184 // {
185 // output[j] += input1[i*contiguousBlockSize+j] *
186 // input2[i*contiguousBlockSize+j];
187 // }
188 // }
189 template <typename ValueType>
190 void
192 const dftfe::size_type contiguousBlockSize,
193 const ValueType * input1,
194 const ValueType * input2,
195 ValueType * output);
196
197 // Real-Float scaling of Real-vector
198
199
200 // Real double Norm2
201 void
202 xnrm2(const unsigned int n,
203 const double * x,
204 const unsigned int incx,
205 const MPI_Comm & mpi_communicator,
206 double * result) const;
207
208
209 // Complex double Norm2
210 void
211 xnrm2(const unsigned int n,
212 const std::complex<double> *x,
213 const unsigned int incx,
214 const MPI_Comm & mpi_communicator,
215 double * result) const;
216 // Real dot product
217 void
218 xdot(const unsigned int N,
219 const double * X,
220 const unsigned int INCX,
221 const double * Y,
222 const unsigned int INCY,
223 double * result) const;
224 // Real dot product with all Reduce call
225 void
226 xdot(const unsigned int N,
227 const double * X,
228 const unsigned int INCX,
229 const double * Y,
230 const unsigned int INCY,
231 const MPI_Comm & mpi_communicator,
232 double * result) const;
233
234 // Complex dot product
235 void
236 xdot(const unsigned int N,
237 const std::complex<double> *X,
238 const unsigned int INCX,
239 const std::complex<double> *Y,
240 const unsigned int INCY,
241 std::complex<double> * result) const;
242
243 // Complex dot product with all Reduce call
244 void
245 xdot(const unsigned int N,
246 const std::complex<double> *X,
247 const unsigned int INCX,
248 const std::complex<double> *Y,
249 const unsigned int INCY,
250 const MPI_Comm & mpi_communicator,
251 std::complex<double> * result) const;
252
253
254 // MultiVector Real dot product
255 template <typename ValueType>
256 void
257 MultiVectorXDot(const unsigned int contiguousBlockSize,
258 const unsigned int numContiguousBlocks,
259 const ValueType * X,
260 const ValueType * Y,
261 const ValueType * onesVec,
262 ValueType * tempVector,
263 ValueType * tempResults,
264 ValueType * result) const;
265
266 // MultiVector Real dot product with all Reduce call
267 template <typename ValueType>
268 void
269 MultiVectorXDot(const unsigned int contiguousBlockSize,
270 const unsigned int numContiguousBlocks,
271 const ValueType * X,
272 const ValueType * Y,
273 const ValueType * onesVec,
274 ValueType * tempVector,
275 ValueType * tempResults,
276 const MPI_Comm & mpi_communicator,
277 ValueType * result) const;
278
279
280 // Real double Ax+y
281 void
282 xaxpy(const unsigned int n,
283 const double * alpha,
284 const double * x,
285 const unsigned int incx,
286 double * y,
287 const unsigned int incy) const;
288
289 // Complex double Ax+y
290 void
291 xaxpy(const unsigned int n,
292 const std::complex<double> *alpha,
293 const std::complex<double> *x,
294 const unsigned int incx,
295 std::complex<double> * y,
296 const unsigned int incy) const;
297
298 // Real float Ax+y
299 void
300 xaxpy(const unsigned int n,
301 const float * alpha,
302 const float * x,
303 const unsigned int incx,
304 float * y,
305 const unsigned int incy) const;
306
307 // Complex float Ax+y
308 void
309 xaxpy(const unsigned int n,
310 const std::complex<float> *alpha,
311 const std::complex<float> *x,
312 const unsigned int incx,
313 std::complex<float> * y,
314 const unsigned int incy) const;
315
316 // Real copy of double data
317 void
318 xcopy(const unsigned int n,
319 const double * x,
320 const unsigned int incx,
321 double * y,
322 const unsigned int incy) const;
323
324 // Complex double copy of data
325 void
326 xcopy(const unsigned int n,
327 const std::complex<double> *x,
328 const unsigned int incx,
329 std::complex<double> * y,
330 const unsigned int incy) const;
331
332 // Real copy of float data
333 void
334 xcopy(const unsigned int n,
335 const float * x,
336 const unsigned int incx,
337 float * y,
338 const unsigned int incy) const;
339
340 // Complex float copy of data
341 void
342 xcopy(const unsigned int n,
343 const std::complex<float> *x,
344 const unsigned int incx,
345 std::complex<float> * y,
346 const unsigned int incy) const;
347
348 // Real double symmetric matrix-vector product
349 void
350 xsymv(const char UPLO,
351 const unsigned int N,
352 const double * alpha,
353 const double * A,
354 const unsigned int LDA,
355 const double * X,
356 const unsigned int INCX,
357 const double * beta,
358 double * C,
359 const unsigned int INCY) const;
360
// Real-double precision batched GEMM.
// A, B and C are arrays of matrix pointers; presumably one GEMM is carried
// out per entry, batchCount entries in total -- TODO confirm.
361 void
362 xgemmBatched(const char transA,
363 const char transB,
364 const unsigned int m,
365 const unsigned int n,
366 const unsigned int k,
367 const double * alpha,
368 const double * A[],
369 const unsigned int lda,
370 const double * B[],
371 const unsigned int ldb,
372 const double * beta,
373 double * C[],
374 const unsigned int ldc,
375 const int batchCount) const;
376
// Complex-double precision batched GEMM.
// A, B and C are arrays of matrix pointers; presumably one GEMM is carried
// out per entry, batchCount entries in total -- TODO confirm.
377 void
378 xgemmBatched(const char transA,
379 const char transB,
380 const unsigned int m,
381 const unsigned int n,
382 const unsigned int k,
383 const std::complex<double> *alpha,
384 const std::complex<double> *A[],
385 const unsigned int lda,
386 const std::complex<double> *B[],
387 const unsigned int ldb,
388 const std::complex<double> *beta,
389 std::complex<double> * C[],
390 const unsigned int ldc,
391 const int batchCount) const;
392
393
// Real-single precision batched GEMM.
// A, B and C are arrays of matrix pointers; presumably one GEMM is carried
// out per entry, batchCount entries in total -- TODO confirm.
394 void
395 xgemmBatched(const char transA,
396 const char transB,
397 const unsigned int m,
398 const unsigned int n,
399 const unsigned int k,
400 const float * alpha,
401 const float * A[],
402 const unsigned int lda,
403 const float * B[],
404 const unsigned int ldb,
405 const float * beta,
406 float * C[],
407 const unsigned int ldc,
408 const int batchCount) const;
409
// Complex-single precision batched GEMM.
// A, B and C are arrays of matrix pointers; presumably one GEMM is carried
// out per entry, batchCount entries in total -- TODO confirm.
410 void
411 xgemmBatched(const char transA,
412 const char transB,
413 const unsigned int m,
414 const unsigned int n,
415 const unsigned int k,
416 const std::complex<float> *alpha,
417 const std::complex<float> *A[],
418 const unsigned int lda,
419 const std::complex<float> *B[],
420 const unsigned int ldb,
421 const std::complex<float> *beta,
422 std::complex<float> * C[],
423 const unsigned int ldc,
424 const int batchCount) const;
425
426
// Real-double precision strided-batched GEMM.
// A, B and C are single base pointers; strideA/strideB/strideC presumably
// give the element offset between consecutive matrices of the batch, with
// batchCount matrices in total -- TODO confirm.
427 void
428 xgemmStridedBatched(const char transA,
429 const char transB,
430 const unsigned int m,
431 const unsigned int n,
432 const unsigned int k,
433 const double * alpha,
434 const double * A,
435 const unsigned int lda,
436 long long int strideA,
437 const double * B,
438 const unsigned int ldb,
439 long long int strideB,
440 const double * beta,
441 double * C,
442 const unsigned int ldc,
443 long long int strideC,
444 const int batchCount) const;
445
// Complex-double precision strided-batched GEMM.
// A, B and C are single base pointers; strideA/strideB/strideC presumably
// give the element offset between consecutive matrices of the batch, with
// batchCount matrices in total -- TODO confirm.
446 void
447 xgemmStridedBatched(const char transA,
448 const char transB,
449 const unsigned int m,
450 const unsigned int n,
451 const unsigned int k,
452 const std::complex<double> *alpha,
453 const std::complex<double> *A,
454 const unsigned int lda,
455 long long int strideA,
456 const std::complex<double> *B,
457 const unsigned int ldb,
458 long long int strideB,
459 const std::complex<double> *beta,
460 std::complex<double> * C,
461 const unsigned int ldc,
462 long long int strideC,
463 const int batchCount) const;
464
// Complex-single precision strided-batched GEMM.
// A, B and C are single base pointers; strideA/strideB/strideC presumably
// give the element offset between consecutive matrices of the batch, with
// batchCount matrices in total -- TODO confirm.
465 void
466 xgemmStridedBatched(const char transA,
467 const char transB,
468 const unsigned int m,
469 const unsigned int n,
470 const unsigned int k,
471 const std::complex<float> *alpha,
472 const std::complex<float> *A,
473 const unsigned int lda,
474 long long int strideA,
475 const std::complex<float> *B,
476 const unsigned int ldb,
477 long long int strideB,
478 const std::complex<float> *beta,
479 std::complex<float> * C,
480 const unsigned int ldc,
481 long long int strideC,
482 const int batchCount) const;
483
// Real-single precision strided-batched GEMM.
// A, B and C are single base pointers; strideA/strideB/strideC presumably
// give the element offset between consecutive matrices of the batch, with
// batchCount matrices in total -- TODO confirm.
484 void
485 xgemmStridedBatched(const char transA,
486 const char transB,
487 const unsigned int m,
488 const unsigned int n,
489 const unsigned int k,
490 const float * alpha,
491 const float * A,
492 const unsigned int lda,
493 long long int strideA,
494 const float * B,
495 const unsigned int ldb,
496 long long int strideB,
497 const float * beta,
498 float * C,
499 const unsigned int ldc,
500 long long int strideC,
501 const int batchCount) const;
502
503 template <typename ValueTypeComplex, typename ValueTypeReal>
504 void
506 const ValueTypeComplex *complexArr,
507 ValueTypeReal * realArr,
508 ValueTypeReal * imagArr);
509
510
511 template <typename ValueTypeComplex, typename ValueTypeReal>
512 void
514 const ValueTypeReal * realArr,
515 const ValueTypeReal * imagArr,
516 ValueTypeComplex * complexArr);
517
518 template <typename ValueType1, typename ValueType2>
519 void
521 const ValueType1 * valueType1Arr,
522 ValueType2 * valueType2Arr);
523
524
525 template <typename ValueType1, typename ValueType2>
526 void
528 const dftfe::size_type contiguousBlockSize,
529 const dftfe::size_type numContiguousBlocks,
530 const ValueType1 * copyFromVec,
531 ValueType2 * copyToVecBlock,
532 const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds);
533
534
535 template <typename ValueType1, typename ValueType2>
536 void
538 const dftfe::size_type contiguousBlockSize,
539 const dftfe::size_type numContiguousBlocks,
540 const dftfe::size_type startingVecId,
541 const ValueType1 * copyFromVec,
542 ValueType2 * copyToVecBlock,
543 const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds);
544
545 template <typename ValueType1, typename ValueType2>
546 void
548 const dftfe::size_type contiguousBlockSize,
549 const dftfe::size_type numContiguousBlocks,
550 const ValueType1 * copyFromVecBlock,
551 ValueType2 * copyToVec,
552 const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds);
553
554 template <typename ValueType1, typename ValueType2>
555 void
557 const dftfe::size_type blockSizeFrom,
558 const dftfe::size_type numBlocks,
559 const dftfe::size_type startingId,
560 const ValueType1 * copyFromVec,
561 ValueType2 * copyToVec) const;
562
563
564 template <typename ValueType1, typename ValueType2>
565 void
567 const dftfe::size_type strideTo,
568 const dftfe::size_type strideFrom,
569 const dftfe::size_type numBlocks,
570 const dftfe::size_type startingToId,
571 const dftfe::size_type startingFromId,
572 const ValueType1 * copyFromVec,
573 ValueType2 * copyToVec);
574
575
576 template <typename ValueType1, typename ValueType2>
577 void
579 const dftfe::size_type blockSizeFrom,
580 const dftfe::size_type numBlocks,
581 const dftfe::size_type startingId,
582 const ValueType1 * copyFromVec,
583 ValueType2 * copyToVec);
584
// Block-wise axpy: reads addFromVec, scalingVector and the scalar a, and
// writes into addToVec (the only non-const array argument).
// NOTE(review): the exact update formula, and whether scalingVector is
// indexed per block or per entry within a block, is not visible from this
// declaration -- confirm against the implementation.
585 template <typename ValueType1, typename ValueType2>
586 void
587 stridedBlockAxpy(const dftfe::size_type contiguousBlockSize,
588 const dftfe::size_type numContiguousBlocks,
589 const ValueType1 * addFromVec,
590 const ValueType2 * scalingVector,
591 const ValueType2 a,
592 ValueType1 * addToVec) const;
593
594
// Block-wise axpby: same argument list as stridedBlockAxpy plus a second
// scalar b; writes into addToVec (the only non-const array argument).
// NOTE(review): presumably b scales the existing contents of addToVec
// before the scaled addFromVec is added -- confirm against the
// implementation.
595 template <typename ValueType1, typename ValueType2>
596 void
597 stridedBlockAxpBy(const dftfe::size_type contiguousBlockSize,
598 const dftfe::size_type numContiguousBlocks,
599 const ValueType1 * addFromVec,
600 const ValueType2 * scalingVector,
601 const ValueType2 a,
602 const ValueType2 b,
603 ValueType1 * addToVec) const;
// axpby on n-entry arrays: x is read-only, y is written.
// Presumably y = alpha * x + beta * y -- TODO confirm against the
// implementation.
604 template <typename ValueType1, typename ValueType2>
605 void
606 axpby(const unsigned int n,
607 const ValueType2 alpha,
608 const ValueType1 * x,
609 const ValueType2 beta,
610 ValueType1 * y) const;
// Mixed-type operation reading A, B, D and the scalar alpha, and writing C
// (the only non-const array argument); each operand has its own value type.
// NOTE(review): the name suggests C = A + alpha * B * D with sizes governed
// by m and n; the shapes of B and D are not visible here -- confirm against
// the implementation.
611 template <typename ValueType0,
612 typename ValueType1,
613 typename ValueType2,
614 typename ValueType3,
615 typename ValueType4>
616 void
617 ApaBD(const unsigned int m,
618 const unsigned int n,
619 const ValueType0 alpha,
620 const ValueType1 * A,
621 const ValueType2 * B,
622 const ValueType3 * D,
623 ValueType4 * C) const;
624
625 template <typename ValueType>
626 void
628 const dftfe::size_type numContiguousBlocks,
629 const ValueType * addFromVec,
630 ValueType * addToVec,
632 *addToVecStartingContiguousBlockIds) const;
633
634 template <typename ValueType1, typename ValueType2, typename ValueType3>
635 void
637 const dftfe::size_type numContiguousBlocks,
638 const ValueType1 a,
639 const ValueType1 * s,
640 const ValueType2 * addFromVec,
641 ValueType3 * addToVec,
643 *addToVecStartingContiguousBlockIds) const;
644 template <typename ValueType1, typename ValueType2, typename ValueType3>
645 void
647 const dftfe::size_type numContiguousBlocks,
648 const ValueType1 a,
649 const ValueType2 * addFromVec,
650 ValueType3 * addToVec,
652 *addToVecStartingContiguousBlockIds) const;
653
// Block-wise scaling of x (the only non-const array argument) using the
// scalar a and the array s.
// NOTE(review): presumably x is scaled in place by a * s[...]; how s is
// indexed (per block or per entry) is not visible here -- confirm.
// Unlike most members of this class, this one is not declared const.
654 template <typename ValueType1, typename ValueType2>
655 void
656 stridedBlockScale(const dftfe::size_type contiguousBlockSize,
657 const dftfe::size_type numContiguousBlocks,
658 const ValueType1 a,
659 const ValueType1 * s,
660 ValueType2 * x);
661
662 template <typename ValueType1, typename ValueType2>
663 void
665 const dftfe::size_type contiguousBlockSize,
666 const dftfe::size_type numContiguousBlocks,
667 const ValueType1 a,
668 const ValueType1 * s,
669 const ValueType2 * copyFromVec,
670 ValueType2 * copyToVecBlock,
671 const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds);
672
// Real-double add over `size` entries: x is read-only, y is written.
// Presumably y[i] += alpha * x[i] -- TODO confirm against the
// implementation.
673 void
674 add(double * y,
675 const double * x,
676 const double alpha,
677 const dftfe::size_type size);
678
// Scaled add over `size` entries with factor beta.
// NOTE(review): both y and x are non-const pointers here, so either may be
// modified; which array is scaled and which is updated is not visible from
// this declaration -- confirm against the implementation.
679 template <typename ValueType>
680 void
681 sadd(ValueType * y,
682 ValueType * x,
683 const ValueType beta,
684 const dftfe::size_type size);
685
686 template <typename ValueType>
687 void
689 const dftfe::size_type numContiguousBlocks,
690 const ValueType * beta,
691 ValueType * x);
692
693 template <typename ValueType>
694 void
696 const dftfe::size_type contiguousBlockSize,
697 const dftfe::size_type numContiguousBlocks,
698 const ValueType * x,
699 const ValueType * beta,
700 ValueType * y);
701
702 template <typename ValueType>
703 void
705 const dftfe::size_type contiguousBlockSize,
706 const dftfe::size_type numContiguousBlocks,
707 const ValueType * x,
708 const ValueType * alpha,
709 const ValueType * y,
710 const ValueType * beta,
711 ValueType * z);
712
713 template <typename ValueType1, typename ValueType2>
714 void
716 const dftfe::size_type sizeOfVector,
717 ValueType1 * X,
718 ValueType2 * D);
719
720 private:
721 };
722#if defined(DFTFE_WITH_DEVICE)
723# include "Exceptions.h"
// Data-type tags, declared only when DFTFE_WITH_DEVICE is defined.
// NOTE(review): the enumerator names look like reduced/alternative
// floating-point formats (32-bit float, TF32, bfloat16, half) presumably
// used to select tensor-operation precision on the device -- confirm where
// this enum is consumed.
724 enum class tensorOpDataType
725 {
726 fp32,
727 tf32,
728 bf16,
729 fp16
730 };
731
732 template <>
733 class BLASWrapper<dftfe::utils::MemorySpace::DEVICE>
734 {
735 public:
736 BLASWrapper();
737
738 template <typename ValueType1, typename ValueType2>
739 static void
740 copyValueType1ArrToValueType2ArrDeviceCall(
741 const dftfe::size_type size,
742 const ValueType1 * valueType1Arr,
743 ValueType2 * valueType2Arr,
744 const dftfe::utils::deviceStream_t streamId = 0);
745
746 template <typename ValueType>
747 void
748 hadamardProduct(const unsigned int m,
749 const ValueType * X,
750 const ValueType * Y,
751 ValueType * output) const;
752
753 template <typename ValueType>
754 void
755 hadamardProductWithConj(const unsigned int m,
756 const ValueType * X,
757 const ValueType * Y,
758 ValueType * output) const;
759
760 // Real-Single Precision GEMM
761 void
762 xgemm(const char transA,
763 const char transB,
764 const unsigned int m,
765 const unsigned int n,
766 const unsigned int k,
767 const float * alpha,
768 const float * A,
769 const unsigned int lda,
770 const float * B,
771 const unsigned int ldb,
772 const float * beta,
773 float * C,
774 const unsigned int ldc) const;
775 // Complex-Single Precision GEMM
776 void
777 xgemm(const char transA,
778 const char transB,
779 const unsigned int m,
780 const unsigned int n,
781 const unsigned int k,
782 const std::complex<float> *alpha,
783 const std::complex<float> *A,
784 const unsigned int lda,
785 const std::complex<float> *B,
786 const unsigned int ldb,
787 const std::complex<float> *beta,
788 std::complex<float> * C,
789 const unsigned int ldc) const;
790
791 // Real-double precision GEMM
792 void
793 xgemm(const char transA,
794 const char transB,
795 const unsigned int m,
796 const unsigned int n,
797 const unsigned int k,
798 const double * alpha,
799 const double * A,
800 const unsigned int lda,
801 const double * B,
802 const unsigned int ldb,
803 const double * beta,
804 double * C,
805 const unsigned int ldc) const;
806
807
808 // Complex-double precision GEMM
809 void
810 xgemm(const char transA,
811 const char transB,
812 const unsigned int m,
813 const unsigned int n,
814 const unsigned int k,
815 const std::complex<double> *alpha,
816 const std::complex<double> *A,
817 const unsigned int lda,
818 const std::complex<double> *B,
819 const unsigned int ldb,
820 const std::complex<double> *beta,
821 std::complex<double> * C,
822 const unsigned int ldc) const;
823
824
825 void
826 xgemv(const char transA,
827 const unsigned int m,
828 const unsigned int n,
829 const double * alpha,
830 const double * A,
831 const unsigned int lda,
832 const double * x,
833 const unsigned int incx,
834 const double * beta,
835 double * y,
836 const unsigned int incy) const;
837
838 void
839 xgemv(const char transA,
840 const unsigned int m,
841 const unsigned int n,
842 const float * alpha,
843 const float * A,
844 const unsigned int lda,
845 const float * x,
846 const unsigned int incx,
847 const float * beta,
848 float * y,
849 const unsigned int incy) const;
850
851 void
852 xgemv(const char transA,
853 const unsigned int m,
854 const unsigned int n,
855 const std::complex<double> *alpha,
856 const std::complex<double> *A,
857 const unsigned int lda,
858 const std::complex<double> *x,
859 const unsigned int incx,
860 const std::complex<double> *beta,
861 std::complex<double> * y,
862 const unsigned int incy) const;
863
864 void
865 xgemv(const char transA,
866 const unsigned int m,
867 const unsigned int n,
868 const std::complex<float> *alpha,
869 const std::complex<float> *A,
870 const unsigned int lda,
871 const std::complex<float> *x,
872 const unsigned int incx,
873 const std::complex<float> *beta,
874 std::complex<float> * y,
875 const unsigned int incy) const;
876
877 template <typename ValueType>
878 void
879 addVecOverContinuousIndex(const dftfe::size_type numContiguousBlocks,
880 const dftfe::size_type contiguousBlockSize,
881 const ValueType * input1,
882 const ValueType * input2,
883 ValueType * output);
884
885
886
887 template <typename ValueType1, typename ValueType2>
888 void
889 xscal(ValueType1 * x,
890 const ValueType2 alpha,
891 const dftfe::size_type n) const;
892
893
894
895 // Real double Norm2
896 void
897 xnrm2(const unsigned int n,
898 const double * x,
899 const unsigned int incx,
900 const MPI_Comm & mpi_communicator,
901 double * result) const;
902
903
904 // Complex double Norm2
905 void
906 xnrm2(const unsigned int n,
907 const std::complex<double> *x,
908 const unsigned int incx,
909 const MPI_Comm & mpi_communicator,
910 double * result) const;
911
912 // Real dot product
913 void
914 xdot(const unsigned int N,
915 const double * X,
916 const unsigned int INCX,
917 const double * Y,
918 const unsigned int INCY,
919 double * result) const;
920
921 //
922 // Real dot product (overload taking an MPI communicator)
923 void
924 xdot(const unsigned int N,
925 const double * X,
926 const unsigned int INCX,
927 const double * Y,
928 const unsigned int INCY,
929 const MPI_Comm & mpi_communicator,
930 double * result) const;
931
932 // Complex dot product
933 void
934 xdot(const unsigned int N,
935 const std::complex<double> *X,
936 const unsigned int INCX,
937 const std::complex<double> *Y,
938 const unsigned int INCY,
939 std::complex<double> * result) const;
940
941 // Complex dot product (overload taking an MPI communicator)
942 void
943 xdot(const unsigned int N,
944 const std::complex<double> *X,
945 const unsigned int INCX,
946 const std::complex<double> *Y,
947 const unsigned int INCY,
948 const MPI_Comm & mpi_communicator,
949 std::complex<double> * result) const;
950
951
952 template <typename ValueType>
953 void
954 MultiVectorXDot(const unsigned int contiguousBlockSize,
955 const unsigned int numContiguousBlocks,
956 const ValueType * X,
957 const ValueType * Y,
958 const ValueType * onesVec,
959 ValueType * tempVector,
960 ValueType * tempResults,
961 ValueType * result) const;
962
963 template <typename ValueType>
964 void
965 MultiVectorXDot(const unsigned int contiguousBlockSize,
966 const unsigned int numContiguousBlocks,
967 const ValueType * X,
968 const ValueType * Y,
969 const ValueType * onesVec,
970 ValueType * tempVector,
971 ValueType * tempResults,
972 const MPI_Comm & mpi_communicator,
973 ValueType * result) const;
974
975 // Real double Ax+y
976 void
977 xaxpy(const unsigned int n,
978 const double * alpha,
979 const double * x,
980 const unsigned int incx,
981 double * y,
982 const unsigned int incy) const;
983
984 // Complex double Ax+y
985 void
986 xaxpy(const unsigned int n,
987 const std::complex<double> *alpha,
988 const std::complex<double> *x,
989 const unsigned int incx,
990 std::complex<double> * y,
991 const unsigned int incy) const;
992
993 // Real copy of double data
994 void
995 xcopy(const unsigned int n,
996 const double * x,
997 const unsigned int incx,
998 double * y,
999 const unsigned int incy) const;
1000
1001 // Complex double copy of data
1002 void
1003 xcopy(const unsigned int n,
1004 const std::complex<double> *x,
1005 const unsigned int incx,
1006 std::complex<double> * y,
1007 const unsigned int incy) const;
1008
1009 // Real copy of float data
1010 void
1011 xcopy(const unsigned int n,
1012 const float * x,
1013 const unsigned int incx,
1014 float * y,
1015 const unsigned int incy) const;
1016
1017 // Complex float copy of data
1018 void
1019 xcopy(const unsigned int n,
1020 const std::complex<float> *x,
1021 const unsigned int incx,
1022 std::complex<float> * y,
1023 const unsigned int incy) const;
1024
1025 // Real double symmetric matrix-vector product
1026 void
1027 xsymv(const char UPLO,
1028 const unsigned int N,
1029 const double * alpha,
1030 const double * A,
1031 const unsigned int LDA,
1032 const double * X,
1033 const unsigned int INCX,
1034 const double * beta,
1035 double * C,
1036 const unsigned int INCY) const;
1037
1038 void
1039 xgemmBatched(const char transA,
1040 const char transB,
1041 const unsigned int m,
1042 const unsigned int n,
1043 const unsigned int k,
1044 const double * alpha,
1045 const double * A[],
1046 const unsigned int lda,
1047 const double * B[],
1048 const unsigned int ldb,
1049 const double * beta,
1050 double * C[],
1051 const unsigned int ldc,
1052 const int batchCount) const;
1053
1054 void
1055 xgemmBatched(const char transA,
1056 const char transB,
1057 const unsigned int m,
1058 const unsigned int n,
1059 const unsigned int k,
1060 const std::complex<double> *alpha,
1061 const std::complex<double> *A[],
1062 const unsigned int lda,
1063 const std::complex<double> *B[],
1064 const unsigned int ldb,
1065 const std::complex<double> *beta,
1066 std::complex<double> * C[],
1067 const unsigned int ldc,
1068 const int batchCount) const;
1069
1070 void
1071 xgemmBatched(const char transA,
1072 const char transB,
1073 const unsigned int m,
1074 const unsigned int n,
1075 const unsigned int k,
1076 const float * alpha,
1077 const float * A[],
1078 const unsigned int lda,
1079 const float * B[],
1080 const unsigned int ldb,
1081 const float * beta,
1082 float * C[],
1083 const unsigned int ldc,
1084 const int batchCount) const;
1085
1086 void
1087 xgemmBatched(const char transA,
1088 const char transB,
1089 const unsigned int m,
1090 const unsigned int n,
1091 const unsigned int k,
1092 const std::complex<float> *alpha,
1093 const std::complex<float> *A[],
1094 const unsigned int lda,
1095 const std::complex<float> *B[],
1096 const unsigned int ldb,
1097 const std::complex<float> *beta,
1098 std::complex<float> * C[],
1099 const unsigned int ldc,
1100 const int batchCount) const;
1101
1102 void
1103 xgemmStridedBatched(const char transA,
1104 const char transB,
1105 const unsigned int m,
1106 const unsigned int n,
1107 const unsigned int k,
1108 const double * alpha,
1109 const double * A,
1110 const unsigned int lda,
1111 long long int strideA,
1112 const double * B,
1113 const unsigned int ldb,
1114 long long int strideB,
1115 const double * beta,
1116 double * C,
1117 const unsigned int ldc,
1118 long long int strideC,
1119 const int batchCount) const;
1120
1121 void
1122 xgemmStridedBatched(const char transA,
1123 const char transB,
1124 const unsigned int m,
1125 const unsigned int n,
1126 const unsigned int k,
1127 const std::complex<double> *alpha,
1128 const std::complex<double> *A,
1129 const unsigned int lda,
1130 long long int strideA,
1131 const std::complex<double> *B,
1132 const unsigned int ldb,
1133 long long int strideB,
1134 const std::complex<double> *beta,
1135 std::complex<double> * C,
1136 const unsigned int ldc,
1137 long long int strideC,
1138 const int batchCount) const;
1139
1140 void
1141 xgemmStridedBatched(const char transA,
1142 const char transB,
1143 const unsigned int m,
1144 const unsigned int n,
1145 const unsigned int k,
1146 const std::complex<float> *alpha,
1147 const std::complex<float> *A,
1148 const unsigned int lda,
1149 long long int strideA,
1150 const std::complex<float> *B,
1151 const unsigned int ldb,
1152 long long int strideB,
1153 const std::complex<float> *beta,
1154 std::complex<float> * C,
1155 const unsigned int ldc,
1156 long long int strideC,
1157 const int batchCount) const;
1158
1159 void
1160 xgemmStridedBatched(const char transA,
1161 const char transB,
1162 const unsigned int m,
1163 const unsigned int n,
1164 const unsigned int k,
1165 const float * alpha,
1166 const float * A,
1167 const unsigned int lda,
1168 long long int strideA,
1169 const float * B,
1170 const unsigned int ldb,
1171 long long int strideB,
1172 const float * beta,
1173 float * C,
1174 const unsigned int ldc,
1175 long long int strideC,
1176 const int batchCount) const;
1177
1178 template <typename ValueTypeComplex, typename ValueTypeReal>
1179 void
1180 copyComplexArrToRealArrs(const dftfe::size_type size,
1181 const ValueTypeComplex *complexArr,
1182 ValueTypeReal * realArr,
1183 ValueTypeReal * imagArr);
1184
1185
1186 template <typename ValueTypeComplex, typename ValueTypeReal>
1187 void
1188 copyRealArrsToComplexArr(const dftfe::size_type size,
1189 const ValueTypeReal * realArr,
1190 const ValueTypeReal * imagArr,
1191 ValueTypeComplex * complexArr);
1192
1193 template <typename ValueType1, typename ValueType2>
1194 void
1195 copyValueType1ArrToValueType2Arr(const dftfe::size_type size,
1196 const ValueType1 * valueType1Arr,
1197 ValueType2 * valueType2Arr);
1198
1199
1200 template <typename ValueType1, typename ValueType2>
1201 void
1202 stridedCopyToBlock(
1203 const dftfe::size_type contiguousBlockSize,
1204 const dftfe::size_type numContiguousBlocks,
1205 const ValueType1 * copyFromVec,
1206 ValueType2 * copyToVecBlock,
1207 const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds);
1208
1209 template <typename ValueType1, typename ValueType2>
1210 void
1211 stridedCopyToBlock(
1212 const dftfe::size_type contiguousBlockSize,
1213 const dftfe::size_type numContiguousBlocks,
1214 const dftfe::size_type startingVecId,
1215 const ValueType1 * copyFromVec,
1216 ValueType2 * copyToVecBlock,
1217 const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds);
1218
1219
// Declaration only (definition not visible in this listing).
// Reads from `copyFromVecBlock` (const) and writes to `copyToVec`; the
// source and destination element types may differ.
// NOTE(review): presumably the scatter counterpart of stridedCopyToBlock.
// Despite its name, `copyFromVecStartingContiguousBlockIds` would then
// index the destination vector — confirm against the implementation.
1220 template <typename ValueType1, typename ValueType2>
1221 void
1222 stridedCopyFromBlock(
1223 const dftfe::size_type contiguousBlockSize,
1224 const dftfe::size_type numContiguousBlocks,
1225 const ValueType1 * copyFromVecBlock,
1226 ValueType2 * copyToVec,
1227 const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds);
1228
// Declaration only (definition not visible in this listing).
// Const member. Reads from `copyFromVec` (const) and writes to
// `copyToVec`; takes separate block sizes for the destination
// (`blockSizeTo`) and the source (`blockSizeFrom`).
// NOTE(review): presumably copies, for each of `numBlocks` blocks,
// `blockSizeTo` entries starting at offset `startingId` inside a source
// block of `blockSizeFrom` entries — confirm against the implementation.
1229 template <typename ValueType1, typename ValueType2>
1230 void
1231 stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo,
1232 const dftfe::size_type blockSizeFrom,
1233 const dftfe::size_type numBlocks,
1234 const dftfe::size_type startingId,
1235 const ValueType1 * copyFromVec,
1236 ValueType2 * copyToVec) const;
1237
1238
// Declaration only (definition not visible in this listing).
// Non-const member. Reads from `copyFromVec` (const) and writes to
// `copyToVec`, with independent strides (`strideTo`, `strideFrom`) and
// starting offsets (`startingToId`, `startingFromId`) for the two sides.
// NOTE(review): presumably copies `numBlocks` blocks of `blockSize`
// entries between two constant-stride layouts — confirm against the
// implementation.
1239 template <typename ValueType1, typename ValueType2>
1240 void
1241 stridedCopyConstantStride(const dftfe::size_type blockSize,
1242 const dftfe::size_type strideTo,
1243 const dftfe::size_type strideFrom,
1244 const dftfe::size_type numBlocks,
1245 const dftfe::size_type startingToId,
1246 const dftfe::size_type startingFromId,
1247 const ValueType1 * copyFromVec,
1248 ValueType2 * copyToVec);
1249
1250
// Declaration only (definition not visible in this listing).
// Same parameter list as stridedCopyToBlockConstantStride, but this
// member is not const-qualified.
// NOTE(review): presumably the reverse direction of
// stridedCopyToBlockConstantStride (packed block back into the wider
// layout) — confirm against the implementation.
1251 template <typename ValueType1, typename ValueType2>
1252 void
1253 stridedCopyFromBlockConstantStride(const dftfe::size_type blockSizeTo,
1254 const dftfe::size_type blockSizeFrom,
1255 const dftfe::size_type numBlocks,
1256 const dftfe::size_type startingId,
1257 const ValueType1 * copyFromVec,
1258 ValueType2 * copyToVec);
// Declaration only (definition not visible in this listing).
// Const member. `x` is read-only, `y` is writable; the scalars `alpha`
// and `beta` are passed by value and may have a different type
// (ValueType2) from the vector entries (ValueType1).
// NOTE(review): by BLAS naming convention this is presumably
// y = alpha * x + beta * y over `n` entries — confirm against the
// implementation.
1259 template <typename ValueType1, typename ValueType2>
1260 void
1261 axpby(const unsigned int n,
1262 const ValueType2 alpha,
1263 const ValueType1 * x,
1264 const ValueType2 beta,
1265 ValueType1 * y) const;
1266
// Declaration only (definition not visible in this listing).
// Const member. Reads `addFromVec` and `scalingVector` (both const) and
// writes `addToVec`; `a` is a by-value scalar of the scaling type.
// NOTE(review): presumably accumulates a * scalingVector[...] *
// addFromVec into addToVec over `numContiguousBlocks` blocks of
// `contiguousBlockSize` entries. Which index the scaling vector follows
// is not visible here — confirm against the implementation.
1267 template <typename ValueType1, typename ValueType2>
1268 void
1269 stridedBlockAxpy(const dftfe::size_type contiguousBlockSize,
1270 const dftfe::size_type numContiguousBlocks,
1271 const ValueType1 * addFromVec,
1272 const ValueType2 * scalingVector,
1273 const ValueType2 a,
1274 ValueType1 * addToVec) const;
// Declaration only (definition not visible in this listing).
// Same parameters as stridedBlockAxpy plus a second by-value scalar `b`.
// NOTE(review): presumably `b` scales the existing contents of
// `addToVec` before the scaled `addFromVec` contribution is added —
// confirm against the implementation.
1275 template <typename ValueType1, typename ValueType2>
1276 void
1277 stridedBlockAxpBy(const dftfe::size_type contiguousBlockSize,
1278 const dftfe::size_type numContiguousBlocks,
1279 const ValueType1 * addFromVec,
1280 const ValueType2 * scalingVector,
1281 const ValueType2 a,
1282 const ValueType2 b,
1283 ValueType1 * addToVec) const;
1284
// Declaration only (definition not visible in this listing).
// Const member with five independent element types: the scalar `alpha`,
// the read-only arrays `A`, `B`, `D`, and the writable output `C`.
// `m` and `n` are the two size arguments.
// NOTE(review): the exact formula abbreviated by the name "ApaBD", and
// which of m/n indexes `D`, are not visible here — check the
// implementation before relying on it.
1285 template <typename ValueType0,
1286 typename ValueType1,
1287 typename ValueType2,
1288 typename ValueType3,
1289 typename ValueType4>
1290 void
1291 ApaBD(const unsigned int m,
1292 const unsigned int n,
1293 const ValueType0 alpha,
1294 const ValueType1 * A,
1295 const ValueType2 * B,
1296 const ValueType3 * D,
1297 ValueType4 * C) const;
1298
1299
// Declaration only (definition not visible in this listing).
// Const member. Reads `addFromVec` (const) and writes `addToVec`, both of
// the same element type; `addToVecStartingContiguousBlockIds` is a
// read-only array of dftfe::global_size_type.
// NOTE(review): presumably adds `numContiguousBlocks` blocks of
// `contiguousBlockSize` entries into `addToVec` at the positions given by
// the id array, using atomic adds as the name says — confirm against the
// implementation.
1300 template <typename ValueType>
1301 void
1302 axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize,
1303 const dftfe::size_type numContiguousBlocks,
1304 const ValueType * addFromVec,
1305 ValueType * addToVec,
1306 const dftfe::global_size_type
1307 *addToVecStartingContiguousBlockIds) const;
1308
// Overload of axpyStridedBlockAtomicAdd with a by-value scalar `a` and a
// read-only array `s` of the same type (ValueType1); source and
// destination element types (ValueType2, ValueType3) are independent.
// Declaration only (definition not visible in this listing).
// `addToVecStartingContiguousBlockIds` is a read-only array of
// dftfe::global_size_type.
// NOTE(review): presumably the added contribution is scaled by `a` and by
// entries of `s` — confirm against the implementation.
1309 template <typename ValueType1, typename ValueType2, typename ValueType3>
1310 void
1311 axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize,
1312 const dftfe::size_type numContiguousBlocks,
1313 const ValueType1 a,
1314 const ValueType1 * s,
1315 const ValueType2 * addFromVec,
1316 ValueType3 * addToVec,
1317 const dftfe::global_size_type
1318 *addToVecStartingContiguousBlockIds) const;
// Overload of axpyStridedBlockAtomicAdd with a by-value scalar `a` only
// (no per-entry scaling array); source and destination element types
// (ValueType2, ValueType3) are independent.
// Declaration only (definition not visible in this listing).
// `addToVecStartingContiguousBlockIds` is a read-only array of
// dftfe::global_size_type.
// NOTE(review): presumably the added contribution is scaled by `a` —
// confirm against the implementation.
1319 template <typename ValueType1, typename ValueType2, typename ValueType3>
1320 void
1321 axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize,
1322 const dftfe::size_type numContiguousBlocks,
1323 const ValueType1 a,
1324 const ValueType2 * addFromVec,
1325 ValueType3 * addToVec,
1326 const dftfe::global_size_type
1327 *addToVecStartingContiguousBlockIds) const;
1328
// Declaration only (definition not visible in this listing).
// Non-const member. `a` (by value) and the read-only array `s` share one
// type (ValueType1); `x` is the writable array that is updated in place.
// NOTE(review): presumably multiplies each entry of `x` by `a` and by an
// entry of `s`. Whether `s` is indexed per block or per entry within a
// block is not visible here — confirm against the implementation.
1329 template <typename ValueType1, typename ValueType2>
1330 void
1331 stridedBlockScale(const dftfe::size_type contiguousBlockSize,
1332 const dftfe::size_type numContiguousBlocks,
1333 const ValueType1 a,
1334 const ValueType1 * s,
1335 ValueType2 * x);
// Declaration only (definition not visible in this listing).
// Non-const member. Reads `copyFromVec` (const) and writes
// `copyToVecBlock`, both of element type ValueType2; `a` and `s` are the
// scaling scalar and read-only scaling array of type ValueType1.
// NOTE(review): presumably a stridedCopyToBlock-style gather that also
// scales the copied entries by `a` and `s` — confirm against the
// implementation.
1336 template <typename ValueType1, typename ValueType2>
1337 void
1338 stridedBlockScaleCopy(
1339 const dftfe::size_type contiguousBlockSize,
1340 const dftfe::size_type numContiguousBlocks,
1341 const ValueType1 a,
1342 const ValueType1 * s,
1343 const ValueType2 * copyFromVec,
1344 ValueType2 * copyToVecBlock,
1345 const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds);
1346
// Declaration only (definition not visible in this listing).
// Double-precision only, non-template, non-const member. `y` is the
// writable array and `x` the read-only one; note the size argument comes
// last, unlike the BLAS-style members of this class.
// NOTE(review): presumably y += alpha * x over `size` entries — confirm
// against the implementation.
1347 void
1348 add(double * y,
1349 const double * x,
1350 const double alpha,
1351 const dftfe::size_type size);
1352
// Declaration only (definition not visible in this listing).
// Non-const member. Both `y` and `x` are taken as non-const pointers, so
// the signature does not rule out `x` being modified as well.
// NOTE(review): presumably a scale-and-add such as y = beta * y + x over
// `size` entries. Whether `x` is actually written is not visible
// here — confirm against the implementation.
1353 template <typename ValueType>
1354 void
1355 sadd(ValueType * y,
1356 ValueType * x,
1357 const ValueType beta,
1358 const dftfe::size_type size);
1359
// Declaration only (definition not visible in this listing).
// Non-const member. `beta` is a read-only array and `x` the writable
// array that is updated in place; both share one element type.
// NOTE(review): presumably scales `x` by `beta`, with `beta` indexed by
// the position inside each contiguous block ("column") — confirm against
// the implementation.
1360 template <typename ValueType>
1361 void
1362 stridedBlockScaleColumnWise(const dftfe::size_type contiguousBlockSize,
1363 const dftfe::size_type numContiguousBlocks,
1364 const ValueType * beta,
1365 ValueType * x);
1366
// Declaration only (definition not visible in this listing).
// Non-const member. Reads `x` and `beta` (both const) and writes `y`; all
// three share one element type.
// NOTE(review): presumably y += beta[column] * x over
// `numContiguousBlocks` blocks of `contiguousBlockSize` entries — confirm
// against the implementation.
1367 template <typename ValueType>
1368 void
1369 stridedBlockScaleAndAddColumnWise(
1370 const dftfe::size_type contiguousBlockSize,
1371 const dftfe::size_type numContiguousBlocks,
1372 const ValueType * x,
1373 const ValueType * beta,
1374 ValueType * y);
1375
// Declaration only (definition not visible in this listing).
// Non-const member. Reads the vectors `x`, `y` and the coefficient arrays
// `alpha`, `beta` (all const) and writes `z`; all share one element type.
// NOTE(review): presumably z = alpha[column] * x + beta[column] * y.
// Whether `z` is overwritten or accumulated into is not visible
// here — confirm against the implementation.
1376 template <typename ValueType>
1377 void
1378 stridedBlockScaleAndAddTwoVecColumnWise(
1379 const dftfe::size_type contiguousBlockSize,
1380 const dftfe::size_type numContiguousBlocks,
1381 const ValueType * x,
1382 const ValueType * alpha,
1383 const ValueType * y,
1384 const ValueType * beta,
1385 ValueType * z);
1386
// Declaration only (definition not visible in this listing).
// Non-const member. Both `X` and `D` are taken as non-const pointers, and
// their element types are independent template parameters.
// NOTE(review): presumably scales the `numberofVectors` vectors of length
// `sizeOfVector` stored in `X` by the diagonal entries in `D` (X := X * D).
// `D` is probably only read despite being non-const — confirm against
// the implementation.
1387 template <typename ValueType1, typename ValueType2>
1388 void
1389 rightDiagonalScale(const dftfe::size_type numberofVectors,
1390 const dftfe::size_type sizeOfVector,
1391 ValueType1 * X,
1392 ValueType2 * D);
1393
// NOTE(review): listing line 1394, which held this declaration's return
// type, is missing from this listing. Presumably it returns (a reference
// to) the dftfe::utils::deviceBlasHandle_t stored in d_deviceBlasHandle —
// confirm against the real header.
1395 getDeviceBlasHandle();
1396
1397
// Declaration only (definition not visible in this listing).
// Non-const member. Reads `valueType1SrcArray` (const) and writes two
// destination arrays: `valueType1DstArray` (same element type as the
// source) and `valueType2DstArray` (a second element type).
// NOTE(review): per the name, block-diagonal entries presumably go to the
// ValueType1 destination and off-diagonal entries to the ValueType2 one.
// The exact roles of the sizes `B`, `DRem` and `D` are not visible
// here — confirm against the implementation.
1398 template <typename ValueType1, typename ValueType2>
1399 void
1400 copyBlockDiagonalValueType1OffDiagonalValueType2FromValueType1Arr(
1401 const dftfe::size_type B,
1402 const dftfe::size_type DRem,
1403 const dftfe::size_type D,
1404 const ValueType1 * valueType1SrcArray,
1405 ValueType1 * valueType1DstArray,
1406 ValueType2 * valueType2DstArray);
1407
// Only declared when DFTFE_WITH_DEVICE_LANG_CUDA is defined. Takes the
// math mode as a dftfe::utils::deviceBlasMath_t by value.
// NOTE(review): listing line 1409, which held the return type, is missing
// from this listing. The cross-reference index mentions
// dftfe::utils::deviceBlasStatus_t, which may be it — confirm against the
// real header.
1408# ifdef DFTFE_WITH_DEVICE_LANG_CUDA
1410 setMathMode(dftfe::utils::deviceBlasMath_t mathMode);
1411# endif
// Stores `opType` in the member d_opType. Defined inline; it does nothing
// else.
1412 void
1413 setTensorOpDataType(tensorOpDataType opType)
1414 {
1415 d_opType = opType;
1416 }
1417
// Takes a dftfe::utils::deviceStream_t by value.
// NOTE(review): listing line 1418, which held the return type, is missing
// from this listing. The cross-reference index mentions
// dftfe::utils::deviceBlasStatus_t, which may be it. Presumably this
// binds the BLAS handle to `streamId` — confirm against the real header
// and implementation.
1419 setStream(dftfe::utils::deviceStream_t streamId);
1420
1421 private:
// Private helper, only declared when DFTFE_WITH_DEVICE_AMD is defined.
// Declaration only; what it initializes is not visible in this listing.
1422# ifdef DFTFE_WITH_DEVICE_AMD
1423 void
1424 initialize();
1425# endif
1426
1427 /// storage for deviceblas handle
1428 dftfe::utils::deviceBlasHandle_t d_deviceBlasHandle;
// Tensor-op data type selected through setTensorOpDataType().
// NOTE(review): listing line 1429 is missing from this listing, and no
// default value for this member is visible here.
1430 tensorOpDataType d_opType;
1431
// Private member, declaration only.
// NOTE(review): listing line 1432, which held the return type, is missing
// from this listing (possibly dftfe::utils::deviceBlasStatus_t, which the
// cross-reference index mentions). Presumably this creates the handle
// kept in d_deviceBlasHandle — confirm against the real header.
1433 create();
1434
// Private member, declaration only.
// NOTE(review): listing line 1435, which held the return type, is missing
// from this listing (possibly dftfe::utils::deviceBlasStatus_t, which the
// cross-reference index mentions). Presumably this releases the handle
// kept in d_deviceBlasHandle — confirm against the real header.
1436 destroy();
1437 };
1438#endif
1439
1440 } // end of namespace linearAlgebra
1441
1442} // end of namespace dftfe
1443
1444
1445#endif // BLASWrapper_h
void axpby(const unsigned int n, const ValueType2 alpha, const ValueType1 *x, const ValueType2 beta, ValueType1 *y) const
void xgemv(const char transA, const unsigned int m, const unsigned int n, const float *alpha, const float *A, const unsigned int lda, const float *x, const unsigned int incx, const float *beta, float *y, const unsigned int incy) const
void xgemv(const char transA, const unsigned int m, const unsigned int n, const double *alpha, const double *A, const unsigned int lda, const double *x, const unsigned int incx, const double *beta, double *y, const unsigned int incy) const
void xdot(const unsigned int N, const std::complex< double > *X, const unsigned int INCX, const std::complex< double > *Y, const unsigned int INCY, const MPI_Comm &mpi_communicator, std::complex< double > *result) const
void xgemm(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const double *alpha, const double *A, const unsigned int lda, const double *B, const unsigned int ldb, const double *beta, double *C, const unsigned int ldc) const
void xaxpy(const unsigned int n, const std::complex< double > *alpha, const std::complex< double > *x, const unsigned int incx, std::complex< double > *y, const unsigned int incy) const
void xgemv(const char transA, const unsigned int m, const unsigned int n, const std::complex< double > *alpha, const std::complex< double > *A, const unsigned int lda, const std::complex< double > *x, const unsigned int incx, const std::complex< double > *beta, std::complex< double > *y, const unsigned int incy) const
void xnrm2(const unsigned int n, const std::complex< double > *x, const unsigned int incx, const MPI_Comm &mpi_communicator, double *result) const
void xgemmStridedBatched(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const double *alpha, const double *A, const unsigned int lda, long long int strideA, const double *B, const unsigned int ldb, long long int strideB, const double *beta, double *C, const unsigned int ldc, long long int strideC, const int batchCount) const
void stridedCopyToBlock(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 *copyFromVec, ValueType2 *copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds)
void xcopy(const unsigned int n, const double *x, const unsigned int incx, double *y, const unsigned int incy) const
void xdot(const unsigned int N, const double *X, const unsigned int INCX, const double *Y, const unsigned int INCY, const MPI_Comm &mpi_communicator, double *result) const
void hadamardProduct(const unsigned int m, const ValueType *X, const ValueType *Y, ValueType *output) const
void xgemmStridedBatched(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const std::complex< double > *alpha, const std::complex< double > *A, const unsigned int lda, long long int strideA, const std::complex< double > *B, const unsigned int ldb, long long int strideB, const std::complex< double > *beta, std::complex< double > *C, const unsigned int ldc, long long int strideC, const int batchCount) const
void xgemm(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const std::complex< float > *alpha, const std::complex< float > *A, const unsigned int lda, const std::complex< float > *B, const unsigned int ldb, const std::complex< float > *beta, std::complex< float > *C, const unsigned int ldc) const
void stridedBlockScale(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 a, const ValueType1 *s, ValueType2 *x)
void xgemmStridedBatched(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const float *alpha, const float *A, const unsigned int lda, long long int strideA, const float *B, const unsigned int ldb, long long int strideB, const float *beta, float *C, const unsigned int ldc, long long int strideC, const int batchCount) const
void MultiVectorXDot(const unsigned int contiguousBlockSize, const unsigned int numContiguousBlocks, const ValueType *X, const ValueType *Y, const ValueType *onesVec, ValueType *tempVector, ValueType *tempResults, ValueType *result) const
void xdot(const unsigned int N, const std::complex< double > *X, const unsigned int INCX, const std::complex< double > *Y, const unsigned int INCY, std::complex< double > *result) const
void add(double *y, const double *x, const double alpha, const dftfe::size_type size)
void ApaBD(const unsigned int m, const unsigned int n, const ValueType0 alpha, const ValueType1 *A, const ValueType2 *B, const ValueType3 *D, ValueType4 *C) const
void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 a, const ValueType1 *s, const ValueType2 *addFromVec, ValueType3 *addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const
void xcopy(const unsigned int n, const std::complex< double > *x, const unsigned int incx, std::complex< double > *y, const unsigned int incy) const
void xgemmBatched(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const std::complex< float > *alpha, const std::complex< float > *A[], const unsigned int lda, const std::complex< float > *B[], const unsigned int ldb, const std::complex< float > *beta, std::complex< float > *C[], const unsigned int ldc, const int batchCount) const
void stridedCopyToBlock(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const dftfe::size_type startingVecId, const ValueType1 *copyFromVec, ValueType2 *copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds)
void stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, const dftfe::size_type blockSizeFrom, const dftfe::size_type numBlocks, const dftfe::size_type startingId, const ValueType1 *copyFromVec, ValueType2 *copyToVec) const
void xgemmBatched(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const double *alpha, const double *A[], const unsigned int lda, const double *B[], const unsigned int ldb, const double *beta, double *C[], const unsigned int ldc, const int batchCount) const
void xgemmStridedBatched(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const std::complex< float > *alpha, const std::complex< float > *A, const unsigned int lda, long long int strideA, const std::complex< float > *B, const unsigned int ldb, long long int strideB, const std::complex< float > *beta, std::complex< float > *C, const unsigned int ldc, long long int strideC, const int batchCount) const
void xcopy(const unsigned int n, const std::complex< float > *x, const unsigned int incx, std::complex< float > *y, const unsigned int incy) const
void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 a, const ValueType2 *addFromVec, ValueType3 *addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const
void hadamardProductWithConj(const unsigned int m, const ValueType *X, const ValueType *Y, ValueType *output) const
void xaxpy(const unsigned int n, const float *alpha, const float *x, const unsigned int incx, float *y, const unsigned int incy) const
void stridedCopyFromBlockConstantStride(const dftfe::size_type blockSizeTo, const dftfe::size_type blockSizeFrom, const dftfe::size_type numBlocks, const dftfe::size_type startingId, const ValueType1 *copyFromVec, ValueType2 *copyToVec)
void xdot(const unsigned int N, const double *X, const unsigned int INCX, const double *Y, const unsigned int INCY, double *result) const
void xaxpy(const unsigned int n, const double *alpha, const double *x, const unsigned int incx, double *y, const unsigned int incy) const
void xgemm(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const float *alpha, const float *A, const unsigned int lda, const float *B, const unsigned int ldb, const float *beta, float *C, const unsigned int ldc) const
void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType *addFromVec, ValueType *addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const
void copyValueType1ArrToValueType2Arr(const dftfe::size_type size, const ValueType1 *valueType1Arr, ValueType2 *valueType2Arr)
void stridedBlockScaleAndAddTwoVecColumnWise(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType *x, const ValueType *alpha, const ValueType *y, const ValueType *beta, ValueType *z)
void xsymv(const char UPLO, const unsigned int N, const double *alpha, const double *A, const unsigned int LDA, const double *X, const unsigned int INCX, const double *beta, double *C, const unsigned int INCY) const
void copyComplexArrToRealArrs(const dftfe::size_type size, const ValueTypeComplex *complexArr, ValueTypeReal *realArr, ValueTypeReal *imagArr)
void stridedBlockScaleCopy(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 a, const ValueType1 *s, const ValueType2 *copyFromVec, ValueType2 *copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds)
void stridedBlockScaleColumnWise(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType *beta, ValueType *x)
void stridedBlockAxpBy(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 *addFromVec, const ValueType2 *scalingVector, const ValueType2 a, const ValueType2 b, ValueType1 *addToVec) const
void xgemmBatched(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const std::complex< double > *alpha, const std::complex< double > *A[], const unsigned int lda, const std::complex< double > *B[], const unsigned int ldb, const std::complex< double > *beta, std::complex< double > *C[], const unsigned int ldc, const int batchCount) const
void xnrm2(const unsigned int n, const double *x, const unsigned int incx, const MPI_Comm &mpi_communicator, double *result) const
void addVecOverContinuousIndex(const dftfe::size_type numContiguousBlocks, const dftfe::size_type contiguousBlockSize, const ValueType *input1, const ValueType *input2, ValueType *output)
void stridedCopyFromBlock(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 *copyFromVecBlock, ValueType2 *copyToVec, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds)
void xgemm(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const std::complex< double > *alpha, const std::complex< double > *A, const unsigned int lda, const std::complex< double > *B, const unsigned int ldb, const std::complex< double > *beta, std::complex< double > *C, const unsigned int ldc) const
void rightDiagonalScale(const dftfe::size_type numberofVectors, const dftfe::size_type sizeOfVector, ValueType1 *X, ValueType2 *D)
void xaxpy(const unsigned int n, const std::complex< float > *alpha, const std::complex< float > *x, const unsigned int incx, std::complex< float > *y, const unsigned int incy) const
void stridedBlockScaleAndAddColumnWise(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType *x, const ValueType *beta, ValueType *y)
void xgemv(const char transA, const unsigned int m, const unsigned int n, const std::complex< float > *alpha, const std::complex< float > *A, const unsigned int lda, const std::complex< float > *x, const unsigned int incx, const std::complex< float > *beta, std::complex< float > *y, const unsigned int incy) const
void xgemmBatched(const char transA, const char transB, const unsigned int m, const unsigned int n, const unsigned int k, const float *alpha, const float *A[], const unsigned int lda, const float *B[], const unsigned int ldb, const float *beta, float *C[], const unsigned int ldc, const int batchCount) const
void stridedCopyConstantStride(const dftfe::size_type blockSize, const dftfe::size_type strideTo, const dftfe::size_type strideFrom, const dftfe::size_type numBlocks, const dftfe::size_type startingToId, const dftfe::size_type startingFromId, const ValueType1 *copyFromVec, ValueType2 *copyToVec)
void sadd(ValueType *y, ValueType *x, const ValueType beta, const dftfe::size_type size)
void xcopy(const unsigned int n, const float *x, const unsigned int incx, float *y, const unsigned int incy) const
void stridedBlockAxpy(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 *addFromVec, const ValueType2 *scalingVector, const ValueType2 a, ValueType1 *addToVec) const
void xscal(ValueType1 *x, const ValueType2 alpha, const dftfe::size_type n) const
void MultiVectorXDot(const unsigned int contiguousBlockSize, const unsigned int numContiguousBlocks, const ValueType *X, const ValueType *Y, const ValueType *onesVec, ValueType *tempVector, ValueType *tempResults, const MPI_Comm &mpi_communicator, ValueType *result) const
void copyRealArrsToComplexArr(const dftfe::size_type size, const ValueTypeReal *realArr, const ValueTypeReal *imagArr, ValueTypeComplex *complexArr)
Definition BLASWrapper.h:35
Definition BLASWrapper.h:33
cudaStream_t deviceStream_t
Definition DeviceTypeConfig.cu.h:27
cublasStatus_t deviceBlasStatus_t
Definition DeviceTypeConfig.cu.h:38
@ HOST
Definition MemorySpaceType.h:34
@ DEVICE
Definition MemorySpaceType.h:36
cublasMath_t deviceBlasMath_t
Definition DeviceTypeConfig.cu.h:39
cublasHandle_t deviceBlasHandle_t
Definition DeviceTypeConfig.cu.h:36
Definition pseudoPotentialToDftfeConverter.cc:34
unsigned int size_type
Definition TypeConfig.h:6
unsigned long int global_size_type
Definition TypeConfig.h:7
@ LDA
Definition ExcSSDFunctionalBaseClass.h:29