template <typename ValueTypeBasisCoeff,
typename ValueTypeBasisData,
utils::MemorySpace memorySpace,
size_type dim>
void
FEBasisOperations<ValueTypeBasisCoeff,
ValueTypeBasisData,
memorySpace,
dim>::
interpolateWithBasisGradient(
const linearAlgebra::MultiVector<ValueTypeBasisCoeff, memorySpace>
& vectorData,
const BasisManager<ValueTypeBasisCoeff, memorySpace> &basisManager,
std::vector<quadrature::QuadratureValuesContainer<
linearAlgebra::blasLapack::scalar_type<ValueTypeBasisCoeff,
ValueTypeBasisData>,
memorySpace>> &quadValuesContainerVec) const
{
quadrature::QuadratureRuleAttributes quadratureRuleAttributes =
d_feBasisDataStorage->getQuadratureRuleContainer()
->getQuadratureRuleAttributes();
const FEBasisManager<ValueTypeBasisCoeff,
ValueTypeBasisData,
memorySpace,
dim> &feBasisManager =
dynamic_cast<const FEBasisManager<ValueTypeBasisCoeff,
ValueTypeBasisData,
memorySpace,
dim> &>(basisManager);
utils::throwException(
&feBasisManager != nullptr,
"Could not cast BasisManager of the input vector to FEBasisManager
in " "FEBasisOperations.interpolate()");
const BasisDofHandler &basisDofHandler =
basisManager.getBasisDofHandler();
const FEBasisDofHandler<ValueTypeBasisCoeff, memorySpace, dim>
&feBasisDofHandler = dynamic_cast<
const FEBasisDofHandler<ValueTypeBasisCoeff, memorySpace, dim> &>(
basisDofHandler);
utils::throwException(
&feBasisDofHandler != nullptr,
"Could not cast BasisDofHandler of the input vector to
FEBasisDofHandler " "in FEBasisOperations.interpolate()");
const size_type numComponents = vectorData.getNumberComponents();
const size_type numLocallyOwnedCells =
feBasisManager.nLocallyOwnedCells();
const size_type numCumulativeLocallyOwnedCellDofs =
feBasisManager.nCumulativeLocallyOwnedCellDofs();
auto itCellLocalIdsBegin =
feBasisManager.locallyOwnedCellLocalDofIdsBegin();
reinit the quadValuesContainerVec
utils::throwException(
dim == quadValuesContainerVec.size(),
"The dim do not match with that of size of quadValuesContainerVec");
for (auto &i : quadValuesContainerVec)
{
utils::throwException(
quadratureRuleAttributes ==
i.getQuadratureRuleContainer()->getQuadratureRuleAttributes(),
"The quadRuleAttributes do not match with that in the input
quadValuesContainer"); utils::throwException( numComponents ==
i.getNumberComponents(), "The number of components of input vector do not
match with that in the input quadValuesContainer");
}
std::shared_ptr<const quadrature::QuadratureRuleContainer>
quadRuleContainer =
d_feBasisDataStorage->getQuadratureRuleContainer(); for (auto &i :
quadValuesContainerVec)
{
i.reinit(quadRuleContainer, numComponents, ValueTypeUnion());
}
const quadrature::QuadratureFamily quadratureFamily =
quadratureRuleAttributes.getQuadratureFamily();
std::vector<size_type> numCellDofs(numLocallyOwnedCells, 0);
std::vector<size_type> numCellQuad(numLocallyOwnedCells, 0);
for (size_type iCell = 0; iCell < numLocallyOwnedCells; ++iCell)
{
numCellDofs[iCell] = feBasisManager.nLocallyOwnedCellDofs(iCell);
numCellQuad[iCell] =
quadRuleContainer->nCellQuadraturePoints(iCell);
}
Perform Ce = Ae*Be, where Ce_ij = interpolated value of the i-th component at j-th quad point in e-th cell Ae_ik = i-th vector components at k-th basis function of e-th cell Be_kj = k-th basis function gradient value at j-th quad point in e-th cell
For better performance, we evaluate Ce for multiple cells at a time
- Note
- : The Be matrix is stored with the quad point as the fastest index. That is Be_kj (k-th basis function value at j-th quad point in e-th cell) is stored in a row-major format. Instead of copying it to a column major format (which is assumed native format for Blas/Lapack data), we use the transpose of Be matrix. That is, we perform Ce = Ae*(Be)^T, with Be stored in row major format
linearAlgebra::blasLapack::Layout layout =
linearAlgebra::blasLapack::Layout::ColMajor;
size_type cellLocalIdsOffset = 0;
std::vector<size_type> BStartOffset(dim, 0);
for (size_type iDim = 0; iDim < dim; iDim++)
BStartOffset[iDim] =
numCellDofs[0] * quadRuleContainer->nCellQuadraturePoints(0) *
iDim; size_type quadValueContainerStartOffset = 0; const size_type
cellBlockSize = d_maxCellTimesFieldBlock / numComponents; for (size_type
cellStartId = 0; cellStartId < numLocallyOwnedCells; cellStartId +=
cellBlockSize)
{
const size_type cellEndId =
std::min(cellStartId + cellBlockSize, numLocallyOwnedCells);
const size_type numCellsInBlock = cellEndId - cellStartId;
std::vector<size_type> numCellsInBlockDofs(numCellsInBlock, 0);
std::copy(numCellDofs.begin() + cellStartId,
numCellDofs.begin() + cellEndId,
numCellsInBlockDofs.begin());
const size_type numCumulativeDofsCellsInBlock =
std::accumulate(numCellsInBlockDofs.begin(),
numCellsInBlockDofs.end(),
0);
utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace>
fieldCellValues(numCumulativeDofsCellsInBlock * numComponents);
utils::MemoryTransfer<memorySpace, utils::MemorySpace::HOST>
memoryTransfer;
utils::MemoryStorage<size_type, memorySpace>
numCellsInBlockDofsMemSpace(numCellsInBlock);
memoryTransfer.copy(numCellsInBlock,
numCellsInBlockDofsMemSpace.data(),
numCellsInBlockDofs.data());
FECellWiseDataOperations<ValueTypeBasisCoeff, memorySpace>::
copyFieldToCellWiseData(vectorData.begin(),
numComponents,
itCellLocalIdsBegin +
cellLocalIdsOffset, numCellsInBlockDofsMemSpace, fieldCellValues);
std::vector<linearAlgebra::blasLapack::Op> transA(
numCellsInBlock, linearAlgebra::blasLapack::Op::NoTrans);
std::vector<linearAlgebra::blasLapack::Op> transB(
numCellsInBlock, linearAlgebra::blasLapack::Op::Trans);
std::vector<size_type> mSizesTmp(numCellsInBlock, 0);
std::vector<size_type> nSizesTmp(numCellsInBlock, 0);
std::vector<size_type> kSizesTmp(numCellsInBlock, 0);
std::vector<size_type> ldaSizesTmp(numCellsInBlock, 0);
std::vector<size_type> ldbSizesTmp(numCellsInBlock, 0);
std::vector<size_type> ldcSizesTmp(numCellsInBlock, 0);
std::vector<size_type> strideATmp(numCellsInBlock, 0);
std::vector<size_type> strideBTmp(numCellsInBlock, 0);
std::vector<size_type> strideCTmp(numCellsInBlock, 0);
for (size_type iCell = 0; iCell < numCellsInBlock; ++iCell)
{
const size_type cellId = cellStartId + iCell;
mSizesTmp[iCell] = numComponents;
nSizesTmp[iCell] =
quadRuleContainer->nCellQuadraturePoints(cellId);
kSizesTmp[iCell] = numCellsInBlockDofs[iCell];
ldaSizesTmp[iCell] = mSizesTmp[iCell];
ldbSizesTmp[iCell] = nSizesTmp[iCell];
ldcSizesTmp[iCell] = mSizesTmp[iCell];
strideATmp[iCell] = mSizesTmp[iCell] * kSizesTmp[iCell];
strideCTmp[iCell] = mSizesTmp[iCell] * nSizesTmp[iCell];
strideBTmp[iCell] = kSizesTmp[iCell] * nSizesTmp[iCell] *
dim;
}
utils::MemoryStorage<size_type, memorySpace>
mSizes(numCellsInBlock); utils::MemoryStorage<size_type, memorySpace>
nSizes(numCellsInBlock); utils::MemoryStorage<size_type, memorySpace>
kSizes(numCellsInBlock); utils::MemoryStorage<size_type, memorySpace>
ldaSizes( numCellsInBlock); utils::MemoryStorage<size_type, memorySpace>
ldbSizes( numCellsInBlock); utils::MemoryStorage<size_type, memorySpace>
ldcSizes( numCellsInBlock); utils::MemoryStorage<size_type, memorySpace>
strideA(numCellsInBlock); utils::MemoryStorage<size_type, memorySpace>
strideB(numCellsInBlock); utils::MemoryStorage<size_type, memorySpace>
strideC(numCellsInBlock); memoryTransfer.copy(numCellsInBlock,
mSizes.data(), mSizesTmp.data()); memoryTransfer.copy(numCellsInBlock,
nSizes.data(), nSizesTmp.data()); memoryTransfer.copy(numCellsInBlock,
kSizes.data(), kSizesTmp.data()); memoryTransfer.copy(numCellsInBlock,
ldaSizes.data(),
ldaSizesTmp.data());
memoryTransfer.copy(numCellsInBlock,
ldbSizes.data(),
ldbSizesTmp.data());
memoryTransfer.copy(numCellsInBlock,
ldcSizes.data(),
ldcSizesTmp.data());
memoryTransfer.copy(numCellsInBlock,
strideA.data(),
strideATmp.data());
memoryTransfer.copy(numCellsInBlock,
strideB.data(),
strideBTmp.data());
memoryTransfer.copy(numCellsInBlock,
strideC.data(),
strideCTmp.data());
ValueTypeUnion alpha = 1.0;
ValueTypeUnion beta = 0.0;
linearAlgebra::LinAlgOpContext<memorySpace> &linAlgOpContext =
(vectorData.getLinAlgOpContext().get());
std::vector<size_type> numCellsInBlockQuad(numCellsInBlock, 0);
std::copy(numCellQuad.begin() + cellStartId,
numCellQuad.begin() + cellEndId,
numCellsInBlockQuad.begin());
const size_type numCumulativeQuadCellsInBlock =
std::accumulate(numCellsInBlockQuad.begin(),
numCellsInBlockQuad.end(),
0);
for (size_type iDim = 0; iDim < dim; iDim++)
{
const ValueTypeBasisData *B =
(d_feBasisDataStorage->getBasisGradientDataInAllCells())
.begin() +
BStartOffset[iDim];
ValueTypeUnion *C = quadValuesContainerVec[iDim].begin() +
quadValueContainerStartOffset;
linearAlgebra::blasLapack::gemmStridedVarBatched<
ValueTypeBasisCoeff,
ValueTypeBasisData,
memorySpace>(layout,
numCellsInBlock,
transA.data(),
transB.data(),
strideA.data(),
strideB.data(),
strideC.data(),
mSizes.data(),
nSizes.data(),
kSizes.data(),
alpha,
fieldCellValues.begin(),
ldaSizes.data(),
B,
ldbSizes.data(),
beta,
C,
ldcSizes.data(),
linAlgOpContext);
}
for (size_type iCell = 0; iCell < numCellsInBlock; ++iCell)
{
for (size_type iDim = 0; iDim < dim; iDim++)
BStartOffset[iDim] += kSizesTmp[iCell] * nSizesTmp[iCell] *
dim; cellLocalIdsOffset += numCellDofs[cellStartId + iCell];
}
quadValueContainerStartOffset +=
numComponents * numCumulativeQuadCellsInBlock;
}
}
Assess i,j,k element by C[i*numvec*dim + j*numvec + k (numvec is the fastest)] cell->dim->numVec (i,j,k)