cuda
This commit is contained in:
722
smat.cc
722
smat.cc
@@ -1,3 +1,6 @@
|
||||
//------------------------------------------------------------------------------
|
||||
/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
|
||||
//------------------------------------------------------------------------------
|
||||
/*
|
||||
LA: linear algebra C++ interface library
|
||||
Copyright (C) 2008 Jiri Pittner <jiri.pittner@jh-inst.cas.cz> or <jiri@pittnerovi.com>
|
||||
@@ -25,219 +28,359 @@
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
|
||||
extern "C" {
|
||||
extern ssize_t read(int, void *, size_t);
|
||||
extern ssize_t write(int, const void *, size_t);
|
||||
extern ssize_t read(int, void *, size_t);
|
||||
extern ssize_t write(int, const void *, size_t);
|
||||
}
|
||||
// TODO
|
||||
// specialize unary minus
|
||||
|
||||
namespace LA {
|
||||
|
||||
|
||||
/*
|
||||
* * Templates first, specializations for BLAS next
|
||||
*
|
||||
*/
|
||||
|
||||
//raw I/O
|
||||
/***************************************************************************//**
|
||||
* routine for raw output
|
||||
* @param[in] fd file descriptor for output
|
||||
* @param[in] dim number of elements intended for output
|
||||
* @param[in] transp reserved
|
||||
* @see NRMat<T>::get(), NRSMat<T>::copyonwrite()
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
void NRSMat<T>::put(int fd, bool dim, bool transp) const
|
||||
{
|
||||
void NRSMat<T>::put(int fd, bool dim, bool transp) const {
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu)
|
||||
{
|
||||
NRSMat<T> tmp= *this;
|
||||
tmp.moveto(cpu);
|
||||
tmp.put(fd,dim,transp);
|
||||
return;
|
||||
}
|
||||
if(location != cpu){
|
||||
NRSMat<T> tmp= *this;
|
||||
tmp.moveto(cpu);
|
||||
tmp.put(fd,dim,transp);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
errno = 0;
|
||||
if(dim){
|
||||
if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
|
||||
if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
|
||||
}
|
||||
LA_traits<T>::multiput(NN2,fd,v,dim);
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
* routine for raw input
|
||||
* @param[in] fd file descriptor for input
|
||||
* @param[in] dim number of elements intended for input
|
||||
* @param[in] transp reserved
|
||||
* @see NRSMat<T>::put(), NRSMat<T>::copyonwrite()
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
void NRSMat<T>::get(int fd, bool dim, bool transp) {
|
||||
#ifdef CUDALA
|
||||
if(location != cpu){
|
||||
NRSMat<T> tmp;
|
||||
tmp.moveto(cpu);
|
||||
tmp.get(fd,dim,transp);
|
||||
tmp.moveto(location);
|
||||
*this = tmp;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
errno=0;
|
||||
if(dim)
|
||||
{
|
||||
if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
|
||||
if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
|
||||
}
|
||||
LA_traits<T>::multiput(NN2,fd,v,dim);
|
||||
int nn0[2]; //align at least 8-byte
|
||||
errno = 0;
|
||||
if(dim){
|
||||
if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read");
|
||||
resize(nn0[0]);
|
||||
}else{
|
||||
copyonwrite();
|
||||
}
|
||||
LA_traits<T>::multiget(NN2,fd,v,dim);
|
||||
}
|
||||
|
||||
|
||||
/***************************************************************************//**
|
||||
* constructor symmetrizing given matrix \f$A\f$ of general type <code>T</code> yielding \f$(A+A^\mathrm{T})/2\f$
|
||||
* @param[in] rhs matrix \f$A\f$
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
void NRSMat<T>::get(int fd, bool dim, bool transp)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu)
|
||||
{
|
||||
NRSMat<T> tmp;
|
||||
tmp.moveto(cpu);
|
||||
tmp.get(fd,dim,transp);
|
||||
tmp.moveto(location);
|
||||
*this = tmp;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
NRSMat<T>::NRSMat(const NRMat<T> &rhs) {
|
||||
NOT_GPU(rhs);
|
||||
|
||||
int nn0[2]; //align at least 8-byte
|
||||
errno=0;
|
||||
if(dim)
|
||||
{
|
||||
if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read");
|
||||
resize(nn0[0]);
|
||||
}
|
||||
else
|
||||
copyonwrite();
|
||||
LA_traits<T>::multiget(NN2,fd,v,dim);
|
||||
}
|
||||
|
||||
|
||||
// conversion ctor, symmetrize general Mat into SMat
|
||||
template <typename T>
|
||||
NRSMat<T>::NRSMat(const NRMat<T> &rhs)
|
||||
{
|
||||
nn=rhs.nrows();
|
||||
nn = rhs.nrows();
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.ncols()) laerror("attempt to convert non-square Mat to SMat");
|
||||
if(nn != rhs.ncols()) laerror("attempt to convert nonsquare NRMat<T> to NRSMat<T>");
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
location = rhs.getlocation();
|
||||
#endif
|
||||
count = new int;
|
||||
*count = 1;
|
||||
v = new T[NN2];
|
||||
int i, j, k=0;
|
||||
for (i=0; i<nn; i++)
|
||||
for (j=0; j<=i;j++) v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
|
||||
int i, j, k(0);
|
||||
for(i=0; i<nn; i++){
|
||||
for(j=0; j<=i; j++){
|
||||
v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// assign to diagonal
|
||||
/***************************************************************************//**
|
||||
* zero out this symmetric matrix of general type <code>T</code> and then set
|
||||
* the diagonal elements to prescribed value
|
||||
* @param[in] a scalar value to be assigned to the diagonal
|
||||
* @return reference to the modified matrix
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
NRSMat<T> & NRSMat<T>::operator=(const T &a)
|
||||
{
|
||||
NRSMat<T> & NRSMat<T>::operator=(const T &a) {
|
||||
NOT_GPU(*this);
|
||||
copyonwrite();
|
||||
memset(v,0,NN2*sizeof(T));
|
||||
for (int i=0; i<nn; i++) v[i*(i+1)/2+i] = a;
|
||||
memset(v, 0, NN2*sizeof(T));
|
||||
for(register int i=0; i<nn; i++) v[i*(i+1)/2 + i] = a;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//get diagonal
|
||||
/***************************************************************************//**
|
||||
* get or divide by the diagonal of real symmetric double-precision matrix
|
||||
* @param[in, out] r vector for storing the diagonal
|
||||
* @param[in] divide
|
||||
* \li \c false save the diagonal to vector r
|
||||
* \li \c true divide the vector r by the diagonal elements element-wise
|
||||
* @param[in] cache reserved
|
||||
* @return
|
||||
* \li <tt>divide == true</tt> NULL
|
||||
* \li <tt>divide == false</tt> pointer to the first element of r
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const
|
||||
{
|
||||
const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const {
|
||||
#ifdef DEBUG
|
||||
if(r.size()!=nn) laerror("incompatible vector in diagonalof()");
|
||||
if(r.size() != nn) laerror("incompatible vector in const T* NRSMat<T>::diagonalof(NRVec<T> &, const bool, bool)");
|
||||
#endif
|
||||
NOT_GPU(*this);
|
||||
SAME_LOC(*this, r);
|
||||
|
||||
r.copyonwrite();
|
||||
r.copyonwrite();
|
||||
|
||||
if (divide)
|
||||
for (int i=0; i<nn; i++) {T a =v[i*(i+1)/2+i]; if(a!=0.) r[i] /= a;}
|
||||
else
|
||||
for (int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
|
||||
return divide?NULL:&r[0];
|
||||
if(divide){
|
||||
for(register int i=0; i<nn; i++){
|
||||
const T a = v[i*(i+1)/2+i];
|
||||
if(a != 0.) r[i] /= a;
|
||||
}
|
||||
}else{
|
||||
for(register int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
|
||||
}
|
||||
|
||||
return divide?NULL:&r[0];
|
||||
}
|
||||
|
||||
|
||||
// unary minus
|
||||
/***************************************************************************//**
|
||||
* implements unary minus operator for this symmetric
|
||||
* matrix of general type <code>T</code>
|
||||
* @return modified copy of this matrix
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
const NRSMat<T> NRSMat<T>::operator-() const
|
||||
{
|
||||
NRSMat<T> result(nn);
|
||||
for(int i=0; i<NN2; i++) result.v[i]= -v[i];
|
||||
const NRSMat<T> NRSMat<T>::operator-() const {
|
||||
NOT_GPU(*this);
|
||||
|
||||
NRSMat<T> result(nn, getlocation());
|
||||
for(register int i = 0; i<NN2; i++) result.v[i]= -v[i];
|
||||
return result;
|
||||
}
|
||||
|
||||
// trace of Smat
|
||||
/***************************************************************************//**
|
||||
* implements unary minus operator for this real symmetric matrix
|
||||
* @return modified copy of this matrix
|
||||
******************************************************************************/
|
||||
template <>
|
||||
const NRSMat<double> NRSMat<double>::operator-() const {
|
||||
NRSMat<double> result(nn, getlocation());
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
memcpy(result.v, v, NN2*sizeof(double));
|
||||
cblas_dscal(NN2, -1., result.v, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasDcopy(NN2, v, 1, result.v, 1);
|
||||
TEST_CUBLAS("cublasDcopy");
|
||||
|
||||
cublasDscal(NN2, -1., result.v, 1);
|
||||
TEST_CUBLAS("cublasDscal");
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
* implements unary minus operator for this hermitian matrix
|
||||
* @return modified copy of this matrix
|
||||
******************************************************************************/
|
||||
template <>
|
||||
const NRSMat<complex<double> > NRSMat<complex<double> >::operator-() const {
|
||||
NRSMat<complex<double> > result(nn, getlocation());
|
||||
#ifdef CUDALA
|
||||
if(location == cpu) {
|
||||
#endif
|
||||
memcpy(result.v, v, NN2*sizeof(complex<double>));
|
||||
cblas_zscal(NN2, &CMONE, result.v, 1);
|
||||
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasZcopy(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)result.v, 1);
|
||||
TEST_CUBLAS("cublasZcopy");
|
||||
|
||||
cublasZscal(NN2, CUMONE, (cuDoubleComplex*)result.v, 1);
|
||||
TEST_CUBLAS("cublasZscal");
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
* @return the sum of the diagonal elements
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
const T NRSMat<T>::trace() const
|
||||
{
|
||||
const T NRSMat<T>::trace() const {
|
||||
NOT_GPU(*this);
|
||||
|
||||
T tmp = 0;
|
||||
for (int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
|
||||
for(register int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
* fill this real symmetric matrix with
|
||||
* pseudorandom numbers generated from uniform distribution
|
||||
******************************************************************************/
|
||||
template<>
|
||||
void NRSMat<double>::randomize(const double &x)
|
||||
{
|
||||
for(int i=0; i<NN2; ++i) v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
|
||||
void NRSMat<double>::randomize(const double &x) {
|
||||
NOT_GPU(*this);
|
||||
|
||||
for(int i=0; i<NN2; ++i){
|
||||
v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
|
||||
}
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
* Fill this hermitian matrix with pseudorandom numbers generated from uniform
|
||||
* distribution. The real and imaginary parts are generated independently.
|
||||
******************************************************************************/
|
||||
template<>
|
||||
void NRSMat<complex<double> >::randomize(const double &x)
|
||||
{
|
||||
for(int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1.+RAND_MAX) -1.);
|
||||
for(int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1.+RAND_MAX) -1.);
|
||||
for(int i=0; i<nn; ++i) for(int j=0; j<=i; ++j) if(i==j) v[i*(i+1)/2+j].imag()=0; //hermitean
|
||||
void NRSMat<complex<double> >::randomize(const double &x) {
|
||||
for(register int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
|
||||
for(register int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
|
||||
for(register int i=0; i<nn; ++i){
|
||||
for(register int j=0; j<=i; ++j){
|
||||
if(i == j) v[i*(i+1)/2+j].imag() = 0; //hermitean
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// write matrix to the file with specific format
|
||||
/***************************************************************************//**
|
||||
* routine for formatted output via lawritemat
|
||||
* @param[in] file pointer to <tt>FILE</tt> structure representing the output file
|
||||
* @param[in] format format specification in standard printf-like form
|
||||
* @param[in] modulo
|
||||
* @see lawritemat()
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const
|
||||
{
|
||||
void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const {
|
||||
NOT_GPU(*this);
|
||||
|
||||
lawritemat(file, (const T *)(*this) ,nn, nn, format, 2, modulo, 1);
|
||||
}
|
||||
|
||||
// read matrix from the file with specific format
|
||||
|
||||
/***************************************************************************//**
|
||||
* routine for formatted input via fscanf
|
||||
* @param[in] f pointer to <tt>FILE</tt> structure representing the input file
|
||||
* @param[in] format format specification in standard printf-like form
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
void NRSMat<T>::fscanf(FILE *f, const char *format)
|
||||
{
|
||||
void NRSMat<T>::fscanf(FILE *f, const char *format) {
|
||||
int n, m;
|
||||
if (::fscanf(f,"%d %d",&n,&m) != 2)
|
||||
laerror("cannot read matrix dimensions in SMat::fscanf");
|
||||
if (n != m) laerror("different dimensions of SMat");
|
||||
NOT_GPU(*this);
|
||||
|
||||
if (::fscanf(f,"%d %d", &n, &m) != 2)
|
||||
laerror("cannot read matrix dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
|
||||
if (n != m) laerror("different dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
|
||||
resize(n);
|
||||
for (int i=0; i<n; i++)
|
||||
for (int j=0; j<n; j++)
|
||||
if (::fscanf(f,format,&((*this)(i,j))) != 1)
|
||||
laerror("Smat - cannot read matrix element");
|
||||
laerror("NRSMat<T>::fscanf(FILE *, const char *) - unable to read matrix element");
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* BLAS specializations for double and complex<double>
|
||||
*/
|
||||
|
||||
|
||||
|
||||
// SMat * Mat
|
||||
//NOTE: dsymm is not appropriate as it works on UNPACKED symmetric matrix
|
||||
/***************************************************************************//**
|
||||
* multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
|
||||
* with real double-precision dense matrix \f$A\f$
|
||||
* @param[in] rhs real double-precision matrix \f$A\f$
|
||||
* @return matrix produt \f$S\times{}A\f$
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const
|
||||
{
|
||||
const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const {
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
|
||||
if(nn != rhs.nrows()) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRMat<double> &)");
|
||||
#endif
|
||||
SAME_LOC(*this, rhs);
|
||||
NRMat<double> result(nn, rhs.ncols(), getlocation());
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
for(register int k = 0; k<rhs.ncols(); k++){
|
||||
cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
|
||||
}
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int k = 0; k<rhs.ncols(); k++){
|
||||
cublasDspmv('U', nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
|
||||
TEST_CUBLAS("cublasDspmv");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
NRMat<double> result(nn, rhs.ncols());
|
||||
for (int k=0; k<rhs.ncols(); k++)
|
||||
cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0]+k, rhs.ncols(),
|
||||
0.0, result[0]+k, rhs.ncols());
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/***************************************************************************//**
|
||||
* multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
|
||||
* with real double-precision dense matrix \f$A\f$
|
||||
* @param[in] rhs real double-precision matrix \f$A\f$
|
||||
* @return matrix produt \f$S\times{}A\f$
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const NRMat< complex<double> >
|
||||
NRSMat< complex<double> >::operator*(const NRMat< complex<double> > &rhs) const
|
||||
{
|
||||
const NRMat<complex<double> >
|
||||
NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &rhs) const {
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
|
||||
if (nn != rhs.nrows()) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &)");
|
||||
#endif
|
||||
SAME_LOC(*this, rhs);
|
||||
NRMat<complex<double> > result(nn, rhs.ncols(), getlocation());
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
for(register int k=0; k<rhs.ncols(); k++){
|
||||
cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(), &CZERO, result[0]+k, rhs.ncols());
|
||||
}
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int k = 0; k<rhs.ncols(); k++){
|
||||
cublasZhpmv('U', nn,
|
||||
CUONE, (cuDoubleComplex*)v, (cuDoubleComplex*)(rhs[0] + k), rhs.ncols(),
|
||||
CUZERO, (cuDoubleComplex*)(result[0] + k), rhs.ncols());
|
||||
|
||||
TEST_CUBLAS("cublasDspmv");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
NRMat< complex<double> > result(nn, rhs.ncols());
|
||||
for (int k=0; k<rhs.ncols(); k++)
|
||||
cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(),
|
||||
&CZERO, result[0]+k, rhs.ncols());
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// SMat * SMat
|
||||
/***************************************************************************//**
|
||||
* multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
|
||||
* with real double-precision symmetric matrix \f$T\f$
|
||||
* @return matrix product \f$S\times{}T\f$ (not necessarily symmetric)
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
|
||||
{
|
||||
const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const {
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
|
||||
if (nn != rhs.nn) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &)");
|
||||
#endif
|
||||
NRMat<double> result(0.0, nn, nn);
|
||||
double *p, *q;
|
||||
@@ -283,156 +426,295 @@ const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
|
||||
}
|
||||
|
||||
|
||||
|
||||
/***************************************************************************//**
|
||||
* multiply this complex double-precision symmetric matrix \f$G\f$ stored in packed form
|
||||
* with complex double-precision symmetric matrix \f$H\f$
|
||||
* @return matrix produt \f$G\times{}H\f$ (not necessarily symmetric)
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const NRMat< complex<double> >
|
||||
NRSMat< complex<double> >::operator*(const NRSMat< complex<double> > &rhs) const
|
||||
{
|
||||
const NRMat<complex<double> >
|
||||
NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &rhs) const {
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
|
||||
if (nn != rhs.nn) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &)");
|
||||
#endif
|
||||
NRMat< complex<double> > result(0.0, nn, nn);
|
||||
NRMat< complex<double> > rhsmat(rhs);
|
||||
SAME_LOC(*this, rhs);
|
||||
NRMat<complex<double> > result(nn, nn, getlocation());
|
||||
NRMat<complex<double> > rhsmat(rhs);
|
||||
result = *this * rhsmat;
|
||||
return result;
|
||||
// laerror("complex SMat*Smat not implemented");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// S dot S
|
||||
/***************************************************************************//**
|
||||
* compute inner product of this real symmetric matrix \f$A\f$ with given real symmetric matrix \f$B\f$
|
||||
* i.e. determine the value of
|
||||
* \f[\sum_{i,j}A_{i,j}B_{i,j}\f]
|
||||
* @param[in] rhs matrix \f$B\f$
|
||||
* @return computed inner product
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const double NRSMat<double>::dot(const NRSMat<double> &rhs) const
|
||||
{
|
||||
const double NRSMat<double>::dot(const NRSMat<double> &rhs) const {
|
||||
double ret(0.);
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("dot of incompatible SMat's");
|
||||
if (nn != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRSMat<double> &)");
|
||||
#endif
|
||||
return cblas_ddot(NN2, v, 1, rhs.v, 1);
|
||||
SAME_LOC(*this, rhs);
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
ret = cblas_ddot(NN2, v, 1, rhs.v, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
ret = cublasDdot(NN2, v, 1, rhs.v, 1);
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/***************************************************************************//**
|
||||
* compute inner product of this complex symmetric matrix \f$A\f$ with given complex symmetric matrix \f$B\f$
|
||||
* i.e. determine the value of
|
||||
* \f[\sum_{i,j}\overbar{A_{i,j}}B_{i,j}\f]
|
||||
* @param[in] rhs matrix \f$B\f$
|
||||
* @return computed inner product
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const complex<double>
|
||||
NRSMat< complex<double> >::dot(const NRSMat< complex<double> > &rhs) const
|
||||
{
|
||||
const complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &rhs) const {
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("dot of incompatible SMat's");
|
||||
if (nn != rhs.nn) laerror("incompatible dimensions in complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &)");
|
||||
#endif
|
||||
complex<double> dot(0., 0.);
|
||||
SAME_LOC(*this, rhs);
|
||||
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
|
||||
dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
|
||||
TEST_CUBLAS("cublasZdotc");
|
||||
}
|
||||
#endif
|
||||
complex<double> dot;
|
||||
cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
|
||||
return dot;
|
||||
}
|
||||
|
||||
|
||||
/***************************************************************************//**
|
||||
* compute inner product of this real double-precision symmetric matrix \f$S\f$ of order \f$n\f$
|
||||
* with given real double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
|
||||
* @param[in] rhs real double-precision vector \f$\vec{v}\f$
|
||||
* @return computed inner product
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const double NRSMat<double>::dot(const NRVec<double> &rhs) const
|
||||
{
|
||||
const double NRSMat<double>::dot(const NRVec<double> &rhs) const {
|
||||
double ret(0.0);
|
||||
#ifdef DEBUG
|
||||
if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
|
||||
if(NN2 != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRVec<double> &)");
|
||||
#endif
|
||||
SAME_LOC(*this, rhs);
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
ret = cblas_ddot(NN2, v, 1, rhs.v, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
ret = cublasDdot(NN2, v, 1, rhs.v, 1);
|
||||
TEST_CUBLAS("cublasDdot");
|
||||
}
|
||||
#endif
|
||||
return cblas_ddot(NN2, v, 1, rhs.v, 1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/***************************************************************************//**
|
||||
* compute inner product of this complex double-precision hermitian matrix \f$H\f$ of order \f$n\f$
|
||||
* with given complex double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
|
||||
* @param[in] rhs complex double-precision vector \f$\vec{v}\f$
|
||||
* @return computed inner product
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const complex<double>
|
||||
NRSMat< complex<double> >::dot(const NRVec< complex<double> > &rhs) const
|
||||
{
|
||||
NRSMat<complex<double> >::dot(const NRVec<complex<double> > &rhs) const {
|
||||
#ifdef DEBUG
|
||||
if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
|
||||
if(NN2 != rhs.nn) laerror("incompatible dimensions in complex<double> NRSMat<complex<double> >::dot(const NRVec<complex<double> > &)");
|
||||
#endif
|
||||
complex<double> dot(0., 0.);
|
||||
SAME_LOC(*this, rhs);
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)rhs.v, 1);
|
||||
TEST_CUBLAS("cublasZdotc");
|
||||
dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
|
||||
}
|
||||
#endif
|
||||
complex<double> dot;
|
||||
cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
|
||||
return dot;
|
||||
}
|
||||
|
||||
|
||||
// norm of the matrix
|
||||
/***************************************************************************//**
|
||||
* compute the Frobenius norm of this real double-precision symmetric matrix
|
||||
* @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const double NRSMat<double>::norm(const double scalar) const
|
||||
{
|
||||
if (!scalar) return cblas_dnrm2(NN2, v, 1);
|
||||
double sum = 0;
|
||||
int k = 0;
|
||||
for (int i=0; i<nn; ++i)
|
||||
for (int j=0; j<=i; ++j) {
|
||||
register double tmp;
|
||||
tmp = v[k++];
|
||||
if (i == j) tmp -= scalar;
|
||||
const double NRSMat<double>::norm(const double scalar) const {
|
||||
if(!scalar){
|
||||
double ret(0.);
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
ret = cblas_dnrm2(NN2, v, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
ret = cublasDnrm2(NN2, v, 1);
|
||||
TEST_CUBLAS("cublasDnrm2");
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
NOT_GPU(*this);
|
||||
|
||||
double sum(0.);
|
||||
int k(0);
|
||||
for(register int i=0; i<nn; ++i){
|
||||
for(register int j=0; j<=i; ++j) {
|
||||
register double tmp = v[k++];
|
||||
if(i == j) tmp -= scalar;
|
||||
sum += tmp*tmp;
|
||||
}
|
||||
}
|
||||
return std::sqrt(sum);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/***************************************************************************//**
|
||||
* compute the Frobenius norm of this complex double-precision hermitian matrix
|
||||
* @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
|
||||
******************************************************************************/
|
||||
template<>
|
||||
const double NRSMat< complex<double> >::norm(const complex<double> scalar) const
|
||||
{
|
||||
if (!(scalar.real()) && !(scalar.imag()))
|
||||
return cblas_dznrm2(NN2, v, 1);
|
||||
double sum = 0;
|
||||
const double NRSMat< complex<double> >::norm(const complex<double> scalar) const {
|
||||
if(!(scalar.real()) && !(scalar.imag())){
|
||||
double ret(0.);
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
ret = cblas_dznrm2(NN2, v, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
ret = cublasDznrm2(NN2, (cuDoubleComplex*)v, 1);
|
||||
TEST_CUBLAS("cublasDznrm2");
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
int k(0);
|
||||
double sum(0.);
|
||||
complex<double> tmp;
|
||||
int k = 0;
|
||||
for (int i=0; i<nn; ++i)
|
||||
for (int j=0; j<=i; ++j) {
|
||||
|
||||
for(register int i=0; i<nn; ++i){
|
||||
for(register int j=0; j<=i; ++j){
|
||||
tmp = v[k++];
|
||||
if (i == j) tmp -= scalar;
|
||||
sum += tmp.real()*tmp.real() + tmp.imag()*tmp.imag();
|
||||
}
|
||||
}
|
||||
return std::sqrt(sum);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// axpy: S = S * a
|
||||
/***************************************************************************//**
|
||||
* for this real double-precision symmetric matrix \f$S\f$ stored in packed form,
|
||||
* real scalar value \f$\alpha\f$ and real double-precision symmetric matrix \f$T\f$, compute
|
||||
* \f[S \leftarrow \alpha T + S\f]
|
||||
******************************************************************************/
|
||||
template<>
|
||||
void NRSMat<double>::axpy(const double alpha, const NRSMat<double> & x)
|
||||
{
|
||||
void NRSMat<double>::axpy(const double alpha, const NRSMat<double> &x) {
|
||||
#ifdef DEBUG
|
||||
if (nn != x.nn) laerror("axpy of incompatible SMats");
|
||||
if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<double>::axpy(const double, const NRSMat<double>&)");
|
||||
#endif
|
||||
SAME_LOC(*this, x);
|
||||
copyonwrite();
|
||||
cblas_daxpy(NN2, alpha, x.v, 1, v, 1);
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_daxpy(NN2, alpha, x.v, 1, v, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasDaxpy(NN2, alpha, x.v, 1, v, 1);
|
||||
TEST_CUBLAS("cublasDaxpy");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
/***************************************************************************//**
|
||||
* for this complex double-precision hermitian matrix \f$H\f$ stored in packed form,
|
||||
* complex scalar value \f$\alpha\f$ and complex double-precision hermitian matrix \f$G\f$, compute
|
||||
* \f[H \leftarrow \alpha G + H\f]
|
||||
******************************************************************************/
|
||||
template<>
|
||||
void NRSMat< complex<double> >::axpy(const complex<double> alpha,
|
||||
const NRSMat< complex<double> > & x)
|
||||
{
|
||||
void NRSMat<complex<double> >::axpy(const complex<double> alpha, const NRSMat<complex<double> > & x) {
|
||||
#ifdef DEBUG
|
||||
if (nn != x.nn) laerror("axpy of incompatible SMats");
|
||||
if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<complex<double> >::axpy(const complex<double> , const NRSMat<complex<double> >&)");
|
||||
#endif
|
||||
SAME_LOC(*this, x);
|
||||
copyonwrite();
|
||||
cblas_zaxpy(nn, &alpha, x.v, 1, v, 1);
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_zaxpy(nn, &alpha, x.v, 1, v, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
|
||||
cublasZaxpy(NN2, _alpha, (cuDoubleComplex*)x.v, 1, (cuDoubleComplex*)v, 1);
|
||||
TEST_CUBLAS("cublasZaxpy");
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
//complex from real
|
||||
/***************************************************************************//**
|
||||
* create hermitian matrix \f$H\f$ from given real double-precision symmetric
|
||||
* matrix \f$S\f$
|
||||
* @param[in] rhs real double-precision symmetric matrix \f$S\f$
|
||||
* @param[in] imagpart flag determining whether \f$S\f$ should correspond to the real or imaginary part of \f$H\f$
|
||||
******************************************************************************/
|
||||
template<>
|
||||
NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart)
|
||||
: nn(rhs.nrows()), v(new complex<double>[rhs.nrows()*(rhs.nrows()+1)/2]), count(new int(1))
|
||||
{
|
||||
memset(v,0,nn*(nn+1)/2*sizeof(complex<double>));
|
||||
cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2);
|
||||
NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart): nn(rhs.nrows()), count(new int(1)) {
|
||||
//inconsistent in general case?
|
||||
const int nnp1 = nn*(nn + 1)/2;
|
||||
#ifdef CUDALA
|
||||
location = rhs.getlocation();
|
||||
if(location == cpu){
|
||||
#endif
|
||||
v = new complex<double>[nnp1];
|
||||
memset(v, 0, nnp1*sizeof(complex<double>));
|
||||
cblas_dcopy(nnp1, &rhs(0, 0), 1, ((double *)v) + (imagpart?1:0), 2);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
v = (complex<double>*) gpualloc(nnp1*sizeof(complex<double>));
|
||||
|
||||
complex<double> *_val = gpuputcomplex(CZERO);
|
||||
cublasZcopy(nnp1, (cuDoubleComplex*)_val, 0, (cuDoubleComplex*)v, 1);
|
||||
TEST_CUBLAS("cublasZcopy");
|
||||
gpufree(_val);
|
||||
|
||||
cublasDcopy(nnp1, (double*)(&rhs(0,0)), 1, ((double*)v) + (imagpart?1:0), 2);
|
||||
TEST_CUBLAS("cublasDcopy");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
//some template specializations leading to BLAS/CUBLAS calls
|
||||
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
////// forced instantization in the corresponding object file
|
||||
/***************************************************************************//**
|
||||
* forced instantization in the corresponding object file
|
||||
******************************************************************************/
|
||||
template class NRSMat<double>;
|
||||
template class NRSMat< complex<double> >;
|
||||
template class NRSMat<complex<double> >;
|
||||
|
||||
template class NRSMat<long long>;
|
||||
template class NRSMat<long>;
|
||||
|
||||
Reference in New Issue
Block a user