diff --git a/cuda_la.cc b/cuda_la.cc index a91e01e..b47f7ac 100644 --- a/cuda_la.cc +++ b/cuda_la.cc @@ -1,3 +1,6 @@ +/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */ +/******************************************************************************* +*******************************************************************************/ #include "la_traits.h" #include "cuda_la.h" @@ -7,63 +10,54 @@ namespace LA { GPUID DEFAULT_LOC = cpu; -void set_default_loc(const GPUID loc) -{ -DEFAULT_LOC = loc; +void set_default_loc(const GPUID loc){ + DEFAULT_LOC = loc; } -void *gpualloc(size_t size) -{ -cublasStatus status; -void *ptr=NULL; -status = cublasAlloc(size,1,&ptr); -if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc"); -return ptr; +void *gpualloc(size_t size){ + void *ptr = NULL; + cublasAlloc(size, 1, &ptr); + TEST_CUBLAS("cublasAlloc"); + return ptr; } -void gpufree(void *ptr) -{ -cublasStatus status = cublasFree(ptr); -if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasFree"); +void gpufree(void *ptr){ + cublasFree(ptr); + TEST_CUBLAS("cublasFree"); } -void gpuget(size_t n, size_t elsize, const void *from, void *to) -{ -cublasStatus status; -status=cublasGetVector(n,elsize,from,1,to,1); -if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasGetVector"); +void gpuget(size_t n, size_t elsize, const void *from, void *to){ + cublasGetVector(n, elsize, from, 1, to, 1); + TEST_CUBLAS("cublasGetVector"); } -void gpuput(size_t n, size_t elsize, const void *from, void *to) -{ -cublasStatus status; -status=cublasSetVector(n,elsize,from,1,to,1); -if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector"); +void gpuput(size_t n, size_t elsize, const void *from, void *to){ + cublasSetVector(n, elsize, from, 1, to, 1); + TEST_CUBLAS("cublasSetVector"); } -double *gpuputdouble(const double &x) -{ -cublasStatus status; -void *ptr=NULL; -status = cublasAlloc(1,sizeof(double),&ptr); -if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in 
cublasAlloc"); -status=cublasSetVector(1,sizeof(double),&x,1,ptr,1); -if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector"); -return (double *)ptr; +double *gpuputdouble(const double &x){ + void *ptr = NULL; + cublasAlloc(1, sizeof(double), &ptr); + TEST_CUBLAS("cublasAlloc"); + + cublasSetVector(1, sizeof(double), &x, 1, ptr, 1); + TEST_CUBLAS("cublasSetVector"); + + return (double *)ptr; } -complex *gpuputcomplex(const complex &x) -{ -cublasStatus status; -void *ptr=NULL; -status = cublasAlloc(1,sizeof(complex),&ptr); -if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc"); -status=cublasSetVector(1,sizeof(complex),&x,1,ptr,1); -if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector"); -return (complex *)ptr; -} +complex *gpuputcomplex(const complex &x){ + void *ptr = NULL; + cublasAlloc(1, sizeof(complex), &ptr); + TEST_CUBLAS("cublasAlloc"); + cublasSetVector(1, sizeof(complex), &x, 1, ptr, 1); + TEST_CUBLAS("cublasSetVector"); + + return (complex *)ptr; +} } diff --git a/cuda_la.h b/cuda_la.h index 7e1d85a..8656cd8 100644 --- a/cuda_la.h +++ b/cuda_la.h @@ -1,6 +1,10 @@ +//------------------------------------------------------------------------------ +/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */ +//------------------------------------------------------------------------------ #ifndef _CUDA_LA_H #define _CUDA_LA_H +#include #ifdef CUDALA #undef MATPTR #include "cublas.h" @@ -13,6 +17,7 @@ namespace LA { #ifdef CUDALA #define CPU_GPU(x,y) {if((x)!=cpu && (y)!=cpu) laerror("one operand must be in CPU memory");} #define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");} +#define NOT_CPU(x) {if((x).getlocation()==cpu) laerror("Operation not implemented on CPU (yet). Use .moveto(>0) first.");} #define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. 
Use .moveto() first.");} #define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");} #else @@ -22,6 +27,16 @@ namespace LA { #define SAME_LOC3(x,y,z) {} #endif +#ifdef DEBUG + #ifdef __GNUG__ + #define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __PRETTY_FUNCTION__); } } + #else + #define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __func__); } } + #endif +#else + #define TEST_CUBLAS(X) {} +#endif + typedef enum {undefined=-1, cpu=0, gpu1=1, gpu2=2, gpu3=3, gpu4=4} GPUID; #ifdef CUDALA @@ -33,6 +48,7 @@ public: { cublasStatus status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot init GPU for CUBLAS"); + errno = 0; } ~GPU_START(void) { @@ -50,8 +66,41 @@ extern complex *gpuputcomplex(const complex &x); void set_default_loc(const GPUID loc); -extern GPUID DEFAULT_LOC; +template +void smart_gpu_set(size_t n, const T& val, void *gpu_to, size_t _step = 1){ + void *ptr(NULL); + if(sizeof(T)%sizeof(float) != 0){ laerror("memory alignment error"); } + cublasAlloc(1, sizeof(T), &ptr); + TEST_CUBLAS("cublasAlloc"); + + cublasSetVector(1, sizeof(T), &val, 1, ptr, 1); + TEST_CUBLAS("cublasSetVector"); + + if(sizeof(T) == sizeof(float)){ + cublasScopy(n, (float*)ptr, 0, ((float*)gpu_to), _step); + TEST_CUBLAS("cublasScopy"); + + }else if(sizeof(T) == sizeof(double)){ + cublasDcopy(n, (double*)ptr, 0, ((double*)gpu_to), _step); + TEST_CUBLAS("cublasDcopy"); + + }else if(sizeof(T) == sizeof(complex)){ + cublasZcopy(n, (cuDoubleComplex*)ptr, 0, (cuDoubleComplex*)gpu_to, _step); + TEST_CUBLAS("cublasZcopy"); + + }else{ + for(register int i=0; i NRIMat; +typedef NRMat NRDMat; +typedef NRMat > NRCMat; +typedef NRVec NRIVec; +typedef NRVec NRDVec; +typedef NRVec > NRCVec; #endif /* _LA_H_ */ + diff --git a/la_traits.h b/la_traits.h index 0947cee..f64e9ff 100644 --- a/la_traits.h +++ 
b/la_traits.h @@ -32,6 +32,7 @@ #include #include #include +#include #include diff --git a/laerror.cc b/laerror.cc index 54c1416..99be356 100644 --- a/laerror.cc +++ b/laerror.cc @@ -42,20 +42,20 @@ bool _LA_count_check=true; extern "C" void _findme(void) {}; //for autoconf test we need a function with C linkage -void laerror(const char *s1) +void laerror2(const char *s1, const char *s2) { std::cerr << "LA:ERROR - "; std::cout << "LA:ERROR - "; if(s1) { - std::cerr << s1 << "\n"; - std::cout << s1 << "\n"; + std::cerr << s2 << ": " << s1 << "\n"; + std::cout << s2 << ": " << s1 << "\n"; } #ifdef CUDALA { -cublasStatus s=cublasGetError(); -std::cerr << "CUBLAS status = "< or complex versions written by Roman Curik @@ -16,7 +17,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . -*/ +*******************************************************************************/ #include "mat.h" #include @@ -25,375 +26,522 @@ #include #include #include + extern "C" { -extern ssize_t read(int, void *, size_t); -extern ssize_t write(int, const void *, size_t); + extern ssize_t read(int, void *, size_t); + extern ssize_t write(int, const void *, size_t); } -// TODO : -// namespace LA { -/* - * Templates first, specializations for BLAS next -*/ - - - -//direct sum +/***************************************************************************//** + * implements direct sum with a given matrix \f$B\f$ via storesubmatrix() + * @param[in] rhs input matrix \f$B\f$ + * @return result of the computation (new instance of NRMat) + * @see submatrix() + ******************************************************************************/ template -const NRMat NRMat::oplus(const NRMat &rhs) const -{ -if(nn==0 && mm == 0) return rhs; -if(rhs.nn==0 && rhs.mm== 0) return *this; +const NRMat NRMat::oplus(const NRMat &rhs) const { -NRMat r((T)0,nn+rhs.nn,mm+rhs.mm); + // special cases + if(nn == 0 && mm == 0) return rhs; + if(rhs.nn == 0 && rhs.mm == 0) 
return *this; -#ifdef oldversion -int i,j; -for(i=0;i ret(nn + rhs.nn, mm + rhs.mm, getlocation()); -return r; + ret = 0; + ret.storesubmatrix(0, 0, *this); + ret.storesubmatrix(nn, mm, rhs); + return ret; } - -//direct product - +/***************************************************************************//** + * implements direct product with a given matrix \f$B\f$ + * @param[in] rhs input matrix \f$B\f$ + * @return result of the computation (new instance of NRMat) + ******************************************************************************/ template -const NRMat NRMat::otimes(const NRMat &rhs, bool reversecolumns) const -{ -if(nn==0 && mm == 0) return *this; -if(rhs.nn==0 && rhs.mm== 0) return rhs; +const NRMat NRMat::otimes(const NRMat &rhs, bool reversecolumns) const { -NRMat r((T)0,nn*rhs.nn,mm*rhs.mm); + // special cases + if(nn == 0 && mm == 0) return *this; + if(rhs.nn == 0 && rhs.mm == 0) return rhs; -int i,j,k,l; + NRMat r((T)0, nn*rhs.nn, mm*rhs.mm); -if(reversecolumns) -{ -for(i=0;iT + * @param[in] i row index starting from zero + * @param[in] l consider this value as the count of columns + * @return extracted elements as a NRVec object + ******************************************************************************/ template -const NRVec NRMat::row(const int i, int l) const -{ +const NRVec NRMat::row(const int i, int l) const { #ifdef DEBUG -if(i<0||i>=nn) laerror("illegal index in row()"); + if(i < 0 || i >= nn) laerror("illegal index"); #endif -if(l < 0) l=mm; -NRVec r(l); -LA_traits::copy(&r[0], + if(l < 0) l = mm; + NRVec r(l); + LA_traits::copy(&r[0], #ifdef MATPTR - v[i] + v[i] #else - v+i*l + v + i*l #endif -,l); -return r; + , l); + return r; } - -//raw I/O +/***************************************************************************//** + * routine for raw output + * @param[in] fd file descriptor for output + * @param[in] dim number of elements intended for output + * @param[in] transp reserved + * @see NRVec::put() + 
******************************************************************************/ template -void NRMat::put(int fd, bool dim, bool transp) const -{ +void NRMat::put(int fd, bool dim, bool transp) const { #ifdef CUDALA -if(location!=cpu) - { - NRMat tmp= *this; - tmp.moveto(cpu); - tmp.put(fd,dim,transp); - return; + if(location != cpu) { + NRMat tmp = *this; + tmp.moveto(cpu); + tmp.put(fd, dim, transp); + return; } #endif -errno=0; -if(dim) -{ -if(sizeof(int) != write(fd,&(transp?mm:nn),sizeof(int))) laerror("cannot write"); -if(sizeof(int) != write(fd,&(transp?nn:mm),sizeof(int))) laerror("cannot write"); -} - -if(transp) //not particularly efficient - { - for(int j=0; j::put(fd, -#ifdef MATPTR - v[i][j] -#else - v[i*mm+j] -#endif - ,dim,transp); - } -else LA_traits::multiput(nn*mm,fd, -#ifdef MATPTR - v[0] -#else - v -#endif -,dim); -} - -template -void NRMat::get(int fd, bool dim, bool transp) -{ -#ifdef CUDALA -if(location!=cpu) - { - NRMat tmp; - tmp.moveto(cpu); - tmp.get(fd,dim,transp); - tmp.moveto(location); - *this = tmp; - return; - } -#endif -int nn0,mm0; -errno=0; -if(dim) -{ -if(sizeof(int) != read(fd,&nn0,sizeof(int))) laerror("cannot read"); -if(sizeof(int) != read(fd,&mm0,sizeof(int))) laerror("cannot read"); -if(transp) resize(mm0,nn0); else resize(nn0,mm0); -} -else -copyonwrite(); -if(transp) //not particularly efficient - { - for(int j=0; j::get(fd, -#ifdef MATPTR - v[i][j] -#else - v[i*mm+j] -#endif - ,dim,transp); + errno = 0; + if(dim){ + if(sizeof(int) != write(fd,&(transp?mm:nn),sizeof(int))) laerror("write failed"); + if(sizeof(int) != write(fd,&(transp?nn:mm),sizeof(int))) laerror("write failed"); + } + + if(transp){ //not particularly efficient + for(int j=0; j::put(fd, + #ifdef MATPTR + v[i][j] + #else + v[i*mm+j] + #endif + ,dim ,transp); + } + } + }else{ + LA_traits::multiput(nn*mm,fd, + #ifdef MATPTR + v[0] + #else + v + #endif + ,dim); } -else LA_traits::multiget(nn*mm,fd, -#ifdef MATPTR - v[0] -#else - v -#endif -,dim); } 
+/***************************************************************************//** + * routine for raw input + * @param[in] fd file descriptor for input + * @param[in] dim number of elements intended for input, for dim=0 perform copyonwrite + * @param[in] transp reserved + * @see NRVec::get(), copyonwrite() + ******************************************************************************/ +template +void NRMat::get(int fd, bool dim, bool transp){ +#ifdef CUDALA + if(location != cpu){ + NRMat tmp; + tmp.moveto(cpu); + tmp.get(fd, dim, transp); + tmp.moveto(getlocation()); + *this = tmp; + return; + } +#endif + int nn0, mm0; + errno = 0; + if(dim){ + if(sizeof(int) != read(fd, &nn0, sizeof(int))) laerror("read failed"); + if(sizeof(int) != read(fd, &mm0, sizeof(int))) laerror("read failed"); + if(transp) resize(mm0, nn0); else resize(nn0, mm0); + }else{ + copyonwrite(); + } + if(transp){ + for(register int j=0; j::get(fd, + #ifdef MATPTR + v[i][j] + #else + v[i*mm+j] + #endif + ,dim,transp); + } + } + }else{ + LA_traits::multiget(nn*mm,fd, + #ifdef MATPTR + v[0] + #else + v + #endif + ,dim); + } +} +/***************************************************************************//** + * assigns a scalar value of general type T to the diagonal elements of this + * matrix of general type T + * @param[in] a scalar value of type T + * @return reference to the modified matrix + ******************************************************************************/ +template +NRMat& NRMat::operator=(const T &a) { + NOT_GPU(*this); + const int n2 = nn*nn; -// Assign diagonal + copyonwrite(); +#ifdef DEBUG + if(nn != mm) laerror("nonsquare matrix"); +#endif +#ifdef MATPTR + memset(v[0], 0, n2*sizeof(T)); + for(register int i=0; i < nn; i++) v[i][i] = a; +#else + memset(v, 0, n2*sizeof(T)); + for(register int i=0; i < n2; i += nn + 1) v[i] = a; +#endif + return *this; +} + +/***************************************************************************//** + * assigns a double-precision real 
scalar value to the diagonal elements of this + * double-precision real matrix + * @param[in] a double-precision real scalar value + * @return reference to the modified matrix + ******************************************************************************/ template <> -NRMat & NRMat::operator=(const double &a) -{ - copyonwrite(); +NRMat& NRMat::operator=(const double &a){ + const int n2 = nn*nn; + copyonwrite(); #ifdef DEBUG - if (nn != mm) laerror("RMat.operator=scalar on non-square matrix"); + if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef CUDALA - if(location==cpu) - { + if(location == cpu){ #endif #ifdef MATPTR - memset(v[0],0,nn*nn*sizeof(double)); - for (int i=0; i< nn; i++) v[i][i] = a; + memset(v[0], 0, n2*sizeof(double)); + for(register int i=0; i< nn; i++) v[i][i] = a; #else - double n=0.; - cblas_dcopy(nn*nn, &n, 0, v, 1); - cblas_dcopy(nn, &a, 0, v, nn+1); + const double n = 0.; + //set all matrix elements equal to zero + cblas_dcopy(n2, &n, 0, v, 1); + //update the diagonal elements + cblas_dcopy(nn, &a, 0, v, nn + 1); #endif #ifdef CUDALA - } - else - { - double *d=gpuputdouble(0.); - cublasDcopy(nn*nn, d, 0, v, 1); - gpufree(d); - d=gpuputdouble(a); - cublasDcopy(nn, d, 0, v, nn+1); - gpufree(d); + }else{ + smart_gpu_set(n2, 0.0, v, 1); + smart_gpu_set(nn, a, v, nn + 1); } #endif return *this; } - - - -template -NRMat & NRMat::operator=(const T &a) -{ +/***************************************************************************//** + * assigns a double-precision complex scalar value to the diagonal elements of this + * double-precision complex matrix + * @param[in] a double-precision complex scalar value + * @return reference to the modified matrix + ******************************************************************************/ +template <> +NRMat >& NRMat >::operator=(const complex &a){ + const int n2 = nn*nn; copyonwrite(); #ifdef DEBUG - if (nn != mm) laerror("RMat.operator=scalar on non-square matrix"); + if(nn != mm) 
laerror("nonsquare matrix"); +#endif +#ifdef CUDALA + if(location == cpu){ #endif #ifdef MATPTR - memset(v[0],0,nn*nn*sizeof(T)); - for (int i=0; i< nn; i++) v[i][i] = a; + memset(v[0], 0, n2*sizeof(complex)); + for(register int i=0; i< nn; i++) v[i][i] = a; #else - memset(v,0,nn*nn*sizeof(T)); - for (int i=0; i< nn*nn; i+=nn+1) v[i] = a; + //set all matrix elements equal to zero + cblas_zcopy(n2, &CZERO, 0, v, 1); + //update the diagonal elements + cblas_zcopy(nn, &a, 0, v, nn + 1); #endif - return *this; +#ifdef CUDALA + }else{ + smart_gpu_set(n2, CZERO, v, 1); + smart_gpu_set(nn, a, v, nn + 1); + } +#endif + return *this; } - - +/***************************************************************************//** + * adds a double-precision real scalar value to the diagonal elements of this + * double-precision real matrix + * @param[in] a double-precision real scalar value + * @return reference to the modified matrix + ******************************************************************************/ template <> -NRMat & NRMat::operator+=(const double&a) -{ +NRMat & NRMat::operator+=(const double& a) { copyonwrite(); #ifdef DEBUG - if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix"); + if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef CUDALA - if(location==cpu) - { + if(location == cpu){ #endif #ifdef MATPTR - for (int i=0; i< nn; i++) v[i][i] += a; + for(register int i=0; i < nn; i++) v[i][i] += a; #else - cblas_daxpy(nn, 1.0, &a, 0, *this, nn+1); + cblas_daxpy(nn, 1.0, &a, 0, *this, nn + 1); #endif #ifdef CUDALA - } - else - { - double *d=gpuputdouble(a); - cublasDaxpy(nn, 1.0, d, 0, *this, nn+1); - gpufree(d); + }else{ + double *d = gpuputdouble(a); + cublasDaxpy(nn, 1.0, d, 0, *this, nn+1); + TEST_CUBLAS("cublasDaxpy"); + gpufree(d); } #endif return *this; } - +/***************************************************************************//** + * adds a double-precision complex scalar value to the diagonal elements of this + * double-precision 
complex matrix + * @param[in] a double-precision complex scalar value + * @return reference to the modified matrix + ******************************************************************************/ template <> -NRMat & NRMat::operator-=(const double&a) -{ +NRMat > & NRMat >::operator+=(const complex& a) { copyonwrite(); #ifdef DEBUG - if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix"); + if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef CUDALA - if(location==cpu) - { + if(location == cpu){ #endif #ifdef MATPTR - for (int i=0; i< nn; i++) v[i][i] -= a; + for(register int i=0; i < nn; i++) v[i][i] += a; #else - cblas_daxpy(nn, -1.0, &a, 0, *this, nn+1); + cblas_zaxpy(nn, &CONE, &a, 0, *this, nn + 1); #endif #ifdef CUDALA - } - else - { - double *d=gpuputdouble(a); - cublasDaxpy(nn, -1.0, d, 0, *this, nn+1); - gpufree(d); + }else{ + complex* d = gpuputcomplex(a); + cublasZaxpy(nn, CUMONE, (cuDoubleComplex*)d, 0, (cuDoubleComplex*)v, nn+1); + TEST_CUBLAS("cublasDaxpy"); + gpufree(d); } #endif return *this; } - - - -// M += a -template -NRMat & NRMat::operator+=(const T &a) -{ - copyonwrite(); -#ifdef DEBUG - if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix"); -#endif -#ifdef MATPTR - for (int i=0; i< nn; i++) v[i][i] += a; -#else - for (int i=0; i< nn*nn; i+=nn+1) v[i] += a; -#endif - return *this; -} - -// M -= a -template -NRMat & NRMat::operator-=(const T &a) -{ - copyonwrite(); -#ifdef DEBUG - if (nn != mm) laerror("Mat.operator-=scalar on non-square matrix"); -#endif -#ifdef MATPTR - for (int i=0; i< nn; i++) v[i][i] -= a; -#else - for (int i=0; i< nn*nn; i+=nn+1) v[i] -= a; -#endif - return *this; -} - - +/***************************************************************************//** + * subtracts a double-precision real scalar value from the diagonal elements of this + * double-precision real matrix + * @param[in] a double-precision real scalar value + * @return reference to the modified matrix + 
******************************************************************************/ template <> -const NRMat NRMat::operator-() const -{ +NRMat& NRMat::operator-=(const double& a) { + copyonwrite(); +#ifdef DEBUG + if(nn != mm) laerror("nonsquare matrix"); +#endif #ifdef CUDALA - NRMat result(nn, mm, location); - if(location==cpu) - { -#else - NRMat result(nn, mm); + if(location == cpu){ #endif #ifdef MATPTR - for (int i=0; i +NRMat >& NRMat >::operator-=(const complex& a) { + copyonwrite(); +#ifdef DEBUG + if(nn != mm) laerror("nonsquare matrix"); +#endif +#ifdef CUDALA + if(location == cpu){ +#endif +#ifdef MATPTR + for(register int i=0; i < nn; i++) v[i][i] -= a; +#else + cblas_zaxpy(nn, &CMONE, &a, 0, *this, nn + 1); +#endif +#ifdef CUDALA + }else{ + complex* d = gpuputcomplex(a); + cublasZaxpy(nn, CUMONE, (cuDoubleComplex*)d, 0, (cuDoubleComplex*)v, nn+1); + TEST_CUBLAS("cublasDaxpy"); + gpufree(d); + } +#endif + return *this; +} +/***************************************************************************//** + * add a scalar value of type T to the diagonal elements of this + * matrix of general type T + * @return reference to the modified matrix + ******************************************************************************/ +template +NRMat& NRMat::operator+=(const T &a) { + NOT_GPU(*this); + + copyonwrite(); +#ifdef DEBUG + if(nn != mm) laerror("nonsquare matrix"); +#endif +#ifdef MATPTR + for(register int i=0; i < nn; i++) v[i][i] += a; +#else + for(register int i=0; i < nn*nn; i += nn+1) v[i] += a; +#endif + return *this; +} + +/***************************************************************************//** + * subtracts a scalar value of type T from the diagonal elements of this + * matrix of general type T + * @return reference to the modified matrix + ******************************************************************************/ +template +NRMat & NRMat::operator-=(const T &a) { + NOT_GPU(*this); + + copyonwrite(); +#ifdef DEBUG + if(nn != mm) 
laerror("nonsquare matrix"); +#endif +#ifdef MATPTR + for(register int i=0; i< nn; i++) v[i][i] -= a; +#else + for(register int i=0; i< nn*nn; i+=nn+1) v[i] -= a; +#endif + return *this; +} + +/***************************************************************************//** + * implements unary minus operator for this double-recision real matrix + * @return modified copy of this matrix + ******************************************************************************/ +template <> +const NRMat NRMat::operator-() const { + const int nm = nn*mm; + NRMat result(nn, mm, getlocation()); +#ifdef CUDALA + if(location == cpu) { +#endif +#ifdef MATPTR + for(register int i=0; i -const NRMat NRMat::operator-() const -{ - NRMat result(nn, mm); +/***************************************************************************//** + * implements unary minus operator for this double-precision complex matrix + * @return modified copy of this matrix + ******************************************************************************/ +template <> +const NRMat > NRMat >::operator-() const { + const int nm = nn*mm; + NRMat > result(nn, mm, getlocation()); +#ifdef CUDALA + if(location == cpu) { +#endif #ifdef MATPTR - for (int i=0; i)); + cblas_zscal(nm, &CMONE, result.v, 1); +#endif +#ifdef CUDALA + }else{ + cublasZcopy(nm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)result.v, 1); + TEST_CUBLAS("cublasZcopy"); + + cublasZscal(nm, CUMONE, (cuDoubleComplex*)result.v, 1); + TEST_CUBLAS("cublasZscal"); + } +#endif + return result; +} + +/***************************************************************************//** + * implements unary minus operator for this matrix of general type T + * @return modified copy of this matrix + ******************************************************************************/ +template +const NRMat NRMat::operator-() const { + NOT_GPU(*this); + + NRMat result(nn, mm, getlocation()); +#ifdef MATPTR + for(register int i=0; i NRMat::operator-() const // direct sum template 
-const NRMat NRMat::operator&(const NRMat & b) const -{ - NRMat result((T)0, nn+b.nn, mm+b.mm); - for (int i=0; i NRMat::operator&(const NRMat &b) const { + SAME_LOC(*this, b); + NRMat result((T)0, nn + b.nn, mm + b.mm, getlocation()); +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i -const NRMat NRMat::operator|(const NRMat &b) const -{ +const NRMat NRMat::operator|(const NRMat &b) const { NRMat result(nn*b.nn, mm*b.mm); for (int i=0; i NRMat::operator|(const NRMat &b) const return result; } -// sum of columns +/***************************************************************************//** + * sum up the columns of the current matrix of general type T + * @return summed columns in a form of a vector + ******************************************************************************/ template -const NRVec NRMat::csum() const -{ - NRVec result(nn); +const NRVec NRMat::csum() const { + NOT_GPU(*this); + NRVec result(nn, getlocation()); T sum; - for (int i=0; i NRMat::csum() const return result; } -// sum of rows +/***************************************************************************//** + * sum up the columns of the current double-precision real matrix + * @return summed columns in a form of a vector + ******************************************************************************/ +template <> +const NRVec NRMat::csum() const { + NRVec result(nn, getlocation()); + result = 0.0; +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i +const NRVec > NRMat >::csum() const { + NRVec > result(nn, getlocation()); + result = 0.0; +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0;iT + * @return summed rows in a form of a vector + ******************************************************************************/ template -const NRVec NRMat::rsum() const -{ - NRVec result(nn); +const NRVec NRMat::rsum() const { + NOT_GPU(*this); + NRVec result(mm, getlocation()); T sum; - for (int i=0; i NRMat::rsum() const return 
result; } -//block submatrix -template -const NRMat NRMat::submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const -{ -#ifdef DEBUG -if(fromrow <0 ||fromrow >=nn||torow <0 ||torow >=nn ||fromcol<0||fromcol>=mm||tocol<0||tocol>=mm||fromrow>torow||fromcol>tocol) laerror("bad indices in submatrix"); -#endif -int n=torow-fromrow+1; -int m=tocol-fromcol+1; -NRMat r(n,m); -for(int i=fromrow; i<=torow; ++i) -#ifdef MATPTR - memcpy(r.v[i-fromrow],v[i]+fromcol,m*sizeof(T)); -#else - memcpy(r.v+(i-fromrow)*m,v+i*mm+fromcol,m*sizeof(T)); -#endif -return r; -} - -template -void NRMat::storesubmatrix(const int fromrow, const int fromcol, const NRMat &rhs) -{ -int tocol=fromcol+rhs.ncols()-1; -int torow=fromrow+rhs.nrows()-1; -#ifdef DEBUG -if(fromrow <0 ||fromrow >=nn||torow >=nn ||fromcol<0||fromcol>=mm||tocol>=mm) laerror("bad indices in storesubmatrix"); -#endif -int m=tocol-fromcol+1; -for(int i=fromrow; i<=torow; ++i) -#ifdef MATPTR - memcpy(v[i]+fromcol,rhs.v[i-fromrow],m*sizeof(T)); -#else - memcpy(v+i*mm+fromcol,rhs.v+(i-fromrow)*m,m*sizeof(T)); -#endif -} - - -// transpose Mat -template -NRMat & NRMat::transposeme(int n) -{ -if(n==0) n=nn; -#ifdef DEBUG - if (n==nn && nn != mm || n>mm || n>nn) laerror("transpose of non-square Mat"); -#endif - copyonwrite(); - for(int i=1; i +const NRVec NRMat::rsum() const { + NRVec result(mm, getlocation()); + result = 0.0; +#ifdef CUDALA + if(location == cpu){ #endif + for(register int i=0;i +const NRVec > NRMat >::rsum() const { + NRVec > result(mm, getlocation()); + result = 0.0; +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0;i +const NRMat NRMat::submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const { +#ifdef DEBUG + if(fromrow<0 || fromrow>=nn|| torow<0 || torow>=nn || fromcol<0 || fromcol>=mm || tocol<0 || tocol>=mm || fromrow>torow || fromcol>tocol){ + laerror("invalid submatrix specification"); + } +#endif + const int n = torow - fromrow + 
1; + const int m = tocol - fromcol + 1; + NRMat r(n, m, getlocation()); + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=fromrow; i<=torow; ++i){ + #ifdef MATPTR + memcpy(r.v[i - fromrow], v[i] + fromcol, m*sizeof(T)); + #else + memcpy(r.v+(i - fromrow)*m, v + i*mm + fromcol, m*sizeof(T)); + #endif + } +#ifdef CUDALA + }else{ + if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); + for(register int i=fromrow; i<=torow; ++i){ + cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1); + TEST_CUBLAS("cublasScopy"); + } + } +#endif + return r; +} + +/***************************************************************************//** + * places given matrix as submatrix at given position + * @param[in] fromrow row-coordinate of top left corner + * @param[in] fromcol col-coordinate of top left corner + * @param[in] rhs input matrix + ******************************************************************************/ +template +void NRMat::storesubmatrix(const int fromrow, const int fromcol, const NRMat &rhs) { + const int tocol = fromcol + rhs.ncols() - 1; + const int torow = fromrow + rhs.nrows() - 1; +#ifdef DEBUG + if(fromrow<0 || fromrow>=nn || torow>=nn || fromcol<0 || fromcol>=mm || tocol>=mm) laerror("bad indices in storesubmatrix"); +#endif + SAME_LOC(*this, rhs); + + const int m = tocol - fromcol + 1; + for(register int i = fromrow; i <= torow; ++i){ + #ifdef CUDALA + if(location == cpu){ + #endif + #ifdef MATPTR + memcpy(v[i] + fromcol, rhs.v[i - fromrow], m*sizeof(T)); + #else + memcpy(v + i*mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T)); + #endif + + #ifdef CUDALA + }else{ + if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); + cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*mm + fromcol), 1); + } + #endif + } +} + 
+/***************************************************************************//** + * compute matrix transposition for a principal leading minor + * @param[in] _n order of the leading minor + * @return reference to the modified matrix + ******************************************************************************/ +template +NRMat& NRMat::transposeme(const int _n) { + const int n = (n <= 0)?nn:_n;//!< transpose the entire matrix +#ifdef DEBUG + if (n==nn && nn != mm || n>mm || n>nn ) laerror("NRMat::transposeme() - invalid parameter n. Non-square matrix?"); +#endif +#ifdef CUDALA + if(location == cpu){ +#endif + copyonwrite(); + for(register int i=1; i -NRMat >::NRMat(const NRMat &rhs, bool imagpart) -: nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) -{ -#ifdef MATPTR - v = new complex*[n]; - v[0] = new complex[mm*nn]; - for (int i=1; i)); - cblas_dcopy(nn*mm,&rhs[0][0],1,((double *)v[0]) + (imagpart?1:0),2); -#else - v = new complex[mm*nn]; - memset(v, 0, nn*mm*sizeof(complex)); - cblas_dcopy(nn*mm,&rhs[0][0],1,((double *)v) + (imagpart?1:0),2); +NRMat >::NRMat(const NRMat &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) { + const int nn_mm = nn*mm; +#ifdef CUDALA + if(location == cpu){ #endif -} + #ifdef MATPTR + v = new complex*[n]; + v[0] = new complex[nn_mm]; + for(register int i=1; i)); + cblas_dcopy(nn_mm, &rhs[0][0], 1, ((double *)v[0]) + (imagpart?1:0), 2); + #else + v = new complex[nn_mm]; + memset(v, 0, nn_mm*sizeof(complex)); + + cblas_dcopy(nn_mm, &rhs[0][0], 1, ((double *)v) + (imagpart?1:0), 2); + #endif +#ifdef CUDALA + }else{ + v = (complex*)gpualloc(sizeof(complex)*nn_mm); + complex *_val = gpuputcomplex(CZERO); + cublasZcopy(nn_mm, (cuDoubleComplex*)_val, 0, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZcopy"); + gpufree(_val); -// Output of Mat -template -void NRMat::fprintf(FILE *file, const char *format, const int modulo) const -{ - lawritemat(file, (const T*)(*this), nn, mm, format, 2, modulo, 0); -} - -// Input 
of Mat -template -void NRMat::fscanf(FILE *f, const char *format) -{ - int n, m; - if (::fscanf(f, "%d %d", &n, &m) != 2) - laerror("cannot read matrix dimensions in Mat::fscanf()"); - resize(n,m); - T *p = *this; - for(int i=0; i - */ - -template<> -const NRSMat NRMat::transposedtimes() const -{ -NRSMat r(mm,mm); -int i,j; -for(i=0; i -const NRSMat > NRMat >::transposedtimes() const -{ -NRSMat > r(mm,mm); -int i,j; -for(i=0; i -const NRSMat NRMat::transposedtimes() const -{ -NRSMat r(mm,mm); -int i,j; -for(i=0; i -const NRSMat NRMat::timestransposed() const -{ -NRSMat r(nn,nn); -int i,j; -for(i=0; i -const NRSMat > NRMat >::timestransposed() const -{ -NRSMat > r(nn,nn); -int i,j; -for(i=0; i -const NRSMat NRMat::timestransposed() const -{ -NRSMat r(nn,nn); -int i,j; -for(i=0; i::fprintf(FILE *file, const char *format, const int modulo) const { +#ifdef CUDALA + if(location == cpu){ +#endif + lawritemat(file, (const T*)(*this), nn, mm, format, 2, modulo, 0); +#ifdef CUDALA + }else{ + NRMat tmp = *this; + tmp.moveto(cpu); + lawritemat(file, (const T*)(tmp), nn, mm, format, 2, modulo, 0); } -return r; +#endif + } +/***************************************************************************//** + * input of a matrix of general type via lawritemat + ******************************************************************************/ +template +void NRMat::fscanf(FILE *f, const char *format) { + T *p; + NRMat *tmp; -//randomize -template<> -void NRMat::randomize(const double &x) -{ -for(int i=0; i -void NRMat >::randomize(const double &x) -{ -for(int i=0; i(n, m, this->location); + p = *tmp; + } +#endif + resize(n, m); + for(int i=0; i types +//----------------------------------------------------------------------------- + +/***************************************************************************//** + * for a given real matrix \f$A\f$ compute \f$A^\mathrm{T}A\f$ + * @return real NRSMat object because of the symmetry of \f$A^\mathrm{T}A\f$ + 
******************************************************************************/ +template<> +const NRSMat NRMat::transposedtimes() const { + int i(0), j(0); + NRSMat r(mm, getlocation());//!< resulting matrix has mm rows +#ifdef CUDALA + if(location == cpu){ +#endif + for(i=0; ilocation); + } +#endif + return r; +} + +/***************************************************************************//** + * for a given complex matrix \f$A\f$ compute \f$A^\dagger{}A\f$ + * @return complex NRSMat object because of the hermiticity of \f$A^\dagger{}A\f$ + ******************************************************************************/ +template<> +const NRSMat > NRMat >::transposedtimes() const { + int i(0), j(0); + NRSMat > r(mm, getlocation()); +#ifdef CUDALA + if(location == cpu){ +#endif + for(i=0; i*> (&val)); + } + } + r.moveto(this->location); + } +#endif + return r; +} + +/***************************************************************************//** + * for a given matrix \f$A\f$ (general type) compute \f$A^\mathrm{T}A\f$ + * @return NRSMat object because of the symmetry of the result + ******************************************************************************/ +template +const NRSMat NRMat::transposedtimes() const { + int i(0), j(0); + NOT_GPU(*this); + + NRSMat r(mm, getlocation()); + for(i=0; i -NRMat & NRMat::operator*=(const double &a) -{ +const NRSMat NRMat::timestransposed() const { + int i(0), j(0); + NRSMat r(nn, getlocation());//!< resulting matrix has nn rows +#ifdef CUDALA + if(location == cpu){ +#endif + for(i=0; ilocation); + } +#endif + return r; +} + +/***************************************************************************//** + * for a given complex matrix \f$A\f$ compute \f$AA^\dagger{}\f$ + * @return complex NRSMat object because of the hermiticity of \f$AA^\dagger{}\f$ + ******************************************************************************/ +template<> +const NRSMat > NRMat >::timestransposed() const { + int i(0), j(0); + NRSMat 
> r(nn, getlocation()); +#ifdef CUDALA + if(location == cpu){ +#endif + for(i=0; i*> (&val)); + } + } + r.moveto(this->location); + } +#endif + return r; +} + + +/***************************************************************************//** + * for a given matrix \f$A\f$ (general type) compute \f$A^\mathrm{T}A\f$ + * @return NRSMat object because of the symmetry of the result + ******************************************************************************/ +template +const NRSMat NRMat::timestransposed() const { + int i(0), j(0); + NOT_GPU(*this); + + NRSMat r(nn); + for(i=0; i +void NRMat::randomize(const double &x) { +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i tmp(nn, mm, cpu); + double *tmp_data = tmp; + for(register int i=0; ilocation); + *this |= tmp; + } +#endif +} + + +/***************************************************************************//** + * fill given complex matrix with random numbers + * real/imaginary components are generated independently + * @param[in] x generate random numbers from the interval [0, x] + ******************************************************************************/ +template<> +void NRMat >::randomize(const double &x) { +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i > tmp(nn, mm, cpu); + complex *tmp_data = tmp; + for(register int i=0; i(re, im); + } + tmp.moveto(this->location); + *this |= tmp; + } +#endif +} + + +/***************************************************************************//** + * scale real matrix with a real factor + * @param[in] a scaling factor + * @return reference to the modified matrix + ******************************************************************************/ +template<> +NRMat& NRMat::operator*=(const double &a) { copyonwrite(); #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif cblas_dscal(nn*mm, a, *this, 1); #ifdef CUDALA - else cublasDscal(nn*mm, a, v, 1); + }else{ + cublasDscal(nn*mm, a, v, 1); + 
TEST_CUBLAS("cublasDscal"); + } #endif return *this; } +/***************************************************************************//** + * scale complex matrix with a complex factor + * @param[in] a scaling factor + * @return reference to the modified matrix + ******************************************************************************/ template<> -NRMat< complex > & -NRMat< complex >::operator*=(const complex &a) -{ +NRMat > & +NRMat >::operator*=(const complex &a) { copyonwrite(); - cblas_zscal(nn*mm, &a, (*this)[0], 1); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zscal(nn*mm, &a, (*this)[0], 1); +#ifdef CUDALA + }else{ + const cuDoubleComplex fac = *(reinterpret_cast (&a)); + cublasZscal(nn*mm, fac, (cuDoubleComplex *)v, 1); + TEST_CUBLAS("cublasZscal"); + } +#endif return *this; } -//and for general type +/***************************************************************************//** + * scale matrix of type T with a factor + * @param[in] a scaling factor + * @return reference to the modified matrix + ******************************************************************************/ template -NRMat & NRMat::operator*=(const T &a) -{ -NOT_GPU(*this); +NRMat & NRMat::operator*=(const T &a) { + NOT_GPU(*this); copyonwrite(); #ifdef MATPTR - for (int i=0; i< nn*mm; i++) v[0][i] *= a; + for(register int i=0; i< nn*mm; i++) v[0][i] *= a; #else - for (int i=0; i< nn*mm; i++) v[i] *= a; + for(register int i=0; i< nn*mm; i++) v[i] *= a; #endif return *this; } - -// Mat += Mat +/***************************************************************************//** + * add a given real matrix \f$A\f$ to the current real matrix + * @param[in] rhs matrix \f$A\f$ + * @return reference to the modified matrix + ******************************************************************************/ template<> -NRMat & NRMat::operator+=(const NRMat &rhs) -{ +NRMat & NRMat::operator+=(const NRMat &rhs) { #ifdef DEBUG - if (nn != rhs.nn || mm!= rhs.mm) - laerror("Mat += Mat of 
incompatible matrices"); + if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif -SAME_LOC(*this,rhs); + SAME_LOC(*this, rhs); copyonwrite(); #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif - cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1); + cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1); #ifdef CUDALA - else - cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1); + }else{ + cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1); + TEST_CUBLAS("cublasDaxpy"); + } #endif return *this; } - +/***************************************************************************//** + * add a given complex matrix \f$A\f$ to the current complex matrix + * @param[in] rhs complex matrix \f$A\f$ + * @return reference to the modified matrix + ******************************************************************************/ template<> -NRMat< complex > & -NRMat< complex >::operator+=(const NRMat< complex > &rhs) -{ +NRMat > & +NRMat >::operator+=(const NRMat< complex > &rhs) { #ifdef DEBUG - if (nn != rhs.nn || mm!= rhs.mm) - laerror("Mat += Mat of incompatible matrices"); + if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif - copyonwrite(); - cblas_zaxpy(nn*mm, &CONE, rhs[0], 1, (*this)[0], 1); - return *this; + SAME_LOC(*this, rhs); + copyonwrite(); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zaxpy(nn*mm, &CONE, rhs[0], 1, (*this)[0], 1); +#ifdef CUDALA + }else{ + cublasZaxpy(nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); + } +#endif + return *this; } - - -//and for general type +/***************************************************************************//** + * add a given general matrix (type T) \f$A\f$ to the current complex matrix + * @param[in] rhs matrix \f$A\f$ of type T + * @return reference to the modified matrix + ******************************************************************************/ template -NRMat & NRMat::operator+=(const NRMat &rhs) -{ +NRMat & NRMat::operator+=(const NRMat &rhs) { #ifdef DEBUG - if 
(nn != rhs.nn || mm!= rhs.mm) - laerror("Mat -= Mat of incompatible matrices"); -#endif - copyonwrite(); -#ifdef MATPTR - for (int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i] ; -#else - for (int i=0; i< nn*mm; i++) v[i] += rhs.v[i] ; + if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif + SAME_LOC(*this, rhs); + NOT_GPU(*this); + + copyonwrite(); + + #ifdef MATPTR + for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i]; + #else + for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i]; + #endif return *this; } -// Mat -= Mat +/***************************************************************************//** + * subtract a given real matrix \f$A\f$ from the current real matrix + * @param[in] rhs matrix \f$A\f$ + * @return reference to the modified matrix + ******************************************************************************/ template<> -NRMat & NRMat::operator-=(const NRMat &rhs) -{ +NRMat & NRMat::operator-=(const NRMat &rhs) { #ifdef DEBUG - if (nn != rhs.nn || mm!= rhs.mm) - laerror("Mat -= Mat of incompatible matrices"); + if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif SAME_LOC(*this,rhs); copyonwrite(); #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif - cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1); + cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1); #ifdef CUDALA - else - cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1); + }else{ + cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1); + } #endif return *this; } - +/***************************************************************************//** + * subtract a given complex matrix \f$A\f$ from the current complex matrix + * @param[in] rhs matrix \f$A\f$ + * @return reference to the modified matrix + ******************************************************************************/ template<> NRMat< complex > & -NRMat< complex >::operator-=(const NRMat< complex > &rhs) -{ +NRMat< complex >::operator-=(const NRMat< complex > &rhs) { #ifdef DEBUG - if (nn != rhs.nn || mm!= rhs.mm) - 
laerror("Mat -= Mat of incompatible matrices"); + if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif - copyonwrite(); - cblas_zaxpy(nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1); - return *this; + SAME_LOC(*this, rhs); + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zaxpy(nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1); +#ifdef CUDALA + }else{ + cublasZaxpy(nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); + } +#endif + return *this; } - -//and for general type +/***************************************************************************//** + * subtract a given general matrix (type T) \f$A\f$ from the current matrix + * @param[in] rhs matrix \f$A\f$ of type T + * @return reference to the modified matrix + ******************************************************************************/ template -NRMat & NRMat::operator-=(const NRMat &rhs) -{ +NRMat & NRMat::operator-=(const NRMat &rhs) { #ifdef DEBUG - if (nn != rhs.nn || mm!= rhs.mm) - laerror("Mat -= Mat of incompatible matrices"); -#endif - copyonwrite(); -#ifdef MATPTR - for (int i=0; i< nn*mm; i++) v[0][i] -= rhs.v[0][i] ; -#else - for (int i=0; i< nn*mm; i++) v[i] -= rhs.v[i] ; + if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif + SAME_LOC(*this, rhs); + NOT_GPU(*this); + + copyonwrite(); + + #ifdef MATPTR + for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i]; + #else + for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i]; + #endif return *this; } -// Mat += SMat +/***************************************************************************//** + * add a given sparse real matrix \f$A\f$ stored in packed form to the current + * real matrix + * @param[in] rhs symmetric real matrix \f$A\f$ in packed form + * @return reference to the modified matrix + * @see NRSMat + ******************************************************************************/ template<> -NRMat & NRMat::operator+=(const NRSMat &rhs) -{ +NRMat & 
NRMat::operator+=(const NRSMat &rhs) { #ifdef DEBUG - if (nn!=mm || nn!=rhs.nrows()) laerror("incompatible matrix size in Mat+=SMat"); + if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif const double *p = rhs; + + SAME_LOC(*this, rhs); copyonwrite(); - for (int i=0; i + ******************************************************************************/ template<> NRMat< complex > & NRMat< complex >::operator+=(const NRSMat< complex > &rhs) { #ifdef DEBUG - if (nn!=mm || nn!=rhs.nrows()) laerror("incompatible matrix size in Mat+=SMat"); + if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif const complex *p = rhs; + + SAME_LOC(*this, rhs); copyonwrite(); - for (int i=0; i + ******************************************************************************/ template -NRMat & NRMat::operator+=(const NRSMat &rhs) -{ +NRMat & NRMat::operator+=(const NRSMat &rhs) { #ifdef DEBUG - if (nn!=mm || nn!=rhs.nrows()) laerror("incompatible matrix size in Mat+=SMat"); + if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif - const T *p = rhs; - copyonwrite(); - for (int i=0; i + ******************************************************************************/ template<> NRMat & NRMat::operator-=(const NRSMat &rhs) { #ifdef DEBUG - if (nn!=mm || nn!=rhs.nrows()) laerror("incompatible matrix size in Mat-=SMat"); + if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif const double *p = rhs; + SAME_LOC(*this, rhs); copyonwrite(); - for (int i=0; i + ******************************************************************************/ template<> -NRMat< complex > & -NRMat< complex >::operator-=(const NRSMat< complex > &rhs) -{ +NRMat > & +NRMat >::operator-=(const NRSMat > &rhs) { #ifdef DEBUG - if (nn!=mm || nn!=rhs.nrows()) laerror("incompatible matrix size in Mat-=SMat"); + if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif const complex *p = rhs; + + SAME_LOC(*this, rhs); 
copyonwrite(); - for (int i=0; i + ******************************************************************************/ template -NRMat & NRMat::operator-=(const NRSMat &rhs) -{ +NRMat & NRMat::operator-=(const NRSMat &rhs) { #ifdef DEBUG - if (nn!=mm || nn!=rhs.nrows()) laerror("incompatible matrix size in Mat+=SMat"); + if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif - const T *p = rhs; - copyonwrite(); - for (int i=0; i -const double NRMat::dot(const NRMat &rhs) const -{ +const double NRMat::dot(const NRMat &rhs) const { #ifdef DEBUG - if(nn!=rhs.nn || mm!= rhs.mm) laerror("Mat.Mat incompatible matrices"); + if(nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices in NRMat::dot(const NRMat&)"); #endif - return cblas_ddot(nn*mm, (*this)[0], 1, rhs[0], 1); + double ret(0.0); + SAME_LOC(*this, rhs); +#ifdef CUDALA + if(location == cpu){ +#endif + ret = cblas_ddot(nn*mm, (*this)[0], 1, rhs[0], 1); +#ifdef CUDALA + }else{ + ret = cublasDdot(nn*mm, v, 1, rhs.v, 1); + } +#endif + return ret; } +/***************************************************************************//** + * compute scalar product of this matrix \f$A\f$ with given complex matrix \f$B\f$ + * i.e. 
determine \f$\sum_{i,j}A^{*}_{i,j}B_{i,j}\f$ + * @param[in] rhs matrix \f$B\f$ + * @return computed scalar product + ******************************************************************************/ template<> const complex -NRMat< complex >::dot(const NRMat< complex > &rhs) const -{ +NRMat >::dot(const NRMat > &rhs) const { #ifdef DEBUG - if(nn!=rhs.nn || mm!= rhs.mm) laerror("Mat.Mat incompatible matrices"); + if(nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices in NRMat >::dot(const NRMat >&)"); #endif - complex dot; - cblas_zdotc_sub(nn*mm, (*this)[0], 1, rhs[0], 1, - &dot); - return dot; + + complex ret(0.0, 0.0); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zdotc_sub(nn*mm, (*this)[0], 1, rhs[0], 1, &ret); +#ifdef CUDALA + }else{ + cuDoubleComplex val = cublasZdotc(nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1); + ret = *(reinterpret_cast*> (&val)); + } +#endif + return ret; } -// Mat * Mat +/***************************************************************************//** + * compute product of this matrix \f$A\f$ with given real matrix \f$B\f$ + * @param[in] rhs matrix \f$B\f$ + * @return computed product by value + ******************************************************************************/ template<> -const NRMat NRMat::operator*(const NRMat &rhs) const -{ +const NRMat NRMat::operator*(const NRMat &rhs) const { #ifdef DEBUG - if (mm != rhs.nn) laerror("product of incompatible matrices"); - if (rhs.mm <=0) laerror("illegal matrix dimension in gemm"); + if(mm != rhs.nn) laerror("incompatible matrices in NRMat::operator*(const NRMat&)"); + if(rhs.mm <= 0) laerror("illegal matrix dimension in gemm"); #endif -SAME_LOC(*this,rhs); - - NRMat result(nn, rhs.mm,rhs.getlocation()); + SAME_LOC(*this, rhs); + NRMat result(nn, rhs.mm, getlocation()); #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, 1.0, - *this, mm, rhs, rhs.mm, 0.0, result, rhs.mm); 
+ *this, mm, rhs, rhs.mm, 0.0, result, rhs.mm); #ifdef CUDALA - else - cublasDgemm('N','N',rhs.mm,nn,mm,1.0,rhs, rhs.mm,*this, mm, 0.0, result, rhs.mm); + }else{ + cublasDgemm('N', 'N', rhs.mm, nn, mm, 1.0, rhs, rhs.mm, *this, mm, 0.0, result, rhs.mm); + } #endif return result; } - - +/***************************************************************************//** + * compute product of this matrix \f$A\f$ with given complex matrix \f$B\f$ + * @param[in] rhs matrix \f$B\f$ + * @return computed product by value + ******************************************************************************/ template<> const NRMat< complex > -NRMat< complex >::operator*(const NRMat< complex > &rhs) const -{ +NRMat< complex >::operator*(const NRMat< complex > &rhs) const { #ifdef DEBUG - if (mm != rhs.nn) laerror("product of incompatible matrices"); + if(mm != rhs.nn) laerror("incompatible matrices in NRMat >::operator*(const NRMat >&)"); + if(rhs.mm <= 0) laerror("illegal matrix dimension in gemm"); +#endif + SAME_LOC(*this, rhs); + NRMat > result(nn, rhs.mm, getlocation()); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, + &CONE, (*this)[0], mm, rhs[0], rhs.mm, &CZERO, result[0], rhs.mm); +#ifdef CUDALA + }else{ + cublasZgemm('N', 'N', rhs.mm, nn, mm, CUONE, + (cuDoubleComplex*)rhs.v, rhs.mm, (cuDoubleComplex*)(this->v), mm, CUZERO, (cuDoubleComplex*)result.v, rhs.mm); + } #endif - NRMat< complex > result(nn, rhs.mm); - cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, - &CONE,(*this)[0], mm, rhs[0], - rhs.mm, &CZERO, result[0], rhs.mm); return result; } - -// Multiply by diagonal from L +/***************************************************************************//** + * multiply this real matrix \f$A\f$ by diagonal real matrix \f$D\f$ from left + * because of cuBlas implementation, \f$D\f$ is required to be placed in CPU memory + * @param[in] rhs real vector represeting the diagonal of 
matrix \f$D\f$ + ******************************************************************************/ template<> -void NRMat::diagmultl(const NRVec &rhs) -{ +void NRMat::diagmultl(const NRVec &rhs) { #ifdef DEBUG - if (nn != rhs.size()) laerror("incompatible matrix dimension in diagmultl"); + if(nn != rhs.size()) laerror("incompatible matrices in NRMat::diagmultl(const NRVec&)"); #endif + NOT_GPU(rhs); copyonwrite(); - for(int i=0; i +void NRMat< complex >::diagmultl(const NRVec< complex > &rhs) { +#ifdef DEBUG + if (nn != rhs.size()) laerror("incompatible matrices in NRMat >::diagmultl(const NRVec >&)"); +#endif + NOT_GPU(rhs); + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i +void NRMat::diagmultr(const NRVec &rhs) { +#ifdef DEBUG + if(mm != rhs.size()) laerror("incompatible matrices in NRMat::diagmultr(const NRVec&)"); +#endif + NOT_GPU(rhs); + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i -void NRMat< complex >::diagmultl(const NRVec< complex > &rhs) -{ +void NRMat< complex >::diagmultr(const NRVec< complex > &rhs) { #ifdef DEBUG - if (nn != rhs.size()) laerror("incompatible matrix dimension in diagmultl"); + if(mm != rhs.size()) laerror("incompatible matrices in NRMat >::diagmultr(const NRVec >&)"); #endif + NOT_GPU(rhs); copyonwrite(); - for (int i=0; i -void NRMat::diagmultr(const NRVec &rhs) -{ -#ifdef DEBUG - if (mm != rhs.size()) laerror("incompatible matrix dimension in diagmultr"); -#endif - copyonwrite(); - for (int i=0; i -void NRMat< complex >::diagmultr(const NRVec< complex > &rhs) -{ -#ifdef DEBUG - if (mm != rhs.size()) laerror("incompatible matrix dimension in diagmultl"); -#endif - copyonwrite(); - for (int i=0; i const NRMat -NRMat::operator*(const NRSMat &rhs) const -{ +NRMat::operator*(const NRSMat &rhs) const { #ifdef DEBUG - if (mm != rhs.nrows()) laerror("incompatible dimension in Mat*SMat"); + if(mm != rhs.nrows()) laerror("incompatible matrices int 
NRMat::operator*(const NRSMat &)"); +#endif + SAME_LOC(*this, rhs); + const int rhs_ncols = rhs.ncols(); + NRMat result(nn, rhs_ncols, getlocation()); + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i result(nn, rhs.ncols()); - for (int i=0; i const NRMat< complex > -NRMat< complex >::operator*(const NRSMat< complex > &rhs) const -{ +NRMat< complex >::operator*(const NRSMat< complex > &rhs) const { #ifdef DEBUG - if (mm != rhs.nrows()) laerror("incompatible dimension in Mat*SMat"); + if(mm != rhs.nrows()) laerror("incompatible matrices int NRMat >::operator*(const NRSMat > &)"); +#endif + SAME_LOC(*this, rhs); + const int rhs_ncols = rhs.ncols(); + NRMat > result(nn, rhs_ncols, getlocation()); + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i > result(nn, rhs.ncols()); - for (int i=0; i -NRMat &NRMat::conjugateme() {return *this;} - -template<> -NRMat< complex > & NRMat< complex >::conjugateme() -{ - copyonwrite(); - cblas_dscal(mm*nn, -1.0, (double *)((*this)[0])+1, 2); +NRMat& NRMat::conjugateme() { return *this; } -// transpose and optionally conjugate +/***************************************************************************//** + * conjugate this complex matrix \f$A\f$, i.e. 
do nothing :-) + * @return reference to the modified matrix + ******************************************************************************/ template<> -const NRMat NRMat::transpose(bool conj) const -{ - NRMat result(mm,nn); - for(int i=0; i >& NRMat >::conjugateme() { + copyonwrite(); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_dscal(mm*nn, -1.0, (double *)((*this)[0]) + 1, 2); +#ifdef CUDALA + }else{ + cublasDscal(mm*nn, -1.0, (double *)(this->v) + 1, 2); + } +#endif + return *this; } + +/***************************************************************************//** + * compute transpose (optionally conjugated) of this real matrix \f$A\f$ + * @param[in] conj conjugation flag, unused for real matrices + * @return transposed (conjugated) matrix by value + ******************************************************************************/ template<> -const NRMat< complex > -NRMat< complex >::transpose(bool conj) const -{ - NRMat< complex > result(mm,nn); - for (int i=0; i NRMat::transpose(bool conj) const { + + NRMat result(mm, nn, getlocation()); +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i +const NRMat > +NRMat >::transpose(bool conj) const { + NRMat > result(mm, nn, getlocation()); +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i void NRMat::gemm(const double &beta, const NRMat &a, const char transa, const NRMat &b, const char transb, - const double &alpha) -{ + const double &alpha) { + int k(transa=='n'?a.mm:a.nn); #ifdef DEBUG int l(transa=='n'?a.nn:a.mm); int kk(transb=='n'?b.nn:b.mm); int ll(transb=='n'?b.mm:b.nn); - if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in Mat:gemm()"); + if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat::gemm(...)"); if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm"); #endif -SAME_LOC3(*this,a,b); + + SAME_LOC3(*this, a, b); if (alpha==0.0 && beta==1.0) return; copyonwrite(); #ifdef CUDALA - if(location==cpu) + if(location == 
cpu){ #endif - cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans), - (transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a, - a.mm, b , b.mm, beta, *this , mm); + cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans), + (transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a, + a.mm, b , b.mm, beta, *this , mm); #ifdef CUDALA - else - cublasDgemm(transb,transa,mm,nn,k,alpha, b , b.mm, a,a.mm, beta, *this , mm); + }else{ + cublasDgemm(transb, transa, mm, nn, k, alpha, b, b.mm, a, a.mm, beta, *this, mm); + } #endif } - template<> -void NRMat< complex >::gemm(const complex & beta, - const NRMat< complex > & a, const char transa, - const NRMat< complex > & b, const char transb, +void NRMat >::gemm(const complex & beta, + const NRMat > & a, const char transa, + const NRMat > & b, const char transb, const complex & alpha) { int k(transa=='n'?a.mm:a.nn); @@ -1200,7 +2050,7 @@ void NRMat< complex >::gemm(const complex & beta, int l(transa=='n'?a.nn:a.mm); int kk(transb=='n'?b.nn:b.mm); int ll(transb=='n'?b.mm:b.nn); - if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in Mat:gemm()"); + if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat >::gemm(...)"); #endif if (alpha==CZERO && beta==CONE) return; @@ -1211,366 +2061,766 @@ void NRMat< complex >::gemm(const complex & beta, nn, mm, k, &alpha, a , a.mm, b , b.mm, &beta, *this , mm); } -// norm of Mat +/***************************************************************************//** + * compute the Frobenius norm of the current real matrix \f$A\f$, i.e. 
+ * \f[ \sqrt{\sum_{i=1}^{N}\sum_{j=1}^{M}\left|A_{i,j}\right|^2} \f] + * where \f$N\f$ and \f$M\f$ is the number of rows and columns, respectively + * @param[in] scalar real value subtracted from the diagonal elements + * @return computed norm + ******************************************************************************/ template<> -const double NRMat::norm(const double scalar) const -{ - if (!scalar) - { +const double NRMat::norm(const double scalar) const { + if(!scalar){ #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif - return cblas_dnrm2(nn*mm, (*this)[0], 1); + return cblas_dnrm2(nn*mm, (*this)[0], 1); #ifdef CUDALA - else - return cublasDnrm2(nn*mm, v, 1); -#endif + }else{ + return cublasDnrm2(nn*mm, v, 1); } +#endif + } -NOT_GPU(*this); + NOT_GPU(*this); - double sum = 0; - for (int i=0; i -const double NRMat< complex >::norm(const complex scalar) const -{ - if (scalar == CZERO) return cblas_dznrm2(nn*mm, (*this)[0], 1); - double sum = 0; - for (int i=0; i tmp; +const double NRMat >::norm(const complex scalar) const { + if(scalar == CZERO){ +#ifdef CUDALA + if(location == cpu){ +#endif + return cblas_dznrm2(nn*mm, (*this)[0], 1); +#ifdef CUDALA + }else{ + return cublasDznrm2(nn*mm, (cuDoubleComplex*)v, 1); + } +#endif + } + + NOT_GPU(*this); + double sum(0.0); + for(register int i=0; i tmp(0.0, 0.0); #ifdef MATPTR tmp = v[i][j]; #else tmp = v[i*mm+j]; #endif - if (i==j) tmp -= scalar; - sum += tmp.real()*tmp.real()+tmp.imag()*tmp.imag(); + if(i == j) tmp -= scalar; + const double re = tmp.real(); + const double im = tmp.imag(); + sum += re*re + im*im; } return std::sqrt(sum); } - - - -// axpy: this = a * Mat +/***************************************************************************//** + * perform the axpy operation on the current real matrix \f$A\f$, i.e. 
+ * \f[ A \leftarrow A + \alpha{}B \f] + * for real matrix \f$B\f$ + * @param[in] alpha \f$\alpha\f$ parameter + * @param[in] mat real matrix \f$B\f$ + ******************************************************************************/ template<> -void NRMat::axpy(const double alpha, const NRMat &mat) -{ +void NRMat::axpy(const double alpha, const NRMat &mat) { #ifdef DEBUG - if (nn!=mat.nn || mm!=mat.mm) laerror("daxpy of incompatible matrices"); + if (nn != mat.nn || mm != mat.mm) laerror("incompatible matrices in NRMat::axpy(...)"); #endif + SAME_LOC(*this, mat); copyonwrite(); - cblas_daxpy(nn*mm, alpha, mat, 1, *this, 1); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_daxpy(nn*mm, alpha, mat, 1, *this, 1); +#ifdef CUDALA + }else{ + cublasDaxpy(nn*mm, alpha, mat, 1, *this, 1); + } +#endif } - - +/***************************************************************************//** + * perform the axpy operation on the current complex matrix \f$A\f$, i.e. + * \f[ A \leftarrow A + \alpha{}B \f] + * for real matrix \f$B\f$ + * @param[in] alpha complex parameter \f$\alpha\f$ + * @param[in] mat complex matrix \f$B\f$ + ******************************************************************************/ template<> -void NRMat< complex >::axpy(const complex alpha, - const NRMat< complex > & mat) -{ +void NRMat >::axpy(const complex alpha, + const NRMat > & mat) { #ifdef DEBUG - if (nn!=mat.nn || mm!=mat.mm) laerror("zaxpy of incompatible matrices"); + if (nn != mat.nn || mm != mat.mm) laerror("incompatible matrices in NRMat >::axpy(...)"); #endif + SAME_LOC(*this, mat); copyonwrite(); - cblas_zaxpy(nn*mm, &alpha, mat, 1, (*this)[0], 1); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zaxpy(nn*mm, &alpha, mat, 1, (*this)[0], 1); +#ifdef CUDALA + }else{ + const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag()); + cublasZaxpy(nn*mm, _alpha, (cuDoubleComplex*)(mat[0]), 1, (cuDoubleComplex*)(this->v), 1); + } +#endif } - - - -// trace of Mat 
+/***************************************************************************//** + * compute the trace of current genenal square matrix \f$A\f$, i.e. + * \f[ \sum_{i=1}^{N} A_{i,i} \f] + * where \f$N\f$ is the order of the matrix + ******************************************************************************/ template -const T NRMat::trace() const -{ +const T NRMat::trace() const { #ifdef DEBUG - if (nn != mm) laerror("no-square matrix in Mat::trace()"); + if (nn != mm) laerror("nonsquare matrix in NRMat::trace()"); #endif -T sum=0; + NOT_GPU(*this); + T sum(0); #ifdef MATPTR -for (int i=0; idivide == true NULL + * \li divide == false pointer to the first element of r + ******************************************************************************/ template<> -const double * NRMat::diagonalof(NRVec &r, const bool divide, bool cache) const -{ - if (r.size() != nn) laerror("diagonalof() incompatible vector"); - -double a; - -r.copyonwrite(); - -if(nn==mm) -{ -#ifdef MATPTR -if(divide) for (int i=0; i< nn; i++) if((a=v[i][i])) r[i]/=a; -else for (int i=0; i< nn; i++) r[i] = v[i][i]; -#else -if(divide) {int i,j; for (i=j=0; j< nn; ++j, i+=nn+1) if((a=v[i])) r[j] /=a;} -else {int i,j; for (i=j=0; j< nn; ++j, i+=nn+1) r[j] = v[i];} +const double* NRMat::diagonalof(NRVec &r, const bool divide, bool cache) const { + double *ret(NULL); +#ifdef DEBUG + if(r.size() != mm) laerror("incompatible vector in NRMat::diagonalof(...)"); #endif -} -else //non-square -{ -for (int i=0; i< mm; i++) - { -#ifdef MATPTR - a= cblas_ddot(nn,v[0]+i,mm,v[0]+i,mm); -#else - a=cblas_ddot(nn,v+i,mm,v+i,mm); + + double a(0.0);//!< temporary variable for storing the scaling factor + + SAME_LOC(*this, r); + if(divide){ + NOT_GPU(*this); + } + + r.copyonwrite(); +#ifdef CUDALA + if(location == cpu){ #endif - if(divide) {if(a) r[i]/=a;} - else r[i] = a; - } -} - -return divide?NULL:&r[0]; -} - - -//set diagonal -template<> -void NRMat::diagonalset(const NRVec &r) -{ - if (r.size() != nn) 
laerror("diagonalset() incompatible vector"); - if(nn!=mm) laerror("diagonalset only for square matrix"); - -copyonwrite(); - -#ifdef MATPTR -for (int i=0; i< nn; i++) v[i][i] = r[i]; -#else -{int i,j; for (i=j=0; j< nn; ++j, i+=nn+1) v[i] = r[j];} -#endif -} - -template<> -void NRMat::orthonormalize(const bool rowcol, const NRSMat *metric) //modified Gram-Schmidt -{ -if(metric) //general metric -{ -if(rowcol) //vectors are rows - { - if((*metric).nrows() != mm) laerror("incompatible metric in orthonormalize"); - for(int j=0; j tmp = *metric * (*this).row(i); - double fact = cblas_ddot(mm,(*this)[j],1,tmp,1); - cblas_daxpy(mm,-fact,(*this)[i],1,(*this)[j],1); - } - NRVec tmp = *metric * (*this).row(j); - double norm = cblas_ddot(mm,(*this)[j],1,tmp,1); - if(norm<=0.) laerror("zero vector in orthonormalize or nonpositive metric"); - cblas_dscal(mm,1./std::sqrt(norm),(*this)[j],1); - } - } -else //vectors are columns - { - if((*metric).nrows() != nn) laerror("incompatible metric in orthonormalize"); - for(int j=0; j tmp = *metric * (*this).column(i); - double fact = cblas_ddot(nn,&(*this)[0][j],mm,tmp,1); - cblas_daxpy(nn,-fact,&(*this)[0][i],mm,&(*this)[0][j],mm); - } - NRVec tmp = *metric * (*this).column(j); - double norm = cblas_ddot(nn,&(*this)[0][j],mm,tmp,1); - if(norm<=0.) laerror("zero vector in orthonormalize or nonpositive metric"); - cblas_dscal(nn,1./std::sqrt(norm),&(*this)[0][j],mm); - } - } -} - -else //unit metric - -{ -if(rowcol) //vectors are rows - { - for(int j=0; j tmp(mm, cpu); + for(int i=0;i +void NRMat::diagonalset(const NRVec &r) { +#ifdef DEBUG + if(r.size() != nn) laerror("incompatible vectors int NRMat::diagonalset(...)"); + if(nn != mm) laerror("NRMat::diagonalset(...) 
can be used only for square matrices"); +#endif + + SAME_LOC(*this, r); + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + +#ifdef MATPTR + for (int i=0; i +void NRMat >::diagonalset(const NRVec > &r) { +#ifdef DEBUG + if(r.size() != nn) laerror("incompatible vectors int NRMat >::diagonalset(...)"); + if(nn != mm) laerror("NRMat >::diagonalset(...) can be used only for square matrices"); +#endif + SAME_LOC(*this, r); + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + #ifdef MATPTR + for (int i=0; iv), 1); + } +#endif +} + +/***************************************************************************//** + * perform straightforward orthonormalization via modified Gram-Schmidt process + * @param[in] rowcol flag regarding the interpretation of the current matrix + * \li \c true the vectors being orthonormalized are stored as rows + * \li \c false the vectors being orthonormalized are stored as columns + * @param[in] metric pointer to real symmetric matrix stored in packed form which + * is used in computing the inner products in the process, the standard inner product + * is taken into account for metric == NULL + * @return void + ******************************************************************************/ +template<> +void NRMat::orthonormalize(const bool rowcol, const NRSMat *metric) { + + SAME_LOC(*this, *metric); + if(metric){ + if(rowcol){ //vectors are stored in rows + if((*metric).nrows() != mm) laerror("incompatible metric in NRMat::orthonormalize(rowcol = true)"); + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int j=0; j tmp = *metric * (*this).row(i); + const double fact = cblas_ddot(mm,(*this)[j],1,tmp,1); + cblas_daxpy(mm,-fact,(*this)[i],1,(*this)[j],1); + } + const NRVec tmp = *metric * (*this).row(j); + const double norm = cblas_ddot(mm,(*this)[j],1,tmp,1); + if(norm <= 0.) 
laerror("zero vector or nonpositive metric in NRMat::orthonormalize(...)"); + cblas_dscal(mm,1./std::sqrt(norm),(*this)[j],1); + } +#ifdef CUDALA + }else{ + for(register int j=0; j tmp(mm, location); + tmp = *metric * (*this).row(i); + const double fact = cublasDdot(mm, (*this)[j], 1, tmp, 1); + cublasDaxpy(mm, -fact, (*this)[i], 1, (*this)[j], 1); + } + NRVec tmp(mm, location); + tmp = *metric * (*this).row(j); + const double norm = cublasDdot(mm, (*this)[j], 1, tmp, 1); + if(norm <= 0.) laerror("zero vector or nonpositive metric in NRMat::orthonormalize(...)"); + cublasDscal(mm, 1./std::sqrt(norm), (*this)[j], 1); + } + + } +#endif + }else{ //vectors are stored in columns +#ifdef CUDALA + if(location = cpu){ +#endif + if((*metric).nrows() != nn) laerror("incompatible metric in NRMat::orthonormalize(rowcol = false)"); + for(register int j=0; j tmp = *metric * (*this).column(i); + double fact = cblas_ddot(nn, &(*this)[0][j], mm, tmp, 1); + cblas_daxpy(nn, -fact, &(*this)[0][i], mm, &(*this)[0][j], mm); + } + NRVec tmp = *metric * (*this).column(j); + double norm = cblas_ddot(nn, &(*this)[0][j], mm, tmp, 1); + if(norm <= 0.) laerror("zero vector or nonpositive metric in NRMat::orthonormalize(...)"); + cblas_dscal(nn, 1./std::sqrt(norm), &(*this)[0][j], mm); + } +#ifdef CUDALA + }else{ + if((*metric).nrows() != nn) laerror("incompatible metric in NRMat::orthonormalize(rowcol = false)"); + for(register int j=0; j tmp(nn, location); + tmp = *metric * (*this).column(i); + double fact = cublasDdot(nn, &(*this)[0][j], mm, tmp, 1); + cublasDaxpy(nn, -fact, &(*this)[0][i], mm, &(*this)[0][j], mm); + } + NRVec tmp(nn, location); + tmp = *metric * (*this).column(j); + double norm = cublasDdot(nn, &(*this)[0][j], mm, tmp, 1); + if(norm <= 0.) 
laerror("zero vector or nonpositive metric in NRMat::orthonormalize(...)"); + cublasDscal(nn, 1./std::sqrt(norm), &(*this)[0][j], mm); + } + } +#endif + } + }else{ //unit metric (standard inner product) will be used + if(rowcol){ +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int j=0; j::orthonormalize(...)"); + cblas_dscal(mm,1./norm,(*this)[j],1); + } +#ifdef CUDALA + }else{ + for(register int j=0; j::orthonormalize(...)"); + cublasDscal(mm, 1./norm, (*this)[j], 1); + } + } +#endif + }else{ // vectors are stored in columns +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int j=0; j::orthonormalize(...)"); + cblas_dscal(nn, 1./norm, &(*this)[0][j], mm); + } +#ifdef CUDALA + }else{ + for(register int j=0; j::orthonormalize(...)"); + cublasDscal(nn, 1./norm, &(*this)[0][j], mm); + } + } +#endif + } + } //end of the unit-metric branch +} + +/***************************************************************************//** + * interchange the order of the rows of the current (real) matrix + * @return reference to the modified matrix + ******************************************************************************/ +template<> +NRMat& NRMat::swap_rows(){ + copyonwrite(); + const int n_pul = this->nn >> 1; + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i +NRMat >& NRMat >::swap_rows(){ + copyonwrite(); + const int n_pul = this->nn >> 1; + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i +NRMat& NRMat::swap_rows(){ + T tmp; + copyonwrite(); + const int n_pul = this->nn >> 1; + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i::swap_rows"); + for(register int i=0; i -NRMat& NRMat::SwapRows(){ - copyonwrite(); - const int n_pul = this->nn / 2; - double * const dataIn = this->v; - - for(register int i=0; i -NRMat >& NRMat >::SwapRows(){ +NRMat& NRMat::swap_cols(){ copyonwrite(); - const int n = this->nn; - const int m = this->mm; - const int n_pul = this->nn / 2; - complex * const 
dataIn = this->v; + const int m_pul = mm >> 1; - for(register int i=0; i +NRMat >& NRMat >::swap_cols(){ + copyonwrite(); + const int m_pul = mm >> 1; + +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i -NRMat& NRMat::SwapRows(){ +NRMat& NRMat::swap_cols(){ + T tmp; copyonwrite(); - const int n = this->nn; - const int m = this->mm; - const int n_pul = this->nn / 2; - T * const dataIn = this->v; + const int m_pul = mm >> 1; +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0; i::swap_cols"); + for(register int i=0; i +NRMat& NRMat::swap_rows_cols(){ + const int n_pul = nn >> 1; + const int m_pul = mm >> 1; + double tmp(0.0); - for(register int j=0;j -NRMat& NRMat::SwapCols(){ +NRMat >& NRMat >::swap_rows_cols(){ + const int n_pul = nn >> 1; + const int m_pul = mm >> 1; + + complex tmp(0.0, 0.0); + copyonwrite(); - const int n = this->nn; - const int m = this->mm; - const int m_pul = m / 2; - double * const dataIn = this->v; +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int i=0;i -NRMat >& NRMat >::SwapCols(){ - copyonwrite(); - const int n_pul = this->nn / 2; - const int m_pul = this->mm / 2; - complex * const dataIn = this->v; - - for(register int i=0; i -NRMat& NRMat::SwapCols(){ - copyonwrite(); - const int n_pul = nn / 2; - const int m_pul = mm / 2; - T * const dataIn = this->v; - - for(register int i=0; i)*mm); + cublasZswap(mm, (cuDoubleComplex*)(v + n_pul*mm + mm - 1), -1, (cuDoubleComplex*)gpu_ptr, 1); + cublasZcopy(mm, (cuDoubleComplex*)gpu_ptr, 1, (cuDoubleComplex*)(v + n_pul*mm), 1); + gpufree(gpu_ptr); } } +#endif return *this; } -//------------------------------------------------------------------------------ -// for a matrix A(1:nn,1:mm) performs Fortran-like -// operation A(nn:-1:1,mm:-1:1) -//------------------------------------------------------------------------------ + +/***************************************************************************//** + * interchange the order of the rows and 
columns of the current + * general matrix \f$A\f$ of type T, i.e. perform the operation + * \f[A_{i,j}\leftarrow A_{nn-1-i, mm-1-j}\f] + * where \f$0\leq{}i\le{}nn\f$ and \f$0\leq{}j\le{}mm\f$ + * @return reference to the modified matrix + ******************************************************************************/ template -NRMat& NRMat::SwapRowsCols(){ - this->copyonwrite(); - const int n = this->nn; - const int m = this->mm; - T * const dataIn = this->v; - T * const dataOut = this->v; - - const int Dim = n*m; - for(register int i=0;i& NRMat::swap_rows_cols(){ + const int n_pul = nn >> 1; + const int m_pul = mm >> 1; + const int dim = nn*mm; + T *data_ptr; + T tmp; + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + data_ptr = (*this)[0]; + const int dim_pul = dim >> 1; + for(register int i=0; i<=dim_pul; i++){ + tmp = data_ptr[i]; + data_ptr[i] = data_ptr[dim - i - 1]; + data_ptr[dim - i - 1] = tmp; + } +#ifdef CUDALA + }else{ + if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat::swap_rows_cols"); + for(register int i=0; i; template class NRMat >; template class NRMat; diff --git a/mat.h b/mat.h index 40b63d5..d28a03d 100644 --- a/mat.h +++ b/mat.h @@ -1,4 +1,5 @@ -/* +/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */ +/******************************************************************************* LA: linear algebra C++ interface library Copyright (C) 2008 Jiri Pittner or complex versions written by Roman Curik @@ -16,55 +17,101 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . 
-*/ +*******************************************************************************/ #ifndef _LA_MAT_H_ #define _LA_MAT_H_ #include "la_traits.h" namespace LA { + +/***************************************************************************//** + * \brief NRMat class template implementing the matrix interface + * @see NRVec, NRSMat + ******************************************************************************/ template -class NRMat { +class NRMat{ protected: - int nn; - int mm; + int nn;//!< number of rows + int mm;//!< number of columns #ifdef MATPTR - T **v; + T **v;//!< pointer to the array of pointers pointing to the beginings of individual rows #else - T *v; + T *v;//!< pointer to the data stored continuously in emmory #endif - int *count; + int *count;//!< reference counter #ifdef CUDALA - GPUID location; + GPUID location; #endif public: friend class NRVec; friend class NRSMat; - inline NRMat() : nn(0), mm(0), v(0), count(0) - { -#ifdef CUDALA - location = DEFAULT_LOC; -#endif - }; - inline NRMat(const int n, const int m ,const GPUID loc= undefined); - inline NRMat(const T &a, const int n, const int m); - NRMat(const T *a, const int n, const int m); - inline NRMat(const NRMat &rhs); - NRMat(const typename LA_traits_complex::NRMat_Noncomplex_type &rhs, bool imagpart=false); //construct complex from real - explicit NRMat(const NRSMat &rhs); -#ifdef MATPTR - explicit NRMat(const NRVec &rhs, const int n, const int m, const int offset=0) :NRMat(&rhs[0][0] + offset ,n,m) {if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");}; -#else - explicit NRMat(const NRVec &rhs, const int n, const int m, const int offset=0); -#endif + //! 
standard destructor ~NRMat(); + + /***************************************************************************//** + * \brief inlined constructor creating zero matrix of general type T + ******************************************************************************/ + inline NRMat() : nn(0), mm(0), v(0), count(0){ + #ifdef CUDALA + location = DEFAULT_LOC; + #endif + }; + + /***************************************************************************//** + * \brief Inlined constructor creating matrix of given size and location. + * Because of performance reasons, no incialization is done. + * @param[in] n vector size (count of elements) + * @param[in] loc location of the underlying data (CPU/GPU) + ******************************************************************************/ + inline NRMat(const int n, const int m, const GPUID loc = undefined); + + //! inlined constructor creating matrix of given size filled with prescribed value and stored at given location + inline NRMat(const T &a, const int n, const int m, const GPUID loc); + + //! inlined constructor creating matrix of given size filled with prescribed value + inline NRMat(const T &a, const int n, const int m); + + //! inlined constructor creating matrix of given size filled with data located at given memory location + NRMat(const T *a, const int n, const int m); + + //! inlined copy-constructor + inline NRMat(const NRMat &rhs); + + //! complexifying constructor + NRMat(const typename LA_traits_complex::NRMat_Noncomplex_type &rhs, bool imagpart = false); + + //! explicit constructor converting symmetric matrix stored in packed form into a NRMat object + explicit NRMat(const NRSMat &rhs); + + //! 
explicit constructor converting vector into a NRMat object +#ifdef MATPTR + explicit NRMat(const NRVec &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){ + if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); + }; +#else + explicit NRMat(const NRVec &rhs, const int n, const int m, const int offset = 0); +#endif + #ifdef MATPTR const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v[0],rhs.v[0],nn*mm);} //memcmp for scalars else elementwise #else const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v,rhs.v,nn*mm);} //memcmp for scalars else elementwise #endif + const bool operator==(const NRMat &rhs) const {return !(*this != rhs);}; + + //! determine the count of references to this object inline int getcount() const {return count?*count:0;} + + //! ensure that the data of this matrix are referenced exactly once + void copyonwrite(); + + /***************************************************************************//** + * routines for CUDA related stuff + * \li getlocation() gets the protected data member location + * \li moveto(const GPUID) moves underlying data between CPU/GPU memory + ******************************************************************************/ #ifdef CUDALA inline GPUID getlocation() const {return location;} void moveto(const GPUID dest); @@ -72,97 +119,240 @@ public: inline GPUID getlocation() const {return cpu;} void moveto(const GPUID dest) {}; #endif - NRMat & operator=(const NRMat &rhs); //assignment - void randomize(const typename LA_traits::normtype &x); //fill with random numbers - NRMat & operator=(const T &a); //assign a to diagonal - NRMat & operator|=(const NRMat &rhs); //assignment to a new copy - NRMat & operator+=(const T &a); //add diagonal - NRMat & operator-=(const T &a); //substract diagonal - NRMat & 
operator*=(const T &a); //multiply by a scalar - NRMat & operator+=(const NRMat &rhs); - NRMat & operator-=(const NRMat &rhs); - NRMat & operator^=(const NRMat &rhs); //Hadamard (element-wise) product - NRMat & operator+=(const NRSMat &rhs); - NRMat & operator-=(const NRSMat &rhs); - const NRMat operator-() const; //unary minus - inline const NRMat operator+(const T &a) const; - inline const NRMat operator-(const T &a) const; - inline const NRMat operator*(const T &a) const; - inline const NRMat operator+(const NRMat &rhs) const; - inline const NRMat operator-(const NRMat &rhs) const; - inline const NRMat operator+(const NRSMat &rhs) const; - inline const NRMat operator-(const NRSMat &rhs) const; - const T dot(const NRMat &rhs) const; // scalar product of Mat.Mat//@@@for complex do conjugate - const NRMat operator*(const NRMat &rhs) const; // Mat * Mat - const NRMat oplus(const NRMat &rhs) const; //direct sum - const NRMat otimes(const NRMat &rhs, bool reversecolumns=false) const; //direct product - void diagmultl(const NRVec &rhs); //multiply by a diagonal matrix from L - void diagmultr(const NRVec &rhs); //multiply by a diagonal matrix from R - const NRSMat transposedtimes() const; //A^T . A - const NRSMat timestransposed() const; //A . 
A^T - const NRMat operator*(const NRSMat &rhs) const; // Mat * Smat - const NRMat operator&(const NRMat &rhs) const; // direct sum - const NRMat operator|(const NRMat &rhs) const; // direct product - const NRVec operator*(const NRVec &rhs) const {NRVec result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec - const NRVec > operator*(const NRVec > &rhs) const {NRVec > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec - const NRVec rsum() const; //sum of rows - const NRVec csum() const; //sum of columns - void orthonormalize(const bool rowcol, const NRSMat *metric=NULL);//orthonormalize (true - vectors are rows) - const NRVec row(const int i, int l= -1) const; //row of, efficient - const NRVec column(const int j, int l= -1) const {if(l<0) l=nn; NRVec r(l); for(int i=0; i &, const bool divide=0, bool cache=false) const; //get diagonal - void diagonalset(const NRVec &); //set diagonal elements - void gemv(const T beta, NRVec &r, const char trans, const T alpha, const NRVec &x) const {r.gemv(beta,*this,trans,alpha,x);}; - void gemv(const T beta, NRVec > &r, const char trans, const T alpha, const NRVec > &x) const {r.gemv(beta,*this,trans,alpha,x);}; - inline T* operator[](const int i); //subscripting: pointer to row i - inline const T* operator[](const int i) const; - inline T& operator()(const int i, const int j); // (i,j) subscripts - inline const T& operator()(const int i, const int j) const; - inline int nrows() const; - inline int ncols() const; - inline int size() const; - void get(int fd, bool dimensions=1, bool transposed=false); - void put(int fd, bool dimensions=1, bool transposed=false) const; - void copyonwrite(); - void clear() {if(nn&&mm) {copyonwrite(); LA_traits::clear((*this)[0],nn*mm);}}; //zero out - void resize(int n, int m); - inline operator T*(); //get a pointer to the data - inline operator const T*() const; - NRMat & transposeme(int n=0); // square matrices only - NRMat & 
conjugateme(); // square matrices only - const NRMat transpose(bool conj=false) const; - const NRMat conjugate() const; - const NRMat submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const; //there is also independent less efficient routine for generally indexed submatrix - void storesubmatrix(const int fromrow, const int fromcol, const NRMat &rhs); //overwrite a block with external matrix - void gemm(const T &beta, const NRMat &a, const char transa, const NRMat &b, - const char transb, const T &alpha);//this = alpha*op( A )*op( B ) + beta*this - void fprintf(FILE *f, const char *format, const int modulo) const; - void fscanf(FILE *f, const char *format); - const typename LA_traits::normtype norm(const T scalar=(T)0) const; - void axpy(const T alpha, const NRMat &x); // this += a*x - inline const T amax() const; - const T trace() const; - NRMat & SwapRows(); - NRMat & SwapCols(); - NRMat & SwapRowsCols(); -//members concerning sparse matrix + //! fill the matrix with pseudorandom numbers (uniform distribution) + void randomize(const typename LA_traits::normtype &x); + + //! assigment operator performing shallow copy + NRMat & operator=(const NRMat &rhs); + //! assigment operator performing deep copy + NRMat & operator|=(const NRMat &rhs); + + //! assign scalar value to the diagonal elements + NRMat & operator=(const T &a); + //! add scalar value to the diagonal elements + NRMat & operator+=(const T &a); + //! subtract scalar value to the diagonal elements + NRMat & operator-=(const T &a); + + //! multiply by a scalar value + NRMat & operator*=(const T &a); + + //! add given matrix + NRMat & operator+=(const NRMat &rhs); + //! subtract given matrix + NRMat & operator-=(const NRMat &rhs); + //! Hadamard element-wise product + NRMat & operator^=(const NRMat &rhs); + + //! add symmetric matrix stored in packed form + NRMat & operator+=(const NRSMat &rhs); + //! 
subtract symmetric matrix stored in packed form + NRMat & operator-=(const NRSMat &rhs); + + //! unary minus + const NRMat operator-() const; + + //! add scalar value to all matrix elements and return the result by value + inline const NRMat operator+(const T &a) const; + //! subtract scalar value from all matrix elements and return the result by value + inline const NRMat operator-(const T &a) const; + //! multiply all matrix elements by a scalar value and return the result by value + inline const NRMat operator*(const T &a) const; + + //! add given matrix and return the result by value + inline const NRMat operator+(const NRMat &rhs) const; + //! add given symmetric matrix stored in packed form and return the result by value + inline const NRMat operator+(const NRSMat &rhs) const; + + //! subtract given matrix and return the result by value + inline const NRMat operator-(const NRMat &rhs) const; + //! subtract given symmetric matrix stored in packed form and return the result by value + inline const NRMat operator-(const NRSMat &rhs) const; + + //! multiply by given matrix and return the result by value + const NRMat operator*(const NRMat &rhs) const; + //! multiply by given symmetric matrix stored in packed form and return the result by value + const NRMat operator*(const NRSMat &rhs) const; + + //! direct sum of two matrices + const NRMat operator&(const NRMat &rhs) const; + //! direct product of two matrices + const NRMat operator|(const NRMat &rhs) const; + + //! multiply by a vector + const NRVec operator*(const NRVec &rhs) const { + NRVec result(nn, rhs.getlocation()); + result.gemv((T)0, *this, 'n', (T)1, rhs); + return result; + }; + //! multiply this matrix of general type T by vector of type complex + const NRVec > operator*(const NRVec > &rhs) const { + NRVec > result(nn, rhs.getlocation()); + result.gemv((T)0, *this, 'n', (T)1, rhs); + return result; + }; + + //! 
inner product of two matrices (taking conjugation into account in the complex case) + const T dot(const NRMat &rhs) const; + + //! direct sum + const NRMat oplus(const NRMat &rhs) const; + //! direct product + const NRMat otimes(const NRMat &rhs, bool reversecolumns = false) const; + + //! multiply by diagonal matrix from left + void diagmultl(const NRVec &rhs); + //! multiply by diagonal matrix from right + void diagmultr(const NRVec &rhs); + + //! for this matrix \f$A\f$ compute \f$A^\mathrm{T}\cdot{}A\f$ + const NRSMat transposedtimes() const; + //! for this matrix \f$A\f$ compute \f$A\cdot{}A^\mathrm{T}\f$ + const NRSMat timestransposed() const; + + //! sum the rows + const NRVec rsum() const; + //! sum the columns + const NRVec csum() const; + + //! orthonormalize this matrix + void orthonormalize(const bool rowcol, const NRSMat *metric = NULL); + + //! get the ith row + const NRVec row(const int i, int l = -1) const; + + //! get the jth column + const NRVec column(const int j, int l = -1) const { + NOT_GPU(*this); + if(l < 0) l = nn; + NRVec r(l); + for(register int i=0; i &, const bool divide = 0, bool cache = false) const; + //! set diagonal elements + void diagonalset(const NRVec &); + + //! perform the gemv operation with vector of type T + void gemv(const T beta, NRVec &r, const char trans, const T alpha, const NRVec &x) const { r.gemv(beta, *this, trans, alpha, x); }; + //! perform the gemv operation with vector of type complex + void gemv(const T beta, NRVec > &r, const char trans, const T alpha, const NRVec > &x) const { r.gemv(beta, *this, trans, alpha, x); }; + + //! determine the pointer to the ith row + inline T* operator[](const int i); + //! determine the const pointer to the ith row + inline const T* operator[](const int i) const; + + //! get the reference to the element with indices (i,j) + inline T& operator()(const int i, const int j); + //! 
get the const reference to the element with indices (i,j) + inline const T& operator()(const int i, const int j) const; + //! get the copy of the element with indices (i,j) + inline const T get_ij(const int i, const int j) const; + + //! get the number of rows + inline int nrows() const; + //! get the number of columns + inline int ncols() const; + //! get the number of matrix elements + inline int size() const; + + //! unformatted input + void get(int fd, bool dimensions = 1, bool transposed = false); + //! unformatted output + void put(int fd, bool dimensions = 1, bool transposed = false) const; + //! formatted output + void fprintf(FILE *f, const char *format, const int modulo) const; + //! formatted input + void fscanf(FILE *f, const char *format); + + //! set all matrix elements equal to zero + void clear(){ + if(nn&&mm){ + copyonwrite(); + LA_traits::clear((*this)[0], nn*mm); + } + }; + + //! resize the matrix + void resize(int n, int m); + + //! get the pointer to the data + inline operator T*(); + //! get the const pointer to the data + inline operator const T*() const; + + //! in case of square matrix, transpose the leading minor of order n + NRMat& transposeme(const int n = 0); + //! conjugate a square matrix + NRMat& conjugateme(); + + //! transpose this matrix and return the result by value + const NRMat transpose(bool conj = false) const; + //! conjugate this matrix and return the result by value + const NRMat conjugate() const; + + //! extract specified submatrix + const NRMat submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const; + + //! store given matrix at given position into the current matrix + void storesubmatrix(const int fromrow, const int fromcol, const NRMat &rhs); + + //! perform the \b gemm operation + void gemm(const T &beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const T &alpha); + + //! 
compute the norm of this matrix + const typename LA_traits::normtype norm(const T scalar = (T)0) const; + + //! add up a scalar multiple of given matrix to the current matrix + void axpy(const T alpha, const NRMat &x); + + //! maximal element in the absolute value + inline const T amax() const; + //! minimal element in the absolute value + inline const T amin() const; + + //! determine the sum of the diagonal elements + const T trace() const; + + //! swap the order of the rows of the current matrix + NRMat & swap_rows(); + //! swap the order of the columns of the current matrix + NRMat & swap_cols(); + //! swap the order of the rows and columns of the current matrix + NRMat & swap_rows_cols(); + + //! multiply by sparse matrix SparseSMat operator*(const SparseSMat &rhs) const; + + //! explicit constructor converting sparse matrix into \c NRMat object explicit NRMat(const SparseMat &rhs); // dense from sparse - explicit NRMat(const SparseSMat &rhs); // dense from sparse + //! explicit constructor converting sparse symmetric matrix into \c NRMat object + explicit NRMat(const SparseSMat &rhs); + + //! add up given sparse matrix NRMat & operator+=(const SparseMat &rhs); + //! subtract given sparse matrix NRMat & operator-=(const SparseMat &rhs); - void gemm(const T &beta, const SparseMat &a, const char transa, const NRMat &b, const char transb, const T &alpha);//this = alpha*op( A )*op( B ) + beta*this - inline void simplify() {}; //just for compatibility with sparse ones - bool issymmetric() const {return 0;}; + + //! 
perform the \b gemm operation + void gemm(const T &beta, const SparseMat &a, const char transa, const NRMat &b, const char transb, const T &alpha); + + inline void simplify() {}; + bool issymmetric() const { return 0; }; + #ifndef NO_STRASSEN -//Strassen's multiplication (better than n^3, analogous syntax to gemm) - void strassen(const T beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const T alpha);//this := alpha*op( A )*op( B ) + beta*this + //! Strassen's multiplication (better than \f$\mathacal{O}(n^3)\f$, analogous syntax to \see NRMat::gemm() ) + void strassen(const T beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const T alpha); void s_cutoff(const int,const int,const int,const int) const; #endif }; }//namespace + //due to mutual includes this has to be after full class declaration #include "vec.h" #include "smat.h" @@ -170,107 +360,158 @@ public: #include "sparsesmat.h" namespace LA { -// ctors + +/***************************************************************************//** + * matrix constructor + * @param[in] n number of rows of the matrix being created + * @param[in] m number of cols of the matrix being created + * @param[in] loc location for storing the matrix + * @see count, v, location + ******************************************************************************/ template -NRMat::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) -{ +NRMat::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) { + T* p; *count = 1; + const int nm = n*m; #ifdef CUDALA - location= (loc==undefined?DEFAULT_LOC:loc); - if(location==cpu) - { -#endif -#ifdef MATPTR - v = new T*[n]; - v[0] = new T[m*n]; - for (int i=1; i -NRMat::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) -{ -#ifdef CUDALA - location=DEFAULT_LOC; -#endif - - int i; +NRMat::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new 
int) { + const int nm = n*m; T *p; *count = 1; -#ifdef CUDALA - if(location==cpu) - { -#endif -#ifdef MATPTR - v = new T*[n]; - p = v[0] = new T[m*n]; - for (int i=1; i -NRMat::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) -{ +NRMat::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) { + const int nm = n*m; + T *p; + *count = 1; + #ifdef CUDALA - location=DEFAULT_LOC; + location = DEFAULT_LOC; + if(location==cpu){ +#endif + #ifdef MATPTR + v = new T*[n]; + p = v[0] = new T[nm]; + for (register int i=1; i +NRMat::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) { + const int nm = n*m; +#ifdef CUDALA + location = DEFAULT_LOC; #endif *count = 1; #ifdef CUDALA - if(location==cpu) - { -#endif -#ifdef MATPTR - v = new T*[n]; - v[0] = new T[m*n]; - for (int i=1; i -NRMat::NRMat(const NRMat &rhs) -{ +NRMat::NRMat(const NRMat &rhs) { #ifdef CUDALA - location=rhs.location; + location = rhs.location; #endif nn = rhs.nn; mm = rhs.mm; @@ -279,19 +520,22 @@ NRMat::NRMat(const NRMat &rhs) if (count) ++(*count); } - +/***************************************************************************//** + * create matrix from a \c NRSMat object + * @param[in] rhs \c NRSMat input object to be converted + * @see count, v, vec.h, NRSMat + ******************************************************************************/ template -NRMat::NRMat(const NRSMat &rhs) -{ -NOT_GPU(rhs); +NRMat::NRMat(const NRSMat &rhs) { + NOT_GPU(rhs); #ifdef CUDALA - location=rhs.location; + location = rhs.location; #endif - int i; + int i(0), j(0), k(0); nn = mm = rhs.nrows(); - count = new int; + count = new int; *count = 1; #ifdef MATPTR v = new T*[nn]; @@ -301,16 +545,28 @@ NOT_GPU(rhs); v = new T[mm*nn]; #endif - int j, k = 0; #ifdef MATPTR - for (i=0; i NRMat::NRMat(const NRVec &rhs, const int n, const int m, const int offset) @@ -318,180 +574,356 @@ NRMat::NRMat(const NRVec &rhs, const int n, const int m, const int offset) if 
(offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); #ifdef CUDALA -location=rhs.location; + location=rhs.location; #endif nn = n; mm = m; count = rhs.count; - v = rhs.v + offset; - (*count)++; + v = rhs.v + offset;//!< make just shallow copy + (*count)++;//!< therefore increase the reference counter } #endif -// Mat + Smat +/***************************************************************************//** + * \c NRMat + \c NRSmat via operator += + * @param[in] rhs NRSMat matrix to be subtracted from current matrix + * @return result of the subtraction + * @see NRMat::operator+=(const NRSMat &) + ******************************************************************************/ template -inline const NRMat NRMat::operator+(const NRSMat &rhs) const -{ +inline const NRMat NRMat::operator+(const NRSMat &rhs) const { return NRMat(*this) += rhs; } -// Mat - Smat +/***************************************************************************//** + * \c NRMat - \c NRSmat via operator -= + * @param[in] rhs NRSMat matrix to be subtracted from current matrix + * @return result of the subtraction + * @see NRMat::operator-=(const NRSMat &) + ******************************************************************************/ template -inline const NRMat NRMat::operator-(const NRSMat &rhs) const -{ +inline const NRMat NRMat::operator-(const NRSMat &rhs) const { return NRMat(*this) -= rhs; } -// Mat[i] : pointer to the first element of i-th row +/***************************************************************************//** + * @param[in] i row number + * @return pointer to the first element in the i-th row + ******************************************************************************/ template -inline T* NRMat::operator[](const int i) -{ +inline T* NRMat::operator[](const int i) { #ifdef DEBUG - if (_LA_count_check && *count != 1) laerror("Mat lval use of [] with count > 1"); - if (i<0 || i>=nn) laerror("Mat [] out of range"); - 
if (!v) laerror("[] for unallocated Mat"); -#endif -#ifdef MATPTR - return v[i]; -#else - return v+i*mm; + if (_LA_count_check && *count != 1) laerror("matrix with *count>1 used as l-value"); + if (i < 0 || i >= nn) laerror("Mat [] out of range"); + if (!v) laerror("unallocated matrix"); #endif + NOT_GPU(*this); + #ifdef MATPTR + return v[i]; + #else + return v + i*mm; + #endif } + +/***************************************************************************//** + * @param[in] i row number + * @return const pointer to the first element in the i-th row + ******************************************************************************/ template -inline const T* NRMat::operator[](const int i) const -{ +inline const T* NRMat::operator[](const int i) const { #ifdef DEBUG - if (i<0 || i>=nn) laerror("Mat [] out of range"); - if (!v) laerror("[] for unallocated Mat"); + if (i < 0 || i >= nn) laerror("index out of range"); + if (!v) laerror("unallocated matrix"); #endif -#ifdef MATPTR - return v[i]; -#else - return v+i*mm; + NOT_GPU(*this); + #ifdef MATPTR + return v[i]; + #else + return v + i*mm; + #endif +} + +/***************************************************************************//** + * for a given matrix \f$A\f$, determine the element with indices (i,j) + * @param[in] i row number + * @param[in] j col number + * @return reference to \f$A_{i,j}\f$ + * @see NRMat::count + ******************************************************************************/ +template +inline T& NRMat::operator()(const int i, const int j){ +#ifdef DEBUG + if (_LA_count_check && *count != 1) laerror("NRMat::operator(,) used as l-value for a matrix with count > 1"); + if (i < 0 || i >= nn && nn > 0 || j < 0 || j >= mm && mm > 0) laerror("index out of range"); + if (!v) laerror("unallocated matrix"); +#endif + NOT_GPU(*this); + #ifdef MATPTR + return v[i][j]; + #else + return v[i*mm + j]; + #endif +} + +/***************************************************************************//** + * for a 
given matrix \f$A\f$, determine the element with indices (i,j) + * @param[in] i row number + * @param[in] j col number + * @return const reference to \f$A_{i,j}\f$ + ******************************************************************************/ +template +inline const T& NRMat::operator()(const int i, const int j) const{ + T ret; +#ifdef DEBUG + if (i<0 || i>=nn && nn>0 || j<0 || j>=mm && mm>0) laerror("index out of range"); + if (!v) laerror("unallocated matrix"); +#endif + NOT_GPU(*this); + #ifdef MATPTR + return v[i][j]; + #else + return v[i*mm + j]; + #endif +} + +/***************************************************************************//** + * for a given matrix \f$A\f$, determine the element with indices (i,j) + * @param[in] i row number + * @param[in] j col number + * @return const reference to \f$A_{i,j}\f$ + ******************************************************************************/ +template +inline const T NRMat::get_ij(const int i, const int j) const{ + T ret; +#ifdef DEBUG + if (i<0 || i>=nn || j<0 || j>=mm) laerror("index out of range"); + if (!v) laerror("unallocated matrix"); +#endif +#ifdef CUDALA + if(location == cpu){ +#endif + #ifdef MATPTR + return v[i][j]; + #else + return v[i*mm + j]; + #endif +#ifdef CUDALA + }else{ + const int pozice = i*mm + j; + gpuget(1, sizeof(T), v + pozice, &ret); + return ret; + } #endif } -// Mat(i,j) reference to the matrix element M_{ij} +/***************************************************************************//** + * @return number of rows + ******************************************************************************/ template -inline T & NRMat::operator()(const int i, const int j) -{ -#ifdef DEBUG - if (_LA_count_check && *count != 1) laerror("Mat lval use of (,) with count > 1"); - if (i<0 || i>=nn &&nn>0 || j<0 || j>=mm && mm>0) laerror("Mat (,) out of range"); - if (!v) laerror("(,) for unallocated Mat"); -NOT_GPU(*this); -#endif -#ifdef MATPTR - return v[i][j]; -#else - return v[i*mm+j]; 
-#endif -} - -template -inline const T & NRMat::operator()(const int i, const int j) const -{ -#ifdef DEBUG - if (i<0 || i>=nn&&nn>0 || j<0 || j>=mm&& mm>0) laerror("Mat (,) out of range"); - if (!v) laerror("(,) for unallocated Mat"); -NOT_GPU(*this); //in principle we could copy the element to CPU memory, yielding, however, a highly inneficient contruct -#endif -#ifdef MATPTR - return v[i][j]; -#else - return v[i*mm+j]; -#endif -} - -// number of rows -template -inline int NRMat::nrows() const -{ +inline int NRMat::nrows() const{ return nn; } -// number of columns +/***************************************************************************//** + * @return number of columns + ******************************************************************************/ template -inline int NRMat::ncols() const -{ +inline int NRMat::ncols() const{ return mm; } +/***************************************************************************//** + * @return number of elements + ******************************************************************************/ template -inline int NRMat::size() const -{ +inline int NRMat::size() const{ return nn*mm; } -// reference pointer to Mat +/***************************************************************************//** + * @return pointer of general type T to the underlying data structure + ******************************************************************************/ template -inline NRMat::operator T* () -{ -#ifdef DEBUG - if (!v) laerror("unallocated Mat in operator T*"); -#endif -#ifdef MATPTR - return v[0]; -#else - return v; -#endif -} -template -inline NRMat::operator const T* () const -{ -#ifdef DEBUG - if (!v) laerror("unallocated Mat in operator T*"); -#endif -#ifdef MATPTR - return v[0]; -#else - return v; -#endif +inline NRMat::operator T*(){ + #ifdef DEBUG + if (!v) laerror("unallocated matrix"); + #endif + #ifdef MATPTR + return v[0]; + #else + return v; + #endif } -// max element of Mat 
+/***************************************************************************//** + * @return const pointer of general type T to the underlying data + ******************************************************************************/ +template +inline NRMat::operator const T*() const{ + #ifdef DEBUG + if (!v) laerror("unallocated matrix"); + #endif + #ifdef MATPTR + return v[0]; + #else + return v; + #endif +} + +/***************************************************************************//** + * for this real matrix \f$A\f$, determine the first element + * with largest absolute value + * @return \f$A_{l,m}\f$ which maximizes \f$\left|A_{i,j}\right|\f$ + ******************************************************************************/ template<> -inline const double NRMat::amax() const -{ -#ifdef MATPTR - return v[0][cblas_idamax(nn*mm, v[0], 1)]; -#else - return v[cblas_idamax(nn*mm, v, 1)]; +inline const double NRMat::amax() const{ +#ifdef CUDALA + if(location == cpu){ +#endif + #ifdef MATPTR + return v[0][cblas_idamax(nn*mm, v[0], 1) - 1]; + #else + return v[cblas_idamax(nn*mm, v, 1) - 1]; + #endif +#ifdef CUDALA + }else{ + double ret(0.0); + const int pozice = cublasIdamax(nn*mm, v, 1) - 1; + TEST_CUBLAS("cublasIdamax"); + gpuget(1, sizeof(double), v + pozice, &ret); + return ret; + } #endif } + +/***************************************************************************//** + * for this real matrix \f$A\f$, determine the first element + * with smallest absolute value + * @return \f$A_{l,m}\f$ which minimizes \f$\left|A_{i,j}\right|\f$ + ******************************************************************************/ template<> -inline const complex NRMat< complex >::amax() const -{ -#ifdef MATPTR - return v[0][cblas_izamax(nn*mm, v[0], 1)]; -#else - return v[cblas_izamax(nn*mm, v, 1)]; +inline const double NRMat::amin() const{ + double ret(0.0); +#ifdef CUDALA + if(location == cpu){ #endif + // idamin seems not to be supported + const int nm = nn*mm; + double 
val(0.0); + int index(-1); + ret = std::numeric_limits::max(); + for(register int i=0; i < nm; i++){ + #ifdef MATPTR + val = std::abs(v[0][i]); + #else + val = std::abs(v[i]); + #endif + if(val < ret){ index = i; ret = val; } + } + #ifdef MATPTR + ret = v[0][index]; + #else + ret = v[index]; + #endif +#ifdef CUDALA + }else{ + const int pozice = cublasIdamin(nn*mm, v, 1) - 1; + TEST_CUBLAS("cublasIdamin"); + gpuget(1, sizeof(double), v + pozice, &ret); + } +#endif + return ret; } +/***************************************************************************//** + * for this complex matrix \f$A\f$, determine the smallest index of the maximum + * magnitude element, i.e. maximal element in the 1-norm + * @return \f$A_{l,m}\f$ which maximizes \f$\left\{\left|\Re{}A_{i,j}\right|+\left|\Im{}A_{i,j}\right|\right}\f$ + ******************************************************************************/ +template<> +inline const complex NRMat >::amax() const{ +#ifdef CUDALA + if(location == cpu){ +#endif + #ifdef MATPTR + return v[0][cblas_izamax(nn*mm, v[0], 1) - 1]; + #else + return v[cblas_izamax(nn*mm, v, 1) - 1]; + #endif +#ifdef CUDALA + }else{ + complex ret(0.0, 0.0); + const int pozice = cublasIzamax(nn*mm, (cuDoubleComplex*)v, 1) - 1; + TEST_CUBLAS("cublasIzamax"); + gpuget(1, sizeof(complex), v + pozice, &ret); + return ret; + } +#endif -//basic stuff to be available for any type ... must be in .h -// dtor +} + +/***************************************************************************//** + * for this complex matrix \f$A\f$, determine the smallest index of the minimum + * magnitude element, i.e. 
minimal element in the 1-norm + * @return \f$A_{l,m}\f$ which minimizes \f$\left\{\left|\Re{}A_{i,j}\right|+\left|\Im{}A_{i,j}\right|\right}\f$ + ******************************************************************************/ +template<> +inline const complex NRMat >::amin() const{ + complex ret(0.0, 0.0); +#ifdef CUDALA + if(location == cpu){ +#endif + // idamin seems not to be supported + const int nm = nn*mm; + int index(-1); + double val(0.0), min_val(0.0); + complex z_val(0.0, 0.0); + + min_val = std::numeric_limits::max(); + for(register int i=0; i < nm; i++){ + #ifdef MATPTR + z_val = v[0][i]; + #else + z_val = v[i]; + #endif + val = std::abs(z_val.real()) + std::abs(z_val.imag()); + if(val < min_val){ index = i; min_val = val; } + } + #ifdef MATPTR + ret = v[0][index]; + #else + ret = v[index]; + #endif +#ifdef CUDALA + }else{ + const int pozice = cublasIzamin(nn*mm, (cuDoubleComplex*)v, 1) - 1; + TEST_CUBLAS("cublasIzamin"); + gpuget(1, sizeof(complex), v + pozice, &ret); + } +#endif + return ret; +} + +/***************************************************************************//** + * destructor for general type + * @see NRMat::count + ******************************************************************************/ template -NRMat::~NRMat() -{ - if (!count) return; - if (--(*count) <= 0) { - if (v) { +NRMat::~NRMat() { + if(!count) return; + if(--(*count) <= 0) { + if (v){ #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif - { -#ifdef MATPTR - delete[] (v[0]); -#endif - delete[] v; - } + #ifdef MATPTR + delete[] (v[0]); + #endif + delete[] v; #ifdef CUDALA - else - { - gpufree(v); + }else{ + gpufree(v); } #endif } @@ -499,361 +931,417 @@ NRMat::~NRMat() } } -// assign NRMat = NRMat +/***************************************************************************//** + * assigment operator for general type between NRMat and NRMat + * @see count + * @return reference to the newly assigned matrix + 
******************************************************************************/ template -NRMat & NRMat::operator=(const NRMat &rhs) -{ - if (this !=&rhs) - { - if (count) - if (--(*count) ==0 ) - { +NRMat & NRMat::operator=(const NRMat &rhs) { + if (this != &rhs){ + if (count){ + if (--(*count) ==0 ){ #ifdef CUDALA - if(location==cpu) - { + if(location == cpu){ #endif -#ifdef MATPTR - delete[] (v[0]); -#endif - delete[] v; + #ifdef MATPTR + delete[] (v[0]); + #endif + delete[] v; #ifdef CUDALA - } - else gpufree(v); + }else{ gpufree(v); } #endif - - delete count; + delete count; } + } v = rhs.v; #ifdef CUDALA - location=rhs.location; + location = rhs.location; #endif nn = rhs.nn; mm = rhs.mm; count = rhs.count; - if (count) (*count)++; - } + if(count) (*count)++; + } return *this; } -// Explicit deep copy of NRmat +/***************************************************************************//** + * perform an explicit deep copy of \c NRMat object + * @see count + * @return reference to the newly copied matrix + ******************************************************************************/ template -NRMat & NRMat::operator|=(const NRMat &rhs) -{ - if (this == &rhs) return *this; +NRMat & NRMat::operator|=(const NRMat &rhs) { + if(this == &rhs) return *this; // test to avoid self-assignment *this = rhs; this->copyonwrite(); return *this; } -// make detach Mat and make it's own deep copy +/***************************************************************************//** + * create own deep copy + * @see NRMat::count, NRMat::operator|=() + ******************************************************************************/ template -void NRMat::copyonwrite() -{ - if (!count) laerror("Mat::copyonwrite of undefined matrix"); - if (*count > 1) { - (*count)--; - count = new int; - *count = 1; +void NRMat::copyonwrite() { + if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0"); + if(*count > 1){ + (*count)--; + count = new int; + *count = 1; #ifdef 
CUDALA - if(location==cpu) //matrix is in CPU memory - { -#endif -#ifdef MATPTR - T **newv = new T*[nn]; - newv[0] = new T[mm*nn]; - memcpy(newv[0], v[0], mm*nn*sizeof(T)); - v = newv; - for (int i=1; i< nn; i++) v[i] = v[i-1] + mm; -#else - T *newv = new T[mm*nn]; - memcpy(newv, v, mm*nn*sizeof(T)); - v = newv; + if(location == cpu){ //matrix is in CPU memory #endif + #ifdef MATPTR + T **newv = new T*[nn]; + newv[0] = new T[mm*nn]; + memcpy(newv[0], v[0], mm*nn*sizeof(T)); + v = newv; + for(register int i=1; i::copyonwrite(), NRMat::operator|=() + * @return reference to the newly copied matrix + ******************************************************************************/ template -void NRMat::resize(int n, int m) -{ +void NRMat::resize(int n, int m) { #ifdef DEBUG - if (n<0 || m<0) laerror("illegal dimensions in Mat::resize()"); + if (n<0 || m<0) laerror("illegal dimensions"); #endif -//allow trivial dimensions -if(n==0) m=0; -if(m==0) n=0; + //allow trivial dimensions + if(n == 0 || m == 0) m = n =0; - if (count) - { - if(n==0 && m==0) - { - if(--(*count) <= 0) { + if(count){ + if(n==0 && m==0){ + if(--(*count) <= 0){ #ifdef CUDALA - if(location==cpu) - { + if(location==cpu){ #endif -#ifdef MATPTR - if(v) delete[] (v[0]); -#endif - if(v) delete[] v; + #ifdef MATPTR + if(v) delete[] (v[0]); + #endif + if(v) delete[] v; #ifdef CUDALA } - else gpufree(v); + else { gpufree(v); } #endif - delete count; - } - count=0; - nn=mm=0; - v=0; - return; - } - if (*count > 1) { - (*count)--; - count = 0; - v = 0; - nn = 0; - mm = 0; - } - } - if (!count) { - count = new int; - *count = 1; - nn = n; + delete count; + } + count = 0; + nn = mm = 0; + v = 0; + return; + } + /* + if we have more than one reference to this matrix, set count to NULL + in order to reach the if-branch below where new memory resources are allocated + */ + if(*count > 1){ + (*count)--; + count = 0; + nn = mm = 0; + v = 0; + } + } + + if(!count){ + count = new int; + *count = 1; + nn = n; mm = m; #ifdef 
CUDALA - if(location==cpu) - { -#endif -#ifdef MATPTR - v = new T*[nn]; - v[0] = new T[m*n]; - for (int i=1; i< n; i++) v[i] = v[i-1] + m; -#else - v = new T[m*n]; + if(location==cpu){ #endif + #ifdef MATPTR + v = new T*[nn]; + v[0] = new T[m*n]; + for (register int i=1; i< n; i++) v[i] = v[i-1] + m; + #else + v = new T[m*n]; + #endif #ifdef CUDALA - } - else - v = (T *) gpualloc(n*m*sizeof(T)); + }else{ + v = (T *) gpualloc(n*m*sizeof(T)); + } #endif - return; } - // At this point *count = 1, check if resize is necessary - if (n!=nn || m!=mm) { - nn = n; - mm = m; + + // at this point *count = 1, check if resize is necessary + if (n != nn || m != mm) { + nn = n; + mm = m; #ifdef CUDALA - if(location==cpu) - { -#endif -#ifdef MATPTR - delete[] (v[0]); -#endif - delete[] v; -#ifdef MATPTR - v = new T*[nn]; - v[0] = new T[m*n]; - for (int i=1; i< n; i++) v[i] = v[i-1] + m; -#else - v = new T[m*n]; + if(location==cpu){ #endif + #ifdef MATPTR + delete[] (v[0]); + #endif + delete[] v; + #ifdef MATPTR + v = new T*[nn]; + v[0] = new T[m*n]; + for (int i=1; i< n; i++) v[i] = v[i-1] + m; + #else + v = new T[m*n]; + #endif #ifdef CUDALA - } - else - { - gpufree(v); - v=(T *) gpualloc(n*m*sizeof(T)); - } + }else{ + gpufree(v); + v=(T *) gpualloc(n*m*sizeof(T)); + } #endif - } + } } - - +/***************************************************************************//** + * complexify a given matrix \f$A\f$ + * @param[in] rhs matrix \f$A\f$ intended for this operation + * @return matrix \f$B\f$ where \f$\Re B=A\f$ and \f$\Im B = 0\f$ + ******************************************************************************/ template -NRMat > complexify(const NRMat &rhs) -{ -NRMat > r(rhs.nrows(),rhs.ncols()); -for(int i=0; i > complexify(const NRMat &rhs) { + NOT_GPU(rhs); + + NRMat > r(rhs.nrows(), rhs.ncols(), rhs.getlocation()); + for(register int i=0; i -std::ostream& operator<<(std::ostream &s, const NRMat &x) -{ +std::ostream& operator<<(std::ostream &s, const NRMat &x) { #ifdef 
CUDALA - if(x.getlocation()==cpu) - { + if(x.getlocation() == cpu){ #endif - int i,j,n,m; - n=x.nrows(); - m=x.ncols(); - s << n << ' ' << m << '\n'; - for(i=0;i::IOtype) x[i][j] << (j==m-1 ? '\n' : ' '); // endl cannot be used in the conditional expression, since it is an overloaded function - } - return s; + int i(0),j(0); + int n(x.nrows()), m(x.ncols()); + s << n << ' ' << m << '\n'; + for(i=0; i::IOtype) x[i][j] << (j==m-1 ? '\n' : ' '); + } + } + return s; #ifdef CUDALA - } - else - { - NRMat tmp=x; + }else{ + NRMat tmp = x; tmp.moveto(cpu); - return s< std::istream& operator>>(std::istream &s, NRMat &x) { #ifdef CUDALA - if(x.getlocation()==cpu) - { + if(x.getlocation() == cpu){ #endif - int i,j,n,m; - s >> n >> m; - x.resize(n,m); + int i(0), j(0), n(0), m(0); + s >> n >> m; + x.resize(n, m); typename LA_traits_io::IOtype tmp; - for(i=0;i>tmp; x[i][j]=tmp;} - return s; -#ifdef CUDALA + for(i=0;i> tmp; + x[i][j] = tmp; + } } - else - { + return s; +#ifdef CUDALA + }else{ NRMat tmp; tmp.moveto(cpu); s >> tmp; tmp.moveto(x.getlocation()); - x=tmp; + x = tmp; return s; - } + } #endif } - -//optional indexing from 1 -//all possible constructors have to be given explicitly, other stuff is inherited -//with exception of the operator() which differs +/***************************************************************************//** + * implements \c NRMat functionality with indexing from 1 + * all possible constructors have to be given explicitly, other stuff is inherited + * with exception of the operator() which differs + ******************************************************************************/ template class NRMat_from1 : public NRMat { public: - NRMat_from1(): NRMat() {}; - explicit NRMat_from1(const int n): NRMat(n) {}; - NRMat_from1(const NRMat &rhs): NRMat(rhs) {}; //be able to convert the parent class transparently to this - NRMat_from1(const int n, const int m): NRMat(n,m) {}; - NRMat_from1(const T &a, const int n, const int m): NRMat(a,n,m) {}; - 
NRMat_from1(const T *a, const int n, const int m): NRMat(a,n,m) {}; + NRMat_from1(): NRMat() {}; + explicit NRMat_from1(const int n): NRMat(n) {}; + NRMat_from1(const NRMat &rhs): NRMat(rhs) {};//!< be able to convert the parent class transparently to this + NRMat_from1(const int n, const int m): NRMat(n, m) {}; + NRMat_from1(const T &a, const int n, const int m): NRMat(a, n, m) {}; + NRMat_from1(const T *a, const int n, const int m): NRMat(a, n, m) {}; - inline const T& operator() (const int i, const int j) const - { -#ifdef DEBUG - if (i<1 || i>NRMat::nn || j<1 || j>NRMat::mm) laerror("Mat (,) out of range"); - if (!NRMat::v) laerror("(,) for unallocated Mat"); -#endif -#ifdef MATPTR - return NRMat::v[i-1][j-1]; -#else - return NRMat::v[(i-1)*NRMat::mm+j-1]; -#endif - } - inline T& operator() (const int i, const int j) - { -#ifdef DEBUG - if (_LA_count_check && *NRMat::count != 1) laerror("Mat lval use of (,) with count > 1"); - if (i<1 || i>NRMat::nn || j<1 || j>NRMat::mm) laerror("Mat (,) out of range"); - if (!NRMat::v) laerror("(,) for unallocated Mat"); -#endif -#ifdef MATPTR - return NRMat::v[i-1][j-1]; -#else - return NRMat::v[(i-1)*NRMat::mm+j-1]; -#endif - } + inline const T& operator() (const int i, const int j) const { + #ifdef DEBUG + if (i<1 || i>NRMat::nn || j<1 || j>NRMat::mm) laerror("index out of range"); + if (!NRMat::v) laerror("unallocated matrix"); + #endif + NOT_GPU(*this); + #ifdef MATPTR + return NRMat::v[i - 1][j - 1]; + #else + return NRMat::v[(i-1)*NRMat::mm+j-1]; + #endif + } + + inline T& operator() (const int i, const int j) { + #ifdef DEBUG + if (_LA_count_check && *NRMat::count != 1) laerror("matrix with *count > 1 used as l-value"); + if (i<1 || i>NRMat::nn || j<1 || j>NRMat::mm) laerror("index out of range"); + if (!NRMat::v) laerror("unallocated matrix"); + #endif + NOT_GPU(*this); + #ifdef MATPTR + return NRMat::v[i-1][j-1]; + #else + return NRMat::v[(i-1)*NRMat::mm+j-1]; + #endif + } + + inline const T get_ij(const int i, 
const int j) const { + T ret; + #ifdef DEBUG + if (i<1 || i>NRMat::nn || j<1 || j>NRMat::mm) laerror("index out of range"); + if (!NRMat::v) laerror("unallocated matrix"); + #endif + #ifdef CUDALA + if(NRMat::location == cpu){ + #endif + #ifdef MATPTR + return NRMat::v[i - 1][j - 1]; + #else + return NRMat::v[(i-1)*NRMat::mm + (j-1)]; + #endif + #ifdef CUDALA + }else{ + const int pozice = (i-1)*NRMat::mm + (j-1); + gpuget(1, sizeof(T), NRMat::v + pozice, &ret); + return ret; + } + #endif + } }; - - -//Hadamard product +/***************************************************************************//** + * compute Hadamard (component-wise) product with a given matrix \f$A\f$ + * @param[in] rhs matrix \f$A\f$ + * @see count, operator* + * @return reference to the multiplied matrix + ******************************************************************************/ template -NRMat & NRMat::operator^=(const NRMat &rhs){ +NRMat& NRMat::operator^=(const NRMat &rhs){ #ifdef DEBUG - if (nn != rhs.nn || mm!= rhs.mm) - laerror("Mat ^= Mat of incompatible matrices"); + if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif - copyonwrite(); + SAME_LOC(*this, rhs); + NOT_GPU(*this); + + copyonwrite();// ensure that *count == 1 #ifdef MATPTR - for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i]; + for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i]; #else const int Dim = nn*mm; - for(register int i=0;i -void NRMat::moveto(const GPUID dest) -{ -if(location==dest) return; -CPU_GPU(location,dest); -location=dest; +void NRMat::moveto(const GPUID dest) { + if(location == dest) return;// no operation is necessary + /* + currently, only movements between CPU and GPU are implemented + CUBLAS seems to lack support for multiple GPUs + */ + CPU_GPU(location, dest); + location = dest; -if(v && !count) laerror("internal inconsistency of reference counting 1"); -if (!count) return; + if(v && !count) laerror("internal inconsistency of reference counting 1"); + 
if (!count) return; -if(v && *count==0) laerror("internal inconsistency of reference counting 2"); -if(!v) return; + if(v && *count==0) laerror("internal inconsistency of reference counting 2"); + if(!v) return; -T *vold = v; + T *vold = v; -if(dest == cpu) //moving from GPU to CPU - { - v = new T[nn*mm]; - gpuget(nn*mm,sizeof(T),vold,v); - if(*count == 1) gpufree(vold); - else {--(*count); count = new int(1);} - } -else //moving from CPU to GPU - { - v=(T *) gpualloc(nn*mm*sizeof(T)); - gpuput(nn*mm,sizeof(T),vold,v); - if(*count == 1) delete[] vold; - else {--(*count); count = new int(1);} + if(dest == cpu){ //moving from GPU to CPU + v = new T[nn*mm]; + gpuget(nn*mm, sizeof(T), vold, v); + if(*count == 1){ gpufree(vold); } + else{ --(*count); count = new int(1); } + + }else{ //moving from CPU to GPU + v = (T *) gpualloc(nn*mm*sizeof(T)); + gpuput(nn*mm, sizeof(T), vold, v); + if(*count == 1) delete[] vold; + else{ --(*count); count = new int(1);} } } #endif -//end CUDALA +/***************************************************************************//** + * generate operators: Mat + a, a + Mat, Mat * a + * corresponding macro is defined in vec.h + ******************************************************************************/ +NRVECMAT_OPER(Mat, +) +NRVECMAT_OPER(Mat, -) +NRVECMAT_OPER(Mat, *) +/***************************************************************************//** + * generate Mat + Mat, Mat - Mat + * corresponding macro is defined in vec.h + ******************************************************************************/ +NRVECMAT_OPER2(Mat, +) +NRVECMAT_OPER2(Mat, -) -// generate operators: Mat + a, a + Mat, Mat * a -NRVECMAT_OPER(Mat,+) -NRVECMAT_OPER(Mat,-) -NRVECMAT_OPER(Mat,*) -// generate Mat + Mat, Mat - Mat -NRVECMAT_OPER2(Mat,+) -NRVECMAT_OPER2(Mat,-) - -}//namespace -#endif /* _LA_MAT_H_ */ +}//end of the LA-namespace +#endif/* _LA_MAT_H_ */ diff --git a/noncblas.cc b/noncblas.cc index aac7f10..7c6b162 100644 --- a/noncblas.cc +++ b/noncblas.cc 
@@ -1,4 +1,5 @@ -/* +/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */ +/******************************************************************************* LA: linear algebra C++ interface library Copyright (C) 2008 Jiri Pittner or @@ -14,8 +15,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . -*/ - +*******************************************************************************/ #include "noncblas.h" #include "laerror.h" @@ -27,178 +27,156 @@ //Level 1 - straightforward wrappers extern "C" double FORNAME(ddot) (const FINT *n, const double *x, const FINT *incx, const double *y, const FINT *incy); -double cblas_ddot(const int N, const double *X, const int incX, - const double *Y, const int incY) -{ +double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp); #else -return FORNAME(ddot)(&N,X,&incX,Y,&incY); + return FORNAME(ddot)(&N,X,&incX,Y,&incY); #endif } extern "C" void FORNAME(dscal) (const FINT *n, const double *a, double *x, const FINT *incx); -void cblas_dscal(const int N, const double alpha, double *X, const int incX) -{ +void cblas_dscal(const int N, const double alpha, double *X, const int incX){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp); #else -FORNAME(dscal) (&N,&alpha,X,&incX); + FORNAME(dscal) (&N,&alpha,X,&incX); #endif } extern "C" void FORNAME(dcopy) (const FINT *n, const double *x, const FINT *incx, double *y, const FINT *incy); -void cblas_dcopy(const int N, const double *X, const int incX, - double *Y, const int incY) -{ +void 
cblas_dcopy(const int N, const double *X, const int incX, double *Y, const int incY){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp); #else -FORNAME(dcopy) (&N,X,&incX,Y,&incY); + FORNAME(dcopy) (&N,X,&incX,Y,&incY); #endif } extern "C" void FORNAME(daxpy) (const FINT *n, const double *a, const double *x, const FINT *incx, double *y, const FINT *incy); -void cblas_daxpy(const int N, const double alpha, const double *X, - const int incX, double *Y, const int incY) -{ +void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp); #else -FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY); + FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY); #endif } extern "C" double FORNAME(dnrm2) (const FINT *n, const double *x, const FINT *incx); -double cblas_dnrm2(const int N, const double *X, const int incX) -{ +double cblas_dnrm2(const int N, const double *X, const int incX){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -return FORNAME(dnrm2) (&ntmp,X,&incxtmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + return FORNAME(dnrm2) (&ntmp,X,&incxtmp); #else -return FORNAME(dnrm2) (&N,X,&incX); + return FORNAME(dnrm2) (&N,X,&incX); #endif } extern "C" double FORNAME(dasum) (const FINT *n, const double *x, const FINT *incx); -double cblas_dasum(const int N, const double *X, const int incX) -{ +double cblas_dasum(const int N, const double *X, const int incX){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -return FORNAME(dasum) (&ntmp,X,&incxtmp); + 
const FINT ntmp=N; + const FINT incxtmp=incX; + return FORNAME(dasum) (&ntmp,X,&incxtmp); #else -return FORNAME(dasum) (&N,X,&incX); + return FORNAME(dasum) (&N,X,&incX); #endif } extern "C" void FORNAME(zcopy) (const FINT *n, const void *x, const FINT *incx, void *y, const FINT *incy); -void cblas_zcopy(const int N, const void *X, const int incX, - void *Y, const int incY) -{ +void cblas_zcopy(const int N, const void *X, const int incX, void *Y, const int incY){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp); #else -FORNAME(zcopy) (&N,X,&incX,Y,&incY); + FORNAME(zcopy) (&N,X,&incX,Y,&incY); #endif } extern "C" void FORNAME(zaxpy) (const FINT *n, const void *a, const void *x, const FINT *incx, void *y, const FINT *incy); -void cblas_zaxpy(const int N, const void *alpha, const void *X, - const int incX, void *Y, const int incY) -{ +void cblas_zaxpy(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp); #else -FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY); + FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY); #endif } extern "C" void FORNAME(zscal) (const FINT *n, const void *a, void *x, const FINT *incx); -void cblas_zscal(const int N, const void *alpha, void *X, const int incX) -{ +void cblas_zscal(const int N, const void *alpha, void *X, const int incX){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -FORNAME(zscal)(&ntmp,alpha,X,&incxtmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + FORNAME(zscal)(&ntmp,alpha,X,&incxtmp); #else 
-FORNAME(zscal)(&N,alpha,X,&incX); + FORNAME(zscal)(&N,alpha,X,&incX); #endif } extern "C" void FORNAME(zdscal) (const FINT *n, const double *a, void *x, const FINT *incx); -void cblas_zdscal(const int N, const double alpha, void *X, const int incX) -{ +void cblas_zdscal(const int N, const double alpha, void *X, const int incX){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp); #else -FORNAME(zdscal)(&N,&alpha,X,&incX); + FORNAME(zdscal)(&N,&alpha,X,&incX); #endif } extern "C" double FORNAME(dznrm2) (const FINT *n, const void *x, const FINT *incx); -double cblas_dznrm2(const int N, const void *X, const int incX) -{ +double cblas_dznrm2(const int N, const void *X, const int incX){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -return FORNAME(dznrm2) (&ntmp,X,&incxtmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + return FORNAME(dznrm2) (&ntmp,X,&incxtmp); #else -return FORNAME(dznrm2) (&N,X,&incX); + return FORNAME(dznrm2) (&N,X,&incX); #endif } //the following ones are f2c-compatible, but is it truly portable??? 
- extern "C" void FORNAME(zdotu) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy); - -void cblas_zdotu_sub(const int N, const void *X, const int incX, - const void *Y, const int incY, void *dotu) -{ +void cblas_zdotu_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp); #else -FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY); + FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY); #endif } extern "C" void FORNAME(zdotc) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy); -void cblas_zdotc_sub(const int N, const void *X, const int incX, - const void *Y, const int incY, void *dotc) -{ +void cblas_zdotc_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc){ #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp); #else -FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY); + FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY); #endif } @@ -211,38 +189,38 @@ FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY); extern "C" void FORNAME(dspmv) (const char *uplo, const FINT *n, const double *alpha, const double *ap, const double *x, const FINT *incx, const double *beta, double *y, const FINT *incy); void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, - const int N, const double alpha, const double *Ap, - const double *X, const int incX, - const double beta, double *Y, const int incY) + const int N, const double alpha, const double *Ap, + const double *X, const int incX, + 
const double beta, double *Y, const int incY) { -if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); -if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted"); + if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); + if(Uplo!=CblasLower) laerror("CblasLower uplo asserted"); #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp); #else -FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY); + FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY); #endif } extern "C" void FORNAME(zhpmv) (const char *uplo, const FINT *n, const void *alpha, const void *ap, const void *x, const FINT *incx, const void *beta, void *y, const FINT *incy); void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, - const int N, const void *alpha, const void *Ap, - const void *X, const int incX, - const void *beta, void *Y, const int incY) + const int N, const void *alpha, const void *Ap, + const void *X, const int incX, + const void *beta, void *Y, const int incY) { -if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); -if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted"); + if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); + if(Uplo!=CblasLower) laerror("CblasLower uplo asserted"); #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp); #else -FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY); + FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY); #endif } @@ -251,185 
+229,232 @@ FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY); extern "C" void FORNAME(dger) (const FINT *m, const FINT *n, const double *alpha, const double *x, const FINT *incx, const double *y, const FINT *incy, double *a, const FINT *lda); void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N, - const double alpha, const double *X, const int incX, - const double *Y, const int incY, double *A, const int lda) + const double alpha, const double *X, const int incX, + const double *Y, const int incY, double *A, const int lda) { -if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); -//swap m-n, y-x + if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); + //swap m-n, y-x #ifdef FORINT -const FINT mtmp=M; -const FINT ntmp=N; -const FINT incxtmp=incX; -const FINT incytmp=incY; -const FINT ldatmp=lda; -FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp); + const FINT mtmp=M; + const FINT ntmp=N; + const FINT incxtmp=incX; + const FINT incytmp=incY; + const FINT ldatmp=lda; + FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp); #else -FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda); + FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda); #endif } void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N, - const void *alpha, const void *X, const int incX, - const void *Y, const int incY, void *A, const int lda) + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda) { -LA::laerror("cblas_zgerc cannot be simply converted to fortran order"); + laerror("cblas_zgerc cannot be simply converted to fortran order"); } void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N, - const void *alpha, const void *X, const int incX, - const void *Y, const int incY, void *A, const int lda) + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, 
const int lda) { -LA::laerror("cblas_zgeru cannot be simply converted to fortran order"); + laerror("cblas_zgeru cannot be simply converted to fortran order"); } extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const double *alpha, const double *a, const FINT *lda, const double *b, const FINT *ldb, const double *beta, double *c, const FINT *ldc); void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, - const enum CBLAS_TRANSPOSE TransB, const int M, const int N, - const int K, const double alpha, const double *A, - const int lda, const double *B, const int ldb, - const double beta, double *C, const int ldc) + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const double alpha, const double *A, + const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc) { -if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); -//swap a-b, m-n + if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); + //swap a-b, m-n #ifdef FORINT -const FINT mtmp=M; -const FINT ntmp=N; -const FINT ktmp=K; -const FINT ldatmp=lda; -const FINT ldbtmp=ldb; -const FINT ldctmp=ldc; -FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", - &ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp); + const FINT mtmp=M; + const FINT ntmp=N; + const FINT ktmp=K; + const FINT ldatmp=lda; + const FINT ldbtmp=ldb; + const FINT ldctmp=ldc; + FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", + &ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp); #else -FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", - &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); + FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", + &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); #endif } extern "C" void 
FORNAME(zgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const void *alpha, const void *a, const FINT *lda, const void *b, const FINT *ldb, const void *beta, void *c, const FINT *ldc); void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, - const enum CBLAS_TRANSPOSE TransB, const int M, const int N, - const int K, const void *alpha, const void *A, - const int lda, const void *B, const int ldb, - const void *beta, void *C, const int ldc) + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc) { -if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); -//swap a-b, m-n + if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); + //swap a-b, m-n #ifdef FORINT -const FINT mtmp=M; -const FINT ntmp=N; -const FINT ktmp=K; -const FINT ldatmp=lda; -const FINT ldbtmp=ldb; -const FINT ldctmp=ldc; -FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), - TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), - &ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp); + const FINT mtmp=M; + const FINT ntmp=N; + const FINT ktmp=K; + const FINT ldatmp=lda; + const FINT ldbtmp=ldb; + const FINT ldctmp=ldc; + FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + &ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp); #else -FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), - TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), - &N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc); + FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + &N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc); #endif 
} extern "C" void FORNAME(dgemv) (const char *TRANS, const FINT *M, const FINT *N, const double *ALPHA, const double *A, const FINT *LDA, const double *X, const FINT *INCX, const double *BETA, double *Y, const FINT *INCY); void cblas_dgemv(const enum CBLAS_ORDER Order, - const enum CBLAS_TRANSPOSE TransA, const int M, const int N, - const double alpha, const double *A, const int lda, - const double *X, const int incX, const double beta, - double *Y, const int incY) + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY) { #ifdef FORINT -const FINT mtmp=M; -const FINT ntmp=N; -const FINT ldatmp=lda; -const FINT incxtmp=incX; -const FINT incytmp=incY; -if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); -//swap n-m and toggle transposition -else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); + const FINT mtmp=M; + const FINT ntmp=N; + const FINT ldatmp=lda; + const FINT incxtmp=incX; + const FINT incytmp=incY; + if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); + //swap n-m and toggle transposition + else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); #else -if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); -//swap n-m and toggle transposition -else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); + if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); + //swap n-m and toggle transposition + else FORNAME(dgemv) 
(TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); #endif } extern "C" void FORNAME(zgemv) (const char *TRANS, const FINT *M, const FINT *N, const void *ALPHA, const void *A, const FINT *LDA, const void *X, const FINT *INCX, const void *BETA, void *Y, const FINT *INCY); void cblas_zgemv(const enum CBLAS_ORDER Order, - const enum CBLAS_TRANSPOSE TransA, const int M, const int N, - const void *alpha, const void *A, const int lda, - const void *X, const int incX, const void *beta, - void *Y, const int incY) + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY) { -if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); -if(TransA == CblasConjTrans) LA::laerror("zgemv with CblasConjTrans not supportted"); -//swap n-m and toggle transposition + if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); + if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted"); + //swap n-m and toggle transposition #ifdef FORINT -const FINT mtmp=M; -const FINT ntmp=N; -const FINT ldatmp=lda; -const FINT incxtmp=incX; -const FINT incytmp=incY; -FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp ); + const FINT mtmp=M; + const FINT ntmp=N; + const FINT ldatmp=lda; + const FINT incxtmp=incX; + const FINT incytmp=incY; + FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp ); #else -FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY ); + FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY ); #endif } extern "C" FINT FORNAME(idamax) (const FINT *N, const double *DX, const FINT *INCX); - -CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX) -{ +CBLAS_INDEX 
cblas_idamax(const int N, const double *X, const int incX) { #ifdef FORINT -const FINT ntmp=N; -const FINT incxtmp=incX; -return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp); + const FINT ntmp=N; + const FINT incxtmp=incX; + return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp); #else -return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX); + return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX); #endif } +extern "C" FINT FORNAME(izamax) (const FINT *N, const void *DX, const FINT *INCX); +CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX) { +#ifdef FORINT + const FINT ntmp=N; + const FINT incxtmp=incX; + return (CBLAS_INDEX)FORNAME(izamax)(&ntmp, X, &incxtmp); +#else + return (CBLAS_INDEX)FORNAME(izamax)(&N, X, &incX); +#endif +} +/* +extern "C" FINT FORNAME(idamin) (const FINT *N, const double *DX, const FINT *INCX); +CBLAS_INDEX cblas_idamin(const int N, const double *X, const int incX) { +#ifdef FORINT + const FINT ntmp=N; + const FINT incxtmp=incX; + return (CBLAS_INDEX)FORNAME(idamin)(&ntmp,X,&incxtmp); +#else + return (CBLAS_INDEX)FORNAME(idamin)(&N,X,&incX); +#endif +} + +extern "C" FINT FORNAME(izamin) (const FINT *N, const void *DX, const FINT *INCX); +CBLAS_INDEX cblas_izamin(const int N, const void *X, const int incX) { +#ifdef FORINT + const FINT ntmp=N; + const FINT incxtmp=incX; + return (CBLAS_INDEX)FORNAME(izamin)(&ntmp, X, &incxtmp); +#else + return (CBLAS_INDEX)FORNAME(izamin)(&N, X, &incX); +#endif +} +*/ #endif -#ifdef NONCLAPACK + +#ifdef NONCLAPACK //clapack_dgesv //allocate auxiliary storage and transpose input and output quantities to fortran/C order extern "C" void FORNAME(dgesv) (const FINT *N, const FINT *NRHS, double *A, const FINT *LDA, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO); int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS, - double *A, const int lda, int *ipiv, - double *B, const int ldb) + double *A, const int lda, int *ipiv, + double *B, const int ldb) { -FINT INFO=0; 
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); -//B should be in the same physical order, just transpose A in place and the LU result on output -for(int i=1; i or @@ -25,219 +28,359 @@ #include #include #include + extern "C" { -extern ssize_t read(int, void *, size_t); -extern ssize_t write(int, const void *, size_t); + extern ssize_t read(int, void *, size_t); + extern ssize_t write(int, const void *, size_t); } -// TODO -// specialize unary minus namespace LA { - -/* - * * Templates first, specializations for BLAS next - * - */ - -//raw I/O +/***************************************************************************//** + * routine for raw output + * @param[in] fd file descriptor for output + * @param[in] dim number of elements intended for output + * @param[in] transp reserved + * @see NRMat::get(), NRSMat::copyonwrite() + ******************************************************************************/ template -void NRSMat::put(int fd, bool dim, bool transp) const -{ +void NRSMat::put(int fd, bool dim, bool transp) const { #ifdef CUDALA -if(location!=cpu) - { - NRSMat tmp= *this; - tmp.moveto(cpu); - tmp.put(fd,dim,transp); - return; - } + if(location != cpu){ + NRSMat tmp= *this; + tmp.moveto(cpu); + tmp.put(fd,dim,transp); + return; + } +#endif + errno = 0; + if(dim){ + if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write"); + if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write"); + } + LA_traits::multiput(NN2,fd,v,dim); +} + +/***************************************************************************//** + * routine for raw input + * @param[in] fd file descriptor for input + * @param[in] dim number of elements intended for input + * @param[in] transp reserved + * @see NRSMat::put(), NRSMat::copyonwrite() + ******************************************************************************/ +template +void NRSMat::get(int fd, bool dim, bool transp) { +#ifdef CUDALA + if(location != cpu){ + NRSMat tmp; + 
tmp.moveto(cpu); + tmp.get(fd,dim,transp); + tmp.moveto(location); + *this = tmp; + return; + } #endif -errno=0; -if(dim) -{ -if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write"); -if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write"); -} -LA_traits::multiput(NN2,fd,v,dim); + int nn0[2]; //align at least 8-byte + errno = 0; + if(dim){ + if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read"); + resize(nn0[0]); + }else{ + copyonwrite(); + } + LA_traits::multiget(NN2,fd,v,dim); } + +/***************************************************************************//** + * constructor symmetrizing given matrix \f$A\f$ of general type T yielding \f$(A+A^\mathrm{T})/2\f$ + * @param[in] rhs matrix \f$A\f$ + ******************************************************************************/ template -void NRSMat::get(int fd, bool dim, bool transp) -{ -#ifdef CUDALA -if(location!=cpu) - { - NRSMat tmp; - tmp.moveto(cpu); - tmp.get(fd,dim,transp); - tmp.moveto(location); - *this = tmp; - return; - } -#endif +NRSMat::NRSMat(const NRMat &rhs) { + NOT_GPU(rhs); -int nn0[2]; //align at least 8-byte -errno=0; -if(dim) -{ -if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read"); -resize(nn0[0]); -} -else -copyonwrite(); -LA_traits::multiget(NN2,fd,v,dim); -} - - -// conversion ctor, symmetrize general Mat into SMat -template -NRSMat::NRSMat(const NRMat &rhs) -{ -nn=rhs.nrows(); + nn = rhs.nrows(); #ifdef DEBUG - if (nn != rhs.ncols()) laerror("attempt to convert non-square Mat to SMat"); + if(nn != rhs.ncols()) laerror("attempt to convert nonsquare NRMat to NRSMat"); +#endif +#ifdef CUDALA + location = rhs.getlocation(); #endif count = new int; *count = 1; v = new T[NN2]; - int i, j, k=0; - for (i=0; iT and then set + * the diagonal elements to prescribed value + * @param[in] a scalar value to be assigned to the diagonal + * @return reference to the modified matrix + 
******************************************************************************/ template -NRSMat & NRSMat::operator=(const T &a) -{ +NRSMat & NRSMat::operator=(const T &a) { + NOT_GPU(*this); copyonwrite(); - memset(v,0,NN2*sizeof(T)); - for (int i=0; idivide == true NULL + * \li divide == false pointer to the first element of r + ******************************************************************************/ template -const T* NRSMat::diagonalof(NRVec &r, const bool divide, bool cache) const -{ +const T* NRSMat::diagonalof(NRVec &r, const bool divide, bool cache) const { #ifdef DEBUG -if(r.size()!=nn) laerror("incompatible vector in diagonalof()"); + if(r.size() != nn) laerror("incompatible vector in const T* NRSMat::diagonalof(NRVec &, const bool, bool)"); #endif + NOT_GPU(*this); + SAME_LOC(*this, r); -r.copyonwrite(); + r.copyonwrite(); -if (divide) - for (int i=0; iT + * @return modified copy of this matrix + ******************************************************************************/ template -const NRSMat NRSMat::operator-() const -{ - NRSMat result(nn); - for(int i=0; i NRSMat::operator-() const { + NOT_GPU(*this); + + NRSMat result(nn, getlocation()); + for(register int i = 0; i +const NRSMat NRSMat::operator-() const { + NRSMat result(nn, getlocation()); +#ifdef CUDALA + if(location == cpu){ +#endif + memcpy(result.v, v, NN2*sizeof(double)); + cblas_dscal(NN2, -1., result.v, 1); +#ifdef CUDALA + }else{ + cublasDcopy(NN2, v, 1, result.v, 1); + TEST_CUBLAS("cublasDcopy"); + + cublasDscal(NN2, -1., result.v, 1); + TEST_CUBLAS("cublasDscal"); + } +#endif + return result; +} + +/***************************************************************************//** + * implements unary minus operator for this hermitian matrix + * @return modified copy of this matrix + ******************************************************************************/ +template <> +const NRSMat > NRSMat >::operator-() const { + NRSMat > result(nn, getlocation()); +#ifdef CUDALA + 
if(location == cpu) { +#endif + memcpy(result.v, v, NN2*sizeof(complex)); + cblas_zscal(NN2, &CMONE, result.v, 1); + +#ifdef CUDALA + }else{ + cublasZcopy(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)result.v, 1); + TEST_CUBLAS("cublasZcopy"); + + cublasZscal(NN2, CUMONE, (cuDoubleComplex*)result.v, 1); + TEST_CUBLAS("cublasZscal"); + } +#endif + return result; +} + +/***************************************************************************//** + * @return the sum of the diagonal elements + ******************************************************************************/ template -const T NRSMat::trace() const -{ +const T NRSMat::trace() const { + NOT_GPU(*this); + T tmp = 0; - for (int i=0; i -void NRSMat::randomize(const double &x) -{ -for(int i=0; i::randomize(const double &x) { + NOT_GPU(*this); + + for(int i=0; i -void NRSMat >::randomize(const double &x) -{ -for(int i=0; i >::randomize(const double &x) { + for(register int i=0; iFILE structure representing the output file + * @param[in] format format specification in standard printf-like form + * @param[in] modulo + * @see lawritemat() + ******************************************************************************/ template -void NRSMat::fprintf(FILE *file, const char *format, const int modulo) const -{ +void NRSMat::fprintf(FILE *file, const char *format, const int modulo) const { + NOT_GPU(*this); + lawritemat(file, (const T *)(*this) ,nn, nn, format, 2, modulo, 1); } -// read matrix from the file with specific format + +/***************************************************************************//** + * routine for formatted input via fscanf + * @param[in] f pointer to FILE structure representing the input file + * @param[in] format format specification in standard printf-like form + ******************************************************************************/ template -void NRSMat::fscanf(FILE *f, const char *format) -{ +void NRSMat::fscanf(FILE *f, const char *format) { int n, m; - if 
(::fscanf(f,"%d %d",&n,&m) != 2) - laerror("cannot read matrix dimensions in SMat::fscanf"); - if (n != m) laerror("different dimensions of SMat"); + NOT_GPU(*this); + + if (::fscanf(f,"%d %d", &n, &m) != 2) + laerror("cannot read matrix dimensions in NRSMat::fscanf(FILE *, const char *)"); + if (n != m) laerror("different dimensions in NRSMat::fscanf(FILE *, const char *)"); resize(n); for (int i=0; i::fscanf(FILE *, const char *) - unable to read matrix element"); } -/* - * BLAS specializations for double and complex - */ - - - -// SMat * Mat -//NOTE: dsymm is not appropriate as it works on UNPACKED symmetric matrix +/***************************************************************************//** + * multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form + * with real double-precision dense matrix \f$A\f$ + * @param[in] rhs real double-precision matrix \f$A\f$ + * @return matrix produt \f$S\times{}A\f$ + ******************************************************************************/ template<> -const NRMat NRSMat::operator*(const NRMat &rhs) const -{ +const NRMat NRSMat::operator*(const NRMat &rhs) const { #ifdef DEBUG - if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat"); + if(nn != rhs.nrows()) laerror("incompatible dimensions in NRMat NRSMat::operator*(const NRMat &)"); +#endif + SAME_LOC(*this, rhs); + NRMat result(nn, rhs.ncols(), getlocation()); +#ifdef CUDALA + if(location == cpu){ +#endif + for(register int k = 0; k result(nn, rhs.ncols()); - for (int k=0; k -const NRMat< complex > -NRSMat< complex >::operator*(const NRMat< complex > &rhs) const -{ +const NRMat > +NRSMat >::operator*(const NRMat > &rhs) const { #ifdef DEBUG - if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat"); + if (nn != rhs.nrows()) laerror("incompatible dimensions in NRSMat >::operator*(const NRMat > &)"); +#endif + SAME_LOC(*this, rhs); + NRMat > result(nn, rhs.ncols(), getlocation()); +#ifdef CUDALA + if(location == 
cpu){ +#endif + for(register int k=0; k > result(nn, rhs.ncols()); - for (int k=0; k -const NRMat NRSMat::operator*(const NRSMat &rhs) const -{ +const NRMat NRSMat::operator*(const NRSMat &rhs) const { #ifdef DEBUG - if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat"); + if (nn != rhs.nn) laerror("incompatible dimensions in NRMat NRSMat::operator*(const NRSMat &)"); #endif NRMat result(0.0, nn, nn); double *p, *q; @@ -283,156 +426,295 @@ const NRMat NRSMat::operator*(const NRSMat &rhs) const } - +/***************************************************************************//** + * multiply this complex double-precision symmetric matrix \f$G\f$ stored in packed form + * with complex double-precision symmetric matrix \f$H\f$ + * @return matrix produt \f$G\times{}H\f$ (not necessarily symmetric) + ******************************************************************************/ template<> -const NRMat< complex > -NRSMat< complex >::operator*(const NRSMat< complex > &rhs) const -{ +const NRMat > +NRSMat >::operator*(const NRSMat > &rhs) const { #ifdef DEBUG - if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat"); + if (nn != rhs.nn) laerror("incompatible dimensions in NRSMat >::operator*(const NRSMat > &)"); #endif - NRMat< complex > result(0.0, nn, nn); - NRMat< complex > rhsmat(rhs); + SAME_LOC(*this, rhs); + NRMat > result(nn, nn, getlocation()); + NRMat > rhsmat(rhs); result = *this * rhsmat; return result; -// laerror("complex SMat*Smat not implemented"); } - - -// S dot S +/***************************************************************************//** + * compute inner product of this real symmetric matrix \f$A\f$ with given real symmetric matrix \f$B\f$ + * i.e. 
determine the value of + * \f[\sum_{i,j}A_{i,j}B_{i,j}\f] + * @param[in] rhs matrix \f$B\f$ + * @return computed inner product + ******************************************************************************/ template<> -const double NRSMat::dot(const NRSMat &rhs) const -{ +const double NRSMat::dot(const NRSMat &rhs) const { + double ret(0.); #ifdef DEBUG - if (nn != rhs.nn) laerror("dot of incompatible SMat's"); + if (nn != rhs.nn) laerror("incompatible dimensions in double NRSMat::dot(const NRSMat &)"); #endif - return cblas_ddot(NN2, v, 1, rhs.v, 1); + SAME_LOC(*this, rhs); +#ifdef CUDALA + if(location == cpu){ +#endif + ret = cblas_ddot(NN2, v, 1, rhs.v, 1); +#ifdef CUDALA + }else{ + ret = cublasDdot(NN2, v, 1, rhs.v, 1); + } +#endif + return ret; } - +/***************************************************************************//** + * compute inner product of this complex symmetric matrix \f$A\f$ with given complex symmetric matrix \f$B\f$ + * i.e. determine the value of + * \f[\sum_{i,j}\overbar{A_{i,j}}B_{i,j}\f] + * @param[in] rhs matrix \f$B\f$ + * @return computed inner product + ******************************************************************************/ template<> -const complex -NRSMat< complex >::dot(const NRSMat< complex > &rhs) const -{ +const complex NRSMat >::dot(const NRSMat > &rhs) const { #ifdef DEBUG - if (nn != rhs.nn) laerror("dot of incompatible SMat's"); + if (nn != rhs.nn) laerror("incompatible dimensions in complex NRSMat >::dot(const NRSMat > &)"); +#endif + complex dot(0., 0.); + SAME_LOC(*this, rhs); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot); +#ifdef CUDALA + }else{ + const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1); + dot = complex(cuCreal(_dot), cuCimag(_dot)); + TEST_CUBLAS("cublasZdotc"); + } #endif - complex dot; - cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot); return dot; } 
+/***************************************************************************//** + * compute inner product of this real double-precision symmetric matrix \f$S\f$ of order \f$n\f$ + * with given real double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$ + * @param[in] rhs real double-precision vector \f$\vec{v}\f$ + * @return computed inner product + ******************************************************************************/ template<> -const double NRSMat::dot(const NRVec &rhs) const -{ +const double NRSMat::dot(const NRVec &rhs) const { + double ret(0.0); #ifdef DEBUG - if (NN2 != rhs.nn) laerror("dot of incompatible SMat's"); + if(NN2 != rhs.nn) laerror("incompatible dimensions in double NRSMat::dot(const NRVec &)"); +#endif + SAME_LOC(*this, rhs); +#ifdef CUDALA + if(location == cpu){ +#endif + ret = cblas_ddot(NN2, v, 1, rhs.v, 1); +#ifdef CUDALA + }else{ + ret = cublasDdot(NN2, v, 1, rhs.v, 1); + TEST_CUBLAS("cublasDdot"); + } #endif - return cblas_ddot(NN2, v, 1, rhs.v, 1); } - +/***************************************************************************//** + * compute inner product of this complex double-precision hermitian matrix \f$H\f$ of order \f$n\f$ + * with given complex double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$ + * @param[in] rhs complex double-precision vector \f$\vec{v}\f$ + * @return computed inner product + ******************************************************************************/ template<> const complex -NRSMat< complex >::dot(const NRVec< complex > &rhs) const -{ +NRSMat >::dot(const NRVec > &rhs) const { #ifdef DEBUG - if (NN2 != rhs.nn) laerror("dot of incompatible SMat's"); + if(NN2 != rhs.nn) laerror("incompatible dimensions in complex NRSMat >::dot(const NRVec > &)"); +#endif + complex dot(0., 0.); + SAME_LOC(*this, rhs); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot); +#ifdef CUDALA + }else{ + const cuDoubleComplex _dot = cublasZdotc(NN2, 
(cuDoubleComplex*)v, 1, (cuDoubleComplex*)rhs.v, 1); + TEST_CUBLAS("cublasZdotc"); + dot = complex(cuCreal(_dot), cuCimag(_dot)); + } #endif - complex dot; - cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot); return dot; } - -// norm of the matrix +/***************************************************************************//** + * compute the Frobenius norm of this real double-precision symmetric matrix + * @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation + ******************************************************************************/ template<> -const double NRSMat::norm(const double scalar) const -{ - if (!scalar) return cblas_dnrm2(NN2, v, 1); - double sum = 0; - int k = 0; - for (int i=0; i::norm(const double scalar) const { + if(!scalar){ + double ret(0.); +#ifdef CUDALA + if(location == cpu){ +#endif + ret = cblas_dnrm2(NN2, v, 1); +#ifdef CUDALA + }else{ + ret = cublasDnrm2(NN2, v, 1); + TEST_CUBLAS("cublasDnrm2"); + } +#endif + return ret; + } + + NOT_GPU(*this); + + double sum(0.); + int k(0); + for(register int i=0; i -const double NRSMat< complex >::norm(const complex scalar) const -{ - if (!(scalar.real()) && !(scalar.imag())) - return cblas_dznrm2(NN2, v, 1); - double sum = 0; +const double NRSMat< complex >::norm(const complex scalar) const { + if(!(scalar.real()) && !(scalar.imag())){ + double ret(0.); +#ifdef CUDALA + if(location == cpu){ +#endif + ret = cblas_dznrm2(NN2, v, 1); +#ifdef CUDALA + }else{ + ret = cublasDznrm2(NN2, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasDznrm2"); + } +#endif + return ret; + } + + int k(0); + double sum(0.); complex tmp; - int k = 0; - for (int i=0; i -void NRSMat::axpy(const double alpha, const NRSMat & x) -{ +void NRSMat::axpy(const double alpha, const NRSMat &x) { #ifdef DEBUG - if (nn != x.nn) laerror("axpy of incompatible SMats"); + if(nn != x.nn) laerror("incompatible dimensions in void NRSMat::axpy(const double, const NRSMat&)"); #endif + SAME_LOC(*this, x); 
copyonwrite(); - cblas_daxpy(NN2, alpha, x.v, 1, v, 1); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_daxpy(NN2, alpha, x.v, 1, v, 1); +#ifdef CUDALA + }else{ + cublasDaxpy(NN2, alpha, x.v, 1, v, 1); + TEST_CUBLAS("cublasDaxpy"); + } +#endif } - +/***************************************************************************//** + * for this complex double-precision hermitian matrix \f$H\f$ stored in packed form, + * complex scalar value \f$\alpha\f$ and complex double-precision hermitian matrix \f$G\f$, compute + * \f[H \leftarrow \alpha G + H\f] + ******************************************************************************/ template<> -void NRSMat< complex >::axpy(const complex alpha, - const NRSMat< complex > & x) -{ +void NRSMat >::axpy(const complex alpha, const NRSMat > & x) { #ifdef DEBUG - if (nn != x.nn) laerror("axpy of incompatible SMats"); + if(nn != x.nn) laerror("incompatible dimensions in void NRSMat >::axpy(const complex , const NRSMat >&)"); #endif + SAME_LOC(*this, x); copyonwrite(); - cblas_zaxpy(nn, &alpha, x.v, 1, v, 1); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zaxpy(nn, &alpha, x.v, 1, v, 1); +#ifdef CUDALA + }else{ + const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag()); + cublasZaxpy(NN2, _alpha, (cuDoubleComplex*)x.v, 1, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZaxpy"); + } +#endif + } -//complex from real +/***************************************************************************//** + * create hermitian matrix \f$H\f$ from given real double-precision symmetric + * matrix \f$S\f$ + * @param[in] rhs real double-precision symmetric matrix \f$S\f$ + * @param[in] imagpart flag determining whether \f$S\f$ should correspond to the real or imaginary part of \f$H\f$ + ******************************************************************************/ template<> -NRSMat >::NRSMat(const NRSMat &rhs, bool imagpart) -: nn(rhs.nrows()), v(new complex[rhs.nrows()*(rhs.nrows()+1)/2]), count(new int(1)) -{ 
-memset(v,0,nn*(nn+1)/2*sizeof(complex)); -cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2); +NRSMat >::NRSMat(const NRSMat &rhs, bool imagpart): nn(rhs.nrows()), count(new int(1)) { + //inconsistent in general case? + const int nnp1 = nn*(nn + 1)/2; +#ifdef CUDALA + location = rhs.getlocation(); + if(location == cpu){ +#endif + v = new complex[nnp1]; + memset(v, 0, nnp1*sizeof(complex)); + cblas_dcopy(nnp1, &rhs(0, 0), 1, ((double *)v) + (imagpart?1:0), 2); +#ifdef CUDALA + }else{ + v = (complex*) gpualloc(nnp1*sizeof(complex)); + + complex *_val = gpuputcomplex(CZERO); + cublasZcopy(nnp1, (cuDoubleComplex*)_val, 0, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZcopy"); + gpufree(_val); + + cublasDcopy(nnp1, (double*)(&rhs(0,0)), 1, ((double*)v) + (imagpart?1:0), 2); + TEST_CUBLAS("cublasDcopy"); + } +#endif } - -//some template specializations leading to BLAS/CUBLAS calls - - - - -////////////////////////////////////////////////////////////////////////////// -////// forced instantization in the corresponding object file +/***************************************************************************//** + * forced instantization in the corresponding object file + ******************************************************************************/ template class NRSMat; -template class NRSMat< complex >; +template class NRSMat >; template class NRSMat; template class NRSMat; diff --git a/smat.h b/smat.h index 0714d32..5f94360 100644 --- a/smat.h +++ b/smat.h @@ -1,3 +1,6 @@ +//------------------------------------------------------------------------------ +/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */ +//------------------------------------------------------------------------------ /* LA: linear algebra C++ interface library Copyright (C) 2008 Jiri Pittner or @@ -23,88 +26,142 @@ namespace LA { #define NN2 (nn*(nn+1)/2) + + +/***************************************************************************//** + * This class implements a general 
symmetric or hermitian matrix the elements + * of which are stored in packed form. Particularly the lower triangular part + * of a symmetric or hermitian matrix of order \f$N\f$ is interpreted as a + * vector of length \f$N(N+1)/2\f$ in row-major storage scheme. + ******************************************************************************/ template -class NRSMat { // symmetric or complex hermitean matrix in packed form +class NRSMat{ protected: - int nn; - T *v; - int *count; + int nn;//!< number of rows/columns of this symmetric matrix + T *v;//!< internal pointer to the underlying data structure + int *count;//!< pointer to the reference counter #ifdef CUDALA - GPUID location; + GPUID location;//!< specification of memory (GPU/CPU) location where this objects resides #endif public: friend class NRVec; friend class NRMat; - inline NRSMat() : nn(0),v(0),count(0) - { -#ifdef CUDALA - location = DEFAULT_LOC; -#endif - }; - inline explicit NRSMat(const int n, const GPUID loc= undefined);// Zero-based array - inline NRSMat(const T &a, const int n); //Initialize to constant - inline NRSMat(const T *a, const int n); // Initialize to array - inline NRSMat(const NRSMat &rhs); // Copy constructor - NRSMat(const typename LA_traits_complex::NRSMat_Noncomplex_type &rhs, bool imagpart=false); //construct complex from real - explicit NRSMat(const NRMat &rhs); // symmetric part of general matrix - explicit NRSMat(const NRVec &rhs, const int n); //construct matrix from vector - NRSMat & operator|=(const NRSMat &rhs); //assignment to a new copy - NRSMat & operator=(const NRSMat &rhs); //assignment + ~NRSMat(); + + //! default constructor of null-matrix + inline NRSMat() : nn(0),v(0),count(0) { + #ifdef CUDALA + location = DEFAULT_LOC; + #endif + }; + + //! default constructor + inline explicit NRSMat(const int n, const GPUID loc = undefined); + + //! constructor initializing the matrix being created by given scalar value + inline NRSMat(const T &a, const int n); + + //! 
constructor initializing the matrix being created by data located at given memory position + inline NRSMat(const T *a, const int n); + + //! copy constructor + inline NRSMat(const NRSMat &rhs); + + //! constructor converting real matrix to its complex counterpart + NRSMat(const typename LA_traits_complex::NRSMat_Noncomplex_type &rhs, bool imagpart = false); + + //! constructor creating symmetric part of a general matrix + explicit NRSMat(const NRMat &rhs); + + //! construct symmetric matrix by filling the lower triangle with data stored in a vector + explicit NRSMat(const NRVec &rhs, const int n); + + //! assignment operator performing shallow copy + NRSMat & operator=(const NRSMat &rhs); + + //! assignment operator performing deep copy + NRSMat & operator|=(const NRSMat &rhs); + + //! fill the matrix with pseudorandom numbers (uniform distribution) void randomize(const typename LA_traits::normtype &x); - NRSMat & operator=(const T &a); //assign a to diagonal -#ifdef CUDALA - inline GPUID getlocation() const {return location;} - void moveto(const GPUID dest); -#else - inline GPUID getlocation() const {return cpu;} - void moveto(const GPUID dest) {}; -#endif - const bool operator!=(const NRSMat &rhs) const {if(nn!=rhs.nn) return 1; return LA_traits::gencmp(v,rhs.v,NN2);} //memcmp for scalars else elementwise - const bool operator==(const NRSMat &rhs) const {return !(*this != rhs);}; - inline NRSMat & operator*=(const T &a); - inline NRSMat & operator+=(const T &a); - inline NRSMat & operator-=(const T &a); - inline NRSMat & operator+=(const NRSMat &rhs); - inline NRSMat & operator-=(const NRSMat &rhs); - const NRSMat operator-() const; //unary minus + + //! 
assign scalar value to diagonal elements + NRSMat & operator=(const T &a); + + inline int getcount() const {return count?*count:0;} + + #ifdef CUDALA + inline GPUID getlocation() const {return location;} + void moveto(const GPUID dest); + #else + inline GPUID getlocation() const {return cpu;} + void moveto(const GPUID dest) {}; + #endif + + //! relational operator for testing nonequality + const bool operator!=(const NRSMat &rhs) const {if(nn!=rhs.nn) return 1; return LA_traits::gencmp(v,rhs.v,NN2);}; + //! relational operator for testing equality + const bool operator==(const NRSMat &rhs) const {return !(*this != rhs);}; + + inline NRSMat & operator*=(const T &a); + inline NRSMat & operator+=(const T &a); + inline NRSMat & operator-=(const T &a); + inline NRSMat & operator+=(const NRSMat &rhs); + inline NRSMat & operator-=(const NRSMat &rhs); + const NRSMat operator-() const; + inline const NRSMat operator*(const T &a) const; inline const NRSMat operator+(const T &a) const; inline const NRSMat operator-(const T &a) const; - inline const NRSMat operator+(const NRSMat &rhs) const; + inline const NRSMat operator+(const NRSMat &rhs) const; inline const NRSMat operator-(const NRSMat &rhs) const; - inline const NRMat operator+(const NRMat &rhs) const; - inline const NRMat operator-(const NRMat &rhs) const; - const NRMat operator*(const NRSMat &rhs) const; // SMat*SMat - const NRMat operator*(const NRMat &rhs) const; // SMat*Mat - const T dot(const NRSMat &rhs) const; // Smat.Smat//@@@for complex do conjugate - const T dot(const NRVec &rhs) const; //Smat(as vec).vec //@@@for complex do conjugate - const NRVec operator*(const NRVec &rhs) const {NRVec result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec - const NRVec > operator*(const NRVec > &rhs) const {NRVec > result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec - const T* diagonalof(NRVec &, const bool divide=0, bool cache=false) 
const; //get diagonal + + inline const NRMat operator+(const NRMat &rhs) const; + inline const NRMat operator-(const NRMat &rhs) const; + const NRMat operator*(const NRSMat &rhs) const; + const NRMat operator*(const NRMat &rhs) const; + + const T dot(const NRSMat &rhs) const; + const T dot(const NRVec &rhs) const; + + const NRVec operator*(const NRVec &rhs) const {NRVec result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; + const NRVec > operator*(const NRVec > &rhs) const {NRVec > result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; + + const T* diagonalof(NRVec &, const bool divide = 0, bool cache = false) const; + void gemv(const T beta, NRVec &r, const char trans, const T alpha, const NRVec &x) const {r.gemv(beta,*this,trans,alpha,x);}; void gemv(const T beta, NRVec > &r, const char trans, const T alpha, const NRVec > &x) const {r.gemv(beta,*this,trans,alpha,x);}; + inline const T& operator[](const int ij) const; inline T& operator[](const int ij); + inline const T& operator()(const int i, const int j) const; inline T& operator()(const int i, const int j); + inline int nrows() const; inline int ncols() const; inline int size() const; + inline bool transp(const int i, const int j) const {return i>j;} //this can be used for compact storage of matrices, which are actually not symmetric, but one triangle of them is redundant - const typename LA_traits::normtype norm(const T scalar=(T)0) const; + const typename LA_traits::normtype norm(const T scalar = (T)0) const; void axpy(const T alpha, const NRSMat &x); // this+= a*x + inline const T amax() const; + inline const T amin() const; + const T trace() const; - void get(int fd, bool dimensions=1, bool transp=0); - void put(int fd, bool dimensions=1, bool transp=0) const; + void get(int fd, bool dimensions = 1, bool transp = 0); + void put(int fd, bool dimensions = 1, bool transp = 0) const; + void copyonwrite(); + void clear() {copyonwrite(); 
LA_traits::clear(v,NN2);}; //zero out void resize(const int n); - inline operator T*(); //get a pointer to the data - inline operator const T*() const; //get a pointer to the data - ~NRSMat(); + + inline operator T*(); + inline operator const T*() const; void fprintf(FILE *f, const char *format, const int modulo) const; void fscanf(FILE *f, const char *format); //members concerning sparse matrix @@ -121,81 +178,102 @@ public: namespace LA { -// ctors +/***************************************************************************//** + * constructor of a symmetric matrix stored in packed form + * @param[in] n number of rows of the matrix being created + * @param[in] loc location for storing the matrix + * @see count, v, location + ******************************************************************************/ template -inline NRSMat::NRSMat(const int n, const GPUID loc) : nn(n), count(new int(1)) -{ +inline NRSMat::NRSMat(const int n, const GPUID loc): nn(n), count(new int(1)) { #ifdef CUDALA - location= (loc==undefined?DEFAULT_LOC:loc); - if(location==cpu) + location = (loc == undefined?DEFAULT_LOC:loc); + if(location == cpu){ #endif - v=new T[NN2]; + v = new T[NN2]; #ifdef CUDALA - else v= (T*) gpualloc(NN2*sizeof(T)); -#endif -} - -template -inline NRSMat::NRSMat(const T& a, const int n) : nn(n), count(new int(1)) -{ -#ifdef CUDALA - location=DEFAULT_LOC; - if(location==cpu) -#endif - { - v=new T[NN2]; - if(a != (T)0) for(int i=0; i::NRSMat(const int, const GPUID) + ******************************************************************************/ template -inline NRSMat::NRSMat(const T *a, const int n) : nn(n), count(new int(1)) -{ +inline NRSMat::NRSMat(const T& a, const int n) : nn(n), count(new int(1)) { #ifdef CUDALA - location=DEFAULT_LOC; - if(location==cpu) + location = DEFAULT_LOC; + if(location == cpu){ +#endif + v = new T[NN2]; + if(a != (T)0) for(register int i = 0; i::NRSMat(const int, const GPUID), NRSMat::NRSMat(const T&, const int) + 
******************************************************************************/ +template +inline NRSMat::NRSMat(const T *a, const int n) : nn(n), count(new int(1)) { +#ifdef CUDALA + location = DEFAULT_LOC; + if(location == cpu){ #endif memcpy(v, a, NN2*sizeof(T)); #ifdef CUDALA - else - { - v= (T*) gpualloc(NN2*sizeof(T)); - cublasSetVector(NN2,sizeof(T),a,1,v,1); - } + }else{ + v = (T*) gpualloc(NN2*sizeof(T)); + cublasSetVector(NN2, sizeof(T), a, 1, v, 1); + } #endif } +/***************************************************************************//** + * copy constructor implementing shallow copy + * @param[in] rhs reference matrix being copied + * @see count, v, location + ******************************************************************************/ template -inline NRSMat::NRSMat(const NRSMat &rhs) //copy constructor -{ +inline NRSMat::NRSMat(const NRSMat &rhs) { #ifdef CUDALA - location=rhs.location; + location = rhs.location; #endif v = rhs.v; nn = rhs.nn; count = rhs.count; - if (count) (*count)++; + if(count) (*count)++; } +/***************************************************************************//** + * constructor interpreting a vector of \f$n(n+1)/2\f$ elements as a symmetric + * matrix stored in packed form having \f$n\f$ rows + * @param[in] rhs reference matrix being copied + * @param[in] n count of rows of the matrix being created + ******************************************************************************/ template -NRSMat::NRSMat(const NRVec &rhs, const int n) // type conversion -{ +NRSMat::NRSMat(const NRVec &rhs, const int n) { #ifdef CUDALA - location=rhs.location; + location = rhs.location; #endif nn = n; #ifdef DEBUG - if (NN2 != rhs.size()) - laerror("matrix dimensions incompatible with vector length"); + if(NN2 != rhs.size()){ laerror("incompatible dimensions in NRSMat::NRSMat(const NRVec&, const int)"); } #endif count = rhs.count; v = rhs.v; @@ -203,195 +281,364 @@ NRSMat::NRSMat(const NRVec &rhs, const int n) // type conversion } 
-// S *= a +/***************************************************************************//** + * multiply this real symmetric matrix with real scalar value + * @param[in] a real multiplicative factor + * @return reference to the modified matrix + ******************************************************************************/ template<> -inline NRSMat & NRSMat::operator*=(const double & a) -{ +inline NRSMat & NRSMat::operator*=(const double &a) { copyonwrite(); - cblas_dscal(NN2, a, v, 1); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_dscal(NN2, a, v, 1); +#ifdef CUDALA + }else{ + cublasDscal(NN2, a, v, 1); + TEST_CUBLAS("cublasDscal");//"NRSMat& NRSMat::operator*=(const double &)" + } +#endif return *this; } +/***************************************************************************//** + * multiply this complex symmetric matrix with complex scalar value + * @param[in] a complex multiplicative factor + * @return reference to the modified matrix + ******************************************************************************/ template<> -inline NRSMat< complex > & -NRSMat< complex >::operator*=(const complex & a) -{ +inline NRSMat > & +NRSMat >::operator*=(const complex &a) { copyonwrite(); - cblas_zscal(NN2, &a, v, 1); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zscal(NN2, &a, v, 1); +#ifdef CUDALA + }else{ + const cuDoubleComplex _a = make_cuDoubleComplex(a.real(), a.imag()); + cublasZscal(NN2, _a, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZscal");//"NRSMat >& NRSMat >::operator*=(const complex &)" + } +#endif return *this; } + + +/***************************************************************************//** + * multiply this symmetric matrix of general type T stored in packed form + * with scalar value of type T + * @param[in] a multiplicative factor of type T + * @return reference to the modified matrix + ******************************************************************************/ template -inline NRSMat & NRSMat::operator*=(const T 
& a) -{ - copyonwrite(); - for(int i=0; i & NRSMat::operator*=(const T &a) { + NOT_GPU(*this); + + copyonwrite(); + for(register int i = 0; iT to the diagonal elements of this symmetric matrix of type T + * @param[in] a scalar value \f$\alpha\f$ + * @return reference to the modified matrix + ******************************************************************************/ template -inline NRSMat & NRSMat::operator+=(const T &a) -{ +inline NRSMat & NRSMat::operator+=(const T &a) { + NOT_GPU(*this); + copyonwrite(); - for (int i=0; iT from the + * diagonal elements of this symmetric matrix of type T + * @param[in] a scalar value \f$\alpha\f$ + * @return reference to the modified matrix + ******************************************************************************/ template -inline NRSMat & NRSMat::operator-=(const T &a) -{ +inline NRSMat & NRSMat::operator-=(const T &a) { + NOT_GPU(*this); + copyonwrite(); - for (int i=0; i -inline NRSMat & -NRSMat::operator+=(const NRSMat & rhs) -{ +inline NRSMat& NRSMat::operator+=(const NRSMat & rhs) { #ifdef DEBUG - if (nn != rhs.nn) laerror("incompatible SMats in SMat::operator+="); + if(nn != rhs.nn) laerror("incompatible dimensions in NRSMat& NRSMat::operator+=(const NRSMat &)"); #endif + SAME_LOC(*this, rhs); copyonwrite(); - cblas_daxpy(NN2, 1.0, rhs.v, 1, v, 1); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_daxpy(NN2, 1.0, rhs.v, 1, v, 1); +#ifdef CUDALA + }else{ + cublasDaxpy(NN2, 1.0, rhs.v, 1, v, 1); + TEST_CUBLAS("cublasDaxpy");//" NRSMat& NRSMat::operator+=(const NRSMat &)" + } +#endif return *this; } +/***************************************************************************//** + * add up this complex symmetric matrix with given symmetric matrix + * @param[in] rhs complex symmetric matrix to be added + * @return reference to the modified matrix + ******************************************************************************/ template<> -inline NRSMat< complex > & -NRSMat< complex >::operator+=(const 
NRSMat< complex > & rhs) -{ +inline NRSMat >& NRSMat >::operator+=(const NRSMat > & rhs) { #ifdef DEBUG - if (nn != rhs.nn) laerror("incompatible SMats in SMat::operator+="); + if(nn != rhs.nn) laerror("incompatible dimensions in NRSMat >& NRSMat >::operator+=(const NRSMat > &)"); #endif + SAME_LOC(*this, rhs); copyonwrite(); - cblas_zaxpy(NN2, &CONE, rhs.v, 1, v, 1); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zaxpy(NN2, &CONE, rhs.v, 1, v, 1); +#ifdef CUDALA + }else{ + cublasZaxpy(NN2, CUONE, (cuDoubleComplex*)(rhs.v), 1, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZaxpy");//"NRSMat >& NRSMat >::operator+=(const NRSMat > &)" + } +#endif return *this; } +/***************************************************************************//** + * add up this symmetric matrix of general type T with given symmetric matrix + * @param[in] rhs complex matrix of general type T to be added + * @return reference to the modified matrix + ******************************************************************************/ template -inline NRSMat & NRSMat::operator+=(const NRSMat & rhs) -{ +inline NRSMat& NRSMat::operator+=(const NRSMat& rhs) { #ifdef DEBUG - if (nn != rhs.nn) laerror("incompatible SMats in SMat::operator+="); + if(nn != rhs.nn) laerror("incompatible NRSMat& NRSMat::operator+=(const NRSMat &)"); #endif + NOT_GPU(*this); + SAME_LOC(*this, rhs); + copyonwrite(); - for(int i=0; i +inline NRSMat& NRSMat::operator-=(const NRSMat& rhs) { +#ifdef DEBUG + if(nn != rhs.nn) laerror("incompatible dimensions in NRSMat& NRSMat::operator-=(const NRSMat &)"); +#endif + SAME_LOC(*this, rhs); + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_daxpy(NN2, -1.0, rhs.v, 1, v, 1); +#ifdef CUDALA + }else{ + cublasDaxpy(NN2, -1.0, rhs.v, 1, v, 1); + TEST_CUBLAS("cublasDaxpy");//" NRSMat& NRSMat::operator-=(const NRSMat &)" + } +#endif + return *this; +} + +/***************************************************************************//** + * subtracts given 
complex symmetric matrix from this complex symmetric matrix + * @param[in] rhs complex symmetric matrix to be subtracted + * @return reference to the modified matrix + ******************************************************************************/ +template<> +inline NRSMat >& NRSMat >::operator-=(const NRSMat >& rhs) { +#ifdef DEBUG + if(nn != rhs.nn) laerror("incompatible dimensions in NRSMat >& NRSMat >::operator-=(const NRSMat > &)"); +#endif + SAME_LOC(*this, rhs); + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zaxpy(NN2, &CMONE, rhs.v, 1, v, 1); +#ifdef CUDALA + }else{ + cublasZaxpy(NN2, CUMONE, (cuDoubleComplex*)(rhs.v), 1, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZaxpy");//"NRSMat >& NRSMat >::operator-=(const NRSMat > &)" + } +#endif + return *this; +} + +/***************************************************************************//** + * subtracts given symmetric matrix of general type T from this symmetric matrix of type T + * @param[in] rhs symmetric matrix of general type T to be subtracted + * @return reference to the modified matrix + ******************************************************************************/ +template +inline NRSMat& NRSMat::operator-=(const NRSMat& rhs) { +#ifdef DEBUG + if(nn != rhs.nn) laerror("incompatible NRSMat& NRSMat::operator-=(const NRSMat &)"); +#endif + NOT_GPU(*this); + copyonwrite(); + + for(register int i = 0; i -inline NRSMat & -NRSMat::operator-=(const NRSMat & rhs) -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("incompatible SMats in SMat::operator-="); -#endif - copyonwrite(); - cblas_daxpy(NN2, -1.0, rhs.v, 1, v, 1); - return *this; -} - -template<> -inline NRSMat< complex > & -NRSMat< complex >::operator-=(const NRSMat< complex > & rhs) -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("incompatible SMats in SMat::operator-="); -#endif - copyonwrite(); - cblas_zaxpy(NN2, &CMONE, rhs.v, 1, v, 1); - return *this; -} - 
+/***************************************************************************//** + * add up given dense matrix of general type T with this symmetric matrix of type T + * @param[in] rhs dense matrix of type T to be added + * @return reference to the modified matrix + ******************************************************************************/ template -inline NRSMat & NRSMat::operator-=(const NRSMat & rhs) -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("incompatible SMats in SMat::operator-="); -#endif - copyonwrite(); - for(int i=0; i -inline const NRMat NRSMat::operator+(const NRMat &rhs) const -{ +inline const NRMat NRSMat::operator+(const NRMat &rhs) const { return NRMat(rhs) += *this; } -// SMat - Mat +/***************************************************************************//** + * subtracts given dense matrix of general type T from this symmetric matrix of type T + * @param[in] rhs dense matrix of type T to be added + * @return reference to the modified matrix + ******************************************************************************/ template -inline const NRMat NRSMat::operator-(const NRMat &rhs) const -{ +inline const NRMat NRSMat::operator-(const NRMat &rhs) const { return NRMat(-rhs) += *this; } -// access the element, linear array case +/***************************************************************************//** + * determine matrix element of this symmetric matrix of general type T + * using cumulative index increasing in a row-major way and corresponding to the + * lower triangular part of the respective dense matrix + * @param[in] ij index of the requested element + * @return reference to the corresponding matrix element + ******************************************************************************/ template -inline T & NRSMat::operator[](const int ij) -{ +inline T& NRSMat::operator[](const int ij) { #ifdef DEBUG - if (_LA_count_check && *count != 1) laerror("lval [] with count > 1 in Smat"); - if (ij<0 || ij>=NN2) laerror("SMat [] 
out of range"); - if (!v) laerror("[] for unallocated Smat"); -#endif - return v[ij]; -} -template -inline const T & NRSMat::operator[](const int ij) const -{ -#ifdef DEBUG - if (ij<0 || ij>=NN2) laerror("SMat [] out of range"); - if (!v) laerror("[] for unallocated Smat"); + if(_LA_count_check && *count != 1) laerror("T& NRSMat::operator[] used for matrix with count>1"); + if(ij<0 || ij>=NN2) laerror("T& NRSMat::operator[] out of range"); + if(!v) laerror("T& NRSMat::operator[] used for unallocated NRSmat object"); #endif + NOT_GPU(*this); + return v[ij]; } -template -inline T SMat_index(T i, T j) -{ -return (i>=j) ? i*(i+1)/2+j : j*(j+1)/2+i; +/***************************************************************************//** + * determine matrix element of this symmetric matrix of general type T + * using cumulative index increasing in a row-major way and corresponding to the + * lower triangular part of the respective dense matrix, i.e. \f$A_{i,j}\f$ for + * \f$N>i\geq{}j\geq0\f$ corresponds to cumulative index \f$i(i+1)/2+j\f$ + * @param[in] ij index of the requested element + * @return constant reference to the corresponding matrix element + ******************************************************************************/ +template +inline const T & NRSMat::operator[](const int ij) const { +#ifdef DEBUG + if(ij<0 || ij>=NN2) laerror("T& NRSMat::operator[] out of range"); + if(!v) laerror("T& NRSMat::operator[] used for unallocated NRSmat object"); +#endif + NOT_GPU(*this); + + return v[ij]; } +/***************************************************************************//** + * determine the cumulative index or matrix element in row \f$i\f$ and column \f$j\f$ + * where \f$0\leq{}i,j -inline T SMat_index_igej(T i, T j) -{ -return i*(i+1)/2+j; +inline T SMat_index(T i, T j) { + return (i>=j) ? 
i*(i+1)/2+j : j*(j+1)/2+i; } +/***************************************************************************//** + * determine the cumulative index or matrix element in row \f$i\f$ and column \f$j\f$ + * where \f$0\leq{}i,j -inline T SMat_index_ilej(T i, T j) -{ -return j*(j+1)/2+i; +inline T SMat_index_igej(T i, T j) { + return i*(i+1)/2+j; } - +/***************************************************************************//** + * determine the cumulative index or matrix element in row \f$i\f$ and column \f$j\f$ + * where \f$0\leq{}i,j -inline T SMat_index_1(T i, T j) -{ -return (i>=j)? i*(i-1)/2+j-1 : j*(j-1)/2+i-1; +inline T SMat_index_ilej(T i, T j) { + return j*(j+1)/2+i; } +/***************************************************************************//** + * determine the cumulative index or matrix element in row \f$i\f$ and column \f$j\f$ + * where \f$1\leq{}i,j\leq{}N\f$ + * @param[in] i row index + * @param[in] i column index + * @return cumulative index + ******************************************************************************/ template -inline T SMat_index_1igej(T i, T j) -{ -return i*(i-1)/2+j-1; +inline T SMat_index_1(T i, T j) { + return (i>=j)? 
i*(i-1)/2+j-1 : j*(j-1)/2+i-1; } +/***************************************************************************//** + * determine the cumulative index or matrix element in row \f$i\f$ and column \f$j\f$ + * where \f$1\leq{}i,j\leq{}N\f$ for special case \f$i\geq{}j\f$ + * @param[in] i row index + * @param[in] i column index + * @return cumulative index + ******************************************************************************/ template -inline T SMat_index_1ilej(T i, T j) -{ -return j*(j-1)/2+i-1; +inline T SMat_index_1igej(T i, T j) { + return i*(i-1)/2+j-1; +} + +/***************************************************************************//** + * determine the cumulative index or matrix element in row \f$i\f$ and column \f$j\f$ + * where \f$1\leq{}i,j\leq{}N\f$ for special case \f$i\leq{}j\f$ + * @param[in] i row index + * @param[in] i column index + * @return cumulative index + ******************************************************************************/ +template +inline T SMat_index_1ilej(T i, T j) { + return j*(j-1)/2+i-1; } //indexing for antisymmetric matrix (used by fourindex) @@ -399,397 +646,549 @@ return j*(j-1)/2+i-1; template inline T ASMat_index(T i, T j) { -if(i==j) return -1; +if(i == j) return -1; return (i>j) ? i*(i-1)/2+j : j*(j-1)/2+i; } template inline T ASMat_index_1(T i, T j) { -if(i==j) return -1; +if(i == j) return -1; return (i>j)? 
(i-2)*(i-1)/2+j-1 : (j-2)*(j-1)/2+i-1; } - - -// access the element, 2-dim array case +/***************************************************************************//** + * determine matrix element of this symmetric matrix of general type T + * @param[in] i row index running from 0 + * @param[in] j column index running from 0 + * @return reference to the corresponding matrix element + * @see count, SMat_index, NRSMat::operator[] + ******************************************************************************/ template -inline T & NRSMat::operator()(const int i, const int j) -{ +inline T & NRSMat::operator()(const int i, const int j) { #ifdef DEBUG - if (_LA_count_check && *count != 1) laerror("lval (i,j) with count > 1 in Smat"); - if (i<0 || i>=nn || j<0 || j>=nn) laerror("SMat (i,j) out of range"); - if (!v) laerror("(i,j) for unallocated Smat"); -#endif - return v[SMat_index(i,j)]; -} -template -inline const T & NRSMat::operator()(const int i, const int j) const -{ -#ifdef DEBUG - if (i<0 || i>=nn || j<0 || j>=nn) laerror("SMat (i,j) out of range"); - if (!v) laerror("(i,j) for unallocated Smat"); + if(_LA_count_check && *count != 1) laerror("T & NRSMat::operator()(const int, const int) used for matrix with count > 1"); + if(i<0 || i>=nn || j<0 || j>=nn) laerror("T & NRSMat::operator()(const int, const int) out of range"); + if(!v) laerror("T & NRSMat::operator()(const int, const int) used for unallocated NRSmat object"); #endif + NOT_GPU(*this); + return v[SMat_index(i,j)]; } -// return the number of rows and columns +/***************************************************************************//** + * determine matrix element of this symmetric matrix of general type T + * @param[in] i row index running from 0 + * @param[in] j column index running from 0 + * @return constant reference to the corresponding matrix element + * @see count, SMat_index, NRSMat::operator[] + ******************************************************************************/ template -inline 
int NRSMat::nrows() const -{ - return nn; +inline const T & NRSMat::operator()(const int i, const int j) const { +#ifdef DEBUG + if(i<0 || i>=nn || j<0 || j>=nn) laerror("T & NRSMat::operator()(const int, const int) out of range"); + if(!v) laerror("T & NRSMat::operator()(const int, const int) used for unallocated NRSmat object"); +#endif + NOT_GPU(*this); + + return v[SMat_index(i,j)]; } + +/***************************************************************************//** + * @return number of rows of this symmetric matrix of generalt type T + ******************************************************************************/ template -inline int NRSMat::ncols() const -{ +inline int NRSMat::nrows() const { return nn; } +/***************************************************************************//** + * @return number of columns of this symmetric matrix of generalt type T + ******************************************************************************/ template -inline int NRSMat::size() const -{ +inline int NRSMat::ncols() const { + return nn; +} + +/***************************************************************************//** + * @return number of elements of this symmetric matrix of generalt type T + ******************************************************************************/ +template +inline int NRSMat::size() const { return NN2; } -// max value +/***************************************************************************//** + * for this real symmetric matrix \f$A\f$, determine the + * first element with largest absolute value + * @return \f$A_{l,m}\f$ which maximizes \f$\left|A_{i,j}\right|\f$ + ******************************************************************************/ template<> -inline const double NRSMat::amax() const -{ - return v[cblas_idamax(NN2, v, 1)]; -} -template<> -inline const complex NRSMat< complex >::amax() const -{ - return v[cblas_izamax(NN2, v, 1)]; +inline const double NRSMat::amax() const { + double ret(0.0); +#ifdef CUDALA + 
if(location == cpu){ +#endif + ret = v[cblas_idamax(NN2, v, 1) - 1]; +#ifdef CUDALA + }else{ + const int pozice = cublasIdamax(NN2, v, 1) - 1; + TEST_CUBLAS("cublasIdamax");//"double NRSMat::amax()" + + gpuget(1, sizeof(double), v + pozice, &ret); + } +#endif + return ret; } -// reference pointer to Smat -template -inline NRSMat:: operator T*() -{ -#ifdef DEBUG - if (!v) laerror("unallocated SMat in operator T*"); +/***************************************************************************//** + * for this real symmetric matrix \f$A\f$, determine the + * first element with smallest absolute value + * @return \f$A_{l,m}\f$ which minimizes \f$\left|A_{i,j}\right|\f$ + ******************************************************************************/ +template<> +inline const double NRSMat::amin() const { + double ret(0.0); +#ifdef CUDALA + if(location == cpu){ #endif - return v; + // idamin seems not to be supported + double val(0.0); + int index(-1); + ret = std::numeric_limits::max(); + for(register int i = 0; i < NN2; i++){ + val = std::abs(v[i]); + if(val < ret){ index = i; ret = val; } + } + ret = v[index]; +#ifdef CUDALA + }else{ + /* search all NN2 packed elements, exactly as the CPU branch and amax() do; nn is only the matrix dimension */ + const int pozice = cublasIdamin(NN2, v, 1) - 1; + TEST_CUBLAS("cublasIdamin");//"double NRSMat::amin()" + gpuget(1, sizeof(double), v + pozice, &ret); + } +#endif + return ret; } + +/***************************************************************************//** + * for this complex symmetric matrix \f$A\f$, determine the + * first element with largest "absolute value" + * @return \f$A_{l,m}\f$ which maximizes \f$\left|\Re{}A_{i,j}\right| + \left|\Im{}A_{i,j}\right|\f$ + ******************************************************************************/ +template<> +inline const complex NRSMat< complex >::amax() const{ + complex ret(0., 0.); +#ifdef CUDALA + if(location == cpu){ +#endif + ret = v[cblas_izamax(NN2, v, 1) - 1]; +#ifdef CUDALA + }else{ + const int pozice = cublasIzamax(NN2, (cuDoubleComplex*)v, 1) - 1; +
TEST_CUBLAS("cublasIzamax");//"complex NRSMat >::amax()" + gpuget(1, sizeof(complex), v + pozice, &ret); + } +#endif + return ret; +} + +/***************************************************************************//** + * for this complex symmetric matrix \f$A\f$, determine the + * first element with smallest "absolute value" + * @return \f$A_{l,m}\f$ which minimizes \f$\left|\Re{}A_{i,j}\right| + \left|\Im{}A_{i,j}\right|\f$ + ******************************************************************************/ +template<> +inline const complex NRSMat >::amin() const{ + complex ret(0., 0.); +#ifdef CUDALA + if(location == cpu){ +#endif + // izamin seems not to be supported + int index(0); + double val(0.0), min_val(0.0); + complex z_val(0.0, 0.0); + + min_val = std::numeric_limits::max(); + for(register int i = 0; i < NN2; i++){ + z_val = v[i]; + val = std::abs(z_val.real()) + std::abs(z_val.imag()); + if(val < min_val){ index = i; min_val = val; } + } + ret = v[index]; +#ifdef CUDALA + }else{ + /* search all NN2 packed elements, exactly as the CPU branch and amax() do; nn is only the matrix dimension */ + const int pozice = cublasIzamin(NN2, (cuDoubleComplex*)v, 1) - 1; + TEST_CUBLAS("cublasIzamin");//"complex NRSMat >::amin()" + gpuget(1, sizeof(complex), v + pozice, &ret); + } +#endif + return ret; +} + +/***************************************************************************//** + * @return pointer of general type T to the underlying data structure + ******************************************************************************/ template -inline NRSMat:: operator const T*() const -{ +inline NRSMat::operator T*() { #ifdef DEBUG - if (!v) laerror("unallocated SMat in operator T*"); + if(!v) laerror("unallocated NRSMat object in NRSMat::operator T*()"); #endif return v; } -//basic stuff to be available for any type ...
must be in .h - -// dtor +/***************************************************************************//** + * @return constant pointer of general type T to the underlying data structure + ******************************************************************************/ template -NRSMat::~NRSMat() -{ - if (!count) return; - if (--(*count) <= 0) { - if (v) - { -#ifdef CUDALA - if(location==cpu) -#endif - delete[] v; -#ifdef CUDALA - else gpufree(v); -#endif - } - delete count; - } -} - - -// assignment with a physical copy -template -NRSMat & NRSMat::operator|=(const NRSMat &rhs) -{ +inline NRSMat::operator const T*() const { #ifdef DEBUG - if (!rhs.v) laerror("unallocated rhs in NRSMat operator |="); + if(!v) laerror("unallocated NRSMat object in NRSMat::operator const T*()"); #endif - if (this == &rhs) return *this; - *this = rhs; - this->copyonwrite(); - return *this; + return v; +} + +/***************************************************************************//** + * destructor for general type T + * @see NRSMat::count, NRSMat::v + ******************************************************************************/ +template +NRSMat::~NRSMat() { + if(!count) return; + if(--(*count) <= 0) { + if(v){ +#ifdef CUDALA + if(location == cpu){ +#endif + delete[] v; +#ifdef CUDALA + }else{ gpufree(v); } +#endif + } + delete count; + } +} + +/***************************************************************************//** + * assigment operator implementing deep copy of the reference NRSMat object + * @see NRSMat::operator=, NRSMat::copyonwrite() + ******************************************************************************/ +template +NRSMat & NRSMat::operator|=(const NRSMat &rhs) { +#ifdef DEBUG + if(!rhs.v) laerror("unallocated NRSMat object in NRSMat & NRSMat::operator|=(const NRSMat &)"); +#endif + if(this == &rhs) return *this; + *this = rhs; + this->copyonwrite(); + return *this; } -// assignment 
+/***************************************************************************//** + * assignment operator implementing shallow copy of reference NRSMat object + * @see NRSMat::operator|=, NRSMat::copyonwrite() + ******************************************************************************/ template -NRSMat & NRSMat::operator=(const NRSMat & rhs) -{ - if (this == & rhs) return *this; - if (count) - if(--(*count) == 0) - { +NRSMat & NRSMat::operator=(const NRSMat & rhs) { + if(this == &rhs) return *this; + if(count) + if(--(*count) == 0){ #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif - delete [] v; + delete [] v; #ifdef CUDALA - else - gpufree(v); + }else{ gpufree(v); } #endif - delete count; - } - v = rhs.v; - nn = rhs.nn; - count = rhs.count; + delete count; + } + v = rhs.v; + nn = rhs.nn; + count = rhs.count; #ifdef CUDALA - location=rhs.location; + location = rhs.location; #endif - if (count) (*count)++; + if(count) (*count)++; return *this; } - -// make new instation of the Smat, deep copy +/***************************************************************************//** + * detach this NRSMat object and create own physical copy of the data + * @see NRSMat::operator|=, NRSMat::copyonwrite() + ******************************************************************************/ template -void NRSMat::copyonwrite() -{ - if (!count) laerror("SMat::copyonwrite() of undefined Smat"); - if (*count > 1) { - (*count)--; - count = new int; - *count = 1; +void NRSMat::copyonwrite() { + if(!count) laerror("calling NRSMat::copyonwrite() for undefined NRSMat object"); + if(*count > 1){ + (*count)--; + count = new int; + *count = 1; T *newv; #ifdef CUDALA - if(location==cpu) - { + if(location == cpu) { #endif - newv = new T[NN2]; - memcpy(newv, v, NN2*sizeof(T)); + newv = new T[NN2]; + memcpy(newv, v, NN2*sizeof(T)); #ifdef CUDALA - } - else - { - newv = (T *) gpualloc(NN2*sizeof(T)); - if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem"); - 
cublasScopy(NN2*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1); - } + }else{ + newv = (T *) gpualloc(NN2*sizeof(T)); + if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRSMat::copyonwrite()"); + cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1); + TEST_CUBLAS("cublasScopy");//"NRSMat::copyonwrite()" + } #endif v = newv; } } - -// resize Smat +/***************************************************************************//** + * resize this symmetric matrix of general type T + * @param[in] n requested number of rows (columns) + ******************************************************************************/ template -void NRSMat::resize(const int n) -{ +void NRSMat::resize(const int n) { #ifdef DEBUG - if (n < 0) laerror("illegal matrix dimension in resize of Smat"); + if(n < 0) laerror("illegal dimension in NRSMat::resize(const int)"); #endif - if (count) - { - if(n==0) - { - if(--(*count) <= 0) { - if(v) { + if(count){ + if(n == 0){ + if(--(*count) <= 0) { + if(v) { #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif - delete[] (v); + delete[] (v); #ifdef CUDALA - else - gpufree(v); + }else{ gpufree(v); } #endif + } + delete count; } - delete count; - } - count=0; - nn=0; - v=0; - return; - } - - if(*count > 1) { //detach from previous - (*count)--; - count = 0; - v = 0; - nn = 0; - } + count = 0; + nn = 0; + v = 0; + return; + } + if(*count > 1){ + (*count)--; + count = 0; + v = 0; + nn = 0; + } } - if (!count) { //new uninitialized vector or just detached - count = new int; - *count = 1; - nn = n; + if(!count){ + count = new int; + *count = 1; + nn = n; #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif - v = new T[NN2]; + v = new T[NN2]; #ifdef CUDALA - else - v = (T*) gpualloc(NN2*sizeof(T)); + }else{ v = (T*) gpualloc(NN2*sizeof(T)); } #endif - return; - } - if (n != nn) { - nn = n; + return; + } + if(n != nn){ + nn = n; #ifdef CUDALA - if(location==cpu) + if(location == 
cpu){ #endif - { - delete[] v; - v = new T[NN2]; - } + delete[] v; + v = new T[NN2]; #ifdef CUDALA - else - { - gpufree(v); - v = (T*) gpualloc(NN2*sizeof(T)); - } + }else{ + + gpufree(v); + v = (T*) gpualloc(NN2*sizeof(T)); + } #endif - } + } } +/***************************************************************************//** + * perform memory transfers between CPU and GPU memory + * @param[in] dest memory destination + * @see NRSMat::location, NRSMat::getlocation() + ******************************************************************************/ #ifdef CUDALA template -void NRSMat::moveto(const GPUID dest) -{ -if(location==dest) return; -CPU_GPU(location,dest); -location=dest; +void NRSMat::moveto(const GPUID dest) { + if(location == dest) return; -if(v && !count) laerror("internal inconsistency of reference counting 1"); -if (!count) return; + CPU_GPU(location,dest); + location = dest; -if(v && *count==0) laerror("internal inconsistency of reference counting 2"); -if(!v) return; + if(v && !count) laerror("internal inconsistency of reference counting 1"); + if(!count) return; -T *vold = v; + if(v && *count == 0) laerror("internal inconsistency of reference counting 2"); + if(!v) return; -if(dest == cpu) //moving from GPU to CPU - { - v = new T[NN2]; - gpuget(NN2,sizeof(T),vold,v); - if(*count == 1) gpufree(vold); - else {--(*count); count = new int(1);} - } -else //moving from CPU to GPU - { - v=(T *) gpualloc(NN2*sizeof(T)); - gpuput(NN2,sizeof(T),vold,v); - if(*count == 1) delete[] vold; - else {--(*count); count = new int(1);} - } + T *vold = v; + + if(dest == cpu){ //moving from GPU to CPU + v = new T[NN2]; + gpuget(NN2, sizeof(T), vold, v); + if(*count == 1) gpufree(vold); + else {--(*count); count = new int(1);} + + }else{ //moving from CPU to GPU + + v = (T *) gpualloc(NN2*sizeof(T)); + gpuput(NN2, sizeof(T), vold, v); + if(*count == 1) delete[] vold; + else {--(*count); count = new int(1);} + } } #endif - - 
+/***************************************************************************//** + * complexify a given matrix \f$A\f$ of general type T + * @param[in] rhs matrix \f$A\f$ intended for this operation + * @return matrix \f$B\f$ where \f$\Re B=A\f$ and \f$\Im B = 0\f$ + ******************************************************************************/ template -NRSMat > complexify(const NRSMat &rhs) -{ -NRSMat > r(rhs.nrows()); -for(int i=0; i > complexify(const NRSMat &rhs) { + NOT_GPU(rhs); + + NRSMat > r(rhs.nrows()); + for(register int i = 0; i +NRSMat > complexify(const NRSMat &rhs) { + NRSMat > r(rhs.nrows(), rhs.getlocation()); +#ifdef CUDALA + if(rhs.getlocation() == cpu){ +#endif + cblas_dcopy(rhs.size(), &(rhs[0]), 1, (double*)(&(r[0])), 2); +#ifdef CUDALA + }else{ + cublasDcopy(rhs.size(), &(rhs[0]), 1, (double*)(&(r[0])), 2); + TEST_CUBLAS("cublasDcopy");//"NRSMat > complexify(const NRSMat &)" + } +#endif + return r; +} +*/ +/***************************************************************************//** + * output operator + * @param[in,out] s output stream + * @param[in] x NRSMat matrix to be printed out + * @return modified stream + ******************************************************************************/ template -std::ostream& operator<<(std::ostream &s, const NRSMat &x) - { +std::ostream& operator<<(std::ostream &s, const NRSMat &x) { #ifdef CUDALA -if(x.getlocation()==cpu) - { + if(x.getlocation() == cpu){ #endif - int i,j,n; - n=x.nrows(); - s << n << ' ' << n << '\n'; - for(i=0;i::IOtype)x(i,j) << (j==n-1 ? '\n' : ' '); - } - return s; + int i,j,n; + n = x.nrows(); + s << n << ' ' << n << '\n'; + for(i = 0;i::IOtype)x(i,j) << (j == n-1 ? 
'\n' : ' '); + } + return s; #ifdef CUDALA - } -else - { - NRSMat tmp=x; - tmp.moveto(cpu); - return s< tmp = x; + tmp.moveto(cpu); + return s< matrix for storing the input + * @return modified stream + ******************************************************************************/ template -std::istream& operator>>(std::istream &s, NRSMat &x) - { +std::istream& operator>>(std::istream &s, NRSMat &x) { #ifdef CUDALA -if(x.getlocation()==cpu) - { + if(x.getlocation() == cpu){ #endif - int i,j,n,m; - s >> n >> m; - if(n!=m) laerror("input symmetric matrix not square"); - x.resize(n); + int i,j,n,m; + s >> n >> m; + if(n!=m) laerror("input symmetric matrix not square"); + x.resize(n); typename LA_traits_io::IOtype tmp; - for(i=0;i>tmp; x(i,j)=tmp;} - return s; + for(i = 0;i>tmp; x(i,j)=tmp;} + return s; #ifdef CUDALA - } -else - { - NRSMat tmp; - tmp.moveto(cpu); - s >> tmp; - tmp.moveto(x.getlocation()); - x=tmp; - return s; - } + }else{ + NRSMat tmp; + tmp.moveto(cpu); + s >> tmp; + tmp.moveto(x.getlocation()); + x = tmp; + return s; + } #endif - } +} -// generate operators: SMat + a, a + SMat, SMat * a +/***************************************************************************//** + * generate operators relating NRSMat objects and scalars + * corresponding macro is defined in vec.h + ******************************************************************************/ NRVECMAT_OPER(SMat,+) NRVECMAT_OPER(SMat,-) NRVECMAT_OPER(SMat,*) -// generate SMat + SMat, SMat - SMat + +/***************************************************************************//** + * generate operators relating in general two NRSMat objects + * corresponding macro is defined in vec.h + ******************************************************************************/ NRVECMAT_OPER2(SMat,+) NRVECMAT_OPER2(SMat,-) -//optional indexing from 1 -//all possible constructors have to be given explicitly, other stuff is inherited -//with exception of the operator() which differs 
+/***************************************************************************//** + * class implementing NRSMat funcitonality with indices running from 1 + * allmost all function members are inherited, only constructors are given explicitly + ******************************************************************************/ template class NRSMat_from1 : public NRSMat { public: - NRSMat_from1(): NRSMat() {}; - explicit NRSMat_from1(const int n): NRSMat(n) {}; + NRSMat_from1(): NRSMat() {}; + explicit NRSMat_from1(const int n): NRSMat(n) {}; NRSMat_from1(const NRSMat &rhs): NRSMat(rhs) {}; //be able to convert the parent class transparently to this - NRSMat_from1(const T &a, const int n): NRSMat(a,n) {}; - NRSMat_from1(const T *a, const int n): NRSMat(a,n) {}; - explicit NRSMat_from1(const NRMat &rhs): NRSMat(rhs) {}; - explicit NRSMat_from1(const NRVec &rhs, const int n): NRSMat(rhs,n) {}; + NRSMat_from1(const T &a, const int n): NRSMat(a,n) {}; + NRSMat_from1(const T *a, const int n): NRSMat(a,n) {}; + explicit NRSMat_from1(const NRMat &rhs): NRSMat(rhs) {}; + explicit NRSMat_from1(const NRVec &rhs, const int n): NRSMat(rhs,n) {}; - inline const T& operator() (const int i, const int j) const - { -#ifdef DEBUG - if(i<=0||j<=0||i>NRSMat::nn||j>NRSMat::nn) laerror("index out of range in NRSMat_from1"); -#endif + inline const T& operator() (const int i, const int j) const { + #ifdef DEBUG + if(i<=0||j<=0||i>NRSMat::nn||j>NRSMat::nn) laerror("index in const T& NRSMat::operator() (const int, const int) out of range"); + #endif return NRSMat::v[SMat_index_1(i,j)]; - } - inline T& operator() (const int i, const int j) - { -#ifdef DEBUG - if(i<=0||j<=0||i>NRSMat::nn||j>NRSMat::nn) laerror("index out of range in NRSMat_from1"); -#endif + } + + inline T& operator() (const int i, const int j){ + #ifdef DEBUG + if(i<=0||j<=0||i>NRSMat::nn||j>NRSMat::nn) laerror("index in T& NRSMat::operator() (const int, const int) out of range"); + #endif return NRSMat::v[SMat_index_1(i,j)]; - } 
+ } }; }//namespace diff --git a/vec.cc b/vec.cc index c6b9bf1..c6c95d9 100644 --- a/vec.cc +++ b/vec.cc @@ -1,4 +1,5 @@ -/* +/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */ +/******************************************************************************* LA: linear algebra C++ interface library Copyright (C) 2008 Jiri Pittner or complex versions written by Roman Curik @@ -16,9 +17,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . -*/ - - +*******************************************************************************/ #include #include #include @@ -28,22 +27,822 @@ #include #include "vec.h" #include "qsort.h" + extern "C" { -extern ssize_t read(int, void *, size_t); -extern ssize_t write(int, const void *, size_t); + extern ssize_t read(int, void *, size_t); + extern ssize_t write(int, const void *, size_t); } namespace LA { -////////////////////////////////////////////////////////////////////////////// -//// forced instantization in the corespoding object file +/***************************************************************************//** + * conversion constructor interpreting a given matrix with \f$N\f$ rows and + * \f$M\f$ columns of general type T as a vector of \f$N\times{}M\f$ + * elements + * @param[in] rhs matrix being converted + * @see NRMat::NRMat() + ******************************************************************************/ +#ifndef MATPTR +template +NRVec::NRVec(const NRMat &rhs) { +#ifdef CUDALA + location = rhs.location; +#endif + nn = rhs.nn*rhs.mm; + v = rhs.v; + count = rhs.count; + (*count)++; +} +#endif + +/***************************************************************************//** + * routine for raw output + * @param[in] fd file descriptor for output + * @param[in] dim number of elements intended for output + * @param[in] transp reserved + * @see NRMat::put() + ******************************************************************************/ +template +void 
NRVec::put(int fd, bool dim, bool transp) const { +#ifdef CUDALA + if(location != cpu){ + NRVec tmp = *this; + tmp.moveto(cpu); + tmp.put(fd,dim,transp); + return; + } +#endif + errno = 0; + int pad(1); //align at least 8-byte + if(dim){ + if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("write failed"); + if(sizeof(int) != write(fd,&pad,sizeof(int))) laerror("write failed"); + } + LA_traits::multiput(nn,fd,v,dim); +} + +/***************************************************************************//** + * routine for raw input + * @param[in] fd file descriptor for input + * @param[in] dim number of elements intended for input, for dim=0 perform copyonwrite + * @param[in] transp reserved + * @see NRMat::get(), copyonwrite() + ******************************************************************************/ +template +void NRVec::get(int fd, bool dim, bool transp) { +#ifdef CUDALA + if(location != cpu){ + NRVec tmp; + tmp.moveto(cpu); + tmp.get(fd,dim,transp); + tmp.moveto(location); + *this = tmp; + return; + } +#endif + int nn0[2]; //align at least 8-byte + errno = 0; + if(dim){ + if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("read failed"); + resize(nn0[0]); + }else{ + copyonwrite(); + } + LA_traits::multiget(nn,fd,v,dim); +} + +/***************************************************************************//** + * routine for formatted output via lawritemat + * @param[in] file pointer to FILE structure representing the output file + * @param[in] format format specification in standard printf-like form + * @param[in] modulo + * @see lawritemat() + ******************************************************************************/ +template +void NRVec::fprintf(FILE *file, const char *format, const int modulo) const { + NOT_GPU(*this); + + lawritemat(file, v, 1, nn, format, 1, modulo, 0); +} + +/***************************************************************************//** + * routine for formatted input via fscanf + * @param[in] f pointer to FILE structure 
representing the input file + * @param[in] format format specification in standard printf-like form + ******************************************************************************/ +template +void NRVec::fscanf(FILE *f, const char *format) { + int n(0); + NOT_GPU(*this); + + if(::fscanf(f, "%d", &n) != 1) laerror("can not read vector dimension"); + resize(n); + for(register int i=0; i +const NRVec NRVec::operator-() const { + NRVec result(*this); + result.copyonwrite(); + /* negate the detached copy held by result; scaling v would modify *this and return an un-negated vector */ +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_dscal(nn, -1.0, result.v, 1); +#ifdef CUDALA + }else{ + cublasDscal(nn, -1.0, result.v, 1); + TEST_CUBLAS("cublasDscal"); + } +#endif + return result; +} + +/***************************************************************************//** + * unary minus operator in case of complex double-precision vector + * @return the modified vector by value + ******************************************************************************/ +template<> +const NRVec > NRVec >::operator-() const { + NRVec > result(*this); + result.copyonwrite(); + /* negate the detached copy held by result; scaling v would modify *this and return an un-negated vector */ +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zdscal(nn, -1.0, result.v, 1); +#ifdef CUDALA + }else{ + cublasZdscal(nn, -1.0, (cuDoubleComplex*)result.v, 1); + TEST_CUBLAS("cublasZdscal"); + } +#endif + return result; +} + +/***************************************************************************//** + * unary minus operator for vector of general type + * @return the modified vector + ******************************************************************************/ +template +const NRVec NRVec::operator-() const { + NOT_GPU(*this); + NRVec result(nn, getlocation()); + for(register int i=0; i +const bool NRVec::operator>(const NRVec &rhs) const { + int n(nn); + + SAME_LOC(*this, rhs); + NOT_GPU(*this); + + if(rhs.nn < n) n = rhs.nn; + for(register int i=0; i::bigger(v[i], rhs.v[i])) return true; + if(LA_traits::smaller(v[i], rhs.v[i])) return false; + } + return nn>rhs.nn; +} +
+/***************************************************************************//** + * comparison operator (lexicographical order) + * @param[in] rhs vector intended for comparison + * @return + * \li \c false current vector is bigger than vector \c rhs + * \li \c true current vector is smaller than vector \c rhs + ******************************************************************************/ +template +const bool NRVec::operator<(const NRVec &rhs) const { + int n(nn); + + SAME_LOC(*this, rhs); + NOT_GPU(*this); + + if(rhs.nn < n) n = rhs.nn; + for(register int i=0; i::smaller(v[i], rhs.v[i])) return true; + if(LA_traits::bigger(v[i], rhs.v[i])) return false; + } + return nn +void NRVec::randomize(const double &x){ + NOT_GPU(*this); + + for(register int i=0; i +void NRVec >::randomize(const double &x) { + NOT_GPU(*this); + + for(register int i=0; i(x*(2.*random()/(1. + RAND_MAX) - 1.), x*(2.*random()/(1. + RAND_MAX) - 1.)); + } +} + +/***************************************************************************//** + * constructor creating complex vector from a real one + * @param[in] rhs the real vector being converted into the complex one + * @param[in] imagpart + * \li \c true vector \c rhs is interpreted as the imaginary part of the new complex vector + * \li \c false vector \c rhs is interpreted as the real part of the new complex vector + * @return + * \li \c false current vector is bigger than vector \c rhs + * \li \c true current vector is smaller than vector \c rhs + ******************************************************************************/ +template<> +NRVec >::NRVec(const NRVec &rhs, bool imagpart): nn(rhs.size()){ + + count = new int; + *count = 1; +#ifdef CUDALA + location = rhs.getlocation(); + if(location == cpu){ +#endif + v = (complex*)new complex[nn]; + memset(v, 0, nn*sizeof(complex)); + cblas_dcopy(nn, &rhs[0], 1, ((double *)v) + (imagpart?1:0), 2); +#ifdef CUDALA + }else{ + v = (complex*) gpualloc(nn*sizeof(complex)); + + cublasZscal(nn, 
CUZERO, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZscal"); + + cublasDcopy(nn, &rhs[0], 1, ((double *)v) + (imagpart?1:0), 2); + TEST_CUBLAS("cublasDcopy"); + } +#endif +} + +/***************************************************************************//** + * perform the axpy operation on the current real vector \f$\vec{v}\f$, i.e. + * \f[ \vec{v} \leftarrow \vec{v} + \alpha\vec{x} \f] + * @param[in] alpha double-precision real parameter \f$\alpha\f$ + * @param[in] x double-precision real vector \f$\vec{x}\f$ + ******************************************************************************/ +template<> +void NRVec::axpy(const double alpha, const NRVec &x) { +#ifdef DEBUG + if (nn != x.nn) laerror("incompatible vectors"); +#endif + SAME_LOC(*this, x); + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_daxpy(nn, alpha, x.v, 1, v, 1); +#ifdef CUDALA + }else{ + cublasDaxpy(nn, alpha, x.v, 1, v, 1); + TEST_CUBLAS("cublasDaxpy"); + } +#endif +} + +/***************************************************************************//** + * perform the axpy operation on the current complex vector \f$\vec{v}\f$, i.e. 
+ * \f[ \vec{v} \leftarrow \vec{v} + \alpha\vec{x} \f] + * @param[in] alpha \f$\alpha\f$ parameter + * @param[in] x complex vector \f$\vec{x}\f$ + ******************************************************************************/ +template<> +void NRVec >::axpy(const complex alpha, const NRVec > &x){ +#ifdef DEBUG + if (nn != x.nn) laerror("incompatible vectors"); +#endif + SAME_LOC(*this, x); + copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zaxpy(nn, &alpha, x.v, 1, v, 1); +#ifdef CUDALA + }else{ + const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag()); + cublasZaxpy(nn, _alpha, (cuDoubleComplex*)x.v, 1, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZaxpy"); + } +#endif + +} + +/***************************************************************************//** + * perform the axpy operation on the current real vector \f$\vec{v}\f$, i.e. + * \f[ \vec{v} \leftarrow \vec{v} + \alpha\vec{x} \f] + * @param[in] alpha \f$\alpha\f$ parameter + * @param[in] x pointer to double-precision real data + * @param[in] stride sets the stride + ******************************************************************************/ +template<> +void NRVec::axpy(const double alpha, const double *x, const int stride){ + NOT_GPU(*this); + + copyonwrite(); + cblas_daxpy(nn, alpha, x, stride, v, 1); +} + +/***************************************************************************//** + * perform the axpy operation on the current complex vector \f$\vec{v}\f$, i.e. 
+ * \f[ \vec{v} \leftarrow \vec{v} + \alpha\vec{x} \f] + * @param[in] alpha double-precision complex parameter \f$\alpha\f$ + * @param[in] x pointer to double-precision complex data + * @param[in] stride sets the stride + ******************************************************************************/ +template<> +void NRVec >::axpy(const complex alpha, const complex *x, const int stride){ + NOT_GPU(*this); + + copyonwrite(); + cblas_zaxpy(nn, &alpha, x, stride, v, 1); +} + +/***************************************************************************//** + * assign real scalar value to every element of the current vector + * @param[in] a scalar value to be assigned + * @return reference to the modified vector + ******************************************************************************/ +template<> +NRVec& NRVec::operator=(const double &a){ +copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_dcopy(nn, &a, 0, v, 1); +#ifdef CUDALA + }else{ + /* broadcast a, matching the CPU branch; NOTE(review): assumes smart_gpu_set(n, value, ptr) fills n elements with value - confirm */ + smart_gpu_set(nn, a, v); + } +#endif + return *this; +} + +/***************************************************************************//** + * assign complex scalar value to every element of the current vector + * @param[in] a scalar value to be assigned + * @return reference to the modified vector + ******************************************************************************/ +template<> +NRVec >& NRVec >::operator=(const complex &a){ +copyonwrite(); + +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_zcopy(nn, &a, 0, v, 1); +#ifdef CUDALA + }else{ + /* broadcast a, matching the CPU branch; NOTE(review): assumes smart_gpu_set(n, value, ptr) fills n elements with value - confirm */ + smart_gpu_set(nn, a, v); + } +#endif + return *this; +} + +/***************************************************************************//** + * assign scalar value to every element of the current vector of general type T + * @param[in] a scalar value to be assigned + * @return reference to the modified vector + ******************************************************************************/ +template +NRVec& NRVec::operator=(const T &a){
+ NOT_GPU(*this); + copyonwrite(); + + if(a != (T)0){ + for(register int i=0; i +NRVec& NRVec::normalize(double *norm){ + double tmp(0.0); +#ifdef CUDALA + if(location == cpu){ +#endif + tmp = cblas_dnrm2(nn, v, 1); + if(norm) *norm = tmp; + #ifdef DEBUG + if(!tmp) laerror("attempt to normalize zero vector"); + #endif + copyonwrite(); + tmp = 1.0 / tmp; + cblas_dscal(nn, tmp, v, 1); +#ifdef CUDALA + }else{ + tmp = cublasDnrm2(nn, v, 1); + TEST_CUBLAS("cublasDnrm2"); + + if(norm) *norm = tmp; + #ifdef DEBUG + if(!tmp) laerror("attempt to normalize zero vector"); + #endif + copyonwrite(); + tmp = 1.0 / tmp; + cublasDscal(nn, tmp, v, 1); + TEST_CUBLAS("cublasDscal"); + } +#endif + return *this; +} + +/***************************************************************************//** + * normalize current complex vector (in the Euclidean norm) + * @param[in] norm if not NULL, the norm of this vector is stored into *norm + * @return reference to the modified vector + ******************************************************************************/ +template<> +NRVec > & NRVec >::normalize(double *norm){ + double tmp(0.0); +#ifdef CUDALA + if(location == cpu){ +#endif + tmp = cblas_dznrm2(nn, v, 1); + if(norm) *norm = tmp; + #ifdef DEBUG + if(tmp == 0.0) laerror("attempt to normalize zero vector"); + #endif + copyonwrite(); + tmp = 1.0 / tmp; + cblas_zdscal(nn, tmp, v, 1); +#ifdef CUDALA + }else{ + tmp = cublasDznrm2(nn, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasDznrm2"); + + if(norm) *norm = tmp; + #ifdef DEBUG + if(tmp == 0.0) laerror("attempt to normalize zero vector"); + #endif + copyonwrite(); + + tmp = 1.0 / tmp; + cublasZdscal(nn, tmp, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZdscal"); + } +#endif + return *this; +} + +/***************************************************************************//** + * perform the \b gemv operation on this real vector \f$\vec{y}\f$, i.e. 
+ * \f[\vec{y}\leftarrow \alpha\operatorname{op}(A)\cdot\vec{x}+\beta\vec{y}\f]
+ * @param[in] beta real parameter \f$\beta\f$
+ * @param[in] A real matrix \f$A\f$
+ * @param[in] trans if trans == 'n' use \f$A\f$ directly, otherwise \f$\operatorname{op}(A)\equiv{}A^\mathrm{T}\f$
+ * @param[in] alpha real parameter \f$\alpha\f$
+ * @param[in] x real vector \f$\vec{x}\f$
+ * @note in DEBUG builds the sizes of both \f$\vec{x}\f$ and \f$\vec{y}\f$ are verified
+ * @see NRMat::gemm
+ ******************************************************************************/
+template<>
+void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
+		const char trans, const double alpha, const NRVec<double> &x) {
+#ifdef DEBUG
+	if((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+	// the destination must match the row count of op(A), otherwise BLAS writes out of bounds
+	if((trans == 'n'?A.nrows():A.ncols()) != nn){ laerror("incompatible vectors"); }
+#endif
+	SAME_LOC3(*this, x, A);
+	copyonwrite();
+
+#ifdef CUDALA
+	if(location==cpu){
+#endif
+		cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+#ifdef CUDALA
+	}else{
+		// cuBLAS is column-major: the row-major A is seen as its transpose,
+		// hence the inverted transposition flag and the swapped dimensions
+		cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+		TEST_CUBLAS("cublasDgemv");
+	}
+#endif
+}
+
+/***************************************************************************//**
+ * perform the \b gemv operation on this complex vector \f$\vec{y}\f$, i.e.
+ * \f[\vec{y}\leftarrow \alpha\operatorname{op}(A)\cdot\vec{x}+\beta\vec{y}\f]
+ * @param[in] beta real parameter \f$\beta\f$
+ * @param[in] A real matrix \f$A\f$
+ * @param[in] trans if trans == 'n' use \f$A\f$ directly, otherwise \f$\operatorname{op}(A)\equiv{}A^\mathrm{T}\f$
+ * @param[in] alpha real parameter \f$\alpha\f$
+ * @param[in] x complex vector \f$\vec{x}\f$
+ * @note real and imaginary parts are processed by two real \b gemv calls with stride 2
+ * @note in DEBUG builds the sizes of both \f$\vec{x}\f$ and \f$\vec{y}\f$ are verified
+ * @see gemm
+ ******************************************************************************/
+template<>
+void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
+		const char trans, const double alpha, const NRVec<complex<double> > &x) {
+#ifdef DEBUG
+	if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+	// the destination must match the row count of op(A), otherwise BLAS writes out of bounds
+	if ((trans == 'n'?A.nrows():A.ncols()) != nn){ laerror("incompatible vectors"); }
+#endif
+	SAME_LOC3(*this, x, A);
+	copyonwrite();
+
+#ifdef CUDALA
+	if(location==cpu){
+#endif
+		// real parts: every second double starting at offset 0
+		cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+			A.nrows(), A.ncols(), alpha, A, A.ncols(), (double *)x.v, 2, beta, (double *)v, 2);
+		// imaginary parts: every second double starting at offset 1
+		cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+			A.nrows(), A.ncols(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
+#ifdef CUDALA
+	}else{
+		// cuBLAS is column-major: inverted transposition flag, swapped dimensions
+		cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
+		TEST_CUBLAS("cublasDgemv");
+
+		cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
+		TEST_CUBLAS("cublasDgemv");
+	}
+#endif
+}
+
+
+/***************************************************************************//**
+ * perform the \b gemv operation on this complex vector \f$\vec{y}\f$, i.e.
+ * \f[\vec{y}\leftarrow \alpha\operatorname{op}(A)\cdot\vec{x}+\beta\vec{y}\f]
+ * @param[in] beta complex parameter \f$\beta\f$
+ * @param[in] A complex matrix \f$A\f$
+ * @param[in] trans if trans == 'n' use \f$A\f$ directly, otherwise \f$\operatorname{op}(A)\equiv{}A^\mathrm{T}\f$
+ * @param[in] alpha complex parameter \f$\alpha\f$
+ * @param[in] x complex vector \f$\vec{x}\f$
+ * @note in DEBUG builds the sizes of both \f$\vec{x}\f$ and \f$\vec{y}\f$ are verified
+ * @see gemm
+ ******************************************************************************/
+template<>
+void NRVec<complex<double> >::gemv(const complex<double> beta,
+		const NRMat<complex<double> > &A, const char trans,
+		const complex<double> alpha, const NRVec<complex<double> > &x) {
+#ifdef DEBUG
+	if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+	// the destination must match the row count of op(A), otherwise BLAS writes out of bounds
+	if ((trans == 'n'?A.nrows():A.ncols()) != nn){ laerror("incompatible vectors"); }
+#endif
+	SAME_LOC3(*this, x, A);
+	copyonwrite();
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+			A.nrows(), A.ncols(), &alpha, A, A.ncols(), x.v, 1, &beta, v, 1);
+#ifdef CUDALA
+	}else{
+		// cuBLAS takes the complex scalars by value in its own type
+		const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
+		const cuDoubleComplex _beta = make_cuDoubleComplex(beta.real(), beta.imag());
+
+		// cuBLAS is column-major: inverted transposition flag, swapped dimensions
+		cublasZgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(),
+			_alpha, (cuDoubleComplex*)(A[0]), A.ncols(), (cuDoubleComplex*)(x.v), 1, _beta, (cuDoubleComplex*)v, 1);
+		TEST_CUBLAS("cublasZgemv");
+	}
+#endif
+}
+
+/***************************************************************************//**
+ * perform the \b gemv operation on this real vector \f$\vec{y}\f$, i.e.
+ * \f[\vec{y}\leftarrow \alpha A\cdot\vec{x}+\beta\vec{y}\f]
+ * @param[in] beta real parameter \f$\beta\f$
+ * @param[in] A real symmetric matrix \f$A\f$ stored in packed form
+ * @param[in] trans not used (the matrix is symmetric)
+ * @param[in] alpha real parameter \f$\alpha\f$
+ * @param[in] x real vector \f$\vec{x}\f$
+ * @note in DEBUG builds the sizes of both \f$\vec{x}\f$ and \f$\vec{y}\f$ are verified
+ * @see gemm, NRSMat
+ ******************************************************************************/
+template<>
+void NRVec<double>::gemv(const double beta, const NRSMat<double> &A,
+		const char trans, const double alpha, const NRVec<double> &x) {
+#ifdef DEBUG
+	if(A.ncols() != x.size()){ laerror("incompatible dimensions"); }
+	// the destination is written with A.ncols() elements as well
+	if(A.ncols() != nn){ laerror("incompatible dimensions"); }
+#endif
+	SAME_LOC3(*this, A, x);
+	copyonwrite();
+
+#ifdef CUDALA
+	if(location==cpu){
+#endif
+		cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, x.v, 1, beta, v, 1);
+#ifdef CUDALA
+	}else{
+		// row-major lower packed storage coincides with column-major upper packed storage
+		cublasDspmv('U', A.ncols(), alpha, A, x.v, 1, beta, v, 1);
+		TEST_CUBLAS("cublasDspmv");
+	}
+#endif
+}
+
+
+/***************************************************************************//**
+ * perform the \c gemv operation on this complex vector \f$\vec{y}\f$, i.e.
+ * \f[\vec{y}\leftarrow \alpha A\cdot\vec{x}+\beta\vec{y}\f]
+ * @param[in] beta real parameter \f$\beta\f$
+ * @param[in] A real symmetric matrix \f$A\f$ stored in packed form
+ * @param[in] trans not used (the matrix is symmetric)
+ * @param[in] alpha real parameter \f$\alpha\f$
+ * @param[in] x complex vector \f$\vec{x}\f$
+ * @note real and imaginary parts are processed by two real \b spmv calls with stride 2
+ * @note in DEBUG builds the sizes of both \f$\vec{x}\f$ and \f$\vec{y}\f$ are verified
+ * @see gemm, NRSMat
+ ******************************************************************************/
+template<>
+void NRVec<complex<double> >::gemv(const double beta, const NRSMat<double> &A,
+		const char trans, const double alpha, const NRVec<complex<double> > &x) {
+#ifdef DEBUG
+	if(A.ncols() != x.size()){ laerror("incompatible dimensions"); }
+	// the destination is written with A.ncols() elements as well
+	if(A.ncols() != nn){ laerror("incompatible dimensions"); }
+#endif
+	SAME_LOC3(*this, A, x);
+	copyonwrite();
+
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		// real parts: every second double starting at offset 0
+		cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, (double *)x.v, 2, beta, (double *)v, 2);
+		// imaginary parts: every second double starting at offset 1
+		cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, ((double *)x.v) + 1, 2, beta, ((double *)v) + 1, 2);
+#ifdef CUDALA
+	}else{
+		// row-major lower packed storage coincides with column-major upper packed storage
+		cublasDspmv('U', A.ncols(), alpha, A, (double*)(x.v), 2, beta, (double*)v, 2);
+		TEST_CUBLAS("cublasDspmv");
+
+		cublasDspmv('U', A.ncols(), alpha, A, ((double*)(x.v)) + 1, 2, beta, ((double*)v) + 1, 2);
+		TEST_CUBLAS("cublasDspmv");
+	}
+#endif
+}
+
+/***************************************************************************//**
+ * perform the \b gemv operation on this complex vector \f$\vec{y}\f$, i.e.
+ * \f[\vec{y}\leftarrow \alpha A\cdot\vec{x}+\beta\vec{y}\f]
+ * @param[in] beta complex parameter \f$\beta\f$
+ * @param[in] A complex Hermitian matrix \f$A\f$ stored in packed form
+ * @param[in] trans not used
+ * @param[in] alpha complex parameter \f$\alpha\f$
+ * @param[in] x complex vector \f$\vec{x}\f$
+ * @note in DEBUG builds the sizes of both \f$\vec{x}\f$ and \f$\vec{y}\f$ are verified
+ * @see gemm, NRSMat
+ ******************************************************************************/
+template<>
+void NRVec<complex<double> >::gemv(const complex<double> beta,
+		const NRSMat<complex<double> > &A, const char trans,
+		const complex<double> alpha, const NRVec<complex<double> > &x){
+#ifdef DEBUG
+	if(A.ncols() != x.size()) laerror("incompatible dimensions");
+	// the destination is written with A.ncols() elements as well
+	if(A.ncols() != nn) laerror("incompatible dimensions");
+#endif
+	SAME_LOC3(*this, A, x);
+	copyonwrite();
+
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zhpmv(CblasRowMajor, CblasLower, A.ncols(), &alpha, A, x.v, 1, &beta, v, 1);
+#ifdef CUDALA
+	}else{
+		// cuBLAS takes the complex scalars by value in its own type
+		const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
+		const cuDoubleComplex _beta = make_cuDoubleComplex(beta.real(), beta.imag());
+
+		// NOTE(review): row-major lower packed data read as column-major 'U'
+		// presumably represents conj(A) for a Hermitian matrix with non-zero
+		// imaginary parts, so this branch may differ from the CPU result -
+		// TODO confirm against cblas_zhpmv on a complex test matrix
+		cublasZhpmv('U', A.ncols(), _alpha, (cuDoubleComplex*)((const complex<double>*)A), (cuDoubleComplex*)(x.v), 1, _beta, (cuDoubleComplex*)(this->v), 1);
+		TEST_CUBLAS("cublasZhpmv");
+	}
+#endif
+}
+
+/***************************************************************************//**
+ * computes the outer product of this real vector \f$\vec{a}\f$ with given
+ * real vector \f$\vec{b}\f$ and scales the resulting matrix with factor \f$\alpha\f$, i.e.
+ * the matrix elements of the final matrix \f$A\f$ can be expressed as
+ * \f[A_{i,j} = \alpha\cdot\vec{a}_i\vec{b}_j\f]
+ * @param[in] b real vector \f$\vec{b}\f$
+ * @param[in] conj not used
+ * @param[in] scale real factor \f$\alpha\f$
+ * @return matrix with size() rows and b.size() columns placed at the location of this vector
+ ******************************************************************************/
+template<>
+const NRMat<double> NRVec<double>::otimes(const NRVec<double> &b,const bool conj, const double &scale) const {
+
+	SAME_LOC(*this, b);
+	// start from a zero matrix, the rank-1 update below is accumulated into it
+	NRMat<double> result(0.0, nn, b.nn, this->getlocation());
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_dger(CblasRowMajor, nn, b.nn, scale, v, 1, b.v, 1, result, b.nn);
+#ifdef CUDALA
+	}else{
+		// cuBLAS is column-major: the roles of both vectors and dimensions are swapped
+		cublasDger(b.nn, nn, scale, b.v, 1, v, 1, result[0], b.nn);
+		TEST_CUBLAS("cublasDger");
+	}
+#endif
+	return result;
+}
+
+/***************************************************************************//**
+ * computes the outer product of this complex vector \f$\vec{a}\f$ with given
+ * complex vector \f$\vec{b}\f$ and scales the resulting matrix with factor \f$\alpha\f$, i.e.
+ * the matrix elements of the final matrix \f$A\f$ can be expressed as
+ * \f[A_{i,j} = \alpha\cdot\vec{a}_i\vec{b}_j\f]
+ * in case conj = true, the result is
+ * \f[A_{i,j} = \alpha\cdot\vec{a}_i\vec{b}_j^{*}\f]
+ * @param[in] b complex vector \f$\vec{b}\f$
+ * @param[in] conj determines whether the vector \f$\vec{b}\f$ is conjugated
+ * @param[in] scale complex scaling factor \f$\alpha\f$
+ * @return matrix with size() rows and b.size() columns placed at the location of this vector
+ ******************************************************************************/
+template<>
+const NRMat<complex<double> >
+NRVec<complex<double> >::otimes(const NRVec<complex<double> > &b, const bool conj, const complex<double> &scale) const {
+
+	SAME_LOC(*this, b);
+	// start from a zero matrix, the rank-1 update below is accumulated into it
+	NRMat<complex<double> > result(0., nn, b.nn, this->getlocation());
+
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		if(conj){
+			cblas_zgerc(CblasRowMajor, nn, b.nn, &scale, v, 1, b.v, 1, result, b.nn);
+		}else{
+			cblas_zgeru(CblasRowMajor, nn, b.nn, &scale, v, 1, b.v, 1, result, b.nn);
+		}
+#ifdef CUDALA
+	}else{
+		// cuBLAS is column-major: the roles of both vectors and dimensions are
+		// swapped and the leading dimension equals the row length b.nn
+		// (it was 1 before, which is invalid whenever b.nn > 1)
+		if(conj){
+			// the swapped call conjugates this vector instead of b, therefore the
+			// update is done with conj(scale) and the whole result is conjugated
+			const cuDoubleComplex alpha = make_cuDoubleComplex(scale.real(), -scale.imag());
+
+			cublasZgerc(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), b.nn);
+			TEST_CUBLAS("cublasZgerc");
+
+			result.conjugateme();
+		}else{
+			const cuDoubleComplex alpha = make_cuDoubleComplex(scale.real(), +scale.imag());
+
+			cublasZgeru(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), b.nn);
+			TEST_CUBLAS("cublasZgeru");
+		}
+	}
+#endif
+	return result;
+}
+
+template <typename T>
+int NRVec<T>::sort(int direction, int from, int to, int *perm) {
+	NOT_GPU(*this);
+
+	copyonwrite();
+	// to == -1 selects the last element of the vector
+	if(to == -1) to = nn - 1;
+	if(direction) return memqsort<1, NRVec<T>, int, int>(*this, perm, from, to);
+	else return memqsort<0, NRVec<T>, int, int>(*this, perm, from, to);
+}
+
+template<>
+NRVec<complex<double> > complexify(const NRVec<double> &rhs) {
+	NRVec<complex<double> > r(rhs.size(), rhs.getlocation());
+
+#ifdef CUDALA
+	if(rhs.getlocation() == cpu){
+#endif
+		cblas_dcopy(rhs.size(), &rhs[0], 1, (double *)(&r[0]), 2);
+#ifdef CUDALA
+	}else{
+		r = 0;
+ cublasDcopy(rhs.size(), rhs.v, 1, (double*)(r.v), 2); + TEST_CUBLAS("cublasDcopy"); + } +#endif + return r; +} + +/***************************************************************************//** + * forced instantization in the corespoding object file + ******************************************************************************/ #define INSTANTIZE(T) \ template void NRVec::put(int fd, bool dim, bool transp) const; \ template void NRVec::get(int fd, bool dim, bool transp); \ - - INSTANTIZE(double) INSTANTIZE(complex) INSTANTIZE(char) @@ -57,422 +856,6 @@ INSTANTIZE(unsigned int) INSTANTIZE(unsigned long) INSTANTIZE(unsigned long long) - - - -/* - * Templates first, specializations for BLAS next - */ - -// conversion ctor -#ifndef MATPTR -template -NRVec::NRVec(const NRMat &rhs) -{ - nn = rhs.nn*rhs.mm; - v = rhs.v; - count = rhs.count; - (*count)++; -} -#endif - - - - - -//raw I/O -template -void NRVec::put(int fd, bool dim, bool transp) const -{ -#ifdef CUDALA -if(location!=cpu) - { - NRVec tmp= *this; - tmp.moveto(cpu); - tmp.put(fd,dim,transp); - return; - } -#endif - -errno=0; -int pad=1; //align at least 8-byte -if(dim) -{ -if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write"); -if(sizeof(int) != write(fd,&pad,sizeof(int))) laerror("cannot write"); -} -LA_traits::multiput(nn,fd,v,dim); -} - - -template -void NRVec::get(int fd, bool dim, bool transp) -{ -#ifdef CUDALA -if(location!=cpu) - { - NRVec tmp; - tmp.moveto(cpu); - tmp.get(fd,dim,transp); - tmp.moveto(location); - *this = tmp; - return; - } -#endif -int nn0[2]; //align at least 8-byte -errno=0; -if(dim) -{ -if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read"); -resize(nn0[0]); -} -else -copyonwrite(); -LA_traits::multiget(nn,fd,v,dim); -} - - - - -// formatted print for NRVec -template -void NRVec::fprintf(FILE *file, const char *format, const int modulo) const -{ - lawritemat(file, v, 1, nn, format, 1, modulo, 0); -} - -// formatted scan for NRVec -template -void 
NRVec::fscanf(FILE *f, const char *format) -{ - int n; - - if(::fscanf(f, "%d", &n) != 1) laerror("cannot read vector dimension"); - resize(n); - for (int i=0; i -const NRVec NRVec::operator-() const -{ - NRVec result(nn); - for (int i=0; i -const bool NRVec::operator>(const NRVec &rhs) const -{ -int n=nn; if(rhs.nn::bigger(v[i],rhs.v[i])) return true; - if(LA_traits::smaller(v[i],rhs.v[i])) return false; - } -return nn>rhs.nn; -} - -template -const bool NRVec::operator<(const NRVec &rhs) const -{ -int n=nn; if(rhs.nn::smaller(v[i],rhs.v[i])) return true; - if(LA_traits::bigger(v[i],rhs.v[i])) return false; - } -return nn -void NRVec::randomize(const double &x) -{ -for(int i=0; i -void NRVec >::randomize(const double &x) -{ -for(int i=0; i (x*(2.*random()/(1.+RAND_MAX) -1.),x*(2.*random()/(1.+RAND_MAX) -1.)); -} - - - -//complex from real constructor -template<> -NRVec >::NRVec(const NRVec &rhs, bool imagpart) -: nn(rhs.size()), v(new complex[rhs.size()]), count(new int(1)) -{ -memset(v,0,nn*sizeof(complex)); -cblas_dcopy(nn,&rhs[0],1,((double *)v) + (imagpart?1:0),2); -} - - -// axpy call for T = double (not strided) -template<> -void NRVec::axpy(const double alpha, const NRVec &x) -{ -#ifdef DEBUG - if (nn != x.nn) laerror("axpy of incompatible vectors"); -#endif - copyonwrite(); - cblas_daxpy(nn, alpha, x.v, 1, v, 1); -} - -// axpy call for T = complex (not strided) -template<> -void NRVec< complex >::axpy(const complex alpha, - const NRVec< complex > &x) -{ -#ifdef DEBUG - if (nn != x.nn) laerror("axpy of incompatible vectors"); -#endif - copyonwrite(); - cblas_zaxpy(nn, &alpha, x.v, 1, v, 1); -} - -// axpy call for T = double (strided) -template<> -void NRVec::axpy(const double alpha, const double *x, const int stride) -{ - copyonwrite(); - cblas_daxpy(nn, alpha, x, stride, v, 1); -} - -// axpy call for T = complex (strided) -template<> -void NRVec< complex >::axpy(const complex alpha, - const complex *x, const int stride) -{ - copyonwrite(); - cblas_zaxpy(nn, 
&alpha, x, stride, v, 1); -} - -// unary minus -template<> -const NRVec NRVec::operator-() const -{ - NRVec result(*this); - result.copyonwrite(); - cblas_dscal(nn, -1.0, result.v, 1); - return result; -} - -template<> -const NRVec< complex > -NRVec< complex >::operator-() const -{ - NRVec< complex > result(*this); - result.copyonwrite(); - cblas_zdscal(nn, -1.0, result.v, 1); - return result; -} - -// assignment of scalar to every element -template -NRVec & NRVec::operator=(const T &a) -{ - copyonwrite(); - if(a != (T)0) - for (int i=0; i -template<> -NRVec & NRVec::normalize() -{ - double tmp; - - tmp = cblas_dnrm2(nn, v, 1); -#ifdef DEBUG - if(!tmp) laerror("normalization of zero vector"); -#endif - copyonwrite(); - tmp = 1.0/tmp; - cblas_dscal(nn, tmp, v, 1); - return *this; -} - -// Normalization of NRVec< complex > -template<> -NRVec< complex > & NRVec< complex >::normalize() -{ - complex tmp; - tmp = cblas_dznrm2(nn, v, 1); -#ifdef DEBUG - if(!(tmp.real()) && !(tmp.imag())) laerror("normalization of zero vector"); -#endif - copyonwrite(); - tmp = 1.0/tmp; - cblas_zscal(nn, &tmp, v, 1); - return *this; -} - -//stubs for linkage - -#define INSTANTIZE_DUMMY(T) \ -template<> void NRVec::gemv(const T beta, const NRMat &a, const char trans, const T alpha, const NRVec &x) { laerror("gemv on unsupported types"); } \ -template<> void NRVec::gemv(const T beta, const NRSMat &a, const char trans, const T alpha, const NRVec &x) { laerror("gemv on unsupported types"); } \ -template<> void NRVec::gemv(const T beta, const SparseMat &a, const char trans, const T alpha, const NRVec &x, bool s) { laerror("gemv on unsupported types"); } \ -template<> void NRVec::gemv(const LA_traits_complex::Component_type beta, const LA_traits_complex::NRMat_Noncomplex_type &a, const char trans, const LA_traits_complex::Component_type alpha, const NRVec &x) { laerror("gemv on unsupported types"); } \ -template<> void NRVec::gemv(const LA_traits_complex::Component_type beta, const 
LA_traits_complex::NRSMat_Noncomplex_type &a, const char trans, const LA_traits_complex::Component_type alpha, const NRVec &x) { laerror("gemv on unsupported types"); } \ -template<> NRVec & NRVec::normalize() {laerror("normalize() impossible for integer types"); return *this;} \ -template<> const NRMat NRVec::otimes(const NRVec &b,const bool conj, const T &scale) const {laerror("otimes presently implemented only for double and complex double"); return NRMat ();} - - - - -// gemv calls -template<> -void NRVec::gemv(const double beta, const NRMat &A, - const char trans, const double alpha, const NRVec &x) -{ -#ifdef DEBUG - if ((trans == 'n'?A.ncols():A.nrows()) != x.size()) - laerror("incompatible sizes in gemv A*x"); -#endif - SAME_LOC3(*this,x,A); - copyonwrite(); -#ifdef CUDALA - if(location==cpu) -#endif - cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1); -#ifdef CUDALA - else - cublasDgemv((trans=='n' ?'T':'N'),A.ncols(), A.nrows(),alpha, A, A.ncols(), x.v, 1, beta, v, 1); -#endif -} - - -template<> -void NRVec >::gemv(const double beta, const NRMat &A, - const char trans, const double alpha, const NRVec > &x) -{ -#ifdef DEBUG - if ((trans == 'n'?A.ncols():A.nrows()) != x.size()) - laerror("incompatible sizes in gemv A*x"); -#endif - copyonwrite(); - cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), (double *)x.v, 2, beta, (double *)v, 2); - cblas_dgemv(CblasRowMajor, (trans=='n' ? 
CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2); -} - - -template<> -void NRVec< complex >::gemv(const complex beta, - const NRMat< complex > &A, const char trans, - const complex alpha, const NRVec &x) -{ -#ifdef DEBUG - if ((trans == 'n'?A.ncols():A.nrows()) != x.size()) - laerror("incompatible sizes in gemv A*x"); -#endif - copyonwrite(); - cblas_zgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), - A.nrows(), A.ncols(), &alpha, A, A.ncols(), - x.v, 1, &beta, v, 1); -} - - -template<> -void NRVec::gemv(const double beta, const NRSMat &A, - const char trans, const double alpha, const NRVec &x) -{ -#ifdef DEBUG - if (A.ncols()!=x.size()) laerror("incompatible dimension in gemv A*x"); -#endif -SAME_LOC3(*this,A,x); - copyonwrite(); -#ifdef CUDALA -if(location==cpu) -#endif - cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, x.v, 1, beta, v, 1); -#ifdef CUDALA -else - cublasDspmv('U',A.ncols(), alpha, A, x.v, 1, beta, v, 1); -#endif -} - -template<> -void NRVec >::gemv(const double beta, const NRSMat &A, - const char trans, const double alpha, const NRVec > &x) -{ -#ifdef DEBUG - if (A.ncols()!=x.size()) laerror("incompatible dimension in gemv A*x"); -#endif - copyonwrite(); - cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, (double *)x.v, 2, beta, (double *)v, 2); - cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, ((double *)x.v)+1, 2, beta, ((double *)v)+1, 2); -} - - - -template<> -void NRVec< complex >::gemv(const complex beta, - const NRSMat< complex > &A, const char trans, - const complex alpha, const NRVec &x) -{ -#ifdef DEBUG - if (A.ncols()!=x.size()) laerror("incompatible dimension in gemv"); -#endif - copyonwrite(); - cblas_zhpmv(CblasRowMajor, CblasLower, A.ncols(), &alpha, A, - x.v, 1, &beta, v, 1); -} - - - - - -// Direct product Mat = Vec | Vec -template<> -const NRMat NRVec::otimes(const NRVec &b,const bool conj, const double &scale) const 
-{ - NRMat result(0.,nn,b.nn); - cblas_dger(CblasRowMajor, nn, b.nn, scale, v, 1, b.v, 1, result, b.nn); - return result; -} - -template<> -const NRMat< complex > -NRVec >::otimes(const NRVec< complex > &b, const bool conj, const complex &scale) const -{ - NRMat< complex > result(0.,nn,b.nn); - if(conj) cblas_zgerc(CblasRowMajor, nn, b.nn, &scale, v, 1, b.v, 1, result, b.nn); - else cblas_zgeru(CblasRowMajor, nn, b.nn, &scale, v, 1, b.v, 1, result, b.nn); - return result; -} - - -template -int NRVec::sort(int direction, int from, int to, int *perm) -{ -NOT_GPU(*this); -copyonwrite(); -if(to == -1) to=nn-1; -if(direction) return memqsort<1,NRVec,int,int>(*this,perm,from,to); -else return memqsort<0,NRVec,int,int>(*this,perm,from,to); -} - - - - - - -////////////////////////////////////////////////////////////////////////////// -//// forced instantization in the corespoding object file - - template class NRVec; template class NRVec >; template class NRVec; @@ -486,6 +869,15 @@ template class NRVec; template class NRVec; template class NRVec; +#define INSTANTIZE_DUMMY(T) \ +template<> void NRVec::gemv(const T beta, const NRMat &a, const char trans, const T alpha, const NRVec &x) { laerror("gemv on unsupported types"); } \ +template<> void NRVec::gemv(const T beta, const NRSMat &a, const char trans, const T alpha, const NRVec &x) { laerror("gemv on unsupported types"); } \ +template<> void NRVec::gemv(const T beta, const SparseMat &a, const char trans, const T alpha, const NRVec &x, bool s) { laerror("gemv on unsupported types"); } \ +template<> void NRVec::gemv(const LA_traits_complex::Component_type beta, const LA_traits_complex::NRMat_Noncomplex_type &a, const char trans, const LA_traits_complex::Component_type alpha, const NRVec &x) { laerror("gemv on unsupported types"); } \ +template<> void NRVec::gemv(const LA_traits_complex::Component_type beta, const LA_traits_complex::NRSMat_Noncomplex_type &a, const char trans, const LA_traits_complex::Component_type alpha, 
const NRVec &x) { laerror("gemv on unsupported types"); } \ +template<> NRVec & NRVec::normalize(LA_traits::normtype *) {laerror("normalize() impossible for integer types"); return *this;} \ +template<> const NRMat NRVec::otimes(const NRVec &b,const bool conj, const T &scale) const {laerror("otimes presently implemented only for double and complex double"); return NRMat ();} + INSTANTIZE_DUMMY(char) INSTANTIZE_DUMMY(short) @@ -507,7 +899,6 @@ INSTANTIZE_DUMMY(complex) INSTANTIZE_DUMMY(complex) INSTANTIZE_DUMMY(complex) INSTANTIZE_DUMMY(complex) - INSTANTIZE_DUMMY(complex >) INSTANTIZE_DUMMY(complex >) diff --git a/vec.h b/vec.h index e959069..e488f64 100644 --- a/vec.h +++ b/vec.h @@ -1,4 +1,5 @@ -/* +/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */ +/******************************************************************************* LA: linear algebra C++ interface library Copyright (C) 2008 Jiri Pittner or complex versions written by Roman Curik @@ -15,7 +16,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . 
-*/ +*******************************************************************************/ #ifndef _LA_VEC_H_ #define _LA_VEC_H_ @@ -23,19 +24,24 @@ namespace LA { -////////////////////////////////////////////////////////////////////////////// -// Forward declarations -template void lawritemat(FILE *file,const T *a,int r,int c, - const char *form0,int nodim,int modulo, int issym); +/***************************************************************************//** + * forward declarations + ******************************************************************************/ +template void lawritemat(FILE *file, const T *a, int r, int c, + const char *form0, int nodim, int modulo, int issym); -// Memory allocated constants for cblas routines +/***************************************************************************//** + * static constants used in several cblas-routines + ******************************************************************************/ const static complex CONE = 1.0, CMONE = -1.0, CZERO = 0.0; #ifdef CUDALA const static cuDoubleComplex CUONE = {1.,0.}, CUMONE = {-1.,0.}, CUZERO = {0.,0.}; #endif -// Macros to construct binary operators +,-,*, from +=, -=, *= -// for 3 cases: X + a, a + X, X + Y +/***************************************************************************//** + * macros to construct binary operators +,-,*, from +=, -=, *= + * for 3 cases: X + a, a + X, X + Y + ******************************************************************************/ #define NRVECMAT_OPER(E,X) \ template \ inline const NR##E NR##E::operator X(const T &a) const \ @@ -51,125 +57,296 @@ inline const NR##E NR##E::operator X(const NR##E &a) const \ { return NR##E(*this) X##= a; } -// NRVec class +/***************************************************************************//** + * \brief NRVec class template implementing the vector interface + * @see NRMat, NRSMat + ******************************************************************************/ template class NRVec { protected: 
- int nn; - T *v; - int *count; + int nn;//!< size of the vector + T *v;//!< pointer to the underlying data structure + int *count;//!< pointer to the reference-counter #ifdef CUDALA - GPUID location; + GPUID location;//!< determines the memory address space of this object (CPU/GPU) #endif public: friend class NRSMat; friend class NRMat; + template friend NRVec > complexify(const NRVec&); - inline NRVec(): nn(0),v(0),count(0) - { -#ifdef CUDALA - location = DEFAULT_LOC; -#endif - }; - explicit inline NRVec(const int n, const GPUID loc= undefined) : nn(n), count(new int(1)) - { -#ifdef CUDALA - if(loc==undefined) location = DEFAULT_LOC; else location = loc; - if(location==cpu) -#endif - v= new T[n]; -#ifdef CUDALA - else - v= (T*) gpualloc(n*sizeof(T)); -#endif - }; + typedef T ROWTYPE; + //! standard destructor + ~NRVec(); + + /***************************************************************************//** + * inlined constructor creating zero vector of general type T + ******************************************************************************/ + inline NRVec(): nn(0), v(0), count(0) { + #ifdef CUDALA + location = DEFAULT_LOC; + #endif + }; + + /***************************************************************************//** + * Explicit inlined constructor creating vector of given size and location. + * Because of performance reasons, no incialization is done. + * @param[in] n vector size (count of elements) + * @param[in] loc location of the underlying data (CPU/GPU) + ******************************************************************************/ + explicit inline NRVec(const int n, const GPUID loc = undefined): nn(n), count(new int(1)) { + #ifdef CUDALA + location = (loc == undefined)?DEFAULT_LOC:loc; + if(location == cpu){ + #endif + v = new T[n]; + #ifdef CUDALA + }else{ + v = (T*) gpualloc(n*sizeof(T)); + } + #endif + }; + + //! inlined constructor creating vector of given size filled with prescribed value inline NRVec(const T &a, const int n); + + //! 
inlined constructor creating vector of given size filled with data located at given memory location inline NRVec(const T *a, const int n); + + //! inlined constructor creating vector of given size filled with data located at given memory location inline NRVec(T *a, const int n, bool skeleton); + + //! inlined copy constructor inline NRVec(const NRVec &rhs); - NRVec(const typename LA_traits_complex::NRVec_Noncomplex_type &rhs, bool imagpart=false); //construct complex from real + + //! complexifying constructor + NRVec(const typename LA_traits_complex::NRVec_Noncomplex_type &rhs, bool imagpart=false);//construct complex from real + + //! explicit inlined constructor converting symmetric matrix into a vector inline explicit NRVec(const NRSMat & S); -#ifdef MATPTR - explicit NRVec(const NRMat &rhs) : NRVec(&rhs[0][0],rhs.nrows()*rhs.ncols()) {}; -#else - explicit NRVec(const NRMat &rhs); -#endif -#ifdef CUDALA - inline GPUID getlocation() const {return location;} - void moveto(const GPUID dest); -#else - inline GPUID getlocation() const {return cpu;} - void moveto(const GPUID dest) {}; -#endif - NRVec & operator=(const NRVec &rhs); - NRVec & operator=(const T &a); //assign a to every element + + /***************************************************************************//** + + ******************************************************************************/ + #ifdef MATPTR + explicit NRVec(const NRMat &rhs): NRVec(&rhs[0][0], rhs.nrows()*rhs.ncols()) {}; + #else + explicit NRVec(const NRMat &rhs); + #endif + + /***************************************************************************//** + * routines for CUDA related stuff + * \li getlocation() gets the protected data member location + * \li moveto(const GPUID) moves underlying data between CPU/GPU memory + ******************************************************************************/ + #ifdef CUDALA + inline GPUID getlocation() const { return location; } + void moveto(const GPUID dest); + #else + inline GPUID 
getlocation() const { return cpu; } + void moveto(const GPUID dest) {}; + #endif + + //! create separate copy of the data corresponding to this vector + void copyonwrite(); + + //! purge this vector + void clear() { copyonwrite(); LA_traits::clear(v, nn); }; + + //! assignment operator assigns given vector + NRVec& operator=(const NRVec &rhs); + + //! assigment operator assigns given scalar to each element of this vector + NRVec& operator=(const T &a); + + //! fills in this vector with pseudo-random numbers generated using uniform distribution void randomize(const typename LA_traits::normtype &x); - NRVec & operator|=(const NRVec &rhs); - const bool operator!=(const NRVec &rhs) const {if(nn!=rhs.nn) return 1; return LA_traits::gencmp(v,rhs.v,nn);} //memcmp for scalars else elementwise + + //! perform deep-copy of given vector + NRVec& operator|=(const NRVec &rhs); + + //! relational operators + const bool operator!=(const NRVec &rhs) const {if(nn!=rhs.nn) return 1; return LA_traits::gencmp(v,rhs.v,nn);} const bool operator==(const NRVec &rhs) const {return !(*this != rhs);}; const bool operator>(const NRVec &rhs) const; const bool operator<(const NRVec &rhs) const; const bool operator>=(const NRVec &rhs) const {return !(*this < rhs);}; const bool operator<=(const NRVec &rhs) const {return !(*this > rhs);}; + + //! unary minus const NRVec operator-() const; - inline NRVec & operator+=(const NRVec &rhs); - inline NRVec & operator-=(const NRVec &rhs); - inline NRVec & operator*=(const NRVec &rhs); //elementwise - inline NRVec & operator/=(const NRVec &rhs); //elementwise - inline NRVec & operator+=(const T &a); - inline NRVec & operator-=(const T &a); - inline NRVec & operator*=(const T &a); - inline int getcount() const {return count?*count:0;} + + //! 
bunch of vector-vector arithmetic operators defined element-wise + inline NRVec& operator+=(const NRVec &rhs); + inline NRVec& operator-=(const NRVec &rhs); + inline NRVec& operator*=(const NRVec &rhs); + inline NRVec& operator/=(const NRVec &rhs); + inline const NRVec operator+(const NRVec &rhs) const; inline const NRVec operator-(const NRVec &rhs) const; + + //! bunch of scalar-vector arithmetic operators defined element-wise + inline NRVec& operator+=(const T &a); + inline NRVec& operator-=(const T &a); + inline NRVec& operator*=(const T &a); + inline const NRVec operator+(const T &a) const; inline const NRVec operator-(const T &a) const; inline const NRVec operator*(const T &a) const; - inline const T operator*(const NRVec &rhs) const; //scalar product -> dot - inline const T dot(const NRVec &rhs) const {return *this * rhs;}; //@@@for complex do conjugate + + + //! determine the actual value of the reference counter + inline int getcount() const {return count?*count:0;} + + //! compute the Euclidean inner product (with conjugation in complex case) + inline const T operator*(const NRVec &rhs) const; + inline const T dot(const NRVec &rhs) const {return *this * rhs;}; + + //! 
compute the Euclidean inner product (with conjugation in complex case) with a stride-vector + inline const T dot(const T *a, const int stride = 1) const; + void gemv(const T beta, const NRMat &a, const char trans, const T alpha, const NRVec &x); - void gemv(const T beta, const NRSMat &a, const char trans /*just for compatibility*/, const T alpha, const NRVec &x); - void gemv(const T beta, const SparseMat &a, const char trans, const T alpha, const NRVec &x,const bool treat_as_symmetric=false); - void gemv(const typename LA_traits_complex::Component_type beta, const typename LA_traits_complex::NRMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex::Component_type alpha, const NRVec &x); - void gemv(const typename LA_traits_complex::Component_type beta, const typename LA_traits_complex::NRSMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex::Component_type alpha, const NRVec &x); - const NRVec operator*(const NRMat &mat) const {NRVec result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;}; - const NRVec operator*(const NRSMat &mat) const {NRVec result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;}; - const NRVec operator*(const SparseMat &mat) const {NRVec result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;}; - const NRMat otimes(const NRVec &rhs, const bool conjugate=false, const T &scale=1) const; //outer product - inline const NRMat operator|(const NRVec &rhs) const {return otimes(rhs,true);}; - inline const T sum() const {T sum=0; for(int i=0; i &a, const char trans /**< just for compatibility reasons */, const T alpha, const NRVec &x); + void gemv(const T beta, const SparseMat &a, const char trans, const T alpha, const NRVec &x,const bool treat_as_symmetric = false); + + void gemv( const typename LA_traits_complex::Component_type beta, + const typename LA_traits_complex::NRMat_Noncomplex_type &a, + const char trans, + 
const typename LA_traits_complex::Component_type alpha, + const NRVec &x); + + void gemv( const typename LA_traits_complex::Component_type beta, + const typename LA_traits_complex::NRSMat_Noncomplex_type &a, + const char trans, + const typename LA_traits_complex::Component_type alpha, const NRVec &x); + + //! multiply given matrix with this vector from left + const NRVec operator*(const NRMat &mat) const { + SAME_LOC(*this, mat); + + NRVec result(mat.ncols(), mat.getlocation()); + result.gemv((T)0, mat, 't', (T)1, *this); + return result; + }; + + //! multiply given symmetric matrix in packed form with this vector from left + const NRVec operator*(const NRSMat &mat) const { + SAME_LOC(*this, mat); + + NRVec result(mat.ncols(), mat.getlocation()); + result.gemv((T)0, mat, 't', (T)1, *this); + return result; + }; + + //! multiply given sparse matrix with this vector from left + const NRVec operator*(const SparseMat &mat) const { + NOT_GPU(*this); + + NRVec result(mat.ncols()); + result.gemv((T)0, mat, 't', (T)1, *this); + return result; + }; + + //! compute the outer product of two vectors + const NRMat otimes(const NRVec &rhs, const bool conjugate = false, const T &scale = 1) const; + //! opeartor for outer product computation + inline const NRMat operator|(const NRVec &rhs) const { return otimes(rhs,true); }; + + //! compute the sum of the vector elements + inline const T sum() const { + T sum(0); + for(register int i=0; i::normtype asum() const; + + //! indexing operator - index running from zero inline T & operator[](const int i); inline const T & operator[](const int i) const; - typedef T ROWTYPE; - inline void setcoldim(int i) {}; //dummy + + //! dummy routine + inline void setcoldim(int i) {}; + + //! get the pointer to the underlying data structure + inline operator T*(); + //! get the constant pointer to the underlying data structure + inline operator const T*() const; + + //! 
add up a scalar multiple of a given vector + void axpy(const T alpha, const NRVec &x); + + //! add up a scalar multiple of a given vector with given stride + void axpy(const T alpha, const T *x, const int stride=1); + + //! determine the number of elements inline int size() const; - inline operator T*(); //get a pointer to the data - inline operator const T*() const; //get a pointer to the data - ~NRVec(); - void axpy(const T alpha, const NRVec &x); // this+= a*x - void axpy(const T alpha, const T *x, const int stride=1); // this+= a*x - void copyonwrite(); - void clear() {copyonwrite(); LA_traits::clear(v,nn);}; //zero out + + //! resize the current vector void resize(const int n); - void get(int fd, bool dimensions=1, bool transp=0); - void put(int fd, bool dimensions=1, bool transp=0) const; - NRVec & normalize(); + + //! determine the norm of this vector inline const typename LA_traits::normtype norm() const; - inline const T amax() const; + + //! normalize this vector and optionally save the norm + NRVec& normalize(typename LA_traits::normtype* norm = 0); + + //! get normalized copy of this vector inline const NRVec unitvector() const; + + //! determine the maximal element (in the absolute value) of this vector + inline const T amax() const; + //! determine the minimal element (in the absolute value) of this vector + inline const T amin() const; + + //! routine for formatted output void fprintf(FILE *f, const char *format, const int modulo) const; + //! routine for unformatted output + void put(int fd, bool dimensions=1, bool transp=0) const; + + //! 
routine for formatted input void fscanf(FILE *f, const char *format); -//sparse matrix concerning members - explicit NRVec(const SparseMat &rhs); // dense from sparse matrix with one of dimensions =1 - inline void simplify() {}; //just for compatibility with sparse ones - bool bigger(int i, int j) const {return LA_traits::bigger(v[i],v[j]);}; - bool smaller(int i, int j) const {return LA_traits::smaller(v[i],v[j]);}; - void swap(int i, int j) {T tmp; tmp=v[i]; v[i]=v[j]; v[j]=tmp;}; - int sort(int direction=0, int from=0, int to= -1, int *perm=NULL); //sort, ascending by default, returns parity of permutation - NRVec & CallOnMe(T (*_F)(const T &) ) {copyonwrite(); for(int i=0; i &rhs); + + //! routine for compatibility with sparse types + inline void simplify() {}; + + //! determine whether the ith element is bigger than the jth element + bool bigger(int i, int j) const { + NOT_GPU(*this); + return LA_traits::bigger(v[i], v[j]); + }; + + //! determine whether the ith element is bigger than the jth element + bool smaller(int i, int j) const { + NOT_GPU(*this); + return LA_traits::smaller(v[i], v[j]); + }; + + //! swap the ith and jth element + void swap(int i, int j) { + const T tmp(v[i]); + v[i] = v[j]; + v[j] = tmp; + }; + + //! sort by default in ascending order and return the parity of corresponding permutation resulting to this order + int sort(int direction = 0, int from = 0, int to = -1, int *perm = NULL); + + //! apply given function to each element + NRVec& call_on_me(T (*_F)(const T &) ){ + NOT_GPU(*this); + + copyonwrite(); + for(int i=0; i -std::ostream & operator<<(std::ostream &s, const NRVec &x) -{ +std::ostream & operator<<(std::ostream &s, const NRVec &x) { #ifdef CUDALA -if(x.getlocation()==cpu) - { + if(x.getlocation() == cpu){ #endif - int i, n; - n = x.size(); - s << n << std::endl; - for(i=0; i::IOtype)x[i] << (i == n-1 ? 
'\n' : ' '); - return s; -#ifdef CUDALA - } -else - { - NRVec tmp=x; - tmp.moveto(cpu); - return s< -std::istream & operator>>(std::istream &s, NRVec &x) -{ -#ifdef CUDALA -if(x.getlocation()==cpu) - { -#endif - int i,n; - s >> n; - x.resize(n); - typename LA_traits_io::IOtype tmp; - for(i=0; i> tmp; x[i]=tmp;} - return s; -#ifdef CUDALA - } -else - { - NRVec tmp; - tmp.moveto(cpu); - s >> tmp; - tmp.moveto(x.getlocation()); - x=tmp; - return s; - } -#endif -} - - -// INLINES - -// ctors -template -inline NRVec::NRVec(const T& a, const int n) : nn(n), count(new int) -{ - *count = 1; -#ifdef CUDALA - location=DEFAULT_LOC; - if(location==cpu) - { -#endif - v = new T[n]; - if(a != (T)0) - for(int i=0; i::IOtype)x[i] << (i == n-1 ? '\n' : ' '); + } + return s; #ifdef CUDALA + }else{ + NRVec tmp(x); + tmp.moveto(cpu); + return s << tmp; } - else - { - v= (T*) gpualloc(n*sizeof(T)); - cublasSetVector(n,sizeof(T),&a,0,v,1); +#endif +} + +/***************************************************************************//** + * input operator + * @param[in,out] s input stream + * @param[in] x vector of general type intended for input + * @return modified stream + ******************************************************************************/ +template +std::istream & operator>>(std::istream &s, NRVec &x) { +#ifdef CUDALA + if(x.getlocation() == cpu){ +#endif + int i,n; + s >> n; + x.resize(n); + typename LA_traits_io::IOtype tmp; + for(i=0; i> tmp; + x[i] = tmp; + } + return s; +#ifdef CUDALA + }else{ + NRVec tmp; + tmp.moveto(cpu); + s >> tmp; + tmp.moveto(x.getlocation()); + x = tmp; + return s; } #endif } +/***************************************************************************//** + * inline constructor creating vector of given size filled with prescribed value + * @param[in] a value to be assigned to all vector elements + * @param[in] n required vector size + ******************************************************************************/ template -inline 
NRVec::NRVec(const T *a, const int n) : nn(n), count(new int) -{ -#ifdef CUDALA -location=DEFAULT_LOC; - if(location==cpu) - { -#endif - v=new T[n]; +inline NRVec::NRVec(const T& a, const int n): nn(n), count(new int) { *count = 1; - memcpy(v, a, n*sizeof(T)); #ifdef CUDALA - } - else - { - v= (T*) gpualloc(n*sizeof(T)); - cublasSetVector(n,sizeof(T),a,1,v,1); - } + location = DEFAULT_LOC; + if(location == cpu){ +#endif + v = new T[n]; + if(a != (T)0){ + for(register int i=0; i -inline NRVec::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int) -{ - if(!skeleton) - { +inline NRVec::NRVec(const T *a, const int n): nn(n), count(new int) { #ifdef CUDALA -location=DEFAULT_LOC; - if(location==cpu) - { + location = DEFAULT_LOC; + if(location == cpu) { #endif - v=new T[n]; + v = new T[n]; *count = 1; memcpy(v, a, n*sizeof(T)); #ifdef CUDALA - } - else - { - v= (T*) gpualloc(n*sizeof(T)); - cublasSetVector(n,sizeof(T),a,1,v,1); - } + }else{ + v = (T*) gpualloc(n*sizeof(T)); + cublasSetVector(n, sizeof(T), a, 1, v, 1); + TEST_CUBLAS("cublasSetVector"); + } #endif - } - else - { -#ifdef CUDALA - if(location!=cpu) laerror("NRVec() with skeleton option cannot be on GPU"); -#endif - *count = 2; - v=a; - } + } +/***************************************************************************//** + * inline constructor creating vector of given size filled with given data + * @param[in] a pointer to the data + * @param[in] n required vector size + * @param[in] skeleton if equal to true, only the internal data pointer is modified + * and reference counter is set to two, i.e. 
no data deallocation occurs in destructor + ******************************************************************************/ template -inline NRVec::NRVec(const NRVec &rhs) -{ +inline NRVec::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int) { + if(!skeleton){ #ifdef CUDALA - location=rhs.location; + location = DEFAULT_LOC; + if(location == cpu){ +#endif + v = new T[n]; + *count = 1; + memcpy(v, a, n*sizeof(T)); +#ifdef CUDALA + }else{ + v= (T*) gpualloc(n*sizeof(T)); + cublasSetVector(n, sizeof(T), a, 1, v, 1); + TEST_CUBLAS("cublasSetVector"); + } +#endif + }else{ +#ifdef CUDALA + if(location != cpu) laerror("NRVec() with skeleton option cannot be on GPU"); +#endif + *count = 2; + v = a; + } +} + +/***************************************************************************//** + * inline copy constructor + * @param[in] rhs reference vector being copied + ******************************************************************************/ +template +inline NRVec::NRVec(const NRVec &rhs) { +#ifdef CUDALA + location = rhs.location; #endif v = rhs.v; nn = rhs.nn; @@ -330,667 +518,1001 @@ inline NRVec::NRVec(const NRVec &rhs) if(count) (*count)++; } +/***************************************************************************//** + * inline constructor interpreting symmetric matrix of order \f$n\f$ stored in packed form + * as a linear vector consisting of \f$n(n+1)/2\f$ elements + * @param[in] rhs symmetric matrix of type NRSMat + * @see NRSMat + ******************************************************************************/ template -inline NRVec::NRVec(const NRSMat &rhs) -{ +inline NRVec::NRVec(const NRSMat &rhs) { #ifdef CUDALA - location=rhs.location; + location = rhs.location; #endif nn = rhs.nn; + //! 
using macro NN2 defined in smat.h nn = NN2; v = rhs.v; count = rhs.count; (*count)++; } -// x +/-= a +/***************************************************************************//** + * adds given scalar value of type T to all vector elements + * @param[in] a scalar value being added + * @return reference to the modified vector + ******************************************************************************/ template -inline NRVec & NRVec::operator+=(const T &a) -{ +inline NRVec & NRVec::operator+=(const T &a) { NOT_GPU(*this); + copyonwrite(); - int i; - for(i=0; i -inline NRVec & NRVec::operator-=(const T &a) -{ +inline NRVec& NRVec::operator-=(const T &a) { NOT_GPU(*this); - copyonwrite(); - int i; - for(i=0; iT to this vector \f$\vec{x}\f$ + * \f[\vec{x}\leftarrow\vec{x}+\vec{y}\f] + * @param[in] rhs vector \f$\vec{y}\f$ of type T + * @return reference to the modified vector + ******************************************************************************/ template -inline NRVec & NRVec::operator+=(const NRVec &rhs) -{ +inline NRVec& NRVec::operator+=(const NRVec &rhs) { #ifdef DEBUG - if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); -NOT_GPU(*this); -NOT_GPU(rhs); + if (nn != rhs.nn) laerror("incompatible dimensions"); #endif - copyonwrite(); - int i; - for(i=0; i -inline NRVec & NRVec::operator*=(const NRVec &rhs) -{ +inline NRVec& NRVec::operator*=(const NRVec& rhs) { #ifdef DEBUG - if (nn != rhs.nn) laerror("*= of incompatible vectors"); + if (nn != rhs.nn) laerror("incompatible dimensions"); #endif - copyonwrite(); - int i; - for(i=0; i -inline NRVec & NRVec::operator/=(const NRVec &rhs) -{ +inline NRVec & NRVec::operator/=(const NRVec &rhs) { #ifdef DEBUG - if (nn != rhs.nn) laerror("/= of incompatible vectors"); -NOT_GPU(*this); -NOT_GPU(rhs); + if (nn != rhs.nn) laerror("incompatible dimensions"); #endif + NOT_GPU(*this); + NOT_GPU(rhs); + copyonwrite(); - int i; - for(i=0; i -inline NRVec & NRVec::operator-=(const NRVec &rhs) -{ +inline 
NRVec & NRVec::operator-=(const NRVec &rhs) { #ifdef DEBUG - if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); -NOT_GPU(*this); -NOT_GPU(rhs); + if (nn != rhs.nn) laerror("incompatible dimensions"); #endif - copyonwrite(); - int i; - for(i=0; i -inline NRVec & NRVec::operator*=(const T &a) -{ -NOT_GPU(*this); - copyonwrite(); - int i; - for(i=0; i & NRVec::operator*=(const T &a) { + NOT_GPU(*this); + copyonwrite(); + + for(register int i=0; iT + * with given vector \f$\vec{y}\f$ of type T and order \f$N\f$ + * \f[d = \sum_{i=1}^N\vec{x}_i\cdot\vec{y}_i\f] + * @param[in] rhs general vector \f$\vec{y}\f$ + * @return reference to the modified vector + ******************************************************************************/ template -inline const T NRVec::operator*(const NRVec &rhs) const -{ +inline const T NRVec::operator*(const NRVec &rhs) const { #ifdef DEBUG - if (nn != rhs.nn) laerror("dot of incompatible vectors"); -NOT_GPU(*this); -NOT_GPU(rhs); + if (nn != rhs.nn) laerror("incompatible dimensions"); #endif - T dot = 0; - for(int i=0; i -inline T & NRVec::operator[](const int i) -{ +inline T& NRVec::operator[](const int i) { #ifdef DEBUG - if(_LA_count_check && *count != 1) laerror("possible lval [] with count > 1"); - if(i < 0 || i >= nn) laerror("NRVec out of range"); - if(!v) laerror("[] on unallocated NRVec"); -NOT_GPU(*this); -#endif - return v[i]; -} -template -inline const T & NRVec::operator[](const int i) const -{ -#ifdef DEBUG - if(i < 0 || i >= nn) laerror("NRVec out of range"); - if(!v) laerror("[] on unallocated NRVec"); -NOT_GPU(*this); + if(_LA_count_check && *count != 1) laerror("possible use of NRVec[] with count>1 as l-value"); + if(i < 0 || i >= nn) laerror("out of range"); + if(!v) laerror("unallocated NRVec"); #endif + NOT_GPU(*this); + return v[i]; } -// length of the vector +/***************************************************************************//** + * indexing operator giving the element at given position with range 
checking in + * the DEBUG mode + * @param[in] i position of the required vector element (starting from 0) + * @return constant reference to the requested element + ******************************************************************************/ template -inline int NRVec::size() const -{ +inline const T& NRVec::operator[](const int i) const { +#ifdef DEBUG + if(i < 0 || i >= nn) laerror("out of range"); + if(!v) laerror("unallocated NRVec"); +#endif + NOT_GPU(*this); + + return v[i]; +} + +/***************************************************************************//** + * determine the number of elements of this vector + * @return length of this vector + ******************************************************************************/ +template +inline int NRVec::size() const { return nn; } -// reference Vec to the first element +/***************************************************************************//** + * get the pointer to the underlying data of this vector + * @return pointer to the first vector element + ******************************************************************************/ template -inline NRVec::operator T*() -{ +inline NRVec::operator T*() { #ifdef DEBUG - if(!v) laerror("unallocated NRVec in operator T*"); + if(!v) laerror("unallocated NRVec"); #endif return v; } + +/***************************************************************************//** + * get the constant pointer to the underlying data of this vector + * @return constant pointer to the first vector element + ******************************************************************************/ template -inline NRVec::operator const T*() const -{ +inline NRVec::operator const T*() const { #ifdef DEBUG - if(!v) laerror("unallocated NRVec in operator T*"); + if(!v) laerror("unallocated NRVec"); #endif return v; } - -// Make Vec unitvector +/***************************************************************************//** + * create normalized copy of this vector + * @return copy of this vector 
after normalization + * @see NRVec::normalize() + ******************************************************************************/ template -inline const NRVec NRVec::unitvector() const -{ +inline const NRVec NRVec::unitvector() const { return NRVec(*this).normalize(); } -// generate operators: Vec + a, a + Vec, Vec * a +/***************************************************************************//** + * generate operators involving vector and scalar + ******************************************************************************/ NRVECMAT_OPER(Vec,+) NRVECMAT_OPER(Vec,-) NRVECMAT_OPER(Vec,*) -// generate operators: Vec + Vec, Vec - Vec + +/***************************************************************************//** + * generate operators involving vector and vector + ******************************************************************************/ NRVECMAT_OPER2(Vec,+) NRVECMAT_OPER2(Vec,-) -// Few forward declarations - -//basic stuff which has to be in .h -// dtor +/***************************************************************************//** + * destructor for general vector decreases the reference count and performs + * deallocation if neccessary + ******************************************************************************/ template -NRVec::~NRVec() -{ - if(!count) return; - if(--(*count) <= 0) { - if(v) - { -#ifdef CUDALA - if(location==cpu) -#endif - delete[] (v); -#ifdef CUDALA - else gpufree(v); -#endif - } - delete count; - } -} - -// detach from a physical vector and make own copy -template -void NRVec::copyonwrite() -{ - if(!count) laerror("Vec::copyonwrite() of an undefined vector"); - if(*count > 1) - { - (*count)--; - count = new int; - *count = 1; - T *newv; -#ifdef CUDALA - if(location==cpu) - { -#endif - newv = new T[nn]; - memcpy(newv, v, nn*sizeof(T)); -#ifdef CUDALA - } - else - { - newv = (T *) gpualloc(nn*sizeof(T)); - if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem"); - cublasScopy(nn*sizeof(T)/sizeof(float),(const 
float *) v,1,(float *)newv,1); - } -#endif - - - v = newv; - } -} - - -// Asignment -template -NRVec & NRVec::operator=(const NRVec &rhs) -{ - if (this != &rhs) - { - if(count) - if(--(*count) == 0) - { -#ifdef CUDALA - if(location==cpu) -#endif - delete[] v; -#ifdef CUDALA - else - gpufree(v); -#endif - delete count; - } - v = rhs.v; - nn = rhs.nn; - count = rhs.count; -#ifdef CUDALA - location=rhs.location; -#endif - if(count) (*count)++; - } - return *this; -} - - - -// Resize -template -void NRVec::resize(const int n) -{ -#ifdef DEBUG - if(n<0) laerror("illegal vector dimension"); -#endif - if(count) - { - if(n==0) - { +NRVec::~NRVec() { + if(!count) return; if(--(*count) <= 0) { - if(v) - { + if(v){ #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif - delete[] (v); + delete[] v; #ifdef CUDALA - else - gpufree(v); + }else{ gpufree(v); } #endif - } + } delete count; - } - count=0; - nn=0; - v=0; - return; + } +} + +/***************************************************************************//** + * make own copy of the underlying data connected with this vector + ******************************************************************************/ +template +void NRVec::copyonwrite() { + if(!count) laerror("copyonwrite of an undefined vector"); + if(*count > 1) { + (*count)--; + count = new int; + *count = 1; + T *newv; +#ifdef CUDALA + if(location == cpu){ +#endif + newv = new T[nn]; + memcpy(newv, v, nn*sizeof(T)); +#ifdef CUDALA + }else{ + newv = (T *) gpualloc(nn*sizeof(T)); + if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRVec::copyonwrite()"); + cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1); + TEST_CUBLAS("cublasScopy");//"NRVec::copyonwrite()" + } +#endif + v = newv; } - if(*count > 1) { - (*count)--; - count = 0; - v = 0; - nn = 0; - } - } - if(!count) { - count = new int; - *count = 1; - nn = n; -#ifdef CUDALA - if(location==cpu) -#endif - v = new T[nn]; -#ifdef CUDALA - else - v = (T*) 
gpualloc(nn*sizeof(T)); -#endif - return; - } - // *count = 1 in this branch - if (n != nn) { - nn = n; -#ifdef CUDALA - if(location==cpu) -#endif - { - delete[] v; - v = new T[nn]; - } -#ifdef CUDALA - else - { - gpufree(v); - v = (T*) gpualloc(nn*sizeof(T)); - } -#endif - } } -// assignment with a physical (deep) copy +/***************************************************************************//** + * assigns general vector \f$\vec{y}\f$ to this vector \f$\vec{x}\f$ + * \li checks for self-assignment + * \li decreases the reference count and performs deallocation if neccesary + * \li links the internal data structures with corresponding properties of vector \f$\vec{y}\f$ + * \li updates the reference count properly + ******************************************************************************/ template -NRVec & NRVec::operator|=(const NRVec &rhs) -{ -#ifdef DEBUG - if (!rhs.v) laerror("unallocated rhs in NRVec operator |="); +NRVec & NRVec::operator=(const NRVec &rhs) { + //check for self-assignment + if(this != &rhs){ + if(count){ + if(--(*count) == 0){ +#ifdef CUDALA + if(location == cpu){ #endif - if (this == &rhs) return *this; + delete[] v; +#ifdef CUDALA + }else{ + gpufree(v); + } +#endif + delete count; + } + } + v = rhs.v; + nn = rhs.nn; + count = rhs.count; +#ifdef CUDALA + location = rhs.location; +#endif + if(count){ (*count)++; } + } + return *this; +} + + +/***************************************************************************//** + * resizes this vector + * @param[in] n requested size + ******************************************************************************/ +template +void NRVec::resize(const int n) { +#ifdef DEBUG + if(n < 0) laerror("illegal dimension"); +#endif + if(count){ + if(n == 0){ + if(--(*count) <= 0){ + if(v){ +#ifdef CUDALA + if(location == cpu){ +#endif + delete[] (v); +#ifdef CUDALA + }else{ + gpufree(v); + } +#endif + } + delete count; + } + count = 0; + nn = 0; + v = 0; + return; + } + if(*count > 1) { + (*count)--; 
+ count = 0; + v = 0; + nn = 0; + } + } + if(!count){ + count = new int; + *count = 1; + nn = n; +#ifdef CUDALA + if(location == cpu) +#endif + v = new T[nn]; +#ifdef CUDALA + else + v = (T*) gpualloc(nn*sizeof(T)); +#endif + return; + } + // *count = 1 in this branch + if (n != nn) { + nn = n; +#ifdef CUDALA + if(location == cpu){ +#endif + + delete[] v; + v = new T[nn]; +#ifdef CUDALA + }else{ + + gpufree(v); + v = (T*) gpualloc(nn*sizeof(T)); + } +#endif + } +} + + +/***************************************************************************//** + * perfrom deep copy + * @param[in] rhs vector being copied + * @see NRVec::copyonwrite() + ******************************************************************************/ +template +NRVec & NRVec::operator|=(const NRVec &rhs) { +#ifdef DEBUG + if(!rhs.v) laerror("unallocated vector"); +#endif + if(this == &rhs) return *this; *this = rhs; this->copyonwrite(); return *this; } - - +/***************************************************************************//** + * complexify given vector of general type T, i.e. 
convert its + * elements to type complex + * @param[in] rhs vector being complexified + * @see NRVec::copyonwrite() + ******************************************************************************/ template -NRVec > complexify(const NRVec &rhs) -{ -NRVec > r(rhs.size()); -for(int i=0; i > complexify(const NRVec &rhs) { + NOT_GPU(rhs); + + NRVec > r(rhs.size(), rhs.getlocation()); + for(register int i=0; i NRVec > complexify(const NRVec &rhs); - +/***************************************************************************//** + * routine for moving vector data between CPU and GPU memory + * @param[in] dest required location + * @see NRVec::location, NRVec::getlocation() + ******************************************************************************/ #ifdef CUDALA template -void NRVec::moveto(const GPUID dest) -{ -if(location==dest) return; -CPU_GPU(location,dest); -location=dest; +void NRVec::moveto(const GPUID dest) { + if(location == dest) return; -if(v && !count) laerror("internal inconsistency of reference counting 1"); -if (!count) return; + CPU_GPU(location, dest); + location = dest; -if(v && *count==0) laerror("internal inconsistency of reference counting 2"); -if(!v) return; + if(v && !count) laerror("internal"); + if (!count) return; -T *vold = v; + if(v && *count == 0) laerror("internal"); + if(!v) return; -if(dest == cpu) //moving from GPU to CPU - { - v = new T[nn]; - gpuget(nn,sizeof(T),vold,v); - if(*count == 1) gpufree(vold); - else {--(*count); count = new int(1);} - } -else //moving from CPU to GPU - { - v=(T *) gpualloc(nn*sizeof(T)); - gpuput(nn,sizeof(T),vold,v); - if(*count == 1) delete[] vold; - else {--(*count); count = new int(1);} - } + T *vold = v; + + if(dest == cpu){ // moving from GPU to CPU + v = new T[nn]; + gpuget(nn,sizeof(T),vold,v); + if(*count == 1) gpufree(vold); + else {--(*count); count = new int(1);} + + }else{ // moving from CPU to GPU + v = (T *) gpualloc(nn*sizeof(T)); + gpuput(nn,sizeof(T),vold,v); + if(*count == 1) 
delete[] vold; + else {--(*count); count = new int(1);} + } } #endif - -//some template specializations leading to BLAS/CUBLAS calls +/***************************************************************************//** + * adds a real scalar value \f$\alpha\f$ to all elements of this real vector \f$\vec{x}\f$ + * \f[\vec{x}_i\leftarrow\vec{x}_i+\alpha\f] + * @param[in] a real scalar value \f$\alpha\f$ being added + * @return reference to the modified vector + ******************************************************************************/ template<> -inline -NRVec & NRVec::operator+=(const double &a) -{ +inline NRVec& NRVec::operator+=(const double &a) { copyonwrite(); #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif cblas_daxpy(nn, 1.0, &a, 0, v, 1); #ifdef CUDALA - else - { - double *d=gpuputdouble(a); + }else{ + double *d = gpuputdouble(a); cublasDaxpy(nn, 1.0, d, 0, v, 1); + TEST_CUBLAS("cublasDaxpy"); gpufree(d); - } + } #endif return *this; } +/***************************************************************************//** + * adds a complex scalar value \f$\alpha\f$ to all elements of this complex vector \f$\vec{x}\f$ + * \f[\vec{x}_i\leftarrow\vec{x}_i+\alpha\f] + * @param[in] a complex scalar value \f$\alpha\f$ being added + * @return reference to the modified vector + ******************************************************************************/ template<> -inline -NRVec< complex > & -NRVec< complex >::operator+=(const complex &a) -{ +inline NRVec >& NRVec >::operator+=(const complex &a) { copyonwrite(); #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif cblas_zaxpy(nn, &CONE, &a, 0, v, 1); #ifdef CUDALA - else - { - complex *d=gpuputcomplex(a); + }else{ + complex *d = gpuputcomplex(a); cublasZaxpy(nn, CUONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1); + TEST_CUBLAS("cublasZaxpy"); gpufree(d); - } + } #endif return *this; } +/***************************************************************************//** + * subtracts a real 
scalar value \f$\alpha\f$ from all elements of this real vector \f$\vec{x}\f$ + * \f[\vec{x}_i\leftarrow\vec{x}_i-\alpha\f] + * @param[in] a real scalar value \f$\alpha\f$ being subtracted + * @return reference to the modified vector + ******************************************************************************/ template<> -inline -NRVec & NRVec::operator-=(const double &a) -{ +inline NRVec& NRVec::operator-=(const double &a) { copyonwrite(); #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif cblas_daxpy(nn, -1.0, &a, 0, v, 1); #ifdef CUDALA - else - { - double *d=gpuputdouble(a); + }else{ + double *d = gpuputdouble(a); cublasDaxpy(nn, -1.0, d, 0, v, 1); + TEST_CUBLAS("cublasDaxpy"); gpufree(d); - } + } #endif return *this; } +/***************************************************************************//** + * subtracts a complex scalar value \f$\alpha\f$ from all elements of this complex vector \f$\vec{x}\f$ + * \f[\vec{x}_i\leftarrow\vec{x}_i-\alpha\f] + * @param[in] a complex scalar value \f$\alpha\f$ being subtracted + * @return reference to the modified vector + ******************************************************************************/ template<> -inline -NRVec< complex > & -NRVec< complex >::operator-=(const complex &a) -{ +inline NRVec >& NRVec >::operator-=(const complex &a) { copyonwrite(); #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif cblas_zaxpy(nn, &CMONE, &a, 0, v, 1); #ifdef CUDALA - else - { - complex *d=gpuputcomplex(a); - cublasZaxpy(nn, CUMONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1); - gpufree(d); - } + }else{ + complex *d = gpuputcomplex(a); + cublasZaxpy(nn, CUMONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1); + TEST_CUBLAS("cublasZaxpy"); + gpufree(d); + } #endif return *this; } - +/***************************************************************************//** + * adds a real vector \f$\vec{y}\f$ to this real vector \f$\vec{x}\f$ + * \f[\vec{x}\leftarrow\vec{x}+\vec{y}\f] + * @param[in] 
rhs real vector \f$\vec{y}\f$ + * @return reference to the modified vector + ******************************************************************************/ template<> -inline -NRVec & NRVec::operator+=(const NRVec &rhs) -{ +inline NRVec& NRVec::operator+=(const NRVec &rhs) { #ifdef DEBUG - if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); + if (nn != rhs.nn) laerror("incompatible dimensions"); #endif + SAME_LOC(*this, rhs); copyonwrite(); - cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1); - return *this; -} -template<> -inline -NRVec< complex > & -NRVec< complex >::operator+=(const NRVec< complex > &rhs) -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); +#ifdef CUDALA + if(location == cpu){ +#endif + cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1); +#ifdef CUDALA + }else{ + cublasDaxpy(nn, 1.0, rhs.v, 1, v, 1); + TEST_CUBLAS("cubasDaxpy"); + } #endif - copyonwrite(); - cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1); return *this; } - +/***************************************************************************//** + * adds a complex vector \f$\vec{y}\f$ to this complex vector \f$\vec{x}\f$ + * \f[\vec{x}\leftarrow\vec{x}+\vec{y}\f] + * @param[in] rhs complex vector \f$\vec{y}\f$ + * @return reference to the modified vector + ******************************************************************************/ template<> -inline -NRVec & NRVec::operator-=(const NRVec &rhs) -{ +inline NRVec >& NRVec >::operator+=(const NRVec > &rhs) { #ifdef DEBUG - if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); + if (nn != rhs.nn) laerror("incompatible dimensions"); #endif -SAME_LOC(*this,rhs); + SAME_LOC(*this, rhs); copyonwrite(); #ifdef CUDALA - if(location==cpu) + if(location == cpu){ #endif - cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1); + cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1); #ifdef CUDALA - else + }else{ + cublasZaxpy(nn, CUONE, (cuDoubleComplex*)rhs.v, 1, (cuDoubleComplex*)v, 1); + TEST_CUBLAS("cublasZaxpy"); + } +#endif + return *this; +} + 
+/***************************************************************************//**
+ * subtracts a real vector \f$\vec{y}\f$ from this real vector \f$\vec{x}\f$
+ * \f[\vec{x}\leftarrow\vec{x}-\vec{y}\f]
+ * @param[in] rhs real vector \f$\vec{y}\f$
+ * @return reference to the modified vector
+ ******************************************************************************/
+template<>
+inline NRVec<double> & NRVec<double>::operator-=(const NRVec<double> &rhs) {
+#ifdef DEBUG
+	if (nn != rhs.nn) laerror("incompatible dimensions");
+#endif
+	SAME_LOC(*this,rhs);
+	copyonwrite();
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1);
+#ifdef CUDALA
+	}else{
 		cublasDaxpy(nn, -1.0, rhs.v, 1, v, 1);
+		// the tag names the cuBLAS routine that is being checked
+		TEST_CUBLAS("cublasDaxpy");
+	}
 #endif
 	return *this;
 }
+/***************************************************************************//**
+ * subtracts a complex vector \f$\vec{y}\f$ from this complex vector \f$\vec{x}\f$
+ * \f[\vec{x}\leftarrow\vec{x}-\vec{y}\f]
+ * @param[in] rhs double-precision complex vector \f$\vec{y}\f$
+ * @return reference to the modified vector
+ ******************************************************************************/
 template<>
-inline
-NRVec< complex<double> > &
-NRVec< complex<double> >::operator-=(const NRVec< complex<double> > &rhs)
-{
+inline NRVec<complex<double> >& NRVec<complex<double> >::operator-=(const NRVec<complex<double> > &rhs) {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
+	if (nn != rhs.nn) laerror("incompatible dimensions");
 #endif
-	copyonwrite();
-	cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
-	return *this;
-}
-
-template<>
-inline
-NRVec<double> & NRVec<double>::operator*=(const double &a)
-{
-	copyonwrite();
-	cblas_dscal(nn, a, v, 1);
-	return *this;
-}
-
-template<>
-inline
-NRVec< complex<double> > &
-NRVec< complex<double> >::operator*=(const complex<double> &a)
-{
-	copyonwrite();
-	cblas_zscal(nn, &a, v, 1);
-	return *this;
-}
-
-
-template<>
-inline
-const double NRVec<double>::operator*(const NRVec<double> &rhs) const
-{
-#ifdef DEBUG
-	if (nn != rhs.nn) laerror("dot of incompatible vectors");
+	SAME_LOC(*this, rhs);
+	copyonwrite();
+#ifdef CUDALA
+	if(location == cpu){
 #endif
-	return cblas_ddot(nn, v, 1, rhs.v, 1);
+		cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
+#ifdef CUDALA
+	}else{
+		cublasZaxpy(nn, CUMONE, (cuDoubleComplex*)rhs.v, 1, (cuDoubleComplex*)v, 1);
+		TEST_CUBLAS("cublasZaxpy");
+	}
+#endif
+	return *this;
 }
-
+/***************************************************************************//**
+ * multiplies this real vector \f$\vec{x}\f$ by a real scalar value \f$\alpha\f$
+ * \f[\vec{x}_i\leftarrow\alpha\vec{x}_i\f]
+ * @param[in] a real scalar value \f$\alpha\f$
+ * @return reference to the modified vector
+ ******************************************************************************/
 template<>
-inline
-const complex<double>
-NRVec< complex<double> >::operator*(const NRVec< complex<double> > &rhs) const
-{
+inline NRVec<double>& NRVec<double>::operator*=(const double &a) {
+	copyonwrite();
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_dscal(nn, a, v, 1);
+#ifdef CUDALA
+	}else{
+		cublasDscal(nn, a, v, 1);
+		TEST_CUBLAS("cublasDscal");
+	}
+#endif
+	return *this;
+}
+
+/***************************************************************************//**
+ * multiplies this complex vector \f$\vec{x}\f$ by a complex scalar value \f$\alpha\f$
+ * \f[\vec{x}_i\leftarrow\alpha\vec{x}_i\f]
+ * @param[in] a complex scalar value \f$\alpha\f$
+ * @return reference to the modified vector
+ ******************************************************************************/
+template<>
+inline NRVec<complex<double> >& NRVec<complex<double> >::operator*=(const complex<double> &a) {
+	copyonwrite();
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zscal(nn, &a, v, 1);
+#ifdef CUDALA
+	}else{
+		// cuBLAS takes the scalar by value in its own complex type
+		const cuDoubleComplex alpha = make_cuDoubleComplex(a.real(), a.imag());
+		cublasZscal(nn, alpha, (cuDoubleComplex*)v, 1);
+		TEST_CUBLAS("cublasZscal");
+	}
+#endif
+	return *this;
+}
+
+/***************************************************************************//**
+ * computes the inner product of this real vector \f$\vec{x}\f$ with given
+ * real vector \f$\vec{y}\f$
+ * @param[in] rhs real vector \f$\vec{y}\f$
+ * @return \f$\sum_{i=1}^N\vec{x}_i\cdot\vec{y}_i\f$
+ ******************************************************************************/
+template<>
+inline const double NRVec<double>::operator*(const NRVec<double> &rhs) const {
+	double ret(0.0);
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("dot of incompatible vectors");
+	if(nn != rhs.nn) laerror("incompatible dimensions");
+#endif
+	SAME_LOC(*this, rhs);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		ret = cblas_ddot(nn, v, 1, rhs.v, 1);
+#ifdef CUDALA
+	}else{
+		ret = cublasDdot(nn, v, 1, rhs.v, 1);
+		TEST_CUBLAS("cublasDdot");
+	}
+#endif
+	return ret;
+}
+
+/***************************************************************************//**
+ * computes the inner product of this complex vector \f$\vec{x}\f$ with given complex vector \f$\vec{y}\f$
+ * taking conjugation of vector \f$\vec{x}\f$ into account
+ * @param[in] rhs complex vector \f$\vec{y}\f$
+ * @return \f$\sum_{i=1}^N\overline{\vec{x}_i}\cdot\vec{y}_i\f$
+ ******************************************************************************/
+template<>
+inline const complex<double> NRVec<complex<double> >::operator*(const NRVec< complex<double> > &rhs) const {
+#ifdef DEBUG
+	if(nn != rhs.nn) laerror("incompatible dimensions");
 #endif
 	complex<double> dot;
-	cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
+	SAME_LOC(*this, rhs);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
+#ifdef CUDALA
+	}else{
+		const cuDoubleComplex val = cublasZdotc(nn, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)rhs.v, 1);
+		TEST_CUBLAS("cublasZdotc");
+		// convert the cuBLAS complex value back to the host complex type
+		dot = complex<double>(cuCreal(val), cuCimag(val));
+	}
+#endif
 	return dot;
 }
-// Sum of elements
+/***************************************************************************//**
+ * computes the inner product of this real vector \f$\vec{x}\f$ with given real data
+ * @param[in] y pointer to the double-precision real array (sufficient length assumed)
+ * @param[in] stride specifies the stride
+ * regarding the data pointed to by y
+ * @return \f$\sum_{i=1}^N\vec{x}_{i}\cdot y_{\mathrm{stride}\cdot(i-1) + 1}\f$
+ ******************************************************************************/
 template<>
-inline
-const double NRVec<double>::asum() const
-{
-	return cblas_dasum(nn, v, 1);
+inline const double NRVec<double>::dot(const double *y, const int stride) const {
+	NOT_GPU(*this);
+	return cblas_ddot(nn, y, stride, v, 1);
 }
-
-// Dot product: x * y
+/***************************************************************************//**
+ * computes the inner product of this complex vector \f$\vec{x}\f$ with given complex data
+ * @param[in] y pointer to the double-precision complex array (sufficient length assumed)
+ * @param[in] stride specifies the stride regarding the data pointed to by y
+ * @return \f$\sum_{i=1}^N\vec{x}_{i}\cdot \overline{y_{\mathrm{stride}\cdot(i-1) + 1}}\f$
+ ******************************************************************************/
 template<>
-inline
-const double NRVec<double>::dot(const double *y, const int stride) const
-{
-	return cblas_ddot(nn, y, stride, v, 1);
-}
-
-template<>
-inline
-const complex<double>
-NRVec< complex<double> >::dot(const complex<double> *y, const int stride) const
-{
+inline const complex<double> NRVec<complex<double> >::dot(const complex<double> *y, const int stride) const {
 	complex<double> dot;
+	NOT_GPU(*this);
 	cblas_zdotc_sub(nn, y, stride, v, 1, &dot);
 	return dot;
 }
-// return norm of the Vec
+/***************************************************************************//**
+ * computes the sum of the absolute values of the elements of this real vector \f$\vec{x}\f$
+ * @return \f$\sum_{i=1}^N\left|\vec{x}_i\right|\f$
+ ******************************************************************************/
 template<>
-inline
-const double NRVec<double>::norm() const
-{
+inline const double NRVec<double>::asum() const {
+	double ret(0.0);
 #ifdef CUDALA
-	if(location!=cpu) return cublasDnrm2(nn, v, 1);
+	if(location == cpu){
 #endif
-	return cblas_dnrm2(nn, v, 1);
+		ret = cblas_dasum(nn, v, 1);
+#ifdef CUDALA
+	}else{
+		ret = cublasDasum(nn, v, 1);
+		TEST_CUBLAS("cublasDasum");
+	}
+#endif
+	return ret;
 }
+
+/***************************************************************************//**
+ * for this complex vector \f$\vec{x}\f$ compute the expression
+ * \f[\sum_{i=1}^N\left(\left|\Re{}\vec{x}_i\right| + \left|\Im{}\vec{x}_i\right|\right)\f]
+ * @return the value of this sum
+ ******************************************************************************/
 template<>
-inline
-const double NRVec< complex<double> >::norm() const
-{
-	return cblas_dznrm2(nn, v, 1);
+inline const double NRVec<complex<double> >::asum() const {
+	double ret(0.0);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		ret = cblas_dzasum(nn, v, 1);
+#ifdef CUDALA
+	}else{
+		ret = cublasDzasum(nn, (cuDoubleComplex*)v, 1);
+		TEST_CUBLAS("cublasDzasum");
+	}
+#endif
+	return ret;
 }
-// Max element of the array
+/***************************************************************************//**
+ * for this real vector \f$\vec{x}\f$ (of \f$N\f$ elements) determine the Euclidean (2-)norm
+ * @return \f$\sqrt{\sum_{i=1}^N\left|\vec{x}_i\right|^2}\f$
+ ******************************************************************************/
 template<>
-inline
-const double NRVec<double>::amax() const
-{
-	return v[cblas_idamax(nn, v, 1)];
+inline const double NRVec<double>::norm() const {
+	double ret(0.);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		ret = cblas_dnrm2(nn, v, 1);
+#ifdef CUDALA
+	}else{
+		ret = cublasDnrm2(nn, v, 1);
+		TEST_CUBLAS("cublasDnrm2");
+	}
+#endif
+	return ret;
 }
-/*
-cblas_izamax seems to be missing at least in some cblas versions
+/***************************************************************************//**
+ * for this complex vector \f$\vec{x}\f$ (of \f$N\f$ elements) determine the Euclidean (2-)norm
+ * @return \f$\sqrt{\sum_{i=1}^N\left|\vec{x}_i\right|^2}\f$
+ ******************************************************************************/
 template<>
-inline
-const complex<double> NRVec< complex<double> >::amax() const
-{
-	return v[cblas_izamax(nn, v, 1)];
+inline const double NRVec< complex<double> >::norm() const {
+	double ret(0.);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		ret = cblas_dznrm2(nn, v, 1);
+#ifdef CUDALA
+	}else{
+		ret = cublasDznrm2(nn, (cuDoubleComplex*)v, 1);
+		// the tag names the cuBLAS routine that is being checked
+		TEST_CUBLAS("cublasDznrm2");
+	}
+#endif
+	return ret;
 }
-*/
+/***************************************************************************//**
+ * for this real vector \f$\vec{x}\f$ determine the element with largest absolute value
+ * @return \f$\vec{x}_i\f$ where \f$\left|\vec{x}_i\right|=\mathrm{max}_{j}\left|\vec{x}_{j}\right|\f$;
+ * zero is returned for an empty vector
+ ******************************************************************************/
+template<>
+inline const double NRVec<double>::amax() const {
+	double ret(0.0);
+	// an empty vector has no extremal element; do not index v at all
+	if(nn <= 0) return ret;
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		// CBLAS already reports a zero-based index (unlike Fortran BLAS and
+		// legacy cuBLAS); subtracting 1 here would read v[-1] whenever the
+		// maximum is the first element
+		ret = v[cblas_idamax(nn, v, 1)];
+#ifdef CUDALA
+	}else{
+		// legacy cuBLAS reports a one-based index
+		const int pos = cublasIdamax(nn, v, 1) - 1;
+		TEST_CUBLAS("cublasIdamax");
+		gpuget(1, sizeof(double), v + pos, &ret);
+	}
+#endif
+	return ret;
+}
 
+/***************************************************************************//**
+ * for this real vector \f$\vec{x}\f$ determine the element with smallest absolute value
+ * @return \f$\vec{x}_i\f$ where \f$\left|\vec{x}_i\right|=\mathrm{min}_{j}\left|\vec{x}_{j}\right|\f$;
+ * zero is returned for an empty vector
+ ******************************************************************************/
+template<>
+inline const double NRVec<double>::amin() const {
+	double ret(0.0);
+	// an empty vector has no extremal element; do not index v at all
+	if(nn <= 0) return ret;
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		// BLAS routine idamin seems not to be supported
+		// index starts at 0 so that v[index] stays valid even when no element
+		// compares below the initial bound (e.g. all values are inf or NaN)
+		double min_val(std::numeric_limits<double>::max());
+		int index(0);
+		for(int i = 0; i < nn; i++){
+			const double val = std::abs(v[i]);
+			if(val < min_val){ index = i; min_val = val; }
+		}
+		ret = v[index];
+#ifdef CUDALA
+	}else{
+		// legacy cuBLAS reports a one-based index
+		const int pos = cublasIdamin(nn, v, 1) - 1;
+		TEST_CUBLAS("cublasIdamin");
+		gpuget(1, sizeof(double), v + pos, &ret);
+	}
+#endif
+	return ret;
+}
+
+/***************************************************************************//**
+ * for a given complex vector \f$\vec{v}\f$,
+ * determine the element of maximum magnitude (the one with the smallest index),
+ * i.e. maximal element in the 1-norm
+ * @return \f$\vec{v}_{j}\f$ which maximizes \f$\left\{\left|\Re{}\vec{v}_{i}\right|+\left|\Im{}\vec{v}_{i}\right|\right\}\f$;
+ * zero is returned for an empty vector
+ ******************************************************************************/
+template<>
+inline const complex<double> NRVec<complex<double> >::amax() const {
+	complex<double> ret(0., 0.);
+	// an empty vector has no extremal element; do not index v at all
+	if(nn <= 0) return ret;
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		// CBLAS already reports a zero-based index (unlike Fortran BLAS and
+		// legacy cuBLAS); subtracting 1 here would read v[-1] whenever the
+		// maximum is the first element
+		ret = v[cblas_izamax(nn, v, 1)];
+#ifdef CUDALA
+	}else{
+		// legacy cuBLAS reports a one-based index
+		const int pos = cublasIzamax(nn, (cuDoubleComplex*)v, 1) - 1;
+		TEST_CUBLAS("cublasIzamax");
+		gpuget(1, sizeof(complex<double>), v + pos, &ret);
+	}
+#endif
+	return ret;
+}
+
+/***************************************************************************//**
+ * for a given complex vector \f$\vec{v}\f$, determine the element of minimum
+ * magnitude (the one with the smallest index), i.e. minimal element in the 1-norm
+ * @return \f$\vec{v}_{j}\f$ which minimizes \f$\left\{\left|\Re{}\vec{v}_{i}\right|+\left|\Im{}\vec{v}_{i}\right|\right\}\f$;
+ * zero is returned for an empty vector
+ ******************************************************************************/
+template<>
+inline const complex<double> NRVec<complex<double> >::amin() const {
+	complex<double> ret(0., 0.);
+	// an empty vector has no extremal element; do not index v at all
+	if(nn <= 0) return ret;
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		// izamin seems not to be supported
+		int index(0);
+		double min_val(std::numeric_limits<double>::max());
+
+		for(int i = 0; i < nn; i++){
+			const complex<double> z_val = v[i];
+			// same magnitude measure as BLAS uses: |Re| + |Im|
+			const double val = std::abs(z_val.real()) + std::abs(z_val.imag());
+			if(val < min_val){ index = i; min_val = val; }
+		}
+		ret = v[index];
+#ifdef CUDALA
+	}else{
+		// legacy cuBLAS reports a one-based index
+		const int pos = cublasIzamin(nn, (cuDoubleComplex*)v, 1) - 1;
+		TEST_CUBLAS("cublasIzamin");
+		gpuget(1, sizeof(complex<double>), v + pos, &ret);
+	}
+#endif
+	return ret;
+}
 }//namespace
 #endif /* _LA_VEC_H_ */