From 074c9438627dbdcc19c44931072c5938697e5f08 Mon Sep 17 00:00:00 2001 From: jiri Date: Fri, 25 Jun 2010 15:28:19 +0000 Subject: [PATCH] *** empty log message *** --- cuda_la.cc | 68 +++++ cuda_la.h | 56 +++++ la_traits.h | 10 + laerror.cc | 13 + mat.cc | 322 +++++++++++++++++++++++- mat.h | 276 ++++++++++++++++---- noncblas.cc | 231 ++++++++++++++--- nonclass.cc | 321 ++++++++++++++--------- smat.cc | 25 ++ smat.h | 263 +++++++++++++++---- t.cc | 57 ++++- vec.cc | 46 +++- vec.h | 714 ++++++++++++++++++++++++++++++++++++---------------- 13 files changed, 1938 insertions(+), 464 deletions(-) create mode 100644 cuda_la.h diff --git a/cuda_la.cc b/cuda_la.cc index d7ce83b..a91e01e 100644 --- a/cuda_la.cc +++ b/cuda_la.cc @@ -1,2 +1,70 @@ +#include "la_traits.h" #include "cuda_la.h" +#ifdef CUDALA + +namespace LA { + +GPUID DEFAULT_LOC = cpu; + +void set_default_loc(const GPUID loc) +{ +DEFAULT_LOC = loc; +} + +void *gpualloc(size_t size) +{ +cublasStatus status; +void *ptr=NULL; +status = cublasAlloc(size,1,&ptr); +if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc"); +return ptr; +} + + +void gpufree(void *ptr) +{ +cublasStatus status = cublasFree(ptr); +if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasFree"); +} + +void gpuget(size_t n, size_t elsize, const void *from, void *to) +{ +cublasStatus status; +status=cublasGetVector(n,elsize,from,1,to,1); +if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasGetVector"); +} + +void gpuput(size_t n, size_t elsize, const void *from, void *to) +{ +cublasStatus status; +status=cublasSetVector(n,elsize,from,1,to,1); +if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector"); +} + +double *gpuputdouble(const double &x) +{ +cublasStatus status; +void *ptr=NULL; +status = cublasAlloc(1,sizeof(double),&ptr); +if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc"); +status=cublasSetVector(1,sizeof(double),&x,1,ptr,1); +if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector"); +return (double *)ptr; +} + +complex *gpuputcomplex(const complex &x) +{ +cublasStatus status; +void *ptr=NULL; +status = cublasAlloc(1,sizeof(complex),&ptr); +if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc"); +status=cublasSetVector(1,sizeof(complex),&x,1,ptr,1); +if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector"); +return (complex *)ptr; +} + + + +} +#endif diff --git a/cuda_la.h b/cuda_la.h new file mode 100644 index 0000000..92fc178 --- /dev/null +++ b/cuda_la.h @@ -0,0 +1,56 @@ +#ifndef _CUDA_LA_H +#define _CUDA_LA_H + +#ifdef CUDALA +#undef MATPTR +#include "cublas.h" +#endif + +#include "la_traits.h" + +namespace LA { + +#ifdef CUDALA +#define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");} +#define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");} +#define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");} +#else +#define NOT_GPU(x) {} +#define SAME_LOC(x,y) {} +#define SAME_LOC3(x,y,z) {} +#endif + +typedef enum {undefined=-1, cpu=0, gpu1=1, gpu2=2, gpu3=3, gpu4=4} GPUID; + +#ifdef CUDALA + +//global static instantiation of this class will provide automatic init/shutdown of GPU +class GPU_START { +public: + GPU_START(void) + { + cublasStatus status = cublasInit(); + if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot init GPU for CUBLAS"); + } + ~GPU_START(void) + { + cublasStatus status = cublasShutdown(); + if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot cleanly shutdown GPU"); + } +}; + +extern void *gpualloc(size_t size); +extern void gpufree(void *ptr); +extern void gpuget(size_t n, size_t elsize, const void *from, void *to); +extern void gpuput(size_t n, size_t elsize, const void *from, void *to); +extern double *gpuputdouble(const double &x); +extern complex *gpuputcomplex(const complex &x); + +void set_default_loc(const GPUID loc); + +extern GPUID DEFAULT_LOC; + + +#endif +} +#endif diff --git a/la_traits.h b/la_traits.h index 60ec83a..0947cee 100644 --- a/la_traits.h +++ b/la_traits.h @@ -40,6 +40,8 @@ #include "laerror.h" +#include "cuda_la.h" + #ifdef NONCBLAS #include "noncblas.h" #else @@ -205,6 +207,8 @@ struct LA_traits_aux, scalar_true> { typedef complex elementtype; typedef complex producttype; typedef C normtype; +typedef C realtype; +typedef complex complextype; static inline C sqrabs(const complex x) { return x.real()*x.real()+x.imag()*x.imag();} static inline bool gencmp(const complex *x, const complex *y, int n) {return memcmp(x,y,n*sizeof(complex));} static bool bigger(const complex &x, const complex &y) {laerror("complex comparison undefined"); return false;} @@ -230,6 +234,8 @@ struct LA_traits_aux { typedef C elementtype; typedef C producttype; typedef C normtype; +typedef C realtype; +typedef complex complextype; static inline C sqrabs(const C x) { return x*x;} static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));} static inline bool bigger(const C &x, const C &y) {return x>y;} @@ -261,6 +267,8 @@ struct LA_traits_aux, scalar_false> { \ typedef C elementtype; \ typedef X producttype; \ typedef typename LA_traits::normtype normtype; \ +typedef X::realtype> realtype; \ +typedef X::complextype> complextype; \ static bool gencmp(const C *x, const C *y, int n) {for(int i=0; iy;} \ static inline bool smaller(const C &x, const C &y) {return x, scalar_false> { \ typedef C elementtype; \ typedef NRMat producttype; \ typedef typename LA_traits::normtype normtype; \ +typedef X::realtype> realtype; \ +typedef X::complextype> complextype; \ static bool gencmp(const C *x, const C *y, int n) {for(int i=0; iy;} \ static inline bool smaller(const C &x, const C &y) {return x #include +#include "cuda_la.h" #ifdef USE_TRACEBACK #include "traceback.h" @@ -32,6 +33,11 @@ namespace LA { +//enforce GPU initialization by a global class instantization constructor +#ifdef CUDALA +GPU_START gpu_start_instant; +#endif + bool _LA_count_check=true; extern "C" void _findme(void) {}; //for autoconf test we need a function with C linkage @@ -45,6 +51,13 @@ void laerror(const char *s1) std::cerr << s1 << "\n"; std::cout << s1 << "\n"; } +#ifdef CUDALA +{ +cublasStatus s=cublasGetError(); +std::cerr << "CUBLAS status = "< void NRMat::put(int fd, bool dim, bool transp) const { +#ifdef CUDALA +if(location!=cpu) + { + NRMat tmp= *this; + tmp.moveto(cpu); + tmp.put(fd,dim,transp); + return; + } +#endif errno=0; if(dim) { @@ -153,6 +162,17 @@ else LA_traits::multiput(nn*mm,fd, template void NRMat::get(int fd, bool dim, bool transp) { +#ifdef CUDALA +if(location!=cpu) + { + NRMat tmp; + tmp.moveto(cpu); + tmp.get(fd,dim,transp); + tmp.moveto(location); + *this = tmp; + return; + } +#endif int nn0,mm0; errno=0; if(dim) @@ -188,6 +208,43 @@ else LA_traits::multiget(nn*mm,fd, // Assign diagonal +template <> +NRMat & NRMat::operator=(const double &a) +{ + copyonwrite(); +#ifdef DEBUG + if (nn != mm) laerror("RMat.operator=scalar on non-square matrix"); +#endif +#ifdef CUDALA + if(location==cpu) + { +#endif +#ifdef MATPTR + memset(v[0],0,nn*nn*sizeof(double)); + for (int i=0; i< nn; i++) v[i][i] = a; +#else + double n=0.; + cblas_dcopy(nn*nn, &n, 0, v, 1); + cblas_dcopy(nn, &a, 0, v, nn+1); +#endif +#ifdef CUDALA + } + else + { + double *d=gpuputdouble(0.); + cublasDcopy(nn*nn, d, 0, v, 1); + gpufree(d); + d=gpuputdouble(a); + cublasDcopy(nn, d, 0, v, nn+1); + gpufree(d); + } +#endif + return *this; +} + + + + template NRMat & NRMat::operator=(const T &a) { @@ -206,6 +263,64 @@ NRMat & NRMat::operator=(const T &a) } +template <> +NRMat & NRMat::operator+=(const double&a) +{ + copyonwrite(); +#ifdef DEBUG + if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix"); +#endif +#ifdef CUDALA + if(location==cpu) + { +#endif +#ifdef MATPTR + for (int i=0; i< nn; i++) v[i][i] += a; +#else + cblas_daxpy(nn, 1.0, &a, 0, *this, nn+1); +#endif +#ifdef CUDALA + } + else + { + double *d=gpuputdouble(a); + cublasDaxpy(nn, 1.0, d, 0, *this, nn+1); + gpufree(d); + } +#endif + return *this; +} + + +template <> +NRMat & NRMat::operator-=(const double&a) +{ + copyonwrite(); +#ifdef DEBUG + if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix"); +#endif +#ifdef CUDALA + if(location==cpu) + { +#endif +#ifdef MATPTR + for (int i=0; i< nn; i++) v[i][i] -= a; +#else + cblas_daxpy(nn, -1.0, &a, 0, *this, nn+1); +#endif +#ifdef CUDALA + } + else + { + double *d=gpuputdouble(a); + cublasDaxpy(nn, -1.0, d, 0, *this, nn+1); + gpufree(d); + } +#endif + return *this; +} + + // M += a @@ -240,6 +355,31 @@ NRMat & NRMat::operator-=(const T &a) return *this; } +template <> +const NRMat NRMat::operator-() const +{ + NRMat result(nn, mm); +#ifdef CUDALA + if(location==cpu) + { +#endif +#ifdef MATPTR + for (int i=0; i const NRMat NRMat::operator-() const @@ -253,6 +393,7 @@ const NRMat NRMat::operator-() const return result; } + // direct sum template const NRMat NRMat::operator&(const NRMat & b) const @@ -540,7 +681,13 @@ template<> NRMat & NRMat::operator*=(const double &a) { copyonwrite(); - cblas_dscal(nn*mm, a, *this, 1); +#ifdef CUDALA + if(location==cpu) +#endif + cblas_dscal(nn*mm, a, *this, 1); +#ifdef CUDALA + else cublasDscal(nn*mm, a, v, 1); +#endif return *this; } @@ -559,6 +706,7 @@ NRMat< complex >::operator*=(const complex &a) template NRMat & NRMat::operator*=(const T &a) { +NOT_GPU(*this); copyonwrite(); #ifdef MATPTR for (int i=0; i< nn*mm; i++) v[0][i] *= a; @@ -578,8 +726,16 @@ NRMat & NRMat::operator+=(const NRMat &rhs) if (nn != rhs.nn || mm!= rhs.mm) laerror("Mat += Mat of incompatible matrices"); #endif +SAME_LOC(*this,rhs); copyonwrite(); +#ifdef CUDALA + if(location==cpu) +#endif cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1); +#ifdef CUDALA + else + cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1); +#endif return *this; } @@ -625,8 +781,16 @@ NRMat & NRMat::operator-=(const NRMat &rhs) if (nn != rhs.nn || mm!= rhs.mm) laerror("Mat -= Mat of incompatible matrices"); #endif + SAME_LOC(*this,rhs); copyonwrite(); +#ifdef CUDALA + if(location==cpu) +#endif cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1); +#ifdef CUDALA + else + cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1); +#endif return *this; } @@ -836,9 +1000,18 @@ const NRMat NRMat::operator*(const NRMat &rhs) const if (mm != rhs.nn) laerror("product of incompatible matrices"); if (rhs.mm <=0) laerror("illegal matrix dimension in gemm"); #endif - NRMat result(nn, rhs.mm); +SAME_LOC(*this,rhs); + + NRMat result(nn, rhs.mm,rhs.getlocation()); +#ifdef CUDALA + if(location==cpu) +#endif cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, 1.0, *this, mm, rhs, rhs.mm, 0.0, result, rhs.mm); +#ifdef CUDALA + else + cublasDgemm('N','N',rhs.mm,nn,mm,1.0,rhs, rhs.mm,*this, mm, 0.0, result, rhs.mm); +#endif return result; } @@ -991,12 +1164,21 @@ void NRMat::gemm(const double &beta, const NRMat &a, if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in Mat:gemm()"); if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm"); #endif +SAME_LOC3(*this,a,b); + if (alpha==0.0 && beta==1.0) return; copyonwrite(); +#ifdef CUDALA + if(location==cpu) +#endif cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans), (transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a, a.mm, b , b.mm, beta, *this , mm); +#ifdef CUDALA + else + cublasDgemm(transb,transa,mm,nn,k,alpha, b , b.mm, a,a.mm, beta, *this , mm); +#endif } @@ -1028,7 +1210,20 @@ void NRMat< complex >::gemm(const complex & beta, template<> const double NRMat::norm(const double scalar) const { - if (!scalar) return cblas_dnrm2(nn*mm, (*this)[0], 1); + if (!scalar) + { +#ifdef CUDALA + if(location==cpu) +#endif + return cblas_dnrm2(nn*mm, (*this)[0], 1); +#ifdef CUDALA + else + return cublasDnrm2(nn*mm, v, 1); +#endif + } + +NOT_GPU(*this); + double sum = 0; for (int i=0; i +NRMat& NRMat::SwapRows(){ + copyonwrite(); + const int n_pul = this->nn / 2; + double * const dataIn = this->v; + + for(register int i=0; i +NRMat >& NRMat >::SwapRows(){ + copyonwrite(); + const int n = this->nn; + const int m = this->mm; + const int n_pul = this->nn / 2; + complex * const dataIn = this->v; + + for(register int i=0; i +NRMat& NRMat::SwapRows(){ + copyonwrite(); + const int n = this->nn; + const int m = this->mm; + const int n_pul = this->nn / 2; + T * const dataIn = this->v; + + for(register int i=0; i +NRMat& NRMat::SwapCols(){ + copyonwrite(); + const int n = this->nn; + const int m = this->mm; + const int m_pul = m / 2; + double * const dataIn = this->v; + + for(register int i=0; i +NRMat >& NRMat >::SwapCols(){ + copyonwrite(); + const int n_pul = this->nn / 2; + const int m_pul = this->mm / 2; + complex * const dataIn = this->v; + + for(register int i=0; i +NRMat& NRMat::SwapCols(){ + copyonwrite(); + const int n_pul = nn / 2; + const int m_pul = mm / 2; + T * const dataIn = this->v; + + for(register int i=0; i +NRMat& NRMat::SwapRowsCols(){ + this->copyonwrite(); + const int n = this->nn; + const int m = this->mm; + T * const dataIn = this->v; + T * const dataOut = this->v; + + const int Dim = n*m; + for(register int i=0;i; friend class NRSMat; - inline NRMat() : nn(0), mm(0), v(0), count(0) {}; - inline NRMat(const int n, const int m); + inline NRMat() : nn(0), mm(0), v(0), count(0) + { +#ifdef CUDALA + location = DEFAULT_LOC; +#endif + }; + inline NRMat(const int n, const int m ,const GPUID loc= undefined); inline NRMat(const T &a, const int n, const int m); NRMat(const T *a, const int n, const int m); inline NRMat(const NRMat &rhs); @@ -57,6 +65,13 @@ public: #endif const bool operator==(const NRMat &rhs) const {return !(*this != rhs);}; inline int getcount() const {return count?*count:0;} +#ifdef CUDALA + inline GPUID getlocation() const {return location;} + void moveto(const GPUID dest); +#else + inline GPUID getlocation() const {return cpu;} + void moveto(const GPUID dest) {}; +#endif NRMat & operator=(const NRMat &rhs); //assignment void randomize(const typename LA_traits::normtype &x); //fill with random numbers NRMat & operator=(const T &a); //assign a to diagonal @@ -88,7 +103,7 @@ public: const NRMat operator*(const NRSMat &rhs) const; // Mat * Smat const NRMat operator&(const NRMat &rhs) const; // direct sum const NRMat operator|(const NRMat &rhs) const; // direct product - const NRVec operator*(const NRVec &rhs) const {NRVec result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec + const NRVec operator*(const NRVec &rhs) const {NRVec result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec const NRVec > operator*(const NRVec > &rhs) const {NRVec > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec const NRVec rsum() const; //sum of rows const NRVec csum() const; //sum of columns @@ -157,9 +172,14 @@ public: namespace LA { // ctors template -NRMat::NRMat(const int n, const int m) : nn(n), mm(m), count(new int) +NRMat::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) { *count = 1; +#ifdef CUDALA + location= (loc==undefined?DEFAULT_LOC:loc); + if(location==cpu) + { +#endif #ifdef MATPTR v = new T*[n]; v[0] = new T[m*n]; @@ -167,14 +187,29 @@ NRMat::NRMat(const int n, const int m) : nn(n), mm(m), count(new int) #else v = new T[m*n]; #endif +#ifdef CUDALA + } + else + { + v= (T*) gpualloc(n*m*sizeof(T)); + } +#endif } template NRMat::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) { +#ifdef CUDALA + location=DEFAULT_LOC; +#endif + int i; T *p; *count = 1; +#ifdef CUDALA + if(location==cpu) + { +#endif #ifdef MATPTR v = new T*[n]; p = v[0] = new T[m*n]; @@ -186,12 +221,29 @@ NRMat::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new for (i=0; i< n*m; i++) *p++ = a; else memset(p, 0, n*m*sizeof(T)); +#ifdef CUDALA + } + else + { + v= (T*) gpualloc(n*m*sizeof(T)); + cublasSetVector(n*m,sizeof(T),&a,0,v,1); + } +#endif + } template NRMat::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) { +#ifdef CUDALA + location=DEFAULT_LOC; +#endif + *count = 1; +#ifdef CUDALA + if(location==cpu) + { +#endif #ifdef MATPTR v = new T*[n]; v[0] = new T[m*n]; @@ -201,11 +253,25 @@ NRMat::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new v = new T[m*n]; memcpy(v, a, n*m*sizeof(T)); #endif +#ifdef CUDALA + } + else + { + v= (T*) gpualloc(n*m*sizeof(T)); + cublasSetVector(n*m,sizeof(T),a,1,v,1); + } +#endif + } + +//copy constructor template NRMat::NRMat(const NRMat &rhs) { +#ifdef CUDALA + location=rhs.location; +#endif nn = rhs.nn; mm = rhs.mm; count = rhs.count; @@ -213,9 +279,16 @@ NRMat::NRMat(const NRMat &rhs) if (count) ++(*count); } + template NRMat::NRMat(const NRSMat &rhs) { +NOT_GPU(rhs); + +#ifdef CUDALA + location=rhs.location; +#endif + int i; nn = mm = rhs.nrows(); count = new int; @@ -244,6 +317,10 @@ NRMat::NRMat(const NRVec &rhs, const int n, const int m, const int offset) { if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); +#ifdef CUDALA +location=rhs.location; +#endif + nn = n; mm = m; count = rhs.count; @@ -303,6 +380,7 @@ inline T & NRMat::operator()(const int i, const int j) if (_LA_count_check && *count != 1) laerror("Mat lval use of (,) with count > 1"); if (i<0 || i>=nn &&nn>0 || j<0 || j>=mm && mm>0) laerror("Mat (,) out of range"); if (!v) laerror("(,) for unallocated Mat"); +NOT_GPU(*this); #endif #ifdef MATPTR return v[i][j]; @@ -310,12 +388,14 @@ inline T & NRMat::operator()(const int i, const int j) return v[i*mm+j]; #endif } + template inline const T & NRMat::operator()(const int i, const int j) const { #ifdef DEBUG if (i<0 || i>=nn&&nn>0 || j<0 || j>=mm&& mm>0) laerror("Mat (,) out of range"); if (!v) laerror("(,) for unallocated Mat"); +NOT_GPU(*this); //in principle we could copy the element to CPU memory, yielding, however, a highly inneficient contruct #endif #ifdef MATPTR return v[i][j]; @@ -391,7 +471,7 @@ inline const complex NRMat< complex >::amax() const } -//basi stuff to be available for any type ... must be in .h +//basic stuff to be available for any type ... must be in .h // dtor template NRMat::~NRMat() @@ -399,10 +479,21 @@ NRMat::~NRMat() if (!count) return; if (--(*count) <= 0) { if (v) { +#ifdef CUDALA + if(location==cpu) +#endif + { #ifdef MATPTR delete[] (v[0]); #endif delete[] v; + } +#ifdef CUDALA + else + { + gpufree(v); + } +#endif } delete count; } @@ -415,14 +506,27 @@ NRMat & NRMat::operator=(const NRMat &rhs) if (this !=&rhs) { if (count) - if (--(*count) ==0 ) { + if (--(*count) ==0 ) + { +#ifdef CUDALA + if(location==cpu) + { +#endif #ifdef MATPTR delete[] (v[0]); #endif delete[] v; +#ifdef CUDALA + } + else gpufree(v); +#endif + delete count; } v = rhs.v; +#ifdef CUDALA + location=rhs.location; +#endif nn = rhs.nn; mm = rhs.mm; count = rhs.count; @@ -437,46 +541,8 @@ template NRMat & NRMat::operator|=(const NRMat &rhs) { if (this == &rhs) return *this; -#ifdef DEBUG - if (!rhs.v) laerror("unallocated rhs in Mat operator |="); -#endif - if (count) - if (*count > 1) { - --(*count); - nn = 0; - mm = 0; - count = 0; - v = 0; - } - if (nn != rhs.nn || mm != rhs.mm) { - if (v) { -#ifdef MATPTR - delete[] (v[0]); -#endif - delete[] (v); - v = 0; - } - nn = rhs.nn; - mm = rhs.mm; - } - if (!v) { -#ifdef MATPTR - v = new T*[nn]; - v[0] = new T[mm*nn]; -#else - v = new T[mm*nn]; -#endif - } -#ifdef MATPTR - for (int i=1; i< nn; i++) v[i] = v[i-1] + mm; - memcpy(v[0], rhs.v[0], nn*mm*sizeof(T)); -#else - memcpy(v, rhs.v, nn*mm*sizeof(T)); -#endif - - if (!count) count = new int; - *count = 1; - + *this = rhs; + this->copyonwrite(); return *this; } @@ -486,9 +552,13 @@ void NRMat::copyonwrite() { if (!count) laerror("Mat::copyonwrite of undefined matrix"); if (*count > 1) { - (*count)--; - count = new int; - *count = 1; + (*count)--; + count = new int; + *count = 1; +#ifdef CUDALA + if(location==cpu) //matrix is in CPU memory + { +#endif #ifdef MATPTR T **newv = new T*[nn]; newv[0] = new T[mm*nn]; @@ -499,10 +569,21 @@ void NRMat::copyonwrite() T *newv = new T[mm*nn]; memcpy(newv, v, mm*nn*sizeof(T)); v = newv; +#endif +#ifdef CUDALA + } + else //matrix is in GPU memory + { + T *newv = (T *) gpualloc(mm*nn*sizeof(T)); + if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem"); + cublasScopy(nn*mm*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1); + v = newv; + } #endif } } + template void NRMat::resize(int n, int m) { @@ -519,10 +600,18 @@ if(m==0) n=0; if(n==0 && m==0) { if(--(*count) <= 0) { +#ifdef CUDALA + if(location==cpu) + { +#endif #ifdef MATPTR if(v) delete[] (v[0]); #endif if(v) delete[] v; +#ifdef CUDALA + } + else gpufree(v); +#endif delete count; } count=0; @@ -543,6 +632,10 @@ if(m==0) n=0; *count = 1; nn = n; mm = m; +#ifdef CUDALA + if(location==cpu) + { +#endif #ifdef MATPTR v = new T*[nn]; v[0] = new T[m*n]; @@ -550,12 +643,22 @@ if(m==0) n=0; #else v = new T[m*n]; #endif +#ifdef CUDALA + } + else + v = (T *) gpualloc(n*m*sizeof(T)); +#endif + return; } // At this point *count = 1, check if resize is necessary if (n!=nn || m!=mm) { nn = n; mm = m; +#ifdef CUDALA + if(location==cpu) + { +#endif #ifdef MATPTR delete[] (v[0]); #endif @@ -566,6 +669,14 @@ if(m==0) n=0; for (int i=1; i< n; i++) v[i] = v[i-1] + m; #else v = new T[m*n]; +#endif +#ifdef CUDALA + } + else + { + gpufree(v); + v=(T *) gpualloc(n*m*sizeof(T)); + } #endif } } @@ -587,7 +698,11 @@ return r; // I/O template std::ostream& operator<<(std::ostream &s, const NRMat &x) - { +{ +#ifdef CUDALA + if(x.getlocation()==cpu) + { +#endif int i,j,n,m; n=x.nrows(); m=x.ncols(); @@ -597,18 +712,43 @@ std::ostream& operator<<(std::ostream &s, const NRMat &x) for(j=0; j::IOtype) x[i][j] << (j==m-1 ? '\n' : ' '); // endl cannot be used in the conditional expression, since it is an overloaded function } return s; - } +#ifdef CUDALA + } + else + { + NRMat tmp=x; + tmp.moveto(cpu); + return s< std::istream& operator>>(std::istream &s, NRMat &x) +{ +#ifdef CUDALA + if(x.getlocation()==cpu) { +#endif int i,j,n,m; s >> n >> m; x.resize(n,m); typename LA_traits_io::IOtype tmp; for(i=0;i>tmp; x[i][j]=tmp;} return s; - } +#ifdef CUDALA + } + else + { + NRMat tmp; + tmp.moveto(cpu); + s >> tmp; + tmp.moveto(x.getlocation()); + x=tmp; + return s; + } +#endif +} //optional indexing from 1 @@ -671,6 +811,38 @@ NRMat & NRMat::operator^=(const NRMat &rhs){ } +#ifdef CUDALA +template +void NRMat::moveto(const GPUID dest) +{ +if(location==dest) return; +location=dest; + +if(v && !count) laerror("internal inconsistency of reference counting 1"); +if (!count) return; + +if(v && *count==0) laerror("internal inconsistency of reference counting 2"); +if(!v) return; + +T *vold = v; + +if(dest == cpu) //moving from GPU to CPU + { + v = new T[nn*mm]; + gpuget(nn*mm,sizeof(T),vold,v); + if(*count == 1) gpufree(vold); + else {--(*count); count = new int(1);} + } +else //moving from CPU to GPU + { + v=(T *) gpualloc(nn*mm*sizeof(T)); + gpuput(nn*mm,sizeof(T),vold,v); + if(*count == 1) delete[] vold; + else {--(*count); count = new int(1);} + } +} +#endif +//end CUDALA diff --git a/noncblas.cc b/noncblas.cc index 1bef99d..aac7f10 100644 --- a/noncblas.cc +++ b/noncblas.cc @@ -20,106 +20,186 @@ #include "noncblas.h" #include "laerror.h" #include "mat.h" +#include "fortran.h" -#ifdef FORTRAN_ -#define FORNAME(x) x##_ -#else -#define FORNAME(x) x -#endif - #ifdef NONCBLAS //Level 1 - straightforward wrappers -extern "C" double FORNAME(ddot) (const int *n, const double *x, const int *incx, const double *y, const int *incy); +extern "C" double FORNAME(ddot) (const FINT *n, const double *x, const FINT *incx, const double *y, const FINT *incy); double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp); +#else return FORNAME(ddot)(&N,X,&incX,Y,&incY); +#endif } -extern "C" void FORNAME(dscal) (const int *n, const double *a, double *x, const int *incx); +extern "C" void FORNAME(dscal) (const FINT *n, const double *a, double *x, const FINT *incx); void cblas_dscal(const int N, const double alpha, double *X, const int incX) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp); +#else FORNAME(dscal) (&N,&alpha,X,&incX); +#endif } -extern "C" void FORNAME(dcopy) (const int *n, const double *x, const int *incx, double *y, const int *incy); +extern "C" void FORNAME(dcopy) (const FINT *n, const double *x, const FINT *incx, double *y, const FINT *incy); void cblas_dcopy(const int N, const double *X, const int incX, double *Y, const int incY) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp); +#else FORNAME(dcopy) (&N,X,&incX,Y,&incY); +#endif } -extern "C" void FORNAME(daxpy) (const int *n, const double *a, const double *x, const int *incx, double *y, const int *incy); +extern "C" void FORNAME(daxpy) (const FINT *n, const double *a, const double *x, const FINT *incx, double *y, const FINT *incy); void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp); +#else FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY); +#endif } -extern "C" double FORNAME(dnrm2) (const int *n, const double *x, const int *incx); +extern "C" double FORNAME(dnrm2) (const FINT *n, const double *x, const FINT *incx); double cblas_dnrm2(const int N, const double *X, const int incX) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +return FORNAME(dnrm2) (&ntmp,X,&incxtmp); +#else return FORNAME(dnrm2) (&N,X,&incX); +#endif } -extern "C" double FORNAME(dasum) (const int *n, const double *x, const int *incx); +extern "C" double FORNAME(dasum) (const FINT *n, const double *x, const FINT *incx); double cblas_dasum(const int N, const double *X, const int incX) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +return FORNAME(dasum) (&ntmp,X,&incxtmp); +#else return FORNAME(dasum) (&N,X,&incX); +#endif } -extern "C" void FORNAME(zcopy) (const int *n, const void *x, const int *incx, void *y, const int *incy); +extern "C" void FORNAME(zcopy) (const FINT *n, const void *x, const FINT *incx, void *y, const FINT *incy); void cblas_zcopy(const int N, const void *X, const int incX, void *Y, const int incY) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp); +#else FORNAME(zcopy) (&N,X,&incX,Y,&incY); +#endif } -extern "C" void FORNAME(zaxpy) (const int *n, const void *a, const void *x, const int *incx, void *y, const int *incy); +extern "C" void FORNAME(zaxpy) (const FINT *n, const void *a, const void *x, const FINT *incx, void *y, const FINT *incy); void cblas_zaxpy(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp); +#else FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY); +#endif } -extern "C" void FORNAME(zscal) (const int *n, const void *a, void *x, const int *incx); +extern "C" void FORNAME(zscal) (const FINT *n, const void *a, void *x, const FINT *incx); void cblas_zscal(const int N, const void *alpha, void *X, const int incX) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +FORNAME(zscal)(&ntmp,alpha,X,&incxtmp); +#else FORNAME(zscal)(&N,alpha,X,&incX); +#endif } -extern "C" void FORNAME(zdscal) (const int *n, const double *a, void *x, const int *incx); +extern "C" void FORNAME(zdscal) (const FINT *n, const double *a, void *x, const FINT *incx); void cblas_zdscal(const int N, const double alpha, void *X, const int incX) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp); +#else FORNAME(zdscal)(&N,&alpha,X,&incX); +#endif } -extern "C" double FORNAME(dznrm2) (const int *n, const void *x, const int *incx); +extern "C" double FORNAME(dznrm2) (const FINT *n, const void *x, const FINT *incx); double cblas_dznrm2(const int N, const void *X, const int incX) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +return FORNAME(dznrm2) (&ntmp,X,&incxtmp); +#else return FORNAME(dznrm2) (&N,X,&incX); +#endif } //the following ones are f2c-compatible, but is it truly portable??? -extern "C" void FORNAME(zdotu) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy); +extern "C" void FORNAME(zdotu) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy); void cblas_zdotu_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp); +#else FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY); +#endif } -extern "C" void FORNAME(zdotc) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy); +extern "C" void FORNAME(zdotc) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy); void cblas_zdotc_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc) { +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp); +#else FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY); +#endif } @@ -129,7 +209,7 @@ FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY); //enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, AtlasConj=114}; -extern "C" void FORNAME(dspmv) (const char *uplo, const int *n, const double *alpha, const double *ap, const double *x, const int *incx, const double *beta, double *y, const int *incy); +extern "C" void FORNAME(dspmv) (const char *uplo, const FINT *n, const double *alpha, const double *ap, const double *x, const FINT *incx, const double *beta, double *y, const FINT *incy); void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *Ap, const double *X, const int incX, @@ -137,11 +217,18 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, { if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted"); +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp); +#else FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY); +#endif } -extern "C" void FORNAME(zhpmv) (const char *uplo, const int *n, const void *alpha, const void *ap, const void *x, const int *incx, const void *beta, void *y, const int *incy); +extern "C" void FORNAME(zhpmv) (const char *uplo, const FINT *n, const void *alpha, const void *ap, const void *x, const FINT *incx, const void *beta, void *y, const FINT *incy); void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const void *alpha, const void *Ap, const void *X, const int incX, @@ -149,20 +236,36 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, { if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted"); +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp); +#else FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY); +#endif } //Level 2 and Level 3 on general matrices - take into account the transposed storage of matrices in Fortran and C -extern "C" void FORNAME(dger) (const int *m, const int *n, const double *alpha, const double *x, const int *incx, const double *y, const int *incy, double *a, const int *lda); +extern "C" void FORNAME(dger) (const FINT *m, const FINT *n, const double *alpha, const double *x, const FINT *incx, const double *y, const FINT *incy, double *a, const FINT *lda); void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda) { if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); //swap m-n, y-x +#ifdef FORINT +const FINT mtmp=M; +const FINT ntmp=N; +const FINT incxtmp=incX; +const FINT incytmp=incY; +const FINT ldatmp=lda; +FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp); +#else FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda); +#endif } void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N, @@ -179,7 +282,7 @@ void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N, LA::laerror("cblas_zgeru cannot be simply converted to fortran order"); } -extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc); +extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const double *alpha, const double *a, const FINT *lda, const double *b, const FINT *ldb, const double *beta, double *c, const FINT *ldc); void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, @@ -188,11 +291,22 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA { if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); //swap a-b, m-n +#ifdef FORINT +const FINT mtmp=M; +const FINT ntmp=N; +const FINT ktmp=K; +const FINT ldatmp=lda; +const FINT ldbtmp=ldb; +const FINT ldctmp=ldc; +FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", + &ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp); +#else FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); +#endif } -extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const void *alpha, const void *a, const int *lda, const void *b, const int *ldb, const void *beta, void *c, const int *ldc); +extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const void *alpha, const void *a, const FINT *lda, const void *b, const FINT *ldb, const void *beta, void *c, const FINT *ldc); void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, @@ -201,26 +315,49 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA { if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); //swap a-b, m-n +#ifdef FORINT +const FINT mtmp=M; +const FINT ntmp=N; +const FINT ktmp=K; +const FINT ldatmp=lda; +const FINT ldbtmp=ldb; +const FINT ldctmp=ldc; +FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + &ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp); +#else FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), &N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc); +#endif } -extern "C" void FORNAME(dgemv) (const char *TRANS, const int *M, const int *N, const double *ALPHA, const double *A, const int *LDA, const double *X, const int *INCX, const double *BETA, double *Y, const int *INCY); +extern "C" void FORNAME(dgemv) (const char *TRANS, const FINT *M, const FINT *N, const double *ALPHA, const double *A, const FINT *LDA, const double *X, const FINT *INCX, const double *BETA, double *Y, const FINT *INCY); void cblas_dgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY) { +#ifdef FORINT +const FINT mtmp=M; +const FINT ntmp=N; +const FINT ldatmp=lda; +const FINT incxtmp=incX; +const FINT incytmp=incY; +if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); +//swap n-m and toggle transposition +else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); +#else if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); //swap n-m and toggle transposition else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); +#endif } -extern "C" void FORNAME(zgemv) (const char *TRANS, const int *M, const int *N, const void *ALPHA, const void *A, const int *LDA, const void *X, const int *INCX, const void *BETA, void *Y, const int *INCY); +extern "C" void FORNAME(zgemv) (const char *TRANS, const FINT *M, const FINT *N, const void *ALPHA, const void *A, const FINT *LDA, const void *X, const FINT *INCX, const void *BETA, void *Y, const FINT *INCY); void cblas_zgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, @@ -230,14 +367,29 @@ void cblas_zgemv(const enum CBLAS_ORDER Order, if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); if(TransA == CblasConjTrans) LA::laerror("zgemv with CblasConjTrans not supportted"); //swap n-m and toggle transposition +#ifdef FORINT +const FINT mtmp=M; +const FINT ntmp=N; +const FINT ldatmp=lda; +const FINT incxtmp=incX; +const FINT incytmp=incY; +FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp ); +#else FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY ); +#endif } -extern "C" int FORNAME(idamax) (const int *N, const double *DX, const int *INCX); +extern "C" FINT FORNAME(idamax) (const FINT *N, const double *DX, const FINT *INCX); CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX) { -return FORNAME(idamax)(&N,X,&incX); +#ifdef FORINT +const FINT ntmp=N; +const FINT incxtmp=incX; +return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp); +#else +return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX); +#endif } @@ -246,21 +398,38 @@ return FORNAME(idamax)(&N,X,&incX); #ifdef NONCLAPACK //clapack_dgesv //allocate auxiliary storage and transpose input and output quantities to fortran/C order -extern "C" void FORNAME(dgesv) (const int *N, const int *NRHS, double *A, const int *LDA, int *IPIV, double *B, const int *LDB, int *INFO); +extern "C" void FORNAME(dgesv) (const FINT *N, const FINT *NRHS, double *A, const FINT *LDA, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO); int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS, double *A, const int lda, int *ipiv, double *B, const int ldb) { -int INFO=0; +FINT INFO=0; if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted"); //B should be in the same physical order, just transpose A in place and the LU result on output for(int i=1; i &a, double *b, const int nrhs, const int ldb, double *det, int n) { - int r, *ipiv; + FINT r, *ipiv; a.copyonwrite(); - ipiv = new int[n]; + ipiv = new FINT[n]; char U = 'U'; +#ifdef FORINT + const FINT ntmp=n; + const FINT nrhstmp=nrhs; + const FINT ldbtmp=ldb; + FORNAME(dspsv)(&U, &ntmp, &nrhstmp, a, ipiv, b, &ldbtmp,&r); +#else FORNAME(dspsv)(&U, &n, &nrhs, a, ipiv, b, &ldb,&r); +#endif if (r < 0) { delete[] ipiv; laerror("illegal argument in spsv() call of linear_solve()"); @@ -239,8 +243,8 @@ linear_solve_do(a,&B[0],1,a.nrows(),det,n); //other version of linear solver based on gesvx //------------------------------------------------------------------------------ -extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, complex *A, const int *lda, complex *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, complex *B, const int *ldb, complex *X, const int *ldx, double *rcond, double *ferr, double *berr, complex *work, double *rwork, int *info); -extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, double *A, const int *lda, double *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, double *B, const int *ldb, double *X, const int *ldx, double *rcond, double *ferr, double *berr, double *work, int *iwork, int *info); +extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, complex *A, const FINT *lda, complex *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, complex *B, const FINT *ldb, complex *X, const FINT *ldx, double *rcond, double *ferr, double *berr, complex *work, double *rwork, FINT *info); +extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, double *A, const FINT *lda, double *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, double *B, const FINT *ldb, double *X, const FINT *ldx, double *rcond, double *ferr, double *berr, double *work, FINT *iwork, FINT *info); //------------------------------------------------------------------------------ // solves set of linear equations using dgesvx // input: @@ -270,18 +274,18 @@ int linear_solve_x(NRMat &_A, double *_B, const int _rhsCount, const int double *A; double * const _A_data = (double*)_A; - int info; - const int nrhs = _rhsCount; - const int n = _eqCount; - int lda = A_cols; - const int ldaf = lda; + FINT info; + const FINT nrhs = _rhsCount; + const FINT n = _eqCount; + FINT lda = A_cols; + const FINT ldaf = lda; double rcond; double ferr[nrhs], berr[nrhs], work[4*n]; double R[n], C[n]; - int *const iwork = new int[n]; - int *const ipiv = new int[n]; + FINT *const iwork = new FINT[n]; + FINT *const ipiv = new FINT[n]; double *X = new double[n*nrhs]; double *AF = new double[ldaf*n]; @@ -303,7 +307,7 @@ int linear_solve_x(NRMat &_A, double *_B, const int _rhsCount, const int } FORNAME(dgesvx)(&fact, &trans, &n, &nrhs, A, &lda, AF, &ldaf, &ipiv[0], &equed, &R[0], &C[0], _B, &n, X, &n, &rcond, ferr, berr, work, iwork, &info); - + if(_rcond)*_rcond = rcond; cblas_dcopy(n*nrhs, X, 1, _B, 1);//store the solution @@ -312,7 +316,7 @@ int linear_solve_x(NRMat &_A, double *_B, const int _rhsCount, const int if(_saveA){ delete[] A; } - return info; + return (int)info; } //------------------------------------------------------------------------------ // solves set of linear equations using zgesvx @@ -343,18 +347,18 @@ int linear_solve_x(NRMat > &_A, complex *_B, const int _ complex *A; complex * const _A_data = (complex*)_A; - int info; - const int nrhs = _rhsCount; - const int n = _eqCount; - int lda = A_cols; - const int ldaf = lda; + FINT info; + const FINT nrhs = _rhsCount; + const FINT n = _eqCount; + FINT lda = A_cols; + const FINT ldaf = lda; double rcond; double ferr[nrhs], berr[nrhs]; double R[n], C[n], rwork[2*n]; complex work[2*n]; - int *const ipiv = new int[n]; + FINT *const ipiv = new FINT[n]; complex *X = new complex[n*nrhs]; complex *AF = new complex[ldaf*n]; @@ -386,7 +390,7 @@ int linear_solve_x(NRMat > &_A, complex *_B, const int _ if(_saveA){ delete[] A; } - return info; + return (int)info; } //------------------------------------------------------------------------------ // for given square matrices A, B computes X = AB^{-1} as follows @@ -402,8 +406,8 @@ int linear_solve_x(NRMat > &_A, complex *_B, const int _ //------------------------------------------------------------------------------ int multiply_by_inverse(NRMat &_A, NRMat &_B, bool _useEq, double *_rcond){ - const int n = _A.nrows(); - const int m = _A.ncols(); + const FINT n = _A.nrows(); + const FINT m = _A.ncols(); if(n != m || n != _B.nrows() || n != _B.ncols()){ laerror("multiply_by_inverse: incompatible matrices"); } @@ -417,13 +421,13 @@ int multiply_by_inverse(NRMat &_A, NRMat &_B, bool _useEq, doubl double * const B = (double*)_B; _B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B - int info; + FINT info; double rcond; double ferr[n], berr[n], work[4*n]; double R[n], C[n]; - int *const iwork = new int[n]; - int *const ipiv = new int[n]; + FINT *const iwork = new FINT[n]; + FINT *const ipiv = new FINT[n]; double *X = new double[n2]; double *AF = new double[n2]; @@ -437,7 +441,7 @@ int multiply_by_inverse(NRMat &_A, NRMat &_B, bool _useEq, doubl delete[] iwork;delete[] ipiv; delete[] AF;delete[] X; - return info; + return (int)info; } //------------------------------------------------------------------------------ // for given square matrices A, B computes X = AB^{-1} as follows @@ -453,8 +457,8 @@ int multiply_by_inverse(NRMat &_A, NRMat &_B, bool _useEq, doubl //------------------------------------------------------------------------------ int multiply_by_inverse(NRMat > &_A, NRMat > &_B, bool _useEq, double *_rcond){ - const int n = _A.nrows(); - const int m = _A.ncols(); + const FINT n = _A.nrows(); + const FINT m = _A.ncols(); if(n != m || n != _B.nrows() || n != _B.ncols()){ laerror("multiply_by_inverse: incompatible matrices"); } @@ -468,13 +472,13 @@ int multiply_by_inverse(NRMat > &_A, NRMat > &_B complex * const B = (complex*)_B; _B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B - int info; + FINT info; double rcond; double ferr[n], berr[n]; double R[n], C[n], rwork[2*n]; complex work[2*n]; - int *const ipiv = new int[n]; + FINT *const ipiv = new FINT[n]; complex *X = new complex[n2]; complex *AF = new complex[n2]; @@ -488,24 +492,24 @@ int multiply_by_inverse(NRMat > &_A, NRMat > &_B delete[] ipiv; delete[] AF;delete[] X; - return info; + return (int)info; } //------------------------------------------------------------------------------ -extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const int *N, - double *A, const int *LDA, double *W, double *WORK, const int *LWORK, int *INFO); +extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const FINT *N, + double *A, const FINT *LDA, double *W, double *WORK, const FINT *LWORK, FINT *INFO); -extern "C" void FORNAME(dsygv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N, - double *A, const int *LDA, double *B, const int *LDB, double *W, double *WORK, const int *LWORK, int *INFO); +extern "C" void FORNAME(dsygv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N, + double *A, const FINT *LDA, double *B, const FINT *LDB, double *W, double *WORK, const FINT *LWORK, FINT *INFO); // a will contain eigenvectors (columns if corder==1), w eigenvalues void diagonalize(NRMat &a, NRVec &w, const bool eivec, const bool corder, int n, NRMat *b, const int itype) { - int m = a.nrows(); + FINT m = a.nrows(); if (m != a.ncols()) laerror("diagonalize() call with non-square matrix"); if (a.nrows() != w.size()) laerror("inconsistent dimension of eigenvalue vector in diagonalize()"); @@ -517,22 +521,38 @@ void diagonalize(NRMat &a, NRVec &w, const bool eivec, w.copyonwrite(); if(b) b->copyonwrite(); - int r = 0; + FINT r = 0; char U ='U'; char vectors = 'V'; if (!eivec) vectors = 'N'; - int LWORK = -1; + FINT LWORK = -1; double WORKX; - int ldb=0; if(b) ldb=b->ncols(); + FINT ldb=0; if(b) ldb=b->ncols(); +#ifdef FORINT + const FINT itypetmp = itype; + FINT ntmp = n; + // First call is to determine size of workspace + if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r ); + else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r ); +#else // First call is to determine size of workspace if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r ); else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, &r ); - LWORK = (int)WORKX; +#endif + + LWORK = (FINT)WORKX; double *WORK = new double[LWORK]; - if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r ); + +#ifdef FORINT + if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r ); + else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r ); +#else + if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r ); else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, &r ); - delete[] WORK; +#endif + + delete[] WORK; if (vectors == 'V' && corder) a.transposeme(n); if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()"); @@ -541,18 +561,18 @@ void diagonalize(NRMat &a, NRVec &w, const bool eivec, -extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const int *N, - complex *A, const int *LDA, double *W, complex *WORK, const int *LWORK, double *RWORK, int *INFO); +extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const FINT *N, + complex *A, const FINT *LDA, double *W, complex *WORK, const FINT *LWORK, double *RWORK, FINT *INFO); -extern "C" void FORNAME(zhegv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N, - complex *A, const int *LDA, complex *B, const int *LDB, double *W, complex *WORK, const int *LWORK, double *RWORK, int *INFO); +extern "C" void FORNAME(zhegv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N, + complex *A, const FINT *LDA, complex *B, const FINT *LDB, double *W, complex *WORK, const FINT *LWORK, double *RWORK, FINT *INFO); // a will contain eigenvectors (columns if corder==1), w eigenvalues void diagonalize(NRMat > &a, NRVec &w, const bool eivec, const bool corder, int n, NRMat > *b, const int itype) { - int m = a.nrows(); + FINT m = a.nrows(); if (m != a.ncols()) laerror("diagonalize() call with non-square matrix"); if (a.nrows() != w.size()) laerror("inconsistent dimension of eigenvalue vector in diagonalize()"); @@ -564,23 +584,38 @@ void diagonalize(NRMat > &a, NRVec &w, const bool eivec, w.copyonwrite(); if(b) b->copyonwrite(); - int r = 0; + FINT r = 0; char U ='U'; char vectors = 'V'; if (!eivec) vectors = 'N'; - int LWORK = -1; + FINT LWORK = -1; complex WORKX; - int ldb=0; if(b) ldb=b->ncols(); + FINT ldb=0; if(b) ldb=b->ncols(); // First call is to determine size of workspace double *RWORK = new double[3*n+2]; - if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r ); +#ifdef FORINT + const FINT itypetmp = itype; + FINT ntmp = n; + if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r ); + else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r ); +#else + if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r ); else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, RWORK, &r ); - LWORK = (int)WORKX.real(); +#endif + + LWORK = (FINT)WORKX.real(); complex *WORK = new complex[LWORK]; - if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r ); + +#ifdef FORINT + if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r ); + else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r ); +#else + if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r ); else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, RWORK, &r ); - delete[] WORK; +#endif + + delete[] WORK; delete[] RWORK; if (vectors == 'V' && corder) a.transposeme(n); @@ -590,11 +625,11 @@ void diagonalize(NRMat > &a, NRVec &w, const bool eivec, -extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const int *N, - double *AP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO); +extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const FINT *N, + double *AP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO); -extern "C" void FORNAME(dspgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N, - double *AP, double *BP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO); +extern "C" void FORNAME(dspgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N, + double *AP, double *BP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO); // v will contain eigenvectors, w eigenvalues @@ -613,14 +648,21 @@ void diagonalize(NRSMat &a, NRVec &w, NRMat *v, if(v) v->copyonwrite(); if(b) b->copyonwrite(); - int r = 0; + FINT r = 0; char U = 'U'; char job = v ? 'v' : 'n'; double *WORK = new double[3*n]; - int ldv=v?v->ncols():n; - if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r ); + FINT ldv=v?v->ncols():n; +#ifdef FORINT + const FINT itypetmp = itype; + FINT ntmp = n; + if(b) FORNAME(dspgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r ); + else FORNAME(dspev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r ); +#else + if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r ); else FORNAME(dspev)(&job, &U, &n, a, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r ); +#endif delete[] WORK; if (v && corder) v->transposeme(n); @@ -629,11 +671,11 @@ void diagonalize(NRSMat &a, NRVec &w, NRMat *v, } -extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const int *N, - complex *AP, double *W, complex *Z, const int *LDZ, complex *WORK, double *RWORK, int *INFO); +extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const FINT *N, + complex *AP, double *W, complex *Z, const FINT *LDZ, complex *WORK, double *RWORK, FINT *INFO); -extern "C" void FORNAME(zhpgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N, - complex *AP, complex *BP, double *W, complex *Z, const int *LDZ, complex *WORK, double *RWORK, int *INFO); +extern "C" void FORNAME(zhpgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N, + complex *AP, complex *BP, double *W, complex *Z, const FINT *LDZ, complex *WORK, double *RWORK, FINT *INFO); // v will contain eigenvectors, w eigenvalues @@ -652,15 +694,22 @@ void diagonalize(NRSMat > &a, NRVec &w, NRMatcopyonwrite(); if(b) b->copyonwrite(); - int r = 0; + FINT r = 0; char U = 'U'; char job = v ? 'v' : 'n'; complex *WORK = new complex[2*n]; double *RWORK = new double[3*n]; - int ldv=v?v->ncols():n; + FINT ldv=v?v->ncols():n; +#ifdef FORINT + const FINT itypetmp = itype; + FINT ntmp = n; + if(b) FORNAME(zhpgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(complex *)0, &ldv, WORK, RWORK, &r ); + else FORNAME(zhpev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(complex *)0, &ldv, WORK, RWORK, &r ); +#else if(b) FORNAME(zhpgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(complex *)0, &ldv, WORK, RWORK, &r ); else FORNAME(zhpev)(&job, &U, &n, a, w, v?(*v)[0]:(complex *)0, &ldv, WORK, RWORK, &r ); +#endif delete[] WORK; delete[] RWORK; if (v && corder) v->transposeme(n); @@ -671,17 +720,17 @@ void diagonalize(NRSMat > &a, NRVec &w, NRMat &a, NRMat *u, NRVec &s, NRMat *v, const bool corder, int m, int n) { - int m0 = a.nrows(); - int n0 = a.ncols(); - if(m<=0) m=m0; - if(n<=0) n=n0; + FINT m0 = a.nrows(); + FINT n0 = a.ncols(); + if(m<=0) m=(int)m0; + if(n<=0) n=(int)n0; if(n>n0 || m>m0) laerror("bad dimension in singular_decomposition"); if (u) if (m > u->nrows() || m> u->ncols()) laerror("inconsistent dimension of U Mat in singular_decomposition()"); @@ -700,14 +749,30 @@ void singular_decomposition(NRMat &a, NRMat *u, NRVec &s char jobu = u ? 'A' : 'N'; char jobv = v ? 'A' : 'N'; double work0; - int lwork = -1; - int r; + FINT lwork = -1; + FINT r; + +#ifdef FORINT + FINT ntmp = n; + FINT mtmp = m; + FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0, + u?(*u)[0]:0, &m0, &work0, &lwork, &r); +#else FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0, u?(*u)[0]:0, &m0, &work0, &lwork, &r); - lwork = (int) work0; +#endif + + lwork = (FINT) work0; double *work = new double[lwork]; + +#ifdef FORINT + FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0, + u?(*u)[0]:0, &m0, work, &lwork, &r); +#else FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0, u?(*u)[0]:0, &m0, work, &lwork, &r); +#endif + delete[] work; if (v && corder) v->transposeme(n); @@ -719,14 +784,14 @@ void singular_decomposition(NRMat &a, NRMat *u, NRVec &s //extern "C" void FORNAME(dgeqrf)(const int *M, const int *N, double *A, const int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO); -extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const int *N, - double *A, const int *LDA, double *WR, double *WI, double *VL, const int *LDVL, - double *VR, const int *LDVR, double *WORK, const int *LWORK, int *INFO ); +extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const FINT *N, + double *A, const FINT *LDA, double *WR, double *WI, double *VL, const FINT *LDVL, + double *VR, const FINT *LDVR, double *WORK, const FINT *LWORK, FINT *INFO ); -extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const int *N, - double *A, const int *LDA, double *B, const int *LDB, double *WR, double *WI, double *WBETA, - double *VL, const int *LDVL, double *VR, const int *LDVR, - double *WORK, const int *LWORK, int *INFO ); +extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const FINT *N, + double *A, const FINT *LDA, double *B, const FINT *LDB, double *WR, double *WI, double *WBETA, + double *VL, const FINT *LDVL, double *VR, const FINT *LDVR, + double *WORK, const FINT *LWORK, FINT *INFO ); //statics for sorting @@ -802,23 +867,41 @@ void gdiagonalize(NRMat &a, NRVec &wr, NRVec &wi, char jobvl = vl ? 'V' : 'N'; char jobvr = vr ? 'V' : 'N'; double work0; - int lwork = -1; - int r; - int lda=a.ncols(); - int ldb=0; + FINT lwork = -1; + FINT r; + FINT lda=a.ncols(); + FINT ldb=0; if(b) ldb=b->ncols(); - int ldvl= vl?vl->ncols():lda; - int ldvr= vr?vr->ncols():lda; - if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0, + FINT ldvl= vl?vl->ncols():lda; + FINT ldvr= vr?vr->ncols():lda; + +#ifdef FORINT + FINT ntmp = n; + if(b) FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0, + &ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r); + else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0, + &ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r); +#else + if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0, &ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r); else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0, &ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r); - lwork = (int) work0; +#endif + + lwork = (FINT) work0; double *work = new double[lwork]; + +#ifdef FORINT + if(b) FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0, + &ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r); + else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0, + &ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r); +#else if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0, &ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r); else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0, &ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r); +#endif delete[] work; //std::cout <<"TEST dgeev\n"< *A, const int *LDA, int *INFO); +extern "C" void FORNAME(dpotrf)(const char *UPLO, const FINT *N, double *A, const FINT *LDA, FINT *INFO); +extern "C" void FORNAME(zpotrf)(const char *UPLO, const FINT *N, complex *A, const FINT *LDA, FINT *INFO); void cholesky(NRMat &a, bool upper) { if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky"); -int lda=a.ncols(); -int n=a.nrows(); +FINT lda=a.ncols(); +FINT n=a.nrows(); char uplo=upper?'U':'L'; -int info; +FINT info; a.copyonwrite(); FORNAME(dpotrf)(&uplo, &n, a, &lda, &info); if(info) {std::cerr << "Lapack error "< > &a, bool upper) { if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky"); -int lda=a.ncols(); -int n=a.nrows(); +FINT lda=a.ncols(); +FINT n=a.nrows(); char uplo=upper?'U':'L'; -int info; +FINT info; a.copyonwrite(); a.transposeme();//switch to Fortran order FORNAME(zpotrf)(&uplo, &n, a, &lda, &info); @@ -1250,14 +1333,14 @@ else //various norms -extern "C" double FORNAME(zlange)( const char *NORM, const int *M, const int *N, complex *A, const int *LDA, double *WORK); -extern "C" double FORNAME(dlange)( const char *NORM, const int *M, const int *N, double *A, const int *LDA, double *WORK); +extern "C" double FORNAME(zlange)( const char *NORM, const FINT *M, const FINT *N, complex *A, const FINT *LDA, double *WORK); +extern "C" double FORNAME(dlange)( const char *NORM, const FINT *M, const FINT *N, double *A, const FINT *LDA, double *WORK); double MatrixNorm(NRMat > &A, const char norm) { const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order - const int M = A.nrows(); - const int N = A.ncols(); + const FINT M = A.nrows(); + const FINT N = A.ncols(); double work[M]; const double ret = FORNAME(zlange)(&TypNorm, &M, &N, A[0], &M, &work[0]); return ret; @@ -1266,8 +1349,8 @@ double MatrixNorm(NRMat > &A, const char norm) double MatrixNorm(NRMat &A, const char norm) { const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order - const int M = A.nrows(); - const int N = A.ncols(); + const FINT M = A.nrows(); + const FINT N = A.ncols(); double work[M]; const double ret = FORNAME(dlange)(&TypNorm, &M, &N, A[0], &M, &work[0]); return ret; @@ -1276,15 +1359,15 @@ double MatrixNorm(NRMat &A, const char norm) //condition number -extern "C" void FORNAME(zgecon)( const char *norm, const int *n, complex *A, const int *LDA, const double *anorm, double *rcond, complex *work, double *rwork, int *info); -extern "C" void FORNAME(dgecon)( const char *norm, const int *n, double *A, const int *LDA, const double *anorm, double *rcond, double *work, double *rwork, int *info); +extern "C" void FORNAME(zgecon)( const char *norm, const FINT *n, complex *A, const FINT *LDA, const double *anorm, double *rcond, complex *work, double *rwork, FINT *info); +extern "C" void FORNAME(dgecon)( const char *norm, const FINT *n, double *A, const FINT *LDA, const double *anorm, double *rcond, double *work, double *rwork, FINT *info); double CondNumber(NRMat > &A, const char norm) { const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order - const int N = A.nrows(); + const FINT N = A.nrows(); double Norma(0.0), ret(0.0); - int info; + FINT info; complex *work; double *rwork; @@ -1305,9 +1388,9 @@ double CondNumber(NRMat > &A, const char norm) double CondNumber(NRMat &A, const char norm) { const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order - const int N = A.nrows(); + const FINT N = A.nrows(); double Norma(0.0), ret(0.0); - int info; + FINT info; double *work; double *rwork; diff --git a/smat.cc b/smat.cc index c2f5ea5..50fb7b5 100644 --- a/smat.cc +++ b/smat.cc @@ -44,6 +44,16 @@ namespace LA { template void NRSMat::put(int fd, bool dim, bool transp) const { +#ifdef CUDALA +if(location!=cpu) + { + NRSMat tmp= *this; + tmp.moveto(cpu); + tmp.put(fd,dim,transp); + return; + } +#endif + errno=0; if(dim) { @@ -56,6 +66,18 @@ LA_traits::multiput(NN2,fd,v,dim); template void NRSMat::get(int fd, bool dim, bool transp) { +#ifdef CUDALA +if(location!=cpu) + { + NRSMat tmp; + tmp.moveto(cpu); + tmp.get(fd,dim,transp); + tmp.moveto(location); + *this = tmp; + return; + } +#endif + int nn0[2]; //align at least 8-byte errno=0; if(dim) @@ -402,6 +424,9 @@ cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2); } +//some template specializations leading to BLAS/CUBLAS calls + + ////////////////////////////////////////////////////////////////////////////// diff --git a/smat.h b/smat.h index 1b9b93e..686213f 100644 --- a/smat.h +++ b/smat.h @@ -29,12 +29,20 @@ protected: int nn; T *v; int *count; +#ifdef CUDALA + GPUID location; +#endif public: friend class NRVec; friend class NRMat; - inline NRSMat() : nn(0),v(0),count(0) {}; - inline explicit NRSMat(const int n); // Zero-based array + inline NRSMat() : nn(0),v(0),count(0) + { +#ifdef CUDALA + location = DEFAULT_LOC; +#endif + }; + inline explicit NRSMat(const int n, const GPUID loc= undefined);// Zero-based array inline NRSMat(const T &a, const int n); //Initialize to constant inline NRSMat(const T *a, const int n); // Initialize to array inline NRSMat(const NRSMat &rhs); // Copy constructor @@ -45,6 +53,13 @@ public: NRSMat & operator=(const NRSMat &rhs); //assignment void randomize(const typename LA_traits::normtype &x); NRSMat & operator=(const T &a); //assign a to diagonal +#ifdef CUDALA + inline GPUID getlocation() const {return location;} + void moveto(const GPUID dest); +#else + inline GPUID getlocation() const {return cpu;} + void moveto(const GPUID dest) {}; +#endif const bool operator!=(const NRSMat &rhs) const {if(nn!=rhs.nn) return 1; return LA_traits::gencmp(v,rhs.v,NN2);} //memcmp for scalars else elementwise const bool operator==(const NRSMat &rhs) const {return !(*this != rhs);}; inline NRSMat & operator*=(const T &a); @@ -65,8 +80,8 @@ public: const NRMat operator*(const NRMat &rhs) const; // SMat*Mat const T dot(const NRSMat &rhs) const; // Smat.Smat//@@@for complex do conjugate const T dot(const NRVec &rhs) const; //Smat(as vec).vec //@@@for complex do conjugate - const NRVec operator*(const NRVec &rhs) const {NRVec result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec - const NRVec > operator*(const NRVec > &rhs) const {NRVec > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec + const NRVec operator*(const NRVec &rhs) const {NRVec result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec + const NRVec > operator*(const NRVec > &rhs) const {NRVec > result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec const T* diagonalof(NRVec &, const bool divide=0, bool cache=false) const; //get diagonal void gemv(const T beta, NRVec &r, const char trans, const T alpha, const NRVec &x) const {r.gemv(beta,*this,trans,alpha,x);}; void gemv(const T beta, NRVec > &r, const char trans, const T alpha, const NRVec > &x) const {r.gemv(beta,*this,trans,alpha,x);}; @@ -108,29 +123,63 @@ namespace LA { // ctors template -inline NRSMat::NRSMat(const int n) : nn(n), v(new T[NN2]), - count(new int) {*count = 1;} - -template -inline NRSMat::NRSMat(const T& a, const int n) : nn(n), - v(new T[NN2]), count(new int) +inline NRSMat::NRSMat(const int n, const GPUID loc) : nn(n), count(new int(1)) { - *count =1; - if(a != (T)0) for(int i=0; i -inline NRSMat::NRSMat(const T *a, const int n) : nn(n), - v(new T[NN2]), count(new int) +inline NRSMat::NRSMat(const T& a, const int n) : nn(n), count(new int(1)) { - *count = 1; - memcpy(v, a, NN2*sizeof(T)); +#ifdef CUDALA + location=DEFAULT_LOC; + if(location==cpu) +#endif + { + v=new T[NN2]; + if(a != (T)0) for(int i=0; i +inline NRSMat::NRSMat(const T *a, const int n) : nn(n), count(new int(1)) +{ +#ifdef CUDALA + location=DEFAULT_LOC; + if(location==cpu) +#endif + memcpy(v, a, NN2*sizeof(T)); +#ifdef CUDALA + else + { + v= (T*) gpualloc(NN2*sizeof(T)); + cublasSetVector(NN2,sizeof(T),a,1,v,1); + } +#endif + } template inline NRSMat::NRSMat(const NRSMat &rhs) //copy constructor { +#ifdef CUDALA + location=rhs.location; +#endif v = rhs.v; nn = rhs.nn; count = rhs.count; @@ -140,6 +189,9 @@ inline NRSMat::NRSMat(const NRSMat &rhs) //copy constructor template NRSMat::NRSMat(const NRVec &rhs, const int n) // type conversion { +#ifdef CUDALA + location=rhs.location; +#endif nn = n; #ifdef DEBUG if (NN2 != rhs.size()) @@ -150,6 +202,7 @@ NRSMat::NRSMat(const NRVec &rhs, const int n) // type conversion (*count)++; } + // S *= a template<> inline NRSMat & NRSMat::operator*=(const double & a) @@ -437,33 +490,31 @@ NRSMat::~NRSMat() { if (!count) return; if (--(*count) <= 0) { - if (v) delete[] (v); + if (v) + { +#ifdef CUDALA + if(location==cpu) +#endif + delete[] v; +#ifdef CUDALA + else gpufree(v); +#endif + } delete count; } } + // assignment with a physical copy template NRSMat & NRSMat::operator|=(const NRSMat &rhs) { - if (this != &rhs) { - if(!rhs.v) laerror("unallocated rhs in NRSMat operator |="); - if(count) - if(*count > 1) { // detach from the other - --(*count); - nn = 0; - count = 0; - v = 0; - } - if (nn != rhs.nn) { - if(v) delete [] (v); - nn = rhs.nn; - } - if (!v) v = new T[NN2]; - if (!count) count = new int; - *count = 1; - memcpy(v, rhs.v, NN2*sizeof(T)); - } +#ifdef DEBUG + if (!rhs.v) laerror("unallocated rhs in NRSMat operator |="); +#endif + if (this == &rhs) return *this; + *this = rhs; + this->copyonwrite(); return *this; } @@ -474,13 +525,24 @@ NRSMat & NRSMat::operator=(const NRSMat & rhs) { if (this == & rhs) return *this; if (count) - if(--(*count) == 0) { - delete [] v; + if(--(*count) == 0) + { +#ifdef CUDALA + if(location==cpu) +#endif + delete [] v; +#ifdef CUDALA + else + gpufree(v); +#endif delete count; - } + } v = rhs.v; nn = rhs.nn; count = rhs.count; +#ifdef CUDALA + location=rhs.location; +#endif if (count) (*count)++; return *this; } @@ -495,9 +557,24 @@ void NRSMat::copyonwrite() (*count)--; count = new int; *count = 1; - T *newv = new T[NN2]; - memcpy(newv, v, NN2*sizeof(T)); - v = newv; + T *newv; +#ifdef CUDALA + if(location==cpu) + { +#endif + newv = new T[NN2]; + memcpy(newv, v, NN2*sizeof(T)); +#ifdef CUDALA + } + else + { + newv = (T *) gpualloc(NN2*sizeof(T)); + if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem"); + cublasScopy(NN2*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1); + } +#endif + + v = newv; } } @@ -514,7 +591,16 @@ void NRSMat::resize(const int n) if(n==0) { if(--(*count) <= 0) { - if(v) delete[] (v); + if(v) { +#ifdef CUDALA + if(location==cpu) +#endif + delete[] (v); +#ifdef CUDALA + else + gpufree(v); +#endif + } delete count; } count=0; @@ -534,16 +620,71 @@ void NRSMat::resize(const int n) count = new int; *count = 1; nn = n; +#ifdef CUDALA + if(location==cpu) +#endif v = new T[NN2]; +#ifdef CUDALA + else + v = (T*) gpualloc(NN2*sizeof(T)); +#endif + return; } if (n != nn) { - nn = n; - delete[] v; - v = new T[NN2]; + nn = n; +#ifdef CUDALA + if(location==cpu) +#endif + { + delete[] v; + v = new T[NN2]; + } +#ifdef CUDALA + else + { + gpufree(v); + v = (T*) gpualloc(NN2*sizeof(T)); + } +#endif + } } +#ifdef CUDALA +template +void NRSMat::moveto(const GPUID dest) +{ +if(location==dest) return; +location=dest; + +if(v && !count) laerror("internal inconsistency of reference counting 1"); +if (!count) return; + +if(v && *count==0) laerror("internal inconsistency of reference counting 2"); +if(!v) return; + +T *vold = v; + +if(dest == cpu) //moving from GPU to CPU + { + v = new T[NN2]; + gpuget(NN2,sizeof(T),vold,v); + if(*count == 1) gpufree(vold); + else {--(*count); count = new int(1);} + } +else //moving from CPU to GPU + { + v=(T *) gpualloc(NN2*sizeof(T)); + gpuput(NN2,sizeof(T),vold,v); + if(*count == 1) delete[] vold; + else {--(*count); count = new int(1);} + } +} +#endif + + + template NRSMat > complexify(const NRSMat &rhs) @@ -554,10 +695,15 @@ for(int i=0; i std::ostream& operator<<(std::ostream &s, const NRSMat &x) { +#ifdef CUDALA +if(x.getlocation()==cpu) + { +#endif int i,j,n; n=x.nrows(); s << n << ' ' << n << '\n'; @@ -566,12 +712,25 @@ std::ostream& operator<<(std::ostream &s, const NRSMat &x) for(j=0; j::IOtype)x(i,j) << (j==n-1 ? '\n' : ' '); } return s; +#ifdef CUDALA + } +else + { + NRSMat tmp=x; + tmp.moveto(cpu); + return s< std::istream& operator>>(std::istream &s, NRSMat &x) { +#ifdef CUDALA +if(x.getlocation()==cpu) + { +#endif int i,j,n,m; s >> n >> m; if(n!=m) laerror("input symmetric matrix not square"); @@ -579,6 +738,18 @@ std::istream& operator>>(std::istream &s, NRSMat &x) typename LA_traits_io::IOtype tmp; for(i=0;i>tmp; x(i,j)=tmp;} return s; +#ifdef CUDALA + } +else + { + NRSMat tmp; + tmp.moveto(cpu); + s >> tmp; + tmp.moveto(x.getlocation()); + x=tmp; + return s; + } +#endif } diff --git a/t.cc b/t.cc index f683a95..c808fb5 100644 --- a/t.cc +++ b/t.cc @@ -24,6 +24,7 @@ using namespace std; using namespace LA; + extern void test(const NRVec &x); @@ -1551,7 +1552,7 @@ cout <<"Error = "<<(cx.transpose(true)*cx -bb).norm()<>n; @@ -1574,5 +1575,59 @@ cout <<"Cholesky factor filling = "<>n; +NRMat a(n,n),b(n,n); +a.randomize(1.); +b.randomize(1.); +NRMatc; +double t=clock()/((double) (CLOCKS_PER_SEC)); +int rep=1+10000000000LL/n/n/n; +cout <<"Repetitions " <cgpu; +a.moveto(gpu1); +b.moveto(gpu1); + t=clock()/((double) (CLOCKS_PER_SEC)); +for(int i=0; i::multiput(nn,fd,v,dim); } + template void NRVec::get(int fd, bool dim, bool transp) { +#ifdef CUDALA +if(location!=cpu) + { + NRVec tmp; + tmp.moveto(cpu); + tmp.get(fd,dim,transp); + tmp.moveto(location); + *this = tmp; + return; + } +#endif int nn0[2]; //align at least 8-byte errno=0; if(dim) @@ -319,9 +341,16 @@ void NRVec::gemv(const double beta, const NRMat &A, if ((trans == 'n'?A.ncols():A.nrows()) != x.size()) laerror("incompatible sizes in gemv A*x"); #endif + SAME_LOC3(*this,x,A); copyonwrite(); - cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), - A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1); +#ifdef CUDALA + if(location==cpu) +#endif + cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1); +#ifdef CUDALA + else + cublasDgemv((trans=='n' ?'T':'N'),A.ncols(), A.nrows(),alpha, A, A.ncols(), x.v, 1, beta, v, 1); +#endif } @@ -362,8 +391,16 @@ void NRVec::gemv(const double beta, const NRSMat &A, #ifdef DEBUG if (A.ncols()!=x.size()) laerror("incompatible dimension in gemv A*x"); #endif +SAME_LOC3(*this,A,x); copyonwrite(); +#ifdef CUDALA +if(location==cpu) +#endif cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, x.v, 1, beta, v, 1); +#ifdef CUDALA +else + cublasDspmv('U',A.ncols(), alpha, A, x.v, 1, beta, v, 1); +#endif } template<> @@ -420,6 +457,7 @@ NRVec >::otimes(const NRVec< complex > &b, const bool co template int NRVec::sort(int direction, int from, int to, int *perm) { +NOT_GPU(*this); copyonwrite(); if(to == -1) to=nn-1; if(direction) return memqsort<1,NRVec,int,int>(*this,perm,from,to); @@ -427,6 +465,10 @@ else return memqsort<0,NRVec,int,int>(*this,perm,from,to); } + + + + ////////////////////////////////////////////////////////////////////////////// //// forced instantization in the corespoding object file diff --git a/vec.h b/vec.h index 58e5cd0..759e153 100644 --- a/vec.h +++ b/vec.h @@ -30,6 +30,9 @@ template void lawritemat(FILE *file,const T *a,int r,int c, // Memory allocated constants for cblas routines const static complex CONE = 1.0, CMONE = -1.0, CZERO = 0.0; +#ifdef CUDALA +const static cuDoubleComplex CUONE = {1.,0.}, CUMONE = {-1.,0.}, CUZERO = {0.,0.}; +#endif // Macros to construct binary operators +,-,*, from +=, -=, *= // for 3 cases: X + a, a + X, X + Y @@ -44,7 +47,7 @@ template \ #define NRVECMAT_OPER2(E,X) \ template \ - inline const NR##E NR##E::operator X(const NR##E &a) const \ +inline const NR##E NR##E::operator X(const NR##E &a) const \ { return NR##E(*this) X##= a; } @@ -55,12 +58,32 @@ protected: int nn; T *v; int *count; +#ifdef CUDALA + GPUID location; +#endif public: friend class NRSMat; friend class NRMat; - inline NRVec(): nn(0),v(0),count(0){}; - explicit inline NRVec(const int n) : nn(n), v(new T[n]), count(new int(1)) {}; + inline NRVec(): nn(0),v(0),count(0) + { +#ifdef CUDALA + location = DEFAULT_LOC; +#endif + }; + explicit inline NRVec(const int n, const GPUID loc= undefined) : nn(n), count(new int(1)) + { +#ifdef CUDALA + if(loc==undefined) location = DEFAULT_LOC; else location = loc; + if(location==cpu) +#endif + v= new T[n]; +#ifdef CUDALA + else + v= (T*) gpualloc(n*sizeof(T)); +#endif + }; + inline NRVec(const T &a, const int n); inline NRVec(const T *a, const int n); inline NRVec(T *a, const int n, bool skeleton); @@ -71,6 +94,13 @@ public: explicit NRVec(const NRMat &rhs) : NRVec(&rhs[0][0],rhs.nrows()*rhs.ncols()) {}; #else explicit NRVec(const NRMat &rhs); +#endif +#ifdef CUDALA + inline GPUID getlocation() const {return location;} + void moveto(const GPUID dest); +#else + inline GPUID getlocation() const {return cpu;} + void moveto(const GPUID dest) {}; #endif NRVec & operator=(const NRVec &rhs); NRVec & operator=(const T &a); //assign a to every element @@ -103,8 +133,8 @@ public: void gemv(const T beta, const SparseMat &a, const char trans, const T alpha, const NRVec &x,const bool treat_as_symmetric=false); void gemv(const typename LA_traits_complex::Component_type beta, const typename LA_traits_complex::NRMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex::Component_type alpha, const NRVec &x); void gemv(const typename LA_traits_complex::Component_type beta, const typename LA_traits_complex::NRSMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex::Component_type alpha, const NRVec &x); - const NRVec operator*(const NRMat &mat) const {NRVec result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;}; - const NRVec operator*(const NRSMat &mat) const {NRVec result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;}; + const NRVec operator*(const NRMat &mat) const {NRVec result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;}; + const NRVec operator*(const NRSMat &mat) const {NRVec result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;}; const NRVec operator*(const SparseMat &mat) const {NRVec result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;}; const NRMat otimes(const NRVec &rhs, const bool conjugate=false, const T &scale=1) const; //outer product inline const NRMat operator|(const NRVec &rhs) const {return otimes(rhs,true);}; @@ -150,29 +180,58 @@ public: #include "sparsemat.h" #include "sparsesmat.h" + + namespace LA { // formatted I/O template std::ostream & operator<<(std::ostream &s, const NRVec &x) { +#ifdef CUDALA +if(x.getlocation()==cpu) + { +#endif int i, n; - n = x.size(); s << n << std::endl; for(i=0; i::IOtype)x[i] << (i == n-1 ? '\n' : ' '); return s; +#ifdef CUDALA + } +else + { + NRVec tmp=x; + tmp.moveto(cpu); + return s< std::istream & operator>>(std::istream &s, NRVec &x) { +#ifdef CUDALA +if(x.getlocation()==cpu) + { +#endif int i,n; - s >> n; x.resize(n); typename LA_traits_io::IOtype tmp; for(i=0; i> tmp; x[i]=tmp;} return s; +#ifdef CUDALA + } +else + { + NRVec tmp; + tmp.moveto(cpu); + s >> tmp; + tmp.moveto(x.getlocation()); + x=tmp; + return s; + } +#endif } @@ -180,22 +239,51 @@ std::istream & operator>>(std::istream &s, NRVec &x) // ctors template -inline NRVec::NRVec(const T& a, const int n) : nn(n), v(new T[n]), count(new int) +inline NRVec::NRVec(const T& a, const int n) : nn(n), count(new int) { *count = 1; +#ifdef CUDALA + location=DEFAULT_LOC; + if(location==cpu) + { +#endif + v = new T[n]; if(a != (T)0) for(int i=0; i inline NRVec::NRVec(const T *a, const int n) : nn(n), count(new int) { - v=new T[n]; - *count = 1; - memcpy(v, a, n*sizeof(T)); +#ifdef CUDALA +location=DEFAULT_LOC; + if(location==cpu) + { +#endif + v=new T[n]; + *count = 1; + memcpy(v, a, n*sizeof(T)); +#ifdef CUDALA + } + else + { + v= (T*) gpualloc(n*sizeof(T)); + cublasSetVector(n,sizeof(T),a,1,v,1); + } +#endif + } template @@ -203,12 +291,28 @@ inline NRVec::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int) { if(!skeleton) { +#ifdef CUDALA +location=DEFAULT_LOC; + if(location==cpu) + { +#endif v=new T[n]; *count = 1; memcpy(v, a, n*sizeof(T)); +#ifdef CUDALA + } + else + { + v= (T*) gpualloc(n*sizeof(T)); + cublasSetVector(n,sizeof(T),a,1,v,1); + } +#endif } else { +#ifdef CUDALA + if(location!=cpu) laerror("NRVec() with skeleton option cannot be on GPU"); +#endif *count = 2; v=a; } @@ -217,6 +321,9 @@ inline NRVec::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int) template inline NRVec::NRVec(const NRVec &rhs) { +#ifdef CUDALA + location=rhs.location; +#endif v = rhs.v; nn = rhs.nn; count = rhs.count; @@ -226,6 +333,9 @@ inline NRVec::NRVec(const NRVec &rhs) template inline NRVec::NRVec(const NRSMat &rhs) { +#ifdef CUDALA + location=rhs.location; +#endif nn = rhs.nn; nn = NN2; v = rhs.v; @@ -233,28 +343,11 @@ inline NRVec::NRVec(const NRSMat &rhs) (*count)++; } -// x += a -template<> -inline NRVec & NRVec::operator+=(const double &a) -{ - copyonwrite(); - cblas_daxpy(nn, 1.0, &a, 0, v, 1); - return *this; -} - -template<> -inline NRVec< complex > & -NRVec< complex >::operator+=(const complex &a) -{ - copyonwrite(); - cblas_zaxpy(nn, &CONE, &a, 0, v, 1); - return *this; -} - -//and for general type +// x +/-= a template inline NRVec & NRVec::operator+=(const T &a) { + NOT_GPU(*this); copyonwrite(); int i; for(i=0; i & NRVec::operator+=(const T &a) } -// x -= a -template<> -inline NRVec & NRVec::operator-=(const double &a) -{ - copyonwrite(); - cblas_daxpy(nn, -1.0, &a, 0, v, 1); - return *this; -} - -template<> -inline NRVec< complex > & -NRVec< complex >::operator-=(const complex &a) -{ - copyonwrite(); - cblas_zaxpy(nn, &CMONE, &a, 0, v, 1); - return *this; -} - -//and for general type template inline NRVec & NRVec::operator-=(const T &a) { + NOT_GPU(*this); copyonwrite(); - int i; - for(i=0; i -inline NRVec & NRVec::operator+=(const NRVec &rhs) -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); -#endif - copyonwrite(); - cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1); - return *this; -} - -template<> -inline NRVec< complex > & -NRVec< complex >::operator+=(const NRVec< complex > &rhs) -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); -#endif - copyonwrite(); - cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1); - return *this; -} - -//and for general type template inline NRVec & NRVec::operator+=(const NRVec &rhs) { #ifdef DEBUG if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); +NOT_GPU(*this); +NOT_GPU(rhs); #endif copyonwrite(); int i; @@ -346,6 +400,8 @@ inline NRVec & NRVec::operator/=(const NRVec &rhs) { #ifdef DEBUG if (nn != rhs.nn) laerror("/= of incompatible vectors"); +NOT_GPU(*this); +NOT_GPU(rhs); #endif copyonwrite(); int i; @@ -356,35 +412,13 @@ inline NRVec & NRVec::operator/=(const NRVec &rhs) // x -= x -template<> -inline NRVec & NRVec::operator-=(const NRVec &rhs) -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); -#endif - copyonwrite(); - cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1); - return *this; -} - -template<> -inline NRVec< complex > & -NRVec< complex >::operator-=(const NRVec< complex > &rhs) -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); -#endif - copyonwrite(); - cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1); - return *this; -} - -//and for general type template inline NRVec & NRVec::operator-=(const NRVec &rhs) { #ifdef DEBUG if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); +NOT_GPU(*this); +NOT_GPU(rhs); #endif copyonwrite(); int i; @@ -394,27 +428,10 @@ inline NRVec & NRVec::operator-=(const NRVec &rhs) // x *= a -template<> -inline NRVec & NRVec::operator*=(const double &a) -{ - copyonwrite(); - cblas_dscal(nn, a, v, 1); - return *this; -} - -template<> -inline NRVec< complex > & -NRVec< complex >::operator*=(const complex &a) -{ - copyonwrite(); - cblas_zscal(nn, &a, v, 1); - return *this; -} - -//and for general type template inline NRVec & NRVec::operator*=(const T &a) { +NOT_GPU(*this); copyonwrite(); int i; for(i=0; i & NRVec::operator*=(const T &a) // scalar product x.y -template<> -inline const double NRVec::operator*(const NRVec &rhs) const -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("dot of incompatible vectors"); -#endif - return cblas_ddot(nn, v, 1, rhs.v, 1); -} - - -template<> -inline const complex -NRVec< complex >::operator*(const NRVec< complex > &rhs) const -{ -#ifdef DEBUG - if (nn != rhs.nn) laerror("dot of incompatible vectors"); -#endif - complex dot; - cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot); - return dot; -} - template inline const T NRVec::operator*(const NRVec &rhs) const { #ifdef DEBUG if (nn != rhs.nn) laerror("dot of incompatible vectors"); +NOT_GPU(*this); +NOT_GPU(rhs); #endif T dot = 0; for(int i=0; i::operator*(const NRVec &rhs) const -// Sum of elements -template<> -inline const double NRVec::asum() const -{ - return cblas_dasum(nn, v, 1); -} - - -// Dot product: x * y -template<> -inline const double NRVec::dot(const double *y, const int stride) const -{ - return cblas_ddot(nn, y, stride, v, 1); -} -template<> -inline const complex -NRVec< complex >::dot(const complex *y, const int stride) const -{ - complex dot; - cblas_zdotc_sub(nn, y, stride, v, 1, &dot); - return dot; -} // x[i] returns i-th element template @@ -489,6 +464,7 @@ inline T & NRVec::operator[](const int i) if(_LA_count_check && *count != 1) laerror("possible lval [] with count > 1"); if(i < 0 || i >= nn) laerror("NRVec out of range"); if(!v) laerror("[] on unallocated NRVec"); +NOT_GPU(*this); #endif return v[i]; } @@ -498,6 +474,7 @@ inline const T & NRVec::operator[](const int i) const #ifdef DEBUG if(i < 0 || i >= nn) laerror("NRVec out of range"); if(!v) laerror("[] on unallocated NRVec"); +NOT_GPU(*this); #endif return v[i]; } @@ -527,29 +504,6 @@ inline NRVec::operator const T*() const return v; } -// return norm of the Vec -template<> -inline const double NRVec::norm() const -{ - return cblas_dnrm2(nn, v, 1); -} -template<> -inline const double NRVec< complex >::norm() const -{ - return cblas_dznrm2(nn, v, 1); -} - -// Max element of the array -template<> -inline const double NRVec::amax() const -{ - return v[cblas_idamax(nn, v, 1)]; -} -template<> -inline const complex NRVec< complex >::amax() const -{ - return v[cblas_izamax(nn, v, 1)]; -} // Make Vec unitvector @@ -576,7 +530,16 @@ NRVec::~NRVec() { if(!count) return; if(--(*count) <= 0) { - if(v) delete[] (v); + if(v) + { +#ifdef CUDALA + if(location==cpu) +#endif + delete[] (v); +#ifdef CUDALA + else gpufree(v); +#endif + } delete count; } } @@ -591,12 +554,29 @@ void NRVec::copyonwrite() (*count)--; count = new int; *count = 1; - T *newv = new T[nn]; - memcpy(newv, v, nn*sizeof(T)); + T *newv; +#ifdef CUDALA + if(location==cpu) + { +#endif + newv = new T[nn]; + memcpy(newv, v, nn*sizeof(T)); +#ifdef CUDALA + } + else + { + newv = (T *) gpualloc(nn*sizeof(T)); + if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem"); + cublasScopy(nn*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1); + } +#endif + + v = newv; } } + // Asignment template NRVec & NRVec::operator=(const NRVec &rhs) @@ -606,17 +586,29 @@ NRVec & NRVec::operator=(const NRVec &rhs) if(count) if(--(*count) == 0) { - delete[] v; +#ifdef CUDALA + if(location==cpu) +#endif + delete[] v; +#ifdef CUDALA + else + gpufree(v); +#endif delete count; } v = rhs.v; nn = rhs.nn; count = rhs.count; +#ifdef CUDALA + location=rhs.location; +#endif if(count) (*count)++; } return *this; } + + // Resize template void NRVec::resize(const int n) @@ -629,7 +621,17 @@ void NRVec::resize(const int n) if(n==0) { if(--(*count) <= 0) { - if(v) delete[] (v); + if(v) + { +#ifdef CUDALA + if(location==cpu) +#endif + delete[] (v); +#ifdef CUDALA + else + gpufree(v); +#endif + } delete count; } count=0; @@ -648,14 +650,33 @@ void NRVec::resize(const int n) count = new int; *count = 1; nn = n; - v = new T[nn]; +#ifdef CUDALA + if(location==cpu) +#endif + v = new T[nn]; +#ifdef CUDALA + else + v = (T*) gpualloc(nn*sizeof(T)); +#endif return; } // *count = 1 in this branch if (n != nn) { nn = n; - delete[] v; - v = new T[nn]; +#ifdef CUDALA + if(location==cpu) +#endif + { + delete[] v; + v = new T[nn]; + } +#ifdef CUDALA + else + { + gpufree(v); + v = (T*) gpualloc(nn*sizeof(T)); + } +#endif } } @@ -664,30 +685,18 @@ void NRVec::resize(const int n) template NRVec & NRVec::operator|=(const NRVec &rhs) { - if (this != &rhs) { #ifdef DEBUG if (!rhs.v) laerror("unallocated rhs in NRVec operator |="); #endif - if (count) - if (*count > 1) { - --(*count); - nn = 0; - count = 0; - v = 0; - } - if (nn != rhs.nn) { - if (v) delete[] (v); - nn = rhs.nn; - } - if(!v) v = new T[nn]; - if(!count) count = new int; - *count = 1; - memcpy(v, rhs.v, nn*sizeof(T)); - } - return *this; + if (this == &rhs) return *this; + *this = rhs; + this->copyonwrite(); + return *this; } + + template NRVec > complexify(const NRVec &rhs) { @@ -696,6 +705,291 @@ for(int i=0; i +void NRVec::moveto(const GPUID dest) +{ +if(location==dest) return; +location=dest; + +if(v && !count) laerror("internal inconsistency of reference counting 1"); +if (!count) return; + +if(v && *count==0) laerror("internal inconsistency of reference counting 2"); +if(!v) return; + +T *vold = v; + +if(dest == cpu) //moving from GPU to CPU + { + v = new T[nn]; + gpuget(nn,sizeof(T),vold,v); + if(*count == 1) gpufree(vold); + else {--(*count); count = new int(1);} + } +else //moving from CPU to GPU + { + v=(T *) gpualloc(nn*sizeof(T)); + gpuput(nn,sizeof(T),vold,v); + if(*count == 1) delete[] vold; + else {--(*count); count = new int(1);} + } +} +#endif + + +//some template specializations leading to BLAS/CUBLAS calls +template<> +inline +NRVec & NRVec::operator+=(const double &a) +{ + copyonwrite(); +#ifdef CUDALA + if(location==cpu) +#endif + cblas_daxpy(nn, 1.0, &a, 0, v, 1); +#ifdef CUDALA + else + { + double *d=gpuputdouble(a); + cublasDaxpy(nn, 1.0, d, 0, v, 1); + gpufree(d); + } +#endif + return *this; +} + +template<> +inline +NRVec< complex > & +NRVec< complex >::operator+=(const complex &a) +{ + copyonwrite(); +#ifdef CUDALA + if(location==cpu) +#endif + cblas_zaxpy(nn, &CONE, &a, 0, v, 1); +#ifdef CUDALA + else + { + complex *d=gpuputcomplex(a); + cublasZaxpy(nn, CUONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1); + gpufree(d); + } +#endif + return *this; +} + +template<> +inline +NRVec & NRVec::operator-=(const double &a) +{ + copyonwrite(); +#ifdef CUDALA + if(location==cpu) +#endif + cblas_daxpy(nn, -1.0, &a, 0, v, 1); +#ifdef CUDALA + else + { + double *d=gpuputdouble(a); + cublasDaxpy(nn, -1.0, d, 0, v, 1); + gpufree(d); + } +#endif + return *this; +} + +template<> +inline +NRVec< complex > & +NRVec< complex >::operator-=(const complex &a) +{ + copyonwrite(); +#ifdef CUDALA + if(location==cpu) +#endif + cblas_zaxpy(nn, &CMONE, &a, 0, v, 1); +#ifdef CUDALA + else + { + complex *d=gpuputcomplex(a); + cublasZaxpy(nn, CUMONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1); + gpufree(d); + } +#endif + return *this; +} + + +template<> +inline +NRVec & NRVec::operator+=(const NRVec &rhs) +{ +#ifdef DEBUG + if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); +#endif + copyonwrite(); + cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1); + return *this; +} +template<> +inline +NRVec< complex > & +NRVec< complex >::operator+=(const NRVec< complex > &rhs) +{ +#ifdef DEBUG + if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); +#endif + copyonwrite(); + cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1); + return *this; +} + + +template<> +inline +NRVec & NRVec::operator-=(const NRVec &rhs) +{ +#ifdef DEBUG + if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); +#endif +SAME_LOC(*this,rhs); + copyonwrite(); +#ifdef CUDALA + if(location==cpu) +#endif + cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1); +#ifdef CUDALA + else + cublasDaxpy(nn, -1.0, rhs.v, 1, v, 1); +#endif + return *this; +} + +template<> +inline +NRVec< complex > & +NRVec< complex >::operator-=(const NRVec< complex > &rhs) +{ +#ifdef DEBUG + if (nn != rhs.nn) laerror("daxpy of incompatible vectors"); +#endif + copyonwrite(); + cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1); + return *this; +} + +template<> +inline +NRVec & NRVec::operator*=(const double &a) +{ + copyonwrite(); + cblas_dscal(nn, a, v, 1); + return *this; +} + +template<> +inline +NRVec< complex > & +NRVec< complex >::operator*=(const complex &a) +{ + copyonwrite(); + cblas_zscal(nn, &a, v, 1); + return *this; +} + + +template<> +inline +const double NRVec::operator*(const NRVec &rhs) const +{ +#ifdef DEBUG + if (nn != rhs.nn) laerror("dot of incompatible vectors"); +#endif + return cblas_ddot(nn, v, 1, rhs.v, 1); +} + + +template<> +inline +const complex +NRVec< complex >::operator*(const NRVec< complex > &rhs) const +{ +#ifdef DEBUG + if (nn != rhs.nn) laerror("dot of incompatible vectors"); +#endif + complex dot; + cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot); + return dot; +} + +// Sum of elements +template<> +inline +const double NRVec::asum() const +{ + return cblas_dasum(nn, v, 1); +} + + +// Dot product: x * y +template<> +inline +const double NRVec::dot(const double *y, const int stride) const +{ + return cblas_ddot(nn, y, stride, v, 1); +} + +template<> +inline +const complex +NRVec< complex >::dot(const complex *y, const int stride) const +{ + complex dot; + cblas_zdotc_sub(nn, y, stride, v, 1, &dot); + return dot; +} + +// return norm of the Vec +template<> +inline +const double NRVec::norm() const +{ +#ifdef CUDALA + if(location!=cpu) return cublasDnrm2(nn, v, 1); +#endif + return cblas_dnrm2(nn, v, 1); +} + +template<> +inline +const double NRVec< complex >::norm() const +{ + return cblas_dznrm2(nn, v, 1); +} + +// Max element of the array +template<> +inline +const double NRVec::amax() const +{ + return v[cblas_idamax(nn, v, 1)]; +} + +/* +cblas_izamax seems to be missing at least in some cblas versions +template<> +inline +const complex NRVec< complex >::amax() const +{ + return v[cblas_izamax(nn, v, 1)]; +} +*/ + + + }//namespace #endif /* _LA_VEC_H_ */