diff --git a/ChangeLog b/ChangeLog index f3ef84f..538c841 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,33 @@
+04.11.2013 added conjugateme() for corder=1 in complex diagonalize() and
+gdiagonalize() to get correct eigenvector convention for C-storage
+30.10.2013 macros for consistent lowercase and uppercase of character
+parameters for case-sensitive BLAS and LAPACK; case-insensitivity of these in vec.cc and mat.cc (cublas still not treated)
+29.10.2013 included la_traits.h in nonclass.cc to get correct extern "C" of cblas
+14.10.2013 added operator% to bitvector
+01.06.2012 more efficient clear() by detachonly parameter to copyonwrite()
+14.03.2012 fixed overflow in product of dimensions of NRMat and NRSMat
+13.03.2012 symmetry of integrals with different spins added to fourindex.h
+23.02.2012 included unistd.h
+23.02.2012 fixed max read/write size in multiget and multiput to 1GB
+24.01.2012 Improved DIIS (L.V.)
+19.01.2012 Fixed location for gpu in nonclass.cc
+02.06.2011 In oplus() =0 replaced by clear() to work on non-square matrices (J.P.)
+01.02.2011 Added trace2 for complex matrices by L. Veis
+18.01.2011 Minor bugfixes and compatibility with Intel C++ compiler by Roman Curik
+28.12.2010 Generalized diagonalization and functions of general complex matrices
+21.12.2010 Changed to size_t in matrix put,get to prevent overflow
+08.12.2010 Deallocate method added to LA_traits, used for memory saving in matrix exp
+24.11.2010 Added checking for integer overflow in exptimes
+23.10.2010 Fixed dependency on atlas for static libraries in configure.ac
+27.09.2010 Seed of CSRMat class added.
+22.09.2010 Allowed formal operator[] on gpu matrices
+22.09.2010 Added submatrix() to SparseSMat
+22.09.2010 Zgerc and zgeru implemented in non-cblas version
+21.09.2010 Fixed a bug in laerror macro sometimes causing compilation problems
 08.09.2010 RELEASE 0.6
-08.09.2010 Doxygen documentation by M. Sulc
+08.09.2010 Doxygen documentation for NRVec, NRMat, NRSMat classes contributed by M. Sulc
 08.09.2010 Extended CUBLAS support for NRVec, NRMat, NRSMat classes contributed by M. Sulc
-08.09.2010 Minor bugfixes contributed by M. Sulc
+08.09.2010 Minor bugfixes and improvements contributed by M.
Sulc 25.06.2010 Added proof-of-concept CUBLAS support for NRVec, NRMat, NRSMat 24.06.2010 Fixed a memory leak existing when MATPTR was defined 18.06.2010 added autoconf support for BLAS+LAPACK compiled with 64-bit integers and for CUBLAS @@ -60,3 +86,4 @@ xx.08.2008 fixed wrong permutation symmetry in previously unused (and untested) 11.03.2008 added cblas_idamax replacement for non-cblas 05.03.2008 fixed transposed bug in inverse() with non-cblas 26.02.2008 INITIAL RELEASE 0.1 + diff --git a/bitvector.cc b/bitvector.cc index d61c0b9..0a2c779 100644 --- a/bitvector.cc +++ b/bitvector.cc @@ -130,7 +130,7 @@ return *this; /*number of ones in a binary number, from "Hacker's delight" book*/ #ifdef LONG_IS_32 -static unsigned long word_popul(unsigned long x) +static unsigned int word_popul(unsigned long x) { x -= ((x>>1)&0x55555555); x = (x&0x33333333) + ((x>>2)&0x33333333); @@ -140,9 +140,10 @@ x+= (x>>16); return x&0x3f; } #else -static unsigned long word_popul(unsigned long x) +//@@@@ use an efficient trick +static unsigned int word_popul(unsigned long x) { -unsigned long s=0; +unsigned int s=0; for(int i=0; i<64; ++i) { if(x&1) ++s; @@ -156,7 +157,6 @@ return s; unsigned int bitvector::population(const unsigned int before) const { -//@@@before int i; unsigned int s=0; for(i=0; i>(i%blockbits))&1;}; const bool assign(const unsigned int i, const bool r) {if(r) set(i); else reset(i); return r;}; - void clear() {copyonwrite(); memset(v,0,nn*sizeof(bitvector_block));}; + void clear() {copyonwrite(true); memset(v,0,nn*sizeof(bitvector_block));}; void fill() {memset(v,0xff,nn*sizeof(bitvector_block));}; bool operator!=(const bitvector &rhs) const; bool operator==(const bitvector &rhs) const {return !(*this != rhs);}; @@ -70,13 +70,13 @@ public: bitvector operator&(const bitvector &rhs) const {return bitvector(*this) &= rhs;}; bitvector operator|(const bitvector &rhs) const {return bitvector(*this) |= rhs;}; bitvector operator^(const bitvector &rhs) const {return bitvector(*this) ^= rhs;}; - unsigned int population(const unsigned int before=0) const; //@@@number of 1's + unsigned int operator%(const bitvector &y) const; //number of differing bits + unsigned int population(const unsigned int before=0) const; //number of 1's //extended, truncated const i.e. 
not on *this but return new entity, take care of modulo's bits //logical shifts <<= >>= << >> not implemented yet //logical rotations not implemented yet }; - extern std::ostream & operator<<(std::ostream &s, const bitvector &x); extern std::istream & operator>>(std::istream &s, bitvector &x); diff --git a/csrmat.cc b/csrmat.cc index 4682ebd..4f319ce 100644 --- a/csrmat.cc +++ b/csrmat.cc @@ -52,8 +52,9 @@ INSTANTIZE(complex) */ //// forced instantization of functions in the header in the corresponding object file -template class CSRMat; -template class CSRMat >; +//@@@@@template class CSRMat; +//@@@@template class CSRMat >; +//@@@@ unfinished class commented out diff --git a/fortran.h b/fortran.h index b91e11d..e2a3dc6 100644 --- a/fortran.h +++ b/fortran.h @@ -11,3 +11,6 @@ #undef FORINT #define FINT int #endif + +#define BLAS_FORTRANCASE(x) toupper(x) +#define LAPACK_FORTRANCASE(x) toupper(x) diff --git a/fourindex.h b/fourindex.h index 3b490e4..78ab995 100644 --- a/fourindex.h +++ b/fourindex.h @@ -104,10 +104,10 @@ __attribute__((packed)) //later add symmetry of complex integrals -typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5} fourindexsymtype; //only permutation-nonequivalent elements are stored +typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5, twoelectronrealmullikanAB=6 } fourindexsymtype; //only permutation-nonequivalent elements are stored // these should actually be static private members of the fourindex class, but leads to an ICE on gcc3.2 -static const int fourindex_n_symmetrytypes=6; -static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8}; +static const int fourindex_n_symmetrytypes=7; +static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8,4}; static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]= { {{0,1,2,3,1}}, @@ -116,6 +116,7 @@ static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]= {{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1}}, {{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1},{2,3,0,1,1},{3,2,0,1,-1},{2,3,1,0,-1},{3,2,1,0,1}}, {{0,1,2,3,1}}, //T2IjAb_aces is like nosymmetry but different index ranges + {{0,1,2,3,1},{1,0,2,3,1},{0,1,3,2,1},{1,0,3,2,1}}, }; @@ -669,6 +670,108 @@ typedef typename LA_traits::normtype normtype; //make it as a derived class in order to be able to use it in a base class context - "supermatrix" operations template +class fourindex_dense : public NRMat { +public: + fourindex_dense(): NRMat() {}; + explicit fourindex_dense(const int n): NRMat(n*(n+1)/2,n*(n+1)/2) {}; + fourindex_dense(const NRMat &rhs): NRMat(rhs) {}; //be able to convert the parent class transparently to this + fourindex_dense(const T &a, const int n): NRMat(a,n*(n+1)/2,n*(n+1)/2) {}; + fourindex_dense(const T *a, const int n): NRMat(a,n*(n+1)/2,n*(n+1)/2) {}; + //and also construct it from sparse and externally stored fourindex classes + //it seems not possible to nest template just for the two constructors + fourindex_dense(const fourindex &rhs); + fourindex_dense(const fourindex_ext &rhs); + + T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l); + const T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const; + void resize(const int n) {(*this).NRMat::resize(n*(n+1)/2,n*(n+1)/2);}; + void putext(int f, 
T thr=1e-15); + int nbas() const {return (int)std::sqrt(2*(*this).nrows());}; + +}; + +template +void fourindex_dense::putext(int f, T thr) +{ +T y; +for(int i=1; i<=nbas(); ++i) for(int j=1; j<=i; ++j) + for(int k=1; k<=nbas(); ++k) for(int l=1; l<=k; ++l) + if((y=abs((*this)(i,j,k,l))) > thr) + { + matel4stored x; + x.elem= y; + x.index.indiv.i=i; + x.index.indiv.j=j; + x.index.indiv.k=k; + x.index.indiv.l=l; + if(sizeof(matel4stored) != write(f,&x,sizeof(matel4stored)) ) + laerror("write error in putext"); + } +} + + +template +fourindex_dense::fourindex_dense(const fourindex &rhs) : NRMat((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2) +{ +if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch"); +typename fourindex::iterator p; +#ifdef DEBUG +unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j); +unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l); +if (IJ<0 || IJ>=(unsigned int)NRMat::nn || KL<0 || KL>=(unsigned int)NRMat::mm) laerror("fourindex_dense index out of range in constructor"); +#endif +for(p=rhs.begin(); p!= rhs.end(); ++p) (*this)(p->index.indiv.i,p->index.indiv.j,p->index.indiv.k,p->index.indiv.l) = p->elem; +} + +template +fourindex_dense::fourindex_dense(const fourindex_ext &rhs) : NRMat((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2) +{ +if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch"); +typename fourindex_ext::iterator p; +for(p=rhs.begin(); p!= rhs.end(); ++p) + { +#ifdef DEBUG +unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j); +unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l); +if (IJ<0 || IJ>=(unsigned int)NRMat::nn || KL<0 || KL>=(unsigned int)NRMat::mm) laerror("fourindex_dense index out of range in constructor"); +#endif + (*this)(p->index.indiv.i,p->index.indiv.j ,p->index.indiv.k,p->index.indiv.l) = p->elem; + } +} + + + +template +T& fourindex_dense::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) +{ +int I = SMat_index_1(i,j); +int J = SMat_index_1(k,l); +//I,J act as indices of a NRmat +#ifdef DEBUG + if (*NRMat::count != 1) laerror("lval (i,j,k,l) with count > 1 in fourindex_dense"); + if (I<0 || I>=NRMat::nn || J<0 || J>=NRMat::mm) laerror("fourindex_dense index out of range"); + if (!NRMat::v) laerror("access to unallocated fourindex_dense"); +#endif +return NRMat::operator()(I,J); +} + + +template +const T& fourindex_dense::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const +{ +int I = SMat_index_1(i,j); +int J = SMat_index_1(k,l); +//I,J act as indices of a NRSmat +#ifdef DEBUG + if (I<0 || I>=NRMat::nn || J<0 || J>=NRMat::mm) laerror("fourindex_dense index out of range"); + if (!NRMat::v) laerror("access to unallocated fourindex_dense"); +#endif +return NRMat::operator()(I,J); +} + + +//////////////////// +template class fourindex_dense : public NRSMat { public: fourindex_dense(): NRSMat() {}; diff --git a/la_traits.h b/la_traits.h index 12dcaf5..23ceab9 100644 --- a/la_traits.h +++ b/la_traits.h @@ -35,6 +35,7 @@ #include #include #include +#include //using namespace std; @@ -56,7 +57,7 @@ extern "C" { #include "noncblas.h" #else extern "C" { -#include "clapack.h" +#include "atlas/clapack.h" } #endif @@ -213,7 +214,7 @@ typedef C normtype; typedef C realtype; typedef complex complextype; static inline C sqrabs(const complex x) { return x.real()*x.real()+x.imag()*x.imag();} -static inline bool gencmp(const complex *x, 
const complex *y, int n) {return memcmp(x,y,n*sizeof(complex));} +static inline bool gencmp(const complex *x, const complex *y, size_t n) {return memcmp(x,y,n*sizeof(complex));} static bool bigger(const complex &x, const complex &y) {laerror("complex comparison undefined"); return false;} static bool smaller(const complex &x, const complex &y) {laerror("complex comparison undefined"); return false;} static inline normtype norm (const complex &x) {return std::abs(x);} @@ -225,9 +226,10 @@ static void multiget(size_t n,int fd, complex *x, bool dimensions=0) size_t total=0; size_t system_limit = (1L<<30)/sizeof(complex); //do not expect too much from the system and read at most 1GB at once ssize_t r; + size_t nn; do{ - r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex)); - if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "< system_limit ? system_limit : n-total)*sizeof(complex)); + if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<); if(r%sizeof(complex)) laerror("read error 2"); } @@ -238,16 +240,17 @@ static void multiput(size_t n, int fd, const complex *x, bool dimensions=0) size_t total=0; size_t system_limit = (1L<<30)/sizeof(complex); //do not expect too much from the system and write at most 1GB at once ssize_t r; + size_t nn; do{ - r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex)); - if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "< system_limit ? system_limit : n-total)*sizeof(complex)); + if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<); if(r%sizeof(complex)) laerror("write error 2"); } while(total < n); } -static void copy(complex *dest, complex *src, unsigned int n) {memcpy(dest,src,n*sizeof(complex));} -static void clear(complex *dest, unsigned int n) {memset(dest,0,n*sizeof(complex));} +static void copy(complex *dest, complex *src, size_t n) {memcpy(dest,src,n*sizeof(complex));} +static void clear(complex *dest, size_t n) {memset(dest,0,n*sizeof(complex));} static void copyonwrite(complex &x) {}; static void clearme(complex &x) {x=0;}; static void deallocate(complex &x) {}; @@ -266,7 +269,7 @@ typedef C normtype; typedef C realtype; typedef complex complextype; static inline C sqrabs(const C x) { return x*x;} -static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));} +static inline bool gencmp(const C *x, const C *y, size_t n) {return memcmp(x,y,n*sizeof(C));} static inline bool bigger(const C &x, const C &y) {return x>y;} static inline bool smaller(const C &x, const C &y) {return x system_limit ? system_limit : n-total)*sizeof(C)); - if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "< system_limit ? system_limit : n-total)*sizeof(C)); + if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "< system_limit ? system_limit : n-total)*sizeof(C)); - if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "< system_limit ? 
system_limit : n-total)*sizeof(C)); + if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "< producttype; \ typedef typename LA_traits::normtype normtype; \ typedef X::realtype> realtype; \ typedef X::complextype> complextype; \ -static bool gencmp(const C *x, const C *y, int n) {for(int i=0; iy;} \ static inline bool smaller(const C &x, const C &y) {return x &x) {return x.norm();} \ @@ -332,8 +337,8 @@ static void put(int fd, const X &x, bool dimensions=1, bool transp=0) {x.put( static void get(int fd, X &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions,transp);} \ static void multiput(size_t n,int fd, const X *x, bool dimensions=1) {for(size_t i=0; i *x, bool dimensions=1) {for(size_t i=0; i &x) {x.copyonwrite();}\ static void clearme(X &x) {x.clear();}\ static void deallocate(X &x) {x.dealloc();}\ @@ -359,7 +364,7 @@ typedef NRMat producttype; \ typedef typename LA_traits::normtype normtype; \ typedef X::realtype> realtype; \ typedef X::complextype> complextype; \ -static bool gencmp(const C *x, const C *y, int n) {for(int i=0; iy;} \ static inline bool smaller(const C &x, const C &y) {return x &x) {return x.norm();} \ @@ -368,8 +373,8 @@ static void put(int fd, const X &x, bool dimensions=1, bool transp=0) {x.put( static void get(int fd, X &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions);} \ static void multiput(size_t n,int fd, const X *x, bool dimensions=1) {for(size_t i=0; i *x, bool dimensions=1) {for(size_t i=0; i &x) {x.copyonwrite();} \ static void clearme(X &x) {x.clear();} \ static void deallocate(X &x) {x.dealloc();} \ diff --git a/laerror.cc b/laerror.cc index 99be356..5dd8816 100644 --- a/laerror.cc +++ b/laerror.cc @@ -87,7 +87,26 @@ extern "C" void ATL_xerbla(int p, char *rout, char *form, ...){ laerror(msg0); } -int cblas_errprn(int ierr, int info, char *form, ...) { +#ifndef NONCBLAS +#include "cblas.h" +#include +extern "C" void cblas_xerbla(int p, const char *rout, const char *form, ...) +{ + va_list argptr; + + va_start(argptr, form); + if (p) + { + fprintf(stdout, "Parameter %d to routine %s was incorrect\n", p, rout); + fprintf(stderr, "Parameter %d to routine %s was incorrect\n", p, rout); + } + vfprintf(stdout, form, argptr); + vfprintf(stderr, form, argptr); + va_end(argptr); + laerror("terminating in cblas_xerbla"); +} + +extern "C" int cblas_errprn(int ierr, int info, char *form, ...) { char msg0[1024], *msg; va_list argptr; va_start(argptr, form); @@ -98,5 +117,6 @@ int cblas_errprn(int ierr, int info, char *form, ...) 
{ laerror(msg0); return 0; } +#endif }//namespace diff --git a/mat.cc b/mat.cc index 210d77d..62a9f5e 100644 --- a/mat.cc +++ b/mat.cc @@ -26,11 +26,8 @@ #include #include #include +#include -extern "C" { - extern ssize_t read(int, void *, size_t); - extern ssize_t write(int, const void *, size_t); -} namespace LA { @@ -77,14 +74,14 @@ const NRMat NRMat::otimes(const NRMat &rhs, bool reversecolumns) const { T c = (*this)(i,j); for(k=0;k NRMat::row(const int i, int l) const { #ifdef MATPTR v[i] #else - v + i*l + v + i*(size_t)l #endif , l); return r; @@ -144,7 +141,7 @@ void NRMat::put(int fd, bool dim, bool transp) const { #ifdef MATPTR v[i][j] #else - v[i*mm+j] + v[i*(size_t)mm+j] #endif ,dim ,transp); } @@ -196,7 +193,7 @@ void NRMat::get(int fd, bool dim, bool transp){ #ifdef MATPTR v[i][j] #else - v[i*mm+j] + v[i*(size_t)mm+j] #endif ,dim,transp); } @@ -476,13 +473,13 @@ NRMat & NRMat::operator-=(const T &a) { ******************************************************************************/ template <> const NRMat NRMat::operator-() const { - const int nm = nn*mm; + const size_t nm = (size_t)nn*mm; NRMat result(nn, mm, getlocation()); #ifdef CUDALA if(location == cpu) { #endif #ifdef MATPTR - for(register int i=0; i NRMat::operator-() const { ******************************************************************************/ template <> const NRMat > NRMat >::operator-() const { - const int nm = nn*mm; + const size_t nm = (size_t)nn*mm; NRMat > result(nn, mm, getlocation()); #ifdef CUDALA if(location == cpu) { #endif #ifdef MATPTR - for(register int i=0; i)); cblas_zscal(nm, &CMONE, result.v, 1); @@ -539,9 +536,9 @@ const NRMat NRMat::operator-() const { NRMat result(nn, mm, getlocation()); #ifdef MATPTR - for(register int i=0; i NRMat::operator&(const NRMat &b) const { if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem"); for(register int i=0; i NRMat::operator|(const NRMat &b) const { for (int j=0; j NRMat::rsum() const { #ifdef CUDALA }else{ for(register int i=0;i > NRMat >::rsum() const { #ifdef CUDALA }else{ for(register int i=0;i NRMat::submatrix(const int fromrow, const int torow, const int #ifdef MATPTR memcpy(r.v[i - fromrow], v[i] + fromcol, m*sizeof(T)); #else - memcpy(r.v+(i - fromrow)*m, v + i*mm + fromcol, m*sizeof(T)); + memcpy(r.v+(i - fromrow)*m, v + i*(size_t)mm + fromcol, m*sizeof(T)); #endif } #ifdef CUDALA }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); for(register int i=fromrow; i<=torow; ++i){ - cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1); + cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*(size_t)mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1); TEST_CUBLAS("cublasScopy"); } } @@ -786,13 +783,13 @@ void NRMat::storesubmatrix(const int fromrow, const int fromcol, const NRMat #ifdef MATPTR memcpy(v[i] + fromcol, rhs.v[i - fromrow], m*sizeof(T)); #else - memcpy(v + i*mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T)); + memcpy(v + i*(size_t)mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T)); #endif #ifdef CUDALA }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); - cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*mm + fromcol), 1); + cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*(size_t)mm + fromcol), 1); } #endif } @@ -821,8 +818,8 @@ NRMat& NRMat::transposeme(const int _n) { v[j][i] = tmp; #else 
register int a, b; - a = i*mm + j; - b = j*mm + i; + a = i*(size_t)mm + j; + b = j*(size_t)mm + i; T tmp = v[a]; v[a] = v[b]; v[b] = tmp; @@ -847,7 +844,7 @@ NRMat& NRMat::transposeme(const int _n) { ******************************************************************************/ template<> NRMat >::NRMat(const NRMat &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) { - const int nn_mm = nn*mm; + const size_t nn_mm = (size_t)nn*mm; #ifdef CUDALA if(location == cpu){ #endif @@ -888,7 +885,7 @@ NRMat >::NRMat(const NRMat &rhs, bool imagpart): nn(rhs. ******************************************************************************/ template<> NRMat::NRMat(const NRMat > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) { - const int nn_mm = nn*mm; + const size_t nn_mm = (size_t) nn*mm; #ifdef CUDALA if(location == cpu){ #endif @@ -1079,7 +1076,7 @@ const NRSMat NRMat::timestransposed() const { #ifdef MATPTR r(i, j) = cblas_ddot(mm, v[i], 1, v[j], 1); #else - r(i, j) = cblas_ddot(mm, v + i*mm, 1, v + j*mm, 1); + r(i, j) = cblas_ddot(mm, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1); #endif } } @@ -1087,7 +1084,7 @@ const NRSMat NRMat::timestransposed() const { }else{ for(i=0; i > NRMat >::timestransposed() const #ifdef MATPTR cblas_zdotc_sub(nn, v[i], 1, v[j], 1, &r(i,j)); #else - cblas_zdotc_sub(nn, v + i*mm, 1, v + j*mm, 1, &r(i,j)); + cblas_zdotc_sub(nn, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1, &r(i,j)); #endif } } @@ -1121,7 +1118,7 @@ const NRSMat > NRMat >::timestransposed() const }else{ for(i=0; i*> (&val)); } @@ -1172,7 +1169,7 @@ void NRMat::randomize(const double &x) { }else{ NRMat tmp(nn, mm, cpu); double *tmp_data = tmp; - for(register int i=0; ilocation); @@ -1203,7 +1200,7 @@ void NRMat >::randomize(const double &x) { }else{ NRMat > tmp(nn, mm, cpu); complex *tmp_data = tmp; - for(register int i=0; i(re, im); @@ -1226,10 +1223,10 @@ NRMat& NRMat::operator*=(const double &a) { #ifdef CUDALA if(location == cpu){ #endif - cblas_dscal(nn*mm, a, *this, 1); + cblas_dscal((size_t)nn*mm, a, *this, 1); #ifdef CUDALA }else{ - cublasDscal(nn*mm, a, v, 1); + cublasDscal((size_t)nn*mm, a, v, 1); TEST_CUBLAS("cublasDscal"); } #endif @@ -1249,11 +1246,11 @@ NRMat >::operator*=(const complex &a) { #ifdef CUDALA if(location == cpu){ #endif - cblas_zscal(nn*mm, &a, (*this)[0], 1); + cblas_zscal((size_t)nn*mm, &a, (*this)[0], 1); #ifdef CUDALA }else{ const cuDoubleComplex fac = *(reinterpret_cast (&a)); - cublasZscal(nn*mm, fac, (cuDoubleComplex *)v, 1); + cublasZscal((size_t)nn*mm, fac, (cuDoubleComplex *)v, 1); TEST_CUBLAS("cublasZscal"); } #endif @@ -1271,9 +1268,9 @@ NRMat & NRMat::operator*=(const T &a) { NOT_GPU(*this); copyonwrite(); #ifdef MATPTR - for(register int i=0; i< nn*mm; i++) v[0][i] *= a; + for(register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= a; #else - for(register int i=0; i< nn*mm; i++) v[i] *= a; + for(register size_t i=0; i< (size_t)nn*mm; i++) v[i] *= a; #endif return *this; } @@ -1294,10 +1291,10 @@ NRMat & NRMat::operator+=(const NRMat &rhs) { #ifdef CUDALA if(location == cpu){ #endif - cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1); + cblas_daxpy((size_t)nn*mm, 1.0, rhs, 1, *this, 1); #ifdef CUDALA }else{ - cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1); + cublasDaxpy((size_t)nn*mm, 1.0, rhs, 1, v, 1); TEST_CUBLAS("cublasDaxpy"); } #endif @@ -1320,10 +1317,10 @@ NRMat >::operator+=(const NRMat< complex > &rhs) { #ifdef CUDALA if(location == cpu){ #endif - cblas_zaxpy(nn*mm, &CONE, rhs[0], 1, (*this)[0], 1); + cblas_zaxpy((size_t)nn*mm, 
&CONE, rhs[0], 1, (*this)[0], 1); #ifdef CUDALA }else{ - cublasZaxpy(nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); + cublasZaxpy((size_t)nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); } #endif return *this; @@ -1345,9 +1342,9 @@ NRMat & NRMat::operator+=(const NRMat &rhs) { copyonwrite(); #ifdef MATPTR - for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i]; + for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i]; #else - for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i]; + for(size_t i=0; i< (size_t)nn*mm; i++) v[i] += rhs.v[i]; #endif return *this; } @@ -1368,10 +1365,10 @@ NRMat & NRMat::operator-=(const NRMat &rhs) { #ifdef CUDALA if(location == cpu){ #endif - cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1); + cblas_daxpy((size_t)nn*mm, -1.0, rhs, 1, *this, 1); #ifdef CUDALA }else{ - cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1); + cublasDaxpy((size_t)nn*mm, -1.0, rhs, 1, v, 1); } #endif return *this; @@ -1395,10 +1392,10 @@ NRMat< complex >::operator-=(const NRMat< complex > &rhs) { #ifdef CUDALA if(location == cpu){ #endif - cblas_zaxpy(nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1); + cblas_zaxpy((size_t)nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1); #ifdef CUDALA }else{ - cublasZaxpy(nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); + cublasZaxpy((size_t)nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); } #endif return *this; @@ -1421,9 +1418,9 @@ NRMat & NRMat::operator-=(const NRMat &rhs) { copyonwrite(); #ifdef MATPTR - for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i]; + for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i]; #else - for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i]; + for(size_t i=0; i<(size_t) nn*mm; i++) v[i] += rhs.v[i]; #endif return *this; } @@ -1693,10 +1690,10 @@ const double NRMat::dot(const NRMat &rhs) const { #ifdef CUDALA if(location == cpu){ #endif - ret = cblas_ddot(nn*mm, (*this)[0], 1, rhs[0], 1); + ret = cblas_ddot((size_t)nn*mm, (*this)[0], 1, rhs[0], 1); #ifdef CUDALA }else{ - ret = cublasDdot(nn*mm, v, 1, rhs.v, 1); + ret = cublasDdot((size_t)nn*mm, v, 1, rhs.v, 1); } #endif return ret; @@ -1721,10 +1718,10 @@ NRMat >::dot(const NRMat > &rhs) const { #ifdef CUDALA if(location == cpu){ #endif - cblas_zdotc_sub(nn*mm, (*this)[0], 1, rhs[0], 1, &ret); + cblas_zdotc_sub((size_t)nn*mm, (*this)[0], 1, rhs[0], 1, &ret); #ifdef CUDALA }else{ - cuDoubleComplex val = cublasZdotc(nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1); + cuDoubleComplex val = cublasZdotc((size_t)nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1); ret = *(reinterpret_cast*> (&val)); } #endif @@ -1804,7 +1801,7 @@ void NRMat::diagmultl(const NRVec &rhs) { for(register int i=0; i >::diagmultl(const NRVec< complex > &rhs) { }else{ for(register int i=0; i::operator*(const NRSMat &rhs) const { #ifdef CUDALA }else{ for(register int i=0; i >::operator*(const NRSMat< complex > &rhs) const #ifdef CUDALA }else{ for(register int i=0; i >& NRMat >::conjugateme() { #ifdef CUDALA if(location == cpu){ #endif - cblas_dscal(mm*nn, -1.0, (double *)((*this)[0]) + 1, 2); + cblas_dscal((size_t)mm*nn, -1.0, (double *)((*this)[0]) + 1, 2); #ifdef CUDALA }else{ - cublasDscal(mm*nn, -1.0, (double *)(this->v) + 1, 2); + cublasDscal((size_t)mm*nn, -1.0, (double *)(this->v) + 1, 2); } #endif return *this; @@ -2048,12 +2045,12 @@ void NRMat::gemm(const double &beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const double &alpha) { - int 
k(transa=='n'?a.mm:a.nn); + int k(tolower(transa)=='n'?a.mm:a.nn); #ifdef DEBUG - int l(transa=='n'?a.nn:a.mm); - int kk(transb=='n'?b.nn:b.mm); - int ll(transb=='n'?b.mm:b.nn); + int l(tolower(transa)=='n'?a.nn:a.mm); + int kk(tolower(transb)=='n'?b.nn:b.mm); + int ll(tolower(transb)=='n'?b.mm:b.nn); if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat::gemm(...)"); if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm"); #endif @@ -2066,8 +2063,8 @@ void NRMat::gemm(const double &beta, const NRMat &a, #ifdef CUDALA if(location == cpu){ #endif - cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans), - (transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a, + cblas_dgemm(CblasRowMajor, (tolower(transa)=='n' ? CblasNoTrans : CblasTrans), + (tolower(transb)=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a, a.mm, b , b.mm, beta, *this , mm); #ifdef CUDALA }else{ @@ -2083,20 +2080,20 @@ void NRMat >::gemm(const complex & beta, const NRMat > & b, const char transb, const complex & alpha) { - int k(transa=='n'?a.mm:a.nn); + int k(tolower(transa)=='n'?a.mm:a.nn); #ifdef DEBUG - int l(transa=='n'?a.nn:a.mm); - int kk(transb=='n'?b.nn:b.mm); - int ll(transb=='n'?b.mm:b.nn); + int l(tolower(transa)=='n'?a.nn:a.mm); + int kk(tolower(transb)=='n'?b.nn:b.mm); + int ll(tolower(transb)=='n'?b.mm:b.nn); if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat >::gemm(...)"); #endif if (alpha==CZERO && beta==CONE) return; copyonwrite(); cblas_zgemm(CblasRowMajor, - (transa=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)), - (transb=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)), + (tolower(transa)=='n' ? CblasNoTrans : (tolower(transa)=='c'?CblasConjTrans:CblasTrans)), + (tolower(transb)=='n' ? 
CblasNoTrans : (tolower(transb)=='c'?CblasConjTrans:CblasTrans)), nn, mm, k, &alpha, a , a.mm, b , b.mm, &beta, *this , mm); } @@ -2113,10 +2110,10 @@ const double NRMat::norm(const double scalar) const { #ifdef CUDALA if(location == cpu){ #endif - return cblas_dnrm2(nn*mm, (*this)[0], 1); + return cblas_dnrm2((size_t)nn*mm, (*this)[0], 1); #ifdef CUDALA }else{ - return cublasDnrm2(nn*mm, v, 1); + return cublasDnrm2((size_t)nn*mm, v, 1); } #endif } @@ -2130,7 +2127,7 @@ const double NRMat::norm(const double scalar) const { #ifdef MATPTR tmp = v[i][j]; #else - tmp = v[i*mm+j]; + tmp = v[i*(size_t)mm+j]; #endif if(i == j) tmp -= scalar; sum += tmp*tmp; @@ -2152,10 +2149,10 @@ const double NRMat >::norm(const complex scalar) const { #ifdef CUDALA if(location == cpu){ #endif - return cblas_dznrm2(nn*mm, (*this)[0], 1); + return cblas_dznrm2((size_t)nn*mm, (*this)[0], 1); #ifdef CUDALA }else{ - return cublasDznrm2(nn*mm, (cuDoubleComplex*)v, 1); + return cublasDznrm2((size_t)nn*mm, (cuDoubleComplex*)v, 1); } #endif } @@ -2168,7 +2165,7 @@ const double NRMat >::norm(const complex scalar) const { #ifdef MATPTR tmp = v[i][j]; #else - tmp = v[i*mm+j]; + tmp = v[i*(size_t)mm+j]; #endif if(i == j) tmp -= scalar; const double re = tmp.real(); @@ -2195,10 +2192,10 @@ void NRMat::axpy(const double alpha, const NRMat &mat) { #ifdef CUDALA if(location == cpu){ #endif - cblas_daxpy(nn*mm, alpha, mat, 1, *this, 1); + cblas_daxpy((size_t)nn*mm, alpha, mat, 1, *this, 1); #ifdef CUDALA }else{ - cublasDaxpy(nn*mm, alpha, mat, 1, *this, 1); + cublasDaxpy((size_t)nn*mm, alpha, mat, 1, *this, 1); } #endif } @@ -2221,7 +2218,7 @@ void NRMat >::axpy(const complex alpha, #ifdef CUDALA if(location == cpu){ #endif - cblas_zaxpy(nn*mm, &alpha, mat, 1, (*this)[0], 1); + cblas_zaxpy((size_t)nn*mm, &alpha, mat, 1, (*this)[0], 1); #ifdef CUDALA }else{ const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag()); @@ -2245,7 +2242,7 @@ const T NRMat::trace() const { #ifdef MATPTR for(register int i=0; i& NRMat::swap_rows(){ #ifdef CUDALA }else{ for(register int i=0; i >& NRMat >::swap_rows(){ #ifdef CUDALA }else{ for(register int i=0; i& NRMat::swap_rows(){ }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat::swap_rows"); for(register int i=0; i& NRMat::swap_rows_cols(){ #ifdef CUDALA }else{ for(register int i=0; i >& NRMat >::swap_rows_cols(){ #ifdef CUDALA }else{ for(register int i=0;i NRMat& NRMat::swap_rows_cols(){ const int n_pul = nn >> 1; const int m_pul = mm >> 1; - const int dim = nn*mm; + const size_t dim = (size_t)nn*mm; T *data_ptr; T tmp; @@ -2837,7 +2834,7 @@ NRMat& NRMat::swap_rows_cols(){ }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat::swap_rows_cols"); for(register int i=0; i; friend class NRSMat; @@ -89,16 +89,16 @@ public: //! 
explicit constructor converting vector into a NRMat object #ifdef MATPTR explicit NRMat(const NRVec &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){ - if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); + if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); }; #else explicit NRMat(const NRVec &rhs, const int n, const int m, const int offset = 0); #endif #ifdef MATPTR - const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v[0],rhs.v[0],nn*mm);} //memcmp for scalars else elementwise + const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v[0],rhs.v[0],(size_t)nn*mm);} //memcmp for scalars else elementwise #else - const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v,rhs.v,nn*mm);} //memcmp for scalars else elementwise + const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v,rhs.v,(size_t)nn*mm);} //memcmp for scalars else elementwise #endif const bool operator==(const NRMat &rhs) const {return !(*this != rhs);}; @@ -107,7 +107,7 @@ public: inline int getcount() const {return count?*count:0;} //! ensure that the data of this matrix are referenced exactly once - void copyonwrite(); + void copyonwrite(bool detachonly=false); /***************************************************************************//** * routines for CUDA related stuff @@ -260,7 +260,7 @@ public: //! get the number of columns inline int ncols() const; //! get the number of matrix elements - inline int size() const; + inline size_t size() const; //! unformatted input void get(int fd, bool dimensions = 1, bool transposed = false); @@ -274,8 +274,8 @@ public: //! 
set all matrix elements equal to zero void clear(){ if(nn&&mm){ - copyonwrite(); - LA_traits::clear((*this)[0], nn*mm); + copyonwrite(true); + LA_traits::clear((*this)[0], (size_t)nn*mm); } }; @@ -379,7 +379,7 @@ template NRMat::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) { T* p; *count = 1; - const int nm = n*m; + const size_t nm = (size_t)n*m; #ifdef CUDALA location = (loc==undefined?DEFAULT_LOC:loc); if(location == cpu) { @@ -408,7 +408,7 @@ NRMat::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count ******************************************************************************/ template NRMat::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) { - const int nm = n*m; + const size_t nm = (size_t)n*m; T *p; *count = 1; @@ -447,7 +447,7 @@ NRMat::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), ******************************************************************************/ template NRMat::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) { - const int nm = n*m; + const size_t nm = (size_t)n*m; T *p; *count = 1; @@ -460,7 +460,7 @@ NRMat::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new p = v[0] = new T[nm]; for (register int i=1; i::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new ******************************************************************************/ template NRMat::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) { - const int nm = n*m; + const size_t nm = (size_t)n*m; #ifdef CUDALA location = DEFAULT_LOC; #endif @@ -546,10 +546,10 @@ NRMat::NRMat(const NRSMat &rhs) { *count = 1; #ifdef MATPTR v = new T*[nn]; - v[0] = new T[mm*nn]; + v[0] = new T[(size_t)mm*nn]; for (int i=1; i::NRMat(const NRSMat &rhs) { #else for (i=0; i::NRMat(const NRSMat &rhs) { template NRMat::NRMat(const NRVec &rhs, const int n, const int m, const int offset) { - if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); + if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); #ifdef CUDALA location=rhs.location; @@ -628,7 +628,7 @@ inline T* NRMat::operator[](const int i) { #ifdef MATPTR return v[i]; #else - return v + i*mm; + return v + i*(size_t)mm; #endif } @@ -646,7 +646,7 @@ inline const T* NRMat::operator[](const int i) const { #ifdef MATPTR return v[i]; #else - return v + i*mm; + return v + i*(size_t)mm; #endif } @@ -668,7 +668,7 @@ inline T& NRMat::operator()(const int i, const int j){ #ifdef MATPTR return v[i][j]; #else - return v[i*mm + j]; + return v[i*(size_t)mm + j]; #endif } @@ -689,7 +689,7 @@ inline const T& NRMat::operator()(const int i, const int j) const{ #ifdef MATPTR return v[i][j]; #else - return v[i*mm + j]; + return v[i*(size_t)mm + j]; #endif } @@ -712,11 +712,11 @@ inline const T NRMat::get_ij(const int i, const int j) const{ #ifdef MATPTR return v[i][j]; #else - return v[i*mm + j]; + return v[i*(size_t)mm + j]; #endif #ifdef CUDALA }else{ - const int pozice = i*mm + j; + const size_t pozice = i*(size_t)mm + j; gpuget(1, sizeof(T), v + pozice, &ret); return ret; } @@ -743,8 +743,8 @@ inline int NRMat::ncols() const{ * @return number of elements ******************************************************************************/ template -inline int NRMat::size() const{ - return nn*mm; +inline size_t NRMat::size() const{ + return (size_t)nn*mm; } 
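A minimal standalone sketch (not from the library; the dimensions are chosen only for illustration) of the 32-bit overflow that the size() change above and the many (size_t)nn*mm casts throughout this diff guard against: multiplying two ints first and only then widening loses the high bits.

// Hypothetical, self-contained illustration of why the (size_t) cast is applied
// to one factor before the multiplication; 50000 is an arbitrary example size.
#include <cstdio>
#include <cstddef>

int main()
{
    int nn = 50000, mm = 50000;           // 2.5e9 elements: fits in size_t,
                                          // but exceeds the 32-bit int range
    size_t nm = (size_t)nn * mm;          // mm is converted to size_t, so the
                                          // product is computed in the wide type
    // size_t bad = nn * mm;              // would multiply two ints first and overflow
    std::printf("%zu elements\n", nm);    // prints 2500000000
    return 0;
}

The same pattern appears in the row addressing fixed throughout this diff, e.g. v + i*(size_t)mm, where the offset i*mm alone can already exceed the int range for large matrices.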
/***************************************************************************//** @@ -795,7 +795,7 @@ inline const double NRMat::amax() const{ #ifdef CUDALA }else{ double ret(0.0); - const int pozice = cublasIdamax(nn*mm, v, 1) - 1; + const size_t pozice = cublasIdamax((size_t)nn*mm, v, 1) - 1; TEST_CUBLAS("cublasIdamax"); gpuget(1, sizeof(double), v + pozice, &ret); return ret; @@ -815,7 +815,7 @@ inline const double NRMat::amin() const{ if(location == cpu){ #endif // idamin seems not to be supported - const int nm = nn*mm; + const size_t nm = (size_t)nn*mm; double val(0.0); int index(-1); ret = std::numeric_limits::max(); @@ -834,7 +834,7 @@ inline const double NRMat::amin() const{ #endif #ifdef CUDALA }else{ - const int pozice = cublasIdamin(nn*mm, v, 1) - 1; + const size_t pozice = cublasIdamin((size_t)nn*mm, v, 1) - 1; TEST_CUBLAS("cublasIdamin"); gpuget(1, sizeof(double), v + pozice, &ret); } @@ -860,7 +860,7 @@ inline const complex NRMat >::amax() const{ #ifdef CUDALA }else{ complex ret(0.0, 0.0); - const int pozice = cublasIzamax(nn*mm, (cuDoubleComplex*)v, 1) - 1; + const size_t pozice = cublasIzamax((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1; TEST_CUBLAS("cublasIzamax"); gpuget(1, sizeof(complex), v + pozice, &ret); return ret; @@ -881,7 +881,7 @@ inline const complex NRMat >::amin() const{ if(location == cpu){ #endif // idamin seems not to be supported - const int nm = nn*mm; + const size_t nm = (size_t)nn*mm; int index(-1); double val(0.0), min_val(0.0); complex z_val(0.0, 0.0); @@ -903,7 +903,7 @@ inline const complex NRMat >::amin() const{ #endif #ifdef CUDALA }else{ - const int pozice = cublasIzamin(nn*mm, (cuDoubleComplex*)v, 1) - 1; + const size_t pozice = cublasIzamin((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1; TEST_CUBLAS("cublasIzamin"); gpuget(1, sizeof(complex), v + pozice, &ret); } @@ -991,7 +991,7 @@ NRMat & NRMat::operator|=(const NRMat &rhs) { * @see NRMat::count, NRMat::operator|=() ******************************************************************************/ template -void NRMat::copyonwrite() { +void NRMat::copyonwrite(bool detachonly) { if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0"); if(*count > 1){ (*count)--; @@ -1002,20 +1002,20 @@ void NRMat::copyonwrite() { #endif #ifdef MATPTR T **newv = new T*[nn]; - newv[0] = new T[mm*nn]; - memcpy(newv[0], v[0], mm*nn*sizeof(T)); + newv[0] = new T[(size_t)mm*nn]; + if(!detachonly) memcpy(newv[0], v[0], (size_t)mm*nn*sizeof(T)); v = newv; for(register int i=1; i::resize(int n, int m) { #endif #ifdef MATPTR v = new T*[nn]; - v[0] = new T[m*n]; + v[0] = new T[(size_t)m*n]; for (register int i=1; i< n; i++) v[i] = v[i-1] + m; #else - v = new T[m*n]; + v = new T[(size_t)m*n]; #endif #ifdef CUDALA }else{ - v = (T *) gpualloc(n*m*sizeof(T)); + v = (T *) gpualloc((size_t)n*m*sizeof(T)); } #endif return; @@ -1108,15 +1108,15 @@ void NRMat::resize(int n, int m) { delete[] v; #ifdef MATPTR v = new T*[nn]; - v[0] = new T[m*n]; + v[0] = new T[(size_t)m*n]; for (int i=1; i< n; i++) v[i] = v[i-1] + m; #else - v = new T[m*n]; + v = new T[(size_t)m*n]; #endif #ifdef CUDALA }else{ gpufree(v); - v=(T *) gpualloc(n*m*sizeof(T)); + v=(T *) gpualloc((size_t)n*m*sizeof(T)); } #endif } @@ -1228,7 +1228,7 @@ public: #ifdef MATPTR return NRMat::v[i - 1][j - 1]; #else - return NRMat::v[(i-1)*NRMat::mm+j-1]; + return NRMat::v[(i-1)*(size_t)NRMat::mm+j-1]; #endif } @@ -1258,11 +1258,11 @@ public: #ifdef MATPTR return NRMat::v[i - 1][j - 1]; #else - return NRMat::v[(i-1)*NRMat::mm + (j-1)]; + return 
NRMat::v[(size_t)(i-1)*NRMat::mm + (j-1)]; #endif #ifdef CUDALA }else{ - const int pozice = (i-1)*NRMat::mm + (j-1); + const size_t pozice = (size_t)(i-1)*NRMat::mm + (j-1); gpuget(1, sizeof(T), NRMat::v + pozice, &ret); return ret; } @@ -1286,10 +1286,10 @@ NRMat& NRMat::operator^=(const NRMat &rhs){ copyonwrite();// ensure that *count == 1 #ifdef MATPTR - for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i]; + for (register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= rhs.v[0][i]; #else - const int Dim = nn*mm; - for(register int i=0; i::moveto(const GPUID dest) { T *vold = v; if(dest == cpu){ //moving from GPU to CPU - v = new T[nn*mm]; - gpuget(nn*mm, sizeof(T), vold, v); + v = new T[(size_t)nn*mm]; + gpuget((size_t)nn*mm, sizeof(T), vold, v); if(*count == 1){ gpufree(vold); } else{ --(*count); count = new int(1); } }else{ //moving from CPU to GPU - v = (T *) gpualloc(nn*mm*sizeof(T)); - gpuput(nn*mm, sizeof(T), vold, v); + v = (T *) gpualloc((size_t)nn*mm*sizeof(T)); + gpuput((size_t)nn*mm, sizeof(T), vold, v); if(*count == 1) delete[] vold; else{ --(*count); count = new int(1);} } @@ -1351,3 +1351,4 @@ NRVECMAT_OPER2(Mat, -) }//end of the LA-namespace #endif/* _LA_MAT_H_ */ + diff --git a/noncblas.cc b/noncblas.cc index 6cb958d..923be5b 100644 --- a/noncblas.cc +++ b/noncblas.cc @@ -195,13 +195,14 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, { if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(Uplo!=CblasLower) laerror("CblasLower uplo asserted"); + char U = BLAS_FORTRANCASE('u'); #ifdef FORINT const FINT ntmp=N; const FINT incxtmp=incX; const FINT incytmp=incY; - FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp); + FORNAME(dspmv) (&U,&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp); #else - FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY); + FORNAME(dspmv) (&U,&N, &alpha, Ap, X, &incX, &beta, Y, &incY); #endif } @@ -214,13 +215,14 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, { if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(Uplo!=CblasLower) laerror("CblasLower uplo asserted"); + char U = BLAS_FORTRANCASE('u'); #ifdef FORINT const FINT ntmp=N; const FINT incxtmp=incX; const FINT incytmp=incY; - FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp); + FORNAME(zhpmv) (&U,&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp); #else - FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY); + FORNAME(zhpmv) (&U,&N, alpha, Ap, X, &incX, beta, Y, &incY); #endif } @@ -298,6 +300,8 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA { if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); //swap a-b, m-n + char transb = BLAS_FORTRANCASE(TransB==CblasNoTrans?'N':'T'); + char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T'); #ifdef FORINT const FINT mtmp=M; const FINT ntmp=N; @@ -305,10 +309,10 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA const FINT ldatmp=lda; const FINT ldbtmp=ldb; const FINT ldctmp=ldc; - FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", + FORNAME(dgemm) (&transb,&transa, &ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp); #else - FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", + FORNAME(dgemm) (&transb,&transa, &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); #endif } @@ -322,6 +326,8 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, 
const enum CBLAS_TRANSPOSE TransA { if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); //swap a-b, m-n + char transb = BLAS_FORTRANCASE(TransB==CblasConjTrans?'C':(TransB==CblasNoTrans?'N':'T')); + char transa = BLAS_FORTRANCASE(TransA==CblasConjTrans?'C':(TransA==CblasNoTrans?'N':'T')); #ifdef FORINT const FINT mtmp=M; const FINT ntmp=N; @@ -329,12 +335,10 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA const FINT ldatmp=lda; const FINT ldbtmp=ldb; const FINT ldctmp=ldc; - FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), - TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + FORNAME(zgemm) (&transb,&transa, &ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp); #else - FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), - TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + FORNAME(zgemm) ( &transb,&transa, &N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc); #endif } @@ -347,19 +351,21 @@ void cblas_dgemv(const enum CBLAS_ORDER Order, const double *X, const int incX, const double beta, double *Y, const int incY) { + char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T'); + char transax = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N'); #ifdef FORINT const FINT mtmp=M; const FINT ntmp=N; const FINT ldatmp=lda; const FINT incxtmp=incX; const FINT incytmp=incY; - if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); + if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); //swap n-m and toggle transposition - else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); + else FORNAME(dgemv) (&transax, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); #else - if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); + if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); //swap n-m and toggle transposition - else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); + else FORNAME(dgemv) (&transax, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); #endif } @@ -374,15 +380,16 @@ void cblas_zgemv(const enum CBLAS_ORDER Order, if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted"); //swap n-m and toggle transposition + char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N'); #ifdef FORINT const FINT mtmp=M; const FINT ntmp=N; const FINT ldatmp=lda; const FINT incxtmp=incX; const FINT incytmp=incY; - FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp ); + FORNAME(zgemv) (&transa, &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp ); #else - FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY ); + FORNAME(zgemv) (&transa, &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY ); #endif } diff --git a/nonclass.cc b/nonclass.cc index 1804cec..f22e7ed 100644 --- a/nonclass.cc +++ b/nonclass.cc @@ -17,6 +17,7 @@ */ //this can be safely included since it contains ifdefs NONCBLAS and NONCLAPACK anyway +#include "la_traits.h" #include "noncblas.h" #include "vec.h" #include 
"smat.h" @@ -196,7 +197,7 @@ static void linear_solve_do(NRSMat &a, double *b, const int nrhs, const FINT r, *ipiv; a.copyonwrite(); ipiv = new FINT[n]; - char U = 'U'; + char U = LAPACK_FORTRANCASE('u'); #ifdef FORINT const FINT ntmp=n; const FINT nrhstmp=nrhs; @@ -298,9 +299,9 @@ int linear_solve_x(NRMat &_A, double *_B, const int _rhsCount, const int const int A_rows = _A.nrows(); const int A_cols = _A.ncols(); - const char fact = _eq?'E':'N'; - const char trans = 'T';//because of c-order - char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const + const char fact = LAPACK_FORTRANCASE(_eq?'E':'N'); + const char trans = LAPACK_FORTRANCASE('T');//because of c-order + char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){ laerror("linear_solve_x: invalid input matrices"); @@ -371,9 +372,9 @@ int linear_solve_x(NRMat > &_A, complex *_B, const int _ const int A_rows = _A.nrows(); const int A_cols = _A.ncols(); - const char fact = _eq?'E':'N'; - const char trans = 'T';//because of c-order - char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const + const char fact = LAPACK_FORTRANCASE(_eq?'E':'N'); + const char trans = LAPACK_FORTRANCASE('T');//because of c-order + char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){ laerror("linear_solve_x: invalid input matrices"); @@ -557,9 +558,9 @@ void diagonalize(NRMat &a, NRVec &w, const bool eivec, if(b) b->copyonwrite(); FINT r = 0; - char U ='U'; - char vectors = 'V'; - if (!eivec) vectors = 'N'; + char U =LAPACK_FORTRANCASE('u'); + char vectors = LAPACK_FORTRANCASE('v'); + if (!eivec) vectors = LAPACK_FORTRANCASE('n'); FINT LWORK = -1; double WORKX; FINT ldb=0; if(b) ldb=b->ncols(); @@ -588,7 +589,7 @@ void diagonalize(NRMat &a, NRVec &w, const bool eivec, #endif delete[] WORK; - if (vectors == 'V' && corder) a.transposeme(n); + if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) a.transposeme(n); if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()"); if (r > 0) laerror("convergence problem in sygv/syev in diagonalize()"); @@ -620,12 +621,13 @@ void diagonalize(NRMat > &a, NRVec &w, const bool eivec, if(b) b->copyonwrite(); FINT r = 0; - char U ='U'; - char vectors = 'V'; - if (!eivec) vectors = 'N'; + char U =LAPACK_FORTRANCASE('U'); + char vectors = LAPACK_FORTRANCASE('V'); + if (!eivec) vectors = LAPACK_FORTRANCASE('n'); FINT LWORK = -1; complex WORKX; FINT ldb=0; if(b) ldb=b->ncols(); +std::cout << "test vectors "< > &a, NRVec &w, const bool eivec, delete[] WORK; delete[] RWORK; - if (vectors == 'V' && corder) a.transposeme(n); + if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) {a.transposeme(n); a.conjugateme();} if (r < 0) laerror("illegal argument in hegv/heev in diagonalize()"); if (r > 0) laerror("convergence problem in hegv/heev in diagonalize()"); @@ -684,8 +686,8 @@ void diagonalize(NRSMat &a, NRVec &w, NRMat *v, if(b) b->copyonwrite(); FINT r = 0; - char U = 'U'; - char job = v ? 'v' : 'n'; + char U = LAPACK_FORTRANCASE('u'); + char job = LAPACK_FORTRANCASE(v ? 
'v' : 'n'); double *WORK = new double[3*n]; FINT ldv=v?v->ncols():n; @@ -730,8 +732,8 @@ void diagonalize(NRSMat > &a, NRVec &w, NRMatcopyonwrite(); FINT r = 0; - char U = 'U'; - char job = v ? 'v' : 'n'; + char U = LAPACK_FORTRANCASE('u'); + char job = LAPACK_FORTRANCASE(v ? 'v' : 'n'); complex *WORK = new complex[2*n]; double *RWORK = new double[3*n]; @@ -890,8 +892,8 @@ void gdiagonalize(NRMat &a, NRVec &wr, NRVec &wi, const int sorttype, const int biorthonormalize, NRMat *b, NRVec *beta) { - if(n<=0) n = a.nrows(); - if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix"); + if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");} + if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix"); if (n > wr.size()) laerror("inconsistent dimension of eigen vector in gdiagonalize()"); if (vl) if (n > vl->nrows() || n > vl->ncols()) @@ -911,8 +913,8 @@ void gdiagonalize(NRMat &a, NRVec &wr, NRVec &wi, if (beta) beta->copyonwrite(); if (b) b->copyonwrite(); - char jobvl = vl ? 'V' : 'N'; - char jobvr = vr ? 'V' : 'N'; + char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n'); + char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n'); double work0; FINT lwork = -1; FINT r; @@ -1055,8 +1057,8 @@ void gdiagonalize(NRMat > &a, NRVec< complex > &w, NRMat > *b, NRVec > *beta) { - if(n<=0) n = a.nrows(); - if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix"); + if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");} + if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix"); if (n > w.size()) laerror("inconsistent dimension of eigen vector in gdiagonalize()"); if (vl) if (n > vl->nrows() || n > vl->ncols()) @@ -1075,8 +1077,8 @@ void gdiagonalize(NRMat > &a, NRVec< complex > &w, if (beta) beta->copyonwrite(); if (b) b->copyonwrite(); - char jobvl = vl ? 'V' : 'N'; - char jobvr = vr ? 'V' : 'N'; + char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n'); + char jobvr = LAPACK_FORTRANCASE(vr ? 
'v' : 'n'); complex work0; FINT lwork = -1; FINT r; @@ -1146,8 +1148,8 @@ void gdiagonalize(NRMat > &a, NRVec< complex > &w, if (corder) { - if (vl) vl->transposeme(n); - if (vr) vr->transposeme(n); + if (vl) {vl->transposeme(n); vl->conjugateme();} + if (vr) {vr->transposeme(n); vr->conjugateme();} } } @@ -1159,8 +1161,8 @@ void gdiagonalize(NRMat &a, NRVec< complex > &w, const bool corder, int n, const int sorttype, const int biorthonormalize, NRMat *b, NRVec *beta) { - if(n<=0) n = a.nrows(); - if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() call for a non-square matrix"); + if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");} + if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() of too big submatrix"); NRVec wr(n), wi(n); NRMat *rvl = 0; @@ -1226,10 +1228,12 @@ void gdiagonalize(NRMat &a, NRVec< complex > &w, template<> const NRMat realpart > >(const NRMat< complex > &a) { + NRMat result(a.nrows(), a.ncols()); + #ifdef CUDALA - if(location == cpu){ + if(a.location == cpu){ #endif - NRMat result(a.nrows(), a.ncols()); +// NRMat result(a.nrows(), a.ncols()); cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0], 2, result, 1); #ifdef CUDALA }else{ @@ -1242,11 +1246,13 @@ const NRMat realpart > >(const NRMat< complex const NRMat imagpart > >(const NRMat< complex > &a) { + NRMat result(a.nrows(), a.ncols()); + #ifdef CUDALA - if(location == cpu){ + if(a.location == cpu){ #endif - NRMat result(a.nrows(), a.ncols()); +// NRMat result(a.nrows(), a.ncols()); cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0]+1, 2, result, 1); #ifdef CUDALA }else{ @@ -1259,12 +1265,15 @@ const NRMat imagpart > >(const NRMat< complex const NRMat< complex > realmatrix > (const NRMat &a) { + + NRMat > result(a.nrows(), a.ncols()); + #ifdef CUDALA - if(location == cpu){ + if(a.location == cpu){ #endif - NRMat > result(a.nrows(), a.ncols()); +// NRMat > result(a.nrows(), a.ncols()); cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0], 2); #ifdef CUDALA }else{ @@ -1278,11 +1287,13 @@ const NRMat< complex > realmatrix > (const NRMat & template<> const NRMat< complex > imagmatrix > (const NRMat &a) { + NRMat< complex > result(a.nrows(), a.ncols()); + #ifdef CUDALA - if(location == cpu){ + if(a.location == cpu){ #endif - NRMat< complex > result(a.nrows(), a.ncols()); +// NRMat< complex > result(a.nrows(), a.ncols()); cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0]+1, 2); #ifdef CUDALA }else{ @@ -1577,7 +1588,7 @@ void cholesky(NRMat &a, bool upper) if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky"); FINT lda=a.ncols(); FINT n=a.nrows(); -char uplo=upper?'U':'L'; +char uplo= LAPACK_FORTRANCASE(upper?'u':'l'); FINT info; a.copyonwrite(); FORNAME(dpotrf)(&uplo, &n, a, &lda, &info); @@ -1596,7 +1607,7 @@ void cholesky(NRMat > &a, bool upper) if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky"); FINT lda=a.ncols(); FINT n=a.nrows(); -char uplo=upper?'U':'L'; +char uplo= LAPACK_FORTRANCASE(upper?'u':'l'); FINT info; a.copyonwrite(); a.transposeme();//switch to Fortran order @@ -1788,3 +1799,4 @@ if(nchange&1)//still adjust to get determinant=1 } }//namespace + diff --git a/smat.cc b/smat.cc index e8b62f4..2b57aec 100644 --- a/smat.cc +++ b/smat.cc @@ -28,11 +28,8 @@ #include #include #include +#include -extern "C" { - extern ssize_t read(int, void *, size_t); - extern ssize_t write(int, const void *, size_t); -} namespace LA { @@ -130,7 +127,7 @@ NRSMat & 
diff --git a/smat.cc b/smat.cc
index e8b62f4..2b57aec 100644
--- a/smat.cc
+++ b/smat.cc
@@ -28,11 +28,8 @@
 #include
 #include
 #include
+#include <unistd.h>
-extern "C" {
- extern ssize_t read(int, void *, size_t);
- extern ssize_t write(int, const void *, size_t);
-}
 namespace LA {
@@ -130,7 +127,7 @@ NRSMat<T> & NRSMat<T>::operator=(const T &a) {
 NOT_GPU(*this);
 copyonwrite();
 memset(v, 0, NN2*sizeof(T));
- for(register int i=0; i::diagonalof(NRVec &r, const bool divide, bool cache) const
 if(divide){
 for(register int i=0; i NRSMat::operator-() const {
 NOT_GPU(*this);
 NRSMat<T> result(nn, getlocation());
- for(register int i = 0; i::trace() const {
 NOT_GPU(*this);
 T tmp = 0;
- for(register int i=0; i void NRSMat::randomize(const double &x) {
 NOT_GPU(*this);
- for(int i=0; i::randomize(const double &x) {
 ******************************************************************************/
 template<>
 void NRSMat<complex<double> >::randomize(const double &x) {
- for(register int i=0; i &r, const char trans, const T alpha, const NRVec &x) const {r.gemv(beta,*this,trans,alpha,x);};
 void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
- inline const T& operator[](const int ij) const;
- inline T& operator[](const int ij);
+ inline const T& operator[](const size_t ij) const;
+ inline T& operator[](const size_t ij);
 inline const T& operator()(const int i, const int j) const;
 inline T& operator()(const int i, const int j);
 inline int nrows() const;
 inline int ncols() const;
- inline int size() const;
+ inline size_t size() const;
 inline bool transp(const int i, const int j) const {return i>j;} //this can be used for compact storage of matrices, which are actually not symmetric, but one triangle of them is redundant
 const typename LA_traits<T>::normtype norm(const T scalar = (T)0) const;
@@ -155,9 +155,9 @@ public:
 void get(int fd, bool dimensions = 1, bool transp = 0);
 void put(int fd, bool dimensions = 1, bool transp = 0) const;
- void copyonwrite();
+ void copyonwrite(bool detachonly=false);
- void clear() {copyonwrite(); LA_traits<T>::clear(v,NN2);}; //zero out
+ void clear() {copyonwrite(true); LA_traits<T>::clear(v,NN2);}; //zero out
 void resize(const int n);
 void dealloc(void) {resize(0);}
@@ -212,7 +212,7 @@ inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1)) {
 if(location == cpu){
 #endif
 v = new T[NN2];
- if(a != (T)0) for(register int i = 0; i & NRSMat::operator*=(const T &a) {
 NOT_GPU(*this);
 copyonwrite();
- for(register int i = 0; i & NRSMat::operator+=(const T &a) {
 NOT_GPU(*this);
 copyonwrite();
- for(register int i = 0; i & NRSMat::operator-=(const T &a) {
 NOT_GPU(*this);
 copyonwrite();
- for(register int i = 0; i& NRSMat::operator+=(const NRSMat& rhs) {
 SAME_LOC(*this, rhs);
 copyonwrite();
- for(register int i = 0; i& NRSMat::operator-=(const NRSMat& rhs) {
 NOT_GPU(*this);
 copyonwrite();
- for(register int i = 0; i NRSMat::operator-(const NRMat &rhs) const {
 * @return reference to the corresponding matrix element
 ******************************************************************************/
 template
-inline T& NRSMat<T>::operator[](const int ij) {
+inline T& NRSMat<T>::operator[](const size_t ij) {
 #ifdef DEBUG
 if(_LA_count_check && *count != 1) laerror("T& NRSMat::operator[] used for matrix with count>1");
 if(ij<0 || ij>=NN2) laerror("T& NRSMat::operator[] out of range");
@@ -560,7 +560,7 @@ inline T& NRSMat<T>::operator[](const int ij) {
 * @return constant reference to the corresponding matrix element
 ******************************************************************************/
 template
-inline const T & NRSMat<T>::operator[](const int ij) const {
+inline const T & NRSMat<T>::operator[](const size_t ij) const {
 #ifdef DEBUG
 if(ij<0 || ij>=NN2) laerror("T& NRSMat::operator[] out of range");
 if(!v) laerror("T& NRSMat::operator[] used for unallocated NRSmat object");
@@ -578,8 +578,8 @@ inline const T & NRSMat<T>::operator[](const int ij) const {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index(T i, T j) {
- return (i>=j) ? i*(i+1)/2+j : j*(j+1)/2+i;
+inline size_t SMat_index(T i, T j) {
+ return (i>=j) ? i*(size_t)(i+1)/2+j : j*(size_t)(j+1)/2+i;
 }
 /***************************************************************************//**
@@ -590,8 +590,8 @@ inline T SMat_index(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_igej(T i, T j) {
- return i*(i+1)/2+j;
+inline size_t SMat_index_igej(T i, T j) {
+ return i*(size_t)(i+1)/2+j;
 }
 /***************************************************************************//**
@@ -602,8 +602,8 @@ inline T SMat_index_igej(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_ilej(T i, T j) {
- return j*(j+1)/2+i;
+inline size_t SMat_index_ilej(T i, T j) {
+ return j*(size_t)(j+1)/2+i;
 }
 /***************************************************************************//**
@@ -614,8 +614,8 @@ inline T SMat_index_ilej(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_1(T i, T j) {
- return (i>=j)? i*(i-1)/2+j-1 : j*(j-1)/2+i-1;
+inline size_t SMat_index_1(T i, T j) {
+ return (i>=j)? i*(size_t)(i-1)/2+j-1 : j*(size_t)(j-1)/2+i-1;
 }
 /***************************************************************************//**
@@ -626,8 +626,8 @@ inline T SMat_index_1(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_1igej(T i, T j) {
- return i*(i-1)/2+j-1;
+inline size_t SMat_index_1igej(T i, T j) {
+ return i*(size_t)(i-1)/2+j-1;
 }
 /***************************************************************************//**
@@ -638,21 +638,21 @@ inline T SMat_index_1igej(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_1ilej(T i, T j) {
- return j*(j-1)/2+i-1;
+inline size_t SMat_index_1ilej(T i, T j) {
+ return j*(size_t)(j-1)/2+i-1;
 }
 //indexing for antisymmetric matrix (used by fourindex)
 template
-inline T ASMat_index(T i, T j)
+inline size_t ASMat_index(T i, T j)
 {
 if(i == j) return -1;
-return (i>j) ? i*(i-1)/2+j : j*(j-1)/2+i;
+return (i>j) ? i*(size_t)(i-1)/2+j : j*(size_t)(j-1)/2+i;
 }
 template
-inline T ASMat_index_1(T i, T j)
+inline size_t ASMat_index_1(T i, T j)
 {
 if(i == j) return -1;
 return (i>j)? (i-2)*(i-1)/2+j-1 : (j-2)*(j-1)/2+i-1;
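The SMat_index family above maps an index pair (i,j) to the linear offset of the packed triangle in which NRSMat stores its elements; the change widens the intermediate product to size_t so the offset no longer overflows a 32-bit int for large dimensions. A standalone illustration of the 0-based formula and of why the cast matters (the helper below is not the library's, just the same arithmetic, and it assumes a 64-bit size_t):

#include <cstddef>
#include <cassert>

// Element (i,j) of a symmetric matrix, 0-based, is stored in the packed
// lower triangle at offset i*(i+1)/2 + j for i>=j. Promoting one factor to
// size_t before the multiplication keeps the product from overflowing int.
static inline size_t smat_index(int i, int j)
{
	return (i >= j) ? (size_t)i*(i+1)/2 + j : (size_t)j*(j+1)/2 + i;
}

int main()
{
	const int n = 70000;                       // n*(n+1)/2 exceeds the 32-bit int range
	assert(smat_index(0, 0) == 0);
	assert(smat_index(2, 1) == 4);             // row 2 starts at offset 3, plus column 1
	assert(smat_index(n-1, n-1) == (size_t)n*(n+1)/2 - 1);  // last stored element
	return 0;
}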
@@ -715,7 +715,7 @@ inline int NRSMat<T>::ncols() const {
 * @return number of elements of this symmetric matrix of generalt type T
 ******************************************************************************/
 template
-inline int NRSMat<T>::size() const {
+inline size_t NRSMat<T>::size() const {
 return NN2;
 }
@@ -758,7 +758,7 @@ inline const double NRSMat<double>::amin() const {
 double val(0.0);
 int index(-1);
 ret = std::numeric_limits<double>::max();
- for(register int i = 0; i < NN2; i++){
+ for(register size_t i = 0; i < NN2; i++){
 val = std::abs(v[i]);
 if(val < ret){ index = i; ret = val; }
 }
@@ -812,7 +812,7 @@ inline const complex<double> NRSMat<complex<double> >::amin() const{
 complex<double> z_val(0.0, 0.0);
 min_val = std::numeric_limits<double>::max();
- for(register int i = 0; i < NN2; i++){
+ for(register size_t i = 0; i < NN2; i++){
 z_val = v[i];
 val = std::abs(z_val.real()) + std::abs(z_val.imag());
 if(val < min_val){ index = i; min_val = val; }
@@ -920,7 +920,7 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs) {
 * @see NRSMat::operator|=, NRSMat::copyonwrite()
 ******************************************************************************/
 template
-void NRSMat<T>::copyonwrite() {
+void NRSMat<T>::copyonwrite(bool detachonly) {
 if(!count) laerror("calling NRSMat::copyonwrite() for undefined NRSMat object");
 if(*count > 1){
 (*count)--;
@@ -931,12 +931,12 @@ void NRSMat<T>::copyonwrite() {
 if(location == cpu) {
 #endif
 newv = new T[NN2];
- memcpy(newv, v, NN2*sizeof(T));
+ if(!detachonly) memcpy(newv, v, NN2*sizeof(T));
 #ifdef CUDALA
 }else{
 newv = (T *) gpualloc(NN2*sizeof(T));
 if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRSMat::copyonwrite()");
- cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
+ if(!detachonly) cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
 TEST_CUBLAS("cublasScopy");//"NRSMat::copyonwrite()"
 }
 #endif
diff --git a/sparsemat.h b/sparsemat.h
index b0b7a84..b71f5cf 100644
--- a/sparsemat.h
+++ b/sparsemat.h
@@ -132,8 +132,8 @@ public:
 void setunsymmetric();//unwind the matrix assuming it was indeed symmetric
 inline bool issymmetric() const {return symmetric;}
 unsigned int length() const;
- void copyonwrite();
- void clear() {copyonwrite();unsort();deletelist();}
+ void copyonwrite(bool detachonly=false);
+ void clear() {copyonwrite(true); if(count) {delete count; count=NULL;}}
 void simplify();
 const T trace() const;
 const typename LA_traits<T>::normtype norm(const T scalar=(T)0) const; //is const only mathematically, not in internal implementation - we have to simplify first
diff --git a/t.cc b/t.cc
index d47aef9..33ca48a 100644
--- a/t.cc
+++ b/t.cc
@@ -1799,6 +1799,11 @@ for(int i=0; i
-extern "C" {
- extern ssize_t read(int, void *, size_t);
- extern ssize_t write(int, const void *, size_t);
-}
 namespace LA {
@@ -541,7 +538,7 @@ template<>
 void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
 const char trans, const double alpha, const NRVec<double> &x) {
 #ifdef DEBUG
- if((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+ if((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
 #endif
 SAME_LOC3(*this, x, A);
 copyonwrite();
@@ -549,10 +546,10 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
 #ifdef CUDALA
 if(location==cpu){
 #endif
- cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+ cblas_dgemv(CblasRowMajor, (tolower(trans)=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
 #ifdef CUDALA
 }else{
- cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+ cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
 TEST_CUBLAS("cublasDgemv");
 }
 #endif
@@ -572,7 +569,7 @@ template<>
 void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
 const char trans, const double alpha, const NRVec<complex<double> > &x) {
 #ifdef DEBUG
- if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+ if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
 #endif
 SAME_LOC3(*this, x, A);
 copyonwrite();
@@ -580,16 +577,16 @@ void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
 #ifdef CUDALA
 if(location==cpu){
 #endif
- cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+ cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
 A.nrows(), A.ncols(), alpha, A, A.ncols(), (double *)x.v, 2, beta, (double *)v, 2);
- cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+ cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
 A.nrows(), A.ncols(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
 #ifdef CUDALA
 }else{
- cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
+ cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
 TEST_CUBLAS("cublasDgemv");
- cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
+ cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
 TEST_CUBLAS("cublasDgemv");
 }
 #endif
@@ -611,14 +608,14 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
 const NRMat<complex<double> > &A, const char trans, const complex<double> alpha, const NRVec<complex<double> > &x) {
 #ifdef DEBUG
- if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+ if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
 #endif
 SAME_LOC3(*this, x, A);
 copyonwrite();
 #ifdef CUDALA
 if(location == cpu){
 #endif
- cblas_zgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+ cblas_zgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
 A.nrows(), A.ncols(), &alpha, A, A.ncols(), x.v, 1, &beta, v, 1);
 #ifdef CUDALA
 }else{
@@ -626,7 +623,7 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
 const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
 const cuDoubleComplex _beta = make_cuDoubleComplex(beta.real(), beta.imag());
- cublasZgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(),
+ cublasZgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(),
 _alpha, (cuDoubleComplex*)(A[0]), A.ncols(), (cuDoubleComplex*)(x.v), 1, _beta, (cuDoubleComplex*)v, 1);
 TEST_CUBLAS("cublasZgemv");
 }
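With tolower() applied to trans, the gemv() variants above accept the transposition flag in either case ('n'/'N', 't'/'T') before dispatching to cblas or cublas. A minimal sketch of that normalize-then-branch pattern, separate from the library code:

#include <cctype>
#include <iostream>

// Normalize the user-supplied trans character once, then branch on it,
// so 'n'/'N' and 't'/'T' behave identically.
static const char *gemv_kind(char trans)
{
	return std::tolower(static_cast<unsigned char>(trans)) == 'n' ? "no transpose" : "transpose";
}

int main()
{
	std::cout << gemv_kind('N') << '\n';	// same result as 'n'
	std::cout << gemv_kind('t') << '\n';	// same result as 'T'
	return 0;
}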
diff --git a/vec.h b/vec.h
index b40ae79..6aa5546 100644
--- a/vec.h
+++ b/vec.h
@@ -149,10 +149,10 @@ public:
 #endif
 //! create separate copy of the data corresponding to this vector
- void copyonwrite();
+ void copyonwrite(bool detachonly=false);
 //! purge this vector
- void clear() { copyonwrite(); LA_traits<T>::clear(v, nn); };
+ void clear() { copyonwrite(true); LA_traits<T>::clear(v, nn); };
 //! assignment operator assigns given vector
 NRVec& operator=(const NRVec &rhs);
@@ -806,7 +806,7 @@ NRVec<T>::~NRVec() {
 * make own copy of the underlying data connected with this vector
 ******************************************************************************/
 template
-void NRVec<T>::copyonwrite() {
+void NRVec<T>::copyonwrite(bool detachonly) {
 if(!count) laerror("copyonwrite of an undefined vector");
 if(*count > 1) {
 (*count)--;
@@ -817,12 +817,12 @@ void NRVec<T>::copyonwrite() {
 if(location == cpu){
 #endif
 newv = new T[nn];
- memcpy(newv, v, nn*sizeof(T));
+ if(!detachonly) memcpy(newv, v, nn*sizeof(T));
 #ifdef CUDALA
 }else{
 newv = (T *) gpualloc(nn*sizeof(T));
 if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRVec::copyonwrite()");
- cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
+ if(!detachonly) cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
 TEST_CUBLAS("cublasScopy");//"NRVec::copyonwrite()"
 }
 #endif
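The detachonly parameter added to copyonwrite() throughout NRVec, NRSMat and SparseMat exists for the sake of clear(): when a shared buffer is about to be overwritten anyway, the object only needs to detach from the shared storage, so the memcpy/cublasScopy of the old contents is skipped. A simplified, self-contained model of that reference-counted detach, using a plain double buffer instead of the real templated classes:

#include <cstring>
#include <cassert>

// Simplified model of the detachonly optimization: when the buffer is shared,
// detach by allocating fresh storage; copy the old contents only if the caller
// still needs them. clear() passes detachonly=true because it zeroes the data
// immediately afterwards, so the copy would be wasted work.
struct SharedBuf {
	double *v;
	int *count;
	size_t n;

	explicit SharedBuf(size_t nn) : v(new double[nn]), count(new int(1)), n(nn) {}
	SharedBuf(const SharedBuf &o) : v(o.v), count(o.count), n(o.n) { ++*count; }
	~SharedBuf() { if(--*count == 0) { delete[] v; delete count; } }

	void copyonwrite(bool detachonly = false) {
		if(*count > 1) {
			--*count;
			double *newv = new double[n];
			if(!detachonly) std::memcpy(newv, v, n*sizeof(double));
			v = newv;
			count = new int(1);
		}
	}
	void clear() { copyonwrite(true); std::memset(v, 0, n*sizeof(double)); }
};

int main()
{
	SharedBuf a(4); a.v[0] = 1.0;
	SharedBuf b(a);          // shares storage with a
	b.clear();               // detaches without copying, then zeroes its own data
	assert(a.v[0] == 1.0 && b.v[0] == 0.0);
	return 0;
}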