*** empty log message ***

This commit is contained in:
jiri 2013-11-04 14:56:39 +00:00
parent a9e30620f0
commit 80fe44fab2
18 changed files with 505 additions and 308 deletions

View File

@ -1,7 +1,33 @@
04.11.2013 added conjugateme() for corder=1 in complex diagonalize() and
gdiagonalize() to get correct eigenvector convention for C-storage
30.10.2013 macros for consistent lowercase and uppercase of character
parameters for case sensitive BLAS and LAPACK; case-insensitivity of these in vec.cc and mat.cc (cublas still not treated)
29.10.2013 included la_traits.h in nonclass.c to get correct extern "C" of cblas
14.10.2013 added operator% to bivector
01.06.2012 more efficient clear() by detachonly parameter to copyonwrite()
14.03.2012 fixed overflow in product of dimensions of NRMat and NRSMat
13.03.2012 symmetry of integrals with different spins added to fourindex.h
23.02.2012 included unistd.h
23.02.2012 fixed max read/write size in multiget and multiput to 1GB
24.01.2012 Improved DIIS (L.V.)
19.01.2012 Fixed location for gpu in nonclass.cc
02.06.2011 In oplus() =0 replaced by clear() to work on non-square matrices (J.P.)
01.02.2011 Added trace2 for complex matrices by L. Veis
18.01.2011 Minor bugfixes and compatibility with Intel C++ compiler by Roman Curik
28.12.2010 Generalized diagonalization and functions of general complex matrices
21.12.2010 Changed to size_t in matrix put,get to prevent overflow
08.12.2010 Deallocate method added to LA_traits, used for memory saving in matrix exp
24.11.2010 Added checking for integer overflow in exptimes
23.10.2010 Fixed dependency on atlas for static libraries in configure.ac
27.09.2010 Seed of CSRMat class added.
22.09.2010 Allowed formal operator[] on gpu matrices
22.09.2010 Added submatrix() to SparseSMat
22.09.2010 Zgerc and zgeru implemented in non-cblas version
21.09.2010 Fixed a bug in laerror macro causing sometimes compilation problems
08.09.2010 RELEASE 0.6 08.09.2010 RELEASE 0.6
08.09.2010 Doxygen documentation by M. Sulc 08.09.2010 Doxygen documentation for NRVec, NRMat, NRSMat classes contributed by M. Sulc
08.09.2010 Extended CUBLAS support for NRVec, NRMat, NRSMat classes contributed by M. Sulc 08.09.2010 Extended CUBLAS support for NRVec, NRMat, NRSMat classes contributed by M. Sulc
08.09.2010 Minor bugfixes contributed by M. Sulc 08.09.2010 Minor bugfixes and improvements contributed by M. Sulc
25.06.2010 Added proof-of-concept CUBLAS support for NRVec, NRMat, NRSMat 25.06.2010 Added proof-of-concept CUBLAS support for NRVec, NRMat, NRSMat
24.06.2010 Fixed a memory leak existing when MATPTR was defined 24.06.2010 Fixed a memory leak existing when MATPTR was defined
18.06.2010 added autoconf support for BLAS+LAPACK compiled with 64-bit integers and for CUBLAS 18.06.2010 added autoconf support for BLAS+LAPACK compiled with 64-bit integers and for CUBLAS
@ -60,3 +86,4 @@ xx.08.2008 fixed wrong permutation symmetry in previously unused (and untested)
11.03.2008 added cblas_idamax replacement for non-cblas 11.03.2008 added cblas_idamax replacement for non-cblas
05.03.2008 fixed transposed bug in inverse() with non-cblas 05.03.2008 fixed transposed bug in inverse() with non-cblas
26.02.2008 INITIAL RELEASE 0.1 26.02.2008 INITIAL RELEASE 0.1

View File

@ -130,7 +130,7 @@ return *this;
/*number of ones in a binary number, from "Hacker's delight" book*/ /*number of ones in a binary number, from "Hacker's delight" book*/
#ifdef LONG_IS_32 #ifdef LONG_IS_32
static unsigned long word_popul(unsigned long x) static unsigned int word_popul(unsigned long x)
{ {
x -= ((x>>1)&0x55555555); x -= ((x>>1)&0x55555555);
x = (x&0x33333333) + ((x>>2)&0x33333333); x = (x&0x33333333) + ((x>>2)&0x33333333);
@ -140,9 +140,10 @@ x+= (x>>16);
return x&0x3f; return x&0x3f;
} }
#else #else
static unsigned long word_popul(unsigned long x) //@@@@ use an efficient trick
static unsigned int word_popul(unsigned long x)
{ {
unsigned long s=0; unsigned int s=0;
for(int i=0; i<64; ++i) for(int i=0; i<64; ++i)
{ {
if(x&1) ++s; if(x&1) ++s;
@ -156,7 +157,6 @@ return s;
unsigned int bitvector::population(const unsigned int before) const unsigned int bitvector::population(const unsigned int before) const
{ {
//@@@before
int i; int i;
unsigned int s=0; unsigned int s=0;
for(i=0; i<nn-1; ++i) s+=word_popul(v[i]); for(i=0; i<nn-1; ++i) s+=word_popul(v[i]);
@ -170,4 +170,21 @@ if(modulo)
return s+word_popul(a); return s+word_popul(a);
} }
/* Number of bit positions in which *this and y differ, i.e. the population
 * count of the block-wise XOR of the two vectors.
 * Calls laerror() when the two vectors do not have the same number of blocks.
 * Returns 0 for vectors without any block.
 * NOTE(review): only the block count nn is compared, and the masking of the last
 * block uses this->modulo only; confirm that equal nn implies equal bit length. */
unsigned int bitvector::operator%(const bitvector &y) const
{
if(nn!=y.nn) laerror("incompatible size in bitdifference");
if(nn<=0) return 0; //no blocks: nothing differs, and v[nn-1] below would be out of range
unsigned int s=0;
//all blocks except the last one are counted in full
for(int i=0; i<nn-1; ++i) s+=word_popul(v[i]^y.v[i]);
bitvector_block a=v[nn-1]^y.v[nn-1];
if(modulo)
	{
	//exclude bits at positions >= modulo of the last block from the count
	bitvector_block mask= ~((bitvector_block)0);
	mask <<=modulo;
	a &= ~mask;
	}
return s+word_popul(a);
}
}//namespace }//namespace

View File

@ -55,7 +55,7 @@ public:
void reset(const unsigned int i) {v[i/blockbits] &= ~(1<<(i%blockbits));}; void reset(const unsigned int i) {v[i/blockbits] &= ~(1<<(i%blockbits));};
const bool get(const unsigned int i) {return (v[i/blockbits] >>(i%blockbits))&1;}; const bool get(const unsigned int i) {return (v[i/blockbits] >>(i%blockbits))&1;};
const bool assign(const unsigned int i, const bool r) {if(r) set(i); else reset(i); return r;}; const bool assign(const unsigned int i, const bool r) {if(r) set(i); else reset(i); return r;};
void clear() {copyonwrite(); memset(v,0,nn*sizeof(bitvector_block));}; void clear() {copyonwrite(true); memset(v,0,nn*sizeof(bitvector_block));};
void fill() {memset(v,0xff,nn*sizeof(bitvector_block));}; void fill() {memset(v,0xff,nn*sizeof(bitvector_block));};
bool operator!=(const bitvector &rhs) const; bool operator!=(const bitvector &rhs) const;
bool operator==(const bitvector &rhs) const {return !(*this != rhs);}; bool operator==(const bitvector &rhs) const {return !(*this != rhs);};
@ -70,13 +70,13 @@ public:
bitvector operator&(const bitvector &rhs) const {return bitvector(*this) &= rhs;}; bitvector operator&(const bitvector &rhs) const {return bitvector(*this) &= rhs;};
bitvector operator|(const bitvector &rhs) const {return bitvector(*this) |= rhs;}; bitvector operator|(const bitvector &rhs) const {return bitvector(*this) |= rhs;};
bitvector operator^(const bitvector &rhs) const {return bitvector(*this) ^= rhs;}; bitvector operator^(const bitvector &rhs) const {return bitvector(*this) ^= rhs;};
unsigned int population(const unsigned int before=0) const; //@@@number of 1's unsigned int operator%(const bitvector &y) const; //number of differing bits
unsigned int population(const unsigned int before=0) const; //number of 1's
//extended, truncated const i.e. not on *this but return new entity, take care of modulo's bits //extended, truncated const i.e. not on *this but return new entity, take care of modulo's bits
//logical shifts <<= >>= << >> not implemented yet //logical shifts <<= >>= << >> not implemented yet
//logical rotations not implemented yet //logical rotations not implemented yet
}; };
extern std::ostream & operator<<(std::ostream &s, const bitvector &x); extern std::ostream & operator<<(std::ostream &s, const bitvector &x);
extern std::istream & operator>>(std::istream &s, bitvector &x); extern std::istream & operator>>(std::istream &s, bitvector &x);

View File

@ -52,8 +52,9 @@ INSTANTIZE(complex<double>)
*/ */
//// forced instantization of functions in the header in the corresponding object file //// forced instantization of functions in the header in the corresponding object file
template class CSRMat<double>; //@@@@@template class CSRMat<double>;
template class CSRMat<complex<double> >; //@@@@template class CSRMat<complex<double> >;
//@@@@ unfinished class commented out

View File

@ -11,3 +11,6 @@
#undef FORINT #undef FORINT
#define FINT int #define FINT int
#endif #endif
#define BLAS_FORTRANCASE(x) toupper(x)
#define LAPACK_FORTRANCASE(x) toupper(x)

View File

@ -104,10 +104,10 @@ __attribute__((packed))
//later add symmetry of complex integrals //later add symmetry of complex integrals
typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5} fourindexsymtype; //only permutation-nonequivalent elements are stored typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5, twoelectronrealmullikanAB=6 } fourindexsymtype; //only permutation-nonequivalent elements are stored
// these should actually be static private members of the fourindex class, but leads to an ICE on gcc3.2 // these should actually be static private members of the fourindex class, but leads to an ICE on gcc3.2
static const int fourindex_n_symmetrytypes=6; static const int fourindex_n_symmetrytypes=7;
static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8}; static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8,4};
static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]= static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]=
{ {
{{0,1,2,3,1}}, {{0,1,2,3,1}},
@ -116,6 +116,7 @@ static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]=
{{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1}}, {{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1}},
{{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1},{2,3,0,1,1},{3,2,0,1,-1},{2,3,1,0,-1},{3,2,1,0,1}}, {{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1},{2,3,0,1,1},{3,2,0,1,-1},{2,3,1,0,-1},{3,2,1,0,1}},
{{0,1,2,3,1}}, //T2IjAb_aces is like nosymmetry but different index ranges {{0,1,2,3,1}}, //T2IjAb_aces is like nosymmetry but different index ranges
{{0,1,2,3,1},{1,0,2,3,1},{0,1,3,2,1},{1,0,3,2,1}},
}; };
@ -669,6 +670,108 @@ typedef typename LA_traits<T>::normtype normtype;
//make it as a derived class in order to be able to use it in a base class context - "supermatrix" operations //make it as a derived class in order to be able to use it in a base class context - "supermatrix" operations
template<class T, class I> template<class T, class I>
//Dense storage for the twoelectronrealmullikanAB symmetry type, kept in a general (non-symmetric) NRMat<T>:
//the row is the packed pair index of (i,j) and the column the packed pair index of (k,l), see operator().
//For n basis functions the matrix is n*(n+1)/2 by n*(n+1)/2.
class fourindex_dense<twoelectronrealmullikanAB,T,I> : public NRMat<T> {
public:
//empty (unallocated) object
fourindex_dense(): NRMat<T>() {};
//allocate for n basis functions
explicit fourindex_dense(const int n): NRMat<T>(n*(n+1)/2,n*(n+1)/2) {};
fourindex_dense(const NRMat<T> &rhs): NRMat<T>(rhs) {}; //allows the parent class to be converted transparently to this class
//n basis functions, forwarding the value a to the NRMat<T> constructor
fourindex_dense(const T &a, const int n): NRMat<T>(a,n*(n+1)/2,n*(n+1)/2) {};
//n basis functions, forwarding the pointer a to the NRMat<T> constructor
fourindex_dense(const T *a, const int n): NRMat<T>(a,n*(n+1)/2,n*(n+1)/2) {};
//and also construct it from sparse and externally stored fourindex classes
//it seems not possible to nest template<class I> just for the two constructors
fourindex_dense(const fourindex<I,T> &rhs);
fourindex_dense(const fourindex_ext<I,T> &rhs);
//element access by the four indices; both versions map (i,j) and (k,l) through SMat_index_1
T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l);
const T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const;
//resize for n basis functions
void resize(const int n) {(*this).NRMat<T>::resize(n*(n+1)/2,n*(n+1)/2);};
//write elements with magnitude above thr to file descriptor f
void putext(int f, T thr=1e-15);
//recover n from nrows()=n*(n+1)/2: sqrt(n*(n+1)) lies in [n,n+1), so truncation to int yields n
int nbas() const {return (int)std::sqrt(2*(*this).nrows());};
};
//Write every element whose magnitude exceeds thr to the file descriptor f,
//one matel4stored<I,T> record (value plus the four 1-based indices) per element.
//Only index combinations with j<=i and l<=k are visited.
//  f   - file descriptor handed to write()
//  thr - elements with magnitude <= thr are skipped
//Calls laerror() when write() does not transfer a complete record.
template<class T, class I>
void fourindex_dense<twoelectronrealmullikanAB,T,I>::putext(int f, T thr)
{
const int n=nbas(); //loop invariant; nbas() evaluates a sqrt on every call
for(int i=1; i<=n; ++i) for(int j=1; j<=i; ++j)
	for(int k=1; k<=n; ++k) for(int l=1; l<=k; ++l)
		{
		const T y=(*this)(i,j,k,l);
		//the magnitude is used for the threshold test only
		if(std::abs(y) > thr)
			{
			matel4stored<I,T> x;
			x.elem= y; //store the signed value, not its absolute value
			x.index.indiv.i=i;
			x.index.indiv.j=j;
			x.index.indiv.k=k;
			x.index.indiv.l=l;
			if(sizeof(matel4stored<I,T>) != write(f,&x,sizeof(matel4stored<I,T>)) )
				laerror("write error in putext");
			}
		}
}
//Construct the dense representation from a fourindex<I,T> container.
//The matrix is allocated with rhs.size()*(rhs.size()+1)/2 rows and columns and initialised to zero,
//then every stored element of rhs is copied to its (i,j,k,l) position.
//Calls laerror() if rhs does not carry the twoelectronrealmullikanAB symmetry.
template<class T, class I>
fourindex_dense<twoelectronrealmullikanAB,T,I>::fourindex_dense(const fourindex<I,T> &rhs) : NRMat<T>((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2)
{
if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch");
typename fourindex<I,T>::iterator p;
for(p=rhs.begin(); p!= rhs.end(); ++p)
	{
#ifdef DEBUG
	//the range check has to run per element, inside the loop, where p refers to a valid element
	unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j);
	unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l);
	if (IJ<0 || IJ>=(unsigned int)NRMat<T>::nn || KL<0 || KL>=(unsigned int)NRMat<T>::mm) laerror("fourindex_dense index out of range in constructor");
#endif
	(*this)(p->index.indiv.i,p->index.indiv.j,p->index.indiv.k,p->index.indiv.l) = p->elem;
	}
}
//Construct the dense representation from a fourindex_ext<I,T> container.
//The matrix is allocated with rhs.size()*(rhs.size()+1)/2 rows and columns and initialised to zero,
//then every element delivered by the iterator of rhs is copied to its (i,j,k,l) position.
//Calls laerror() if rhs does not carry the twoelectronrealmullikanAB symmetry.
template<class T, class I>
fourindex_dense<twoelectronrealmullikanAB,T,I>::fourindex_dense(const fourindex_ext<I,T> &rhs) : NRMat<T>((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2)
{
if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch");
typename fourindex_ext<I,T>::iterator p;
for(p=rhs.begin(); p!= rhs.end(); ++p)
{
#ifdef DEBUG
//per-element range check of the packed pair indices against the matrix dimensions
//(the <0 comparisons can never be true for unsigned values)
unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j);
unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l);
if (IJ<0 || IJ>=(unsigned int)NRMat<T>::nn || KL<0 || KL>=(unsigned int)NRMat<T>::mm) laerror("fourindex_dense index out of range in constructor");
#endif
(*this)(p->index.indiv.i,p->index.indiv.j ,p->index.indiv.k,p->index.indiv.l) = p->elem;
}
}
//Writable access to element (i,j,k,l).
//(i,j) is packed by SMat_index_1 into the row and (k,l) into the column of the underlying NRMat.
//With DEBUG defined, laerror() is called for a shared object (count != 1),
//an out-of-range packed index or unallocated storage.
template<class T, class DUMMY>
T& fourindex_dense<twoelectronrealmullikanAB,T,DUMMY>::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l)
{
const int row = SMat_index_1(i,j);
const int col = SMat_index_1(k,l);
#ifdef DEBUG
	if (*NRMat<T>::count != 1) laerror("lval (i,j,k,l) with count > 1 in fourindex_dense");
	if (row<0 || row>=NRMat<T>::nn || col<0 || col>=NRMat<T>::mm) laerror("fourindex_dense index out of range");
	if (!NRMat<T>::v) laerror("access to unallocated fourindex_dense");
#endif
return NRMat<T>::operator()(row,col);
}
//Read-only access to element (i,j,k,l).
//(i,j) is packed by SMat_index_1 into the row and (k,l) into the column of the underlying NRMat.
//With DEBUG defined, laerror() is called for an out-of-range packed index or unallocated storage.
template<class T, class DUMMY>
const T& fourindex_dense<twoelectronrealmullikanAB,T,DUMMY>::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const
{
const int row = SMat_index_1(i,j);
const int col = SMat_index_1(k,l);
#ifdef DEBUG
	if (row<0 || row>=NRMat<T>::nn || col<0 || col>=NRMat<T>::mm) laerror("fourindex_dense index out of range");
	if (!NRMat<T>::v) laerror("access to unallocated fourindex_dense");
#endif
return NRMat<T>::operator()(row,col);
}
////////////////////
template<class T, class I>
class fourindex_dense<twoelectronrealmullikan,T,I> : public NRSMat<T> { class fourindex_dense<twoelectronrealmullikan,T,I> : public NRSMat<T> {
public: public:
fourindex_dense(): NRSMat<T>() {}; fourindex_dense(): NRSMat<T>() {};

View File

@ -35,6 +35,7 @@
#include <fstream> #include <fstream>
#include <limits> #include <limits>
#include <complex> #include <complex>
#include <unistd.h>
//using namespace std; //using namespace std;
@ -56,7 +57,7 @@ extern "C" {
#include "noncblas.h" #include "noncblas.h"
#else #else
extern "C" { extern "C" {
#include "clapack.h" #include "atlas/clapack.h"
} }
#endif #endif
@ -213,7 +214,7 @@ typedef C normtype;
typedef C realtype; typedef C realtype;
typedef complex<C> complextype; typedef complex<C> complextype;
static inline C sqrabs(const complex<C> x) { return x.real()*x.real()+x.imag()*x.imag();} static inline C sqrabs(const complex<C> x) { return x.real()*x.real()+x.imag()*x.imag();}
static inline bool gencmp(const complex<C> *x, const complex<C> *y, int n) {return memcmp(x,y,n*sizeof(complex<C>));} static inline bool gencmp(const complex<C> *x, const complex<C> *y, size_t n) {return memcmp(x,y,n*sizeof(complex<C>));}
static bool bigger(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;} static bool bigger(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
static bool smaller(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;} static bool smaller(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
static inline normtype norm (const complex<C> &x) {return std::abs(x);} static inline normtype norm (const complex<C> &x) {return std::abs(x);}
@ -225,9 +226,10 @@ static void multiget(size_t n,int fd, complex<C> *x, bool dimensions=0)
size_t total=0; size_t total=0;
size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and read at most 1GB at once size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and read at most 1GB at once
ssize_t r; ssize_t r;
size_t nn;
do{ do{
r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>)); r=read(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");} if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
else total += r/sizeof(complex<C>); else total += r/sizeof(complex<C>);
if(r%sizeof(complex<C>)) laerror("read error 2"); if(r%sizeof(complex<C>)) laerror("read error 2");
} }
@ -238,16 +240,17 @@ static void multiput(size_t n, int fd, const complex<C> *x, bool dimensions=0)
size_t total=0; size_t total=0;
size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and write at most 1GB at once size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and write at most 1GB at once
ssize_t r; ssize_t r;
size_t nn;
do{ do{
r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>)); r=write(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");} if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");}
else total += r/sizeof(complex<C>); else total += r/sizeof(complex<C>);
if(r%sizeof(complex<C>)) laerror("write error 2"); if(r%sizeof(complex<C>)) laerror("write error 2");
} }
while(total < n); while(total < n);
} }
static void copy(complex<C> *dest, complex<C> *src, unsigned int n) {memcpy(dest,src,n*sizeof(complex<C>));} static void copy(complex<C> *dest, complex<C> *src, size_t n) {memcpy(dest,src,n*sizeof(complex<C>));}
static void clear(complex<C> *dest, unsigned int n) {memset(dest,0,n*sizeof(complex<C>));} static void clear(complex<C> *dest, size_t n) {memset(dest,0,n*sizeof(complex<C>));}
static void copyonwrite(complex<C> &x) {}; static void copyonwrite(complex<C> &x) {};
static void clearme(complex<C> &x) {x=0;}; static void clearme(complex<C> &x) {x=0;};
static void deallocate(complex<C> &x) {}; static void deallocate(complex<C> &x) {};
@ -266,7 +269,7 @@ typedef C normtype;
typedef C realtype; typedef C realtype;
typedef complex<C> complextype; typedef complex<C> complextype;
static inline C sqrabs(const C x) { return x*x;} static inline C sqrabs(const C x) { return x*x;}
static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));} static inline bool gencmp(const C *x, const C *y, size_t n) {return memcmp(x,y,n*sizeof(C));}
static inline bool bigger(const C &x, const C &y) {return x>y;} static inline bool bigger(const C &x, const C &y) {return x>y;}
static inline bool smaller(const C &x, const C &y) {return x<y;} static inline bool smaller(const C &x, const C &y) {return x<y;}
static inline normtype norm (const C &x) {return std::abs(x);} static inline normtype norm (const C &x) {return std::abs(x);}
@ -278,9 +281,10 @@ static void multiget(size_t n,int fd, C *x, bool dimensions=0)
size_t total=0; size_t total=0;
size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and read at most 1GB at once size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and read at most 1GB at once
ssize_t r; ssize_t r;
size_t nn;
do{ do{
r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(C)); r=read(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(C));
if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");} if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
else total += r/sizeof(C); else total += r/sizeof(C);
if(r%sizeof(C)) laerror("read error 2"); if(r%sizeof(C)) laerror("read error 2");
} }
@ -291,16 +295,17 @@ static void multiput(size_t n, int fd, const C *x, bool dimensions=0)
size_t total=0; size_t total=0;
size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and write at most 1GB at once size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and write at most 1GB at once
ssize_t r; ssize_t r;
size_t nn;
do{ do{
r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(C)); r=write(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(C));
if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");} if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");}
else total += r/sizeof(C); else total += r/sizeof(C);
if(r%sizeof(C)) laerror("write error 2"); if(r%sizeof(C)) laerror("write error 2");
} }
while(total < n); while(total < n);
} }
static void copy(C *dest, C *src, unsigned int n) {memcpy(dest,src,n*sizeof(C));} static void copy(C *dest, C *src, size_t n) {memcpy(dest,src,n*sizeof(C));}
static void clear(C *dest, unsigned int n) {memset(dest,0,n*sizeof(C));} static void clear(C *dest, size_t n) {memset(dest,0,n*sizeof(C));}
static void copyonwrite(C &x) {}; static void copyonwrite(C &x) {};
static void clearme(C &x) {x=0;}; static void clearme(C &x) {x=0;};
static void deallocate(C &x) {}; static void deallocate(C &x) {};
@ -323,7 +328,7 @@ typedef X<C> producttype; \
typedef typename LA_traits<C>::normtype normtype; \ typedef typename LA_traits<C>::normtype normtype; \
typedef X<typename LA_traits<C>::realtype> realtype; \ typedef X<typename LA_traits<C>::realtype> realtype; \
typedef X<typename LA_traits<C>::complextype> complextype; \ typedef X<typename LA_traits<C>::complextype> complextype; \
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \ static bool gencmp(const C *x, const C *y, size_t n) {for(size_t i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
static inline bool bigger(const C &x, const C &y) {return x>y;} \ static inline bool bigger(const C &x, const C &y) {return x>y;} \
static inline bool smaller(const C &x, const C &y) {return x<y;} \ static inline bool smaller(const C &x, const C &y) {return x<y;} \
static inline normtype norm (const X<C> &x) {return x.norm();} \ static inline normtype norm (const X<C> &x) {return x.norm();} \
@ -332,8 +337,8 @@ static void put(int fd, const X<C> &x, bool dimensions=1, bool transp=0) {x.put(
static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions,transp);} \ static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions,transp);} \
static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);} \ static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);} \
static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);} \ static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);} \
static void copy(C *dest, C *src, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i]=src[i];} \ static void copy(C *dest, C *src, size_t n) {for(size_t i=0; i<n; ++i) dest[i]=src[i];} \
static void clear(C *dest, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i].clear();}\ static void clear(C *dest, size_t n) {for(size_t i=0; i<n; ++i) dest[i].clear();}\
static void copyonwrite(X<C> &x) {x.copyonwrite();}\ static void copyonwrite(X<C> &x) {x.copyonwrite();}\
static void clearme(X<C> &x) {x.clear();}\ static void clearme(X<C> &x) {x.clear();}\
static void deallocate(X<C> &x) {x.dealloc();}\ static void deallocate(X<C> &x) {x.dealloc();}\
@ -359,7 +364,7 @@ typedef NRMat<C> producttype; \
typedef typename LA_traits<C>::normtype normtype; \ typedef typename LA_traits<C>::normtype normtype; \
typedef X<typename LA_traits<C>::realtype> realtype; \ typedef X<typename LA_traits<C>::realtype> realtype; \
typedef X<typename LA_traits<C>::complextype> complextype; \ typedef X<typename LA_traits<C>::complextype> complextype; \
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \ static bool gencmp(const C *x, const C *y, size_t n) {for(size_t i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
static inline bool bigger(const C &x, const C &y) {return x>y;} \ static inline bool bigger(const C &x, const C &y) {return x>y;} \
static inline bool smaller(const C &x, const C &y) {return x<y;} \ static inline bool smaller(const C &x, const C &y) {return x<y;} \
static inline normtype norm (const X<C> &x) {return x.norm();} \ static inline normtype norm (const X<C> &x) {return x.norm();} \
@ -368,8 +373,8 @@ static void put(int fd, const X<C> &x, bool dimensions=1, bool transp=0) {x.put(
static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions);} \ static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions);} \
static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);} \ static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);} \
static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);} \ static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);} \
static void copy(C *dest, C *src, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i]=src[i];} \ static void copy(C *dest, C *src, size_t n) {for(size_t i=0; i<n; ++i) dest[i]=src[i];} \
static void clear(C *dest, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i].clear();} \ static void clear(C *dest, size_t n) {for(size_t i=0; i<n; ++i) dest[i].clear();} \
static void copyonwrite(X<C> &x) {x.copyonwrite();} \ static void copyonwrite(X<C> &x) {x.copyonwrite();} \
static void clearme(X<C> &x) {x.clear();} \ static void clearme(X<C> &x) {x.clear();} \
static void deallocate(X<C> &x) {x.dealloc();} \ static void deallocate(X<C> &x) {x.dealloc();} \

View File

@ -87,7 +87,26 @@ extern "C" void ATL_xerbla(int p, char *rout, char *form, ...){
laerror(msg0); laerror(msg0);
} }
int cblas_errprn(int ierr, int info, char *form, ...) { #ifndef NONCBLAS
#include "cblas.h"
#include <stdarg.h>
//Definition of cblas_xerbla with C linkage.
//Prints the diagnostic to both stdout and stderr and then calls laerror().
//  p    - if nonzero, reported as the number of the incorrect parameter
//  rout - routine name used in that report
//  form - printf-style format for the remaining variadic arguments
extern "C" void cblas_xerbla(int p, const char *rout, const char *form, ...)
{
if (p)
	{
	fprintf(stdout, "Parameter %d to routine %s was incorrect\n", p, rout);
	fprintf(stderr, "Parameter %d to routine %s was incorrect\n", p, rout);
	}
va_list argptr;
//a va_list is indeterminate once vfprintf has consumed it,
//so it is restarted for each of the two output streams
va_start(argptr, form);
vfprintf(stdout, form, argptr);
va_end(argptr);
va_start(argptr, form);
vfprintf(stderr, form, argptr);
va_end(argptr);
laerror("terminating in cblas_xerbla");
}
extern "C" int cblas_errprn(int ierr, int info, char *form, ...) {
char msg0[1024], *msg; char msg0[1024], *msg;
va_list argptr; va_list argptr;
va_start(argptr, form); va_start(argptr, form);
@ -98,5 +117,6 @@ int cblas_errprn(int ierr, int info, char *form, ...) {
laerror(msg0); laerror(msg0);
return 0; return 0;
} }
#endif
}//namespace }//namespace

179
mat.cc
View File

@ -26,11 +26,8 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <fcntl.h> #include <fcntl.h>
#include <errno.h> #include <errno.h>
#include <unistd.h>
extern "C" {
extern ssize_t read(int, void *, size_t);
extern ssize_t write(int, const void *, size_t);
}
namespace LA { namespace LA {
@ -77,14 +74,14 @@ const NRMat<T> NRMat<T>::otimes(const NRMat<T> &rhs, bool reversecolumns) const
{ {
T c = (*this)(i,j); T c = (*this)(i,j);
for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++) for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++)
r( i*rhs.nn + k, l*mm + j ) = c*rhs(k,l); r( i*(size_t)rhs.nn + k, l*mm + j ) = c*rhs(k,l);
} }
}else{ }else{
for(i=0;i<nn;i++) for(j=0;j<mm;j++) for(i=0;i<nn;i++) for(j=0;j<mm;j++)
{ {
T c=(*this)(i,j); T c=(*this)(i,j);
for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++) for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++)
r( i*rhs.nn+k, j*rhs.mm+l ) = c *rhs(k,l); r( i*(size_t)rhs.nn+k, j*(size_t)rhs.mm+l ) = c *rhs(k,l);
} }
} }
@ -108,7 +105,7 @@ const NRVec<T> NRMat<T>::row(const int i, int l) const {
#ifdef MATPTR #ifdef MATPTR
v[i] v[i]
#else #else
v + i*l v + i*(size_t)l
#endif #endif
, l); , l);
return r; return r;
@ -144,7 +141,7 @@ void NRMat<T>::put(int fd, bool dim, bool transp) const {
#ifdef MATPTR #ifdef MATPTR
v[i][j] v[i][j]
#else #else
v[i*mm+j] v[i*(size_t)mm+j]
#endif #endif
,dim ,transp); ,dim ,transp);
} }
@ -196,7 +193,7 @@ void NRMat<T>::get(int fd, bool dim, bool transp){
#ifdef MATPTR #ifdef MATPTR
v[i][j] v[i][j]
#else #else
v[i*mm+j] v[i*(size_t)mm+j]
#endif #endif
,dim,transp); ,dim,transp);
} }
@ -476,13 +473,13 @@ NRMat<T> & NRMat<T>::operator-=(const T &a) {
******************************************************************************/ ******************************************************************************/
template <> template <>
const NRMat<double> NRMat<double>::operator-() const { const NRMat<double> NRMat<double>::operator-() const {
const int nm = nn*mm; const size_t nm = (size_t)nn*mm;
NRMat<double> result(nn, mm, getlocation()); NRMat<double> result(nn, mm, getlocation());
#ifdef CUDALA #ifdef CUDALA
if(location == cpu) { if(location == cpu) {
#endif #endif
#ifdef MATPTR #ifdef MATPTR
for(register int i=0; i<nm; i++) result.v[0][i] = -v[0][i]; for(register size_t i=0; i<nm; i++) result.v[0][i] = -v[0][i];
#else #else
memcpy(result.v, v, nm*sizeof(double)); memcpy(result.v, v, nm*sizeof(double));
cblas_dscal(nm, -1., result.v, 1); cblas_dscal(nm, -1., result.v, 1);
@ -506,13 +503,13 @@ const NRMat<double> NRMat<double>::operator-() const {
******************************************************************************/ ******************************************************************************/
template <> template <>
const NRMat<complex<double> > NRMat<complex<double> >::operator-() const { const NRMat<complex<double> > NRMat<complex<double> >::operator-() const {
const int nm = nn*mm; const size_t nm = (size_t)nn*mm;
NRMat<complex<double> > result(nn, mm, getlocation()); NRMat<complex<double> > result(nn, mm, getlocation());
#ifdef CUDALA #ifdef CUDALA
if(location == cpu) { if(location == cpu) {
#endif #endif
#ifdef MATPTR #ifdef MATPTR
for(register int i=0; i<nm; i++) result.v[0][i]= -v[0][i]; for(register size_t i=0; i<nm; i++) result.v[0][i]= -v[0][i];
#else #else
memcpy(result.v, v, nm*sizeof(complex<double>)); memcpy(result.v, v, nm*sizeof(complex<double>));
cblas_zscal(nm, &CMONE, result.v, 1); cblas_zscal(nm, &CMONE, result.v, 1);
@ -539,9 +536,9 @@ const NRMat<T> NRMat<T>::operator-() const {
NRMat<T> result(nn, mm, getlocation()); NRMat<T> result(nn, mm, getlocation());
#ifdef MATPTR #ifdef MATPTR
for(register int i=0; i<nn*mm; i++) result.v[0][i] = -v[0][i]; for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[0][i] = -v[0][i];
#else #else
for(register int i=0; i<nn*mm; i++) result.v[i] = -v[i]; for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[i] = -v[i];
#endif #endif
return result; return result;
} }
@ -562,11 +559,11 @@ const NRMat<T> NRMat<T>::operator&(const NRMat<T> &b) const {
if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem"); if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem");
for(register int i=0; i<nn; i++){ for(register int i=0; i<nn; i++){
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(v + i*mm), 1, (float*)(result.v + i*(mm + b.mm)), 1); cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(v + i*(size_t)mm), 1, (float*)(result.v + i*(size_t)(mm + b.mm)), 1);
TEST_CUBLAS("cublasScopy"); TEST_CUBLAS("cublasScopy");
} }
// GPU branch: copy the rows of b, one cublasScopy call per row, into result rows nn..nn+b.nn-1.
// The destination offset is evaluated in size_t, like every other row offset in this
// routine, so that it cannot overflow int for very large results.
// NOTE(review): the element count uses mm although the copied rows belong to b
// (b.mm columns each), and no column shift is applied on the destination side --
// confirm against the CPU branch, which is not visible here.
for(register int i=0; i<b.nn; i++){ for(register int i=0; i<b.nn; i++){
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(b.v + i*b.mm), 1, (float*)(result.v + (nn + i)*(mm + b.mm)), 1); cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(b.v + i*(size_t)b.mm), 1, (float*)(result.v + (size_t)(nn + i)*(mm + b.mm)), 1);
TEST_CUBLAS("cublasScopy"); TEST_CUBLAS("cublasScopy");
} }
} }
@ -582,7 +579,7 @@ const NRMat<T> NRMat<T>::operator|(const NRMat<T> &b) const {
for (int j=0; j<mm; j++) for (int j=0; j<mm; j++)
for (int k=0; k<b.nn; k++) for (int k=0; k<b.nn; k++)
for (int l=0; l<b.mm; l++) for (int l=0; l<b.mm; l++)
result[i*b.nn+k][j*b.mm+l] = (*this)[i][j]*b[k][l]; result[i*(size_t)b.nn+k][j*(size_t)b.mm+l] = (*this)[i][j]*b[k][l];
return result; return result;
} }
@ -689,7 +686,7 @@ const NRVec<double> NRMat<double>::rsum() const {
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
for(register int i=0;i<nn;i++){ for(register int i=0;i<nn;i++){
cublasDaxpy(mm, 1.0, v + i*mm, 1, result.v, 1); cublasDaxpy(mm, 1.0, v + i*(size_t)mm, 1, result.v, 1);
TEST_CUBLAS("cublasDaxpy"); TEST_CUBLAS("cublasDaxpy");
} }
} }
@ -714,7 +711,7 @@ const NRVec<complex<double> > NRMat<complex<double> >::rsum() const {
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
for(register int i=0;i<nn;i++){ for(register int i=0;i<nn;i++){
cublasZaxpy(mm, CUONE, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(result.v), 1); cublasZaxpy(mm, CUONE, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(result.v), 1);
TEST_CUBLAS("cublasZaxpy"); TEST_CUBLAS("cublasZaxpy");
} }
} }
@ -748,14 +745,14 @@ const NRMat<T> NRMat<T>::submatrix(const int fromrow, const int torow, const int
#ifdef MATPTR #ifdef MATPTR
memcpy(r.v[i - fromrow], v[i] + fromcol, m*sizeof(T)); memcpy(r.v[i - fromrow], v[i] + fromcol, m*sizeof(T));
#else #else
memcpy(r.v+(i - fromrow)*m, v + i*mm + fromcol, m*sizeof(T)); memcpy(r.v+(i - fromrow)*m, v + i*(size_t)mm + fromcol, m*sizeof(T));
#endif #endif
} }
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
for(register int i=fromrow; i<=torow; ++i){ for(register int i=fromrow; i<=torow; ++i){
cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1); cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*(size_t)mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1);
TEST_CUBLAS("cublasScopy"); TEST_CUBLAS("cublasScopy");
} }
} }
@ -786,13 +783,13 @@ void NRMat<T>::storesubmatrix(const int fromrow, const int fromcol, const NRMat
#ifdef MATPTR #ifdef MATPTR
memcpy(v[i] + fromcol, rhs.v[i - fromrow], m*sizeof(T)); memcpy(v[i] + fromcol, rhs.v[i - fromrow], m*sizeof(T));
#else #else
memcpy(v + i*mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T)); memcpy(v + i*(size_t)mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T));
#endif #endif
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*mm + fromcol), 1); cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*(size_t)mm + fromcol), 1);
} }
#endif #endif
} }
@ -821,8 +818,8 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
v[j][i] = tmp; v[j][i] = tmp;
#else #else
register int a, b; register int a, b;
a = i*mm + j; a = i*(size_t)mm + j;
b = j*mm + i; b = j*(size_t)mm + i;
T tmp = v[a]; T tmp = v[a];
v[a] = v[b]; v[a] = v[b];
v[b] = tmp; v[b] = tmp;
@ -847,7 +844,7 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
******************************************************************************/ ******************************************************************************/
template<> template<>
NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) { NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
const int nn_mm = nn*mm; const size_t nn_mm = (size_t)nn*mm;
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
@ -888,7 +885,7 @@ NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.
******************************************************************************/ ******************************************************************************/
template<> template<>
NRMat<double>::NRMat(const NRMat<complex<double> > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) { NRMat<double>::NRMat(const NRMat<complex<double> > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
const int nn_mm = nn*mm; const size_t nn_mm = (size_t) nn*mm;
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
@ -1079,7 +1076,7 @@ const NRSMat<double> NRMat<double>::timestransposed() const {
#ifdef MATPTR #ifdef MATPTR
r(i, j) = cblas_ddot(mm, v[i], 1, v[j], 1); r(i, j) = cblas_ddot(mm, v[i], 1, v[j], 1);
#else #else
r(i, j) = cblas_ddot(mm, v + i*mm, 1, v + j*mm, 1); r(i, j) = cblas_ddot(mm, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1);
#endif #endif
} }
} }
@ -1087,7 +1084,7 @@ const NRSMat<double> NRMat<double>::timestransposed() const {
}else{ }else{
// GPU branch: r(i,j) = dot product of row i and row j for the lower triangle j<=i.
// A row holds mm elements, so the dot-product length is mm (as in the CPU branch);
// using nn was only correct for square matrices.
for(i=0; i<nn; ++i){ for(i=0; i<nn; ++i){
for(j=0; j<=i; ++j){ for(j=0; j<=i; ++j){
r(i, j) = cublasDdot(nn, v + i*mm, 1, v + j*mm, 1); r(i, j) = cublasDdot(mm, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1);
TEST_CUBLAS("cublasDdot"); TEST_CUBLAS("cublasDdot");
} }
} }
@ -1113,7 +1110,7 @@ const NRSMat<complex<double> > NRMat<complex<double> >::timestransposed() const
// Conjugated dot product of row i with row j, written into r(i,j).
// Both operands are rows accessed with unit stride, so the length is the row length mm;
// nn only gave the same result for square matrices.
// NOTE(review): the enclosing loop bounds are outside this hunk -- confirm that they
// run over rows (nn), as in the real-valued specialization.
#ifdef MATPTR #ifdef MATPTR
cblas_zdotc_sub(nn, v[i], 1, v[j], 1, &r(i,j)); cblas_zdotc_sub(mm, v[i], 1, v[j], 1, &r(i,j));
#else #else
cblas_zdotc_sub(nn, v + i*mm, 1, v + j*mm, 1, &r(i,j)); cblas_zdotc_sub(mm, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1, &r(i,j));
#endif #endif
} }
} }
@ -1121,7 +1118,7 @@ const NRSMat<complex<double> > NRMat<complex<double> >::timestransposed() const
}else{ }else{
for(i=0; i<mm; ++i){ for(i=0; i<mm; ++i){
for(j=0; j<=i; ++j){ for(j=0; j<=i; ++j){
cuDoubleComplex val = cublasZdotc(nn, (const cuDoubleComplex *)(v + i*mm), 1, (const cuDoubleComplex *)(v + j*mm), 1); cuDoubleComplex val = cublasZdotc(nn, (const cuDoubleComplex *)(v + i*(size_t)mm), 1, (const cuDoubleComplex *)(v + j*(size_t)mm), 1);
TEST_CUBLAS("cublasZdotc"); TEST_CUBLAS("cublasZdotc");
r(i, j) = *(reinterpret_cast<complex<double>*> (&val)); r(i, j) = *(reinterpret_cast<complex<double>*> (&val));
} }
@ -1172,7 +1169,7 @@ void NRMat<double>::randomize(const double &x) {
}else{ }else{
NRMat<double> tmp(nn, mm, cpu); NRMat<double> tmp(nn, mm, cpu);
double *tmp_data = tmp; double *tmp_data = tmp;
for(register int i=0; i<nn*mm; ++i){ for(register size_t i=0; i<(size_t)nn*mm; ++i){
tmp_data[i] = x*(2.*random()/(1. + RAND_MAX) - 1.); tmp_data[i] = x*(2.*random()/(1. + RAND_MAX) - 1.);
} }
tmp.moveto(this->location); tmp.moveto(this->location);
@ -1203,7 +1200,7 @@ void NRMat<complex<double> >::randomize(const double &x) {
}else{ }else{
NRMat<complex<double> > tmp(nn, mm, cpu); NRMat<complex<double> > tmp(nn, mm, cpu);
complex<double> *tmp_data = tmp; complex<double> *tmp_data = tmp;
for(register int i=0; i<nn*mm; ++i){ for(register size_t i=0; i<(size_t)nn*mm; ++i){
const double re = x*(2.*random()/(1. + RAND_MAX) - 1.); const double re = x*(2.*random()/(1. + RAND_MAX) - 1.);
const double im = x*(2.*random()/(1. + RAND_MAX) - 1.); const double im = x*(2.*random()/(1. + RAND_MAX) - 1.);
tmp_data[i] = complex<double>(re, im); tmp_data[i] = complex<double>(re, im);
@ -1226,10 +1223,10 @@ NRMat<double>& NRMat<double>::operator*=(const double &a) {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_dscal(nn*mm, a, *this, 1); cblas_dscal((size_t)nn*mm, a, *this, 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cublasDscal(nn*mm, a, v, 1); cublasDscal((size_t)nn*mm, a, v, 1);
TEST_CUBLAS("cublasDscal"); TEST_CUBLAS("cublasDscal");
} }
#endif #endif
@ -1249,11 +1246,11 @@ NRMat<complex<double> >::operator*=(const complex<double> &a) {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_zscal(nn*mm, &a, (*this)[0], 1); cblas_zscal((size_t)nn*mm, &a, (*this)[0], 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
const cuDoubleComplex fac = *(reinterpret_cast<const cuDoubleComplex*> (&a)); const cuDoubleComplex fac = *(reinterpret_cast<const cuDoubleComplex*> (&a));
cublasZscal(nn*mm, fac, (cuDoubleComplex *)v, 1); cublasZscal((size_t)nn*mm, fac, (cuDoubleComplex *)v, 1);
TEST_CUBLAS("cublasZscal"); TEST_CUBLAS("cublasZscal");
} }
#endif #endif
@ -1271,9 +1268,9 @@ NRMat<T> & NRMat<T>::operator*=(const T &a) {
NOT_GPU(*this); NOT_GPU(*this);
copyonwrite(); copyonwrite();
#ifdef MATPTR #ifdef MATPTR
for(register int i=0; i< nn*mm; i++) v[0][i] *= a; for(register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= a;
#else #else
for(register int i=0; i< nn*mm; i++) v[i] *= a; for(register size_t i=0; i< (size_t)nn*mm; i++) v[i] *= a;
#endif #endif
return *this; return *this;
} }
@ -1294,10 +1291,10 @@ NRMat<double> & NRMat<double>::operator+=(const NRMat<double> &rhs) {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1); cblas_daxpy((size_t)nn*mm, 1.0, rhs, 1, *this, 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1); cublasDaxpy((size_t)nn*mm, 1.0, rhs, 1, v, 1);
TEST_CUBLAS("cublasDaxpy"); TEST_CUBLAS("cublasDaxpy");
} }
#endif #endif
@ -1320,10 +1317,10 @@ NRMat<complex<double> >::operator+=(const NRMat< complex<double> > &rhs) {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_zaxpy(nn*mm, &CONE, rhs[0], 1, (*this)[0], 1); cblas_zaxpy((size_t)nn*mm, &CONE, rhs[0], 1, (*this)[0], 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cublasZaxpy(nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); cublasZaxpy((size_t)nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
} }
#endif #endif
return *this; return *this;
@ -1345,9 +1342,9 @@ NRMat<T> & NRMat<T>::operator+=(const NRMat<T> &rhs) {
copyonwrite(); copyonwrite();
#ifdef MATPTR #ifdef MATPTR
for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i]; for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i];
#else #else
for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i]; for(size_t i=0; i< (size_t)nn*mm; i++) v[i] += rhs.v[i];
#endif #endif
return *this; return *this;
} }
@ -1368,10 +1365,10 @@ NRMat<double> & NRMat<double>::operator-=(const NRMat<double> &rhs) {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1); cblas_daxpy((size_t)nn*mm, -1.0, rhs, 1, *this, 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1); cublasDaxpy((size_t)nn*mm, -1.0, rhs, 1, v, 1);
} }
#endif #endif
return *this; return *this;
@ -1395,10 +1392,10 @@ NRMat< complex<double> >::operator-=(const NRMat< complex<double> > &rhs) {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_zaxpy(nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1); cblas_zaxpy((size_t)nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cublasZaxpy(nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); cublasZaxpy((size_t)nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
} }
#endif #endif
return *this; return *this;
@ -1421,9 +1418,9 @@ NRMat<T> & NRMat<T>::operator-=(const NRMat<T> &rhs) {
copyonwrite(); copyonwrite();
// Element-wise subtraction of rhs over all nn*mm elements of the contiguous storage.
// The generic operator-= previously used "+=" here (copied from operator+=), i.e. it
// added rhs instead of subtracting it.
#ifdef MATPTR #ifdef MATPTR
for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i]; for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] -= rhs.v[0][i];
#else #else
for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i]; for(size_t i=0; i<(size_t) nn*mm; i++) v[i] -= rhs.v[i];
#endif #endif
return *this; return *this;
} }
@ -1693,10 +1690,10 @@ const double NRMat<double>::dot(const NRMat<double> &rhs) const {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
ret = cblas_ddot(nn*mm, (*this)[0], 1, rhs[0], 1); ret = cblas_ddot((size_t)nn*mm, (*this)[0], 1, rhs[0], 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
ret = cublasDdot(nn*mm, v, 1, rhs.v, 1); ret = cublasDdot((size_t)nn*mm, v, 1, rhs.v, 1);
} }
#endif #endif
return ret; return ret;
@ -1721,10 +1718,10 @@ NRMat<complex<double> >::dot(const NRMat<complex<double> > &rhs) const {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_zdotc_sub(nn*mm, (*this)[0], 1, rhs[0], 1, &ret); cblas_zdotc_sub((size_t)nn*mm, (*this)[0], 1, rhs[0], 1, &ret);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cuDoubleComplex val = cublasZdotc(nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1); cuDoubleComplex val = cublasZdotc((size_t)nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
ret = *(reinterpret_cast<complex<double>*> (&val)); ret = *(reinterpret_cast<complex<double>*> (&val));
} }
#endif #endif
@ -1804,7 +1801,7 @@ void NRMat<double>::diagmultl(const NRVec<double> &rhs) {
for(register int i=0; i<nn; i++){ cblas_dscal(mm, rhs[i], (*this)[i], 1); } for(register int i=0; i<nn; i++){ cblas_dscal(mm, rhs[i], (*this)[i], 1); }
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
for(register int i=0; i<nn; i++){ cublasDscal(mm, rhs[i], v + i*mm, 1); } for(register int i=0; i<nn; i++){ cublasDscal(mm, rhs[i], v + i*(size_t)mm, 1); }
} }
#endif #endif
} }
@ -1830,7 +1827,7 @@ void NRMat< complex<double> >::diagmultl(const NRVec< complex<double> > &rhs) {
}else{ }else{
for(register int i=0; i<nn; i++){ for(register int i=0; i<nn; i++){
const cuDoubleComplex alpha = make_cuDoubleComplex(rhs[i].real(), rhs[i].imag()); const cuDoubleComplex alpha = make_cuDoubleComplex(rhs[i].real(), rhs[i].imag());
cublasZscal(mm, alpha, (cuDoubleComplex*)(v + i*mm), 1); cublasZscal(mm, alpha, (cuDoubleComplex*)(v + i*(size_t)mm), 1);
} }
} }
#endif #endif
@ -1913,7 +1910,7 @@ NRMat<double>::operator*(const NRSMat<double> &rhs) const {
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
for(register int i=0; i<nn; i++){ for(register int i=0; i<nn; i++){
cublasDspmv('U', mm, 1.0, rhs.v, v + i*mm, 1, 0.0, result.v + i*rhs_ncols, 1); cublasDspmv('U', mm, 1.0, rhs.v, v + i*(size_t)mm, 1, 0.0, result.v + i*(size_t)rhs_ncols, 1);
} }
} }
#endif #endif
@ -1947,7 +1944,7 @@ NRMat< complex<double> >::operator*(const NRSMat< complex<double> > &rhs) const
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
for(register int i=0; i<nn; i++){ for(register int i=0; i<nn; i++){
cublasZhpmv('U', mm, CUONE, (cuDoubleComplex*)rhs.v, (cuDoubleComplex*)(v + i*mm), 1, CUZERO, (cuDoubleComplex*)(result.v + i*rhs_ncols), 1); cublasZhpmv('U', mm, CUONE, (cuDoubleComplex*)rhs.v, (cuDoubleComplex*)(v + i*(size_t)mm), 1, CUZERO, (cuDoubleComplex*)(result.v + i*(size_t)rhs_ncols), 1);
} }
} }
#endif #endif
@ -1974,10 +1971,10 @@ NRMat<complex<double> >& NRMat<complex<double> >::conjugateme() {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_dscal(mm*nn, -1.0, (double *)((*this)[0]) + 1, 2); cblas_dscal((size_t)mm*nn, -1.0, (double *)((*this)[0]) + 1, 2);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cublasDscal(mm*nn, -1.0, (double *)(this->v) + 1, 2); cublasDscal((size_t)mm*nn, -1.0, (double *)(this->v) + 1, 2);
} }
#endif #endif
return *this; return *this;
@ -2048,12 +2045,12 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
const char transa, const NRMat<double> &b, const char transb, const char transa, const NRMat<double> &b, const char transb,
const double &alpha) { const double &alpha) {
int k(transa=='n'?a.mm:a.nn); int k(tolower(transa)=='n'?a.mm:a.nn);
#ifdef DEBUG #ifdef DEBUG
int l(transa=='n'?a.nn:a.mm); int l(tolower(transa)=='n'?a.nn:a.mm);
int kk(transb=='n'?b.nn:b.mm); int kk(tolower(transb)=='n'?b.nn:b.mm);
int ll(transb=='n'?b.mm:b.nn); int ll(tolower(transb)=='n'?b.mm:b.nn);
if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<double>::gemm(...)"); if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<double>::gemm(...)");
if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm"); if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm");
#endif #endif
@ -2066,8 +2063,8 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans), cblas_dgemm(CblasRowMajor, (tolower(transa)=='n' ? CblasNoTrans : CblasTrans),
(transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a, (tolower(transb)=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
a.mm, b , b.mm, beta, *this , mm); a.mm, b , b.mm, beta, *this , mm);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
@ -2083,20 +2080,20 @@ void NRMat<complex<double> >::gemm(const complex<double> & beta,
const NRMat<complex<double> > & b, const char transb, const NRMat<complex<double> > & b, const char transb,
const complex<double> & alpha) const complex<double> & alpha)
{ {
int k(transa=='n'?a.mm:a.nn); int k(tolower(transa)=='n'?a.mm:a.nn);
#ifdef DEBUG #ifdef DEBUG
int l(transa=='n'?a.nn:a.mm); int l(tolower(transa)=='n'?a.nn:a.mm);
int kk(transb=='n'?b.nn:b.mm); int kk(tolower(transb)=='n'?b.nn:b.mm);
int ll(transb=='n'?b.mm:b.nn); int ll(tolower(transb)=='n'?b.mm:b.nn);
if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<complex<double> >::gemm(...)"); if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<complex<double> >::gemm(...)");
#endif #endif
if (alpha==CZERO && beta==CONE) return; if (alpha==CZERO && beta==CONE) return;
copyonwrite(); copyonwrite();
cblas_zgemm(CblasRowMajor, cblas_zgemm(CblasRowMajor,
(transa=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)), (tolower(transa)=='n' ? CblasNoTrans : (tolower(transa)=='c'?CblasConjTrans:CblasTrans)),
(transb=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)), (tolower(transb)=='n' ? CblasNoTrans : (tolower(transb)=='c'?CblasConjTrans:CblasTrans)),
nn, mm, k, &alpha, a , a.mm, b , b.mm, &beta, *this , mm); nn, mm, k, &alpha, a , a.mm, b , b.mm, &beta, *this , mm);
} }
@ -2113,10 +2110,10 @@ const double NRMat<double>::norm(const double scalar) const {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
return cblas_dnrm2(nn*mm, (*this)[0], 1); return cblas_dnrm2((size_t)nn*mm, (*this)[0], 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
return cublasDnrm2(nn*mm, v, 1); return cublasDnrm2((size_t)nn*mm, v, 1);
} }
#endif #endif
} }
@ -2130,7 +2127,7 @@ const double NRMat<double>::norm(const double scalar) const {
#ifdef MATPTR #ifdef MATPTR
tmp = v[i][j]; tmp = v[i][j];
#else #else
tmp = v[i*mm+j]; tmp = v[i*(size_t)mm+j];
#endif #endif
if(i == j) tmp -= scalar; if(i == j) tmp -= scalar;
sum += tmp*tmp; sum += tmp*tmp;
@ -2152,10 +2149,10 @@ const double NRMat<complex<double> >::norm(const complex<double> scalar) const {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
return cblas_dznrm2(nn*mm, (*this)[0], 1); return cblas_dznrm2((size_t)nn*mm, (*this)[0], 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
return cublasDznrm2(nn*mm, (cuDoubleComplex*)v, 1); return cublasDznrm2((size_t)nn*mm, (cuDoubleComplex*)v, 1);
} }
#endif #endif
} }
@ -2168,7 +2165,7 @@ const double NRMat<complex<double> >::norm(const complex<double> scalar) const {
#ifdef MATPTR #ifdef MATPTR
tmp = v[i][j]; tmp = v[i][j];
#else #else
tmp = v[i*mm+j]; tmp = v[i*(size_t)mm+j];
#endif #endif
if(i == j) tmp -= scalar; if(i == j) tmp -= scalar;
const double re = tmp.real(); const double re = tmp.real();
@ -2195,10 +2192,10 @@ void NRMat<double>::axpy(const double alpha, const NRMat<double> &mat) {
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_daxpy(nn*mm, alpha, mat, 1, *this, 1); cblas_daxpy((size_t)nn*mm, alpha, mat, 1, *this, 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cublasDaxpy(nn*mm, alpha, mat, 1, *this, 1); cublasDaxpy((size_t)nn*mm, alpha, mat, 1, *this, 1);
} }
#endif #endif
} }
@ -2221,7 +2218,7 @@ void NRMat<complex<double> >::axpy(const complex<double> alpha,
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_zaxpy(nn*mm, &alpha, mat, 1, (*this)[0], 1); cblas_zaxpy((size_t)nn*mm, &alpha, mat, 1, (*this)[0], 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag()); const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
@ -2245,7 +2242,7 @@ const T NRMat<T>::trace() const {
#ifdef MATPTR #ifdef MATPTR
for(register int i=0; i<nn; ++i) sum += v[i][i]; for(register int i=0; i<nn; ++i) sum += v[i][i];
#else #else
for(register int i=0; i<nn*nn; i += (nn+1)) sum += v[i]; for(register size_t i=0; i<(size_t)nn*nn; i += (nn+1)) sum += v[i];
#endif #endif
return sum; return sum;
} }
@ -2554,7 +2551,7 @@ NRMat<double>& NRMat<double>::swap_rows(){
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
// GPU branch: exchange row i with row nn-1-i for i < n_pul.
// Both row offsets are computed in size_t; the offset of the mirrored row is the
// larger of the two and would overflow int first.
for(register int i=0; i<n_pul; i++){ for(register int i=0; i<n_pul; i++){
cublasDswap(mm, v + i*mm, 1, v + (nn - i - 1)*mm, 1); cublasDswap(mm, v + i*(size_t)mm, 1, v + (size_t)(nn - i - 1)*mm, 1);
TEST_CUBLAS("cublasDswap"); TEST_CUBLAS("cublasDswap");
} }
} }
@ -2580,7 +2577,7 @@ NRMat<complex<double> >& NRMat<complex<double> >::swap_rows(){
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
// GPU branch: exchange row i with row nn-1-i for i < n_pul.
// Both row offsets are computed in size_t; the offset of the mirrored row is the
// larger of the two and would overflow int first.
for(register int i=0; i<n_pul; i++){ for(register int i=0; i<n_pul; i++){
cublasZswap(mm, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm), 1); cublasZswap(mm, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(v + (size_t)(nn - i - 1)*mm), 1);
TEST_CUBLAS("cublasZswap"); TEST_CUBLAS("cublasZswap");
} }
} }
@ -2613,7 +2610,7 @@ NRMat<T>& NRMat<T>::swap_rows(){
}else{ }else{
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows"); if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows");
// GPU branch for general T: exchange row i with row nn-1-i, moving each row as
// mm*sizeof(T)/sizeof(float) float-sized words.
// Both row offsets are computed in size_t; the offset of the mirrored row is the
// larger of the two and would overflow int first.
for(register int i=0; i<n_pul; i++){ for(register int i=0; i<n_pul; i++){
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*mm), 1, (float *)(v + (nn - i - 1)*mm), 1); cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*(size_t)mm), 1, (float *)(v + (size_t)(nn - i - 1)*mm), 1);
TEST_CUBLAS("cublasSswap"); TEST_CUBLAS("cublasSswap");
} }
} }
@ -2745,7 +2742,7 @@ NRMat<double>& NRMat<double>::swap_rows_cols(){
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
// GPU branch: swap row i with row nn-1-i, the second operand being traversed with
// stride -1 so that the column order is reversed as well.
// The offset of the mirrored row is computed in size_t like the offset of row i.
for(register int i=0; i<n_pul; i++){ for(register int i=0; i<n_pul; i++){
cublasDswap(mm, v + i*mm, 1, v + (nn - i - 1)*mm + mm - 1, -1); cublasDswap(mm, v + i*(size_t)mm, 1, v + (size_t)(nn - i - 1)*mm + mm - 1, -1);
TEST_CUBLAS("cublasDswap"); TEST_CUBLAS("cublasDswap");
} }
@ -2792,7 +2789,7 @@ NRMat<complex<double> >& NRMat<complex<double> >::swap_rows_cols(){
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
// GPU branch: swap row i with row nn-1-i, the second operand being traversed with
// stride -1 so that the column order is reversed as well.
// The offset of the mirrored row is computed in size_t like the offset of row i.
for(register int i=0;i<n_pul;i++){ for(register int i=0;i<n_pul;i++){
cublasZswap(mm, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm + mm - 1), -1); cublasZswap(mm, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(v + (size_t)(nn - i - 1)*mm + mm - 1), -1);
TEST_CUBLAS("cublasZswap"); TEST_CUBLAS("cublasZswap");
} }
if(nn & 1){ if(nn & 1){
@ -2817,7 +2814,7 @@ template<typename T>
NRMat<T>& NRMat<T>::swap_rows_cols(){ NRMat<T>& NRMat<T>::swap_rows_cols(){
const int n_pul = nn >> 1; const int n_pul = nn >> 1;
const int m_pul = mm >> 1; const int m_pul = mm >> 1;
const int dim = nn*mm; const size_t dim = (size_t)nn*mm;
T *data_ptr; T *data_ptr;
T tmp; T tmp;
@ -2837,7 +2834,7 @@ NRMat<T>& NRMat<T>::swap_rows_cols(){
}else{ }else{
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows_cols"); if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows_cols");
// GPU branch for general T: swap row i with row nn-1-i as float-sized words, the
// second operand being traversed with stride -1.
// The offset of the mirrored row is computed in size_t like the offset of row i.
// NOTE(review): the second pointer is "row start minus one float", which differs from
// the "row start + mm - 1" form used by the double/complex specializations, and a
// float-wise reversal also reverses the words inside each T -- confirm the intent.
for(register int i=0; i<n_pul; i++){ for(register int i=0; i<n_pul; i++){
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*mm), 1, (float *)(v + (nn - i - 1)*mm) - 1, -1); cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*(size_t)mm), 1, (float *)(v + (size_t)(nn - i - 1)*mm) - 1, -1);
TEST_CUBLAS("cublasSswap"); TEST_CUBLAS("cublasSswap");
} }

109
mat.h
View File

@ -39,10 +39,10 @@ protected:
T *v;//!< pointer to the data stored contiguously in memory T *v;//!< pointer to the data stored contiguously in memory
#endif #endif
int *count;//!< reference counter int *count;//!< reference counter
public:
#ifdef CUDALA #ifdef CUDALA
GPUID location; GPUID location;
#endif #endif
public:
friend class NRVec<T>; friend class NRVec<T>;
friend class NRSMat<T>; friend class NRSMat<T>;
@ -89,16 +89,16 @@ public:
//! explicit constructor converting vector into a <code>NRMat<T></code> object //! explicit constructor converting vector into a <code>NRMat<T></code> object
//! \param rhs source vector
//! \param n number of rows of the resulting matrix
//! \param m number of columns of the resulting matrix
//! \param offset index of the first vector element used (default 0)
//! laerror() is called when offset is negative or n*m + offset exceeds rhs.nn
#ifdef MATPTR #ifdef MATPTR
// NOTE(review): the range check sits in the constructor body, so it runs only after
// the delegated NRMat(pointer, n, m) constructor has finished -- confirm that this
// ordering is acceptable for out-of-range arguments.
explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){ explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){
if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
}; };
#else #else
// without MATPTR the constructor is only declared here and defined out of line
explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0); explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0);
#endif #endif
//! inequality test: returns true at once when the dimensions differ, otherwise the
//! result of LA_traits<T>::gencmp over all nn*mm elements of both matrices
#ifdef MATPTR #ifdef MATPTR
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],nn*mm);} //memcmp for scalars else elementwise const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],(size_t)nn*mm);} //memcmp for scalars else elementwise
#else #else
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,nn*mm);} //memcmp for scalars else elementwise const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,(size_t)nn*mm);} //memcmp for scalars else elementwise
#endif #endif
//! equality test, defined as the negation of operator!=
const bool operator==(const NRMat &rhs) const {return !(*this != rhs);}; const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
@ -107,7 +107,7 @@ public:
inline int getcount() const {return count?*count:0;} inline int getcount() const {return count?*count:0;}
//! ensure that the data of this matrix are referenced exactly once //! ensure that the data of this matrix are referenced exactly once
//! \param detachonly defaults to false; clear() passes true
//! NOTE(review): presumably detachonly=true detaches from shared storage without copying the old contents -- confirm in the definition
void copyonwrite(); void copyonwrite(bool detachonly=false);
/***************************************************************************//** /***************************************************************************//**
* routines for CUDA related stuff * routines for CUDA related stuff
@ -260,7 +260,7 @@ public:
//! get the number of columns //! get the number of columns
inline int ncols() const; inline int ncols() const;
//! get the number of matrix elements //! get the number of matrix elements
inline int size() const; inline size_t size() const;
//! unformatted input //! unformatted input
void get(int fd, bool dimensions = 1, bool transposed = false); void get(int fd, bool dimensions = 1, bool transposed = false);
@ -274,8 +274,8 @@ public:
//! set all matrix elements equal to zero //! set all matrix elements equal to zero
//! does nothing when either dimension is zero
void clear(){ void clear(){
if(nn&&mm){ if(nn&&mm){
// detachonly=true is passed because every element is overwritten right below
// NOTE(review): presumably this skips copying the old contents -- confirm in copyonwrite()
copyonwrite(); copyonwrite(true);
// the element count is formed in size_t so that nn*mm cannot overflow int
LA_traits<T>::clear((*this)[0], nn*mm); LA_traits<T>::clear((*this)[0], (size_t)nn*mm);
} }
}; };
@ -379,7 +379,7 @@ template <typename T>
NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) { NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
T* p; T* p;
*count = 1; *count = 1;
const int nm = n*m; const size_t nm = (size_t)n*m;
#ifdef CUDALA #ifdef CUDALA
location = (loc==undefined?DEFAULT_LOC:loc); location = (loc==undefined?DEFAULT_LOC:loc);
if(location == cpu) { if(location == cpu) {
@ -408,7 +408,7 @@ NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) { NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
const int nm = n*m; const size_t nm = (size_t)n*m;
T *p; T *p;
*count = 1; *count = 1;
@ -447,7 +447,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n),
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) { NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) {
const int nm = n*m; const size_t nm = (size_t)n*m;
T *p; T *p;
*count = 1; *count = 1;
@ -460,7 +460,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
p = v[0] = new T[nm]; p = v[0] = new T[nm];
for (register int i=1; i<n; i++) v[i] = v[i-1] + m; for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
#else #else
p = v = new T[m*n]; p = v = new T[nm];
#endif #endif
// Fill all nm elements with a when a is nonzero.
// nm is size_t after the overflow fix, so the counter must be size_t too: an int
// counter is compared as unsigned and overflows before reaching nm > INT_MAX.
if (a != (T)0) if (a != (T)0)
for (register int i=0; i<nm; i++) *p++ = a; for (register size_t i=0; i<nm; i++) *p++ = a;
@ -483,7 +483,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) { NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) {
const int nm = n*m; const size_t nm = (size_t)n*m;
#ifdef CUDALA #ifdef CUDALA
location = DEFAULT_LOC; location = DEFAULT_LOC;
#endif #endif
@ -546,10 +546,10 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
*count = 1; *count = 1;
#ifdef MATPTR #ifdef MATPTR
v = new T*[nn]; v = new T*[nn];
v[0] = new T[mm*nn]; v[0] = new T[(size_t)mm*nn];
for (int i=1; i<nn; i++) v[i] = v[i-1] + mm; for (int i=1; i<nn; i++) v[i] = v[i-1] + mm;
#else #else
v = new T[mm*nn]; v = new T[(size_t)mm*nn];
#endif #endif
#ifdef MATPTR #ifdef MATPTR
@ -561,7 +561,7 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
#else #else
for (i=0; i<nn; i++){ for (i=0; i<nn; i++){
for (j=0; j<=i; j++){ for (j=0; j<=i; j++){
v[i*nn + j] = v[j*nn + i] = rhs[k++]; v[i*(size_t)nn + j] = v[j*(size_t)nn + i] = rhs[k++];
} }
} }
#endif #endif
@ -578,7 +578,7 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
template <typename T> template <typename T>
NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset) NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
{ {
if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
#ifdef CUDALA #ifdef CUDALA
location=rhs.location; location=rhs.location;
@ -628,7 +628,7 @@ inline T* NRMat<T>::operator[](const int i) {
#ifdef MATPTR #ifdef MATPTR
return v[i]; return v[i];
#else #else
return v + i*mm; return v + i*(size_t)mm;
#endif #endif
} }
@ -646,7 +646,7 @@ inline const T* NRMat<T>::operator[](const int i) const {
#ifdef MATPTR #ifdef MATPTR
return v[i]; return v[i];
#else #else
return v + i*mm; return v + i*(size_t)mm;
#endif #endif
} }
@ -668,7 +668,7 @@ inline T& NRMat<T>::operator()(const int i, const int j){
#ifdef MATPTR #ifdef MATPTR
return v[i][j]; return v[i][j];
#else #else
return v[i*mm + j]; return v[i*(size_t)mm + j];
#endif #endif
} }
@ -689,7 +689,7 @@ inline const T& NRMat<T>::operator()(const int i, const int j) const{
#ifdef MATPTR #ifdef MATPTR
return v[i][j]; return v[i][j];
#else #else
return v[i*mm + j]; return v[i*(size_t)mm + j];
#endif #endif
} }
@ -712,11 +712,11 @@ inline const T NRMat<T>::get_ij(const int i, const int j) const{
#ifdef MATPTR #ifdef MATPTR
return v[i][j]; return v[i][j];
#else #else
return v[i*mm + j]; return v[i*(size_t)mm + j];
#endif #endif
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
const int pozice = i*mm + j; const size_t pozice = i*(size_t)mm + j;
gpuget(1, sizeof(T), v + pozice, &ret); gpuget(1, sizeof(T), v + pozice, &ret);
return ret; return ret;
} }
@ -743,8 +743,8 @@ inline int NRMat<T>::ncols() const{
* @return number of elements * @return number of elements
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
inline int NRMat<T>::size() const{ inline size_t NRMat<T>::size() const{
return nn*mm; return (size_t)nn*mm;
} }
/***************************************************************************//** /***************************************************************************//**
@ -795,7 +795,7 @@ inline const double NRMat<double>::amax() const{
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
double ret(0.0); double ret(0.0);
const int pozice = cublasIdamax(nn*mm, v, 1) - 1; const size_t pozice = cublasIdamax((size_t)nn*mm, v, 1) - 1;
TEST_CUBLAS("cublasIdamax"); TEST_CUBLAS("cublasIdamax");
gpuget(1, sizeof(double), v + pozice, &ret); gpuget(1, sizeof(double), v + pozice, &ret);
return ret; return ret;
@ -815,7 +815,7 @@ inline const double NRMat<double>::amin() const{
if(location == cpu){ if(location == cpu){
#endif #endif
// idamin seems not to be supported // idamin seems not to be supported
const int nm = nn*mm; const size_t nm = (size_t)nn*mm;
double val(0.0); double val(0.0);
int index(-1); int index(-1);
ret = std::numeric_limits<double>::max(); ret = std::numeric_limits<double>::max();
@ -834,7 +834,7 @@ inline const double NRMat<double>::amin() const{
#endif #endif
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
const int pozice = cublasIdamin(nn*mm, v, 1) - 1; const size_t pozice = cublasIdamin((size_t)nn*mm, v, 1) - 1;
TEST_CUBLAS("cublasIdamin"); TEST_CUBLAS("cublasIdamin");
gpuget(1, sizeof(double), v + pozice, &ret); gpuget(1, sizeof(double), v + pozice, &ret);
} }
@ -860,7 +860,7 @@ inline const complex<double> NRMat<complex<double> >::amax() const{
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
complex<double> ret(0.0, 0.0); complex<double> ret(0.0, 0.0);
const int pozice = cublasIzamax(nn*mm, (cuDoubleComplex*)v, 1) - 1; const size_t pozice = cublasIzamax((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1;
TEST_CUBLAS("cublasIzamax"); TEST_CUBLAS("cublasIzamax");
gpuget(1, sizeof(complex<double>), v + pozice, &ret); gpuget(1, sizeof(complex<double>), v + pozice, &ret);
return ret; return ret;
@ -881,7 +881,7 @@ inline const complex<double> NRMat<complex<double> >::amin() const{
if(location == cpu){ if(location == cpu){
#endif #endif
// idamin seems not to be supported // idamin seems not to be supported
const int nm = nn*mm; const size_t nm = (size_t)nn*mm;
int index(-1); int index(-1);
double val(0.0), min_val(0.0); double val(0.0), min_val(0.0);
complex<double> z_val(0.0, 0.0); complex<double> z_val(0.0, 0.0);
@ -903,7 +903,7 @@ inline const complex<double> NRMat<complex<double> >::amin() const{
#endif #endif
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
const int pozice = cublasIzamin(nn*mm, (cuDoubleComplex*)v, 1) - 1; const size_t pozice = cublasIzamin((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1;
TEST_CUBLAS("cublasIzamin"); TEST_CUBLAS("cublasIzamin");
gpuget(1, sizeof(complex<double>), v + pozice, &ret); gpuget(1, sizeof(complex<double>), v + pozice, &ret);
} }
@ -991,7 +991,7 @@ NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs) {
* @see NRMat<T>::count, NRMat<T>::operator|=() * @see NRMat<T>::count, NRMat<T>::operator|=()
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
void NRMat<T>::copyonwrite() { void NRMat<T>::copyonwrite(bool detachonly) {
if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0"); if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0");
if(*count > 1){ if(*count > 1){
(*count)--; (*count)--;
@ -1002,20 +1002,20 @@ void NRMat<T>::copyonwrite() {
#endif #endif
#ifdef MATPTR #ifdef MATPTR
T **newv = new T*[nn]; T **newv = new T*[nn];
newv[0] = new T[mm*nn]; newv[0] = new T[(size_t)mm*nn];
memcpy(newv[0], v[0], mm*nn*sizeof(T)); if(!detachonly) memcpy(newv[0], v[0], (size_t)mm*nn*sizeof(T));
v = newv; v = newv;
for(register int i=1; i<nn; i++) v[i] = v[i-1] + mm; for(register int i=1; i<nn; i++) v[i] = v[i-1] + mm;
#else #else
T *newv = new T[mm*nn]; T *newv = new T[(size_t)mm*nn];
memcpy(newv, v, mm*nn*sizeof(T)); if(!detachonly) memcpy(newv, v, (size_t)mm*nn*sizeof(T));
v = newv; v = newv;
#endif #endif
#ifdef CUDALA #ifdef CUDALA
}else{ //matrix is in GPU memory }else{ //matrix is in GPU memory
T *newv = (T *) gpualloc(mm*nn*sizeof(T)); T *newv = (T *) gpualloc((size_t)mm*nn*sizeof(T));
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1); if(!detachonly) cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
TEST_CUBLAS("cublasScopy"); TEST_CUBLAS("cublasScopy");
v = newv; v = newv;
} }
@ -1082,14 +1082,14 @@ void NRMat<T>::resize(int n, int m) {
#endif #endif
#ifdef MATPTR #ifdef MATPTR
v = new T*[nn]; v = new T*[nn];
v[0] = new T[m*n]; v[0] = new T[(size_t)m*n];
for (register int i=1; i< n; i++) v[i] = v[i-1] + m; for (register int i=1; i< n; i++) v[i] = v[i-1] + m;
#else #else
v = new T[m*n]; v = new T[(size_t)m*n];
#endif #endif
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
v = (T *) gpualloc(n*m*sizeof(T)); v = (T *) gpualloc((size_t)n*m*sizeof(T));
} }
#endif #endif
return; return;
@ -1108,15 +1108,15 @@ void NRMat<T>::resize(int n, int m) {
delete[] v; delete[] v;
#ifdef MATPTR #ifdef MATPTR
v = new T*[nn]; v = new T*[nn];
v[0] = new T[m*n]; v[0] = new T[(size_t)m*n];
for (int i=1; i< n; i++) v[i] = v[i-1] + m; for (int i=1; i< n; i++) v[i] = v[i-1] + m;
#else #else
v = new T[m*n]; v = new T[(size_t)m*n];
#endif #endif
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
gpufree(v); gpufree(v);
v=(T *) gpualloc(n*m*sizeof(T)); v=(T *) gpualloc((size_t)n*m*sizeof(T));
} }
#endif #endif
} }
@ -1228,7 +1228,7 @@ public:
#ifdef MATPTR #ifdef MATPTR
return NRMat<T>::v[i - 1][j - 1]; return NRMat<T>::v[i - 1][j - 1];
#else #else
return NRMat<T>::v[(i-1)*NRMat<T>::mm+j-1]; return NRMat<T>::v[(i-1)*(size_t)NRMat<T>::mm+j-1];
#endif #endif
} }
@ -1258,11 +1258,11 @@ public:
#ifdef MATPTR #ifdef MATPTR
return NRMat<T>::v[i - 1][j - 1]; return NRMat<T>::v[i - 1][j - 1];
#else #else
return NRMat<T>::v[(i-1)*NRMat<T>::mm + (j-1)]; return NRMat<T>::v[(size_t)(i-1)*NRMat<T>::mm + (j-1)];
#endif #endif
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
const int pozice = (i-1)*NRMat<T>::mm + (j-1); const size_t pozice = (size_t)(i-1)*NRMat<T>::mm + (j-1);
gpuget(1, sizeof(T), NRMat<T>::v + pozice, &ret); gpuget(1, sizeof(T), NRMat<T>::v + pozice, &ret);
return ret; return ret;
} }
@ -1286,10 +1286,10 @@ NRMat<T>& NRMat<T>::operator^=(const NRMat<T> &rhs){
copyonwrite();// ensure that *count == 1 copyonwrite();// ensure that *count == 1
#ifdef MATPTR #ifdef MATPTR
for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i]; for (register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= rhs.v[0][i];
#else #else
const int Dim = nn*mm; const size_t Dim = (size_t)nn*mm;
for(register int i=0; i<Dim; i++) v[i] *= rhs.v[i]; for(register size_t i=0; i<Dim; i++) v[i] *= rhs.v[i];
#endif #endif
return *this; return *this;
} }
@ -1320,14 +1320,14 @@ void NRMat<T>::moveto(const GPUID dest) {
T *vold = v; T *vold = v;
if(dest == cpu){ //moving from GPU to CPU if(dest == cpu){ //moving from GPU to CPU
v = new T[nn*mm]; v = new T[(size_t)nn*mm];
gpuget(nn*mm, sizeof(T), vold, v); gpuget((size_t)nn*mm, sizeof(T), vold, v);
if(*count == 1){ gpufree(vold); } if(*count == 1){ gpufree(vold); }
else{ --(*count); count = new int(1); } else{ --(*count); count = new int(1); }
}else{ //moving from CPU to GPU }else{ //moving from CPU to GPU
v = (T *) gpualloc(nn*mm*sizeof(T)); v = (T *) gpualloc((size_t)nn*mm*sizeof(T));
gpuput(nn*mm, sizeof(T), vold, v); gpuput((size_t)nn*mm, sizeof(T), vold, v);
if(*count == 1) delete[] vold; if(*count == 1) delete[] vold;
else{ --(*count); count = new int(1);} else{ --(*count); count = new int(1);}
} }
@ -1351,3 +1351,4 @@ NRVECMAT_OPER2(Mat, -)
}//end of the LA-namespace }//end of the LA-namespace
#endif/* _LA_MAT_H_ */ #endif/* _LA_MAT_H_ */

View File

@ -195,13 +195,14 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
{ {
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) laerror("CblasLower uplo asserted"); if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
char U = BLAS_FORTRANCASE('u');
#ifdef FORINT #ifdef FORINT
const FINT ntmp=N; const FINT ntmp=N;
const FINT incxtmp=incX; const FINT incxtmp=incX;
const FINT incytmp=incY; const FINT incytmp=incY;
FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp); FORNAME(dspmv) (&U,&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
#else #else
FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY); FORNAME(dspmv) (&U,&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
#endif #endif
} }
@ -214,13 +215,14 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
{ {
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) laerror("CblasLower uplo asserted"); if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
char U = BLAS_FORTRANCASE('u');
#ifdef FORINT #ifdef FORINT
const FINT ntmp=N; const FINT ntmp=N;
const FINT incxtmp=incX; const FINT incxtmp=incX;
const FINT incytmp=incY; const FINT incytmp=incY;
FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp); FORNAME(zhpmv) (&U,&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
#else #else
FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY); FORNAME(zhpmv) (&U,&N, alpha, Ap, X, &incX, beta, Y, &incY);
#endif #endif
} }
@ -298,6 +300,8 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
{ {
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
//swap a-b, m-n //swap a-b, m-n
char transb = BLAS_FORTRANCASE(TransB==CblasNoTrans?'N':'T');
char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T');
#ifdef FORINT #ifdef FORINT
const FINT mtmp=M; const FINT mtmp=M;
const FINT ntmp=N; const FINT ntmp=N;
@ -305,10 +309,10 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
const FINT ldatmp=lda; const FINT ldatmp=lda;
const FINT ldbtmp=ldb; const FINT ldbtmp=ldb;
const FINT ldctmp=ldc; const FINT ldctmp=ldc;
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", FORNAME(dgemm) (&transb,&transa,
&ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp); &ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
#else #else
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", FORNAME(dgemm) (&transb,&transa,
&N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
#endif #endif
} }
@ -322,6 +326,8 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
{ {
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
//swap a-b, m-n //swap a-b, m-n
char transb = BLAS_FORTRANCASE(TransB==CblasConjTrans?'C':(TransB==CblasNoTrans?'N':'T'));
char transa = BLAS_FORTRANCASE(TransA==CblasConjTrans?'C':(TransA==CblasNoTrans?'N':'T'));
#ifdef FORINT #ifdef FORINT
const FINT mtmp=M; const FINT mtmp=M;
const FINT ntmp=N; const FINT ntmp=N;
@ -329,12 +335,10 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
const FINT ldatmp=lda; const FINT ldatmp=lda;
const FINT ldbtmp=ldb; const FINT ldbtmp=ldb;
const FINT ldctmp=ldc; const FINT ldctmp=ldc;
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), FORNAME(zgemm) (&transb,&transa,
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
&ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp); &ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
#else #else
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), FORNAME(zgemm) ( &transb,&transa,
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
&N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc); &N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
#endif #endif
} }
@ -347,19 +351,21 @@ void cblas_dgemv(const enum CBLAS_ORDER Order,
const double *X, const int incX, const double beta, const double *X, const int incX, const double beta,
double *Y, const int incY) double *Y, const int incY)
{ {
char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T');
char transax = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N');
#ifdef FORINT #ifdef FORINT
const FINT mtmp=M; const FINT mtmp=M;
const FINT ntmp=N; const FINT ntmp=N;
const FINT ldatmp=lda; const FINT ldatmp=lda;
const FINT incxtmp=incX; const FINT incxtmp=incX;
const FINT incytmp=incY; const FINT incytmp=incY;
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
//swap n-m and toggle transposition //swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); else FORNAME(dgemv) (&transax, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
#else #else
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
//swap n-m and toggle transposition //swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); else FORNAME(dgemv) (&transax, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
#endif #endif
} }
@ -374,15 +380,16 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted"); if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted");
//swap n-m and toggle transposition //swap n-m and toggle transposition
char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N');
#ifdef FORINT #ifdef FORINT
const FINT mtmp=M; const FINT mtmp=M;
const FINT ntmp=N; const FINT ntmp=N;
const FINT ldatmp=lda; const FINT ldatmp=lda;
const FINT incxtmp=incX; const FINT incxtmp=incX;
const FINT incytmp=incY; const FINT incytmp=incY;
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp ); FORNAME(zgemv) (&transa, &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
#else #else
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY ); FORNAME(zgemv) (&transa, &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
#endif #endif
} }

View File

@ -17,6 +17,7 @@
*/ */
//this can be safely included since it contains ifdefs NONCBLAS and NONCLAPACK anyway //this can be safely included since it contains ifdefs NONCBLAS and NONCLAPACK anyway
#include "la_traits.h"
#include "noncblas.h" #include "noncblas.h"
#include "vec.h" #include "vec.h"
#include "smat.h" #include "smat.h"
@ -196,7 +197,7 @@ static void linear_solve_do(NRSMat<double> &a, double *b, const int nrhs, const
FINT r, *ipiv; FINT r, *ipiv;
a.copyonwrite(); a.copyonwrite();
ipiv = new FINT[n]; ipiv = new FINT[n];
char U = 'U'; char U = LAPACK_FORTRANCASE('u');
#ifdef FORINT #ifdef FORINT
const FINT ntmp=n; const FINT ntmp=n;
const FINT nrhstmp=nrhs; const FINT nrhstmp=nrhs;
@ -298,9 +299,9 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
const int A_rows = _A.nrows(); const int A_rows = _A.nrows();
const int A_cols = _A.ncols(); const int A_cols = _A.ncols();
const char fact = _eq?'E':'N'; const char fact = LAPACK_FORTRANCASE(_eq?'E':'N');
const char trans = 'T';//because of c-order const char trans = LAPACK_FORTRANCASE('T');//because of c-order
char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const
if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){ if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){
laerror("linear_solve_x: invalid input matrices"); laerror("linear_solve_x: invalid input matrices");
@ -371,9 +372,9 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
const int A_rows = _A.nrows(); const int A_rows = _A.nrows();
const int A_cols = _A.ncols(); const int A_cols = _A.ncols();
const char fact = _eq?'E':'N'; const char fact = LAPACK_FORTRANCASE(_eq?'E':'N');
const char trans = 'T';//because of c-order const char trans = LAPACK_FORTRANCASE('T');//because of c-order
char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const
if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){ if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){
laerror("linear_solve_x: invalid input matrices"); laerror("linear_solve_x: invalid input matrices");
@ -557,9 +558,9 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
if(b) b->copyonwrite(); if(b) b->copyonwrite();
FINT r = 0; FINT r = 0;
char U ='U'; char U =LAPACK_FORTRANCASE('u');
char vectors = 'V'; char vectors = LAPACK_FORTRANCASE('v');
if (!eivec) vectors = 'N'; if (!eivec) vectors = LAPACK_FORTRANCASE('n');
FINT LWORK = -1; FINT LWORK = -1;
double WORKX; double WORKX;
FINT ldb=0; if(b) ldb=b->ncols(); FINT ldb=0; if(b) ldb=b->ncols();
@ -588,7 +589,7 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
#endif #endif
delete[] WORK; delete[] WORK;
if (vectors == 'V' && corder) a.transposeme(n); if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) a.transposeme(n);
if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()"); if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()");
if (r > 0) laerror("convergence problem in sygv/syev in diagonalize()"); if (r > 0) laerror("convergence problem in sygv/syev in diagonalize()");
@ -620,12 +621,13 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
if(b) b->copyonwrite(); if(b) b->copyonwrite();
FINT r = 0; FINT r = 0;
char U ='U'; char U =LAPACK_FORTRANCASE('U');
char vectors = 'V'; char vectors = LAPACK_FORTRANCASE('V');
if (!eivec) vectors = 'N'; if (!eivec) vectors = LAPACK_FORTRANCASE('n');
FINT LWORK = -1; FINT LWORK = -1;
complex<double> WORKX; complex<double> WORKX;
FINT ldb=0; if(b) ldb=b->ncols(); FINT ldb=0; if(b) ldb=b->ncols();
std::cout << "test vectors "<<vectors<<std::endl;
// First call is to determine size of workspace // First call is to determine size of workspace
double *RWORK = new double[3*n+2]; double *RWORK = new double[3*n+2];
@ -652,7 +654,7 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
delete[] WORK; delete[] WORK;
delete[] RWORK; delete[] RWORK;
if (vectors == 'V' && corder) a.transposeme(n); if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) {a.transposeme(n); a.conjugateme();}
if (r < 0) laerror("illegal argument in hegv/heev in diagonalize()"); if (r < 0) laerror("illegal argument in hegv/heev in diagonalize()");
if (r > 0) laerror("convergence problem in hegv/heev in diagonalize()"); if (r > 0) laerror("convergence problem in hegv/heev in diagonalize()");
@ -684,8 +686,8 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
if(b) b->copyonwrite(); if(b) b->copyonwrite();
FINT r = 0; FINT r = 0;
char U = 'U'; char U = LAPACK_FORTRANCASE('u');
char job = v ? 'v' : 'n'; char job = LAPACK_FORTRANCASE(v ? 'v' : 'n');
double *WORK = new double[3*n]; double *WORK = new double[3*n];
FINT ldv=v?v->ncols():n; FINT ldv=v?v->ncols():n;
@ -730,8 +732,8 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
if(b) b->copyonwrite(); if(b) b->copyonwrite();
FINT r = 0; FINT r = 0;
char U = 'U'; char U = LAPACK_FORTRANCASE('u');
char job = v ? 'v' : 'n'; char job = LAPACK_FORTRANCASE(v ? 'v' : 'n');
complex<double> *WORK = new complex<double>[2*n]; complex<double> *WORK = new complex<double>[2*n];
double *RWORK = new double[3*n]; double *RWORK = new double[3*n];
@ -890,8 +892,8 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
const int sorttype, const int biorthonormalize, const int sorttype, const int biorthonormalize,
NRMat<double> *b, NRVec<double> *beta) NRMat<double> *b, NRVec<double> *beta)
{ {
if(n<=0) n = a.nrows(); if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix"); if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix");
if (n > wr.size()) if (n > wr.size())
laerror("inconsistent dimension of eigen vector in gdiagonalize()"); laerror("inconsistent dimension of eigen vector in gdiagonalize()");
if (vl) if (n > vl->nrows() || n > vl->ncols()) if (vl) if (n > vl->nrows() || n > vl->ncols())
@ -911,8 +913,8 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
if (beta) beta->copyonwrite(); if (beta) beta->copyonwrite();
if (b) b->copyonwrite(); if (b) b->copyonwrite();
char jobvl = vl ? 'V' : 'N'; char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n');
char jobvr = vr ? 'V' : 'N'; char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n');
double work0; double work0;
FINT lwork = -1; FINT lwork = -1;
FINT r; FINT r;
@ -1055,8 +1057,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
NRMat<complex<double> > *b, NRVec<complex<double> > *beta) NRMat<complex<double> > *b, NRVec<complex<double> > *beta)
{ {
if(n<=0) n = a.nrows(); if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix"); if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix");
if (n > w.size()) if (n > w.size())
laerror("inconsistent dimension of eigen vector in gdiagonalize()"); laerror("inconsistent dimension of eigen vector in gdiagonalize()");
if (vl) if (n > vl->nrows() || n > vl->ncols()) if (vl) if (n > vl->nrows() || n > vl->ncols())
@ -1075,8 +1077,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
if (beta) beta->copyonwrite(); if (beta) beta->copyonwrite();
if (b) b->copyonwrite(); if (b) b->copyonwrite();
char jobvl = vl ? 'V' : 'N'; char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n');
char jobvr = vr ? 'V' : 'N'; char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n');
complex<double> work0; complex<double> work0;
FINT lwork = -1; FINT lwork = -1;
FINT r; FINT r;
@ -1146,8 +1148,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
if (corder) { if (corder) {
if (vl) vl->transposeme(n); if (vl) {vl->transposeme(n); vl->conjugateme();}
if (vr) vr->transposeme(n); if (vr) {vr->transposeme(n); vr->conjugateme();}
} }
} }
@ -1159,8 +1161,8 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
const bool corder, int n, const int sorttype, const int biorthonormalize, const bool corder, int n, const int sorttype, const int biorthonormalize,
NRMat<double> *b, NRVec<double> *beta) NRMat<double> *b, NRVec<double> *beta)
{ {
if(n<=0) n = a.nrows(); if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() call for a non-square matrix"); if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() of too big submatrix");
NRVec<double> wr(n), wi(n); NRVec<double> wr(n), wi(n);
NRMat<double> *rvl = 0; NRMat<double> *rvl = 0;
@ -1226,10 +1228,12 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
template<> template<>
const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a) const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
{ {
#ifdef CUDALA
if(location == cpu){
#endif
NRMat<double> result(a.nrows(), a.ncols()); NRMat<double> result(a.nrows(), a.ncols());
#ifdef CUDALA
if(a.location == cpu){
#endif
// NRMat<double> result(a.nrows(), a.ncols());
cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0], 2, result, 1); cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0], 2, result, 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
@ -1242,11 +1246,13 @@ const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<dou
template<> template<>
const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a) const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
{ {
NRMat<double> result(a.nrows(), a.ncols());
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(a.location == cpu){
#endif #endif
NRMat<double> result(a.nrows(), a.ncols()); // NRMat<double> result(a.nrows(), a.ncols());
cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0]+1, 2, result, 1); cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0]+1, 2, result, 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
@ -1259,12 +1265,15 @@ const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<dou
template<> template<>
const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &a) const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &a)
{ {
NRMat <complex<double> > result(a.nrows(), a.ncols());
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(a.location == cpu){
#endif #endif
NRMat <complex<double> > result(a.nrows(), a.ncols()); // NRMat <complex<double> > result(a.nrows(), a.ncols());
cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0], 2); cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0], 2);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
@ -1278,11 +1287,13 @@ const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &
template<> template<>
const NRMat< complex<double> > imagmatrix<NRMat<double> > (const NRMat<double> &a) const NRMat< complex<double> > imagmatrix<NRMat<double> > (const NRMat<double> &a)
{ {
NRMat< complex<double> > result(a.nrows(), a.ncols());
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(a.location == cpu){
#endif #endif
NRMat< complex<double> > result(a.nrows(), a.ncols()); // NRMat< complex<double> > result(a.nrows(), a.ncols());
cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0]+1, 2); cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0]+1, 2);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
@ -1577,7 +1588,7 @@ void cholesky(NRMat<double> &a, bool upper)
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky"); if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
FINT lda=a.ncols(); FINT lda=a.ncols();
FINT n=a.nrows(); FINT n=a.nrows();
char uplo=upper?'U':'L'; char uplo= LAPACK_FORTRANCASE(upper?'u':'l');
FINT info; FINT info;
a.copyonwrite(); a.copyonwrite();
FORNAME(dpotrf)(&uplo, &n, a, &lda, &info); FORNAME(dpotrf)(&uplo, &n, a, &lda, &info);
@ -1596,7 +1607,7 @@ void cholesky(NRMat<complex<double> > &a, bool upper)
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky"); if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
FINT lda=a.ncols(); FINT lda=a.ncols();
FINT n=a.nrows(); FINT n=a.nrows();
char uplo=upper?'U':'L'; char uplo= LAPACK_FORTRANCASE(upper?'u':'l');
FINT info; FINT info;
a.copyonwrite(); a.copyonwrite();
a.transposeme();//switch to Fortran order a.transposeme();//switch to Fortran order
@ -1788,3 +1799,4 @@ if(nchange&1)//still adjust to get determinant=1
} }
}//namespace }//namespace

23
smat.cc
View File

@ -28,11 +28,8 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <fcntl.h> #include <fcntl.h>
#include <errno.h> #include <errno.h>
#include <unistd.h>
extern "C" {
extern ssize_t read(int, void *, size_t);
extern ssize_t write(int, const void *, size_t);
}
namespace LA { namespace LA {
@ -130,7 +127,7 @@ NRSMat<T> & NRSMat<T>::operator=(const T &a) {
NOT_GPU(*this); NOT_GPU(*this);
copyonwrite(); copyonwrite();
memset(v, 0, NN2*sizeof(T)); memset(v, 0, NN2*sizeof(T));
for(register int i=0; i<nn; i++) v[i*(i+1)/2 + i] = a; for(register int i=0; i<nn; i++) v[(size_t)i*(i+1)/2 + i] = a;
return *this; return *this;
} }
@ -157,11 +154,11 @@ const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const
if(divide){ if(divide){
for(register int i=0; i<nn; i++){ for(register int i=0; i<nn; i++){
const T a = v[i*(i+1)/2+i]; const T a = v[(size_t)i*(i+1)/2+i];
if(a != 0.) r[i] /= a; if(a != 0.) r[i] /= a;
} }
}else{ }else{
for(register int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i]; for(register int i=0; i<nn; i++) r[i] = v[(size_t)i*(i+1)/2+i];
} }
return divide?NULL:&r[0]; return divide?NULL:&r[0];
@ -178,7 +175,7 @@ const NRSMat<T> NRSMat<T>::operator-() const {
NOT_GPU(*this); NOT_GPU(*this);
NRSMat<T> result(nn, getlocation()); NRSMat<T> result(nn, getlocation());
for(register int i = 0; i<NN2; i++) result.v[i]= -v[i]; for(register size_t i = 0; i<NN2; i++) result.v[i]= -v[i];
return result; return result;
} }
@ -239,7 +236,7 @@ const T NRSMat<T>::trace() const {
NOT_GPU(*this); NOT_GPU(*this);
T tmp = 0; T tmp = 0;
for(register int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i]; for(register int i=0; i<nn; i++) tmp += v[(size_t)i*(i+1)/2+i];
return tmp; return tmp;
} }
@ -251,7 +248,7 @@ template<>
void NRSMat<double>::randomize(const double &x) { void NRSMat<double>::randomize(const double &x) {
NOT_GPU(*this); NOT_GPU(*this);
for(int i=0; i<NN2; ++i){ for(size_t i=0; i<NN2; ++i){
v[i] = x*(2.*random()/(1.+RAND_MAX) -1.); v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
} }
} }
@ -262,11 +259,11 @@ void NRSMat<double>::randomize(const double &x) {
******************************************************************************/ ******************************************************************************/
template<> template<>
void NRSMat<complex<double> >::randomize(const double &x) { void NRSMat<complex<double> >::randomize(const double &x) {
for(register int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.); for(register size_t i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
for(register int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.); for(register size_t i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
for(register int i=0; i<nn; ++i){ for(register int i=0; i<nn; ++i){
for(register int j=0; j<=i; ++j){ for(register int j=0; j<=i; ++j){
if(i == j) v[i*(i+1)/2+j].imag() = 0; //hermitean if(i == j) v[i*(size_t)(i+1)/2+j].imag() = 0; //hermitean
} }
} }
} }

70
smat.h
View File

@ -25,7 +25,7 @@
#include "la_traits.h" #include "la_traits.h"
namespace LA { namespace LA {
#define NN2 (nn*(nn+1)/2) #define NN2 ((size_t)nn*(nn+1)/2)
/***************************************************************************//** /***************************************************************************//**
@ -134,15 +134,15 @@ public:
void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const {r.gemv(beta,*this,trans,alpha,x);}; void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const {r.gemv(beta,*this,trans,alpha,x);};
void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);}; void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
inline const T& operator[](const int ij) const; inline const T& operator[](const size_t ij) const;
inline T& operator[](const int ij); inline T& operator[](const size_t ij);
inline const T& operator()(const int i, const int j) const; inline const T& operator()(const int i, const int j) const;
inline T& operator()(const int i, const int j); inline T& operator()(const int i, const int j);
inline int nrows() const; inline int nrows() const;
inline int ncols() const; inline int ncols() const;
inline int size() const; inline size_t size() const;
inline bool transp(const int i, const int j) const {return i>j;} //this can be used for compact storage of matrices, which are actually not symmetric, but one triangle of them is redundant inline bool transp(const int i, const int j) const {return i>j;} //this can be used for compact storage of matrices, which are actually not symmetric, but one triangle of them is redundant
const typename LA_traits<T>::normtype norm(const T scalar = (T)0) const; const typename LA_traits<T>::normtype norm(const T scalar = (T)0) const;
@ -155,9 +155,9 @@ public:
void get(int fd, bool dimensions = 1, bool transp = 0); void get(int fd, bool dimensions = 1, bool transp = 0);
void put(int fd, bool dimensions = 1, bool transp = 0) const; void put(int fd, bool dimensions = 1, bool transp = 0) const;
void copyonwrite(); void copyonwrite(bool detachonly=false);
void clear() {copyonwrite(); LA_traits<T>::clear(v,NN2);}; //zero out void clear() {copyonwrite(true); LA_traits<T>::clear(v,NN2);}; //zero out
void resize(const int n); void resize(const int n);
void dealloc(void) {resize(0);} void dealloc(void) {resize(0);}
@ -212,7 +212,7 @@ inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1)) {
if(location == cpu){ if(location == cpu){
#endif #endif
v = new T[NN2]; v = new T[NN2];
if(a != (T)0) for(register int i = 0; i<NN2; i++) v[i] = a; if(a != (T)0) for(register size_t i = 0; i<NN2; i++) v[i] = a;
else memset(v, 0, NN2*sizeof(T)); else memset(v, 0, NN2*sizeof(T));
#ifdef CUDALA #ifdef CUDALA
@ -338,7 +338,7 @@ inline NRSMat<T> & NRSMat<T>::operator*=(const T &a) {
NOT_GPU(*this); NOT_GPU(*this);
copyonwrite(); copyonwrite();
for(register int i = 0; i<NN2; ++i) v[i] *= a; for(register size_t i = 0; i<NN2; ++i) v[i] *= a;
return *this; return *this;
} }
@ -353,7 +353,7 @@ inline NRSMat<T> & NRSMat<T>::operator+=(const T &a) {
NOT_GPU(*this); NOT_GPU(*this);
copyonwrite(); copyonwrite();
for(register int i = 0; i<nn; i++) v[i*(i+1)/2 + i] += a; for(register int i = 0; i<nn; i++) v[i*(size_t)(i+1)/2 + i] += a;
return *this; return *this;
} }
@ -368,7 +368,7 @@ inline NRSMat<T> & NRSMat<T>::operator-=(const T &a) {
NOT_GPU(*this); NOT_GPU(*this);
copyonwrite(); copyonwrite();
for(register int i = 0; i<nn; i++) v[i*(i+1)/2+i] -= a; for(register int i = 0; i<nn; i++) v[i*(size_t)(i+1)/2+i] -= a;
return *this; return *this;
} }
@ -438,7 +438,7 @@ inline NRSMat<T>& NRSMat<T>::operator+=(const NRSMat<T>& rhs) {
SAME_LOC(*this, rhs); SAME_LOC(*this, rhs);
copyonwrite(); copyonwrite();
for(register int i = 0; i<NN2; ++i) v[i] += rhs.v[i]; for(register size_t i = 0; i<NN2; ++i) v[i] += rhs.v[i];
return *this; return *this;
} }
@ -507,7 +507,7 @@ inline NRSMat<T>& NRSMat<T>::operator-=(const NRSMat<T>& rhs) {
NOT_GPU(*this); NOT_GPU(*this);
copyonwrite(); copyonwrite();
for(register int i = 0; i<NN2; ++i) v[i] -= rhs.v[i]; for(register size_t i = 0; i<NN2; ++i) v[i] -= rhs.v[i];
return *this; return *this;
} }
@ -540,7 +540,7 @@ inline const NRMat<T> NRSMat<T>::operator-(const NRMat<T> &rhs) const {
* @return reference to the corresponding matrix element * @return reference to the corresponding matrix element
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
inline T& NRSMat<T>::operator[](const int ij) { inline T& NRSMat<T>::operator[](const size_t ij) {
#ifdef DEBUG #ifdef DEBUG
if(_LA_count_check && *count != 1) laerror("T& NRSMat<T>::operator[] used for matrix with count>1"); if(_LA_count_check && *count != 1) laerror("T& NRSMat<T>::operator[] used for matrix with count>1");
if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range"); if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range");
@ -560,7 +560,7 @@ inline T& NRSMat<T>::operator[](const int ij) {
* @return constant reference to the corresponding matrix element * @return constant reference to the corresponding matrix element
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
inline const T & NRSMat<T>::operator[](const int ij) const { inline const T & NRSMat<T>::operator[](const size_t ij) const {
#ifdef DEBUG #ifdef DEBUG
if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range"); if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range");
if(!v) laerror("T& NRSMat<T>::operator[] used for unallocated NRSmat<T> object"); if(!v) laerror("T& NRSMat<T>::operator[] used for unallocated NRSmat<T> object");
@ -578,8 +578,8 @@ inline const T & NRSMat<T>::operator[](const int ij) const {
* @return cumulative index * @return cumulative index
******************************************************************************/ ******************************************************************************/
template<typename T>
inline size_t SMat_index(T i, T j) {
	// Zero-based position of element (i,j) in the packed lower triangle; the
	// arguments may come in either order. The product is formed in size_t so
	// that it does not overflow the (typically int) index type T.
	const size_t hi = static_cast<size_t>((i>=j) ? i : j);
	const size_t lo = static_cast<size_t>((i>=j) ? j : i);
	return hi*(hi+1)/2 + lo;
}
/***************************************************************************//** /***************************************************************************//**
@ -590,8 +590,8 @@ inline T SMat_index(T i, T j) {
* @return cumulative index * @return cumulative index
******************************************************************************/ ******************************************************************************/
template<typename T>
inline size_t SMat_index_igej(T i, T j) {
	// Zero-based packed index computed as i*(i+1)/2+j with no ordering test
	// (the name indicates the caller supplies i >= j). Evaluated in size_t to
	// avoid overflow of the index type T for large dimensions.
	const size_t row = static_cast<size_t>(i);
	return row*(row+1)/2 + j;
}
/***************************************************************************//** /***************************************************************************//**
@ -602,8 +602,8 @@ inline T SMat_index_igej(T i, T j) {
* @return cumulative index * @return cumulative index
******************************************************************************/ ******************************************************************************/
template<typename T>
inline size_t SMat_index_ilej(T i, T j) {
	// Zero-based packed index computed as j*(j+1)/2+i with no ordering test
	// (the name indicates the caller supplies i <= j). Evaluated in size_t to
	// avoid overflow of the index type T for large dimensions.
	const size_t row = static_cast<size_t>(j);
	return row*(row+1)/2 + i;
}
/***************************************************************************//** /***************************************************************************//**
@ -614,8 +614,8 @@ inline T SMat_index_ilej(T i, T j) {
* @return cumulative index * @return cumulative index
******************************************************************************/ ******************************************************************************/
template<typename T>
inline size_t SMat_index_1(T i, T j) {
	// Packed index for one-based (i,j) given in either order; the result is
	// zero-based. The product is formed in size_t so that it does not overflow
	// the (typically int) index type T.
	const size_t hi = static_cast<size_t>((i>=j) ? i : j);
	const size_t lo = static_cast<size_t>((i>=j) ? j : i);
	return hi*(hi-1)/2 + lo - 1;
}
/***************************************************************************//** /***************************************************************************//**
@ -626,8 +626,8 @@ inline T SMat_index_1(T i, T j) {
* @return cumulative index * @return cumulative index
******************************************************************************/ ******************************************************************************/
template<typename T>
inline size_t SMat_index_1igej(T i, T j) {
	// One-based variant computed as i*(i-1)/2+j-1 with no ordering test (the
	// name indicates the caller supplies i >= j). Evaluated in size_t to avoid
	// overflow of the index type T for large dimensions.
	const size_t row = static_cast<size_t>(i);
	return row*(row-1)/2 + j - 1;
}
/***************************************************************************//** /***************************************************************************//**
@ -638,21 +638,21 @@ inline T SMat_index_1igej(T i, T j) {
* @return cumulative index * @return cumulative index
******************************************************************************/ ******************************************************************************/
template<typename T>
inline size_t SMat_index_1ilej(T i, T j) {
	// One-based variant computed as j*(j-1)/2+i-1 with no ordering test (the
	// name indicates the caller supplies i <= j). Evaluated in size_t to avoid
	// overflow of the index type T for large dimensions.
	const size_t row = static_cast<size_t>(j);
	return row*(row-1)/2 + i - 1;
}
//indexing for antisymmetric matrix (used by fourindex)
//Zero-based packed index of the strictly-lower-triangle element (i,j), the
//arguments being accepted in either order. A diagonal request (i == j) has no
//storage; it yields the sentinel (size_t)-1, i.e. the largest size_t value,
//which still compares equal to -1 after the usual integer conversions.
template<typename T>
inline size_t ASMat_index(T i, T j)
{
	if(i == j) return static_cast<size_t>(-1);
	const size_t hi = static_cast<size_t>((i>j) ? i : j);
	const size_t lo = static_cast<size_t>((i>j) ? j : i);
	return hi*(hi-1)/2 + lo;
}
template<typename T> template<typename T>
inline T ASMat_index_1(T i, T j) inline size_t ASMat_index_1(T i, T j)
{ {
if(i == j) return -1; if(i == j) return -1;
return (i>j)? (i-2)*(i-1)/2+j-1 : (j-2)*(j-1)/2+i-1; return (i>j)? (i-2)*(i-1)/2+j-1 : (j-2)*(j-1)/2+i-1;
@ -715,7 +715,7 @@ inline int NRSMat<T>::ncols() const {
* @return number of elements of this symmetric matrix of generalt type <code>T</code> * @return number of elements of this symmetric matrix of generalt type <code>T</code>
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
inline int NRSMat<T>::size() const { inline size_t NRSMat<T>::size() const {
return NN2; return NN2;
} }
@ -758,7 +758,7 @@ inline const double NRSMat<double>::amin() const {
double val(0.0); double val(0.0);
int index(-1); int index(-1);
ret = std::numeric_limits<double>::max(); ret = std::numeric_limits<double>::max();
for(register int i = 0; i < NN2; i++){ for(register size_t i = 0; i < NN2; i++){
val = std::abs(v[i]); val = std::abs(v[i]);
if(val < ret){ index = i; ret = val; } if(val < ret){ index = i; ret = val; }
} }
@ -812,7 +812,7 @@ inline const complex<double> NRSMat<complex<double> >::amin() const{
complex<double> z_val(0.0, 0.0); complex<double> z_val(0.0, 0.0);
min_val = std::numeric_limits<double>::max(); min_val = std::numeric_limits<double>::max();
for(register int i = 0; i < NN2; i++){ for(register size_t i = 0; i < NN2; i++){
z_val = v[i]; z_val = v[i];
val = std::abs(z_val.real()) + std::abs(z_val.imag()); val = std::abs(z_val.real()) + std::abs(z_val.imag());
if(val < min_val){ index = i; min_val = val; } if(val < min_val){ index = i; min_val = val; }
@ -920,7 +920,7 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs) {
* @see NRSMat<T>::operator|=, NRSMat<T>::copyonwrite() * @see NRSMat<T>::operator|=, NRSMat<T>::copyonwrite()
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
void NRSMat<T>::copyonwrite() { void NRSMat<T>::copyonwrite(bool detachonly) {
if(!count) laerror("calling NRSMat<T>::copyonwrite() for undefined NRSMat<T> object"); if(!count) laerror("calling NRSMat<T>::copyonwrite() for undefined NRSMat<T> object");
if(*count > 1){ if(*count > 1){
(*count)--; (*count)--;
@ -931,12 +931,12 @@ void NRSMat<T>::copyonwrite() {
if(location == cpu) { if(location == cpu) {
#endif #endif
newv = new T[NN2]; newv = new T[NN2];
memcpy(newv, v, NN2*sizeof(T)); if(!detachonly) memcpy(newv, v, NN2*sizeof(T));
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
newv = (T *) gpualloc(NN2*sizeof(T)); newv = (T *) gpualloc(NN2*sizeof(T));
if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRSMat<T>::copyonwrite()"); if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRSMat<T>::copyonwrite()");
cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1); if(!detachonly) cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
TEST_CUBLAS("cublasScopy");//"NRSMat<T>::copyonwrite()" TEST_CUBLAS("cublasScopy");//"NRSMat<T>::copyonwrite()"
} }
#endif #endif

View File

@ -132,8 +132,8 @@ public:
void setunsymmetric();//unwind the matrix assuming it was indeed symmetric void setunsymmetric();//unwind the matrix assuming it was indeed symmetric
inline bool issymmetric() const {return symmetric;} inline bool issymmetric() const {return symmetric;}
unsigned int length() const; unsigned int length() const;
void copyonwrite(); void copyonwrite(bool detachonly=false);
void clear() {copyonwrite();unsort();deletelist();} void clear() {copyonwrite(true); if(count) {delete count; count=NULL;}}
void simplify(); void simplify();
const T trace() const; const T trace() const;
const typename LA_traits<T>::normtype norm(const T scalar=(T)0) const; //is const only mathematically, not in internal implementation - we have to simplify first const typename LA_traits<T>::normtype norm(const T scalar=(T)0) const; //is const only mathematically, not in internal implementation - we have to simplify first

10
t.cc
View File

@ -1799,6 +1799,11 @@ for(int i=0; i<m.nrows(); ++i) for(int j=0; j<m.nrows(); ++j)
if(i!=j) err += abs(m2(i,j)); if(i!=j) err += abs(m2(i,j));
cout <<"offdiagonality error = "<<err<<endl; cout <<"offdiagonality error = "<<err<<endl;
err=0;
for(int i=0; i<m.nrows(); ++i) err += abs(m2(i,i) - eivals[i]);
cout <<"eigenvalue error = "<<err<<endl;
//test as general matrix //test as general matrix
NRVec<complex<double> > ww(m.nrows()); NRVec<complex<double> > ww(m.nrows());
NRMat<complex<double> > vl(m.nrows(),m.nrows()), vr(m.nrows(),m.nrows()); NRMat<complex<double> > vl(m.nrows(),m.nrows()), vr(m.nrows(),m.nrows());
@ -1812,6 +1817,11 @@ for(int i=0; i<m.nrows(); ++i) for(int j=0; j<m.nrows(); ++j)
if(i!=j) err += abs(m3(i,j)); if(i!=j) err += abs(m3(i,j));
cout <<"offdiagonality error 2 = "<<err<<endl; cout <<"offdiagonality error 2 = "<<err<<endl;
err=0;
for(int i=0; i<m.nrows(); ++i) err += abs(m3(i,i) - ww[i]);
cout <<"eigenvalue error = "<<err<<endl;
} }

27
vec.cc
View File

@ -27,11 +27,8 @@
#include <errno.h> #include <errno.h>
#include "vec.h" #include "vec.h"
#include "qsort.h" #include "qsort.h"
#include <unistd.h>
extern "C" {
extern ssize_t read(int, void *, size_t);
extern ssize_t write(int, const void *, size_t);
}
namespace LA { namespace LA {
@ -541,7 +538,7 @@ template<>
void NRVec<double>::gemv(const double beta, const NRMat<double> &A, void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
const char trans, const double alpha, const NRVec &x) { const char trans, const double alpha, const NRVec &x) {
#ifdef DEBUG #ifdef DEBUG
if((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); } if((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
#endif #endif
SAME_LOC3(*this, x, A); SAME_LOC3(*this, x, A);
copyonwrite(); copyonwrite();
@ -549,10 +546,10 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
#ifdef CUDALA #ifdef CUDALA
if(location==cpu){ if(location==cpu){
#endif #endif
cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1); cblas_dgemv(CblasRowMajor, (tolower(trans)=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1); cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
TEST_CUBLAS("cublasDgemv"); TEST_CUBLAS("cublasDgemv");
} }
#endif #endif
@ -572,7 +569,7 @@ template<>
void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A, void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
const char trans, const double alpha, const NRVec<complex<double> > &x) { const char trans, const double alpha, const NRVec<complex<double> > &x) {
#ifdef DEBUG #ifdef DEBUG
if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); } if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
#endif #endif
SAME_LOC3(*this, x, A); SAME_LOC3(*this, x, A);
copyonwrite(); copyonwrite();
@ -580,16 +577,16 @@ void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
#ifdef CUDALA #ifdef CUDALA
if(location==cpu){ if(location==cpu){
#endif #endif
cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans), cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
A.nrows(), A.ncols(), alpha, A, A.ncols(), (double *)x.v, 2, beta, (double *)v, 2); A.nrows(), A.ncols(), alpha, A, A.ncols(), (double *)x.v, 2, beta, (double *)v, 2);
cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans), cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
A.nrows(), A.ncols(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2); A.nrows(), A.ncols(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2); cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
TEST_CUBLAS("cublasDgemv"); TEST_CUBLAS("cublasDgemv");
cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2); cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
TEST_CUBLAS("cublasDgemv"); TEST_CUBLAS("cublasDgemv");
} }
#endif #endif
@ -611,14 +608,14 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
const NRMat<complex<double> > &A, const char trans, const NRMat<complex<double> > &A, const char trans,
const complex<double> alpha, const NRVec<complex<double> > &x) { const complex<double> alpha, const NRVec<complex<double> > &x) {
#ifdef DEBUG #ifdef DEBUG
if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); } if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
#endif #endif
SAME_LOC3(*this, x, A); SAME_LOC3(*this, x, A);
copyonwrite(); copyonwrite();
#ifdef CUDALA #ifdef CUDALA
if(location == cpu){ if(location == cpu){
#endif #endif
cblas_zgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans), cblas_zgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
A.nrows(), A.ncols(), &alpha, A, A.ncols(), x.v, 1, &beta, v, 1); A.nrows(), A.ncols(), &alpha, A, A.ncols(), x.v, 1, &beta, v, 1);
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
@ -626,7 +623,7 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag()); const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
const cuDoubleComplex _beta = make_cuDoubleComplex(beta.real(), beta.imag()); const cuDoubleComplex _beta = make_cuDoubleComplex(beta.real(), beta.imag());
cublasZgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), cublasZgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(),
_alpha, (cuDoubleComplex*)(A[0]), A.ncols(), (cuDoubleComplex*)(x.v), 1, _beta, (cuDoubleComplex*)v, 1); _alpha, (cuDoubleComplex*)(A[0]), A.ncols(), (cuDoubleComplex*)(x.v), 1, _beta, (cuDoubleComplex*)v, 1);
TEST_CUBLAS("cublasZgemv"); TEST_CUBLAS("cublasZgemv");
} }

10
vec.h
View File

@ -149,10 +149,10 @@ public:
#endif #endif
//! create separate copy of the data corresponding to this vector //! create separate copy of the data corresponding to this vector
void copyonwrite(); void copyonwrite(bool detachonly=false);
//! purge this vector //! purge this vector
void clear() { copyonwrite(); LA_traits<T>::clear(v, nn); }; void clear() { copyonwrite(true); LA_traits<T>::clear(v, nn); };
//! assignment operator assigns given vector //! assignment operator assigns given vector
NRVec& operator=(const NRVec &rhs); NRVec& operator=(const NRVec &rhs);
@ -806,7 +806,7 @@ NRVec<T>::~NRVec() {
* make own copy of the underlying data connected with this vector * make own copy of the underlying data connected with this vector
******************************************************************************/ ******************************************************************************/
template <typename T> template <typename T>
void NRVec<T>::copyonwrite() { void NRVec<T>::copyonwrite(bool detachonly) {
if(!count) laerror("copyonwrite of an undefined vector"); if(!count) laerror("copyonwrite of an undefined vector");
if(*count > 1) { if(*count > 1) {
(*count)--; (*count)--;
@ -817,12 +817,12 @@ void NRVec<T>::copyonwrite() {
if(location == cpu){ if(location == cpu){
#endif #endif
newv = new T[nn]; newv = new T[nn];
memcpy(newv, v, nn*sizeof(T)); if(!detachonly) memcpy(newv, v, nn*sizeof(T));
#ifdef CUDALA #ifdef CUDALA
}else{ }else{
newv = (T *) gpualloc(nn*sizeof(T)); newv = (T *) gpualloc(nn*sizeof(T));
if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRVec<T>::copyonwrite()"); if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRVec<T>::copyonwrite()");
cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1); if(!detachonly) cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
TEST_CUBLAS("cublasScopy");//"NRVec<T>::copyonwrite()" TEST_CUBLAS("cublasScopy");//"NRVec<T>::copyonwrite()"
} }
#endif #endif