*** empty log message ***

jiri 2013-11-04 14:56:39 +00:00
parent a9e30620f0
commit 80fe44fab2
18 changed files with 505 additions and 308 deletions

View File

@ -1,7 +1,33 @@
04.11.2013 added conjugateme() for corder=1 in complex diagonalize() and
gdiagonalize() to get correct eigenvector convention for C-storage
30.10.2013 macros for consistent lowercase and uppercase of character
parameters for case-sensitive BLAS and LAPACK; case-insensitivity of these in vec.cc and mat.cc (cublas still not treated)
29.10.2013 included la_traits.h in nonclass.c to get correct extern "C" of cblas
14.10.2013 added operator% to bitvector
01.06.2012 more efficient clear() by detachonly parameter to copyonwrite()
14.03.2012 fixed overflow in product of dimensions of NRMat and NRSMat
13.03.2012 symmetry of integrals with different spins added to fourindex.h
23.02.2012 included unistd.h
23.02.2012 fixed max read/write size in multiget and multiput to 1GB
24.01.2012 Improved DIIS (L.V.)
19.01.2012 Fixed location for gpu in nonclass.cc
02.06.2011 In oplus() =0 replaced by clear() to work on non-square matrices (J.P.)
01.02.2011 Added trace2 for complex matrices by L. Veis
18.01.2011 Minor bugfixes and compatibility with Intel C++ compiler by Roman Curik
28.12.2010 Generalized diagonalization and functions of general complex matrices
21.12.2010 Changed to size_t in matrix put,get to prevent overflow
08.12.2010 Deallocate method added to LA_traits, used for memory saving in matrix exp
24.11.2010 Added checking for integer overflow in exptimes
23.10.2010 Fixed dependency on atlas for static libraries in configure.ac
27.09.2010 Seed of CSRMat class added.
22.09.2010 Allowed formal operator[] on gpu matrices
22.09.2010 Added submatrix() to SparseSMat
22.09.2010 Zgerc and zgeru implemented in non-cblas version
21.09.2010 Fixed a bug in laerror macro causing sometimes compilation problems
08.09.2010 RELEASE 0.6
08.09.2010 Doxygen documentation by M. Sulc
08.09.2010 Doxygen documentation for NRVec, NRMat, NRSMat classes contributed by M. Sulc
08.09.2010 Extended CUBLAS support for NRVec, NRMat, NRSMat classes contributed by M. Sulc
08.09.2010 Minor bugfixes contributed by M. Sulc
08.09.2010 Minor bugfixes and improvements contributed by M. Sulc
25.06.2010 Added proof-of-concept CUBLAS support for NRVec, NRMat, NRSMat
24.06.2010 Fixed a memory leak existing when MATPTR was defined
18.06.2010 added autoconf support for BLAS+LAPACK compiled with 64-bit integers and for CUBLAS
@ -60,3 +86,4 @@ xx.08.2008 fixed wrong permutation symmetry in previously unused (and untested)
11.03.2008 added cblas_idamax replacement for non-cblas
05.03.2008 fixed transposed bug in inverse() with non-cblas
26.02.2008 INITIAL RELEASE 0.1

View File

@ -130,7 +130,7 @@ return *this;
/*number of ones in a binary number, from "Hacker's delight" book*/
#ifdef LONG_IS_32
static unsigned long word_popul(unsigned long x)
static unsigned int word_popul(unsigned long x)
{
x -= ((x>>1)&0x55555555);
x = (x&0x33333333) + ((x>>2)&0x33333333);
@ -140,9 +140,10 @@ x+= (x>>16);
return x&0x3f;
}
#else
static unsigned long word_popul(unsigned long x)
//@@@@ use an efficient trick
static unsigned int word_popul(unsigned long x)
{
unsigned long s=0;
unsigned int s=0;
for(int i=0; i<64; ++i)
{
if(x&1) ++s;
@ -156,7 +157,6 @@ return s;
unsigned int bitvector::population(const unsigned int before) const
{
//@@@before
int i;
unsigned int s=0;
for(i=0; i<nn-1; ++i) s+=word_popul(v[i]);
@ -170,4 +170,21 @@ if(modulo)
return s+word_popul(a);
}
unsigned int bitvector::operator%(const bitvector &y) const
{
if(nn!=y.nn) laerror("incompatible size in bitdifference");
unsigned int s=0;
for(int i=0; i<nn-1; ++i) s+=word_popul(v[i]^y.v[i]);
bitvector_block a=v[nn-1]^y.v[nn-1];
if(modulo)
{
bitvector_block mask= ~((bitvector_block)0);
mask <<=modulo;
a &= ~mask;
}
return s+word_popul(a);
}
}//namespace
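
A side note on the @@@@ TODO above: the 64-bit branch of word_popul() still counts bits one at a time. A minimal sketch of the branch-free SWAR count from the same "Hacker's Delight" book, extended to 64 bits and assuming unsigned long/bitvector_block is a 64-bit type on this branch (an illustration only, not part of the commit):
static unsigned int word_popul64(unsigned long x)
{
x -= (x>>1) & 0x5555555555555555UL;                               // 2-bit counts
x = (x & 0x3333333333333333UL) + ((x>>2) & 0x3333333333333333UL); // 4-bit counts
x = (x + (x>>4)) & 0x0f0f0f0f0f0f0f0fUL;                          // byte counts
return (unsigned int)((x * 0x0101010101010101UL) >> 56);          // sum of the byte counts
}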

View File

@ -55,7 +55,7 @@ public:
void reset(const unsigned int i) {v[i/blockbits] &= ~(1<<(i%blockbits));};
const bool get(const unsigned int i) {return (v[i/blockbits] >>(i%blockbits))&1;};
const bool assign(const unsigned int i, const bool r) {if(r) set(i); else reset(i); return r;};
void clear() {copyonwrite(); memset(v,0,nn*sizeof(bitvector_block));};
void clear() {copyonwrite(true); memset(v,0,nn*sizeof(bitvector_block));};
void fill() {memset(v,0xff,nn*sizeof(bitvector_block));};
bool operator!=(const bitvector &rhs) const;
bool operator==(const bitvector &rhs) const {return !(*this != rhs);};
@ -70,13 +70,13 @@ public:
bitvector operator&(const bitvector &rhs) const {return bitvector(*this) &= rhs;};
bitvector operator|(const bitvector &rhs) const {return bitvector(*this) |= rhs;};
bitvector operator^(const bitvector &rhs) const {return bitvector(*this) ^= rhs;};
unsigned int population(const unsigned int before=0) const; //@@@number of 1's
unsigned int operator%(const bitvector &y) const; //number of differing bits
unsigned int population(const unsigned int before=0) const; //number of 1's
//extended, truncated const i.e. not on *this but return new entity, take care of modulo's bits
//logical shifts <<= >>= << >> not implemented yet
//logical rotations not implemented yet
};
extern std::ostream & operator<<(std::ostream &s, const bitvector &x);
extern std::istream & operator>>(std::istream &s, bitvector &x);
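
A usage sketch for the new operator% (number of differing bits, i.e. Hamming distance). The constructor taking a bit count and the LA namespace are assumed from the rest of the header, which this hunk does not show:
#include "bitvector.h"

int hamming_example(void)
{
LA::bitvector a(128), b(128);  // assumed: constructor taking the number of bits
a.clear(); b.clear();
a.set(3); a.set(70);
b.set(3); b.set(71);
return a % b;                  // population of a XOR b -> 2 (bits 70 and 71 differ)
}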

View File

@ -52,8 +52,9 @@ INSTANTIZE(complex<double>)
*/
//// forced instantization of functions in the header in the corresponding object file
template class CSRMat<double>;
template class CSRMat<complex<double> >;
//@@@@@template class CSRMat<double>;
//@@@@template class CSRMat<complex<double> >;
//@@@@ unfinished class commented out

View File

@ -11,3 +11,6 @@
#undef FORINT
#define FINT int
#endif
#define BLAS_FORTRANCASE(x) toupper(x)
#define LAPACK_FORTRANCASE(x) toupper(x)
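
These macros fix the case of the single-character arguments ('U', 'N', 'T', ...) passed to case-sensitive Fortran BLAS/LAPACK symbols; noncblas.cc below uses them as char U = BLAS_FORTRANCASE('u'). A hypothetical override for a library built with lowercase character arguments would presumably look like this (assumption, not part of the commit):
#include <ctype.h>

#undef BLAS_FORTRANCASE
#undef LAPACK_FORTRANCASE
#define BLAS_FORTRANCASE(x) tolower(x)
#define LAPACK_FORTRANCASE(x) tolower(x)

/* call sites stay unchanged, e.g.
 *   char U = BLAS_FORTRANCASE('u');
 *   FORNAME(dspmv)(&U, &N, &alpha, Ap, X, &incX, &beta, Y, &incY);
 */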

View File

@ -104,10 +104,10 @@ __attribute__((packed))
//later add symmetry of complex integrals
typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5} fourindexsymtype; //only permutation-nonequivalent elements are stored
typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5, twoelectronrealmullikanAB=6 } fourindexsymtype; //only permutation-nonequivalent elements are stored
// these should actually be static private members of the fourindex class, but leads to an ICE on gcc3.2
static const int fourindex_n_symmetrytypes=6;
static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8};
static const int fourindex_n_symmetrytypes=7;
static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8,4};
static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]=
{
{{0,1,2,3,1}},
@ -116,6 +116,7 @@ static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]=
{{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1}},
{{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1},{2,3,0,1,1},{3,2,0,1,-1},{2,3,1,0,-1},{3,2,1,0,1}},
{{0,1,2,3,1}}, //T2IjAb_aces is like nosymmetry but different index ranges
{{0,1,2,3,1},{1,0,2,3,1},{0,1,3,2,1},{1,0,3,2,1}},
};
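
Reading the new table row: each {p0,p1,p2,p3,s} entry permutes the four indices and multiplies by the sign s. For twoelectronrealmullikanAB (the different-spin case added per the 13.03.2012 changelog entry) the four entries encode, for real orbitals,
/* (i j | k l) = (j i | k l) = (i j | l k) = (j i | l k),  all with sign +1,
 * while the bra-ket swap (i j | k l) -> (k l | i j) of the same-spin Mulliken
 * case is absent because bra and ket now carry different spins; hence
 * fourindex_permnumbers[6] == 4 rather than 8. */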
@ -669,6 +670,108 @@ typedef typename LA_traits<T>::normtype normtype;
//make it as a derived class in order to be able to use it in a base class context - "supermatrix" operations
template<class T, class I>
class fourindex_dense<twoelectronrealmullikanAB,T,I> : public NRMat<T> {
public:
fourindex_dense(): NRMat<T>() {};
explicit fourindex_dense(const int n): NRMat<T>(n*(n+1)/2,n*(n+1)/2) {};
fourindex_dense(const NRMat<T> &rhs): NRMat<T>(rhs) {}; //be able to convert the parent class transparently to this
fourindex_dense(const T &a, const int n): NRMat<T>(a,n*(n+1)/2,n*(n+1)/2) {};
fourindex_dense(const T *a, const int n): NRMat<T>(a,n*(n+1)/2,n*(n+1)/2) {};
//and also construct it from sparse and externally stored fourindex classes
//it seems not possible to nest template<class I> just for the two constructors
fourindex_dense(const fourindex<I,T> &rhs);
fourindex_dense(const fourindex_ext<I,T> &rhs);
T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l);
const T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const;
void resize(const int n) {(*this).NRMat<T>::resize(n*(n+1)/2,n*(n+1)/2);};
void putext(int f, T thr=1e-15);
int nbas() const {return (int)std::sqrt(2*(*this).nrows());};
};
template<class T, class I>
void fourindex_dense<twoelectronrealmullikanAB,T,I>::putext(int f, T thr)
{
T y;
for(int i=1; i<=nbas(); ++i) for(int j=1; j<=i; ++j)
for(int k=1; k<=nbas(); ++k) for(int l=1; l<=k; ++l)
if((y=abs((*this)(i,j,k,l))) > thr)
{
matel4stored<I,T> x;
x.elem= y;
x.index.indiv.i=i;
x.index.indiv.j=j;
x.index.indiv.k=k;
x.index.indiv.l=l;
if(sizeof(matel4stored<I,T>) != write(f,&x,sizeof(matel4stored<I,T>)) )
laerror("write error in putext");
}
}
template<class T, class I>
fourindex_dense<twoelectronrealmullikanAB,T,I>::fourindex_dense(const fourindex<I,T> &rhs) : NRMat<T>((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2)
{
if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch");
typename fourindex<I,T>::iterator p;
#ifdef DEBUG
unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j);
unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l);
if (IJ<0 || IJ>=(unsigned int)NRMat<T>::nn || KL<0 || KL>=(unsigned int)NRMat<T>::mm) laerror("fourindex_dense index out of range in constructor");
#endif
for(p=rhs.begin(); p!= rhs.end(); ++p) (*this)(p->index.indiv.i,p->index.indiv.j,p->index.indiv.k,p->index.indiv.l) = p->elem;
}
template<class T, class I>
fourindex_dense<twoelectronrealmullikanAB,T,I>::fourindex_dense(const fourindex_ext<I,T> &rhs) : NRMat<T>((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2)
{
if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch");
typename fourindex_ext<I,T>::iterator p;
for(p=rhs.begin(); p!= rhs.end(); ++p)
{
#ifdef DEBUG
unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j);
unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l);
if (IJ<0 || IJ>=(unsigned int)NRMat<T>::nn || KL<0 || KL>=(unsigned int)NRMat<T>::mm) laerror("fourindex_dense index out of range in constructor");
#endif
(*this)(p->index.indiv.i,p->index.indiv.j ,p->index.indiv.k,p->index.indiv.l) = p->elem;
}
}
template<class T, class DUMMY>
T& fourindex_dense<twoelectronrealmullikanAB,T,DUMMY>::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l)
{
int I = SMat_index_1(i,j);
int J = SMat_index_1(k,l);
//I,J act as indices of an NRMat
#ifdef DEBUG
if (*NRMat<T>::count != 1) laerror("lval (i,j,k,l) with count > 1 in fourindex_dense");
if (I<0 || I>=NRMat<T>::nn || J<0 || J>=NRMat<T>::mm) laerror("fourindex_dense index out of range");
if (!NRMat<T>::v) laerror("access to unallocated fourindex_dense");
#endif
return NRMat<T>::operator()(I,J);
}
template<class T, class DUMMY>
const T& fourindex_dense<twoelectronrealmullikanAB,T,DUMMY>::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const
{
int I = SMat_index_1(i,j);
int J = SMat_index_1(k,l);
//I,J act as indices of an NRMat
#ifdef DEBUG
if (I<0 || I>=NRMat<T>::nn || J<0 || J>=NRMat<T>::mm) laerror("fourindex_dense index out of range");
if (!NRMat<T>::v) laerror("access to unallocated fourindex_dense");
#endif
return NRMat<T>::operator()(I,J);
}
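
For orientation, a sketch of the packing the two operator() overloads above implement. SMat_index_1 is defined elsewhere in the library; the formula below is the usual 1-based triangular packing and is an assumption here, not the library code:
// hypothetical stand-in for SMat_index_1: 0-based position of the ordered pair (i,j), i >= j >= 1
static inline unsigned int tri_index_1(unsigned int i, unsigned int j)
{
if(i < j) {unsigned int t = i; i = j; j = t;}
return i*(i-1)/2 + (j-1);
}
// element (i j | k l) then sits at row I = tri_index_1(i,j), column J = tri_index_1(k,l)
// of the n(n+1)/2 x n(n+1)/2 NRMat allocated by fourindex_dense(n)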
////////////////////
template<class T, class I>
class fourindex_dense<twoelectronrealmullikan,T,I> : public NRSMat<T> {
public:
fourindex_dense(): NRSMat<T>() {};

View File

@ -35,6 +35,7 @@
#include <fstream>
#include <limits>
#include <complex>
#include <unistd.h>
//using namespace std;
@ -56,7 +57,7 @@ extern "C" {
#include "noncblas.h"
#else
extern "C" {
#include "clapack.h"
#include "atlas/clapack.h"
}
#endif
@ -213,7 +214,7 @@ typedef C normtype;
typedef C realtype;
typedef complex<C> complextype;
static inline C sqrabs(const complex<C> x) { return x.real()*x.real()+x.imag()*x.imag();}
static inline bool gencmp(const complex<C> *x, const complex<C> *y, int n) {return memcmp(x,y,n*sizeof(complex<C>));}
static inline bool gencmp(const complex<C> *x, const complex<C> *y, size_t n) {return memcmp(x,y,n*sizeof(complex<C>));}
static bool bigger(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
static bool smaller(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
static inline normtype norm (const complex<C> &x) {return std::abs(x);}
@ -225,9 +226,10 @@ static void multiget(size_t n,int fd, complex<C> *x, bool dimensions=0)
size_t total=0;
size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and read at most 1GB at once
ssize_t r;
size_t nn;
do{
r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
r=read(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
else total += r/sizeof(complex<C>);
if(r%sizeof(complex<C>)) laerror("read error 2");
}
@ -238,16 +240,17 @@ static void multiput(size_t n, int fd, const complex<C> *x, bool dimensions=0)
size_t total=0;
size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and write at most 1GB at once
ssize_t r;
size_t nn;
do{
r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");}
r=write(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");}
else total += r/sizeof(complex<C>);
if(r%sizeof(complex<C>)) laerror("write error 2");
}
while(total < n);
}
static void copy(complex<C> *dest, complex<C> *src, unsigned int n) {memcpy(dest,src,n*sizeof(complex<C>));}
static void clear(complex<C> *dest, unsigned int n) {memset(dest,0,n*sizeof(complex<C>));}
static void copy(complex<C> *dest, complex<C> *src, size_t n) {memcpy(dest,src,n*sizeof(complex<C>));}
static void clear(complex<C> *dest, size_t n) {memset(dest,0,n*sizeof(complex<C>));}
static void copyonwrite(complex<C> &x) {};
static void clearme(complex<C> &x) {x=0;};
static void deallocate(complex<C> &x) {};
@ -266,7 +269,7 @@ typedef C normtype;
typedef C realtype;
typedef complex<C> complextype;
static inline C sqrabs(const C x) { return x*x;}
static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));}
static inline bool gencmp(const C *x, const C *y, size_t n) {return memcmp(x,y,n*sizeof(C));}
static inline bool bigger(const C &x, const C &y) {return x>y;}
static inline bool smaller(const C &x, const C &y) {return x<y;}
static inline normtype norm (const C &x) {return std::abs(x);}
@ -278,9 +281,10 @@ static void multiget(size_t n,int fd, C *x, bool dimensions=0)
size_t total=0;
size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and read at most 1GB at once
ssize_t r;
size_t nn;
do{
r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(C));
if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");}
r=read(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(C));
if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
else total += r/sizeof(C);
if(r%sizeof(C)) laerror("read error 2");
}
@ -291,16 +295,17 @@ static void multiput(size_t n, int fd, const C *x, bool dimensions=0)
size_t total=0;
size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and write at most 1GB at once
ssize_t r;
size_t nn;
do{
r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(C));
if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");}
r=write(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(C));
if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");}
else total += r/sizeof(C);
if(r%sizeof(C)) laerror("write error 2");
}
while(total < n);
}
static void copy(C *dest, C *src, unsigned int n) {memcpy(dest,src,n*sizeof(C));}
static void clear(C *dest, unsigned int n) {memset(dest,0,n*sizeof(C));}
static void copy(C *dest, C *src, size_t n) {memcpy(dest,src,n*sizeof(C));}
static void clear(C *dest, size_t n) {memset(dest,0,n*sizeof(C));}
static void copyonwrite(C &x) {};
static void clearme(C &x) {x=0;};
static void deallocate(C &x) {};
@ -323,7 +328,7 @@ typedef X<C> producttype; \
typedef typename LA_traits<C>::normtype normtype; \
typedef X<typename LA_traits<C>::realtype> realtype; \
typedef X<typename LA_traits<C>::complextype> complextype; \
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
static bool gencmp(const C *x, const C *y, size_t n) {for(size_t i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
static inline bool bigger(const C &x, const C &y) {return x>y;} \
static inline bool smaller(const C &x, const C &y) {return x<y;} \
static inline normtype norm (const X<C> &x) {return x.norm();} \
@ -332,8 +337,8 @@ static void put(int fd, const X<C> &x, bool dimensions=1, bool transp=0) {x.put(
static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions,transp);} \
static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);} \
static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);} \
static void copy(C *dest, C *src, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i]=src[i];} \
static void clear(C *dest, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i].clear();}\
static void copy(C *dest, C *src, size_t n) {for(size_t i=0; i<n; ++i) dest[i]=src[i];} \
static void clear(C *dest, size_t n) {for(size_t i=0; i<n; ++i) dest[i].clear();}\
static void copyonwrite(X<C> &x) {x.copyonwrite();}\
static void clearme(X<C> &x) {x.clear();}\
static void deallocate(X<C> &x) {x.dealloc();}\
@ -359,7 +364,7 @@ typedef NRMat<C> producttype; \
typedef typename LA_traits<C>::normtype normtype; \
typedef X<typename LA_traits<C>::realtype> realtype; \
typedef X<typename LA_traits<C>::complextype> complextype; \
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
static bool gencmp(const C *x, const C *y, size_t n) {for(size_t i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
static inline bool bigger(const C &x, const C &y) {return x>y;} \
static inline bool smaller(const C &x, const C &y) {return x<y;} \
static inline normtype norm (const X<C> &x) {return x.norm();} \
@ -368,8 +373,8 @@ static void put(int fd, const X<C> &x, bool dimensions=1, bool transp=0) {x.put(
static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions);} \
static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);} \
static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);} \
static void copy(C *dest, C *src, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i]=src[i];} \
static void clear(C *dest, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i].clear();} \
static void copy(C *dest, C *src, size_t n) {for(size_t i=0; i<n; ++i) dest[i]=src[i];} \
static void clear(C *dest, size_t n) {for(size_t i=0; i<n; ++i) dest[i].clear();} \
static void copyonwrite(X<C> &x) {x.copyonwrite();} \
static void clearme(X<C> &x) {x.clear();} \
static void deallocate(X<C> &x) {x.dealloc();} \

View File

@ -87,7 +87,26 @@ extern "C" void ATL_xerbla(int p, char *rout, char *form, ...){
laerror(msg0);
}
int cblas_errprn(int ierr, int info, char *form, ...) {
#ifndef NONCBLAS
#include "cblas.h"
#include <stdarg.h>
extern "C" void cblas_xerbla(int p, const char *rout, const char *form, ...)
{
va_list argptr;
va_start(argptr, form);
if (p)
{
fprintf(stdout, "Parameter %d to routine %s was incorrect\n", p, rout);
fprintf(stderr, "Parameter %d to routine %s was incorrect\n", p, rout);
}
vfprintf(stdout, form, argptr);
vfprintf(stderr, form, argptr);
va_end(argptr);
laerror("terminating in cblas_xerbla");
}
extern "C" int cblas_errprn(int ierr, int info, char *form, ...) {
char msg0[1024], *msg;
va_list argptr;
va_start(argptr, form);
@ -98,5 +117,6 @@ int cblas_errprn(int ierr, int info, char *form, ...) {
laerror(msg0);
return 0;
}
#endif
}//namespace

mat.cc
View File

@ -26,11 +26,8 @@
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
extern "C" {
extern ssize_t read(int, void *, size_t);
extern ssize_t write(int, const void *, size_t);
}
namespace LA {
@ -77,14 +74,14 @@ const NRMat<T> NRMat<T>::otimes(const NRMat<T> &rhs, bool reversecolumns) const
{
T c = (*this)(i,j);
for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++)
r( i*rhs.nn + k, l*mm + j ) = c*rhs(k,l);
r( i*(size_t)rhs.nn + k, l*mm + j ) = c*rhs(k,l);
}
}else{
for(i=0;i<nn;i++) for(j=0;j<mm;j++)
{
T c=(*this)(i,j);
for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++)
r( i*rhs.nn+k, j*rhs.mm+l ) = c *rhs(k,l);
r( i*(size_t)rhs.nn+k, j*(size_t)rhs.mm+l ) = c *rhs(k,l);
}
}
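
The (size_t) casts introduced throughout this file implement the 14.03.2012 changelog entry: with int dimensions, an index expression such as i*mm+j is evaluated in int and wraps once the product exceeds INT_MAX (roughly a 46341 x 46341 matrix), while promoting one factor forces the whole computation into size_t. A minimal sketch of the pattern, independent of NRMat:
#include <cstddef>

// hypothetical illustration of the addressing pattern the casts fix
double element(const double *v, int i, int j, int mm)
{
// return v[i*mm + j];               // int arithmetic: wraps once i*mm > INT_MAX
return v[i*(std::size_t)mm + j];     // multiplication and addition done in size_t
}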
@ -108,7 +105,7 @@ const NRVec<T> NRMat<T>::row(const int i, int l) const {
#ifdef MATPTR
v[i]
#else
v + i*l
v + i*(size_t)l
#endif
, l);
return r;
@ -144,7 +141,7 @@ void NRMat<T>::put(int fd, bool dim, bool transp) const {
#ifdef MATPTR
v[i][j]
#else
v[i*mm+j]
v[i*(size_t)mm+j]
#endif
,dim ,transp);
}
@ -196,7 +193,7 @@ void NRMat<T>::get(int fd, bool dim, bool transp){
#ifdef MATPTR
v[i][j]
#else
v[i*mm+j]
v[i*(size_t)mm+j]
#endif
,dim,transp);
}
@ -476,13 +473,13 @@ NRMat<T> & NRMat<T>::operator-=(const T &a) {
******************************************************************************/
template <>
const NRMat<double> NRMat<double>::operator-() const {
const int nm = nn*mm;
const size_t nm = (size_t)nn*mm;
NRMat<double> result(nn, mm, getlocation());
#ifdef CUDALA
if(location == cpu) {
#endif
#ifdef MATPTR
for(register int i=0; i<nm; i++) result.v[0][i] = -v[0][i];
for(register size_t i=0; i<nm; i++) result.v[0][i] = -v[0][i];
#else
memcpy(result.v, v, nm*sizeof(double));
cblas_dscal(nm, -1., result.v, 1);
@ -506,13 +503,13 @@ const NRMat<double> NRMat<double>::operator-() const {
******************************************************************************/
template <>
const NRMat<complex<double> > NRMat<complex<double> >::operator-() const {
const int nm = nn*mm;
const size_t nm = (size_t)nn*mm;
NRMat<complex<double> > result(nn, mm, getlocation());
#ifdef CUDALA
if(location == cpu) {
#endif
#ifdef MATPTR
for(register int i=0; i<nm; i++) result.v[0][i]= -v[0][i];
for(register size_t i=0; i<nm; i++) result.v[0][i]= -v[0][i];
#else
memcpy(result.v, v, nm*sizeof(complex<double>));
cblas_zscal(nm, &CMONE, result.v, 1);
@ -539,9 +536,9 @@ const NRMat<T> NRMat<T>::operator-() const {
NRMat<T> result(nn, mm, getlocation());
#ifdef MATPTR
for(register int i=0; i<nn*mm; i++) result.v[0][i] = -v[0][i];
for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[0][i] = -v[0][i];
#else
for(register int i=0; i<nn*mm; i++) result.v[i] = -v[i];
for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[i] = -v[i];
#endif
return result;
}
@ -562,11 +559,11 @@ const NRMat<T> NRMat<T>::operator&(const NRMat<T> &b) const {
if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem");
for(register int i=0; i<nn; i++){
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(v + i*mm), 1, (float*)(result.v + i*(mm + b.mm)), 1);
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(v + i*(size_t)mm), 1, (float*)(result.v + i*(size_t)(mm + b.mm)), 1);
TEST_CUBLAS("cublasScopy");
}
for(register int i=0; i<b.nn; i++){
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(b.v + i*b.mm), 1, (float*)(result.v + (nn + i)*(mm + b.mm)), 1);
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(b.v + i*(size_t)b.mm), 1, (float*)(result.v + (nn + i)*(mm + b.mm)), 1);
TEST_CUBLAS("cublasScopy");
}
}
@ -582,7 +579,7 @@ const NRMat<T> NRMat<T>::operator|(const NRMat<T> &b) const {
for (int j=0; j<mm; j++)
for (int k=0; k<b.nn; k++)
for (int l=0; l<b.mm; l++)
result[i*b.nn+k][j*b.mm+l] = (*this)[i][j]*b[k][l];
result[i*(size_t)b.nn+k][j*(size_t)b.mm+l] = (*this)[i][j]*b[k][l];
return result;
}
@ -689,7 +686,7 @@ const NRVec<double> NRMat<double>::rsum() const {
#ifdef CUDALA
}else{
for(register int i=0;i<nn;i++){
cublasDaxpy(mm, 1.0, v + i*mm, 1, result.v, 1);
cublasDaxpy(mm, 1.0, v + i*(size_t)mm, 1, result.v, 1);
TEST_CUBLAS("cublasDaxpy");
}
}
@ -714,7 +711,7 @@ const NRVec<complex<double> > NRMat<complex<double> >::rsum() const {
#ifdef CUDALA
}else{
for(register int i=0;i<nn;i++){
cublasZaxpy(mm, CUONE, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(result.v), 1);
cublasZaxpy(mm, CUONE, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(result.v), 1);
TEST_CUBLAS("cublasZaxpy");
}
}
@ -748,14 +745,14 @@ const NRMat<T> NRMat<T>::submatrix(const int fromrow, const int torow, const int
#ifdef MATPTR
memcpy(r.v[i - fromrow], v[i] + fromcol, m*sizeof(T));
#else
memcpy(r.v+(i - fromrow)*m, v + i*mm + fromcol, m*sizeof(T));
memcpy(r.v+(i - fromrow)*m, v + i*(size_t)mm + fromcol, m*sizeof(T));
#endif
}
#ifdef CUDALA
}else{
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
for(register int i=fromrow; i<=torow; ++i){
cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1);
cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*(size_t)mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1);
TEST_CUBLAS("cublasScopy");
}
}
@ -786,13 +783,13 @@ void NRMat<T>::storesubmatrix(const int fromrow, const int fromcol, const NRMat
#ifdef MATPTR
memcpy(v[i] + fromcol, rhs.v[i - fromrow], m*sizeof(T));
#else
memcpy(v + i*mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T));
memcpy(v + i*(size_t)mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T));
#endif
#ifdef CUDALA
}else{
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*mm + fromcol), 1);
cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*(size_t)mm + fromcol), 1);
}
#endif
}
@ -821,8 +818,8 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
v[j][i] = tmp;
#else
register int a, b;
a = i*mm + j;
b = j*mm + i;
a = i*(size_t)mm + j;
b = j*(size_t)mm + i;
T tmp = v[a];
v[a] = v[b];
v[b] = tmp;
@ -847,7 +844,7 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
******************************************************************************/
template<>
NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
const int nn_mm = nn*mm;
const size_t nn_mm = (size_t)nn*mm;
#ifdef CUDALA
if(location == cpu){
#endif
@ -888,7 +885,7 @@ NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.
******************************************************************************/
template<>
NRMat<double>::NRMat(const NRMat<complex<double> > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
const int nn_mm = nn*mm;
const size_t nn_mm = (size_t) nn*mm;
#ifdef CUDALA
if(location == cpu){
#endif
@ -1079,7 +1076,7 @@ const NRSMat<double> NRMat<double>::timestransposed() const {
#ifdef MATPTR
r(i, j) = cblas_ddot(mm, v[i], 1, v[j], 1);
#else
r(i, j) = cblas_ddot(mm, v + i*mm, 1, v + j*mm, 1);
r(i, j) = cblas_ddot(mm, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1);
#endif
}
}
@ -1087,7 +1084,7 @@ const NRSMat<double> NRMat<double>::timestransposed() const {
}else{
for(i=0; i<nn; ++i){
for(j=0; j<=i; ++j){
r(i, j) = cublasDdot(nn, v + i*mm, 1, v + j*mm, 1);
r(i, j) = cublasDdot(nn, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1);
TEST_CUBLAS("cublasDdot");
}
}
@ -1113,7 +1110,7 @@ const NRSMat<complex<double> > NRMat<complex<double> >::timestransposed() const
#ifdef MATPTR
cblas_zdotc_sub(nn, v[i], 1, v[j], 1, &r(i,j));
#else
cblas_zdotc_sub(nn, v + i*mm, 1, v + j*mm, 1, &r(i,j));
cblas_zdotc_sub(nn, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1, &r(i,j));
#endif
}
}
@ -1121,7 +1118,7 @@ const NRSMat<complex<double> > NRMat<complex<double> >::timestransposed() const
}else{
for(i=0; i<mm; ++i){
for(j=0; j<=i; ++j){
cuDoubleComplex val = cublasZdotc(nn, (const cuDoubleComplex *)(v + i*mm), 1, (const cuDoubleComplex *)(v + j*mm), 1);
cuDoubleComplex val = cublasZdotc(nn, (const cuDoubleComplex *)(v + i*(size_t)mm), 1, (const cuDoubleComplex *)(v + j*(size_t)mm), 1);
TEST_CUBLAS("cublasZdotc");
r(i, j) = *(reinterpret_cast<complex<double>*> (&val));
}
@ -1172,7 +1169,7 @@ void NRMat<double>::randomize(const double &x) {
}else{
NRMat<double> tmp(nn, mm, cpu);
double *tmp_data = tmp;
for(register int i=0; i<nn*mm; ++i){
for(register size_t i=0; i<(size_t)nn*mm; ++i){
tmp_data[i] = x*(2.*random()/(1. + RAND_MAX) - 1.);
}
tmp.moveto(this->location);
@ -1203,7 +1200,7 @@ void NRMat<complex<double> >::randomize(const double &x) {
}else{
NRMat<complex<double> > tmp(nn, mm, cpu);
complex<double> *tmp_data = tmp;
for(register int i=0; i<nn*mm; ++i){
for(register size_t i=0; i<(size_t)nn*mm; ++i){
const double re = x*(2.*random()/(1. + RAND_MAX) - 1.);
const double im = x*(2.*random()/(1. + RAND_MAX) - 1.);
tmp_data[i] = complex<double>(re, im);
@ -1226,10 +1223,10 @@ NRMat<double>& NRMat<double>::operator*=(const double &a) {
#ifdef CUDALA
if(location == cpu){
#endif
cblas_dscal(nn*mm, a, *this, 1);
cblas_dscal((size_t)nn*mm, a, *this, 1);
#ifdef CUDALA
}else{
cublasDscal(nn*mm, a, v, 1);
cublasDscal((size_t)nn*mm, a, v, 1);
TEST_CUBLAS("cublasDscal");
}
#endif
@ -1249,11 +1246,11 @@ NRMat<complex<double> >::operator*=(const complex<double> &a) {
#ifdef CUDALA
if(location == cpu){
#endif
cblas_zscal(nn*mm, &a, (*this)[0], 1);
cblas_zscal((size_t)nn*mm, &a, (*this)[0], 1);
#ifdef CUDALA
}else{
const cuDoubleComplex fac = *(reinterpret_cast<const cuDoubleComplex*> (&a));
cublasZscal(nn*mm, fac, (cuDoubleComplex *)v, 1);
cublasZscal((size_t)nn*mm, fac, (cuDoubleComplex *)v, 1);
TEST_CUBLAS("cublasZscal");
}
#endif
@ -1271,9 +1268,9 @@ NRMat<T> & NRMat<T>::operator*=(const T &a) {
NOT_GPU(*this);
copyonwrite();
#ifdef MATPTR
for(register int i=0; i< nn*mm; i++) v[0][i] *= a;
for(register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= a;
#else
for(register int i=0; i< nn*mm; i++) v[i] *= a;
for(register size_t i=0; i< (size_t)nn*mm; i++) v[i] *= a;
#endif
return *this;
}
@ -1294,10 +1291,10 @@ NRMat<double> & NRMat<double>::operator+=(const NRMat<double> &rhs) {
#ifdef CUDALA
if(location == cpu){
#endif
cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1);
cblas_daxpy((size_t)nn*mm, 1.0, rhs, 1, *this, 1);
#ifdef CUDALA
}else{
cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1);
cublasDaxpy((size_t)nn*mm, 1.0, rhs, 1, v, 1);
TEST_CUBLAS("cublasDaxpy");
}
#endif
@ -1320,10 +1317,10 @@ NRMat<complex<double> >::operator+=(const NRMat< complex<double> > &rhs) {
#ifdef CUDALA
if(location == cpu){
#endif
cblas_zaxpy(nn*mm, &CONE, rhs[0], 1, (*this)[0], 1);
cblas_zaxpy((size_t)nn*mm, &CONE, rhs[0], 1, (*this)[0], 1);
#ifdef CUDALA
}else{
cublasZaxpy(nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
cublasZaxpy((size_t)nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
}
#endif
return *this;
@ -1345,9 +1342,9 @@ NRMat<T> & NRMat<T>::operator+=(const NRMat<T> &rhs) {
copyonwrite();
#ifdef MATPTR
for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i];
for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i];
#else
for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i];
for(size_t i=0; i< (size_t)nn*mm; i++) v[i] += rhs.v[i];
#endif
return *this;
}
@ -1368,10 +1365,10 @@ NRMat<double> & NRMat<double>::operator-=(const NRMat<double> &rhs) {
#ifdef CUDALA
if(location == cpu){
#endif
cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1);
cblas_daxpy((size_t)nn*mm, -1.0, rhs, 1, *this, 1);
#ifdef CUDALA
}else{
cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1);
cublasDaxpy((size_t)nn*mm, -1.0, rhs, 1, v, 1);
}
#endif
return *this;
@ -1395,10 +1392,10 @@ NRMat< complex<double> >::operator-=(const NRMat< complex<double> > &rhs) {
#ifdef CUDALA
if(location == cpu){
#endif
cblas_zaxpy(nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1);
cblas_zaxpy((size_t)nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1);
#ifdef CUDALA
}else{
cublasZaxpy(nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
cublasZaxpy((size_t)nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
}
#endif
return *this;
@ -1421,9 +1418,9 @@ NRMat<T> & NRMat<T>::operator-=(const NRMat<T> &rhs) {
copyonwrite();
#ifdef MATPTR
for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i];
for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i];
#else
for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i];
for(size_t i=0; i<(size_t) nn*mm; i++) v[i] += rhs.v[i];
#endif
return *this;
}
@ -1693,10 +1690,10 @@ const double NRMat<double>::dot(const NRMat<double> &rhs) const {
#ifdef CUDALA
if(location == cpu){
#endif
ret = cblas_ddot(nn*mm, (*this)[0], 1, rhs[0], 1);
ret = cblas_ddot((size_t)nn*mm, (*this)[0], 1, rhs[0], 1);
#ifdef CUDALA
}else{
ret = cublasDdot(nn*mm, v, 1, rhs.v, 1);
ret = cublasDdot((size_t)nn*mm, v, 1, rhs.v, 1);
}
#endif
return ret;
@ -1721,10 +1718,10 @@ NRMat<complex<double> >::dot(const NRMat<complex<double> > &rhs) const {
#ifdef CUDALA
if(location == cpu){
#endif
cblas_zdotc_sub(nn*mm, (*this)[0], 1, rhs[0], 1, &ret);
cblas_zdotc_sub((size_t)nn*mm, (*this)[0], 1, rhs[0], 1, &ret);
#ifdef CUDALA
}else{
cuDoubleComplex val = cublasZdotc(nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
cuDoubleComplex val = cublasZdotc((size_t)nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
ret = *(reinterpret_cast<complex<double>*> (&val));
}
#endif
@ -1804,7 +1801,7 @@ void NRMat<double>::diagmultl(const NRVec<double> &rhs) {
for(register int i=0; i<nn; i++){ cblas_dscal(mm, rhs[i], (*this)[i], 1); }
#ifdef CUDALA
}else{
for(register int i=0; i<nn; i++){ cublasDscal(mm, rhs[i], v + i*mm, 1); }
for(register int i=0; i<nn; i++){ cublasDscal(mm, rhs[i], v + i*(size_t)mm, 1); }
}
#endif
}
@ -1830,7 +1827,7 @@ void NRMat< complex<double> >::diagmultl(const NRVec< complex<double> > &rhs) {
}else{
for(register int i=0; i<nn; i++){
const cuDoubleComplex alpha = make_cuDoubleComplex(rhs[i].real(), rhs[i].imag());
cublasZscal(mm, alpha, (cuDoubleComplex*)(v + i*mm), 1);
cublasZscal(mm, alpha, (cuDoubleComplex*)(v + i*(size_t)mm), 1);
}
}
#endif
@ -1913,7 +1910,7 @@ NRMat<double>::operator*(const NRSMat<double> &rhs) const {
#ifdef CUDALA
}else{
for(register int i=0; i<nn; i++){
cublasDspmv('U', mm, 1.0, rhs.v, v + i*mm, 1, 0.0, result.v + i*rhs_ncols, 1);
cublasDspmv('U', mm, 1.0, rhs.v, v + i*(size_t)mm, 1, 0.0, result.v + i*(size_t)rhs_ncols, 1);
}
}
#endif
@ -1947,7 +1944,7 @@ NRMat< complex<double> >::operator*(const NRSMat< complex<double> > &rhs) const
#ifdef CUDALA
}else{
for(register int i=0; i<nn; i++){
cublasZhpmv('U', mm, CUONE, (cuDoubleComplex*)rhs.v, (cuDoubleComplex*)(v + i*mm), 1, CUZERO, (cuDoubleComplex*)(result.v + i*rhs_ncols), 1);
cublasZhpmv('U', mm, CUONE, (cuDoubleComplex*)rhs.v, (cuDoubleComplex*)(v + i*(size_t)mm), 1, CUZERO, (cuDoubleComplex*)(result.v + i*(size_t)rhs_ncols), 1);
}
}
#endif
@ -1974,10 +1971,10 @@ NRMat<complex<double> >& NRMat<complex<double> >::conjugateme() {
#ifdef CUDALA
if(location == cpu){
#endif
cblas_dscal(mm*nn, -1.0, (double *)((*this)[0]) + 1, 2);
cblas_dscal((size_t)mm*nn, -1.0, (double *)((*this)[0]) + 1, 2);
#ifdef CUDALA
}else{
cublasDscal(mm*nn, -1.0, (double *)(this->v) + 1, 2);
cublasDscal((size_t)mm*nn, -1.0, (double *)(this->v) + 1, 2);
}
#endif
return *this;
@ -2048,12 +2045,12 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
const char transa, const NRMat<double> &b, const char transb,
const double &alpha) {
int k(transa=='n'?a.mm:a.nn);
int k(tolower(transa)=='n'?a.mm:a.nn);
#ifdef DEBUG
int l(transa=='n'?a.nn:a.mm);
int kk(transb=='n'?b.nn:b.mm);
int ll(transb=='n'?b.mm:b.nn);
int l(tolower(transa)=='n'?a.nn:a.mm);
int kk(tolower(transb)=='n'?b.nn:b.mm);
int ll(tolower(transb)=='n'?b.mm:b.nn);
if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<double>::gemm(...)");
if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm");
#endif
@ -2066,8 +2063,8 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
#ifdef CUDALA
if(location == cpu){
#endif
cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans),
(transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
cblas_dgemm(CblasRowMajor, (tolower(transa)=='n' ? CblasNoTrans : CblasTrans),
(tolower(transb)=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
a.mm, b , b.mm, beta, *this , mm);
#ifdef CUDALA
}else{
@ -2083,20 +2080,20 @@ void NRMat<complex<double> >::gemm(const complex<double> & beta,
const NRMat<complex<double> > & b, const char transb,
const complex<double> & alpha)
{
int k(transa=='n'?a.mm:a.nn);
int k(tolower(transa)=='n'?a.mm:a.nn);
#ifdef DEBUG
int l(transa=='n'?a.nn:a.mm);
int kk(transb=='n'?b.nn:b.mm);
int ll(transb=='n'?b.mm:b.nn);
int l(tolower(transa)=='n'?a.nn:a.mm);
int kk(tolower(transb)=='n'?b.nn:b.mm);
int ll(tolower(transb)=='n'?b.mm:b.nn);
if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<complex<double> >::gemm(...)");
#endif
if (alpha==CZERO && beta==CONE) return;
copyonwrite();
cblas_zgemm(CblasRowMajor,
(transa=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)),
(transb=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)),
(tolower(transa)=='n' ? CblasNoTrans : (tolower(transa)=='c'?CblasConjTrans:CblasTrans)),
(tolower(transb)=='n' ? CblasNoTrans : (tolower(transb)=='c'?CblasConjTrans:CblasTrans)),
nn, mm, k, &alpha, a , a.mm, b , b.mm, &beta, *this , mm);
}
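
With the tolower() calls above, the transpose characters accepted by gemm() are now case-insensitive, matching the usual BLAS convention. A usage sketch (the beta-first argument order is taken from the signature above):
#include "mat.h"

void gemm_example(void)
{
LA::NRMat<double> a(1.0, 4, 3), b(1.0, 3, 5), c(0.0, 4, 5);
c.gemm(0.0, a, 'n', b, 'N', 1.0);  // c := 1.0*a*b + 0.0*c; 'n'/'N' and 't'/'T' are now equivalent
}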
@ -2113,10 +2110,10 @@ const double NRMat<double>::norm(const double scalar) const {
#ifdef CUDALA
if(location == cpu){
#endif
return cblas_dnrm2(nn*mm, (*this)[0], 1);
return cblas_dnrm2((size_t)nn*mm, (*this)[0], 1);
#ifdef CUDALA
}else{
return cublasDnrm2(nn*mm, v, 1);
return cublasDnrm2((size_t)nn*mm, v, 1);
}
#endif
}
@ -2130,7 +2127,7 @@ const double NRMat<double>::norm(const double scalar) const {
#ifdef MATPTR
tmp = v[i][j];
#else
tmp = v[i*mm+j];
tmp = v[i*(size_t)mm+j];
#endif
if(i == j) tmp -= scalar;
sum += tmp*tmp;
@ -2152,10 +2149,10 @@ const double NRMat<complex<double> >::norm(const complex<double> scalar) const {
#ifdef CUDALA
if(location == cpu){
#endif
return cblas_dznrm2(nn*mm, (*this)[0], 1);
return cblas_dznrm2((size_t)nn*mm, (*this)[0], 1);
#ifdef CUDALA
}else{
return cublasDznrm2(nn*mm, (cuDoubleComplex*)v, 1);
return cublasDznrm2((size_t)nn*mm, (cuDoubleComplex*)v, 1);
}
#endif
}
@ -2168,7 +2165,7 @@ const double NRMat<complex<double> >::norm(const complex<double> scalar) const {
#ifdef MATPTR
tmp = v[i][j];
#else
tmp = v[i*mm+j];
tmp = v[i*(size_t)mm+j];
#endif
if(i == j) tmp -= scalar;
const double re = tmp.real();
@ -2195,10 +2192,10 @@ void NRMat<double>::axpy(const double alpha, const NRMat<double> &mat) {
#ifdef CUDALA
if(location == cpu){
#endif
cblas_daxpy(nn*mm, alpha, mat, 1, *this, 1);
cblas_daxpy((size_t)nn*mm, alpha, mat, 1, *this, 1);
#ifdef CUDALA
}else{
cublasDaxpy(nn*mm, alpha, mat, 1, *this, 1);
cublasDaxpy((size_t)nn*mm, alpha, mat, 1, *this, 1);
}
#endif
}
@ -2221,7 +2218,7 @@ void NRMat<complex<double> >::axpy(const complex<double> alpha,
#ifdef CUDALA
if(location == cpu){
#endif
cblas_zaxpy(nn*mm, &alpha, mat, 1, (*this)[0], 1);
cblas_zaxpy((size_t)nn*mm, &alpha, mat, 1, (*this)[0], 1);
#ifdef CUDALA
}else{
const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
@ -2245,7 +2242,7 @@ const T NRMat<T>::trace() const {
#ifdef MATPTR
for(register int i=0; i<nn; ++i) sum += v[i][i];
#else
for(register int i=0; i<nn*nn; i += (nn+1)) sum += v[i];
for(register size_t i=0; i<(size_t)nn*nn; i += (nn+1)) sum += v[i];
#endif
return sum;
}
@ -2554,7 +2551,7 @@ NRMat<double>& NRMat<double>::swap_rows(){
#ifdef CUDALA
}else{
for(register int i=0; i<n_pul; i++){
cublasDswap(mm, v + i*mm, 1, v + (nn - i - 1)*mm, 1);
cublasDswap(mm, v + i*(size_t)mm, 1, v + (nn - i - 1)*mm, 1);
TEST_CUBLAS("cublasDswap");
}
}
@ -2580,7 +2577,7 @@ NRMat<complex<double> >& NRMat<complex<double> >::swap_rows(){
#ifdef CUDALA
}else{
for(register int i=0; i<n_pul; i++){
cublasZswap(mm, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm), 1);
cublasZswap(mm, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm), 1);
TEST_CUBLAS("cublasZswap");
}
}
@ -2613,7 +2610,7 @@ NRMat<T>& NRMat<T>::swap_rows(){
}else{
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows");
for(register int i=0; i<n_pul; i++){
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*mm), 1, (float *)(v + (nn - i - 1)*mm), 1);
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*(size_t)mm), 1, (float *)(v + (nn - i - 1)*mm), 1);
TEST_CUBLAS("cublasSswap");
}
}
@ -2745,7 +2742,7 @@ NRMat<double>& NRMat<double>::swap_rows_cols(){
#ifdef CUDALA
}else{
for(register int i=0; i<n_pul; i++){
cublasDswap(mm, v + i*mm, 1, v + (nn - i - 1)*mm + mm - 1, -1);
cublasDswap(mm, v + i*(size_t)mm, 1, v + (nn - i - 1)*mm + mm - 1, -1);
TEST_CUBLAS("cublasDswap");
}
@ -2792,7 +2789,7 @@ NRMat<complex<double> >& NRMat<complex<double> >::swap_rows_cols(){
#ifdef CUDALA
}else{
for(register int i=0;i<n_pul;i++){
cublasZswap(mm, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm + mm - 1), -1);
cublasZswap(mm, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm + mm - 1), -1);
TEST_CUBLAS("cublasZswap");
}
if(nn & 1){
@ -2817,7 +2814,7 @@ template<typename T>
NRMat<T>& NRMat<T>::swap_rows_cols(){
const int n_pul = nn >> 1;
const int m_pul = mm >> 1;
const int dim = nn*mm;
const size_t dim = (size_t)nn*mm;
T *data_ptr;
T tmp;
@ -2837,7 +2834,7 @@ NRMat<T>& NRMat<T>::swap_rows_cols(){
}else{
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows_cols");
for(register int i=0; i<n_pul; i++){
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*mm), 1, (float *)(v + (nn - i - 1)*mm) - 1, -1);
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*(size_t)mm), 1, (float *)(v + (nn - i - 1)*mm) - 1, -1);
TEST_CUBLAS("cublasSswap");
}

mat.h
View File

@ -39,10 +39,10 @@ protected:
T *v;//!< pointer to the data stored contiguously in memory
#endif
int *count;//!< reference counter
public:
#ifdef CUDALA
GPUID location;
#endif
public:
friend class NRVec<T>;
friend class NRSMat<T>;
@ -89,16 +89,16 @@ public:
//! explicit constructor converting vector into a <code>NRMat<T></code> object
#ifdef MATPTR
explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){
if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
};
#else
explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0);
#endif
#ifdef MATPTR
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],nn*mm);} //memcmp for scalars else elementwise
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],(size_t)nn*mm);} //memcmp for scalars else elementwise
#else
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,nn*mm);} //memcmp for scalars else elementwise
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,(size_t)nn*mm);} //memcmp for scalars else elementwise
#endif
const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
@ -107,7 +107,7 @@ public:
inline int getcount() const {return count?*count:0;}
//! ensure that the data of this matrix are referenced exactly once
void copyonwrite();
void copyonwrite(bool detachonly=false);
/***************************************************************************//**
* routines for CUDA related stuff
@ -260,7 +260,7 @@ public:
//! get the number of columns
inline int ncols() const;
//! get the number of matrix elements
inline int size() const;
inline size_t size() const;
//! unformatted input
void get(int fd, bool dimensions = 1, bool transposed = false);
@ -274,8 +274,8 @@ public:
//! set all matrix elements equal to zero
void clear(){
if(nn&&mm){
copyonwrite();
LA_traits<T>::clear((*this)[0], nn*mm);
copyonwrite(true);
LA_traits<T>::clear((*this)[0], (size_t)nn*mm);
}
};
@ -379,7 +379,7 @@ template <typename T>
NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
T* p;
*count = 1;
const int nm = n*m;
const size_t nm = (size_t)n*m;
#ifdef CUDALA
location = (loc==undefined?DEFAULT_LOC:loc);
if(location == cpu) {
@ -408,7 +408,7 @@ NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count
******************************************************************************/
template <typename T>
NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
const int nm = n*m;
const size_t nm = (size_t)n*m;
T *p;
*count = 1;
@ -447,7 +447,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n),
******************************************************************************/
template <typename T>
NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) {
const int nm = n*m;
const size_t nm = (size_t)n*m;
T *p;
*count = 1;
@ -460,7 +460,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
p = v[0] = new T[nm];
for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
#else
p = v = new T[m*n];
p = v = new T[nm];
#endif
if (a != (T)0)
for (register int i=0; i<nm; i++) *p++ = a;
@ -483,7 +483,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
******************************************************************************/
template <typename T>
NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) {
const int nm = n*m;
const size_t nm = (size_t)n*m;
#ifdef CUDALA
location = DEFAULT_LOC;
#endif
@ -546,10 +546,10 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
*count = 1;
#ifdef MATPTR
v = new T*[nn];
v[0] = new T[mm*nn];
v[0] = new T[(size_t)mm*nn];
for (int i=1; i<nn; i++) v[i] = v[i-1] + mm;
#else
v = new T[mm*nn];
v = new T[(size_t)mm*nn];
#endif
#ifdef MATPTR
@ -561,7 +561,7 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
#else
for (i=0; i<nn; i++){
for (j=0; j<=i; j++){
v[i*nn + j] = v[j*nn + i] = rhs[k++];
v[i*(size_t)nn + j] = v[j*(size_t)nn + i] = rhs[k++];
}
}
#endif
@ -578,7 +578,7 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
template <typename T>
NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
{
if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
#ifdef CUDALA
location=rhs.location;
@ -628,7 +628,7 @@ inline T* NRMat<T>::operator[](const int i) {
#ifdef MATPTR
return v[i];
#else
return v + i*mm;
return v + i*(size_t)mm;
#endif
}
@ -646,7 +646,7 @@ inline const T* NRMat<T>::operator[](const int i) const {
#ifdef MATPTR
return v[i];
#else
return v + i*mm;
return v + i*(size_t)mm;
#endif
}
@ -668,7 +668,7 @@ inline T& NRMat<T>::operator()(const int i, const int j){
#ifdef MATPTR
return v[i][j];
#else
return v[i*mm + j];
return v[i*(size_t)mm + j];
#endif
}
@ -689,7 +689,7 @@ inline const T& NRMat<T>::operator()(const int i, const int j) const{
#ifdef MATPTR
return v[i][j];
#else
return v[i*mm + j];
return v[i*(size_t)mm + j];
#endif
}
@ -712,11 +712,11 @@ inline const T NRMat<T>::get_ij(const int i, const int j) const{
#ifdef MATPTR
return v[i][j];
#else
return v[i*mm + j];
return v[i*(size_t)mm + j];
#endif
#ifdef CUDALA
}else{
const int pozice = i*mm + j;
const size_t pozice = i*(size_t)mm + j;
gpuget(1, sizeof(T), v + pozice, &ret);
return ret;
}
@ -743,8 +743,8 @@ inline int NRMat<T>::ncols() const{
* @return number of elements
******************************************************************************/
template <typename T>
inline int NRMat<T>::size() const{
return nn*mm;
inline size_t NRMat<T>::size() const{
return (size_t)nn*mm;
}
/***************************************************************************//**
@ -795,7 +795,7 @@ inline const double NRMat<double>::amax() const{
#ifdef CUDALA
}else{
double ret(0.0);
const int pozice = cublasIdamax(nn*mm, v, 1) - 1;
const size_t pozice = cublasIdamax((size_t)nn*mm, v, 1) - 1;
TEST_CUBLAS("cublasIdamax");
gpuget(1, sizeof(double), v + pozice, &ret);
return ret;
@ -815,7 +815,7 @@ inline const double NRMat<double>::amin() const{
if(location == cpu){
#endif
// idamin seems not to be supported
const int nm = nn*mm;
const size_t nm = (size_t)nn*mm;
double val(0.0);
int index(-1);
ret = std::numeric_limits<double>::max();
@ -834,7 +834,7 @@ inline const double NRMat<double>::amin() const{
#endif
#ifdef CUDALA
}else{
const int pozice = cublasIdamin(nn*mm, v, 1) - 1;
const size_t pozice = cublasIdamin((size_t)nn*mm, v, 1) - 1;
TEST_CUBLAS("cublasIdamin");
gpuget(1, sizeof(double), v + pozice, &ret);
}
@ -860,7 +860,7 @@ inline const complex<double> NRMat<complex<double> >::amax() const{
#ifdef CUDALA
}else{
complex<double> ret(0.0, 0.0);
const int pozice = cublasIzamax(nn*mm, (cuDoubleComplex*)v, 1) - 1;
const size_t pozice = cublasIzamax((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1;
TEST_CUBLAS("cublasIzamax");
gpuget(1, sizeof(complex<double>), v + pozice, &ret);
return ret;
@ -881,7 +881,7 @@ inline const complex<double> NRMat<complex<double> >::amin() const{
if(location == cpu){
#endif
// idamin seems not to be supported
const int nm = nn*mm;
const size_t nm = (size_t)nn*mm;
int index(-1);
double val(0.0), min_val(0.0);
complex<double> z_val(0.0, 0.0);
@ -903,7 +903,7 @@ inline const complex<double> NRMat<complex<double> >::amin() const{
#endif
#ifdef CUDALA
}else{
const int pozice = cublasIzamin(nn*mm, (cuDoubleComplex*)v, 1) - 1;
const size_t pozice = cublasIzamin((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1;
TEST_CUBLAS("cublasIzamin");
gpuget(1, sizeof(complex<double>), v + pozice, &ret);
}
@ -991,7 +991,7 @@ NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs) {
* @see NRMat<T>::count, NRMat<T>::operator|=()
******************************************************************************/
template <typename T>
void NRMat<T>::copyonwrite() {
void NRMat<T>::copyonwrite(bool detachonly) {
if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0");
if(*count > 1){
(*count)--;
@ -1002,20 +1002,20 @@ void NRMat<T>::copyonwrite() {
#endif
#ifdef MATPTR
T **newv = new T*[nn];
newv[0] = new T[mm*nn];
memcpy(newv[0], v[0], mm*nn*sizeof(T));
newv[0] = new T[(size_t)mm*nn];
if(!detachonly) memcpy(newv[0], v[0], (size_t)mm*nn*sizeof(T));
v = newv;
for(register int i=1; i<nn; i++) v[i] = v[i-1] + mm;
#else
T *newv = new T[mm*nn];
memcpy(newv, v, mm*nn*sizeof(T));
T *newv = new T[(size_t)mm*nn];
if(!detachonly) memcpy(newv, v, (size_t)mm*nn*sizeof(T));
v = newv;
#endif
#ifdef CUDALA
}else{ //matrix is in GPU memory
T *newv = (T *) gpualloc(mm*nn*sizeof(T));
T *newv = (T *) gpualloc((size_t)mm*nn*sizeof(T));
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
if(!detachonly) cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
TEST_CUBLAS("cublasScopy");
v = newv;
}
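
The new detachonly flag is the 01.06.2012 changelog item: when clear() must detach a shared buffer (*count > 1), copying the old contents is wasted work because they are about to be zeroed, so copyonwrite(true) allocates fresh storage and skips the memcpy/cublasScopy. A sketch of the intended situation, assuming the copy constructor shares the buffer through the reference count as the logic above implies:
#include "mat.h"

void clear_example(void)
{
LA::NRMat<double> a(1.0, 1000, 1000);
LA::NRMat<double> b = a;  // shallow copy, *count == 2
b.clear();                // copyonwrite(true): b gets a new buffer without copying a's data, then zeroes it
// a keeps its original contents, b is all zeros
}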
@ -1082,14 +1082,14 @@ void NRMat<T>::resize(int n, int m) {
#endif
#ifdef MATPTR
v = new T*[nn];
v[0] = new T[m*n];
v[0] = new T[(size_t)m*n];
for (register int i=1; i< n; i++) v[i] = v[i-1] + m;
#else
v = new T[m*n];
v = new T[(size_t)m*n];
#endif
#ifdef CUDALA
}else{
v = (T *) gpualloc(n*m*sizeof(T));
v = (T *) gpualloc((size_t)n*m*sizeof(T));
}
#endif
return;
@ -1108,15 +1108,15 @@ void NRMat<T>::resize(int n, int m) {
delete[] v;
#ifdef MATPTR
v = new T*[nn];
v[0] = new T[m*n];
v[0] = new T[(size_t)m*n];
for (int i=1; i< n; i++) v[i] = v[i-1] + m;
#else
v = new T[m*n];
v = new T[(size_t)m*n];
#endif
#ifdef CUDALA
}else{
gpufree(v);
v=(T *) gpualloc(n*m*sizeof(T));
v=(T *) gpualloc((size_t)n*m*sizeof(T));
}
#endif
}
@ -1228,7 +1228,7 @@ public:
#ifdef MATPTR
return NRMat<T>::v[i - 1][j - 1];
#else
return NRMat<T>::v[(i-1)*NRMat<T>::mm+j-1];
return NRMat<T>::v[(i-1)*(size_t)NRMat<T>::mm+j-1];
#endif
}
@ -1258,11 +1258,11 @@ public:
#ifdef MATPTR
return NRMat<T>::v[i - 1][j - 1];
#else
return NRMat<T>::v[(i-1)*NRMat<T>::mm + (j-1)];
return NRMat<T>::v[(size_t)(i-1)*NRMat<T>::mm + (j-1)];
#endif
#ifdef CUDALA
}else{
const int pozice = (i-1)*NRMat<T>::mm + (j-1);
const size_t pozice = (size_t)(i-1)*NRMat<T>::mm + (j-1);
gpuget(1, sizeof(T), NRMat<T>::v + pozice, &ret);
return ret;
}
@ -1286,10 +1286,10 @@ NRMat<T>& NRMat<T>::operator^=(const NRMat<T> &rhs){
copyonwrite();// ensure that *count == 1
#ifdef MATPTR
for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i];
for (register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= rhs.v[0][i];
#else
const int Dim = nn*mm;
for(register int i=0; i<Dim; i++) v[i] *= rhs.v[i];
const size_t Dim = (size_t)nn*mm;
for(register size_t i=0; i<Dim; i++) v[i] *= rhs.v[i];
#endif
return *this;
}
@ -1320,14 +1320,14 @@ void NRMat<T>::moveto(const GPUID dest) {
T *vold = v;
if(dest == cpu){ //moving from GPU to CPU
v = new T[nn*mm];
gpuget(nn*mm, sizeof(T), vold, v);
v = new T[(size_t)nn*mm];
gpuget((size_t)nn*mm, sizeof(T), vold, v);
if(*count == 1){ gpufree(vold); }
else{ --(*count); count = new int(1); }
}else{ //moving from CPU to GPU
v = (T *) gpualloc(nn*mm*sizeof(T));
gpuput(nn*mm, sizeof(T), vold, v);
v = (T *) gpualloc((size_t)nn*mm*sizeof(T));
gpuput((size_t)nn*mm, sizeof(T), vold, v);
if(*count == 1) delete[] vold;
else{ --(*count); count = new int(1);}
}
@ -1351,3 +1351,4 @@ NRVECMAT_OPER2(Mat, -)
}//end of the LA-namespace
#endif/* _LA_MAT_H_ */

View File

@ -195,13 +195,14 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
{
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
char U = BLAS_FORTRANCASE('u');
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
FORNAME(dspmv) (&U,&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
#else
FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
FORNAME(dspmv) (&U,&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
#endif
}
@ -214,13 +215,14 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
{
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
char U = BLAS_FORTRANCASE('u');
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
FORNAME(zhpmv) (&U,&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
#else
FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
FORNAME(zhpmv) (&U,&N, alpha, Ap, X, &incX, beta, Y, &incY);
#endif
}
@ -298,6 +300,8 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
{
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
//swap a-b, m-n
char transb = BLAS_FORTRANCASE(TransB==CblasNoTrans?'N':'T');
char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T');
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
@@ -305,10 +309,10 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
const FINT ldctmp=ldc;
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
FORNAME(dgemm) (&transb,&transa,
&ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
#else
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
FORNAME(dgemm) (&transb,&transa,
&N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
#endif
}
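The comment "swap a-b, m-n" is the standard trick for driving a column-major Fortran dgemm from row-major data: the same buffer read column-major is the transpose of the row-major matrix, so C = op(A)*op(B) is obtained by asking Fortran for C^T = op(B)^T * op(A)^T with the operands and the M/N dimensions exchanged. A self-contained check of that identity with a naive stand-in for the Fortran routine (illustration only):

#include <cassert>

// Naive column-major GEMM without transposes or alpha/beta: C(m x n) = A(m x k) * B(k x n).
// It stands in for the Fortran dgemm that the wrapper above forwards to.
static void gemm_colmajor(int m, int n, int k,
                          const double *A, int lda,
                          const double *B, int ldb,
                          double *C, int ldc)
{
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      double s = 0;
      for (int p = 0; p < k; ++p) s += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = s;
    }
}

int main(void)
{
  // Row-major A(2x3), B(3x2), C(2x2).
  const int M = 2, N = 2, K = 3;
  double A[6] = {1, 2, 3, 4, 5, 6};
  double B[6] = {7, 8, 9, 10, 11, 12};
  double C[4], Cref[4];

  // Reference: plain row-major product.
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      double s = 0;
      for (int p = 0; p < K; ++p) s += A[i * K + p] * B[p * N + j];
      Cref[i * N + j] = s;
    }

  // The wrapper's trick: call the column-major routine with B and A swapped
  // and the dimensions N, M, K -- exactly the argument order used above --
  // which computes C^T = B^T * A^T; its column-major result is row-major C.
  gemm_colmajor(N, M, K, B, /*ldb=*/N, A, /*lda=*/K, C, /*ldc=*/N);

  for (int i = 0; i < 4; ++i) assert(C[i] == Cref[i]);
  return 0;
}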
@@ -322,6 +326,8 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
{
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
//swap a-b, m-n
char transb = BLAS_FORTRANCASE(TransB==CblasConjTrans?'C':(TransB==CblasNoTrans?'N':'T'));
char transa = BLAS_FORTRANCASE(TransA==CblasConjTrans?'C':(TransA==CblasNoTrans?'N':'T'));
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
@@ -329,12 +335,10 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
const FINT ldctmp=ldc;
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
FORNAME(zgemm) (&transb,&transa,
&ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
#else
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
FORNAME(zgemm) ( &transb,&transa,
&N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
#endif
}
@@ -347,19 +351,21 @@ void cblas_dgemv(const enum CBLAS_ORDER Order,
const double *X, const int incX, const double beta,
double *Y, const int incY)
{
char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T');
char transax = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N');
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ldatmp=lda;
const FINT incxtmp=incX;
const FINT incytmp=incY;
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
//swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
else FORNAME(dgemv) (&transax, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
#else
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
//swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
else FORNAME(dgemv) (&transax, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
#endif
}
@@ -374,15 +380,16 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted");
//swap n-m and toggle transposition
char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N');
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ldatmp=lda;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
FORNAME(zgemv) (&transa, &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
#else
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
FORNAME(zgemv) (&transa, &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
#endif
}


@@ -17,6 +17,7 @@
*/
//this can be safely included since it contains ifdefs NONCBLAS and NONCLAPACK anyway
#include "la_traits.h"
#include "noncblas.h"
#include "vec.h"
#include "smat.h"
@@ -196,7 +197,7 @@ static void linear_solve_do(NRSMat<double> &a, double *b, const int nrhs, const
FINT r, *ipiv;
a.copyonwrite();
ipiv = new FINT[n];
char U = 'U';
char U = LAPACK_FORTRANCASE('u');
#ifdef FORINT
const FINT ntmp=n;
const FINT nrhstmp=nrhs;
@@ -298,9 +299,9 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
const int A_rows = _A.nrows();
const int A_cols = _A.ncols();
const char fact = _eq?'E':'N';
const char trans = 'T';//because of c-order
char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const
const char fact = LAPACK_FORTRANCASE(_eq?'E':'N');
const char trans = LAPACK_FORTRANCASE('T');//because of c-order
char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const
if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){
laerror("linear_solve_x: invalid input matrices");
@@ -371,9 +372,9 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
const int A_rows = _A.nrows();
const int A_cols = _A.ncols();
const char fact = _eq?'E':'N';
const char trans = 'T';//because of c-order
char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const
const char fact = LAPACK_FORTRANCASE(_eq?'E':'N');
const char trans = LAPACK_FORTRANCASE('T');//because of c-order
char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const
if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){
laerror("linear_solve_x: invalid input matrices");
@@ -557,9 +558,9 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
if(b) b->copyonwrite();
FINT r = 0;
char U ='U';
char vectors = 'V';
if (!eivec) vectors = 'N';
char U =LAPACK_FORTRANCASE('u');
char vectors = LAPACK_FORTRANCASE('v');
if (!eivec) vectors = LAPACK_FORTRANCASE('n');
FINT LWORK = -1;
double WORKX;
FINT ldb=0; if(b) ldb=b->ncols();
@@ -588,7 +589,7 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
#endif
delete[] WORK;
if (vectors == 'V' && corder) a.transposeme(n);
if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) a.transposeme(n);
if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()");
if (r > 0) laerror("convergence problem in sygv/syev in diagonalize()");
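LWORK = -1 together with the scalar WORKX is the usual LAPACK workspace query: the first call only reports the optimal workspace size in WORK[0], the buffer is then allocated and the routine is called again to do the actual work. A sketch of the protocol with a mock routine (fake_syev is not the real dsyev):

#include <vector>
#include <iostream>

// Mock of the LAPACK workspace-query protocol (lwork = -1 means
// "report the optimal workspace size in work[0] and return").
static void fake_syev(int n, double *work, int lwork, int *info)
{
  if (lwork == -1) { work[0] = 3.0 * n; *info = 0; return; } // report size only
  // ... a real routine would diagonalize here ...
  *info = 0;
}

int main(void)
{
  const int n = 100;
  int info = 0;
  double workx = 0;                        // plays the role of WORKX above
  fake_syev(n, &workx, -1, &info);         // first call: workspace query
  const int lwork = (int)workx;
  std::vector<double> work(lwork);         // allocate what the routine asked for
  fake_syev(n, work.data(), lwork, &info); // second call: the actual computation
  std::cout << "optimal LWORK = " << lwork << std::endl;
  return 0;
}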
@@ -620,12 +621,13 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
if(b) b->copyonwrite();
FINT r = 0;
char U ='U';
char vectors = 'V';
if (!eivec) vectors = 'N';
char U =LAPACK_FORTRANCASE('U');
char vectors = LAPACK_FORTRANCASE('V');
if (!eivec) vectors = LAPACK_FORTRANCASE('n');
FINT LWORK = -1;
complex<double> WORKX;
FINT ldb=0; if(b) ldb=b->ncols();
std::cout << "test vectors "<<vectors<<std::endl;
// First call is to determine size of workspace
double *RWORK = new double[3*n+2];
@@ -652,7 +654,7 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
delete[] WORK;
delete[] RWORK;
if (vectors == 'V' && corder) a.transposeme(n);
if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) {a.transposeme(n); a.conjugateme();}
if (r < 0) laerror("illegal argument in hegv/heev in diagonalize()");
if (r > 0) laerror("convergence problem in hegv/heev in diagonalize()");
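A plausible reading of why the complex branch now needs conjugateme() on top of transposeme() for corder: the LAPACK routine works on the column-major view of the row-major buffer, i.e. effectively on A^T, and for a Hermitian A one has A^T = conj(A), whose eigenvectors are the complex conjugates of those of A; the real symmetric case needs only the transpose because there A^T = A. A tiny numerical check of that fact (illustration only):

#include <complex>
#include <cassert>

typedef std::complex<double> cd;

// If A v = lambda v with A Hermitian (lambda real), then conj(A) conj(v) = lambda conj(v),
// so the eigenvectors of the transposed/conjugated view come out conjugated.
int main(void)
{
  const cd I(0, 1);
  cd A[2][2] = {{2.0, I}, {-I, 2.0}};   // Hermitian
  cd v[2] = {I, 1.0};                   // eigenvector for lambda = 3
  const double lambda = 3.0;

  for (int i = 0; i < 2; ++i) {
    cd Av = A[i][0] * v[0] + A[i][1] * v[1];
    assert(std::abs(Av - lambda * v[i]) < 1e-14);
    cd Acvc = std::conj(A[i][0]) * std::conj(v[0])
            + std::conj(A[i][1]) * std::conj(v[1]);
    assert(std::abs(Acvc - lambda * std::conj(v[i])) < 1e-14);
  }
  return 0;
}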
@@ -684,8 +686,8 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
if(b) b->copyonwrite();
FINT r = 0;
char U = 'U';
char job = v ? 'v' : 'n';
char U = LAPACK_FORTRANCASE('u');
char job = LAPACK_FORTRANCASE(v ? 'v' : 'n');
double *WORK = new double[3*n];
FINT ldv=v?v->ncols():n;
@@ -730,8 +732,8 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
if(b) b->copyonwrite();
FINT r = 0;
char U = 'U';
char job = v ? 'v' : 'n';
char U = LAPACK_FORTRANCASE('u');
char job = LAPACK_FORTRANCASE(v ? 'v' : 'n');
complex<double> *WORK = new complex<double>[2*n];
double *RWORK = new double[3*n];
@@ -890,8 +892,8 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
const int sorttype, const int biorthonormalize,
NRMat<double> *b, NRVec<double> *beta)
{
if(n<=0) n = a.nrows();
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");
if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix");
if (n > wr.size())
laerror("inconsistent dimension of eigen vector in gdiagonalize()");
if (vl) if (n > vl->nrows() || n > vl->ncols())
@@ -911,8 +913,8 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
if (beta) beta->copyonwrite();
if (b) b->copyonwrite();
char jobvl = vl ? 'V' : 'N';
char jobvr = vr ? 'V' : 'N';
char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n');
char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n');
double work0;
FINT lwork = -1;
FINT r;
@@ -1055,8 +1057,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
NRMat<complex<double> > *b, NRVec<complex<double> > *beta)
{
if(n<=0) n = a.nrows();
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");
if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix");
if (n > w.size())
laerror("inconsistent dimension of eigen vector in gdiagonalize()");
if (vl) if (n > vl->nrows() || n > vl->ncols())
@@ -1075,8 +1077,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
if (beta) beta->copyonwrite();
if (b) b->copyonwrite();
char jobvl = vl ? 'V' : 'N';
char jobvr = vr ? 'V' : 'N';
char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n');
char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n');
complex<double> work0;
FINT lwork = -1;
FINT r;
@@ -1146,8 +1148,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
if (corder) {
if (vl) vl->transposeme(n);
if (vr) vr->transposeme(n);
if (vl) {vl->transposeme(n); vl->conjugateme();}
if (vr) {vr->transposeme(n); vr->conjugateme();}
}
}
@@ -1159,8 +1161,8 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
const bool corder, int n, const int sorttype, const int biorthonormalize,
NRMat<double> *b, NRVec<double> *beta)
{
if(n<=0) n = a.nrows();
if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() call for a non-square matrix");
if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() of too big submatrix");
NRVec<double> wr(n), wi(n);
NRMat<double> *rvl = 0;
@@ -1226,10 +1228,12 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
template<>
const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
{
NRMat<double> result(a.nrows(), a.ncols());
#ifdef CUDALA
if(location == cpu){
if(a.location == cpu){
#endif
NRMat<double> result(a.nrows(), a.ncols());
// NRMat<double> result(a.nrows(), a.ncols());
cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0], 2, result, 1);
#ifdef CUDALA
}else{
@@ -1242,11 +1246,13 @@ const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<dou
template<>
const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
{
NRMat<double> result(a.nrows(), a.ncols());
#ifdef CUDALA
if(location == cpu){
if(a.location == cpu){
#endif
NRMat<double> result(a.nrows(), a.ncols());
// NRMat<double> result(a.nrows(), a.ncols());
cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0]+1, 2, result, 1);
#ifdef CUDALA
}else{
@@ -1259,12 +1265,15 @@ const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<dou
template<>
const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &a)
{
NRMat <complex<double> > result(a.nrows(), a.ncols());
#ifdef CUDALA
if(location == cpu){
if(a.location == cpu){
#endif
NRMat <complex<double> > result(a.nrows(), a.ncols());
// NRMat <complex<double> > result(a.nrows(), a.ncols());
cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0], 2);
#ifdef CUDALA
}else{
@@ -1278,11 +1287,13 @@ const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &
template<>
const NRMat< complex<double> > imagmatrix<NRMat<double> > (const NRMat<double> &a)
{
NRMat< complex<double> > result(a.nrows(), a.ncols());
#ifdef CUDALA
if(location == cpu){
if(a.location == cpu){
#endif
NRMat< complex<double> > result(a.nrows(), a.ncols());
// NRMat< complex<double> > result(a.nrows(), a.ncols());
cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0]+1, 2);
#ifdef CUDALA
}else{
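realpart()/imagpart() (and the realmatrix()/imagmatrix() counterparts) rely on the fact that a complex<double> array is laid out as interleaved doubles (re, im, re, im, ...), so cblas_dcopy with stride 2 peels off one component, starting one double later for the imaginary part. The same layout assumption, spelled out without CBLAS (illustration only):

#include <complex>
#include <cassert>

// complex<double> is layout-compatible with double[2], so copying every second
// double extracts the real or the imaginary components.
int main(void)
{
  std::complex<double> a[3] = {std::complex<double>(1, 2),
                               std::complex<double>(3, 4),
                               std::complex<double>(5, 6)};
  const double *d = reinterpret_cast<const double *>(a);

  double re[3], im[3];
  for (int i = 0; i < 3; ++i) {
    re[i] = d[2 * i];     // what cblas_dcopy(n, d,   2, re, 1) does
    im[i] = d[2 * i + 1]; // what cblas_dcopy(n, d+1, 2, im, 1) does
  }
  assert(re[1] == 3 && im[2] == 6);
  return 0;
}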
@@ -1577,7 +1588,7 @@ void cholesky(NRMat<double> &a, bool upper)
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
FINT lda=a.ncols();
FINT n=a.nrows();
char uplo=upper?'U':'L';
char uplo= LAPACK_FORTRANCASE(upper?'u':'l');
FINT info;
a.copyonwrite();
FORNAME(dpotrf)(&uplo, &n, a, &lda, &info);
@@ -1596,7 +1607,7 @@ void cholesky(NRMat<complex<double> > &a, bool upper)
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
FINT lda=a.ncols();
FINT n=a.nrows();
char uplo=upper?'U':'L';
char uplo= LAPACK_FORTRANCASE(upper?'u':'l');
FINT info;
a.copyonwrite();
a.transposeme();//switch to Fortran order
@@ -1788,3 +1799,4 @@ if(nchange&1)//still adjust to get determinant=1
}
}//namespace

smat.cc

@@ -28,11 +28,8 @@
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
extern "C" {
extern ssize_t read(int, void *, size_t);
extern ssize_t write(int, const void *, size_t);
}
namespace LA {
@@ -130,7 +127,7 @@ NRSMat<T> & NRSMat<T>::operator=(const T &a) {
NOT_GPU(*this);
copyonwrite();
memset(v, 0, NN2*sizeof(T));
for(register int i=0; i<nn; i++) v[i*(i+1)/2 + i] = a;
for(register int i=0; i<nn; i++) v[(size_t)i*(i+1)/2 + i] = a;
return *this;
}
@@ -157,11 +154,11 @@ const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const
if(divide){
for(register int i=0; i<nn; i++){
const T a = v[i*(i+1)/2+i];
const T a = v[(size_t)i*(i+1)/2+i];
if(a != 0.) r[i] /= a;
}
}else{
for(register int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
for(register int i=0; i<nn; i++) r[i] = v[(size_t)i*(i+1)/2+i];
}
return divide?NULL:&r[0];
@@ -178,7 +175,7 @@ const NRSMat<T> NRSMat<T>::operator-() const {
NOT_GPU(*this);
NRSMat<T> result(nn, getlocation());
for(register int i = 0; i<NN2; i++) result.v[i]= -v[i];
for(register size_t i = 0; i<NN2; i++) result.v[i]= -v[i];
return result;
}
@@ -239,7 +236,7 @@ const T NRSMat<T>::trace() const {
NOT_GPU(*this);
T tmp = 0;
for(register int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
for(register int i=0; i<nn; i++) tmp += v[(size_t)i*(i+1)/2+i];
return tmp;
}
@@ -251,7 +248,7 @@ template<>
void NRSMat<double>::randomize(const double &x) {
NOT_GPU(*this);
for(int i=0; i<NN2; ++i){
for(size_t i=0; i<NN2; ++i){
v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
}
}
@@ -262,11 +259,11 @@ void NRSMat<double>::randomize(const double &x) {
******************************************************************************/
template<>
void NRSMat<complex<double> >::randomize(const double &x) {
for(register int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
for(register int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
for(register size_t i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
for(register size_t i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
for(register int i=0; i<nn; ++i){
for(register int j=0; j<=i; ++j){
if(i == j) v[i*(i+1)/2+j].imag() = 0; //hermitean
if(i == j) v[i*(size_t)(i+1)/2+j].imag() = 0; //hermitean
}
}
}

smat.h

@@ -25,7 +25,7 @@
#include "la_traits.h"
namespace LA {
#define NN2 (nn*(nn+1)/2)
#define NN2 ((size_t)nn*(nn+1)/2)
/***************************************************************************//**
@@ -134,15 +134,15 @@ public:
void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const {r.gemv(beta,*this,trans,alpha,x);};
void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
inline const T& operator[](const int ij) const;
inline T& operator[](const int ij);
inline const T& operator[](const size_t ij) const;
inline T& operator[](const size_t ij);
inline const T& operator()(const int i, const int j) const;
inline T& operator()(const int i, const int j);
inline int nrows() const;
inline int ncols() const;
inline int size() const;
inline size_t size() const;
inline bool transp(const int i, const int j) const {return i>j;} //this can be used for compact storage of matrices, which are actually not symmetric, but one triangle of them is redundant
const typename LA_traits<T>::normtype norm(const T scalar = (T)0) const;
@@ -155,9 +155,9 @@ public:
void get(int fd, bool dimensions = 1, bool transp = 0);
void put(int fd, bool dimensions = 1, bool transp = 0) const;
void copyonwrite();
void copyonwrite(bool detachonly=false);
void clear() {copyonwrite(); LA_traits<T>::clear(v,NN2);}; //zero out
void clear() {copyonwrite(true); LA_traits<T>::clear(v,NN2);}; //zero out
void resize(const int n);
void dealloc(void) {resize(0);}
@@ -212,7 +212,7 @@ inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1)) {
if(location == cpu){
#endif
v = new T[NN2];
if(a != (T)0) for(register int i = 0; i<NN2; i++) v[i] = a;
if(a != (T)0) for(register size_t i = 0; i<NN2; i++) v[i] = a;
else memset(v, 0, NN2*sizeof(T));
#ifdef CUDALA
@@ -338,7 +338,7 @@ inline NRSMat<T> & NRSMat<T>::operator*=(const T &a) {
NOT_GPU(*this);
copyonwrite();
for(register int i = 0; i<NN2; ++i) v[i] *= a;
for(register size_t i = 0; i<NN2; ++i) v[i] *= a;
return *this;
}
@@ -353,7 +353,7 @@ inline NRSMat<T> & NRSMat<T>::operator+=(const T &a) {
NOT_GPU(*this);
copyonwrite();
for(register int i = 0; i<nn; i++) v[i*(i+1)/2 + i] += a;
for(register int i = 0; i<nn; i++) v[i*(size_t)(i+1)/2 + i] += a;
return *this;
}
@@ -368,7 +368,7 @@ inline NRSMat<T> & NRSMat<T>::operator-=(const T &a) {
NOT_GPU(*this);
copyonwrite();
for(register int i = 0; i<nn; i++) v[i*(i+1)/2+i] -= a;
for(register int i = 0; i<nn; i++) v[i*(size_t)(i+1)/2+i] -= a;
return *this;
}
@@ -438,7 +438,7 @@ inline NRSMat<T>& NRSMat<T>::operator+=(const NRSMat<T>& rhs) {
SAME_LOC(*this, rhs);
copyonwrite();
for(register int i = 0; i<NN2; ++i) v[i] += rhs.v[i];
for(register size_t i = 0; i<NN2; ++i) v[i] += rhs.v[i];
return *this;
}
@@ -507,7 +507,7 @@ inline NRSMat<T>& NRSMat<T>::operator-=(const NRSMat<T>& rhs) {
NOT_GPU(*this);
copyonwrite();
for(register int i = 0; i<NN2; ++i) v[i] -= rhs.v[i];
for(register size_t i = 0; i<NN2; ++i) v[i] -= rhs.v[i];
return *this;
}
@@ -540,7 +540,7 @@ inline const NRMat<T> NRSMat<T>::operator-(const NRMat<T> &rhs) const {
* @return reference to the corresponding matrix element
******************************************************************************/
template <typename T>
inline T& NRSMat<T>::operator[](const int ij) {
inline T& NRSMat<T>::operator[](const size_t ij) {
#ifdef DEBUG
if(_LA_count_check && *count != 1) laerror("T& NRSMat<T>::operator[] used for matrix with count>1");
if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range");
@@ -560,7 +560,7 @@ inline T& NRSMat<T>::operator[](const int ij) {
* @return constant reference to the corresponding matrix element
******************************************************************************/
template <typename T>
inline const T & NRSMat<T>::operator[](const int ij) const {
inline const T & NRSMat<T>::operator[](const size_t ij) const {
#ifdef DEBUG
if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range");
if(!v) laerror("T& NRSMat<T>::operator[] used for unallocated NRSmat<T> object");
@@ -578,8 +578,8 @@ inline const T & NRSMat<T>::operator[](const int ij) const {
* @return cumulative index
******************************************************************************/
template<typename T>
inline T SMat_index(T i, T j) {
return (i>=j) ? i*(i+1)/2+j : j*(j+1)/2+i;
inline size_t SMat_index(T i, T j) {
return (i>=j) ? i*(size_t)(i+1)/2+j : j*(size_t)(j+1)/2+i;
}
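SMat_index and its variants return the offset into the packed triangular storage, where row i of the lower triangle starts at i*(i+1)/2; the new (size_t) cast (and size_t return type) make sure the product is formed in 64-bit arithmetic, since i*(i+1) already overflows a 32-bit int for i around 46341. A standalone version of the same arithmetic (illustration; the library keeps integer arguments and inserts the cast instead):

#include <cstddef>
#include <cassert>

// Packed lower-triangle index: row i starts at offset i*(i+1)/2.
static std::size_t smat_index(std::size_t i, std::size_t j) // assumes i >= j
{
  return i * (i + 1) / 2 + j;
}

int main(void)
{
  // Rows 0, 1, 2 of a packed symmetric matrix start at offsets 0, 1, 3.
  assert(smat_index(0, 0) == 0);
  assert(smat_index(1, 0) == 1 && smat_index(1, 1) == 2);
  assert(smat_index(2, 0) == 3);

  // With plain 32-bit int arithmetic, i*(i+1) would already have overflowed here;
  // in size_t the offset is what it should be.
  const std::size_t i = 100000;
  assert(smat_index(i, 0) == 5000050000ULL);
  return 0;
}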
/***************************************************************************//**
@@ -590,8 +590,8 @@ inline T SMat_index(T i, T j) {
* @return cumulative index
******************************************************************************/
template<typename T>
inline T SMat_index_igej(T i, T j) {
return i*(i+1)/2+j;
inline size_t SMat_index_igej(T i, T j) {
return i*(size_t)(i+1)/2+j;
}
/***************************************************************************//**
@@ -602,8 +602,8 @@ inline T SMat_index_igej(T i, T j) {
* @return cumulative index
******************************************************************************/
template<typename T>
inline T SMat_index_ilej(T i, T j) {
return j*(j+1)/2+i;
inline size_t SMat_index_ilej(T i, T j) {
return j*(size_t)(j+1)/2+i;
}
/***************************************************************************//**
@@ -614,8 +614,8 @@ inline T SMat_index_ilej(T i, T j) {
* @return cumulative index
******************************************************************************/
template<typename T>
inline T SMat_index_1(T i, T j) {
return (i>=j)? i*(i-1)/2+j-1 : j*(j-1)/2+i-1;
inline size_t SMat_index_1(T i, T j) {
return (i>=j)? i*(size_t)(i-1)/2+j-1 : j*(size_t)(j-1)/2+i-1;
}
/***************************************************************************//**
@@ -626,8 +626,8 @@ inline T SMat_index_1(T i, T j) {
* @return cumulative index
******************************************************************************/
template<typename T>
inline T SMat_index_1igej(T i, T j) {
return i*(i-1)/2+j-1;
inline size_t SMat_index_1igej(T i, T j) {
return i*(size_t)(i-1)/2+j-1;
}
/***************************************************************************//**
@@ -638,21 +638,21 @@ inline T SMat_index_1igej(T i, T j) {
* @return cumulative index
******************************************************************************/
template<typename T>
inline T SMat_index_1ilej(T i, T j) {
return j*(j-1)/2+i-1;
inline size_t SMat_index_1ilej(T i, T j) {
return j*(size_t)(j-1)/2+i-1;
}
//indexing for antisymmetric matrix (used by fourindex)
template<typename T>
inline T ASMat_index(T i, T j)
inline size_t ASMat_index(T i, T j)
{
if(i == j) return -1;
return (i>j) ? i*(i-1)/2+j : j*(j-1)/2+i;
return (i>j) ? i*(size_t)(i-1)/2+j : j*(size_t)(j-1)/2+i;
}
template<typename T>
inline T ASMat_index_1(T i, T j)
inline size_t ASMat_index_1(T i, T j)
{
if(i == j) return -1;
return (i>j)? (i-2)*(i-1)/2+j-1 : (j-2)*(j-1)/2+i-1;
@@ -715,7 +715,7 @@ inline int NRSMat<T>::ncols() const {
* @return number of elements of this symmetric matrix of generalt type <code>T</code>
******************************************************************************/
template <typename T>
inline int NRSMat<T>::size() const {
inline size_t NRSMat<T>::size() const {
return NN2;
}
@@ -758,7 +758,7 @@ inline const double NRSMat<double>::amin() const {
double val(0.0);
int index(-1);
ret = std::numeric_limits<double>::max();
for(register int i = 0; i < NN2; i++){
for(register size_t i = 0; i < NN2; i++){
val = std::abs(v[i]);
if(val < ret){ index = i; ret = val; }
}
@@ -812,7 +812,7 @@ inline const complex<double> NRSMat<complex<double> >::amin() const{
complex<double> z_val(0.0, 0.0);
min_val = std::numeric_limits<double>::max();
for(register int i = 0; i < NN2; i++){
for(register size_t i = 0; i < NN2; i++){
z_val = v[i];
val = std::abs(z_val.real()) + std::abs(z_val.imag());
if(val < min_val){ index = i; min_val = val; }
@@ -920,7 +920,7 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs) {
* @see NRSMat<T>::operator|=, NRSMat<T>::copyonwrite()
******************************************************************************/
template <typename T>
void NRSMat<T>::copyonwrite() {
void NRSMat<T>::copyonwrite(bool detachonly) {
if(!count) laerror("calling NRSMat<T>::copyonwrite() for undefined NRSMat<T> object");
if(*count > 1){
(*count)--;
@@ -931,12 +931,12 @@ void NRSMat<T>::copyonwrite() {
if(location == cpu) {
#endif
newv = new T[NN2];
memcpy(newv, v, NN2*sizeof(T));
if(!detachonly) memcpy(newv, v, NN2*sizeof(T));
#ifdef CUDALA
}else{
newv = (T *) gpualloc(NN2*sizeof(T));
if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRSMat<T>::copyonwrite()");
cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
if(!detachonly) cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
TEST_CUBLAS("cublasScopy");//"NRSMat<T>::copyonwrite()"
}
#endif


@@ -132,8 +132,8 @@ public:
void setunsymmetric();//unwind the matrix assuming it was indeed symmetric
inline bool issymmetric() const {return symmetric;}
unsigned int length() const;
void copyonwrite();
void clear() {copyonwrite();unsort();deletelist();}
void copyonwrite(bool detachonly=false);
void clear() {copyonwrite(true); if(count) {delete count; count=NULL;}}
void simplify();
const T trace() const;
const typename LA_traits<T>::normtype norm(const T scalar=(T)0) const; //is const only mathematically, not in internal implementation - we have to simplify first

t.cc

@@ -1799,6 +1799,11 @@ for(int i=0; i<m.nrows(); ++i) for(int j=0; j<m.nrows(); ++j)
if(i!=j) err += abs(m2(i,j));
cout <<"offdiagonality error = "<<err<<endl;
err=0;
for(int i=0; i<m.nrows(); ++i) err += abs(m2(i,i) - eivals[i]);
cout <<"eigenvalue error = "<<err<<endl;
//test as general matrix
NRVec<complex<double> > ww(m.nrows());
NRMat<complex<double> > vl(m.nrows(),m.nrows()), vr(m.nrows(),m.nrows());
@@ -1812,6 +1817,11 @@ for(int i=0; i<m.nrows(); ++i) for(int j=0; j<m.nrows(); ++j)
if(i!=j) err += abs(m3(i,j));
cout <<"offdiagonality error 2 = "<<err<<endl;
err=0;
for(int i=0; i<m.nrows(); ++i) err += abs(m3(i,i) - ww[i]);
cout <<"eigenvalue error = "<<err<<endl;
}

vec.cc

@@ -27,11 +27,8 @@
#include <errno.h>
#include "vec.h"
#include "qsort.h"
#include <unistd.h>
extern "C" {
extern ssize_t read(int, void *, size_t);
extern ssize_t write(int, const void *, size_t);
}
namespace LA {
@@ -541,7 +538,7 @@ template<>
void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
const char trans, const double alpha, const NRVec &x) {
#ifdef DEBUG
if((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
if((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
#endif
SAME_LOC3(*this, x, A);
copyonwrite();
@@ -549,10 +546,10 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
#ifdef CUDALA
if(location==cpu){
#endif
cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
cblas_dgemv(CblasRowMajor, (tolower(trans)=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
#ifdef CUDALA
}else{
cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
TEST_CUBLAS("cublasDgemv");
}
#endif
@@ -572,7 +569,7 @@ template<>
void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
const char trans, const double alpha, const NRVec<complex<double> > &x) {
#ifdef DEBUG
if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
#endif
SAME_LOC3(*this, x, A);
copyonwrite();
@@ -580,16 +577,16 @@ void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
#ifdef CUDALA
if(location==cpu){
#endif
cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
A.nrows(), A.ncols(), alpha, A, A.ncols(), (double *)x.v, 2, beta, (double *)v, 2);
cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
A.nrows(), A.ncols(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
#ifdef CUDALA
}else{
cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
TEST_CUBLAS("cublasDgemv");
cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
TEST_CUBLAS("cublasDgemv");
}
#endif
@@ -611,14 +608,14 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
const NRMat<complex<double> > &A, const char trans,
const complex<double> alpha, const NRVec<complex<double> > &x) {
#ifdef DEBUG
if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
#endif
SAME_LOC3(*this, x, A);
copyonwrite();
#ifdef CUDALA
if(location == cpu){
#endif
cblas_zgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
cblas_zgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
A.nrows(), A.ncols(), &alpha, A, A.ncols(), x.v, 1, &beta, v, 1);
#ifdef CUDALA
}else{
@@ -626,7 +623,7 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
const cuDoubleComplex _beta = make_cuDoubleComplex(beta.real(), beta.imag());
cublasZgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(),
cublasZgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(),
_alpha, (cuDoubleComplex*)(A[0]), A.ncols(), (cuDoubleComplex*)(x.v), 1, _beta, (cuDoubleComplex*)v, 1);
TEST_CUBLAS("cublasZgemv");
}
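The gemv routines now compare the trans character through tolower(), so 'n' and 'N' are both accepted as "no transpose", consistent with the case handling introduced in the BLAS/LAPACK wrappers. A trivial sketch of that normalization (illustration only):

#include <cctype>
#include <cassert>

// Case-insensitive test for "no transpose", as used in the gemv wrappers above.
inline bool no_transpose(char trans)
{
  return std::tolower(static_cast<unsigned char>(trans)) == 'n';
}

int main(void)
{
  assert(no_transpose('n') && no_transpose('N'));
  assert(!no_transpose('t') && !no_transpose('T'));
  return 0;
}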

vec.h

@@ -149,10 +149,10 @@ public:
#endif
//! create separate copy of the data corresponding to this vector
void copyonwrite();
void copyonwrite(bool detachonly=false);
//! purge this vector
void clear() { copyonwrite(); LA_traits<T>::clear(v, nn); };
void clear() { copyonwrite(true); LA_traits<T>::clear(v, nn); };
//! assignment operator assigns given vector
NRVec& operator=(const NRVec &rhs);
@@ -806,7 +806,7 @@ NRVec<T>::~NRVec() {
* make own copy of the underlying data connected with this vector
******************************************************************************/
template <typename T>
void NRVec<T>::copyonwrite() {
void NRVec<T>::copyonwrite(bool detachonly) {
if(!count) laerror("copyonwrite of an undefined vector");
if(*count > 1) {
(*count)--;
@@ -817,12 +817,12 @@ void NRVec<T>::copyonwrite() {
if(location == cpu){
#endif
newv = new T[nn];
memcpy(newv, v, nn*sizeof(T));
if(!detachonly) memcpy(newv, v, nn*sizeof(T));
#ifdef CUDALA
}else{
newv = (T *) gpualloc(nn*sizeof(T));
if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRVec<T>::copyonwrite()");
cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
if(!detachonly) cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
TEST_CUBLAS("cublasScopy");//"NRVec<T>::copyonwrite()"
}
#endif
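The detachonly flag added to copyonwrite() in vec.h and smat.h lets clear() detach from a shared buffer without first copying data that is about to be overwritten anyway. A toy model of the bookkeeping (illustration only, not the NRVec implementation; destructor omitted for brevity):

#include <cstddef>
#include <cstring>
#include <cassert>

struct ToyVec {
  double *v; int *count; std::size_t nn;

  explicit ToyVec(std::size_t n) : v(new double[n]), count(new int(1)), nn(n) {}
  ToyVec(const ToyVec &o) : v(o.v), count(o.count), nn(o.nn) { ++*count; }

  void copyonwrite(bool detachonly = false)
  {
    if (*count == 1) return;          // already the sole owner
    --*count;                         // detach from the shared buffer
    double *newv = new double[nn];
    if (!detachonly) std::memcpy(newv, v, nn * sizeof(double)); // skipped when data is doomed
    v = newv;
    count = new int(1);
  }
  void clear() { copyonwrite(true); std::memset(v, 0, nn * sizeof(double)); }
};

int main(void)
{
  ToyVec a(4);
  for (std::size_t i = 0; i < 4; ++i) a.v[i] = (double)(i + 1);
  ToyVec b(a);   // shares a's buffer
  b.clear();     // detaches without copying, then zeroes only its own buffer
  assert(a.v[2] == 3 && b.v[2] == 0 && a.v != b.v);
  return 0;
}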