*** empty log message ***
This commit is contained in:
parent
a9e30620f0
commit
80fe44fab2
31
ChangeLog
31
ChangeLog
@ -1,7 +1,33 @@
|
||||
04.11.2013 added conjugateme() for corder=1 in complex diagonalize() and
|
||||
gdiagonalize() to get correct eigenvector convention for C-storage
|
||||
30.10.2013 macros for consistent lowercase and uppercase of character
|
||||
parameters for case sensitive BLAS and LAPACK; case-insesitivness of these in vec.cc and mat.cc (cublas still not treated)
|
||||
29.10.2013 included la_traits.h in nonclass.c to get correct extern "C" of cblas
|
||||
14.10.2013 added operator% to bivector
|
||||
01.06.2012 more efficient clear() by detachonly parameter to copyonwrite()
|
||||
14.03.2012 fixed overflow in product of dimensions of NRMat and NRSMat
|
||||
13.03.2012 symmetry of integrals with different spins added to fourindex.h
|
||||
23.02.2012 included unistd.h
|
||||
23.02.2012 fixed max read/write size in multiget and multiput to 1GB
|
||||
24.01.2012 Improved DIIS (L.V.)
|
||||
19.01.2012 Fixed location for gpu in nonclass.cc
|
||||
02.06.2011 In oplus() =0 replaced by clear() to work on non-square matrices (J.P.)
|
||||
01.02.2011 Added trace2 for complex matrices by L. Veis
|
||||
18.01.2011 Minor bugfixes and compatibility with Intel C++ compiler by Roman Curik
|
||||
28.12.2010 Generalized diagonalization and functions of general complex matrices
|
||||
21.12.2010 Changed to size_t in matrix put,get to prevent overflow
|
||||
08.12.2010 Deallocate method added to LA_traits, used for memory saving in matrix exp
|
||||
24.11.2010 Added checking for integer overflow in exptimes
|
||||
23.10.2010 Fixed dependency on atlas for static libraries in configure.ac
|
||||
27.09.2010 Seed of CSRMat class added.
|
||||
22.09.2010 Allowed formal operator[] on gpu matrices
|
||||
22.09.2010 Added submatrix() to SparseSMat
|
||||
22.09.2010 Zgerc and zgeru implemented in non-cblas version
|
||||
21.09.2010 Fixed a bug in laerror macro causing sometimes compilation problems
|
||||
08.09.2010 RELEASE 0.6
|
||||
08.09.2010 Doxygen documentation by M. Sulc
|
||||
08.09.2010 Doxygen documentation for NRVec, NRMat, NRSMat classes contributed by M. Sulc
|
||||
08.09.2010 Extended CUBLAS support for NRVec, NRMat, NRSMat classes contributed by M. Sulc
|
||||
08.09.2010 Minor bugfixes contributed by M. Sulc
|
||||
08.09.2010 Minor bugfixes and improvements contributed by M. Sulc
|
||||
25.06.2010 Added proof-of-concept CUBLAS support for NRVec, NRMat, NRSMat
|
||||
24.06.2010 Fixed a memory leak existing when MATPTR was defined
|
||||
18.06.2010 added autoconf support for BLAS+LAPACK compiled with 64-bit integers and for CUBLAS
|
||||
@ -60,3 +86,4 @@ xx.08.2008 fixed wrong permutation symmetry in previously unused (and untested)
|
||||
11.03.2008 added cblas_idamax replacement for non-cblas
|
||||
05.03.2008 fixed transposed bug in inverse() with non-cblas
|
||||
26.02.2008 INITIAL RELEASE 0.1
|
||||
|
||||
|
25
bitvector.cc
25
bitvector.cc
@ -130,7 +130,7 @@ return *this;
|
||||
|
||||
/*number of ones in a binary number, from "Hacker's delight" book*/
|
||||
#ifdef LONG_IS_32
|
||||
static unsigned long word_popul(unsigned long x)
|
||||
static unsigned int word_popul(unsigned long x)
|
||||
{
|
||||
x -= ((x>>1)&0x55555555);
|
||||
x = (x&0x33333333) + ((x>>2)&0x33333333);
|
||||
@ -140,9 +140,10 @@ x+= (x>>16);
|
||||
return x&0x3f;
|
||||
}
|
||||
#else
|
||||
static unsigned long word_popul(unsigned long x)
|
||||
//@@@@ use an efficient trick
|
||||
static unsigned int word_popul(unsigned long x)
|
||||
{
|
||||
unsigned long s=0;
|
||||
unsigned int s=0;
|
||||
for(int i=0; i<64; ++i)
|
||||
{
|
||||
if(x&1) ++s;
|
||||
@ -156,7 +157,6 @@ return s;
|
||||
|
||||
unsigned int bitvector::population(const unsigned int before) const
|
||||
{
|
||||
//@@@before
|
||||
int i;
|
||||
unsigned int s=0;
|
||||
for(i=0; i<nn-1; ++i) s+=word_popul(v[i]);
|
||||
@ -170,4 +170,21 @@ if(modulo)
|
||||
return s+word_popul(a);
|
||||
}
|
||||
|
||||
unsigned int bitvector::operator%(const bitvector &y) const
|
||||
{
|
||||
if(nn!=y.nn) laerror("incompatible size in bitdifference");
|
||||
|
||||
unsigned int s=0;
|
||||
for(int i=0; i<nn-1; ++i) s+=word_popul(v[i]^y.v[i]);
|
||||
bitvector_block a=v[nn-1]^y.v[nn-1];
|
||||
if(modulo)
|
||||
{
|
||||
bitvector_block mask= ~((bitvector_block)0);
|
||||
mask <<=modulo;
|
||||
a &= ~mask;
|
||||
}
|
||||
return s+word_popul(a);
|
||||
}
|
||||
|
||||
|
||||
}//namespace
|
||||
|
@ -55,7 +55,7 @@ public:
|
||||
void reset(const unsigned int i) {v[i/blockbits] &= ~(1<<(i%blockbits));};
|
||||
const bool get(const unsigned int i) {return (v[i/blockbits] >>(i%blockbits))&1;};
|
||||
const bool assign(const unsigned int i, const bool r) {if(r) set(i); else reset(i); return r;};
|
||||
void clear() {copyonwrite(); memset(v,0,nn*sizeof(bitvector_block));};
|
||||
void clear() {copyonwrite(true); memset(v,0,nn*sizeof(bitvector_block));};
|
||||
void fill() {memset(v,0xff,nn*sizeof(bitvector_block));};
|
||||
bool operator!=(const bitvector &rhs) const;
|
||||
bool operator==(const bitvector &rhs) const {return !(*this != rhs);};
|
||||
@ -70,13 +70,13 @@ public:
|
||||
bitvector operator&(const bitvector &rhs) const {return bitvector(*this) &= rhs;};
|
||||
bitvector operator|(const bitvector &rhs) const {return bitvector(*this) |= rhs;};
|
||||
bitvector operator^(const bitvector &rhs) const {return bitvector(*this) ^= rhs;};
|
||||
unsigned int population(const unsigned int before=0) const; //@@@number of 1's
|
||||
unsigned int operator%(const bitvector &y) const; //number of differing bits
|
||||
unsigned int population(const unsigned int before=0) const; //number of 1's
|
||||
//extended, truncated const i.e. not on *this but return new entity, take care of modulo's bits
|
||||
//logical shifts <<= >>= << >> not implemented yet
|
||||
//logical rotations not implemented yet
|
||||
};
|
||||
|
||||
|
||||
extern std::ostream & operator<<(std::ostream &s, const bitvector &x);
|
||||
extern std::istream & operator>>(std::istream &s, bitvector &x);
|
||||
|
||||
|
@ -52,8 +52,9 @@ INSTANTIZE(complex<double>)
|
||||
*/
|
||||
|
||||
//// forced instantization of functions in the header in the corresponding object file
|
||||
template class CSRMat<double>;
|
||||
template class CSRMat<complex<double> >;
|
||||
//@@@@@template class CSRMat<double>;
|
||||
//@@@@template class CSRMat<complex<double> >;
|
||||
//@@@@ unfinished class commented out
|
||||
|
||||
|
||||
|
||||
|
@ -11,3 +11,6 @@
|
||||
#undef FORINT
|
||||
#define FINT int
|
||||
#endif
|
||||
|
||||
#define BLAS_FORTRANCASE(x) toupper(x)
|
||||
#define LAPACK_FORTRANCASE(x) toupper(x)
|
||||
|
109
fourindex.h
109
fourindex.h
@ -104,10 +104,10 @@ __attribute__((packed))
|
||||
|
||||
|
||||
//later add symmetry of complex integrals
|
||||
typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5} fourindexsymtype; //only permutation-nonequivalent elements are stored
|
||||
typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5, twoelectronrealmullikanAB=6 } fourindexsymtype; //only permutation-nonequivalent elements are stored
|
||||
// these should actually be static private members of the fourindex class, but leads to an ICE on gcc3.2
|
||||
static const int fourindex_n_symmetrytypes=6;
|
||||
static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8};
|
||||
static const int fourindex_n_symmetrytypes=7;
|
||||
static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8,4};
|
||||
static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]=
|
||||
{
|
||||
{{0,1,2,3,1}},
|
||||
@ -116,6 +116,7 @@ static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]=
|
||||
{{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1}},
|
||||
{{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1},{2,3,0,1,1},{3,2,0,1,-1},{2,3,1,0,-1},{3,2,1,0,1}},
|
||||
{{0,1,2,3,1}}, //T2IjAb_aces is like nosymmetry but different index ranges
|
||||
{{0,1,2,3,1},{1,0,2,3,1},{0,1,3,2,1},{1,0,3,2,1}},
|
||||
};
|
||||
|
||||
|
||||
@ -669,6 +670,108 @@ typedef typename LA_traits<T>::normtype normtype;
|
||||
|
||||
//make it as a derived class in order to be able to use it in a base class context - "supermatrix" operations
|
||||
template<class T, class I>
|
||||
class fourindex_dense<twoelectronrealmullikanAB,T,I> : public NRMat<T> {
|
||||
public:
|
||||
fourindex_dense(): NRMat<T>() {};
|
||||
explicit fourindex_dense(const int n): NRMat<T>(n*(n+1)/2,n*(n+1)/2) {};
|
||||
fourindex_dense(const NRMat<T> &rhs): NRMat<T>(rhs) {}; //be able to convert the parent class transparently to this
|
||||
fourindex_dense(const T &a, const int n): NRMat<T>(a,n*(n+1)/2,n*(n+1)/2) {};
|
||||
fourindex_dense(const T *a, const int n): NRMat<T>(a,n*(n+1)/2,n*(n+1)/2) {};
|
||||
//and also construct it from sparse and externally stored fourindex classes
|
||||
//it seems not possible to nest template<class I> just for the two constructors
|
||||
fourindex_dense(const fourindex<I,T> &rhs);
|
||||
fourindex_dense(const fourindex_ext<I,T> &rhs);
|
||||
|
||||
T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l);
|
||||
const T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const;
|
||||
void resize(const int n) {(*this).NRMat<T>::resize(n*(n+1)/2,n*(n+1)/2);};
|
||||
void putext(int f, T thr=1e-15);
|
||||
int nbas() const {return (int)std::sqrt(2*(*this).nrows());};
|
||||
|
||||
};
|
||||
|
||||
template<class T, class I>
|
||||
void fourindex_dense<twoelectronrealmullikanAB,T,I>::putext(int f, T thr)
|
||||
{
|
||||
T y;
|
||||
for(int i=1; i<=nbas(); ++i) for(int j=1; j<=i; ++j)
|
||||
for(int k=1; k<=nbas(); ++k) for(int l=1; l<=k; ++l)
|
||||
if((y=abs((*this)(i,j,k,l))) > thr)
|
||||
{
|
||||
matel4stored<I,T> x;
|
||||
x.elem= y;
|
||||
x.index.indiv.i=i;
|
||||
x.index.indiv.j=j;
|
||||
x.index.indiv.k=k;
|
||||
x.index.indiv.l=l;
|
||||
if(sizeof(matel4stored<I,T>) != write(f,&x,sizeof(matel4stored<I,T>)) )
|
||||
laerror("write error in putext");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class T, class I>
|
||||
fourindex_dense<twoelectronrealmullikanAB,T,I>::fourindex_dense(const fourindex<I,T> &rhs) : NRMat<T>((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2)
|
||||
{
|
||||
if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch");
|
||||
typename fourindex<I,T>::iterator p;
|
||||
#ifdef DEBUG
|
||||
unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j);
|
||||
unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l);
|
||||
if (IJ<0 || IJ>=(unsigned int)NRMat<T>::nn || KL<0 || KL>=(unsigned int)NRMat<T>::mm) laerror("fourindex_dense index out of range in constructor");
|
||||
#endif
|
||||
for(p=rhs.begin(); p!= rhs.end(); ++p) (*this)(p->index.indiv.i,p->index.indiv.j,p->index.indiv.k,p->index.indiv.l) = p->elem;
|
||||
}
|
||||
|
||||
template<class T, class I>
|
||||
fourindex_dense<twoelectronrealmullikanAB,T,I>::fourindex_dense(const fourindex_ext<I,T> &rhs) : NRMat<T>((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2)
|
||||
{
|
||||
if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch");
|
||||
typename fourindex_ext<I,T>::iterator p;
|
||||
for(p=rhs.begin(); p!= rhs.end(); ++p)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j);
|
||||
unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l);
|
||||
if (IJ<0 || IJ>=(unsigned int)NRMat<T>::nn || KL<0 || KL>=(unsigned int)NRMat<T>::mm) laerror("fourindex_dense index out of range in constructor");
|
||||
#endif
|
||||
(*this)(p->index.indiv.i,p->index.indiv.j ,p->index.indiv.k,p->index.indiv.l) = p->elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class T, class DUMMY>
|
||||
T& fourindex_dense<twoelectronrealmullikanAB,T,DUMMY>::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l)
|
||||
{
|
||||
int I = SMat_index_1(i,j);
|
||||
int J = SMat_index_1(k,l);
|
||||
//I,J act as indices of a NRmat
|
||||
#ifdef DEBUG
|
||||
if (*NRMat<T>::count != 1) laerror("lval (i,j,k,l) with count > 1 in fourindex_dense");
|
||||
if (I<0 || I>=NRMat<T>::nn || J<0 || J>=NRMat<T>::mm) laerror("fourindex_dense index out of range");
|
||||
if (!NRMat<T>::v) laerror("access to unallocated fourindex_dense");
|
||||
#endif
|
||||
return NRMat<T>::operator()(I,J);
|
||||
}
|
||||
|
||||
|
||||
template<class T, class DUMMY>
|
||||
const T& fourindex_dense<twoelectronrealmullikanAB,T,DUMMY>::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const
|
||||
{
|
||||
int I = SMat_index_1(i,j);
|
||||
int J = SMat_index_1(k,l);
|
||||
//I,J act as indices of a NRSmat
|
||||
#ifdef DEBUG
|
||||
if (I<0 || I>=NRMat<T>::nn || J<0 || J>=NRMat<T>::mm) laerror("fourindex_dense index out of range");
|
||||
if (!NRMat<T>::v) laerror("access to unallocated fourindex_dense");
|
||||
#endif
|
||||
return NRMat<T>::operator()(I,J);
|
||||
}
|
||||
|
||||
|
||||
////////////////////
|
||||
template<class T, class I>
|
||||
class fourindex_dense<twoelectronrealmullikan,T,I> : public NRSMat<T> {
|
||||
public:
|
||||
fourindex_dense(): NRSMat<T>() {};
|
||||
|
47
la_traits.h
47
la_traits.h
@ -35,6 +35,7 @@
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <complex>
|
||||
#include <unistd.h>
|
||||
|
||||
|
||||
//using namespace std;
|
||||
@ -56,7 +57,7 @@ extern "C" {
|
||||
#include "noncblas.h"
|
||||
#else
|
||||
extern "C" {
|
||||
#include "clapack.h"
|
||||
#include "atlas/clapack.h"
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -213,7 +214,7 @@ typedef C normtype;
|
||||
typedef C realtype;
|
||||
typedef complex<C> complextype;
|
||||
static inline C sqrabs(const complex<C> x) { return x.real()*x.real()+x.imag()*x.imag();}
|
||||
static inline bool gencmp(const complex<C> *x, const complex<C> *y, int n) {return memcmp(x,y,n*sizeof(complex<C>));}
|
||||
static inline bool gencmp(const complex<C> *x, const complex<C> *y, size_t n) {return memcmp(x,y,n*sizeof(complex<C>));}
|
||||
static bool bigger(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
|
||||
static bool smaller(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
|
||||
static inline normtype norm (const complex<C> &x) {return std::abs(x);}
|
||||
@ -225,9 +226,10 @@ static void multiget(size_t n,int fd, complex<C> *x, bool dimensions=0)
|
||||
size_t total=0;
|
||||
size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and read at most 1GB at once
|
||||
ssize_t r;
|
||||
size_t nn;
|
||||
do{
|
||||
r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
|
||||
if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
|
||||
r=read(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
|
||||
if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
|
||||
else total += r/sizeof(complex<C>);
|
||||
if(r%sizeof(complex<C>)) laerror("read error 2");
|
||||
}
|
||||
@ -238,16 +240,17 @@ static void multiput(size_t n, int fd, const complex<C> *x, bool dimensions=0)
|
||||
size_t total=0;
|
||||
size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and write at most 1GB at once
|
||||
ssize_t r;
|
||||
size_t nn;
|
||||
do{
|
||||
r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
|
||||
if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");}
|
||||
r=write(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>));
|
||||
if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");}
|
||||
else total += r/sizeof(complex<C>);
|
||||
if(r%sizeof(complex<C>)) laerror("write error 2");
|
||||
}
|
||||
while(total < n);
|
||||
}
|
||||
static void copy(complex<C> *dest, complex<C> *src, unsigned int n) {memcpy(dest,src,n*sizeof(complex<C>));}
|
||||
static void clear(complex<C> *dest, unsigned int n) {memset(dest,0,n*sizeof(complex<C>));}
|
||||
static void copy(complex<C> *dest, complex<C> *src, size_t n) {memcpy(dest,src,n*sizeof(complex<C>));}
|
||||
static void clear(complex<C> *dest, size_t n) {memset(dest,0,n*sizeof(complex<C>));}
|
||||
static void copyonwrite(complex<C> &x) {};
|
||||
static void clearme(complex<C> &x) {x=0;};
|
||||
static void deallocate(complex<C> &x) {};
|
||||
@ -266,7 +269,7 @@ typedef C normtype;
|
||||
typedef C realtype;
|
||||
typedef complex<C> complextype;
|
||||
static inline C sqrabs(const C x) { return x*x;}
|
||||
static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));}
|
||||
static inline bool gencmp(const C *x, const C *y, size_t n) {return memcmp(x,y,n*sizeof(C));}
|
||||
static inline bool bigger(const C &x, const C &y) {return x>y;}
|
||||
static inline bool smaller(const C &x, const C &y) {return x<y;}
|
||||
static inline normtype norm (const C &x) {return std::abs(x);}
|
||||
@ -278,9 +281,10 @@ static void multiget(size_t n,int fd, C *x, bool dimensions=0)
|
||||
size_t total=0;
|
||||
size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and read at most 1GB at once
|
||||
ssize_t r;
|
||||
size_t nn;
|
||||
do{
|
||||
r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(C));
|
||||
if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");}
|
||||
r=read(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(C));
|
||||
if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
|
||||
else total += r/sizeof(C);
|
||||
if(r%sizeof(C)) laerror("read error 2");
|
||||
}
|
||||
@ -291,16 +295,17 @@ static void multiput(size_t n, int fd, const C *x, bool dimensions=0)
|
||||
size_t total=0;
|
||||
size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and write at most 1GB at once
|
||||
ssize_t r;
|
||||
size_t nn;
|
||||
do{
|
||||
r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(C));
|
||||
if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");}
|
||||
r=write(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(C));
|
||||
if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");}
|
||||
else total += r/sizeof(C);
|
||||
if(r%sizeof(C)) laerror("write error 2");
|
||||
}
|
||||
while(total < n);
|
||||
}
|
||||
static void copy(C *dest, C *src, unsigned int n) {memcpy(dest,src,n*sizeof(C));}
|
||||
static void clear(C *dest, unsigned int n) {memset(dest,0,n*sizeof(C));}
|
||||
static void copy(C *dest, C *src, size_t n) {memcpy(dest,src,n*sizeof(C));}
|
||||
static void clear(C *dest, size_t n) {memset(dest,0,n*sizeof(C));}
|
||||
static void copyonwrite(C &x) {};
|
||||
static void clearme(C &x) {x=0;};
|
||||
static void deallocate(C &x) {};
|
||||
@ -323,7 +328,7 @@ typedef X<C> producttype; \
|
||||
typedef typename LA_traits<C>::normtype normtype; \
|
||||
typedef X<typename LA_traits<C>::realtype> realtype; \
|
||||
typedef X<typename LA_traits<C>::complextype> complextype; \
|
||||
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
|
||||
static bool gencmp(const C *x, const C *y, size_t n) {for(size_t i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
|
||||
static inline bool bigger(const C &x, const C &y) {return x>y;} \
|
||||
static inline bool smaller(const C &x, const C &y) {return x<y;} \
|
||||
static inline normtype norm (const X<C> &x) {return x.norm();} \
|
||||
@ -332,8 +337,8 @@ static void put(int fd, const X<C> &x, bool dimensions=1, bool transp=0) {x.put(
|
||||
static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions,transp);} \
|
||||
static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);} \
|
||||
static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);} \
|
||||
static void copy(C *dest, C *src, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i]=src[i];} \
|
||||
static void clear(C *dest, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i].clear();}\
|
||||
static void copy(C *dest, C *src, size_t n) {for(size_t i=0; i<n; ++i) dest[i]=src[i];} \
|
||||
static void clear(C *dest, size_t n) {for(size_t i=0; i<n; ++i) dest[i].clear();}\
|
||||
static void copyonwrite(X<C> &x) {x.copyonwrite();}\
|
||||
static void clearme(X<C> &x) {x.clear();}\
|
||||
static void deallocate(X<C> &x) {x.dealloc();}\
|
||||
@ -359,7 +364,7 @@ typedef NRMat<C> producttype; \
|
||||
typedef typename LA_traits<C>::normtype normtype; \
|
||||
typedef X<typename LA_traits<C>::realtype> realtype; \
|
||||
typedef X<typename LA_traits<C>::complextype> complextype; \
|
||||
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
|
||||
static bool gencmp(const C *x, const C *y, size_t n) {for(size_t i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
|
||||
static inline bool bigger(const C &x, const C &y) {return x>y;} \
|
||||
static inline bool smaller(const C &x, const C &y) {return x<y;} \
|
||||
static inline normtype norm (const X<C> &x) {return x.norm();} \
|
||||
@ -368,8 +373,8 @@ static void put(int fd, const X<C> &x, bool dimensions=1, bool transp=0) {x.put(
|
||||
static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions);} \
|
||||
static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);} \
|
||||
static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);} \
|
||||
static void copy(C *dest, C *src, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i]=src[i];} \
|
||||
static void clear(C *dest, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i].clear();} \
|
||||
static void copy(C *dest, C *src, size_t n) {for(size_t i=0; i<n; ++i) dest[i]=src[i];} \
|
||||
static void clear(C *dest, size_t n) {for(size_t i=0; i<n; ++i) dest[i].clear();} \
|
||||
static void copyonwrite(X<C> &x) {x.copyonwrite();} \
|
||||
static void clearme(X<C> &x) {x.clear();} \
|
||||
static void deallocate(X<C> &x) {x.dealloc();} \
|
||||
|
22
laerror.cc
22
laerror.cc
@ -87,7 +87,26 @@ extern "C" void ATL_xerbla(int p, char *rout, char *form, ...){
|
||||
laerror(msg0);
|
||||
}
|
||||
|
||||
int cblas_errprn(int ierr, int info, char *form, ...) {
|
||||
#ifndef NONCBLAS
|
||||
#include "cblas.h"
|
||||
#include <stdarg.h>
|
||||
extern "C" void cblas_xerbla(int p, const char *rout, const char *form, ...)
|
||||
{
|
||||
va_list argptr;
|
||||
|
||||
va_start(argptr, form);
|
||||
if (p)
|
||||
{
|
||||
fprintf(stdout, "Parameter %d to routine %s was incorrect\n", p, rout);
|
||||
fprintf(stderr, "Parameter %d to routine %s was incorrect\n", p, rout);
|
||||
}
|
||||
vfprintf(stdout, form, argptr);
|
||||
vfprintf(stderr, form, argptr);
|
||||
va_end(argptr);
|
||||
laerror("terminating in cblas_xerbla");
|
||||
}
|
||||
|
||||
extern "C" int cblas_errprn(int ierr, int info, char *form, ...) {
|
||||
char msg0[1024], *msg;
|
||||
va_list argptr;
|
||||
va_start(argptr, form);
|
||||
@ -98,5 +117,6 @@ int cblas_errprn(int ierr, int info, char *form, ...) {
|
||||
laerror(msg0);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
}//namespace
|
||||
|
179
mat.cc
179
mat.cc
@ -26,11 +26,8 @@
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
|
||||
extern "C" {
|
||||
extern ssize_t read(int, void *, size_t);
|
||||
extern ssize_t write(int, const void *, size_t);
|
||||
}
|
||||
|
||||
namespace LA {
|
||||
|
||||
@ -77,14 +74,14 @@ const NRMat<T> NRMat<T>::otimes(const NRMat<T> &rhs, bool reversecolumns) const
|
||||
{
|
||||
T c = (*this)(i,j);
|
||||
for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++)
|
||||
r( i*rhs.nn + k, l*mm + j ) = c*rhs(k,l);
|
||||
r( i*(size_t)rhs.nn + k, l*mm + j ) = c*rhs(k,l);
|
||||
}
|
||||
}else{
|
||||
for(i=0;i<nn;i++) for(j=0;j<mm;j++)
|
||||
{
|
||||
T c=(*this)(i,j);
|
||||
for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++)
|
||||
r( i*rhs.nn+k, j*rhs.mm+l ) = c *rhs(k,l);
|
||||
r( i*(size_t)rhs.nn+k, j*(size_t)rhs.mm+l ) = c *rhs(k,l);
|
||||
}
|
||||
}
|
||||
|
||||
@ -108,7 +105,7 @@ const NRVec<T> NRMat<T>::row(const int i, int l) const {
|
||||
#ifdef MATPTR
|
||||
v[i]
|
||||
#else
|
||||
v + i*l
|
||||
v + i*(size_t)l
|
||||
#endif
|
||||
, l);
|
||||
return r;
|
||||
@ -144,7 +141,7 @@ void NRMat<T>::put(int fd, bool dim, bool transp) const {
|
||||
#ifdef MATPTR
|
||||
v[i][j]
|
||||
#else
|
||||
v[i*mm+j]
|
||||
v[i*(size_t)mm+j]
|
||||
#endif
|
||||
,dim ,transp);
|
||||
}
|
||||
@ -196,7 +193,7 @@ void NRMat<T>::get(int fd, bool dim, bool transp){
|
||||
#ifdef MATPTR
|
||||
v[i][j]
|
||||
#else
|
||||
v[i*mm+j]
|
||||
v[i*(size_t)mm+j]
|
||||
#endif
|
||||
,dim,transp);
|
||||
}
|
||||
@ -476,13 +473,13 @@ NRMat<T> & NRMat<T>::operator-=(const T &a) {
|
||||
******************************************************************************/
|
||||
template <>
|
||||
const NRMat<double> NRMat<double>::operator-() const {
|
||||
const int nm = nn*mm;
|
||||
const size_t nm = (size_t)nn*mm;
|
||||
NRMat<double> result(nn, mm, getlocation());
|
||||
#ifdef CUDALA
|
||||
if(location == cpu) {
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
for(register int i=0; i<nm; i++) result.v[0][i] = -v[0][i];
|
||||
for(register size_t i=0; i<nm; i++) result.v[0][i] = -v[0][i];
|
||||
#else
|
||||
memcpy(result.v, v, nm*sizeof(double));
|
||||
cblas_dscal(nm, -1., result.v, 1);
|
||||
@ -506,13 +503,13 @@ const NRMat<double> NRMat<double>::operator-() const {
|
||||
******************************************************************************/
|
||||
template <>
|
||||
const NRMat<complex<double> > NRMat<complex<double> >::operator-() const {
|
||||
const int nm = nn*mm;
|
||||
const size_t nm = (size_t)nn*mm;
|
||||
NRMat<complex<double> > result(nn, mm, getlocation());
|
||||
#ifdef CUDALA
|
||||
if(location == cpu) {
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
for(register int i=0; i<nm; i++) result.v[0][i]= -v[0][i];
|
||||
for(register size_t i=0; i<nm; i++) result.v[0][i]= -v[0][i];
|
||||
#else
|
||||
memcpy(result.v, v, nm*sizeof(complex<double>));
|
||||
cblas_zscal(nm, &CMONE, result.v, 1);
|
||||
@ -539,9 +536,9 @@ const NRMat<T> NRMat<T>::operator-() const {
|
||||
|
||||
NRMat<T> result(nn, mm, getlocation());
|
||||
#ifdef MATPTR
|
||||
for(register int i=0; i<nn*mm; i++) result.v[0][i] = -v[0][i];
|
||||
for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[0][i] = -v[0][i];
|
||||
#else
|
||||
for(register int i=0; i<nn*mm; i++) result.v[i] = -v[i];
|
||||
for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[i] = -v[i];
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
@ -562,11 +559,11 @@ const NRMat<T> NRMat<T>::operator&(const NRMat<T> &b) const {
|
||||
if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem");
|
||||
|
||||
for(register int i=0; i<nn; i++){
|
||||
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(v + i*mm), 1, (float*)(result.v + i*(mm + b.mm)), 1);
|
||||
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(v + i*(size_t)mm), 1, (float*)(result.v + i*(size_t)(mm + b.mm)), 1);
|
||||
TEST_CUBLAS("cublasScopy");
|
||||
}
|
||||
for(register int i=0; i<b.nn; i++){
|
||||
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(b.v + i*b.mm), 1, (float*)(result.v + (nn + i)*(mm + b.mm)), 1);
|
||||
cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(b.v + i*(size_t)b.mm), 1, (float*)(result.v + (nn + i)*(mm + b.mm)), 1);
|
||||
TEST_CUBLAS("cublasScopy");
|
||||
}
|
||||
}
|
||||
@ -582,7 +579,7 @@ const NRMat<T> NRMat<T>::operator|(const NRMat<T> &b) const {
|
||||
for (int j=0; j<mm; j++)
|
||||
for (int k=0; k<b.nn; k++)
|
||||
for (int l=0; l<b.mm; l++)
|
||||
result[i*b.nn+k][j*b.mm+l] = (*this)[i][j]*b[k][l];
|
||||
result[i*(size_t)b.nn+k][j*(size_t)b.mm+l] = (*this)[i][j]*b[k][l];
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -689,7 +686,7 @@ const NRVec<double> NRMat<double>::rsum() const {
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int i=0;i<nn;i++){
|
||||
cublasDaxpy(mm, 1.0, v + i*mm, 1, result.v, 1);
|
||||
cublasDaxpy(mm, 1.0, v + i*(size_t)mm, 1, result.v, 1);
|
||||
TEST_CUBLAS("cublasDaxpy");
|
||||
}
|
||||
}
|
||||
@ -714,7 +711,7 @@ const NRVec<complex<double> > NRMat<complex<double> >::rsum() const {
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int i=0;i<nn;i++){
|
||||
cublasZaxpy(mm, CUONE, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(result.v), 1);
|
||||
cublasZaxpy(mm, CUONE, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(result.v), 1);
|
||||
TEST_CUBLAS("cublasZaxpy");
|
||||
}
|
||||
}
|
||||
@ -748,14 +745,14 @@ const NRMat<T> NRMat<T>::submatrix(const int fromrow, const int torow, const int
|
||||
#ifdef MATPTR
|
||||
memcpy(r.v[i - fromrow], v[i] + fromcol, m*sizeof(T));
|
||||
#else
|
||||
memcpy(r.v+(i - fromrow)*m, v + i*mm + fromcol, m*sizeof(T));
|
||||
memcpy(r.v+(i - fromrow)*m, v + i*(size_t)mm + fromcol, m*sizeof(T));
|
||||
#endif
|
||||
}
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
|
||||
for(register int i=fromrow; i<=torow; ++i){
|
||||
cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1);
|
||||
cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*(size_t)mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1);
|
||||
TEST_CUBLAS("cublasScopy");
|
||||
}
|
||||
}
|
||||
@ -786,13 +783,13 @@ void NRMat<T>::storesubmatrix(const int fromrow, const int fromcol, const NRMat
|
||||
#ifdef MATPTR
|
||||
memcpy(v[i] + fromcol, rhs.v[i - fromrow], m*sizeof(T));
|
||||
#else
|
||||
memcpy(v + i*mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T));
|
||||
memcpy(v + i*(size_t)mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T));
|
||||
#endif
|
||||
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
|
||||
cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*mm + fromcol), 1);
|
||||
cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*(size_t)mm + fromcol), 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -821,8 +818,8 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
|
||||
v[j][i] = tmp;
|
||||
#else
|
||||
register int a, b;
|
||||
a = i*mm + j;
|
||||
b = j*mm + i;
|
||||
a = i*(size_t)mm + j;
|
||||
b = j*(size_t)mm + i;
|
||||
T tmp = v[a];
|
||||
v[a] = v[b];
|
||||
v[b] = tmp;
|
||||
@ -847,7 +844,7 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
|
||||
******************************************************************************/
|
||||
template<>
|
||||
NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
|
||||
const int nn_mm = nn*mm;
|
||||
const size_t nn_mm = (size_t)nn*mm;
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
@ -888,7 +885,7 @@ NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.
|
||||
******************************************************************************/
|
||||
template<>
|
||||
NRMat<double>::NRMat(const NRMat<complex<double> > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
|
||||
const int nn_mm = nn*mm;
|
||||
const size_t nn_mm = (size_t) nn*mm;
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
@ -1079,7 +1076,7 @@ const NRSMat<double> NRMat<double>::timestransposed() const {
|
||||
#ifdef MATPTR
|
||||
r(i, j) = cblas_ddot(mm, v[i], 1, v[j], 1);
|
||||
#else
|
||||
r(i, j) = cblas_ddot(mm, v + i*mm, 1, v + j*mm, 1);
|
||||
r(i, j) = cblas_ddot(mm, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@ -1087,7 +1084,7 @@ const NRSMat<double> NRMat<double>::timestransposed() const {
|
||||
}else{
|
||||
for(i=0; i<nn; ++i){
|
||||
for(j=0; j<=i; ++j){
|
||||
r(i, j) = cublasDdot(nn, v + i*mm, 1, v + j*mm, 1);
|
||||
r(i, j) = cublasDdot(nn, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1);
|
||||
TEST_CUBLAS("cublasDdot");
|
||||
}
|
||||
}
|
||||
@ -1113,7 +1110,7 @@ const NRSMat<complex<double> > NRMat<complex<double> >::timestransposed() const
|
||||
#ifdef MATPTR
|
||||
cblas_zdotc_sub(nn, v[i], 1, v[j], 1, &r(i,j));
|
||||
#else
|
||||
cblas_zdotc_sub(nn, v + i*mm, 1, v + j*mm, 1, &r(i,j));
|
||||
cblas_zdotc_sub(nn, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1, &r(i,j));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@ -1121,7 +1118,7 @@ const NRSMat<complex<double> > NRMat<complex<double> >::timestransposed() const
|
||||
}else{
|
||||
for(i=0; i<mm; ++i){
|
||||
for(j=0; j<=i; ++j){
|
||||
cuDoubleComplex val = cublasZdotc(nn, (const cuDoubleComplex *)(v + i*mm), 1, (const cuDoubleComplex *)(v + j*mm), 1);
|
||||
cuDoubleComplex val = cublasZdotc(nn, (const cuDoubleComplex *)(v + i*(size_t)mm), 1, (const cuDoubleComplex *)(v + j*(size_t)mm), 1);
|
||||
TEST_CUBLAS("cublasZdotc");
|
||||
r(i, j) = *(reinterpret_cast<complex<double>*> (&val));
|
||||
}
|
||||
@ -1172,7 +1169,7 @@ void NRMat<double>::randomize(const double &x) {
|
||||
}else{
|
||||
NRMat<double> tmp(nn, mm, cpu);
|
||||
double *tmp_data = tmp;
|
||||
for(register int i=0; i<nn*mm; ++i){
|
||||
for(register size_t i=0; i<(size_t)nn*mm; ++i){
|
||||
tmp_data[i] = x*(2.*random()/(1. + RAND_MAX) - 1.);
|
||||
}
|
||||
tmp.moveto(this->location);
|
||||
@ -1203,7 +1200,7 @@ void NRMat<complex<double> >::randomize(const double &x) {
|
||||
}else{
|
||||
NRMat<complex<double> > tmp(nn, mm, cpu);
|
||||
complex<double> *tmp_data = tmp;
|
||||
for(register int i=0; i<nn*mm; ++i){
|
||||
for(register size_t i=0; i<(size_t)nn*mm; ++i){
|
||||
const double re = x*(2.*random()/(1. + RAND_MAX) - 1.);
|
||||
const double im = x*(2.*random()/(1. + RAND_MAX) - 1.);
|
||||
tmp_data[i] = complex<double>(re, im);
|
||||
@ -1226,10 +1223,10 @@ NRMat<double>& NRMat<double>::operator*=(const double &a) {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_dscal(nn*mm, a, *this, 1);
|
||||
cblas_dscal((size_t)nn*mm, a, *this, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasDscal(nn*mm, a, v, 1);
|
||||
cublasDscal((size_t)nn*mm, a, v, 1);
|
||||
TEST_CUBLAS("cublasDscal");
|
||||
}
|
||||
#endif
|
||||
@ -1249,11 +1246,11 @@ NRMat<complex<double> >::operator*=(const complex<double> &a) {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_zscal(nn*mm, &a, (*this)[0], 1);
|
||||
cblas_zscal((size_t)nn*mm, &a, (*this)[0], 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
const cuDoubleComplex fac = *(reinterpret_cast<const cuDoubleComplex*> (&a));
|
||||
cublasZscal(nn*mm, fac, (cuDoubleComplex *)v, 1);
|
||||
cublasZscal((size_t)nn*mm, fac, (cuDoubleComplex *)v, 1);
|
||||
TEST_CUBLAS("cublasZscal");
|
||||
}
|
||||
#endif
|
||||
@ -1271,9 +1268,9 @@ NRMat<T> & NRMat<T>::operator*=(const T &a) {
|
||||
NOT_GPU(*this);
|
||||
copyonwrite();
|
||||
#ifdef MATPTR
|
||||
for(register int i=0; i< nn*mm; i++) v[0][i] *= a;
|
||||
for(register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= a;
|
||||
#else
|
||||
for(register int i=0; i< nn*mm; i++) v[i] *= a;
|
||||
for(register size_t i=0; i< (size_t)nn*mm; i++) v[i] *= a;
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
@ -1294,10 +1291,10 @@ NRMat<double> & NRMat<double>::operator+=(const NRMat<double> &rhs) {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1);
|
||||
cblas_daxpy((size_t)nn*mm, 1.0, rhs, 1, *this, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1);
|
||||
cublasDaxpy((size_t)nn*mm, 1.0, rhs, 1, v, 1);
|
||||
TEST_CUBLAS("cublasDaxpy");
|
||||
}
|
||||
#endif
|
||||
@ -1320,10 +1317,10 @@ NRMat<complex<double> >::operator+=(const NRMat< complex<double> > &rhs) {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_zaxpy(nn*mm, &CONE, rhs[0], 1, (*this)[0], 1);
|
||||
cblas_zaxpy((size_t)nn*mm, &CONE, rhs[0], 1, (*this)[0], 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasZaxpy(nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
|
||||
cublasZaxpy((size_t)nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
@ -1345,9 +1342,9 @@ NRMat<T> & NRMat<T>::operator+=(const NRMat<T> &rhs) {
|
||||
copyonwrite();
|
||||
|
||||
#ifdef MATPTR
|
||||
for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i];
|
||||
for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i];
|
||||
#else
|
||||
for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i];
|
||||
for(size_t i=0; i< (size_t)nn*mm; i++) v[i] += rhs.v[i];
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
@ -1368,10 +1365,10 @@ NRMat<double> & NRMat<double>::operator-=(const NRMat<double> &rhs) {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1);
|
||||
cblas_daxpy((size_t)nn*mm, -1.0, rhs, 1, *this, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1);
|
||||
cublasDaxpy((size_t)nn*mm, -1.0, rhs, 1, v, 1);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
@ -1395,10 +1392,10 @@ NRMat< complex<double> >::operator-=(const NRMat< complex<double> > &rhs) {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_zaxpy(nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1);
|
||||
cblas_zaxpy((size_t)nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasZaxpy(nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
|
||||
cublasZaxpy((size_t)nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
@ -1421,9 +1418,9 @@ NRMat<T> & NRMat<T>::operator-=(const NRMat<T> &rhs) {
|
||||
copyonwrite();
|
||||
|
||||
#ifdef MATPTR
|
||||
for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i];
|
||||
for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i];
|
||||
#else
|
||||
for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i];
|
||||
for(size_t i=0; i<(size_t) nn*mm; i++) v[i] += rhs.v[i];
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
@ -1693,10 +1690,10 @@ const double NRMat<double>::dot(const NRMat<double> &rhs) const {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
ret = cblas_ddot(nn*mm, (*this)[0], 1, rhs[0], 1);
|
||||
ret = cblas_ddot((size_t)nn*mm, (*this)[0], 1, rhs[0], 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
ret = cublasDdot(nn*mm, v, 1, rhs.v, 1);
|
||||
ret = cublasDdot((size_t)nn*mm, v, 1, rhs.v, 1);
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
@ -1721,10 +1718,10 @@ NRMat<complex<double> >::dot(const NRMat<complex<double> > &rhs) const {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_zdotc_sub(nn*mm, (*this)[0], 1, rhs[0], 1, &ret);
|
||||
cblas_zdotc_sub((size_t)nn*mm, (*this)[0], 1, rhs[0], 1, &ret);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cuDoubleComplex val = cublasZdotc(nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
|
||||
cuDoubleComplex val = cublasZdotc((size_t)nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
|
||||
ret = *(reinterpret_cast<complex<double>*> (&val));
|
||||
}
|
||||
#endif
|
||||
@ -1804,7 +1801,7 @@ void NRMat<double>::diagmultl(const NRVec<double> &rhs) {
|
||||
for(register int i=0; i<nn; i++){ cblas_dscal(mm, rhs[i], (*this)[i], 1); }
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int i=0; i<nn; i++){ cublasDscal(mm, rhs[i], v + i*mm, 1); }
|
||||
for(register int i=0; i<nn; i++){ cublasDscal(mm, rhs[i], v + i*(size_t)mm, 1); }
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -1830,7 +1827,7 @@ void NRMat< complex<double> >::diagmultl(const NRVec< complex<double> > &rhs) {
|
||||
}else{
|
||||
for(register int i=0; i<nn; i++){
|
||||
const cuDoubleComplex alpha = make_cuDoubleComplex(rhs[i].real(), rhs[i].imag());
|
||||
cublasZscal(mm, alpha, (cuDoubleComplex*)(v + i*mm), 1);
|
||||
cublasZscal(mm, alpha, (cuDoubleComplex*)(v + i*(size_t)mm), 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -1913,7 +1910,7 @@ NRMat<double>::operator*(const NRSMat<double> &rhs) const {
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int i=0; i<nn; i++){
|
||||
cublasDspmv('U', mm, 1.0, rhs.v, v + i*mm, 1, 0.0, result.v + i*rhs_ncols, 1);
|
||||
cublasDspmv('U', mm, 1.0, rhs.v, v + i*(size_t)mm, 1, 0.0, result.v + i*(size_t)rhs_ncols, 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -1947,7 +1944,7 @@ NRMat< complex<double> >::operator*(const NRSMat< complex<double> > &rhs) const
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int i=0; i<nn; i++){
|
||||
cublasZhpmv('U', mm, CUONE, (cuDoubleComplex*)rhs.v, (cuDoubleComplex*)(v + i*mm), 1, CUZERO, (cuDoubleComplex*)(result.v + i*rhs_ncols), 1);
|
||||
cublasZhpmv('U', mm, CUONE, (cuDoubleComplex*)rhs.v, (cuDoubleComplex*)(v + i*(size_t)mm), 1, CUZERO, (cuDoubleComplex*)(result.v + i*(size_t)rhs_ncols), 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -1974,10 +1971,10 @@ NRMat<complex<double> >& NRMat<complex<double> >::conjugateme() {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_dscal(mm*nn, -1.0, (double *)((*this)[0]) + 1, 2);
|
||||
cblas_dscal((size_t)mm*nn, -1.0, (double *)((*this)[0]) + 1, 2);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasDscal(mm*nn, -1.0, (double *)(this->v) + 1, 2);
|
||||
cublasDscal((size_t)mm*nn, -1.0, (double *)(this->v) + 1, 2);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
@ -2048,12 +2045,12 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
|
||||
const char transa, const NRMat<double> &b, const char transb,
|
||||
const double &alpha) {
|
||||
|
||||
int k(transa=='n'?a.mm:a.nn);
|
||||
int k(tolower(transa)=='n'?a.mm:a.nn);
|
||||
|
||||
#ifdef DEBUG
|
||||
int l(transa=='n'?a.nn:a.mm);
|
||||
int kk(transb=='n'?b.nn:b.mm);
|
||||
int ll(transb=='n'?b.mm:b.nn);
|
||||
int l(tolower(transa)=='n'?a.nn:a.mm);
|
||||
int kk(tolower(transb)=='n'?b.nn:b.mm);
|
||||
int ll(tolower(transb)=='n'?b.mm:b.nn);
|
||||
if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<double>::gemm(...)");
|
||||
if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm");
|
||||
#endif
|
||||
@ -2066,8 +2063,8 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans),
|
||||
(transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
|
||||
cblas_dgemm(CblasRowMajor, (tolower(transa)=='n' ? CblasNoTrans : CblasTrans),
|
||||
(tolower(transb)=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
|
||||
a.mm, b , b.mm, beta, *this , mm);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
@ -2083,20 +2080,20 @@ void NRMat<complex<double> >::gemm(const complex<double> & beta,
|
||||
const NRMat<complex<double> > & b, const char transb,
|
||||
const complex<double> & alpha)
|
||||
{
|
||||
int k(transa=='n'?a.mm:a.nn);
|
||||
int k(tolower(transa)=='n'?a.mm:a.nn);
|
||||
|
||||
#ifdef DEBUG
|
||||
int l(transa=='n'?a.nn:a.mm);
|
||||
int kk(transb=='n'?b.nn:b.mm);
|
||||
int ll(transb=='n'?b.mm:b.nn);
|
||||
int l(tolower(transa)=='n'?a.nn:a.mm);
|
||||
int kk(tolower(transb)=='n'?b.nn:b.mm);
|
||||
int ll(tolower(transb)=='n'?b.mm:b.nn);
|
||||
if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<complex<double> >::gemm(...)");
|
||||
#endif
|
||||
if (alpha==CZERO && beta==CONE) return;
|
||||
|
||||
copyonwrite();
|
||||
cblas_zgemm(CblasRowMajor,
|
||||
(transa=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)),
|
||||
(transb=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)),
|
||||
(tolower(transa)=='n' ? CblasNoTrans : (tolower(transa)=='c'?CblasConjTrans:CblasTrans)),
|
||||
(tolower(transb)=='n' ? CblasNoTrans : (tolower(transb)=='c'?CblasConjTrans:CblasTrans)),
|
||||
nn, mm, k, &alpha, a , a.mm, b , b.mm, &beta, *this , mm);
|
||||
}
|
||||
|
||||
@ -2113,10 +2110,10 @@ const double NRMat<double>::norm(const double scalar) const {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
return cblas_dnrm2(nn*mm, (*this)[0], 1);
|
||||
return cblas_dnrm2((size_t)nn*mm, (*this)[0], 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
return cublasDnrm2(nn*mm, v, 1);
|
||||
return cublasDnrm2((size_t)nn*mm, v, 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -2130,7 +2127,7 @@ const double NRMat<double>::norm(const double scalar) const {
|
||||
#ifdef MATPTR
|
||||
tmp = v[i][j];
|
||||
#else
|
||||
tmp = v[i*mm+j];
|
||||
tmp = v[i*(size_t)mm+j];
|
||||
#endif
|
||||
if(i == j) tmp -= scalar;
|
||||
sum += tmp*tmp;
|
||||
@ -2152,10 +2149,10 @@ const double NRMat<complex<double> >::norm(const complex<double> scalar) const {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
return cblas_dznrm2(nn*mm, (*this)[0], 1);
|
||||
return cblas_dznrm2((size_t)nn*mm, (*this)[0], 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
return cublasDznrm2(nn*mm, (cuDoubleComplex*)v, 1);
|
||||
return cublasDznrm2((size_t)nn*mm, (cuDoubleComplex*)v, 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -2168,7 +2165,7 @@ const double NRMat<complex<double> >::norm(const complex<double> scalar) const {
|
||||
#ifdef MATPTR
|
||||
tmp = v[i][j];
|
||||
#else
|
||||
tmp = v[i*mm+j];
|
||||
tmp = v[i*(size_t)mm+j];
|
||||
#endif
|
||||
if(i == j) tmp -= scalar;
|
||||
const double re = tmp.real();
|
||||
@ -2195,10 +2192,10 @@ void NRMat<double>::axpy(const double alpha, const NRMat<double> &mat) {
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_daxpy(nn*mm, alpha, mat, 1, *this, 1);
|
||||
cblas_daxpy((size_t)nn*mm, alpha, mat, 1, *this, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasDaxpy(nn*mm, alpha, mat, 1, *this, 1);
|
||||
cublasDaxpy((size_t)nn*mm, alpha, mat, 1, *this, 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -2221,7 +2218,7 @@ void NRMat<complex<double> >::axpy(const complex<double> alpha,
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_zaxpy(nn*mm, &alpha, mat, 1, (*this)[0], 1);
|
||||
cblas_zaxpy((size_t)nn*mm, &alpha, mat, 1, (*this)[0], 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
|
||||
@ -2245,7 +2242,7 @@ const T NRMat<T>::trace() const {
|
||||
#ifdef MATPTR
|
||||
for(register int i=0; i<nn; ++i) sum += v[i][i];
|
||||
#else
|
||||
for(register int i=0; i<nn*nn; i += (nn+1)) sum += v[i];
|
||||
for(register size_t i=0; i<(size_t)nn*nn; i += (nn+1)) sum += v[i];
|
||||
#endif
|
||||
return sum;
|
||||
}
|
||||
@ -2554,7 +2551,7 @@ NRMat<double>& NRMat<double>::swap_rows(){
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int i=0; i<n_pul; i++){
|
||||
cublasDswap(mm, v + i*mm, 1, v + (nn - i - 1)*mm, 1);
|
||||
cublasDswap(mm, v + i*(size_t)mm, 1, v + (nn - i - 1)*mm, 1);
|
||||
TEST_CUBLAS("cublasDswap");
|
||||
}
|
||||
}
|
||||
@ -2580,7 +2577,7 @@ NRMat<complex<double> >& NRMat<complex<double> >::swap_rows(){
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int i=0; i<n_pul; i++){
|
||||
cublasZswap(mm, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm), 1);
|
||||
cublasZswap(mm, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm), 1);
|
||||
TEST_CUBLAS("cublasZswap");
|
||||
}
|
||||
}
|
||||
@ -2613,7 +2610,7 @@ NRMat<T>& NRMat<T>::swap_rows(){
|
||||
}else{
|
||||
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows");
|
||||
for(register int i=0; i<n_pul; i++){
|
||||
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*mm), 1, (float *)(v + (nn - i - 1)*mm), 1);
|
||||
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*(size_t)mm), 1, (float *)(v + (nn - i - 1)*mm), 1);
|
||||
TEST_CUBLAS("cublasSswap");
|
||||
}
|
||||
}
|
||||
@ -2745,7 +2742,7 @@ NRMat<double>& NRMat<double>::swap_rows_cols(){
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int i=0; i<n_pul; i++){
|
||||
cublasDswap(mm, v + i*mm, 1, v + (nn - i - 1)*mm + mm - 1, -1);
|
||||
cublasDswap(mm, v + i*(size_t)mm, 1, v + (nn - i - 1)*mm + mm - 1, -1);
|
||||
TEST_CUBLAS("cublasDswap");
|
||||
}
|
||||
|
||||
@ -2792,7 +2789,7 @@ NRMat<complex<double> >& NRMat<complex<double> >::swap_rows_cols(){
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
for(register int i=0;i<n_pul;i++){
|
||||
cublasZswap(mm, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm + mm - 1), -1);
|
||||
cublasZswap(mm, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm + mm - 1), -1);
|
||||
TEST_CUBLAS("cublasZswap");
|
||||
}
|
||||
if(nn & 1){
|
||||
@ -2817,7 +2814,7 @@ template<typename T>
|
||||
NRMat<T>& NRMat<T>::swap_rows_cols(){
|
||||
const int n_pul = nn >> 1;
|
||||
const int m_pul = mm >> 1;
|
||||
const int dim = nn*mm;
|
||||
const size_t dim = (size_t)nn*mm;
|
||||
|
||||
T *data_ptr;
|
||||
T tmp;
|
||||
@ -2837,7 +2834,7 @@ NRMat<T>& NRMat<T>::swap_rows_cols(){
|
||||
}else{
|
||||
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows_cols");
|
||||
for(register int i=0; i<n_pul; i++){
|
||||
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*mm), 1, (float *)(v + (nn - i - 1)*mm) - 1, -1);
|
||||
cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*(size_t)mm), 1, (float *)(v + (nn - i - 1)*mm) - 1, -1);
|
||||
TEST_CUBLAS("cublasSswap");
|
||||
}
|
||||
|
||||
|
109
mat.h
109
mat.h
@ -39,10 +39,10 @@ protected:
|
||||
T *v;//!< pointer to the data stored continuously in emmory
|
||||
#endif
|
||||
int *count;//!< reference counter
|
||||
public:
|
||||
#ifdef CUDALA
|
||||
GPUID location;
|
||||
#endif
|
||||
public:
|
||||
friend class NRVec<T>;
|
||||
friend class NRSMat<T>;
|
||||
|
||||
@ -89,16 +89,16 @@ public:
|
||||
//! explicit constructor converting vector into a <code>NRMat<T></code> object
|
||||
#ifdef MATPTR
|
||||
explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){
|
||||
if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
|
||||
if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
|
||||
};
|
||||
#else
|
||||
explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0);
|
||||
#endif
|
||||
|
||||
#ifdef MATPTR
|
||||
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],nn*mm);} //memcmp for scalars else elementwise
|
||||
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],(size_t)nn*mm);} //memcmp for scalars else elementwise
|
||||
#else
|
||||
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,nn*mm);} //memcmp for scalars else elementwise
|
||||
const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,(size_t)nn*mm);} //memcmp for scalars else elementwise
|
||||
#endif
|
||||
|
||||
const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
|
||||
@ -107,7 +107,7 @@ public:
|
||||
inline int getcount() const {return count?*count:0;}
|
||||
|
||||
//! ensure that the data of this matrix are referenced exactly once
|
||||
void copyonwrite();
|
||||
void copyonwrite(bool detachonly=false);
|
||||
|
||||
/***************************************************************************//**
|
||||
* routines for CUDA related stuff
|
||||
@ -260,7 +260,7 @@ public:
|
||||
//! get the number of columns
|
||||
inline int ncols() const;
|
||||
//! get the number of matrix elements
|
||||
inline int size() const;
|
||||
inline size_t size() const;
|
||||
|
||||
//! unformatted input
|
||||
void get(int fd, bool dimensions = 1, bool transposed = false);
|
||||
@ -274,8 +274,8 @@ public:
|
||||
//! set all matrix elements equal to zero
|
||||
void clear(){
|
||||
if(nn&&mm){
|
||||
copyonwrite();
|
||||
LA_traits<T>::clear((*this)[0], nn*mm);
|
||||
copyonwrite(true);
|
||||
LA_traits<T>::clear((*this)[0], (size_t)nn*mm);
|
||||
}
|
||||
};
|
||||
|
||||
@ -379,7 +379,7 @@ template <typename T>
|
||||
NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
|
||||
T* p;
|
||||
*count = 1;
|
||||
const int nm = n*m;
|
||||
const size_t nm = (size_t)n*m;
|
||||
#ifdef CUDALA
|
||||
location = (loc==undefined?DEFAULT_LOC:loc);
|
||||
if(location == cpu) {
|
||||
@ -408,7 +408,7 @@ NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
|
||||
const int nm = n*m;
|
||||
const size_t nm = (size_t)n*m;
|
||||
T *p;
|
||||
*count = 1;
|
||||
|
||||
@ -447,7 +447,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n),
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) {
|
||||
const int nm = n*m;
|
||||
const size_t nm = (size_t)n*m;
|
||||
T *p;
|
||||
*count = 1;
|
||||
|
||||
@ -460,7 +460,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
|
||||
p = v[0] = new T[nm];
|
||||
for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
|
||||
#else
|
||||
p = v = new T[m*n];
|
||||
p = v = new T[nm];
|
||||
#endif
|
||||
if (a != (T)0)
|
||||
for (register int i=0; i<nm; i++) *p++ = a;
|
||||
@ -483,7 +483,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) {
|
||||
const int nm = n*m;
|
||||
const size_t nm = (size_t)n*m;
|
||||
#ifdef CUDALA
|
||||
location = DEFAULT_LOC;
|
||||
#endif
|
||||
@ -546,10 +546,10 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
|
||||
*count = 1;
|
||||
#ifdef MATPTR
|
||||
v = new T*[nn];
|
||||
v[0] = new T[mm*nn];
|
||||
v[0] = new T[(size_t)mm*nn];
|
||||
for (int i=1; i<nn; i++) v[i] = v[i-1] + mm;
|
||||
#else
|
||||
v = new T[mm*nn];
|
||||
v = new T[(size_t)mm*nn];
|
||||
#endif
|
||||
|
||||
#ifdef MATPTR
|
||||
@ -561,7 +561,7 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
|
||||
#else
|
||||
for (i=0; i<nn; i++){
|
||||
for (j=0; j<=i; j++){
|
||||
v[i*nn + j] = v[j*nn + i] = rhs[k++];
|
||||
v[i*(size_t)nn + j] = v[j*(size_t)nn + i] = rhs[k++];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -578,7 +578,7 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
|
||||
template <typename T>
|
||||
NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
|
||||
{
|
||||
if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
|
||||
if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
|
||||
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
@ -628,7 +628,7 @@ inline T* NRMat<T>::operator[](const int i) {
|
||||
#ifdef MATPTR
|
||||
return v[i];
|
||||
#else
|
||||
return v + i*mm;
|
||||
return v + i*(size_t)mm;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -646,7 +646,7 @@ inline const T* NRMat<T>::operator[](const int i) const {
|
||||
#ifdef MATPTR
|
||||
return v[i];
|
||||
#else
|
||||
return v + i*mm;
|
||||
return v + i*(size_t)mm;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -668,7 +668,7 @@ inline T& NRMat<T>::operator()(const int i, const int j){
|
||||
#ifdef MATPTR
|
||||
return v[i][j];
|
||||
#else
|
||||
return v[i*mm + j];
|
||||
return v[i*(size_t)mm + j];
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -689,7 +689,7 @@ inline const T& NRMat<T>::operator()(const int i, const int j) const{
|
||||
#ifdef MATPTR
|
||||
return v[i][j];
|
||||
#else
|
||||
return v[i*mm + j];
|
||||
return v[i*(size_t)mm + j];
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -712,11 +712,11 @@ inline const T NRMat<T>::get_ij(const int i, const int j) const{
|
||||
#ifdef MATPTR
|
||||
return v[i][j];
|
||||
#else
|
||||
return v[i*mm + j];
|
||||
return v[i*(size_t)mm + j];
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
const int pozice = i*mm + j;
|
||||
const size_t pozice = i*(size_t)mm + j;
|
||||
gpuget(1, sizeof(T), v + pozice, &ret);
|
||||
return ret;
|
||||
}
|
||||
@ -743,8 +743,8 @@ inline int NRMat<T>::ncols() const{
|
||||
* @return number of elements
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
inline int NRMat<T>::size() const{
|
||||
return nn*mm;
|
||||
inline size_t NRMat<T>::size() const{
|
||||
return (size_t)nn*mm;
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
@ -795,7 +795,7 @@ inline const double NRMat<double>::amax() const{
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
double ret(0.0);
|
||||
const int pozice = cublasIdamax(nn*mm, v, 1) - 1;
|
||||
const size_t pozice = cublasIdamax((size_t)nn*mm, v, 1) - 1;
|
||||
TEST_CUBLAS("cublasIdamax");
|
||||
gpuget(1, sizeof(double), v + pozice, &ret);
|
||||
return ret;
|
||||
@ -815,7 +815,7 @@ inline const double NRMat<double>::amin() const{
|
||||
if(location == cpu){
|
||||
#endif
|
||||
// idamin seems not to be supported
|
||||
const int nm = nn*mm;
|
||||
const size_t nm = (size_t)nn*mm;
|
||||
double val(0.0);
|
||||
int index(-1);
|
||||
ret = std::numeric_limits<double>::max();
|
||||
@ -834,7 +834,7 @@ inline const double NRMat<double>::amin() const{
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
const int pozice = cublasIdamin(nn*mm, v, 1) - 1;
|
||||
const size_t pozice = cublasIdamin((size_t)nn*mm, v, 1) - 1;
|
||||
TEST_CUBLAS("cublasIdamin");
|
||||
gpuget(1, sizeof(double), v + pozice, &ret);
|
||||
}
|
||||
@ -860,7 +860,7 @@ inline const complex<double> NRMat<complex<double> >::amax() const{
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
complex<double> ret(0.0, 0.0);
|
||||
const int pozice = cublasIzamax(nn*mm, (cuDoubleComplex*)v, 1) - 1;
|
||||
const size_t pozice = cublasIzamax((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1;
|
||||
TEST_CUBLAS("cublasIzamax");
|
||||
gpuget(1, sizeof(complex<double>), v + pozice, &ret);
|
||||
return ret;
|
||||
@ -881,7 +881,7 @@ inline const complex<double> NRMat<complex<double> >::amin() const{
|
||||
if(location == cpu){
|
||||
#endif
|
||||
// idamin seems not to be supported
|
||||
const int nm = nn*mm;
|
||||
const size_t nm = (size_t)nn*mm;
|
||||
int index(-1);
|
||||
double val(0.0), min_val(0.0);
|
||||
complex<double> z_val(0.0, 0.0);
|
||||
@ -903,7 +903,7 @@ inline const complex<double> NRMat<complex<double> >::amin() const{
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
const int pozice = cublasIzamin(nn*mm, (cuDoubleComplex*)v, 1) - 1;
|
||||
const size_t pozice = cublasIzamin((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1;
|
||||
TEST_CUBLAS("cublasIzamin");
|
||||
gpuget(1, sizeof(complex<double>), v + pozice, &ret);
|
||||
}
|
||||
@ -991,7 +991,7 @@ NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs) {
|
||||
* @see NRMat<T>::count, NRMat<T>::operator|=()
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
void NRMat<T>::copyonwrite() {
|
||||
void NRMat<T>::copyonwrite(bool detachonly) {
|
||||
if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0");
|
||||
if(*count > 1){
|
||||
(*count)--;
|
||||
@ -1002,20 +1002,20 @@ void NRMat<T>::copyonwrite() {
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
T **newv = new T*[nn];
|
||||
newv[0] = new T[mm*nn];
|
||||
memcpy(newv[0], v[0], mm*nn*sizeof(T));
|
||||
newv[0] = new T[(size_t)mm*nn];
|
||||
if(!detachonly) memcpy(newv[0], v[0], (size_t)mm*nn*sizeof(T));
|
||||
v = newv;
|
||||
for(register int i=1; i<nn; i++) v[i] = v[i-1] + mm;
|
||||
#else
|
||||
T *newv = new T[mm*nn];
|
||||
memcpy(newv, v, mm*nn*sizeof(T));
|
||||
T *newv = new T[(size_t)mm*nn];
|
||||
if(!detachonly) memcpy(newv, v, (size_t)mm*nn*sizeof(T));
|
||||
v = newv;
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}else{ //matrix is in GPU memory
|
||||
T *newv = (T *) gpualloc(mm*nn*sizeof(T));
|
||||
T *newv = (T *) gpualloc((size_t)mm*nn*sizeof(T));
|
||||
if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
|
||||
cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
|
||||
if(!detachonly) cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
|
||||
TEST_CUBLAS("cublasScopy");
|
||||
v = newv;
|
||||
}
|
||||
@ -1082,14 +1082,14 @@ void NRMat<T>::resize(int n, int m) {
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
v = new T*[nn];
|
||||
v[0] = new T[m*n];
|
||||
v[0] = new T[(size_t)m*n];
|
||||
for (register int i=1; i< n; i++) v[i] = v[i-1] + m;
|
||||
#else
|
||||
v = new T[m*n];
|
||||
v = new T[(size_t)m*n];
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
v = (T *) gpualloc(n*m*sizeof(T));
|
||||
v = (T *) gpualloc((size_t)n*m*sizeof(T));
|
||||
}
|
||||
#endif
|
||||
return;
|
||||
@ -1108,15 +1108,15 @@ void NRMat<T>::resize(int n, int m) {
|
||||
delete[] v;
|
||||
#ifdef MATPTR
|
||||
v = new T*[nn];
|
||||
v[0] = new T[m*n];
|
||||
v[0] = new T[(size_t)m*n];
|
||||
for (int i=1; i< n; i++) v[i] = v[i-1] + m;
|
||||
#else
|
||||
v = new T[m*n];
|
||||
v = new T[(size_t)m*n];
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
gpufree(v);
|
||||
v=(T *) gpualloc(n*m*sizeof(T));
|
||||
v=(T *) gpualloc((size_t)n*m*sizeof(T));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -1228,7 +1228,7 @@ public:
|
||||
#ifdef MATPTR
|
||||
return NRMat<T>::v[i - 1][j - 1];
|
||||
#else
|
||||
return NRMat<T>::v[(i-1)*NRMat<T>::mm+j-1];
|
||||
return NRMat<T>::v[(i-1)*(size_t)NRMat<T>::mm+j-1];
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -1258,11 +1258,11 @@ public:
|
||||
#ifdef MATPTR
|
||||
return NRMat<T>::v[i - 1][j - 1];
|
||||
#else
|
||||
return NRMat<T>::v[(i-1)*NRMat<T>::mm + (j-1)];
|
||||
return NRMat<T>::v[(size_t)(i-1)*NRMat<T>::mm + (j-1)];
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
const int pozice = (i-1)*NRMat<T>::mm + (j-1);
|
||||
const size_t pozice = (size_t)(i-1)*NRMat<T>::mm + (j-1);
|
||||
gpuget(1, sizeof(T), NRMat<T>::v + pozice, &ret);
|
||||
return ret;
|
||||
}
|
||||
@ -1286,10 +1286,10 @@ NRMat<T>& NRMat<T>::operator^=(const NRMat<T> &rhs){
|
||||
|
||||
copyonwrite();// ensure that *count == 1
|
||||
#ifdef MATPTR
|
||||
for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i];
|
||||
for (register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= rhs.v[0][i];
|
||||
#else
|
||||
const int Dim = nn*mm;
|
||||
for(register int i=0; i<Dim; i++) v[i] *= rhs.v[i];
|
||||
const size_t Dim = (size_t)nn*mm;
|
||||
for(register size_t i=0; i<Dim; i++) v[i] *= rhs.v[i];
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
@ -1320,14 +1320,14 @@ void NRMat<T>::moveto(const GPUID dest) {
|
||||
T *vold = v;
|
||||
|
||||
if(dest == cpu){ //moving from GPU to CPU
|
||||
v = new T[nn*mm];
|
||||
gpuget(nn*mm, sizeof(T), vold, v);
|
||||
v = new T[(size_t)nn*mm];
|
||||
gpuget((size_t)nn*mm, sizeof(T), vold, v);
|
||||
if(*count == 1){ gpufree(vold); }
|
||||
else{ --(*count); count = new int(1); }
|
||||
|
||||
}else{ //moving from CPU to GPU
|
||||
v = (T *) gpualloc(nn*mm*sizeof(T));
|
||||
gpuput(nn*mm, sizeof(T), vold, v);
|
||||
v = (T *) gpualloc((size_t)nn*mm*sizeof(T));
|
||||
gpuput((size_t)nn*mm, sizeof(T), vold, v);
|
||||
if(*count == 1) delete[] vold;
|
||||
else{ --(*count); count = new int(1);}
|
||||
}
|
||||
@ -1351,3 +1351,4 @@ NRVECMAT_OPER2(Mat, -)
|
||||
|
||||
}//end of the LA-namespace
|
||||
#endif/* _LA_MAT_H_ */
|
||||
|
||||
|
39
noncblas.cc
39
noncblas.cc
@ -195,13 +195,14 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
|
||||
{
|
||||
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
|
||||
if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
|
||||
char U = BLAS_FORTRANCASE('u');
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
|
||||
FORNAME(dspmv) (&U,&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
|
||||
#else
|
||||
FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
|
||||
FORNAME(dspmv) (&U,&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -214,13 +215,14 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
|
||||
{
|
||||
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
|
||||
if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
|
||||
char U = BLAS_FORTRANCASE('u');
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
|
||||
FORNAME(zhpmv) (&U,&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
|
||||
#else
|
||||
FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
|
||||
FORNAME(zhpmv) (&U,&N, alpha, Ap, X, &incX, beta, Y, &incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -298,6 +300,8 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
|
||||
{
|
||||
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
|
||||
//swap a-b, m-n
|
||||
char transb = BLAS_FORTRANCASE(TransB==CblasNoTrans?'N':'T');
|
||||
char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T');
|
||||
#ifdef FORINT
|
||||
const FINT mtmp=M;
|
||||
const FINT ntmp=N;
|
||||
@ -305,10 +309,10 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
|
||||
const FINT ldatmp=lda;
|
||||
const FINT ldbtmp=ldb;
|
||||
const FINT ldctmp=ldc;
|
||||
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
|
||||
FORNAME(dgemm) (&transb,&transa,
|
||||
&ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
|
||||
#else
|
||||
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
|
||||
FORNAME(dgemm) (&transb,&transa,
|
||||
&N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
|
||||
#endif
|
||||
}
|
||||
@ -322,6 +326,8 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
|
||||
{
|
||||
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
|
||||
//swap a-b, m-n
|
||||
char transb = BLAS_FORTRANCASE(TransB==CblasConjTrans?'C':(TransB==CblasNoTrans?'N':'T'));
|
||||
char transa = BLAS_FORTRANCASE(TransA==CblasConjTrans?'C':(TransA==CblasNoTrans?'N':'T'));
|
||||
#ifdef FORINT
|
||||
const FINT mtmp=M;
|
||||
const FINT ntmp=N;
|
||||
@ -329,12 +335,10 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
|
||||
const FINT ldatmp=lda;
|
||||
const FINT ldbtmp=ldb;
|
||||
const FINT ldctmp=ldc;
|
||||
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
|
||||
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
|
||||
FORNAME(zgemm) (&transb,&transa,
|
||||
&ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
|
||||
#else
|
||||
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
|
||||
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
|
||||
FORNAME(zgemm) ( &transb,&transa,
|
||||
&N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
|
||||
#endif
|
||||
}
|
||||
@ -347,19 +351,21 @@ void cblas_dgemv(const enum CBLAS_ORDER Order,
|
||||
const double *X, const int incX, const double beta,
|
||||
double *Y, const int incY)
|
||||
{
|
||||
char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T');
|
||||
char transax = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N');
|
||||
#ifdef FORINT
|
||||
const FINT mtmp=M;
|
||||
const FINT ntmp=N;
|
||||
const FINT ldatmp=lda;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
|
||||
if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
|
||||
//swap n-m and toggle transposition
|
||||
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
|
||||
else FORNAME(dgemv) (&transax, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
|
||||
#else
|
||||
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
|
||||
if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
|
||||
//swap n-m and toggle transposition
|
||||
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
|
||||
else FORNAME(dgemv) (&transax, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -374,15 +380,16 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
|
||||
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
|
||||
if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted");
|
||||
//swap n-m and toggle transposition
|
||||
char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N');
|
||||
#ifdef FORINT
|
||||
const FINT mtmp=M;
|
||||
const FINT ntmp=N;
|
||||
const FINT ldatmp=lda;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
|
||||
FORNAME(zgemv) (&transa, &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
|
||||
#else
|
||||
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
|
||||
FORNAME(zgemv) (&transa, &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
|
||||
#endif
|
||||
}
|
||||
|
||||
|
96
nonclass.cc
96
nonclass.cc
@ -17,6 +17,7 @@
|
||||
*/
|
||||
|
||||
//this can be safely included since it contains ifdefs NONCBLAS and NONCLAPACK anyway
|
||||
#include "la_traits.h"
|
||||
#include "noncblas.h"
|
||||
#include "vec.h"
|
||||
#include "smat.h"
|
||||
@ -196,7 +197,7 @@ static void linear_solve_do(NRSMat<double> &a, double *b, const int nrhs, const
|
||||
FINT r, *ipiv;
|
||||
a.copyonwrite();
|
||||
ipiv = new FINT[n];
|
||||
char U = 'U';
|
||||
char U = LAPACK_FORTRANCASE('u');
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=n;
|
||||
const FINT nrhstmp=nrhs;
|
||||
@ -298,9 +299,9 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
|
||||
const int A_rows = _A.nrows();
|
||||
const int A_cols = _A.ncols();
|
||||
|
||||
const char fact = _eq?'E':'N';
|
||||
const char trans = 'T';//because of c-order
|
||||
char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const
|
||||
const char fact = LAPACK_FORTRANCASE(_eq?'E':'N');
|
||||
const char trans = LAPACK_FORTRANCASE('T');//because of c-order
|
||||
char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const
|
||||
|
||||
if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){
|
||||
laerror("linear_solve_x: invalid input matrices");
|
||||
@ -371,9 +372,9 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
|
||||
const int A_rows = _A.nrows();
|
||||
const int A_cols = _A.ncols();
|
||||
|
||||
const char fact = _eq?'E':'N';
|
||||
const char trans = 'T';//because of c-order
|
||||
char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const
|
||||
const char fact = LAPACK_FORTRANCASE(_eq?'E':'N');
|
||||
const char trans = LAPACK_FORTRANCASE('T');//because of c-order
|
||||
char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const
|
||||
|
||||
if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){
|
||||
laerror("linear_solve_x: invalid input matrices");
|
||||
@ -557,9 +558,9 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
|
||||
if(b) b->copyonwrite();
|
||||
|
||||
FINT r = 0;
|
||||
char U ='U';
|
||||
char vectors = 'V';
|
||||
if (!eivec) vectors = 'N';
|
||||
char U =LAPACK_FORTRANCASE('u');
|
||||
char vectors = LAPACK_FORTRANCASE('v');
|
||||
if (!eivec) vectors = LAPACK_FORTRANCASE('n');
|
||||
FINT LWORK = -1;
|
||||
double WORKX;
|
||||
FINT ldb=0; if(b) ldb=b->ncols();
|
||||
@ -588,7 +589,7 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
|
||||
#endif
|
||||
|
||||
delete[] WORK;
|
||||
if (vectors == 'V' && corder) a.transposeme(n);
|
||||
if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) a.transposeme(n);
|
||||
|
||||
if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()");
|
||||
if (r > 0) laerror("convergence problem in sygv/syev in diagonalize()");
|
||||
@ -620,12 +621,13 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
|
||||
if(b) b->copyonwrite();
|
||||
|
||||
FINT r = 0;
|
||||
char U ='U';
|
||||
char vectors = 'V';
|
||||
if (!eivec) vectors = 'N';
|
||||
char U =LAPACK_FORTRANCASE('U');
|
||||
char vectors = LAPACK_FORTRANCASE('V');
|
||||
if (!eivec) vectors = LAPACK_FORTRANCASE('n');
|
||||
FINT LWORK = -1;
|
||||
complex<double> WORKX;
|
||||
FINT ldb=0; if(b) ldb=b->ncols();
|
||||
std::cout << "test vectors "<<vectors<<std::endl;
|
||||
|
||||
// First call is to determine size of workspace
|
||||
double *RWORK = new double[3*n+2];
|
||||
@ -652,7 +654,7 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
|
||||
|
||||
delete[] WORK;
|
||||
delete[] RWORK;
|
||||
if (vectors == 'V' && corder) a.transposeme(n);
|
||||
if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) {a.transposeme(n); a.conjugateme();}
|
||||
|
||||
if (r < 0) laerror("illegal argument in hegv/heev in diagonalize()");
|
||||
if (r > 0) laerror("convergence problem in hegv/heev in diagonalize()");
|
||||
@ -684,8 +686,8 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
|
||||
if(b) b->copyonwrite();
|
||||
|
||||
FINT r = 0;
|
||||
char U = 'U';
|
||||
char job = v ? 'v' : 'n';
|
||||
char U = LAPACK_FORTRANCASE('u');
|
||||
char job = LAPACK_FORTRANCASE(v ? 'v' : 'n');
|
||||
|
||||
double *WORK = new double[3*n];
|
||||
FINT ldv=v?v->ncols():n;
|
||||
@ -730,8 +732,8 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
|
||||
if(b) b->copyonwrite();
|
||||
|
||||
FINT r = 0;
|
||||
char U = 'U';
|
||||
char job = v ? 'v' : 'n';
|
||||
char U = LAPACK_FORTRANCASE('u');
|
||||
char job = LAPACK_FORTRANCASE(v ? 'v' : 'n');
|
||||
|
||||
complex<double> *WORK = new complex<double>[2*n];
|
||||
double *RWORK = new double[3*n];
|
||||
@ -890,8 +892,8 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
|
||||
const int sorttype, const int biorthonormalize,
|
||||
NRMat<double> *b, NRVec<double> *beta)
|
||||
{
|
||||
if(n<=0) n = a.nrows();
|
||||
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");
|
||||
if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
|
||||
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix");
|
||||
if (n > wr.size())
|
||||
laerror("inconsistent dimension of eigen vector in gdiagonalize()");
|
||||
if (vl) if (n > vl->nrows() || n > vl->ncols())
|
||||
@ -911,8 +913,8 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
|
||||
if (beta) beta->copyonwrite();
|
||||
if (b) b->copyonwrite();
|
||||
|
||||
char jobvl = vl ? 'V' : 'N';
|
||||
char jobvr = vr ? 'V' : 'N';
|
||||
char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n');
|
||||
char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n');
|
||||
double work0;
|
||||
FINT lwork = -1;
|
||||
FINT r;
|
||||
@ -1055,8 +1057,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
|
||||
NRMat<complex<double> > *b, NRVec<complex<double> > *beta)
|
||||
{
|
||||
|
||||
if(n<=0) n = a.nrows();
|
||||
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");
|
||||
if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
|
||||
if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix");
|
||||
if (n > w.size())
|
||||
laerror("inconsistent dimension of eigen vector in gdiagonalize()");
|
||||
if (vl) if (n > vl->nrows() || n > vl->ncols())
|
||||
@ -1075,8 +1077,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
|
||||
if (beta) beta->copyonwrite();
|
||||
if (b) b->copyonwrite();
|
||||
|
||||
char jobvl = vl ? 'V' : 'N';
|
||||
char jobvr = vr ? 'V' : 'N';
|
||||
char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n');
|
||||
char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n');
|
||||
complex<double> work0;
|
||||
FINT lwork = -1;
|
||||
FINT r;
|
||||
@ -1146,8 +1148,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
|
||||
|
||||
|
||||
if (corder) {
|
||||
if (vl) vl->transposeme(n);
|
||||
if (vr) vr->transposeme(n);
|
||||
if (vl) {vl->transposeme(n); vl->conjugateme();}
|
||||
if (vr) {vr->transposeme(n); vr->conjugateme();}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1159,8 +1161,8 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
|
||||
const bool corder, int n, const int sorttype, const int biorthonormalize,
|
||||
NRMat<double> *b, NRVec<double> *beta)
|
||||
{
|
||||
if(n<=0) n = a.nrows();
|
||||
if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() call for a non-square matrix");
|
||||
if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
|
||||
if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() of too big submatrix");
|
||||
|
||||
NRVec<double> wr(n), wi(n);
|
||||
NRMat<double> *rvl = 0;
|
||||
@ -1226,10 +1228,12 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
|
||||
template<>
|
||||
const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
NRMat<double> result(a.nrows(), a.ncols());
|
||||
|
||||
#ifdef CUDALA
|
||||
if(a.location == cpu){
|
||||
#endif
|
||||
// NRMat<double> result(a.nrows(), a.ncols());
|
||||
cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0], 2, result, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
@ -1242,11 +1246,13 @@ const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<dou
|
||||
template<>
|
||||
const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
|
||||
{
|
||||
NRMat<double> result(a.nrows(), a.ncols());
|
||||
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
if(a.location == cpu){
|
||||
#endif
|
||||
|
||||
NRMat<double> result(a.nrows(), a.ncols());
|
||||
// NRMat<double> result(a.nrows(), a.ncols());
|
||||
cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0]+1, 2, result, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
@ -1259,12 +1265,15 @@ const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<dou
|
||||
template<>
|
||||
const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &a)
|
||||
{
|
||||
|
||||
NRMat <complex<double> > result(a.nrows(), a.ncols());
|
||||
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
if(a.location == cpu){
|
||||
#endif
|
||||
|
||||
|
||||
NRMat <complex<double> > result(a.nrows(), a.ncols());
|
||||
// NRMat <complex<double> > result(a.nrows(), a.ncols());
|
||||
cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0], 2);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
@ -1278,11 +1287,13 @@ const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &
|
||||
template<>
|
||||
const NRMat< complex<double> > imagmatrix<NRMat<double> > (const NRMat<double> &a)
|
||||
{
|
||||
NRMat< complex<double> > result(a.nrows(), a.ncols());
|
||||
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
if(a.location == cpu){
|
||||
#endif
|
||||
|
||||
NRMat< complex<double> > result(a.nrows(), a.ncols());
|
||||
// NRMat< complex<double> > result(a.nrows(), a.ncols());
|
||||
cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0]+1, 2);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
@ -1577,7 +1588,7 @@ void cholesky(NRMat<double> &a, bool upper)
|
||||
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
|
||||
FINT lda=a.ncols();
|
||||
FINT n=a.nrows();
|
||||
char uplo=upper?'U':'L';
|
||||
char uplo= LAPACK_FORTRANCASE(upper?'u':'l');
|
||||
FINT info;
|
||||
a.copyonwrite();
|
||||
FORNAME(dpotrf)(&uplo, &n, a, &lda, &info);
|
||||
@ -1596,7 +1607,7 @@ void cholesky(NRMat<complex<double> > &a, bool upper)
|
||||
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
|
||||
FINT lda=a.ncols();
|
||||
FINT n=a.nrows();
|
||||
char uplo=upper?'U':'L';
|
||||
char uplo= LAPACK_FORTRANCASE(upper?'u':'l');
|
||||
FINT info;
|
||||
a.copyonwrite();
|
||||
a.transposeme();//switch to Fortran order
|
||||
@ -1788,3 +1799,4 @@ if(nchange&1)//still adjust to get determinant=1
|
||||
}
|
||||
|
||||
}//namespace
|
||||
|
||||
|
23
smat.cc
23
smat.cc
@ -28,11 +28,8 @@
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
|
||||
extern "C" {
|
||||
extern ssize_t read(int, void *, size_t);
|
||||
extern ssize_t write(int, const void *, size_t);
|
||||
}
|
||||
|
||||
namespace LA {
|
||||
|
||||
@ -130,7 +127,7 @@ NRSMat<T> & NRSMat<T>::operator=(const T &a) {
|
||||
NOT_GPU(*this);
|
||||
copyonwrite();
|
||||
memset(v, 0, NN2*sizeof(T));
|
||||
for(register int i=0; i<nn; i++) v[i*(i+1)/2 + i] = a;
|
||||
for(register int i=0; i<nn; i++) v[(size_t)i*(i+1)/2 + i] = a;
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -157,11 +154,11 @@ const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const
|
||||
|
||||
if(divide){
|
||||
for(register int i=0; i<nn; i++){
|
||||
const T a = v[i*(i+1)/2+i];
|
||||
const T a = v[(size_t)i*(i+1)/2+i];
|
||||
if(a != 0.) r[i] /= a;
|
||||
}
|
||||
}else{
|
||||
for(register int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
|
||||
for(register int i=0; i<nn; i++) r[i] = v[(size_t)i*(i+1)/2+i];
|
||||
}
|
||||
|
||||
return divide?NULL:&r[0];
|
||||
@ -178,7 +175,7 @@ const NRSMat<T> NRSMat<T>::operator-() const {
|
||||
NOT_GPU(*this);
|
||||
|
||||
NRSMat<T> result(nn, getlocation());
|
||||
for(register int i = 0; i<NN2; i++) result.v[i]= -v[i];
|
||||
for(register size_t i = 0; i<NN2; i++) result.v[i]= -v[i];
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -239,7 +236,7 @@ const T NRSMat<T>::trace() const {
|
||||
NOT_GPU(*this);
|
||||
|
||||
T tmp = 0;
|
||||
for(register int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
|
||||
for(register int i=0; i<nn; i++) tmp += v[(size_t)i*(i+1)/2+i];
|
||||
return tmp;
|
||||
}
|
||||
|
||||
@ -251,7 +248,7 @@ template<>
|
||||
void NRSMat<double>::randomize(const double &x) {
|
||||
NOT_GPU(*this);
|
||||
|
||||
for(int i=0; i<NN2; ++i){
|
||||
for(size_t i=0; i<NN2; ++i){
|
||||
v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
|
||||
}
|
||||
}
|
||||
@ -262,11 +259,11 @@ void NRSMat<double>::randomize(const double &x) {
|
||||
******************************************************************************/
|
||||
template<>
|
||||
void NRSMat<complex<double> >::randomize(const double &x) {
|
||||
for(register int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
|
||||
for(register int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
|
||||
for(register size_t i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
|
||||
for(register size_t i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
|
||||
for(register int i=0; i<nn; ++i){
|
||||
for(register int j=0; j<=i; ++j){
|
||||
if(i == j) v[i*(i+1)/2+j].imag() = 0; //hermitean
|
||||
if(i == j) v[i*(size_t)(i+1)/2+j].imag() = 0; //hermitean
|
||||
}
|
||||
}
|
||||
}
|
||||
|
70
smat.h
70
smat.h
@ -25,7 +25,7 @@
|
||||
#include "la_traits.h"
|
||||
|
||||
namespace LA {
|
||||
#define NN2 (nn*(nn+1)/2)
|
||||
#define NN2 ((size_t)nn*(nn+1)/2)
|
||||
|
||||
|
||||
/***************************************************************************//**
|
||||
@ -134,15 +134,15 @@ public:
|
||||
void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const {r.gemv(beta,*this,trans,alpha,x);};
|
||||
void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
|
||||
|
||||
inline const T& operator[](const int ij) const;
|
||||
inline T& operator[](const int ij);
|
||||
inline const T& operator[](const size_t ij) const;
|
||||
inline T& operator[](const size_t ij);
|
||||
|
||||
inline const T& operator()(const int i, const int j) const;
|
||||
inline T& operator()(const int i, const int j);
|
||||
|
||||
inline int nrows() const;
|
||||
inline int ncols() const;
|
||||
inline int size() const;
|
||||
inline size_t size() const;
|
||||
|
||||
inline bool transp(const int i, const int j) const {return i>j;} //this can be used for compact storage of matrices, which are actually not symmetric, but one triangle of them is redundant
|
||||
const typename LA_traits<T>::normtype norm(const T scalar = (T)0) const;
|
||||
@ -155,9 +155,9 @@ public:
|
||||
void get(int fd, bool dimensions = 1, bool transp = 0);
|
||||
void put(int fd, bool dimensions = 1, bool transp = 0) const;
|
||||
|
||||
void copyonwrite();
|
||||
void copyonwrite(bool detachonly=false);
|
||||
|
||||
void clear() {copyonwrite(); LA_traits<T>::clear(v,NN2);}; //zero out
|
||||
void clear() {copyonwrite(true); LA_traits<T>::clear(v,NN2);}; //zero out
|
||||
void resize(const int n);
|
||||
void dealloc(void) {resize(0);}
|
||||
|
||||
@ -212,7 +212,7 @@ inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1)) {
|
||||
if(location == cpu){
|
||||
#endif
|
||||
v = new T[NN2];
|
||||
if(a != (T)0) for(register int i = 0; i<NN2; i++) v[i] = a;
|
||||
if(a != (T)0) for(register size_t i = 0; i<NN2; i++) v[i] = a;
|
||||
else memset(v, 0, NN2*sizeof(T));
|
||||
|
||||
#ifdef CUDALA
|
||||
@ -338,7 +338,7 @@ inline NRSMat<T> & NRSMat<T>::operator*=(const T &a) {
|
||||
NOT_GPU(*this);
|
||||
|
||||
copyonwrite();
|
||||
for(register int i = 0; i<NN2; ++i) v[i] *= a;
|
||||
for(register size_t i = 0; i<NN2; ++i) v[i] *= a;
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -353,7 +353,7 @@ inline NRSMat<T> & NRSMat<T>::operator+=(const T &a) {
|
||||
NOT_GPU(*this);
|
||||
|
||||
copyonwrite();
|
||||
for(register int i = 0; i<nn; i++) v[i*(i+1)/2 + i] += a;
|
||||
for(register int i = 0; i<nn; i++) v[i*(size_t)(i+1)/2 + i] += a;
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -368,7 +368,7 @@ inline NRSMat<T> & NRSMat<T>::operator-=(const T &a) {
|
||||
NOT_GPU(*this);
|
||||
|
||||
copyonwrite();
|
||||
for(register int i = 0; i<nn; i++) v[i*(i+1)/2+i] -= a;
|
||||
for(register int i = 0; i<nn; i++) v[i*(size_t)(i+1)/2+i] -= a;
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -438,7 +438,7 @@ inline NRSMat<T>& NRSMat<T>::operator+=(const NRSMat<T>& rhs) {
|
||||
SAME_LOC(*this, rhs);
|
||||
|
||||
copyonwrite();
|
||||
for(register int i = 0; i<NN2; ++i) v[i] += rhs.v[i];
|
||||
for(register size_t i = 0; i<NN2; ++i) v[i] += rhs.v[i];
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -507,7 +507,7 @@ inline NRSMat<T>& NRSMat<T>::operator-=(const NRSMat<T>& rhs) {
|
||||
NOT_GPU(*this);
|
||||
copyonwrite();
|
||||
|
||||
for(register int i = 0; i<NN2; ++i) v[i] -= rhs.v[i];
|
||||
for(register size_t i = 0; i<NN2; ++i) v[i] -= rhs.v[i];
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -540,7 +540,7 @@ inline const NRMat<T> NRSMat<T>::operator-(const NRMat<T> &rhs) const {
|
||||
* @return reference to the corresponding matrix element
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
inline T& NRSMat<T>::operator[](const int ij) {
|
||||
inline T& NRSMat<T>::operator[](const size_t ij) {
|
||||
#ifdef DEBUG
|
||||
if(_LA_count_check && *count != 1) laerror("T& NRSMat<T>::operator[] used for matrix with count>1");
|
||||
if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range");
|
||||
@ -560,7 +560,7 @@ inline T& NRSMat<T>::operator[](const int ij) {
|
||||
* @return constant reference to the corresponding matrix element
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
inline const T & NRSMat<T>::operator[](const int ij) const {
|
||||
inline const T & NRSMat<T>::operator[](const size_t ij) const {
|
||||
#ifdef DEBUG
|
||||
if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range");
|
||||
if(!v) laerror("T& NRSMat<T>::operator[] used for unallocated NRSmat<T> object");
|
||||
@ -578,8 +578,8 @@ inline const T & NRSMat<T>::operator[](const int ij) const {
|
||||
* @return cumulative index
|
||||
******************************************************************************/
|
||||
template<typename T>
|
||||
inline T SMat_index(T i, T j) {
|
||||
return (i>=j) ? i*(i+1)/2+j : j*(j+1)/2+i;
|
||||
inline size_t SMat_index(T i, T j) {
|
||||
return (i>=j) ? i*(size_t)(i+1)/2+j : j*(size_t)(j+1)/2+i;
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
@ -590,8 +590,8 @@ inline T SMat_index(T i, T j) {
|
||||
* @return cumulative index
|
||||
******************************************************************************/
|
||||
template<typename T>
|
||||
inline T SMat_index_igej(T i, T j) {
|
||||
return i*(i+1)/2+j;
|
||||
inline size_t SMat_index_igej(T i, T j) {
|
||||
return i*(size_t)(i+1)/2+j;
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
@ -602,8 +602,8 @@ inline T SMat_index_igej(T i, T j) {
|
||||
* @return cumulative index
|
||||
******************************************************************************/
|
||||
template<typename T>
|
||||
inline T SMat_index_ilej(T i, T j) {
|
||||
return j*(j+1)/2+i;
|
||||
inline size_t SMat_index_ilej(T i, T j) {
|
||||
return j*(size_t)(j+1)/2+i;
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
@ -614,8 +614,8 @@ inline T SMat_index_ilej(T i, T j) {
|
||||
* @return cumulative index
|
||||
******************************************************************************/
|
||||
template<typename T>
|
||||
inline T SMat_index_1(T i, T j) {
|
||||
return (i>=j)? i*(i-1)/2+j-1 : j*(j-1)/2+i-1;
|
||||
inline size_t SMat_index_1(T i, T j) {
|
||||
return (i>=j)? i*(size_t)(i-1)/2+j-1 : j*(size_t)(j-1)/2+i-1;
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
@ -626,8 +626,8 @@ inline T SMat_index_1(T i, T j) {
|
||||
* @return cumulative index
|
||||
******************************************************************************/
|
||||
template<typename T>
|
||||
inline T SMat_index_1igej(T i, T j) {
|
||||
return i*(i-1)/2+j-1;
|
||||
inline size_t SMat_index_1igej(T i, T j) {
|
||||
return i*(size_t)(i-1)/2+j-1;
|
||||
}
|
||||
|
||||
/***************************************************************************//**
|
||||
@ -638,21 +638,21 @@ inline T SMat_index_1igej(T i, T j) {
|
||||
* @return cumulative index
|
||||
******************************************************************************/
|
||||
template<typename T>
|
||||
inline T SMat_index_1ilej(T i, T j) {
|
||||
return j*(j-1)/2+i-1;
|
||||
inline size_t SMat_index_1ilej(T i, T j) {
|
||||
return j*(size_t)(j-1)/2+i-1;
|
||||
}
|
||||
|
||||
//indexing for antisymmetric matrix (used by fourindex)
|
||||
|
||||
template<typename T>
|
||||
inline T ASMat_index(T i, T j)
|
||||
inline size_t ASMat_index(T i, T j)
|
||||
{
|
||||
if(i == j) return -1;
|
||||
return (i>j) ? i*(i-1)/2+j : j*(j-1)/2+i;
|
||||
return (i>j) ? i*(size_t)(i-1)/2+j : j*(size_t)(j-1)/2+i;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline T ASMat_index_1(T i, T j)
|
||||
inline size_t ASMat_index_1(T i, T j)
|
||||
{
|
||||
if(i == j) return -1;
|
||||
return (i>j)? (i-2)*(i-1)/2+j-1 : (j-2)*(j-1)/2+i-1;
|
||||
@ -715,7 +715,7 @@ inline int NRSMat<T>::ncols() const {
|
||||
* @return number of elements of this symmetric matrix of generalt type <code>T</code>
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
inline int NRSMat<T>::size() const {
|
||||
inline size_t NRSMat<T>::size() const {
|
||||
return NN2;
|
||||
}
|
||||
|
||||
@ -758,7 +758,7 @@ inline const double NRSMat<double>::amin() const {
|
||||
double val(0.0);
|
||||
int index(-1);
|
||||
ret = std::numeric_limits<double>::max();
|
||||
for(register int i = 0; i < NN2; i++){
|
||||
for(register size_t i = 0; i < NN2; i++){
|
||||
val = std::abs(v[i]);
|
||||
if(val < ret){ index = i; ret = val; }
|
||||
}
|
||||
@ -812,7 +812,7 @@ inline const complex<double> NRSMat<complex<double> >::amin() const{
|
||||
complex<double> z_val(0.0, 0.0);
|
||||
|
||||
min_val = std::numeric_limits<double>::max();
|
||||
for(register int i = 0; i < NN2; i++){
|
||||
for(register size_t i = 0; i < NN2; i++){
|
||||
z_val = v[i];
|
||||
val = std::abs(z_val.real()) + std::abs(z_val.imag());
|
||||
if(val < min_val){ index = i; min_val = val; }
|
||||
@ -920,7 +920,7 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs) {
|
||||
* @see NRSMat<T>::operator|=, NRSMat<T>::copyonwrite()
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
void NRSMat<T>::copyonwrite() {
|
||||
void NRSMat<T>::copyonwrite(bool detachonly) {
|
||||
if(!count) laerror("calling NRSMat<T>::copyonwrite() for undefined NRSMat<T> object");
|
||||
if(*count > 1){
|
||||
(*count)--;
|
||||
@ -931,12 +931,12 @@ void NRSMat<T>::copyonwrite() {
|
||||
if(location == cpu) {
|
||||
#endif
|
||||
newv = new T[NN2];
|
||||
memcpy(newv, v, NN2*sizeof(T));
|
||||
if(!detachonly) memcpy(newv, v, NN2*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
newv = (T *) gpualloc(NN2*sizeof(T));
|
||||
if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRSMat<T>::copyonwrite()");
|
||||
cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
|
||||
if(!detachonly) cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
|
||||
TEST_CUBLAS("cublasScopy");//"NRSMat<T>::copyonwrite()"
|
||||
}
|
||||
#endif
|
||||
|
@ -132,8 +132,8 @@ public:
|
||||
void setunsymmetric();//unwind the matrix assuming it was indeed symmetric
|
||||
inline bool issymmetric() const {return symmetric;}
|
||||
unsigned int length() const;
|
||||
void copyonwrite();
|
||||
void clear() {copyonwrite();unsort();deletelist();}
|
||||
void copyonwrite(bool detachonly=false);
|
||||
void clear() {copyonwrite(true); if(count) {delete count; count=NULL;}}
|
||||
void simplify();
|
||||
const T trace() const;
|
||||
const typename LA_traits<T>::normtype norm(const T scalar=(T)0) const; //is const only mathematically, not in internal implementation - we have to simplify first
|
||||
|
10
t.cc
10
t.cc
@ -1799,6 +1799,11 @@ for(int i=0; i<m.nrows(); ++i) for(int j=0; j<m.nrows(); ++j)
|
||||
if(i!=j) err += abs(m2(i,j));
|
||||
cout <<"offdiagonality error = "<<err<<endl;
|
||||
|
||||
err=0;
|
||||
for(int i=0; i<m.nrows(); ++i) err += abs(m2(i,i) - eivals[i]);
|
||||
cout <<"eigenvalue error = "<<err<<endl;
|
||||
|
||||
|
||||
//test as general matrix
|
||||
NRVec<complex<double> > ww(m.nrows());
|
||||
NRMat<complex<double> > vl(m.nrows(),m.nrows()), vr(m.nrows(),m.nrows());
|
||||
@ -1812,6 +1817,11 @@ for(int i=0; i<m.nrows(); ++i) for(int j=0; j<m.nrows(); ++j)
|
||||
if(i!=j) err += abs(m3(i,j));
|
||||
cout <<"offdiagonality error 2 = "<<err<<endl;
|
||||
|
||||
err=0;
|
||||
for(int i=0; i<m.nrows(); ++i) err += abs(m3(i,i) - ww[i]);
|
||||
cout <<"eigenvalue error = "<<err<<endl;
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
27
vec.cc
27
vec.cc
@ -27,11 +27,8 @@
|
||||
#include <errno.h>
|
||||
#include "vec.h"
|
||||
#include "qsort.h"
|
||||
#include <unistd.h>
|
||||
|
||||
extern "C" {
|
||||
extern ssize_t read(int, void *, size_t);
|
||||
extern ssize_t write(int, const void *, size_t);
|
||||
}
|
||||
|
||||
namespace LA {
|
||||
|
||||
@ -541,7 +538,7 @@ template<>
|
||||
void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
|
||||
const char trans, const double alpha, const NRVec &x) {
|
||||
#ifdef DEBUG
|
||||
if((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
|
||||
if((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
|
||||
#endif
|
||||
SAME_LOC3(*this, x, A);
|
||||
copyonwrite();
|
||||
@ -549,10 +546,10 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
|
||||
#ifdef CUDALA
|
||||
if(location==cpu){
|
||||
#endif
|
||||
cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
|
||||
cblas_dgemv(CblasRowMajor, (tolower(trans)=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
|
||||
cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
|
||||
TEST_CUBLAS("cublasDgemv");
|
||||
}
|
||||
#endif
|
||||
@ -572,7 +569,7 @@ template<>
|
||||
void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
|
||||
const char trans, const double alpha, const NRVec<complex<double> > &x) {
|
||||
#ifdef DEBUG
|
||||
if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
|
||||
if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
|
||||
#endif
|
||||
SAME_LOC3(*this, x, A);
|
||||
copyonwrite();
|
||||
@ -580,16 +577,16 @@ void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
|
||||
#ifdef CUDALA
|
||||
if(location==cpu){
|
||||
#endif
|
||||
cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
|
||||
cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
|
||||
A.nrows(), A.ncols(), alpha, A, A.ncols(), (double *)x.v, 2, beta, (double *)v, 2);
|
||||
cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
|
||||
cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
|
||||
A.nrows(), A.ncols(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
|
||||
cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
|
||||
TEST_CUBLAS("cublasDgemv");
|
||||
|
||||
cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
|
||||
cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
|
||||
TEST_CUBLAS("cublasDgemv");
|
||||
}
|
||||
#endif
|
||||
@ -611,14 +608,14 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
|
||||
const NRMat<complex<double> > &A, const char trans,
|
||||
const complex<double> alpha, const NRVec<complex<double> > &x) {
|
||||
#ifdef DEBUG
|
||||
if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
|
||||
if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
|
||||
#endif
|
||||
SAME_LOC3(*this, x, A);
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location == cpu){
|
||||
#endif
|
||||
cblas_zgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
|
||||
cblas_zgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
|
||||
A.nrows(), A.ncols(), &alpha, A, A.ncols(), x.v, 1, &beta, v, 1);
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
@ -626,7 +623,7 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
|
||||
const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
|
||||
const cuDoubleComplex _beta = make_cuDoubleComplex(beta.real(), beta.imag());
|
||||
|
||||
cublasZgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(),
|
||||
cublasZgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(),
|
||||
_alpha, (cuDoubleComplex*)(A[0]), A.ncols(), (cuDoubleComplex*)(x.v), 1, _beta, (cuDoubleComplex*)v, 1);
|
||||
TEST_CUBLAS("cublasZgemv");
|
||||
}
|
||||
|
10
vec.h
10
vec.h
@ -149,10 +149,10 @@ public:
|
||||
#endif
|
||||
|
||||
//! create separate copy of the data corresponding to this vector
|
||||
void copyonwrite();
|
||||
void copyonwrite(bool detachonly=false);
|
||||
|
||||
//! purge this vector
|
||||
void clear() { copyonwrite(); LA_traits<T>::clear(v, nn); };
|
||||
void clear() { copyonwrite(true); LA_traits<T>::clear(v, nn); };
|
||||
|
||||
//! assignment operator assigns given vector
|
||||
NRVec& operator=(const NRVec &rhs);
|
||||
@ -806,7 +806,7 @@ NRVec<T>::~NRVec() {
|
||||
* make own copy of the underlying data connected with this vector
|
||||
******************************************************************************/
|
||||
template <typename T>
|
||||
void NRVec<T>::copyonwrite() {
|
||||
void NRVec<T>::copyonwrite(bool detachonly) {
|
||||
if(!count) laerror("copyonwrite of an undefined vector");
|
||||
if(*count > 1) {
|
||||
(*count)--;
|
||||
@ -817,12 +817,12 @@ void NRVec<T>::copyonwrite() {
|
||||
if(location == cpu){
|
||||
#endif
|
||||
newv = new T[nn];
|
||||
memcpy(newv, v, nn*sizeof(T));
|
||||
if(!detachonly) memcpy(newv, v, nn*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
}else{
|
||||
newv = (T *) gpualloc(nn*sizeof(T));
|
||||
if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRVec<T>::copyonwrite()");
|
||||
cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
|
||||
if(!detachonly) cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
|
||||
TEST_CUBLAS("cublasScopy");//"NRVec<T>::copyonwrite()"
|
||||
}
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user