*** empty log message ***

This commit is contained in:
jiri 2010-06-25 15:28:19 +00:00
parent eb0aaf9adf
commit 074c943862
13 changed files with 1938 additions and 464 deletions

View File

@ -1,2 +1,70 @@
#include "la_traits.h"
#include "cuda_la.h"
#ifdef CUDALA
namespace LA {
GPUID DEFAULT_LOC = cpu;
void set_default_loc(const GPUID loc)
{
DEFAULT_LOC = loc;
}
void *gpualloc(size_t size)
{
cublasStatus status;
void *ptr=NULL;
status = cublasAlloc(size,1,&ptr);
if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
return ptr;
}
void gpufree(void *ptr)
{
cublasStatus status = cublasFree(ptr);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasFree");
}
void gpuget(size_t n, size_t elsize, const void *from, void *to)
{
cublasStatus status;
status=cublasGetVector(n,elsize,from,1,to,1);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasGetVector");
}
void gpuput(size_t n, size_t elsize, const void *from, void *to)
{
cublasStatus status;
status=cublasSetVector(n,elsize,from,1,to,1);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
}
double *gpuputdouble(const double &x)
{
cublasStatus status;
void *ptr=NULL;
status = cublasAlloc(1,sizeof(double),&ptr);
if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
status=cublasSetVector(1,sizeof(double),&x,1,ptr,1);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
return (double *)ptr;
}
complex<double> *gpuputcomplex(const complex<double> &x)
{
cublasStatus status;
void *ptr=NULL;
status = cublasAlloc(1,sizeof(complex<double>),&ptr);
if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
status=cublasSetVector(1,sizeof(complex<double>),&x,1,ptr,1);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
return (complex<double> *)ptr;
}
}
#endif

56
cuda_la.h Normal file
View File

@ -0,0 +1,56 @@
#ifndef _CUDA_LA_H
#define _CUDA_LA_H
#ifdef CUDALA
#undef MATPTR
#include "cublas.h"
#endif
#include "la_traits.h"
namespace LA {
#ifdef CUDALA
#define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
#define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
#define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
#else
#define NOT_GPU(x) {}
#define SAME_LOC(x,y) {}
#define SAME_LOC3(x,y,z) {}
#endif
typedef enum {undefined=-1, cpu=0, gpu1=1, gpu2=2, gpu3=3, gpu4=4} GPUID;
#ifdef CUDALA
//global static instantiation of this class will provide automatic init/shutdown of GPU
class GPU_START {
public:
GPU_START(void)
{
cublasStatus status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot init GPU for CUBLAS");
}
~GPU_START(void)
{
cublasStatus status = cublasShutdown();
if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot cleanly shutdown GPU");
}
};
extern void *gpualloc(size_t size);
extern void gpufree(void *ptr);
extern void gpuget(size_t n, size_t elsize, const void *from, void *to);
extern void gpuput(size_t n, size_t elsize, const void *from, void *to);
extern double *gpuputdouble(const double &x);
extern complex<double> *gpuputcomplex(const complex<double> &x);
void set_default_loc(const GPUID loc);
extern GPUID DEFAULT_LOC;
#endif
}
#endif

View File

@ -40,6 +40,8 @@
#include "laerror.h"
#include "cuda_la.h"
#ifdef NONCBLAS
#include "noncblas.h"
#else
@ -205,6 +207,8 @@ struct LA_traits_aux<complex<C>, scalar_true> {
typedef complex<C> elementtype;
typedef complex<C> producttype;
typedef C normtype;
typedef C realtype;
typedef complex<C> complextype;
static inline C sqrabs(const complex<C> x) { return x.real()*x.real()+x.imag()*x.imag();}
static inline bool gencmp(const complex<C> *x, const complex<C> *y, int n) {return memcmp(x,y,n*sizeof(complex<C>));}
static bool bigger(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
@ -230,6 +234,8 @@ struct LA_traits_aux<C, scalar_true> {
typedef C elementtype;
typedef C producttype;
typedef C normtype;
typedef C realtype;
typedef complex<C> complextype;
static inline C sqrabs(const C x) { return x*x;}
static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));}
static inline bool bigger(const C &x, const C &y) {return x>y;}
@ -261,6 +267,8 @@ struct LA_traits_aux<X<C>, scalar_false> { \
typedef C elementtype; \
typedef X<C> producttype; \
typedef typename LA_traits<C>::normtype normtype; \
typedef X<typename LA_traits<C>::realtype> realtype; \
typedef X<typename LA_traits<C>::complextype> complextype; \
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
static inline bool bigger(const C &x, const C &y) {return x>y;} \
static inline bool smaller(const C &x, const C &y) {return x<y;} \
@ -293,6 +301,8 @@ struct LA_traits_aux<X<C>, scalar_false> { \
typedef C elementtype; \
typedef NRMat<C> producttype; \
typedef typename LA_traits<C>::normtype normtype; \
typedef X<typename LA_traits<C>::realtype> realtype; \
typedef X<typename LA_traits<C>::complextype> complextype; \
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
static inline bool bigger(const C &x, const C &y) {return x>y;} \
static inline bool smaller(const C &x, const C &y) {return x<y;} \

View File

@ -25,6 +25,7 @@
#include <errno.h>
#include <stdarg.h>
#include "cuda_la.h"
#ifdef USE_TRACEBACK
#include "traceback.h"
@ -32,6 +33,11 @@
namespace LA {
//enforce GPU initialization by a global class instantization constructor
#ifdef CUDALA
GPU_START gpu_start_instant;
#endif
bool _LA_count_check=true;
extern "C" void _findme(void) {}; //for autoconf test we need a function with C linkage
@ -45,6 +51,13 @@ void laerror(const char *s1)
std::cerr << s1 << "\n";
std::cout << s1 << "\n";
}
#ifdef CUDALA
{
cublasStatus s=cublasGetError();
std::cerr << "CUBLAS status = "<<s<<std::endl;
std::cout << "CUBLAS status = "<<s<<std::endl;
}
#endif
if(errno) perror("system error");
throw LAerror(s1);

322
mat.cc
View File

@ -122,6 +122,15 @@ return r;
template <typename T>
void NRMat<T>::put(int fd, bool dim, bool transp) const
{
#ifdef CUDALA
if(location!=cpu)
{
NRMat<T> tmp= *this;
tmp.moveto(cpu);
tmp.put(fd,dim,transp);
return;
}
#endif
errno=0;
if(dim)
{
@ -153,6 +162,17 @@ else LA_traits<T>::multiput(nn*mm,fd,
template <typename T>
void NRMat<T>::get(int fd, bool dim, bool transp)
{
#ifdef CUDALA
if(location!=cpu)
{
NRMat<T> tmp;
tmp.moveto(cpu);
tmp.get(fd,dim,transp);
tmp.moveto(location);
*this = tmp;
return;
}
#endif
int nn0,mm0;
errno=0;
if(dim)
@ -188,6 +208,43 @@ else LA_traits<T>::multiget(nn*mm,fd,
// Assign diagonal
template <>
NRMat<double> & NRMat<double>::operator=(const double &a)
{
copyonwrite();
#ifdef DEBUG
if (nn != mm) laerror("RMat.operator=scalar on non-square matrix");
#endif
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
memset(v[0],0,nn*nn*sizeof(double));
for (int i=0; i< nn; i++) v[i][i] = a;
#else
double n=0.;
cblas_dcopy(nn*nn, &n, 0, v, 1);
cblas_dcopy(nn, &a, 0, v, nn+1);
#endif
#ifdef CUDALA
}
else
{
double *d=gpuputdouble(0.);
cublasDcopy(nn*nn, d, 0, v, 1);
gpufree(d);
d=gpuputdouble(a);
cublasDcopy(nn, d, 0, v, nn+1);
gpufree(d);
}
#endif
return *this;
}
template <typename T>
NRMat<T> & NRMat<T>::operator=(const T &a)
{
@ -206,6 +263,64 @@ NRMat<T> & NRMat<T>::operator=(const T &a)
}
template <>
NRMat<double> & NRMat<double>::operator+=(const double&a)
{
copyonwrite();
#ifdef DEBUG
if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix");
#endif
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
for (int i=0; i< nn; i++) v[i][i] += a;
#else
cblas_daxpy(nn, 1.0, &a, 0, *this, nn+1);
#endif
#ifdef CUDALA
}
else
{
double *d=gpuputdouble(a);
cublasDaxpy(nn, 1.0, d, 0, *this, nn+1);
gpufree(d);
}
#endif
return *this;
}
template <>
NRMat<double> & NRMat<double>::operator-=(const double&a)
{
copyonwrite();
#ifdef DEBUG
if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix");
#endif
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
for (int i=0; i< nn; i++) v[i][i] -= a;
#else
cblas_daxpy(nn, -1.0, &a, 0, *this, nn+1);
#endif
#ifdef CUDALA
}
else
{
double *d=gpuputdouble(a);
cublasDaxpy(nn, -1.0, d, 0, *this, nn+1);
gpufree(d);
}
#endif
return *this;
}
// M += a
@ -240,6 +355,31 @@ NRMat<T> & NRMat<T>::operator-=(const T &a)
return *this;
}
template <>
const NRMat<double> NRMat<double>::operator-() const
{
NRMat<double> result(nn, mm);
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
for (int i=0; i<nn*mm; i++) result.v[0][i]= -v[0][i];
#else
cblas_dscal(nn*mm, -1., v, 1);
#endif
#ifdef CUDALA
}
else
{
cublasDscal(nn*mm, -1., v, 1);
}
#endif
return result;
}
// unary minus
template <typename T>
const NRMat<T> NRMat<T>::operator-() const
@ -253,6 +393,7 @@ const NRMat<T> NRMat<T>::operator-() const
return result;
}
// direct sum
template <typename T>
const NRMat<T> NRMat<T>::operator&(const NRMat<T> & b) const
@ -540,7 +681,13 @@ template<>
NRMat<double> & NRMat<double>::operator*=(const double &a)
{
copyonwrite();
cblas_dscal(nn*mm, a, *this, 1);
#ifdef CUDALA
if(location==cpu)
#endif
cblas_dscal(nn*mm, a, *this, 1);
#ifdef CUDALA
else cublasDscal(nn*mm, a, v, 1);
#endif
return *this;
}
@ -559,6 +706,7 @@ NRMat< complex<double> >::operator*=(const complex<double> &a)
template <typename T>
NRMat<T> & NRMat<T>::operator*=(const T &a)
{
NOT_GPU(*this);
copyonwrite();
#ifdef MATPTR
for (int i=0; i< nn*mm; i++) v[0][i] *= a;
@ -578,8 +726,16 @@ NRMat<double> & NRMat<double>::operator+=(const NRMat<double> &rhs)
if (nn != rhs.nn || mm!= rhs.mm)
laerror("Mat += Mat of incompatible matrices");
#endif
SAME_LOC(*this,rhs);
copyonwrite();
#ifdef CUDALA
if(location==cpu)
#endif
cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1);
#ifdef CUDALA
else
cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1);
#endif
return *this;
}
@ -625,8 +781,16 @@ NRMat<double> & NRMat<double>::operator-=(const NRMat<double> &rhs)
if (nn != rhs.nn || mm!= rhs.mm)
laerror("Mat -= Mat of incompatible matrices");
#endif
SAME_LOC(*this,rhs);
copyonwrite();
#ifdef CUDALA
if(location==cpu)
#endif
cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1);
#ifdef CUDALA
else
cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1);
#endif
return *this;
}
@ -836,9 +1000,18 @@ const NRMat<double> NRMat<double>::operator*(const NRMat<double> &rhs) const
if (mm != rhs.nn) laerror("product of incompatible matrices");
if (rhs.mm <=0) laerror("illegal matrix dimension in gemm");
#endif
NRMat<double> result(nn, rhs.mm);
SAME_LOC(*this,rhs);
NRMat<double> result(nn, rhs.mm,rhs.getlocation());
#ifdef CUDALA
if(location==cpu)
#endif
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, 1.0,
*this, mm, rhs, rhs.mm, 0.0, result, rhs.mm);
#ifdef CUDALA
else
cublasDgemm('N','N',rhs.mm,nn,mm,1.0,rhs, rhs.mm,*this, mm, 0.0, result, rhs.mm);
#endif
return result;
}
@ -991,12 +1164,21 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in Mat:gemm()");
if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm");
#endif
SAME_LOC3(*this,a,b);
if (alpha==0.0 && beta==1.0) return;
copyonwrite();
#ifdef CUDALA
if(location==cpu)
#endif
cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans),
(transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
a.mm, b , b.mm, beta, *this , mm);
#ifdef CUDALA
else
cublasDgemm(transb,transa,mm,nn,k,alpha, b , b.mm, a,a.mm, beta, *this , mm);
#endif
}
@ -1028,7 +1210,20 @@ void NRMat< complex<double> >::gemm(const complex<double> & beta,
template<>
const double NRMat<double>::norm(const double scalar) const
{
if (!scalar) return cblas_dnrm2(nn*mm, (*this)[0], 1);
if (!scalar)
{
#ifdef CUDALA
if(location==cpu)
#endif
return cblas_dnrm2(nn*mm, (*this)[0], 1);
#ifdef CUDALA
else
return cublasDnrm2(nn*mm, v, 1);
#endif
}
NOT_GPU(*this);
double sum = 0;
for (int i=0; i<nn; i++)
for (int j=0; j<mm; j++) {
@ -1246,6 +1441,127 @@ else //vectors are columns
//------------------------------------------------------------------------------
// for a matrix A(1:nn,1:mm) performs Fortran-like
// operation A(nn:-1:1,:)
//------------------------------------------------------------------------------
template<>
NRMat<double>& NRMat<double>::SwapRows(){
copyonwrite();
const int n_pul = this->nn / 2;
double * const dataIn = this->v;
for(register int i=0; i<n_pul; i++){
cblas_dswap(mm, dataIn + i*mm, 1, dataIn + (nn-i-1)*mm, 1);
}
return *this;
}
//------------------------------------------------------------------------------
template<>
NRMat<complex<double> >& NRMat<complex<double> >::SwapRows(){
copyonwrite();
const int n = this->nn;
const int m = this->mm;
const int n_pul = this->nn / 2;
complex<double> * const dataIn = this->v;
for(register int i=0; i<n_pul; i++){
cblas_zswap(m, dataIn + i*m, 1, dataIn + (n-i-1)*m, 1);
}
return *this;
}
//------------------------------------------------------------------------------
template<typename T>
NRMat<T>& NRMat<T>::SwapRows(){
copyonwrite();
const int n = this->nn;
const int m = this->mm;
const int n_pul = this->nn / 2;
T * const dataIn = this->v;
for(register int i=0; i<n_pul; i++){
const int offset1 = i*m;
const int offset2 = (n-i-1)*m;
for(register int j=0;j<m;j++){
dataIn[offset1 + j] = dataIn[offset2 + j];
}
}
return *this;
}
//------------------------------------------------------------------------------
// for a matrix A(1:nn,1:mm) performs Fortran-like
// operation A(:,mm:-1:1)
//------------------------------------------------------------------------------
template<>
NRMat<double>& NRMat<double>::SwapCols(){
copyonwrite();
const int n = this->nn;
const int m = this->mm;
const int m_pul = m / 2;
double * const dataIn = this->v;
for(register int i=0; i<m_pul; i++){
cblas_dswap(n, dataIn + i, m, dataIn + (m-i-1), m);
}
return *this;
}
//------------------------------------------------------------------------------
template<>
NRMat<complex<double> >& NRMat<complex<double> >::SwapCols(){
copyonwrite();
const int n_pul = this->nn / 2;
const int m_pul = this->mm / 2;
complex<double> * const dataIn = this->v;
for(register int i=0; i<m_pul; i++){
cblas_zswap(nn, dataIn + i, mm, dataIn + (mm-i-1), mm);
}
return *this;
}
//------------------------------------------------------------------------------
template<typename T>
NRMat<T>& NRMat<T>::SwapCols(){
copyonwrite();
const int n_pul = nn / 2;
const int m_pul = mm / 2;
T * const dataIn = this->v;
for(register int i=0; i<m_pul; i++){
for(register int j=0;j<nn;j++){
const int jm = j*mm;
dataIn[i + jm] = dataIn[(mm-i-1) + jm];
}
}
return *this;
}
//------------------------------------------------------------------------------
// for a matrix A(1:nn,1:mm) performs Fortran-like
// operation A(nn:-1:1,mm:-1:1)
//------------------------------------------------------------------------------
template<typename T>
NRMat<T>& NRMat<T>::SwapRowsCols(){
this->copyonwrite();
const int n = this->nn;
const int m = this->mm;
T * const dataIn = this->v;
T * const dataOut = this->v;
const int Dim = n*m;
for(register int i=0;i<n;i++){
const int off = i*n;
for(register int j=0;j<m;j++){
const int offset = off + j;
dataOut[Dim - (offset + 1)] = dataIn[offset];
}
}
return *this;
}
//------------------------------------------------------------------------------
//////////////////////////////////////////////////////////////////////////////

276
mat.h
View File

@ -33,12 +33,20 @@ protected:
T *v;
#endif
int *count;
#ifdef CUDALA
GPUID location;
#endif
public:
friend class NRVec<T>;
friend class NRSMat<T>;
inline NRMat() : nn(0), mm(0), v(0), count(0) {};
inline NRMat(const int n, const int m);
inline NRMat() : nn(0), mm(0), v(0), count(0)
{
#ifdef CUDALA
location = DEFAULT_LOC;
#endif
};
inline NRMat(const int n, const int m ,const GPUID loc= undefined);
inline NRMat(const T &a, const int n, const int m);
NRMat(const T *a, const int n, const int m);
inline NRMat(const NRMat &rhs);
@ -57,6 +65,13 @@ public:
#endif
const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
inline int getcount() const {return count?*count:0;}
#ifdef CUDALA
inline GPUID getlocation() const {return location;}
void moveto(const GPUID dest);
#else
inline GPUID getlocation() const {return cpu;}
void moveto(const GPUID dest) {};
#endif
NRMat & operator=(const NRMat &rhs); //assignment
void randomize(const typename LA_traits<T>::normtype &x); //fill with random numbers
NRMat & operator=(const T &a); //assign a to diagonal
@ -88,7 +103,7 @@ public:
const NRMat operator*(const NRSMat<T> &rhs) const; // Mat * Smat
const NRMat operator&(const NRMat &rhs) const; // direct sum
const NRMat operator|(const NRMat<T> &rhs) const; // direct product
const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
const NRVec<T> rsum() const; //sum of rows
const NRVec<T> csum() const; //sum of columns
@ -157,9 +172,14 @@ public:
namespace LA {
// ctors
template <typename T>
NRMat<T>::NRMat(const int n, const int m) : nn(n), mm(m), count(new int)
NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int)
{
*count = 1;
#ifdef CUDALA
location= (loc==undefined?DEFAULT_LOC:loc);
if(location==cpu)
{
#endif
#ifdef MATPTR
v = new T*[n];
v[0] = new T[m*n];
@ -167,14 +187,29 @@ NRMat<T>::NRMat(const int n, const int m) : nn(n), mm(m), count(new int)
#else
v = new T[m*n];
#endif
#ifdef CUDALA
}
else
{
v= (T*) gpualloc(n*m*sizeof(T));
}
#endif
}
template <typename T>
NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int)
{
#ifdef CUDALA
location=DEFAULT_LOC;
#endif
int i;
T *p;
*count = 1;
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
v = new T*[n];
p = v[0] = new T[m*n];
@ -186,12 +221,29 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
for (i=0; i< n*m; i++) *p++ = a;
else
memset(p, 0, n*m*sizeof(T));
#ifdef CUDALA
}
else
{
v= (T*) gpualloc(n*m*sizeof(T));
cublasSetVector(n*m,sizeof(T),&a,0,v,1);
}
#endif
}
template <typename T>
NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int)
{
#ifdef CUDALA
location=DEFAULT_LOC;
#endif
*count = 1;
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
v = new T*[n];
v[0] = new T[m*n];
@ -201,11 +253,25 @@ NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new
v = new T[m*n];
memcpy(v, a, n*m*sizeof(T));
#endif
#ifdef CUDALA
}
else
{
v= (T*) gpualloc(n*m*sizeof(T));
cublasSetVector(n*m,sizeof(T),a,1,v,1);
}
#endif
}
//copy constructor
template <typename T>
NRMat<T>::NRMat(const NRMat &rhs)
{
#ifdef CUDALA
location=rhs.location;
#endif
nn = rhs.nn;
mm = rhs.mm;
count = rhs.count;
@ -213,9 +279,16 @@ NRMat<T>::NRMat(const NRMat &rhs)
if (count) ++(*count);
}
template <typename T>
NRMat<T>::NRMat(const NRSMat<T> &rhs)
{
NOT_GPU(rhs);
#ifdef CUDALA
location=rhs.location;
#endif
int i;
nn = mm = rhs.nrows();
count = new int;
@ -244,6 +317,10 @@ NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
{
if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
#ifdef CUDALA
location=rhs.location;
#endif
nn = n;
mm = m;
count = rhs.count;
@ -303,6 +380,7 @@ inline T & NRMat<T>::operator()(const int i, const int j)
if (_LA_count_check && *count != 1) laerror("Mat lval use of (,) with count > 1");
if (i<0 || i>=nn &&nn>0 || j<0 || j>=mm && mm>0) laerror("Mat (,) out of range");
if (!v) laerror("(,) for unallocated Mat");
NOT_GPU(*this);
#endif
#ifdef MATPTR
return v[i][j];
@ -310,12 +388,14 @@ inline T & NRMat<T>::operator()(const int i, const int j)
return v[i*mm+j];
#endif
}
template <typename T>
inline const T & NRMat<T>::operator()(const int i, const int j) const
{
#ifdef DEBUG
if (i<0 || i>=nn&&nn>0 || j<0 || j>=mm&& mm>0) laerror("Mat (,) out of range");
if (!v) laerror("(,) for unallocated Mat");
NOT_GPU(*this); //in principle we could copy the element to CPU memory, yielding, however, a highly inneficient contruct
#endif
#ifdef MATPTR
return v[i][j];
@ -391,7 +471,7 @@ inline const complex<double> NRMat< complex<double> >::amax() const
}
//basi stuff to be available for any type ... must be in .h
//basic stuff to be available for any type ... must be in .h
// dtor
template <typename T>
NRMat<T>::~NRMat()
@ -399,10 +479,21 @@ NRMat<T>::~NRMat()
if (!count) return;
if (--(*count) <= 0) {
if (v) {
#ifdef CUDALA
if(location==cpu)
#endif
{
#ifdef MATPTR
delete[] (v[0]);
#endif
delete[] v;
}
#ifdef CUDALA
else
{
gpufree(v);
}
#endif
}
delete count;
}
@ -415,14 +506,27 @@ NRMat<T> & NRMat<T>::operator=(const NRMat<T> &rhs)
if (this !=&rhs)
{
if (count)
if (--(*count) ==0 ) {
if (--(*count) ==0 )
{
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
delete[] (v[0]);
#endif
delete[] v;
#ifdef CUDALA
}
else gpufree(v);
#endif
delete count;
}
v = rhs.v;
#ifdef CUDALA
location=rhs.location;
#endif
nn = rhs.nn;
mm = rhs.mm;
count = rhs.count;
@ -437,46 +541,8 @@ template <typename T>
NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs)
{
if (this == &rhs) return *this;
#ifdef DEBUG
if (!rhs.v) laerror("unallocated rhs in Mat operator |=");
#endif
if (count)
if (*count > 1) {
--(*count);
nn = 0;
mm = 0;
count = 0;
v = 0;
}
if (nn != rhs.nn || mm != rhs.mm) {
if (v) {
#ifdef MATPTR
delete[] (v[0]);
#endif
delete[] (v);
v = 0;
}
nn = rhs.nn;
mm = rhs.mm;
}
if (!v) {
#ifdef MATPTR
v = new T*[nn];
v[0] = new T[mm*nn];
#else
v = new T[mm*nn];
#endif
}
#ifdef MATPTR
for (int i=1; i< nn; i++) v[i] = v[i-1] + mm;
memcpy(v[0], rhs.v[0], nn*mm*sizeof(T));
#else
memcpy(v, rhs.v, nn*mm*sizeof(T));
#endif
if (!count) count = new int;
*count = 1;
*this = rhs;
this->copyonwrite();
return *this;
}
@ -486,9 +552,13 @@ void NRMat<T>::copyonwrite()
{
if (!count) laerror("Mat::copyonwrite of undefined matrix");
if (*count > 1) {
(*count)--;
count = new int;
*count = 1;
(*count)--;
count = new int;
*count = 1;
#ifdef CUDALA
if(location==cpu) //matrix is in CPU memory
{
#endif
#ifdef MATPTR
T **newv = new T*[nn];
newv[0] = new T[mm*nn];
@ -499,10 +569,21 @@ void NRMat<T>::copyonwrite()
T *newv = new T[mm*nn];
memcpy(newv, v, mm*nn*sizeof(T));
v = newv;
#endif
#ifdef CUDALA
}
else //matrix is in GPU memory
{
T *newv = (T *) gpualloc(mm*nn*sizeof(T));
if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
cublasScopy(nn*mm*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
v = newv;
}
#endif
}
}
template <typename T>
void NRMat<T>::resize(int n, int m)
{
@ -519,10 +600,18 @@ if(m==0) n=0;
if(n==0 && m==0)
{
if(--(*count) <= 0) {
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
if(v) delete[] (v[0]);
#endif
if(v) delete[] v;
#ifdef CUDALA
}
else gpufree(v);
#endif
delete count;
}
count=0;
@ -543,6 +632,10 @@ if(m==0) n=0;
*count = 1;
nn = n;
mm = m;
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
v = new T*[nn];
v[0] = new T[m*n];
@ -550,12 +643,22 @@ if(m==0) n=0;
#else
v = new T[m*n];
#endif
#ifdef CUDALA
}
else
v = (T *) gpualloc(n*m*sizeof(T));
#endif
return;
}
// At this point *count = 1, check if resize is necessary
if (n!=nn || m!=mm) {
nn = n;
mm = m;
#ifdef CUDALA
if(location==cpu)
{
#endif
#ifdef MATPTR
delete[] (v[0]);
#endif
@ -566,6 +669,14 @@ if(m==0) n=0;
for (int i=1; i< n; i++) v[i] = v[i-1] + m;
#else
v = new T[m*n];
#endif
#ifdef CUDALA
}
else
{
gpufree(v);
v=(T *) gpualloc(n*m*sizeof(T));
}
#endif
}
}
@ -587,7 +698,11 @@ return r;
// I/O
template <typename T>
std::ostream& operator<<(std::ostream &s, const NRMat<T> &x)
{
{
#ifdef CUDALA
if(x.getlocation()==cpu)
{
#endif
int i,j,n,m;
n=x.nrows();
m=x.ncols();
@ -597,18 +712,43 @@ std::ostream& operator<<(std::ostream &s, const NRMat<T> &x)
for(j=0; j<m;j++) s << (typename LA_traits_io<T>::IOtype) x[i][j] << (j==m-1 ? '\n' : ' '); // endl cannot be used in the conditional expression, since it is an overloaded function
}
return s;
}
#ifdef CUDALA
}
else
{
NRMat<T> tmp=x;
tmp.moveto(cpu);
return s<<tmp;
}
#endif
}
template <typename T>
std::istream& operator>>(std::istream &s, NRMat<T> &x)
{
#ifdef CUDALA
if(x.getlocation()==cpu)
{
#endif
int i,j,n,m;
s >> n >> m;
x.resize(n,m);
typename LA_traits_io<T>::IOtype tmp;
for(i=0;i<n;i++) for(j=0; j<m;j++) { s>>tmp; x[i][j]=tmp;}
return s;
}
#ifdef CUDALA
}
else
{
NRMat<T> tmp;
tmp.moveto(cpu);
s >> tmp;
tmp.moveto(x.getlocation());
x=tmp;
return s;
}
#endif
}
//optional indexing from 1
@ -671,6 +811,38 @@ NRMat<T> & NRMat<T>::operator^=(const NRMat<T> &rhs){
}
#ifdef CUDALA
template<typename T>
void NRMat<T>::moveto(const GPUID dest)
{
if(location==dest) return;
location=dest;
if(v && !count) laerror("internal inconsistency of reference counting 1");
if (!count) return;
if(v && *count==0) laerror("internal inconsistency of reference counting 2");
if(!v) return;
T *vold = v;
if(dest == cpu) //moving from GPU to CPU
{
v = new T[nn*mm];
gpuget(nn*mm,sizeof(T),vold,v);
if(*count == 1) gpufree(vold);
else {--(*count); count = new int(1);}
}
else //moving from CPU to GPU
{
v=(T *) gpualloc(nn*mm*sizeof(T));
gpuput(nn*mm,sizeof(T),vold,v);
if(*count == 1) delete[] vold;
else {--(*count); count = new int(1);}
}
}
#endif
//end CUDALA

View File

@ -20,106 +20,186 @@
#include "noncblas.h"
#include "laerror.h"
#include "mat.h"
#include "fortran.h"
#ifdef FORTRAN_
#define FORNAME(x) x##_
#else
#define FORNAME(x) x
#endif
#ifdef NONCBLAS
//Level 1 - straightforward wrappers
extern "C" double FORNAME(ddot) (const int *n, const double *x, const int *incx, const double *y, const int *incy);
extern "C" double FORNAME(ddot) (const FINT *n, const double *x, const FINT *incx, const double *y, const FINT *incy);
double cblas_ddot(const int N, const double *X, const int incX,
const double *Y, const int incY)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp);
#else
return FORNAME(ddot)(&N,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(dscal) (const int *n, const double *a, double *x, const int *incx);
extern "C" void FORNAME(dscal) (const FINT *n, const double *a, double *x, const FINT *incx);
void cblas_dscal(const int N, const double alpha, double *X, const int incX)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp);
#else
FORNAME(dscal) (&N,&alpha,X,&incX);
#endif
}
extern "C" void FORNAME(dcopy) (const int *n, const double *x, const int *incx, double *y, const int *incy);
extern "C" void FORNAME(dcopy) (const FINT *n, const double *x, const FINT *incx, double *y, const FINT *incy);
void cblas_dcopy(const int N, const double *X, const int incX,
double *Y, const int incY)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
#else
FORNAME(dcopy) (&N,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(daxpy) (const int *n, const double *a, const double *x, const int *incx, double *y, const int *incy);
extern "C" void FORNAME(daxpy) (const FINT *n, const double *a, const double *x, const FINT *incx, double *y, const FINT *incy);
void cblas_daxpy(const int N, const double alpha, const double *X,
const int incX, double *Y, const int incY)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp);
#else
FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY);
#endif
}
extern "C" double FORNAME(dnrm2) (const int *n, const double *x, const int *incx);
extern "C" double FORNAME(dnrm2) (const FINT *n, const double *x, const FINT *incx);
double cblas_dnrm2(const int N, const double *X, const int incX)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return FORNAME(dnrm2) (&ntmp,X,&incxtmp);
#else
return FORNAME(dnrm2) (&N,X,&incX);
#endif
}
extern "C" double FORNAME(dasum) (const int *n, const double *x, const int *incx);
extern "C" double FORNAME(dasum) (const FINT *n, const double *x, const FINT *incx);
double cblas_dasum(const int N, const double *X, const int incX)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return FORNAME(dasum) (&ntmp,X,&incxtmp);
#else
return FORNAME(dasum) (&N,X,&incX);
#endif
}
extern "C" void FORNAME(zcopy) (const int *n, const void *x, const int *incx, void *y, const int *incy);
extern "C" void FORNAME(zcopy) (const FINT *n, const void *x, const FINT *incx, void *y, const FINT *incy);
void cblas_zcopy(const int N, const void *X, const int incX,
void *Y, const int incY)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
#else
FORNAME(zcopy) (&N,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(zaxpy) (const int *n, const void *a, const void *x, const int *incx, void *y, const int *incy);
extern "C" void FORNAME(zaxpy) (const FINT *n, const void *a, const void *x, const FINT *incx, void *y, const FINT *incy);
void cblas_zaxpy(const int N, const void *alpha, const void *X,
const int incX, void *Y, const int incY)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp);
#else
FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(zscal) (const int *n, const void *a, void *x, const int *incx);
extern "C" void FORNAME(zscal) (const FINT *n, const void *a, void *x, const FINT *incx);
void cblas_zscal(const int N, const void *alpha, void *X, const int incX)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
FORNAME(zscal)(&ntmp,alpha,X,&incxtmp);
#else
FORNAME(zscal)(&N,alpha,X,&incX);
#endif
}
extern "C" void FORNAME(zdscal) (const int *n, const double *a, void *x, const int *incx);
extern "C" void FORNAME(zdscal) (const FINT *n, const double *a, void *x, const FINT *incx);
void cblas_zdscal(const int N, const double alpha, void *X, const int incX)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp);
#else
FORNAME(zdscal)(&N,&alpha,X,&incX);
#endif
}
extern "C" double FORNAME(dznrm2) (const int *n, const void *x, const int *incx);
extern "C" double FORNAME(dznrm2) (const FINT *n, const void *x, const FINT *incx);
double cblas_dznrm2(const int N, const void *X, const int incX)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return FORNAME(dznrm2) (&ntmp,X,&incxtmp);
#else
return FORNAME(dznrm2) (&N,X,&incX);
#endif
}
//the following ones are f2c-compatible, but is it truly portable???
extern "C" void FORNAME(zdotu) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy);
extern "C" void FORNAME(zdotu) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
void cblas_zdotu_sub(const int N, const void *X, const int incX,
const void *Y, const int incY, void *dotu)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp);
#else
FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(zdotc) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy);
extern "C" void FORNAME(zdotc) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
void cblas_zdotc_sub(const int N, const void *X, const int incX,
const void *Y, const int incY, void *dotc)
{
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp);
#else
FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
#endif
}
@ -129,7 +209,7 @@ FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
//enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, AtlasConj=114};
extern "C" void FORNAME(dspmv) (const char *uplo, const int *n, const double *alpha, const double *ap, const double *x, const int *incx, const double *beta, double *y, const int *incy);
extern "C" void FORNAME(dspmv) (const char *uplo, const FINT *n, const double *alpha, const double *ap, const double *x, const FINT *incx, const double *beta, double *y, const FINT *incy);
void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
const int N, const double alpha, const double *Ap,
const double *X, const int incX,
@ -137,11 +217,18 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
#else
FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
#endif
}
extern "C" void FORNAME(zhpmv) (const char *uplo, const int *n, const void *alpha, const void *ap, const void *x, const int *incx, const void *beta, void *y, const int *incy);
extern "C" void FORNAME(zhpmv) (const char *uplo, const FINT *n, const void *alpha, const void *ap, const void *x, const FINT *incx, const void *beta, void *y, const FINT *incy);
void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
const int N, const void *alpha, const void *Ap,
const void *X, const int incX,
@ -149,20 +236,36 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
#else
FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
#endif
}
//Level 2 and Level 3 on general matrices - take into account the transposed storage of matrices in Fortran and C
extern "C" void FORNAME(dger) (const int *m, const int *n, const double *alpha, const double *x, const int *incx, const double *y, const int *incy, double *a, const int *lda);
extern "C" void FORNAME(dger) (const FINT *m, const FINT *n, const double *alpha, const double *x, const FINT *incx, const double *y, const FINT *incy, double *a, const FINT *lda);
void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
const double alpha, const double *X, const int incX,
const double *Y, const int incY, double *A, const int lda)
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
//swap m-n, y-x
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
const FINT ldatmp=lda;
FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp);
#else
FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
#endif
}
void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
@ -179,7 +282,7 @@ void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
LA::laerror("cblas_zgeru cannot be simply converted to fortran order");
}
extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc);
extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const double *alpha, const double *a, const FINT *lda, const double *b, const FINT *ldb, const double *beta, double *c, const FINT *ldc);
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
const int K, const double alpha, const double *A,
@ -188,11 +291,22 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
//swap a-b, m-n
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ktmp=K;
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
const FINT ldctmp=ldc;
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
&ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
#else
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
&N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
#endif
}
extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const void *alpha, const void *a, const int *lda, const void *b, const int *ldb, const void *beta, void *c, const int *ldc);
extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const void *alpha, const void *a, const FINT *lda, const void *b, const FINT *ldb, const void *beta, void *c, const FINT *ldc);
void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
const int K, const void *alpha, const void *A,
@ -201,26 +315,49 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
//swap a-b, m-n
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ktmp=K;
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
const FINT ldctmp=ldc;
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
&ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
#else
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
&N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
#endif
}
extern "C" void FORNAME(dgemv) (const char *TRANS, const int *M, const int *N, const double *ALPHA, const double *A, const int *LDA, const double *X, const int *INCX, const double *BETA, double *Y, const int *INCY);
extern "C" void FORNAME(dgemv) (const char *TRANS, const FINT *M, const FINT *N, const double *ALPHA, const double *A, const FINT *LDA, const double *X, const FINT *INCX, const double *BETA, double *Y, const FINT *INCY);
void cblas_dgemv(const enum CBLAS_ORDER Order,
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
const double alpha, const double *A, const int lda,
const double *X, const int incX, const double beta,
double *Y, const int incY)
{
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ldatmp=lda;
const FINT incxtmp=incX;
const FINT incytmp=incY;
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
//swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
#else
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
//swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
#endif
}
extern "C" void FORNAME(zgemv) (const char *TRANS, const int *M, const int *N, const void *ALPHA, const void *A, const int *LDA, const void *X, const int *INCX, const void *BETA, void *Y, const int *INCY);
extern "C" void FORNAME(zgemv) (const char *TRANS, const FINT *M, const FINT *N, const void *ALPHA, const void *A, const FINT *LDA, const void *X, const FINT *INCX, const void *BETA, void *Y, const FINT *INCY);
void cblas_zgemv(const enum CBLAS_ORDER Order,
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
const void *alpha, const void *A, const int lda,
@ -230,14 +367,29 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
if(TransA == CblasConjTrans) LA::laerror("zgemv with CblasConjTrans not supportted");
//swap n-m and toggle transposition
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ldatmp=lda;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
#else
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
#endif
}
extern "C" int FORNAME(idamax) (const int *N, const double *DX, const int *INCX);
extern "C" FINT FORNAME(idamax) (const FINT *N, const double *DX, const FINT *INCX);
CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX)
{
return FORNAME(idamax)(&N,X,&incX);
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp);
#else
return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX);
#endif
}
@ -246,21 +398,38 @@ return FORNAME(idamax)(&N,X,&incX);
#ifdef NONCLAPACK
//clapack_dgesv
//allocate auxiliary storage and transpose input and output quantities to fortran/C order
extern "C" void FORNAME(dgesv) (const int *N, const int *NRHS, double *A, const int *LDA, int *IPIV, double *B, const int *LDB, int *INFO);
extern "C" void FORNAME(dgesv) (const FINT *N, const FINT *NRHS, double *A, const FINT *LDA, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO);
int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
double *A, const int lda, int *ipiv,
double *B, const int ldb)
{
int INFO=0;
FINT INFO=0;
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
//B should be in the same physical order, just transpose A in place and the LU result on output
for(int i=1; i<N; ++i) for(int j=0; j<i; ++j) {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
#ifdef FORINT
const FINT ntmp=N;
const FINT nrhstmp=NRHS;
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
FINT ipivtmp=*ipiv;
FORNAME(dgesv) (&ntmp,&nrhstmp,A,&ldatmp,&ipivtmp,B,&ldbtmp,&INFO);
#else
FORNAME(dgesv) (&N,&NRHS,A,&lda,ipiv,B,&ldb,&INFO);
#endif
for(int i=1; i<N; ++i) for(int j=0; j<i; ++j) {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
return INFO;
return (int)INFO;
}
#endif
void cblas_dswap(const int N, double *X, const int incX,
double *Y, const int incY){
LA::laerror("cblas_dswap is not available... (-DNONCBLAS)");
}
void cblas_zswap(const int N, void *X, const int incX,
void *Y, const int incY){
LA::laerror("cblas_zswap is not available... (-DNONCBLAS)");
}

View File

@ -23,14 +23,11 @@
#include "mat.h"
#include "nonclass.h"
#include "qsort.h"
#include "fortran.h"
namespace LA {
#ifdef FORTRAN_
#define FORNAME(x) x##_
#else
#define FORNAME(x) x
#endif
#define INSTANTIZE(T) \
template void lawritemat(FILE *file,const T *a,int r,int c,const char *form0, \
@ -191,16 +188,23 @@ linear_solve_do(A,&B[0],1,A.nrows(),det,n);
// Next routines are not available in clapack, fotran ones will be used with an
// additional swap/transpose of outputs when needed
extern "C" void FORNAME(dspsv)(const char *UPLO, const int *N, const int *NRHS,
double *AP, int *IPIV, double *B, const int *LDB, int *INFO);
extern "C" void FORNAME(dspsv)(const char *UPLO, const FINT *N, const FINT *NRHS,
double *AP, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO);
static void linear_solve_do(NRSMat<double> &a, double *b, const int nrhs, const int ldb, double *det, int n)
{
int r, *ipiv;
FINT r, *ipiv;
a.copyonwrite();
ipiv = new int[n];
ipiv = new FINT[n];
char U = 'U';
#ifdef FORINT
const FINT ntmp=n;
const FINT nrhstmp=nrhs;
const FINT ldbtmp=ldb;
FORNAME(dspsv)(&U, &ntmp, &nrhstmp, a, ipiv, b, &ldbtmp,&r);
#else
FORNAME(dspsv)(&U, &n, &nrhs, a, ipiv, b, &ldb,&r);
#endif
if (r < 0) {
delete[] ipiv;
laerror("illegal argument in spsv() call of linear_solve()");
@ -239,8 +243,8 @@ linear_solve_do(a,&B[0],1,a.nrows(),det,n);
//other version of linear solver based on gesvx
//------------------------------------------------------------------------------
extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, complex<double> *A, const int *lda, complex<double> *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, complex<double> *B, const int *ldb, complex<double> *X, const int *ldx, double *rcond, double *ferr, double *berr, complex<double> *work, double *rwork, int *info);
extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, double *A, const int *lda, double *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, double *B, const int *ldb, double *X, const int *ldx, double *rcond, double *ferr, double *berr, double *work, int *iwork, int *info);
extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, complex<double> *A, const FINT *lda, complex<double> *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, complex<double> *B, const FINT *ldb, complex<double> *X, const FINT *ldx, double *rcond, double *ferr, double *berr, complex<double> *work, double *rwork, FINT *info);
extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, double *A, const FINT *lda, double *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, double *B, const FINT *ldb, double *X, const FINT *ldx, double *rcond, double *ferr, double *berr, double *work, FINT *iwork, FINT *info);
//------------------------------------------------------------------------------
// solves set of linear equations using dgesvx
// input:
@ -270,18 +274,18 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
double *A;
double * const _A_data = (double*)_A;
int info;
const int nrhs = _rhsCount;
const int n = _eqCount;
int lda = A_cols;
const int ldaf = lda;
FINT info;
const FINT nrhs = _rhsCount;
const FINT n = _eqCount;
FINT lda = A_cols;
const FINT ldaf = lda;
double rcond;
double ferr[nrhs], berr[nrhs], work[4*n];
double R[n], C[n];
int *const iwork = new int[n];
int *const ipiv = new int[n];
FINT *const iwork = new FINT[n];
FINT *const ipiv = new FINT[n];
double *X = new double[n*nrhs];
double *AF = new double[ldaf*n];
@ -303,7 +307,7 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
}
FORNAME(dgesvx)(&fact, &trans, &n, &nrhs, A, &lda, AF, &ldaf, &ipiv[0], &equed, &R[0], &C[0], _B, &n, X, &n, &rcond, ferr, berr, work, iwork, &info);
if(_rcond)*_rcond = rcond;
cblas_dcopy(n*nrhs, X, 1, _B, 1);//store the solution
@ -312,7 +316,7 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
if(_saveA){
delete[] A;
}
return info;
return (int)info;
}
//------------------------------------------------------------------------------
// solves set of linear equations using zgesvx
@ -343,18 +347,18 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
complex<double> *A;
complex<double> * const _A_data = (complex<double>*)_A;
int info;
const int nrhs = _rhsCount;
const int n = _eqCount;
int lda = A_cols;
const int ldaf = lda;
FINT info;
const FINT nrhs = _rhsCount;
const FINT n = _eqCount;
FINT lda = A_cols;
const FINT ldaf = lda;
double rcond;
double ferr[nrhs], berr[nrhs];
double R[n], C[n], rwork[2*n];
complex<double> work[2*n];
int *const ipiv = new int[n];
FINT *const ipiv = new FINT[n];
complex<double> *X = new complex<double>[n*nrhs];
complex<double> *AF = new complex<double>[ldaf*n];
@ -386,7 +390,7 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
if(_saveA){
delete[] A;
}
return info;
return (int)info;
}
//------------------------------------------------------------------------------
// for given square matrices A, B computes X = AB^{-1} as follows
@ -402,8 +406,8 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
//------------------------------------------------------------------------------
int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, double *_rcond){
const int n = _A.nrows();
const int m = _A.ncols();
const FINT n = _A.nrows();
const FINT m = _A.ncols();
if(n != m || n != _B.nrows() || n != _B.ncols()){
laerror("multiply_by_inverse: incompatible matrices");
}
@ -417,13 +421,13 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
double * const B = (double*)_B;
_B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B
int info;
FINT info;
double rcond;
double ferr[n], berr[n], work[4*n];
double R[n], C[n];
int *const iwork = new int[n];
int *const ipiv = new int[n];
FINT *const iwork = new FINT[n];
FINT *const ipiv = new FINT[n];
double *X = new double[n2];
double *AF = new double[n2];
@ -437,7 +441,7 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
delete[] iwork;delete[] ipiv;
delete[] AF;delete[] X;
return info;
return (int)info;
}
//------------------------------------------------------------------------------
// for given square matrices A, B computes X = AB^{-1} as follows
@ -453,8 +457,8 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
//------------------------------------------------------------------------------
int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B, bool _useEq, double *_rcond){
const int n = _A.nrows();
const int m = _A.ncols();
const FINT n = _A.nrows();
const FINT m = _A.ncols();
if(n != m || n != _B.nrows() || n != _B.ncols()){
laerror("multiply_by_inverse: incompatible matrices");
}
@ -468,13 +472,13 @@ int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B
complex<double> * const B = (complex<double>*)_B;
_B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B
int info;
FINT info;
double rcond;
double ferr[n], berr[n];
double R[n], C[n], rwork[2*n];
complex<double> work[2*n];
int *const ipiv = new int[n];
FINT *const ipiv = new FINT[n];
complex<double> *X = new complex<double>[n2];
complex<double> *AF = new complex<double>[n2];
@ -488,24 +492,24 @@ int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B
delete[] ipiv;
delete[] AF;delete[] X;
return info;
return (int)info;
}
//------------------------------------------------------------------------------
extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const int *N,
double *A, const int *LDA, double *W, double *WORK, const int *LWORK, int *INFO);
extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const FINT *N,
double *A, const FINT *LDA, double *W, double *WORK, const FINT *LWORK, FINT *INFO);
extern "C" void FORNAME(dsygv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
double *A, const int *LDA, double *B, const int *LDB, double *W, double *WORK, const int *LWORK, int *INFO);
extern "C" void FORNAME(dsygv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
double *A, const FINT *LDA, double *B, const FINT *LDB, double *W, double *WORK, const FINT *LWORK, FINT *INFO);
// a will contain eigenvectors (columns if corder==1), w eigenvalues
void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
const bool corder, int n, NRMat<double> *b, const int itype)
{
int m = a.nrows();
FINT m = a.nrows();
if (m != a.ncols()) laerror("diagonalize() call with non-square matrix");
if (a.nrows() != w.size())
laerror("inconsistent dimension of eigenvalue vector in diagonalize()");
@ -517,22 +521,38 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
w.copyonwrite();
if(b) b->copyonwrite();
int r = 0;
FINT r = 0;
char U ='U';
char vectors = 'V';
if (!eivec) vectors = 'N';
int LWORK = -1;
FINT LWORK = -1;
double WORKX;
int ldb=0; if(b) ldb=b->ncols();
FINT ldb=0; if(b) ldb=b->ncols();
#ifdef FORINT
const FINT itypetmp = itype;
FINT ntmp = n;
// First call is to determine size of workspace
if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r );
#else
// First call is to determine size of workspace
if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, &r );
LWORK = (int)WORKX;
#endif
LWORK = (FINT)WORKX;
double *WORK = new double[LWORK];
if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r );
#ifdef FORINT
if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r );
#else
if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r );
else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, &r );
delete[] WORK;
#endif
delete[] WORK;
if (vectors == 'V' && corder) a.transposeme(n);
if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()");
@ -541,18 +561,18 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const int *N,
complex<double> *A, const int *LDA, double *W, complex<double> *WORK, const int *LWORK, double *RWORK, int *INFO);
extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const FINT *N,
complex<double> *A, const FINT *LDA, double *W, complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO);
extern "C" void FORNAME(zhegv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
complex<double> *A, const int *LDA, complex<double> *B, const int *LDB, double *W, complex<double> *WORK, const int *LWORK, double *RWORK, int *INFO);
extern "C" void FORNAME(zhegv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
complex<double> *A, const FINT *LDA, complex<double> *B, const FINT *LDB, double *W, complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO);
// a will contain eigenvectors (columns if corder==1), w eigenvalues
void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
const bool corder, int n, NRMat<complex<double> > *b, const int itype)
{
int m = a.nrows();
FINT m = a.nrows();
if (m != a.ncols()) laerror("diagonalize() call with non-square matrix");
if (a.nrows() != w.size())
laerror("inconsistent dimension of eigenvalue vector in diagonalize()");
@ -564,23 +584,38 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
w.copyonwrite();
if(b) b->copyonwrite();
int r = 0;
FINT r = 0;
char U ='U';
char vectors = 'V';
if (!eivec) vectors = 'N';
int LWORK = -1;
FINT LWORK = -1;
complex<double> WORKX;
int ldb=0; if(b) ldb=b->ncols();
FINT ldb=0; if(b) ldb=b->ncols();
// First call is to determine size of workspace
double *RWORK = new double[3*n+2];
if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
#ifdef FORINT
const FINT itypetmp = itype;
FINT ntmp = n;
if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r );
#else
if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, RWORK, &r );
LWORK = (int)WORKX.real();
#endif
LWORK = (FINT)WORKX.real();
complex<double> *WORK = new complex<double>[LWORK];
if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r );
#ifdef FORINT
if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r );
#else
if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r );
else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, RWORK, &r );
delete[] WORK;
#endif
delete[] WORK;
delete[] RWORK;
if (vectors == 'V' && corder) a.transposeme(n);
@ -590,11 +625,11 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const int *N,
double *AP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO);
extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const FINT *N,
double *AP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO);
extern "C" void FORNAME(dspgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
double *AP, double *BP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO);
extern "C" void FORNAME(dspgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
double *AP, double *BP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO);
// v will contain eigenvectors, w eigenvalues
@ -613,14 +648,21 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
if(v) v->copyonwrite();
if(b) b->copyonwrite();
int r = 0;
FINT r = 0;
char U = 'U';
char job = v ? 'v' : 'n';
double *WORK = new double[3*n];
int ldv=v?v->ncols():n;
if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
FINT ldv=v?v->ncols():n;
#ifdef FORINT
const FINT itypetmp = itype;
FINT ntmp = n;
if(b) FORNAME(dspgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
else FORNAME(dspev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
#else
if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
else FORNAME(dspev)(&job, &U, &n, a, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
#endif
delete[] WORK;
if (v && corder) v->transposeme(n);
@ -629,11 +671,11 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
}
extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const int *N,
complex<double> *AP, double *W, complex<double> *Z, const int *LDZ, complex<double> *WORK, double *RWORK, int *INFO);
extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const FINT *N,
complex<double> *AP, double *W, complex<double> *Z, const FINT *LDZ, complex<double> *WORK, double *RWORK, FINT *INFO);
extern "C" void FORNAME(zhpgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
complex<double> *AP, complex<double> *BP, double *W, complex<double> *Z, const int *LDZ, complex<double> *WORK, double *RWORK, int *INFO);
extern "C" void FORNAME(zhpgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
complex<double> *AP, complex<double> *BP, double *W, complex<double> *Z, const FINT *LDZ, complex<double> *WORK, double *RWORK, FINT *INFO);
// v will contain eigenvectors, w eigenvalues
@ -652,15 +694,22 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
if(v) v->copyonwrite();
if(b) b->copyonwrite();
int r = 0;
FINT r = 0;
char U = 'U';
char job = v ? 'v' : 'n';
complex<double> *WORK = new complex<double>[2*n];
double *RWORK = new double[3*n];
int ldv=v?v->ncols():n;
FINT ldv=v?v->ncols():n;
#ifdef FORINT
const FINT itypetmp = itype;
FINT ntmp = n;
if(b) FORNAME(zhpgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK, &r );
else FORNAME(zhpev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK, &r );
#else
if(b) FORNAME(zhpgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK, &r );
else FORNAME(zhpev)(&job, &U, &n, a, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK, &r );
#endif
delete[] WORK;
delete[] RWORK;
if (v && corder) v->transposeme(n);
@ -671,17 +720,17 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
extern "C" void FORNAME(dgesvd)(const char *JOBU, const char *JOBVT, const int *M,
const int *N, double *A, const int *LDA, double *S, double *U, const int *LDU,
double *VT, const int *LDVT, double *WORK, const int *LWORK, int *INFO );
extern "C" void FORNAME(dgesvd)(const char *JOBU, const char *JOBVT, const FINT *M,
const FINT *N, double *A, const FINT *LDA, double *S, double *U, const FINT *LDU,
double *VT, const FINT *LDVT, double *WORK, const FINT *LWORK, FINT *INFO );
void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s,
NRMat<double> *v, const bool corder, int m, int n)
{
int m0 = a.nrows();
int n0 = a.ncols();
if(m<=0) m=m0;
if(n<=0) n=n0;
FINT m0 = a.nrows();
FINT n0 = a.ncols();
if(m<=0) m=(int)m0;
if(n<=0) n=(int)n0;
if(n>n0 || m>m0) laerror("bad dimension in singular_decomposition");
if (u) if (m > u->nrows() || m> u->ncols())
laerror("inconsistent dimension of U Mat in singular_decomposition()");
@ -700,14 +749,30 @@ void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s
char jobu = u ? 'A' : 'N';
char jobv = v ? 'A' : 'N';
double work0;
int lwork = -1;
int r;
FINT lwork = -1;
FINT r;
#ifdef FORINT
FINT ntmp = n;
FINT mtmp = m;
FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0,
u?(*u)[0]:0, &m0, &work0, &lwork, &r);
#else
FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0,
u?(*u)[0]:0, &m0, &work0, &lwork, &r);
lwork = (int) work0;
#endif
lwork = (FINT) work0;
double *work = new double[lwork];
#ifdef FORINT
FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0,
u?(*u)[0]:0, &m0, work, &lwork, &r);
#else
FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0,
u?(*u)[0]:0, &m0, work, &lwork, &r);
#endif
delete[] work;
if (v && corder) v->transposeme(n);
@ -719,14 +784,14 @@ void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s
//extern "C" void FORNAME(dgeqrf)(const int *M, const int *N, double *A, const int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);
extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const int *N,
double *A, const int *LDA, double *WR, double *WI, double *VL, const int *LDVL,
double *VR, const int *LDVR, double *WORK, const int *LWORK, int *INFO );
extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const FINT *N,
double *A, const FINT *LDA, double *WR, double *WI, double *VL, const FINT *LDVL,
double *VR, const FINT *LDVR, double *WORK, const FINT *LWORK, FINT *INFO );
extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const int *N,
double *A, const int *LDA, double *B, const int *LDB, double *WR, double *WI, double *WBETA,
double *VL, const int *LDVL, double *VR, const int *LDVR,
double *WORK, const int *LWORK, int *INFO );
extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const FINT *N,
double *A, const FINT *LDA, double *B, const FINT *LDB, double *WR, double *WI, double *WBETA,
double *VL, const FINT *LDVL, double *VR, const FINT *LDVR,
double *WORK, const FINT *LWORK, FINT *INFO );
//statics for sorting
@ -802,23 +867,41 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
char jobvl = vl ? 'V' : 'N';
char jobvr = vr ? 'V' : 'N';
double work0;
int lwork = -1;
int r;
int lda=a.ncols();
int ldb=0;
FINT lwork = -1;
FINT r;
FINT lda=a.ncols();
FINT ldb=0;
if(b) ldb=b->ncols();
int ldvl= vl?vl->ncols():lda;
int ldvr= vr?vr->ncols():lda;
if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
FINT ldvl= vl?vl->ncols():lda;
FINT ldvr= vr?vr->ncols():lda;
#ifdef FORINT
FINT ntmp = n;
if(b) FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0,
&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
#else
if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0,
&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
lwork = (int) work0;
#endif
lwork = (FINT) work0;
double *work = new double[lwork];
#ifdef FORINT
if(b) FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0,
&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
#else
if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0,
&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
#endif
delete[] work;
//std::cout <<"TEST dgeev\n"<<wr<<wi<<*vr<<*vl<<std::endl;
@ -1208,16 +1291,16 @@ return r;
//Cholesky interface
extern "C" void FORNAME(dpotrf)(const char *UPLO, const int *N, double *A, const int *LDA, int *INFO);
extern "C" void FORNAME(zpotrf)(const char *UPLO, const int *N, complex<double> *A, const int *LDA, int *INFO);
extern "C" void FORNAME(dpotrf)(const char *UPLO, const FINT *N, double *A, const FINT *LDA, FINT *INFO);
extern "C" void FORNAME(zpotrf)(const char *UPLO, const FINT *N, complex<double> *A, const FINT *LDA, FINT *INFO);
void cholesky(NRMat<double> &a, bool upper)
{
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
int lda=a.ncols();
int n=a.nrows();
FINT lda=a.ncols();
FINT n=a.nrows();
char uplo=upper?'U':'L';
int info;
FINT info;
a.copyonwrite();
FORNAME(dpotrf)(&uplo, &n, a, &lda, &info);
if(info) {std::cerr << "Lapack error "<<info<<std::endl; laerror("error in Cholesky");}
@ -1233,10 +1316,10 @@ else
void cholesky(NRMat<complex<double> > &a, bool upper)
{
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
int lda=a.ncols();
int n=a.nrows();
FINT lda=a.ncols();
FINT n=a.nrows();
char uplo=upper?'U':'L';
int info;
FINT info;
a.copyonwrite();
a.transposeme();//switch to Fortran order
FORNAME(zpotrf)(&uplo, &n, a, &lda, &info);
@ -1250,14 +1333,14 @@ else
//various norms
extern "C" double FORNAME(zlange)( const char *NORM, const int *M, const int *N, complex<double> *A, const int *LDA, double *WORK);
extern "C" double FORNAME(dlange)( const char *NORM, const int *M, const int *N, double *A, const int *LDA, double *WORK);
extern "C" double FORNAME(zlange)( const char *NORM, const FINT *M, const FINT *N, complex<double> *A, const FINT *LDA, double *WORK);
extern "C" double FORNAME(dlange)( const char *NORM, const FINT *M, const FINT *N, double *A, const FINT *LDA, double *WORK);
double MatrixNorm(NRMat<complex<double> > &A, const char norm)
{
const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
const int M = A.nrows();
const int N = A.ncols();
const FINT M = A.nrows();
const FINT N = A.ncols();
double work[M];
const double ret = FORNAME(zlange)(&TypNorm, &M, &N, A[0], &M, &work[0]);
return ret;
@ -1266,8 +1349,8 @@ double MatrixNorm(NRMat<complex<double> > &A, const char norm)
double MatrixNorm(NRMat<double > &A, const char norm)
{
const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
const int M = A.nrows();
const int N = A.ncols();
const FINT M = A.nrows();
const FINT N = A.ncols();
double work[M];
const double ret = FORNAME(dlange)(&TypNorm, &M, &N, A[0], &M, &work[0]);
return ret;
@ -1276,15 +1359,15 @@ double MatrixNorm(NRMat<double > &A, const char norm)
//condition number
extern "C" void FORNAME(zgecon)( const char *norm, const int *n, complex<double> *A, const int *LDA, const double *anorm, double *rcond, complex<double> *work, double *rwork, int *info);
extern "C" void FORNAME(dgecon)( const char *norm, const int *n, double *A, const int *LDA, const double *anorm, double *rcond, double *work, double *rwork, int *info);
extern "C" void FORNAME(zgecon)( const char *norm, const FINT *n, complex<double> *A, const FINT *LDA, const double *anorm, double *rcond, complex<double> *work, double *rwork, FINT *info);
extern "C" void FORNAME(dgecon)( const char *norm, const FINT *n, double *A, const FINT *LDA, const double *anorm, double *rcond, double *work, double *rwork, FINT *info);
double CondNumber(NRMat<complex<double> > &A, const char norm)
{
const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
const int N = A.nrows();
const FINT N = A.nrows();
double Norma(0.0), ret(0.0);
int info;
FINT info;
complex<double> *work;
double *rwork;
@ -1305,9 +1388,9 @@ double CondNumber(NRMat<complex<double> > &A, const char norm)
double CondNumber(NRMat<double> &A, const char norm)
{
const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
const int N = A.nrows();
const FINT N = A.nrows();
double Norma(0.0), ret(0.0);
int info;
FINT info;
double *work;
double *rwork;

25
smat.cc
View File

@ -44,6 +44,16 @@ namespace LA {
template <typename T>
void NRSMat<T>::put(int fd, bool dim, bool transp) const
{
#ifdef CUDALA
if(location!=cpu)
{
NRSMat<T> tmp= *this;
tmp.moveto(cpu);
tmp.put(fd,dim,transp);
return;
}
#endif
errno=0;
if(dim)
{
@ -56,6 +66,18 @@ LA_traits<T>::multiput(NN2,fd,v,dim);
template <typename T>
void NRSMat<T>::get(int fd, bool dim, bool transp)
{
#ifdef CUDALA
if(location!=cpu)
{
NRSMat<T> tmp;
tmp.moveto(cpu);
tmp.get(fd,dim,transp);
tmp.moveto(location);
*this = tmp;
return;
}
#endif
int nn0[2]; //align at least 8-byte
errno=0;
if(dim)
@ -402,6 +424,9 @@ cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2);
}
//some template specializations leading to BLAS/CUBLAS calls
//////////////////////////////////////////////////////////////////////////////

263
smat.h
View File

@ -29,12 +29,20 @@ protected:
int nn;
T *v;
int *count;
#ifdef CUDALA
GPUID location;
#endif
public:
friend class NRVec<T>;
friend class NRMat<T>;
inline NRSMat() : nn(0),v(0),count(0) {};
inline explicit NRSMat(const int n); // Zero-based array
inline NRSMat() : nn(0),v(0),count(0)
{
#ifdef CUDALA
location = DEFAULT_LOC;
#endif
};
inline explicit NRSMat(const int n, const GPUID loc= undefined);// Zero-based array
inline NRSMat(const T &a, const int n); //Initialize to constant
inline NRSMat(const T *a, const int n); // Initialize to array
inline NRSMat(const NRSMat &rhs); // Copy constructor
@ -45,6 +53,13 @@ public:
NRSMat & operator=(const NRSMat &rhs); //assignment
void randomize(const typename LA_traits<T>::normtype &x);
NRSMat & operator=(const T &a); //assign a to diagonal
#ifdef CUDALA
inline GPUID getlocation() const {return location;}
void moveto(const GPUID dest);
#else
inline GPUID getlocation() const {return cpu;}
void moveto(const GPUID dest) {};
#endif
const bool operator!=(const NRSMat &rhs) const {if(nn!=rhs.nn) return 1; return LA_traits<T>::gencmp(v,rhs.v,NN2);} //memcmp for scalars else elementwise
const bool operator==(const NRSMat &rhs) const {return !(*this != rhs);};
inline NRSMat & operator*=(const T &a);
@ -65,8 +80,8 @@ public:
const NRMat<T> operator*(const NRMat<T> &rhs) const; // SMat*Mat
const T dot(const NRSMat &rhs) const; // Smat.Smat//@@@for complex do conjugate
const T dot(const NRVec<T> &rhs) const; //Smat(as vec).vec //@@@for complex do conjugate
const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
const T* diagonalof(NRVec<T> &, const bool divide=0, bool cache=false) const; //get diagonal
void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const {r.gemv(beta,*this,trans,alpha,x);};
void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
@ -108,29 +123,63 @@ namespace LA {
// ctors
template <typename T>
inline NRSMat<T>::NRSMat(const int n) : nn(n), v(new T[NN2]),
count(new int) {*count = 1;}
template <typename T>
inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n),
v(new T[NN2]), count(new int)
inline NRSMat<T>::NRSMat(const int n, const GPUID loc) : nn(n), count(new int(1))
{
*count =1;
if(a != (T)0) for(int i=0; i<NN2; i++) v[i] = a;
else memset(v, 0, NN2*sizeof(T));
#ifdef CUDALA
location= (loc==undefined?DEFAULT_LOC:loc);
if(location==cpu)
#endif
v=new T[NN2];
#ifdef CUDALA
else v= (T*) gpualloc(NN2*sizeof(T));
#endif
}
template <typename T>
inline NRSMat<T>::NRSMat(const T *a, const int n) : nn(n),
v(new T[NN2]), count(new int)
inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1))
{
*count = 1;
memcpy(v, a, NN2*sizeof(T));
#ifdef CUDALA
location=DEFAULT_LOC;
if(location==cpu)
#endif
{
v=new T[NN2];
if(a != (T)0) for(int i=0; i<NN2; i++) v[i] = a;
else memset(v, 0, NN2*sizeof(T));
}
#ifdef CUDALA
else
{
v= (T*) gpualloc(NN2*sizeof(T));
cublasSetVector(NN2,sizeof(T),&a,0,v,1);
}
#endif
}
template <typename T>
inline NRSMat<T>::NRSMat(const T *a, const int n) : nn(n), count(new int(1))
{
#ifdef CUDALA
location=DEFAULT_LOC;
if(location==cpu)
#endif
memcpy(v, a, NN2*sizeof(T));
#ifdef CUDALA
else
{
v= (T*) gpualloc(NN2*sizeof(T));
cublasSetVector(NN2,sizeof(T),a,1,v,1);
}
#endif
}
template <typename T>
inline NRSMat<T>::NRSMat(const NRSMat<T> &rhs) //copy constructor
{
#ifdef CUDALA
location=rhs.location;
#endif
v = rhs.v;
nn = rhs.nn;
count = rhs.count;
@ -140,6 +189,9 @@ inline NRSMat<T>::NRSMat(const NRSMat<T> &rhs) //copy constructor
template <typename T>
NRSMat<T>::NRSMat(const NRVec<T> &rhs, const int n) // type conversion
{
#ifdef CUDALA
location=rhs.location;
#endif
nn = n;
#ifdef DEBUG
if (NN2 != rhs.size())
@ -150,6 +202,7 @@ NRSMat<T>::NRSMat(const NRVec<T> &rhs, const int n) // type conversion
(*count)++;
}
// S *= a
template<>
inline NRSMat<double> & NRSMat<double>::operator*=(const double & a)
@ -437,33 +490,31 @@ NRSMat<T>::~NRSMat()
{
if (!count) return;
if (--(*count) <= 0) {
if (v) delete[] (v);
if (v)
{
#ifdef CUDALA
if(location==cpu)
#endif
delete[] v;
#ifdef CUDALA
else gpufree(v);
#endif
}
delete count;
}
}
// assignment with a physical copy
template <typename T>
NRSMat<T> & NRSMat<T>::operator|=(const NRSMat<T> &rhs)
{
if (this != &rhs) {
if(!rhs.v) laerror("unallocated rhs in NRSMat operator |=");
if(count)
if(*count > 1) { // detach from the other
--(*count);
nn = 0;
count = 0;
v = 0;
}
if (nn != rhs.nn) {
if(v) delete [] (v);
nn = rhs.nn;
}
if (!v) v = new T[NN2];
if (!count) count = new int;
*count = 1;
memcpy(v, rhs.v, NN2*sizeof(T));
}
#ifdef DEBUG
if (!rhs.v) laerror("unallocated rhs in NRSMat operator |=");
#endif
if (this == &rhs) return *this;
*this = rhs;
this->copyonwrite();
return *this;
}
@ -474,13 +525,24 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs)
{
if (this == & rhs) return *this;
if (count)
if(--(*count) == 0) {
delete [] v;
if(--(*count) == 0)
{
#ifdef CUDALA
if(location==cpu)
#endif
delete [] v;
#ifdef CUDALA
else
gpufree(v);
#endif
delete count;
}
}
v = rhs.v;
nn = rhs.nn;
count = rhs.count;
#ifdef CUDALA
location=rhs.location;
#endif
if (count) (*count)++;
return *this;
}
@ -495,9 +557,24 @@ void NRSMat<T>::copyonwrite()
(*count)--;
count = new int;
*count = 1;
T *newv = new T[NN2];
memcpy(newv, v, NN2*sizeof(T));
v = newv;
T *newv;
#ifdef CUDALA
if(location==cpu)
{
#endif
newv = new T[NN2];
memcpy(newv, v, NN2*sizeof(T));
#ifdef CUDALA
}
else
{
newv = (T *) gpualloc(NN2*sizeof(T));
if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
cublasScopy(NN2*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
}
#endif
v = newv;
}
}
@ -514,7 +591,16 @@ void NRSMat<T>::resize(const int n)
if(n==0)
{
if(--(*count) <= 0) {
if(v) delete[] (v);
if(v) {
#ifdef CUDALA
if(location==cpu)
#endif
delete[] (v);
#ifdef CUDALA
else
gpufree(v);
#endif
}
delete count;
}
count=0;
@ -534,16 +620,71 @@ void NRSMat<T>::resize(const int n)
count = new int;
*count = 1;
nn = n;
#ifdef CUDALA
if(location==cpu)
#endif
v = new T[NN2];
#ifdef CUDALA
else
v = (T*) gpualloc(NN2*sizeof(T));
#endif
return;
}
if (n != nn) {
nn = n;
delete[] v;
v = new T[NN2];
nn = n;
#ifdef CUDALA
if(location==cpu)
#endif
{
delete[] v;
v = new T[NN2];
}
#ifdef CUDALA
else
{
gpufree(v);
v = (T*) gpualloc(NN2*sizeof(T));
}
#endif
}
}
#ifdef CUDALA
template<typename T>
void NRSMat<T>::moveto(const GPUID dest)
{
if(location==dest) return;
location=dest;
if(v && !count) laerror("internal inconsistency of reference counting 1");
if (!count) return;
if(v && *count==0) laerror("internal inconsistency of reference counting 2");
if(!v) return;
T *vold = v;
if(dest == cpu) //moving from GPU to CPU
{
v = new T[NN2];
gpuget(NN2,sizeof(T),vold,v);
if(*count == 1) gpufree(vold);
else {--(*count); count = new int(1);}
}
else //moving from CPU to GPU
{
v=(T *) gpualloc(NN2*sizeof(T));
gpuput(NN2,sizeof(T),vold,v);
if(*count == 1) delete[] vold;
else {--(*count); count = new int(1);}
}
}
#endif
template<typename T>
NRSMat<complex<T> > complexify(const NRSMat<T> &rhs)
@ -554,10 +695,15 @@ for(int i=0; i<rhs.nrows(); ++i)
return r;
}
// I/O
template <typename T>
std::ostream& operator<<(std::ostream &s, const NRSMat<T> &x)
{
#ifdef CUDALA
if(x.getlocation()==cpu)
{
#endif
int i,j,n;
n=x.nrows();
s << n << ' ' << n << '\n';
@ -566,12 +712,25 @@ std::ostream& operator<<(std::ostream &s, const NRSMat<T> &x)
for(j=0; j<n;j++) s << (typename LA_traits_io<T>::IOtype)x(i,j) << (j==n-1 ? '\n' : ' ');
}
return s;
#ifdef CUDALA
}
else
{
NRSMat<T> tmp=x;
tmp.moveto(cpu);
return s<<tmp;
}
#endif
}
template <typename T>
std::istream& operator>>(std::istream &s, NRSMat<T> &x)
{
#ifdef CUDALA
if(x.getlocation()==cpu)
{
#endif
int i,j,n,m;
s >> n >> m;
if(n!=m) laerror("input symmetric matrix not square");
@ -579,6 +738,18 @@ std::istream& operator>>(std::istream &s, NRSMat<T> &x)
typename LA_traits_io<T>::IOtype tmp;
for(i=0;i<n;i++) for(j=0; j<m;j++) {s>>tmp; x(i,j)=tmp;}
return s;
#ifdef CUDALA
}
else
{
NRSMat<T> tmp;
tmp.moveto(cpu);
s >> tmp;
tmp.moveto(x.getlocation());
x=tmp;
return s;
}
#endif
}

57
t.cc
View File

@ -24,6 +24,7 @@
using namespace std;
using namespace LA;
extern void test(const NRVec<double> &x);
@ -1551,7 +1552,7 @@ cout <<"Error = "<<(cx.transpose(true)*cx -bb).norm()<<endl;
}
if(1)
if(0)
{
int n;
cin >>n;
@ -1574,5 +1575,59 @@ cout <<"Cholesky factor filling = "<<b.simplify()<<endl;
}
if(0)
{
int n;
cin >>n;
NRMat<double> a(n,n),b(n,n);
a.randomize(1.);
b.randomize(1.);
NRMat<double>c;
double t=clock()/((double) (CLOCKS_PER_SEC));
int rep=1+10000000000LL/n/n/n;
cout <<"Repetitions " <<rep<<endl;
for(int i=0; i<rep; ++i) c=a*b;
cout <<"CPU time (ms) "<<(clock()/((double) (CLOCKS_PER_SEC))-t)*1000./rep <<"\n";
NRMat<double>cgpu;
a.moveto(gpu1);
b.moveto(gpu1);
t=clock()/((double) (CLOCKS_PER_SEC));
for(int i=0; i<rep; ++i) cgpu=a*b;
cout <<"GPU time (ms) "<<(clock()/((double) (CLOCKS_PER_SEC))-t)*1000./rep <<"\n";
cgpu.moveto(cpu);
cout << "Error = "<<(c-cgpu).norm()<<endl;
}
if(1)
{
int n;
cin >>n;
NRMat<double> a(n,n);
a.randomize(1.);
NRVec<double> v(n);
v.randomize(1.);
NRSMat<double> as(n);
as.randomize(1.);
NRVec<double>w = a*v;
NRVec<double>ws = as*v;
NRMat<double>c(n,n);
c=exp(a);
a.moveto(gpu1);
v.moveto(gpu1);
as.moveto(gpu1);
NRVec<double>wgpu = a*v;
NRVec<double>wsgpu = as*v;
w.moveto(gpu1);
ws.moveto(gpu1);
cout << "Error gemv = "<<(w-wgpu).norm()<<endl;
cout << "Error symv = "<<(ws-wsgpu).norm()<<endl;
NRMat<double>cgpu;
cgpu=exp(a);
c.moveto(gpu1);
cout << "Error = "<<(c-cgpu).norm()<<endl;
}
}

46
vec.cc
View File

@ -84,6 +84,16 @@ NRVec<T>::NRVec(const NRMat<T> &rhs)
template <typename T>
void NRVec<T>::put(int fd, bool dim, bool transp) const
{
#ifdef CUDALA
if(location!=cpu)
{
NRVec<T> tmp= *this;
tmp.moveto(cpu);
tmp.put(fd,dim,transp);
return;
}
#endif
errno=0;
int pad=1; //align at least 8-byte
if(dim)
@ -94,9 +104,21 @@ if(sizeof(int) != write(fd,&pad,sizeof(int))) laerror("cannot write");
LA_traits<T>::multiput(nn,fd,v,dim);
}
template <typename T>
void NRVec<T>::get(int fd, bool dim, bool transp)
{
#ifdef CUDALA
if(location!=cpu)
{
NRVec<T> tmp;
tmp.moveto(cpu);
tmp.get(fd,dim,transp);
tmp.moveto(location);
*this = tmp;
return;
}
#endif
int nn0[2]; //align at least 8-byte
errno=0;
if(dim)
@ -319,9 +341,16 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
if ((trans == 'n'?A.ncols():A.nrows()) != x.size())
laerror("incompatible sizes in gemv A*x");
#endif
SAME_LOC3(*this,x,A);
copyonwrite();
cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans),
A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
#ifdef CUDALA
if(location==cpu)
#endif
cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
#ifdef CUDALA
else
cublasDgemv((trans=='n' ?'T':'N'),A.ncols(), A.nrows(),alpha, A, A.ncols(), x.v, 1, beta, v, 1);
#endif
}
@ -362,8 +391,16 @@ void NRVec<double>::gemv(const double beta, const NRSMat<double> &A,
#ifdef DEBUG
if (A.ncols()!=x.size()) laerror("incompatible dimension in gemv A*x");
#endif
SAME_LOC3(*this,A,x);
copyonwrite();
#ifdef CUDALA
if(location==cpu)
#endif
cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, x.v, 1, beta, v, 1);
#ifdef CUDALA
else
cublasDspmv('U',A.ncols(), alpha, A, x.v, 1, beta, v, 1);
#endif
}
template<>
@ -420,6 +457,7 @@ NRVec<complex<double> >::otimes(const NRVec< complex<double> > &b, const bool co
template<typename T>
int NRVec<T>::sort(int direction, int from, int to, int *perm)
{
NOT_GPU(*this);
copyonwrite();
if(to == -1) to=nn-1;
if(direction) return memqsort<1,NRVec<T>,int,int>(*this,perm,from,to);
@ -427,6 +465,10 @@ else return memqsort<0,NRVec<T>,int,int>(*this,perm,from,to);
}
//////////////////////////////////////////////////////////////////////////////
//// forced instantization in the corespoding object file

714
vec.h
View File

@ -30,6 +30,9 @@ template <typename T> void lawritemat(FILE *file,const T *a,int r,int c,
// Memory allocated constants for cblas routines
const static complex<double> CONE = 1.0, CMONE = -1.0, CZERO = 0.0;
#ifdef CUDALA
const static cuDoubleComplex CUONE = {1.,0.}, CUMONE = {-1.,0.}, CUZERO = {0.,0.};
#endif
// Macros to construct binary operators +,-,*, from +=, -=, *=
// for 3 cases: X + a, a + X, X + Y
@ -44,7 +47,7 @@ template<class T> \
#define NRVECMAT_OPER2(E,X) \
template<class T> \
inline const NR##E<T> NR##E<T>::operator X(const NR##E<T> &a) const \
inline const NR##E<T> NR##E<T>::operator X(const NR##E<T> &a) const \
{ return NR##E(*this) X##= a; }
@ -55,12 +58,32 @@ protected:
int nn;
T *v;
int *count;
#ifdef CUDALA
GPUID location;
#endif
public:
friend class NRSMat<T>;
friend class NRMat<T>;
inline NRVec(): nn(0),v(0),count(0){};
explicit inline NRVec(const int n) : nn(n), v(new T[n]), count(new int(1)) {};
inline NRVec(): nn(0),v(0),count(0)
{
#ifdef CUDALA
location = DEFAULT_LOC;
#endif
};
explicit inline NRVec(const int n, const GPUID loc= undefined) : nn(n), count(new int(1))
{
#ifdef CUDALA
if(loc==undefined) location = DEFAULT_LOC; else location = loc;
if(location==cpu)
#endif
v= new T[n];
#ifdef CUDALA
else
v= (T*) gpualloc(n*sizeof(T));
#endif
};
inline NRVec(const T &a, const int n);
inline NRVec(const T *a, const int n);
inline NRVec(T *a, const int n, bool skeleton);
@ -71,6 +94,13 @@ public:
explicit NRVec(const NRMat<T> &rhs) : NRVec(&rhs[0][0],rhs.nrows()*rhs.ncols()) {};
#else
explicit NRVec(const NRMat<T> &rhs);
#endif
#ifdef CUDALA
inline GPUID getlocation() const {return location;}
void moveto(const GPUID dest);
#else
inline GPUID getlocation() const {return cpu;}
void moveto(const GPUID dest) {};
#endif
NRVec & operator=(const NRVec &rhs);
NRVec & operator=(const T &a); //assign a to every element
@ -103,8 +133,8 @@ public:
void gemv(const T beta, const SparseMat<T> &a, const char trans, const T alpha, const NRVec &x,const bool treat_as_symmetric=false);
void gemv(const typename LA_traits_complex<T>::Component_type beta, const typename LA_traits_complex<T>::NRMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex<T>::Component_type alpha, const NRVec &x);
void gemv(const typename LA_traits_complex<T>::Component_type beta, const typename LA_traits_complex<T>::NRSMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex<T>::Component_type alpha, const NRVec &x);
const NRVec operator*(const NRMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
const NRVec operator*(const NRSMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
const NRVec operator*(const NRMat<T> &mat) const {NRVec<T> result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
const NRVec operator*(const NRSMat<T> &mat) const {NRVec<T> result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
const NRVec operator*(const SparseMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
const NRMat<T> otimes(const NRVec<T> &rhs, const bool conjugate=false, const T &scale=1) const; //outer product
inline const NRMat<T> operator|(const NRVec<T> &rhs) const {return otimes(rhs,true);};
@ -150,29 +180,58 @@ public:
#include "sparsemat.h"
#include "sparsesmat.h"
namespace LA {
// formatted I/O
template <typename T>
std::ostream & operator<<(std::ostream &s, const NRVec<T> &x)
{
#ifdef CUDALA
if(x.getlocation()==cpu)
{
#endif
int i, n;
n = x.size();
s << n << std::endl;
for(i=0; i<n; i++) s << (typename LA_traits_io<T>::IOtype)x[i] << (i == n-1 ? '\n' : ' ');
return s;
#ifdef CUDALA
}
else
{
NRVec<T> tmp=x;
tmp.moveto(cpu);
return s<<tmp;
}
#endif
}
template <typename T>
std::istream & operator>>(std::istream &s, NRVec<T> &x)
{
#ifdef CUDALA
if(x.getlocation()==cpu)
{
#endif
int i,n;
s >> n;
x.resize(n);
typename LA_traits_io<T>::IOtype tmp;
for(i=0; i<n; i++) {s >> tmp; x[i]=tmp;}
return s;
#ifdef CUDALA
}
else
{
NRVec<T> tmp;
tmp.moveto(cpu);
s >> tmp;
tmp.moveto(x.getlocation());
x=tmp;
return s;
}
#endif
}
@ -180,22 +239,51 @@ std::istream & operator>>(std::istream &s, NRVec<T> &x)
// ctors
template <typename T>
inline NRVec<T>::NRVec(const T& a, const int n) : nn(n), v(new T[n]), count(new int)
inline NRVec<T>::NRVec(const T& a, const int n) : nn(n), count(new int)
{
*count = 1;
#ifdef CUDALA
location=DEFAULT_LOC;
if(location==cpu)
{
#endif
v = new T[n];
if(a != (T)0)
for(int i=0; i<n; i++)
v[i] = a;
else
memset(v, 0, nn*sizeof(T));
#ifdef CUDALA
}
else
{
v= (T*) gpualloc(n*sizeof(T));
cublasSetVector(n,sizeof(T),&a,0,v,1);
}
#endif
}
template <typename T>
inline NRVec<T>::NRVec(const T *a, const int n) : nn(n), count(new int)
{
v=new T[n];
*count = 1;
memcpy(v, a, n*sizeof(T));
#ifdef CUDALA
location=DEFAULT_LOC;
if(location==cpu)
{
#endif
v=new T[n];
*count = 1;
memcpy(v, a, n*sizeof(T));
#ifdef CUDALA
}
else
{
v= (T*) gpualloc(n*sizeof(T));
cublasSetVector(n,sizeof(T),a,1,v,1);
}
#endif
}
template <typename T>
@ -203,12 +291,28 @@ inline NRVec<T>::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int)
{
if(!skeleton)
{
#ifdef CUDALA
location=DEFAULT_LOC;
if(location==cpu)
{
#endif
v=new T[n];
*count = 1;
memcpy(v, a, n*sizeof(T));
#ifdef CUDALA
}
else
{
v= (T*) gpualloc(n*sizeof(T));
cublasSetVector(n,sizeof(T),a,1,v,1);
}
#endif
}
else
{
#ifdef CUDALA
if(location!=cpu) laerror("NRVec() with skeleton option cannot be on GPU");
#endif
*count = 2;
v=a;
}
@ -217,6 +321,9 @@ inline NRVec<T>::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int)
template <typename T>
inline NRVec<T>::NRVec(const NRVec<T> &rhs)
{
#ifdef CUDALA
location=rhs.location;
#endif
v = rhs.v;
nn = rhs.nn;
count = rhs.count;
@ -226,6 +333,9 @@ inline NRVec<T>::NRVec(const NRVec<T> &rhs)
template <typename T>
inline NRVec<T>::NRVec(const NRSMat<T> &rhs)
{
#ifdef CUDALA
location=rhs.location;
#endif
nn = rhs.nn;
nn = NN2;
v = rhs.v;
@ -233,28 +343,11 @@ inline NRVec<T>::NRVec(const NRSMat<T> &rhs)
(*count)++;
}
// x += a
template<>
inline NRVec<double> & NRVec<double>::operator+=(const double &a)
{
copyonwrite();
cblas_daxpy(nn, 1.0, &a, 0, v, 1);
return *this;
}
template<>
inline NRVec< complex<double> > &
NRVec< complex<double> >::operator+=(const complex<double> &a)
{
copyonwrite();
cblas_zaxpy(nn, &CONE, &a, 0, v, 1);
return *this;
}
//and for general type
// x +/-= a
template <typename T>
inline NRVec<T> & NRVec<T>::operator+=(const T &a)
{
NOT_GPU(*this);
copyonwrite();
int i;
for(i=0; i<nn; ++i) v[i]+=a;
@ -262,65 +355,26 @@ inline NRVec<T> & NRVec<T>::operator+=(const T &a)
}
// x -= a
template<>
inline NRVec<double> & NRVec<double>::operator-=(const double &a)
{
copyonwrite();
cblas_daxpy(nn, -1.0, &a, 0, v, 1);
return *this;
}
template<>
inline NRVec< complex<double> > &
NRVec< complex<double> >::operator-=(const complex<double> &a)
{
copyonwrite();
cblas_zaxpy(nn, &CMONE, &a, 0, v, 1);
return *this;
}
//and for general type
template <typename T>
inline NRVec<T> & NRVec<T>::operator-=(const T &a)
{
NOT_GPU(*this);
copyonwrite();
int i;
for(i=0; i<nn; ++i) v[i]-=a;
int i;
for(i=0; i<nn; ++i) v[i]-=a;
return *this;
}
// x += x
template<>
inline NRVec<double> & NRVec<double>::operator+=(const NRVec<double> &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
#endif
copyonwrite();
cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1);
return *this;
}
template<>
inline NRVec< complex<double> > &
NRVec< complex<double> >::operator+=(const NRVec< complex<double> > &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
#endif
copyonwrite();
cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1);
return *this;
}
//and for general type
template <typename T>
inline NRVec<T> & NRVec<T>::operator+=(const NRVec<T> &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
NOT_GPU(*this);
NOT_GPU(rhs);
#endif
copyonwrite();
int i;
@ -346,6 +400,8 @@ inline NRVec<T> & NRVec<T>::operator/=(const NRVec<T> &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("/= of incompatible vectors");
NOT_GPU(*this);
NOT_GPU(rhs);
#endif
copyonwrite();
int i;
@ -356,35 +412,13 @@ inline NRVec<T> & NRVec<T>::operator/=(const NRVec<T> &rhs)
// x -= x
template<>
inline NRVec<double> & NRVec<double>::operator-=(const NRVec<double> &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
#endif
copyonwrite();
cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1);
return *this;
}
template<>
inline NRVec< complex<double> > &
NRVec< complex<double> >::operator-=(const NRVec< complex<double> > &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
#endif
copyonwrite();
cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
return *this;
}
//and for general type
template <typename T>
inline NRVec<T> & NRVec<T>::operator-=(const NRVec<T> &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
NOT_GPU(*this);
NOT_GPU(rhs);
#endif
copyonwrite();
int i;
@ -394,27 +428,10 @@ inline NRVec<T> & NRVec<T>::operator-=(const NRVec<T> &rhs)
// x *= a
template<>
inline NRVec<double> & NRVec<double>::operator*=(const double &a)
{
copyonwrite();
cblas_dscal(nn, a, v, 1);
return *this;
}
template<>
inline NRVec< complex<double> > &
NRVec< complex<double> >::operator*=(const complex<double> &a)
{
copyonwrite();
cblas_zscal(nn, &a, v, 1);
return *this;
}
//and for general type
template <typename T>
inline NRVec<T> & NRVec<T>::operator*=(const T &a)
{
NOT_GPU(*this);
copyonwrite();
int i;
for(i=0; i<nn; ++i) v[i]*=a;
@ -423,33 +440,13 @@ inline NRVec<T> & NRVec<T>::operator*=(const T &a)
// scalar product x.y
template<>
inline const double NRVec<double>::operator*(const NRVec<double> &rhs) const
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("dot of incompatible vectors");
#endif
return cblas_ddot(nn, v, 1, rhs.v, 1);
}
template<>
inline const complex<double>
NRVec< complex<double> >::operator*(const NRVec< complex<double> > &rhs) const
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("dot of incompatible vectors");
#endif
complex<double> dot;
cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
return dot;
}
template<typename T>
inline const T NRVec<T>::operator*(const NRVec<T> &rhs) const
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("dot of incompatible vectors");
NOT_GPU(*this);
NOT_GPU(rhs);
#endif
T dot = 0;
for(int i=0; i<nn; ++i) dot+= v[i]*rhs.v[i];
@ -458,28 +455,6 @@ inline const T NRVec<T>::operator*(const NRVec<T> &rhs) const
// Sum of elements
template<>
inline const double NRVec<double>::asum() const
{
return cblas_dasum(nn, v, 1);
}
// Dot product: x * y
template<>
inline const double NRVec<double>::dot(const double *y, const int stride) const
{
return cblas_ddot(nn, y, stride, v, 1);
}
template<>
inline const complex<double>
NRVec< complex<double> >::dot(const complex<double> *y, const int stride) const
{
complex<double> dot;
cblas_zdotc_sub(nn, y, stride, v, 1, &dot);
return dot;
}
// x[i] returns i-th element
template <typename T>
@ -489,6 +464,7 @@ inline T & NRVec<T>::operator[](const int i)
if(_LA_count_check && *count != 1) laerror("possible lval [] with count > 1");
if(i < 0 || i >= nn) laerror("NRVec out of range");
if(!v) laerror("[] on unallocated NRVec");
NOT_GPU(*this);
#endif
return v[i];
}
@ -498,6 +474,7 @@ inline const T & NRVec<T>::operator[](const int i) const
#ifdef DEBUG
if(i < 0 || i >= nn) laerror("NRVec out of range");
if(!v) laerror("[] on unallocated NRVec");
NOT_GPU(*this);
#endif
return v[i];
}
@ -527,29 +504,6 @@ inline NRVec<T>::operator const T*() const
return v;
}
// return norm of the Vec
template<>
inline const double NRVec<double>::norm() const
{
return cblas_dnrm2(nn, v, 1);
}
template<>
inline const double NRVec< complex<double> >::norm() const
{
return cblas_dznrm2(nn, v, 1);
}
// Max element of the array
template<>
inline const double NRVec<double>::amax() const
{
return v[cblas_idamax(nn, v, 1)];
}
template<>
inline const complex<double> NRVec< complex<double> >::amax() const
{
return v[cblas_izamax(nn, v, 1)];
}
// Make Vec unitvector
@ -576,7 +530,16 @@ NRVec<T>::~NRVec()
{
if(!count) return;
if(--(*count) <= 0) {
if(v) delete[] (v);
if(v)
{
#ifdef CUDALA
if(location==cpu)
#endif
delete[] (v);
#ifdef CUDALA
else gpufree(v);
#endif
}
delete count;
}
}
@ -591,12 +554,29 @@ void NRVec<T>::copyonwrite()
(*count)--;
count = new int;
*count = 1;
T *newv = new T[nn];
memcpy(newv, v, nn*sizeof(T));
T *newv;
#ifdef CUDALA
if(location==cpu)
{
#endif
newv = new T[nn];
memcpy(newv, v, nn*sizeof(T));
#ifdef CUDALA
}
else
{
newv = (T *) gpualloc(nn*sizeof(T));
if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
cublasScopy(nn*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
}
#endif
v = newv;
}
}
// Asignment
template <typename T>
NRVec<T> & NRVec<T>::operator=(const NRVec<T> &rhs)
@ -606,17 +586,29 @@ NRVec<T> & NRVec<T>::operator=(const NRVec<T> &rhs)
if(count)
if(--(*count) == 0)
{
delete[] v;
#ifdef CUDALA
if(location==cpu)
#endif
delete[] v;
#ifdef CUDALA
else
gpufree(v);
#endif
delete count;
}
v = rhs.v;
nn = rhs.nn;
count = rhs.count;
#ifdef CUDALA
location=rhs.location;
#endif
if(count) (*count)++;
}
return *this;
}
// Resize
template <typename T>
void NRVec<T>::resize(const int n)
@ -629,7 +621,17 @@ void NRVec<T>::resize(const int n)
if(n==0)
{
if(--(*count) <= 0) {
if(v) delete[] (v);
if(v)
{
#ifdef CUDALA
if(location==cpu)
#endif
delete[] (v);
#ifdef CUDALA
else
gpufree(v);
#endif
}
delete count;
}
count=0;
@ -648,14 +650,33 @@ void NRVec<T>::resize(const int n)
count = new int;
*count = 1;
nn = n;
v = new T[nn];
#ifdef CUDALA
if(location==cpu)
#endif
v = new T[nn];
#ifdef CUDALA
else
v = (T*) gpualloc(nn*sizeof(T));
#endif
return;
}
// *count = 1 in this branch
if (n != nn) {
nn = n;
delete[] v;
v = new T[nn];
#ifdef CUDALA
if(location==cpu)
#endif
{
delete[] v;
v = new T[nn];
}
#ifdef CUDALA
else
{
gpufree(v);
v = (T*) gpualloc(nn*sizeof(T));
}
#endif
}
}
@ -664,30 +685,18 @@ void NRVec<T>::resize(const int n)
template <typename T>
NRVec<T> & NRVec<T>::operator|=(const NRVec<T> &rhs)
{
if (this != &rhs) {
#ifdef DEBUG
if (!rhs.v) laerror("unallocated rhs in NRVec operator |=");
#endif
if (count)
if (*count > 1) {
--(*count);
nn = 0;
count = 0;
v = 0;
}
if (nn != rhs.nn) {
if (v) delete[] (v);
nn = rhs.nn;
}
if(!v) v = new T[nn];
if(!count) count = new int;
*count = 1;
memcpy(v, rhs.v, nn*sizeof(T));
}
return *this;
if (this == &rhs) return *this;
*this = rhs;
this->copyonwrite();
return *this;
}
template<typename T>
NRVec<complex<T> > complexify(const NRVec<T> &rhs)
{
@ -696,6 +705,291 @@ for(int i=0; i<rhs.size(); ++i) r[i]=rhs[i];
return r;
}
#ifdef CUDALA
template<typename T>
void NRVec<T>::moveto(const GPUID dest)
{
if(location==dest) return;
location=dest;
if(v && !count) laerror("internal inconsistency of reference counting 1");
if (!count) return;
if(v && *count==0) laerror("internal inconsistency of reference counting 2");
if(!v) return;
T *vold = v;
if(dest == cpu) //moving from GPU to CPU
{
v = new T[nn];
gpuget(nn,sizeof(T),vold,v);
if(*count == 1) gpufree(vold);
else {--(*count); count = new int(1);}
}
else //moving from CPU to GPU
{
v=(T *) gpualloc(nn*sizeof(T));
gpuput(nn,sizeof(T),vold,v);
if(*count == 1) delete[] vold;
else {--(*count); count = new int(1);}
}
}
#endif
//some template specializations leading to BLAS/CUBLAS calls
template<>
inline
NRVec<double> & NRVec<double>::operator+=(const double &a)
{
copyonwrite();
#ifdef CUDALA
if(location==cpu)
#endif
cblas_daxpy(nn, 1.0, &a, 0, v, 1);
#ifdef CUDALA
else
{
double *d=gpuputdouble(a);
cublasDaxpy(nn, 1.0, d, 0, v, 1);
gpufree(d);
}
#endif
return *this;
}
template<>
inline
NRVec< complex<double> > &
NRVec< complex<double> >::operator+=(const complex<double> &a)
{
copyonwrite();
#ifdef CUDALA
if(location==cpu)
#endif
cblas_zaxpy(nn, &CONE, &a, 0, v, 1);
#ifdef CUDALA
else
{
complex<double> *d=gpuputcomplex(a);
cublasZaxpy(nn, CUONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1);
gpufree(d);
}
#endif
return *this;
}
template<>
inline
NRVec<double> & NRVec<double>::operator-=(const double &a)
{
copyonwrite();
#ifdef CUDALA
if(location==cpu)
#endif
cblas_daxpy(nn, -1.0, &a, 0, v, 1);
#ifdef CUDALA
else
{
double *d=gpuputdouble(a);
cublasDaxpy(nn, -1.0, d, 0, v, 1);
gpufree(d);
}
#endif
return *this;
}
template<>
inline
NRVec< complex<double> > &
NRVec< complex<double> >::operator-=(const complex<double> &a)
{
copyonwrite();
#ifdef CUDALA
if(location==cpu)
#endif
cblas_zaxpy(nn, &CMONE, &a, 0, v, 1);
#ifdef CUDALA
else
{
complex<double> *d=gpuputcomplex(a);
cublasZaxpy(nn, CUMONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1);
gpufree(d);
}
#endif
return *this;
}
template<>
inline
NRVec<double> & NRVec<double>::operator+=(const NRVec<double> &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
#endif
copyonwrite();
cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1);
return *this;
}
template<>
inline
NRVec< complex<double> > &
NRVec< complex<double> >::operator+=(const NRVec< complex<double> > &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
#endif
copyonwrite();
cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1);
return *this;
}
template<>
inline
NRVec<double> & NRVec<double>::operator-=(const NRVec<double> &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
#endif
SAME_LOC(*this,rhs);
copyonwrite();
#ifdef CUDALA
if(location==cpu)
#endif
cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1);
#ifdef CUDALA
else
cublasDaxpy(nn, -1.0, rhs.v, 1, v, 1);
#endif
return *this;
}
template<>
inline
NRVec< complex<double> > &
NRVec< complex<double> >::operator-=(const NRVec< complex<double> > &rhs)
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
#endif
copyonwrite();
cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
return *this;
}
template<>
inline
NRVec<double> & NRVec<double>::operator*=(const double &a)
{
copyonwrite();
cblas_dscal(nn, a, v, 1);
return *this;
}
template<>
inline
NRVec< complex<double> > &
NRVec< complex<double> >::operator*=(const complex<double> &a)
{
copyonwrite();
cblas_zscal(nn, &a, v, 1);
return *this;
}
template<>
inline
const double NRVec<double>::operator*(const NRVec<double> &rhs) const
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("dot of incompatible vectors");
#endif
return cblas_ddot(nn, v, 1, rhs.v, 1);
}
template<>
inline
const complex<double>
NRVec< complex<double> >::operator*(const NRVec< complex<double> > &rhs) const
{
#ifdef DEBUG
if (nn != rhs.nn) laerror("dot of incompatible vectors");
#endif
complex<double> dot;
cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
return dot;
}
// Sum of elements
template<>
inline
const double NRVec<double>::asum() const
{
return cblas_dasum(nn, v, 1);
}
// Dot product: x * y
template<>
inline
const double NRVec<double>::dot(const double *y, const int stride) const
{
return cblas_ddot(nn, y, stride, v, 1);
}
template<>
inline
const complex<double>
NRVec< complex<double> >::dot(const complex<double> *y, const int stride) const
{
complex<double> dot;
cblas_zdotc_sub(nn, y, stride, v, 1, &dot);
return dot;
}
// return norm of the Vec
template<>
inline
const double NRVec<double>::norm() const
{
#ifdef CUDALA
if(location!=cpu) return cublasDnrm2(nn, v, 1);
#endif
return cblas_dnrm2(nn, v, 1);
}
template<>
inline
const double NRVec< complex<double> >::norm() const
{
return cblas_dznrm2(nn, v, 1);
}
// Max element of the array
template<>
inline
const double NRVec<double>::amax() const
{
return v[cblas_idamax(nn, v, 1)];
}
/*
cblas_izamax seems to be missing at least in some cblas versions
template<>
inline
const complex<double> NRVec< complex<double> >::amax() const
{
return v[cblas_izamax(nn, v, 1)];
}
*/
}//namespace
#endif /* _LA_VEC_H_ */